diff options
Diffstat (limited to 'arch')
62 files changed, 5538 insertions, 2233 deletions
diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h index ee541ce..c5f92a9 100644 --- a/arch/ia64/kvm/lapic.h +++ b/arch/ia64/kvm/lapic.h @@ -25,5 +25,6 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); #define kvm_apic_present(x) (true) +#define kvm_lapic_enabled(x) (true) #endif diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index 6c5547d..18ea696 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -86,5 +86,6 @@ struct kvm_guest_debug_arch { #define KVM_INTERRUPT_SET -1U #define KVM_INTERRUPT_UNSET -2U +#define KVM_INTERRUPT_SET_LEVEL -3U #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index c5ea4cd..5b75046 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -58,6 +58,7 @@ #define BOOK3S_INTERRUPT_INST_STORAGE 0x400 #define BOOK3S_INTERRUPT_INST_SEGMENT 0x480 #define BOOK3S_INTERRUPT_EXTERNAL 0x500 +#define BOOK3S_INTERRUPT_EXTERNAL_LEVEL 0x501 #define BOOK3S_INTERRUPT_ALIGNMENT 0x600 #define BOOK3S_INTERRUPT_PROGRAM 0x700 #define BOOK3S_INTERRUPT_FP_UNAVAIL 0x800 @@ -84,7 +85,8 @@ #define BOOK3S_IRQPRIO_EXTERNAL 13 #define BOOK3S_IRQPRIO_DECREMENTER 14 #define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR 15 -#define BOOK3S_IRQPRIO_MAX 16 +#define BOOK3S_IRQPRIO_EXTERNAL_LEVEL 16 +#define BOOK3S_IRQPRIO_MAX 17 #define BOOK3S_HFLAG_DCBZ32 0x1 #define BOOK3S_HFLAG_SLB 0x2 diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 8274a2d..d62e703 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -38,15 +38,6 @@ struct kvmppc_slb { bool class : 1; }; -struct kvmppc_sr { - u32 raw; - u32 vsid; - bool Ks : 1; - bool Kp : 1; - bool nx : 1; - bool valid : 1; -}; - struct kvmppc_bat { u64 raw; u32 bepi; @@ -69,6 +60,13 @@ struct kvmppc_sid_map { #define SID_MAP_NUM (1 << SID_MAP_BITS) #define SID_MAP_MASK (SID_MAP_NUM - 1) +#ifdef CONFIG_PPC_BOOK3S_64 +#define SID_CONTEXTS 1 +#else +#define SID_CONTEXTS 128 +#define VSID_POOL_SIZE (SID_CONTEXTS * 16) +#endif + struct kvmppc_vcpu_book3s { struct kvm_vcpu vcpu; struct kvmppc_book3s_shadow_vcpu *shadow_vcpu; @@ -79,20 +77,22 @@ struct kvmppc_vcpu_book3s { u64 vsid; } slb_shadow[64]; u8 slb_shadow_max; - struct kvmppc_sr sr[16]; struct kvmppc_bat ibat[8]; struct kvmppc_bat dbat[8]; u64 hid[6]; u64 gqr[8]; int slb_nr; - u32 dsisr; u64 sdr1; u64 hior; u64 msr_mask; - u64 vsid_first; u64 vsid_next; +#ifdef CONFIG_PPC_BOOK3S_32 + u32 vsid_pool[VSID_POOL_SIZE]; +#else + u64 vsid_first; u64 vsid_max; - int context_id; +#endif + int context_id[SID_CONTEXTS]; ulong prog_flags; /* flags to inject when giving a 700 trap */ }; @@ -131,9 +131,10 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, bool upper, u32 val); extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu); +extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); -extern u32 kvmppc_trampoline_lowmem; -extern u32 kvmppc_trampoline_enter; +extern ulong kvmppc_trampoline_lowmem; +extern ulong kvmppc_trampoline_enter; extern void kvmppc_rmcall(ulong srr0, ulong srr1); extern void kvmppc_load_up_fpu(void); extern void kvmppc_load_up_altivec(void); diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index b0b23c0..bba3b9b 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -25,6 +25,7 @@ #include <linux/interrupt.h> #include <linux/types.h> #include <linux/kvm_types.h> +#include <linux/kvm_para.h> #include <asm/kvm_asm.h> #define KVM_MAX_VCPUS 1 @@ -41,12 +42,17 @@ #define HPTEG_CACHE_NUM (1 << 15) #define HPTEG_HASH_BITS_PTE 13 +#define HPTEG_HASH_BITS_PTE_LONG 12 #define HPTEG_HASH_BITS_VPTE 13 #define HPTEG_HASH_BITS_VPTE_LONG 5 #define HPTEG_HASH_NUM_PTE (1 << HPTEG_HASH_BITS_PTE) +#define HPTEG_HASH_NUM_PTE_LONG (1 << HPTEG_HASH_BITS_PTE_LONG) #define HPTEG_HASH_NUM_VPTE (1 << HPTEG_HASH_BITS_VPTE) #define HPTEG_HASH_NUM_VPTE_LONG (1 << HPTEG_HASH_BITS_VPTE_LONG) +/* Physical Address Mask - allowed range of real mode RAM access */ +#define KVM_PAM 0x0fffffffffffffffULL + struct kvm; struct kvm_run; struct kvm_vcpu; @@ -159,8 +165,10 @@ struct kvmppc_mmu { struct hpte_cache { struct hlist_node list_pte; + struct hlist_node list_pte_long; struct hlist_node list_vpte; struct hlist_node list_vpte_long; + struct rcu_head rcu_head; u64 host_va; u64 pfn; ulong slot; @@ -210,28 +218,20 @@ struct kvm_vcpu_arch { u32 cr; #endif - ulong msr; #ifdef CONFIG_PPC_BOOK3S ulong shadow_msr; ulong hflags; ulong guest_owned_ext; #endif u32 mmucr; - ulong sprg0; - ulong sprg1; - ulong sprg2; - ulong sprg3; ulong sprg4; ulong sprg5; ulong sprg6; ulong sprg7; - ulong srr0; - ulong srr1; ulong csrr0; ulong csrr1; ulong dsrr0; ulong dsrr1; - ulong dear; ulong esr; u32 dec; u32 decar; @@ -290,12 +290,17 @@ struct kvm_vcpu_arch { struct tasklet_struct tasklet; u64 dec_jiffies; unsigned long pending_exceptions; + struct kvm_vcpu_arch_shared *shared; + unsigned long magic_page_pa; /* phys addr to map the magic page to */ + unsigned long magic_page_ea; /* effect. addr to map the magic page to */ #ifdef CONFIG_PPC_BOOK3S struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE]; + struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG]; struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE]; struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG]; int hpte_cache_count; + spinlock_t mmu_lock; #endif }; diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h index 2d48f6a..50533f9 100644 --- a/arch/powerpc/include/asm/kvm_para.h +++ b/arch/powerpc/include/asm/kvm_para.h @@ -20,16 +20,153 @@ #ifndef __POWERPC_KVM_PARA_H__ #define __POWERPC_KVM_PARA_H__ +#include <linux/types.h> + +struct kvm_vcpu_arch_shared { + __u64 scratch1; + __u64 scratch2; + __u64 scratch3; + __u64 critical; /* Guest may not get interrupts if == r1 */ + __u64 sprg0; + __u64 sprg1; + __u64 sprg2; + __u64 sprg3; + __u64 srr0; + __u64 srr1; + __u64 dar; + __u64 msr; + __u32 dsisr; + __u32 int_pending; /* Tells the guest if we have an interrupt */ + __u32 sr[16]; +}; + +#define KVM_SC_MAGIC_R0 0x4b564d21 /* "KVM!" */ +#define HC_VENDOR_KVM (42 << 16) +#define HC_EV_SUCCESS 0 +#define HC_EV_UNIMPLEMENTED 12 + +#define KVM_FEATURE_MAGIC_PAGE 1 + +#define KVM_MAGIC_FEAT_SR (1 << 0) + #ifdef __KERNEL__ +#ifdef CONFIG_KVM_GUEST + +#include <linux/of.h> + +static inline int kvm_para_available(void) +{ + struct device_node *hyper_node; + + hyper_node = of_find_node_by_path("/hypervisor"); + if (!hyper_node) + return 0; + + if (!of_device_is_compatible(hyper_node, "linux,kvm")) + return 0; + + return 1; +} + +extern unsigned long kvm_hypercall(unsigned long *in, + unsigned long *out, + unsigned long nr); + +#else + static inline int kvm_para_available(void) { return 0; } +static unsigned long kvm_hypercall(unsigned long *in, + unsigned long *out, + unsigned long nr) +{ + return HC_EV_UNIMPLEMENTED; +} + +#endif + +static inline long kvm_hypercall0_1(unsigned int nr, unsigned long *r2) +{ + unsigned long in[8]; + unsigned long out[8]; + unsigned long r; + + r = kvm_hypercall(in, out, nr | HC_VENDOR_KVM); + *r2 = out[0]; + + return r; +} + +static inline long kvm_hypercall0(unsigned int nr) +{ + unsigned long in[8]; + unsigned long out[8]; + + return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); +} + +static inline long kvm_hypercall1(unsigned int nr, unsigned long p1) +{ + unsigned long in[8]; + unsigned long out[8]; + + in[0] = p1; + return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); +} + +static inline long kvm_hypercall2(unsigned int nr, unsigned long p1, + unsigned long p2) +{ + unsigned long in[8]; + unsigned long out[8]; + + in[0] = p1; + in[1] = p2; + return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); +} + +static inline long kvm_hypercall3(unsigned int nr, unsigned long p1, + unsigned long p2, unsigned long p3) +{ + unsigned long in[8]; + unsigned long out[8]; + + in[0] = p1; + in[1] = p2; + in[2] = p3; + return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); +} + +static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, + unsigned long p2, unsigned long p3, + unsigned long p4) +{ + unsigned long in[8]; + unsigned long out[8]; + + in[0] = p1; + in[1] = p2; + in[2] = p3; + in[3] = p4; + return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); +} + + static inline unsigned int kvm_arch_para_features(void) { - return 0; + unsigned long r; + + if (!kvm_para_available()) + return 0; + + if(kvm_hypercall0_1(KVM_HC_FEATURES, &r)) + return 0; + + return r; } #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 18d139e..ecb3bc7 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -107,6 +107,7 @@ extern int kvmppc_booke_init(void); extern void kvmppc_booke_exit(void); extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu); +extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu); /* * Cuts out inst bits with ordering according to spec. diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 4ed076a..36c30f3 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -129,6 +129,8 @@ ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC),) obj-y += ppc_save_regs.o endif +obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o + # Disable GCOV in odd or sensitive code GCOV_PROFILE_prom_init.o := n GCOV_PROFILE_ftrace.o := n diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index c3e0194..bd0df2e 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -48,11 +48,11 @@ #ifdef CONFIG_PPC_ISERIES #include <asm/iseries/alpaca.h> #endif -#ifdef CONFIG_KVM +#if defined(CONFIG_KVM) || defined(CONFIG_KVM_GUEST) #include <linux/kvm_host.h> -#ifndef CONFIG_BOOKE -#include <asm/kvm_book3s.h> #endif +#if defined(CONFIG_KVM) && defined(CONFIG_PPC_BOOK3S) +#include <asm/kvm_book3s.h> #endif #ifdef CONFIG_PPC32 @@ -396,12 +396,13 @@ int main(void) DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack)); DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid)); DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr)); - DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.msr)); DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4)); DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5)); DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6)); DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7)); DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid)); + DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared)); + DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr)); /* book3s */ #ifdef CONFIG_PPC_BOOK3S @@ -466,6 +467,22 @@ int main(void) DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr)); #endif /* CONFIG_PPC_BOOK3S */ #endif + +#ifdef CONFIG_KVM_GUEST + DEFINE(KVM_MAGIC_SCRATCH1, offsetof(struct kvm_vcpu_arch_shared, + scratch1)); + DEFINE(KVM_MAGIC_SCRATCH2, offsetof(struct kvm_vcpu_arch_shared, + scratch2)); + DEFINE(KVM_MAGIC_SCRATCH3, offsetof(struct kvm_vcpu_arch_shared, + scratch3)); + DEFINE(KVM_MAGIC_INT, offsetof(struct kvm_vcpu_arch_shared, + int_pending)); + DEFINE(KVM_MAGIC_MSR, offsetof(struct kvm_vcpu_arch_shared, msr)); + DEFINE(KVM_MAGIC_CRITICAL, offsetof(struct kvm_vcpu_arch_shared, + critical)); + DEFINE(KVM_MAGIC_SR, offsetof(struct kvm_vcpu_arch_shared, sr)); +#endif + #ifdef CONFIG_44x DEFINE(PGD_T_LOG2, PGD_T_LOG2); DEFINE(PTE_T_LOG2, PTE_T_LOG2); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 39b0c48..9f8b01d 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -299,6 +299,12 @@ slb_miss_user_pseries: b . /* prevent spec. execution */ #endif /* __DISABLED__ */ +/* KVM's trampoline code needs to be close to the interrupt handlers */ + +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER +#include "../kvm/book3s_rmhandlers.S" +#endif + .align 7 .globl __end_interrupts __end_interrupts: diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index c571cd3..f0dd577 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -166,12 +166,6 @@ exception_marker: #include "exceptions-64s.S" #endif -/* KVM trampoline code needs to be close to the interrupt handlers */ - -#ifdef CONFIG_KVM_BOOK3S_64_HANDLER -#include "../kvm/book3s_rmhandlers.S" -#endif - _GLOBAL(generic_secondary_thread_init) mr r24,r3 diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c new file mode 100644 index 0000000..428d0e5 --- /dev/null +++ b/arch/powerpc/kernel/kvm.c @@ -0,0 +1,596 @@ +/* + * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved. + * + * Authors: + * Alexander Graf <agraf@suse.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <linux/kvm_host.h> +#include <linux/init.h> +#include <linux/kvm_para.h> +#include <linux/slab.h> +#include <linux/of.h> + +#include <asm/reg.h> +#include <asm/sections.h> +#include <asm/cacheflush.h> +#include <asm/disassemble.h> + +#define KVM_MAGIC_PAGE (-4096L) +#define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x) + +#define KVM_INST_LWZ 0x80000000 +#define KVM_INST_STW 0x90000000 +#define KVM_INST_LD 0xe8000000 +#define KVM_INST_STD 0xf8000000 +#define KVM_INST_NOP 0x60000000 +#define KVM_INST_B 0x48000000 +#define KVM_INST_B_MASK 0x03ffffff +#define KVM_INST_B_MAX 0x01ffffff + +#define KVM_MASK_RT 0x03e00000 +#define KVM_RT_30 0x03c00000 +#define KVM_MASK_RB 0x0000f800 +#define KVM_INST_MFMSR 0x7c0000a6 +#define KVM_INST_MFSPR_SPRG0 0x7c1042a6 +#define KVM_INST_MFSPR_SPRG1 0x7c1142a6 +#define KVM_INST_MFSPR_SPRG2 0x7c1242a6 +#define KVM_INST_MFSPR_SPRG3 0x7c1342a6 +#define KVM_INST_MFSPR_SRR0 0x7c1a02a6 +#define KVM_INST_MFSPR_SRR1 0x7c1b02a6 +#define KVM_INST_MFSPR_DAR 0x7c1302a6 +#define KVM_INST_MFSPR_DSISR 0x7c1202a6 + +#define KVM_INST_MTSPR_SPRG0 0x7c1043a6 +#define KVM_INST_MTSPR_SPRG1 0x7c1143a6 +#define KVM_INST_MTSPR_SPRG2 0x7c1243a6 +#define KVM_INST_MTSPR_SPRG3 0x7c1343a6 +#define KVM_INST_MTSPR_SRR0 0x7c1a03a6 +#define KVM_INST_MTSPR_SRR1 0x7c1b03a6 +#define KVM_INST_MTSPR_DAR 0x7c1303a6 +#define KVM_INST_MTSPR_DSISR 0x7c1203a6 + +#define KVM_INST_TLBSYNC 0x7c00046c +#define KVM_INST_MTMSRD_L0 0x7c000164 +#define KVM_INST_MTMSRD_L1 0x7c010164 +#define KVM_INST_MTMSR 0x7c000124 + +#define KVM_INST_WRTEEI_0 0x7c000146 +#define KVM_INST_WRTEEI_1 0x7c008146 + +#define KVM_INST_MTSRIN 0x7c0001e4 + +static bool kvm_patching_worked = true; +static char kvm_tmp[1024 * 1024]; +static int kvm_tmp_index; + +static inline void kvm_patch_ins(u32 *inst, u32 new_inst) +{ + *inst = new_inst; + flush_icache_range((ulong)inst, (ulong)inst + 4); +} + +static void kvm_patch_ins_ll(u32 *inst, long addr, u32 rt) +{ +#ifdef CONFIG_64BIT + kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc)); +#else + kvm_patch_ins(inst, KVM_INST_LWZ | rt | (addr & 0x0000fffc)); +#endif +} + +static void kvm_patch_ins_ld(u32 *inst, long addr, u32 rt) +{ +#ifdef CONFIG_64BIT + kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc)); +#else + kvm_patch_ins(inst, KVM_INST_LWZ | rt | ((addr + 4) & 0x0000fffc)); +#endif +} + +static void kvm_patch_ins_lwz(u32 *inst, long addr, u32 rt) +{ + kvm_patch_ins(inst, KVM_INST_LWZ | rt | (addr & 0x0000ffff)); +} + +static void kvm_patch_ins_std(u32 *inst, long addr, u32 rt) +{ +#ifdef CONFIG_64BIT + kvm_patch_ins(inst, KVM_INST_STD | rt | (addr & 0x0000fffc)); +#else + kvm_patch_ins(inst, KVM_INST_STW | rt | ((addr + 4) & 0x0000fffc)); +#endif +} + +static void kvm_patch_ins_stw(u32 *inst, long addr, u32 rt) +{ + kvm_patch_ins(inst, KVM_INST_STW | rt | (addr & 0x0000fffc)); +} + +static void kvm_patch_ins_nop(u32 *inst) +{ + kvm_patch_ins(inst, KVM_INST_NOP); +} + +static void kvm_patch_ins_b(u32 *inst, int addr) +{ +#ifdef CONFIG_RELOCATABLE + /* On relocatable kernels interrupts handlers and our code + can be in different regions, so we don't patch them */ + + extern u32 __end_interrupts; + if ((ulong)inst < (ulong)&__end_interrupts) + return; +#endif + + kvm_patch_ins(inst, KVM_INST_B | (addr & KVM_INST_B_MASK)); +} + +static u32 *kvm_alloc(int len) +{ + u32 *p; + + if ((kvm_tmp_index + len) > ARRAY_SIZE(kvm_tmp)) { + printk(KERN_ERR "KVM: No more space (%d + %d)\n", + kvm_tmp_index, len); + kvm_patching_worked = false; + return NULL; + } + + p = (void*)&kvm_tmp[kvm_tmp_index]; + kvm_tmp_index += len; + + return p; +} + +extern u32 kvm_emulate_mtmsrd_branch_offs; +extern u32 kvm_emulate_mtmsrd_reg_offs; +extern u32 kvm_emulate_mtmsrd_orig_ins_offs; +extern u32 kvm_emulate_mtmsrd_len; +extern u32 kvm_emulate_mtmsrd[]; + +static void kvm_patch_ins_mtmsrd(u32 *inst, u32 rt) +{ + u32 *p; + int distance_start; + int distance_end; + ulong next_inst; + + p = kvm_alloc(kvm_emulate_mtmsrd_len * 4); + if (!p) + return; + + /* Find out where we are and put everything there */ + distance_start = (ulong)p - (ulong)inst; + next_inst = ((ulong)inst + 4); + distance_end = next_inst - (ulong)&p[kvm_emulate_mtmsrd_branch_offs]; + + /* Make sure we only write valid b instructions */ + if (distance_start > KVM_INST_B_MAX) { + kvm_patching_worked = false; + return; + } + + /* Modify the chunk to fit the invocation */ + memcpy(p, kvm_emulate_mtmsrd, kvm_emulate_mtmsrd_len * 4); + p[kvm_emulate_mtmsrd_branch_offs] |= distance_end & KVM_INST_B_MASK; + switch (get_rt(rt)) { + case 30: + kvm_patch_ins_ll(&p[kvm_emulate_mtmsrd_reg_offs], + magic_var(scratch2), KVM_RT_30); + break; + case 31: + kvm_patch_ins_ll(&p[kvm_emulate_mtmsrd_reg_offs], + magic_var(scratch1), KVM_RT_30); + break; + default: + p[kvm_emulate_mtmsrd_reg_offs] |= rt; + break; + } + + p[kvm_emulate_mtmsrd_orig_ins_offs] = *inst; + flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtmsrd_len * 4); + + /* Patch the invocation */ + kvm_patch_ins_b(inst, distance_start); +} + +extern u32 kvm_emulate_mtmsr_branch_offs; +extern u32 kvm_emulate_mtmsr_reg1_offs; +extern u32 kvm_emulate_mtmsr_reg2_offs; +extern u32 kvm_emulate_mtmsr_orig_ins_offs; +extern u32 kvm_emulate_mtmsr_len; +extern u32 kvm_emulate_mtmsr[]; + +static void kvm_patch_ins_mtmsr(u32 *inst, u32 rt) +{ + u32 *p; + int distance_start; + int distance_end; + ulong next_inst; + + p = kvm_alloc(kvm_emulate_mtmsr_len * 4); + if (!p) + return; + + /* Find out where we are and put everything there */ + distance_start = (ulong)p - (ulong)inst; + next_inst = ((ulong)inst + 4); + distance_end = next_inst - (ulong)&p[kvm_emulate_mtmsr_branch_offs]; + + /* Make sure we only write valid b instructions */ + if (distance_start > KVM_INST_B_MAX) { + kvm_patching_worked = false; + return; + } + + /* Modify the chunk to fit the invocation */ + memcpy(p, kvm_emulate_mtmsr, kvm_emulate_mtmsr_len * 4); + p[kvm_emulate_mtmsr_branch_offs] |= distance_end & KVM_INST_B_MASK; + + /* Make clobbered registers work too */ + switch (get_rt(rt)) { + case 30: + kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg1_offs], + magic_var(scratch2), KVM_RT_30); + kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg2_offs], + magic_var(scratch2), KVM_RT_30); + break; + case 31: + kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg1_offs], + magic_var(scratch1), KVM_RT_30); + kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg2_offs], + magic_var(scratch1), KVM_RT_30); + break; + default: + p[kvm_emulate_mtmsr_reg1_offs] |= rt; + p[kvm_emulate_mtmsr_reg2_offs] |= rt; + break; + } + + p[kvm_emulate_mtmsr_orig_ins_offs] = *inst; + flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtmsr_len * 4); + + /* Patch the invocation */ + kvm_patch_ins_b(inst, distance_start); +} + +#ifdef CONFIG_BOOKE + +extern u32 kvm_emulate_wrteei_branch_offs; +extern u32 kvm_emulate_wrteei_ee_offs; +extern u32 kvm_emulate_wrteei_len; +extern u32 kvm_emulate_wrteei[]; + +static void kvm_patch_ins_wrteei(u32 *inst) +{ + u32 *p; + int distance_start; + int distance_end; + ulong next_inst; + + p = kvm_alloc(kvm_emulate_wrteei_len * 4); + if (!p) + return; + + /* Find out where we are and put everything there */ + distance_start = (ulong)p - (ulong)inst; + next_inst = ((ulong)inst + 4); + distance_end = next_inst - (ulong)&p[kvm_emulate_wrteei_branch_offs]; + + /* Make sure we only write valid b instructions */ + if (distance_start > KVM_INST_B_MAX) { + kvm_patching_worked = false; + return; + } + + /* Modify the chunk to fit the invocation */ + memcpy(p, kvm_emulate_wrteei, kvm_emulate_wrteei_len * 4); + p[kvm_emulate_wrteei_branch_offs] |= distance_end & KVM_INST_B_MASK; + p[kvm_emulate_wrteei_ee_offs] |= (*inst & MSR_EE); + flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrteei_len * 4); + + /* Patch the invocation */ + kvm_patch_ins_b(inst, distance_start); +} + +#endif + +#ifdef CONFIG_PPC_BOOK3S_32 + +extern u32 kvm_emulate_mtsrin_branch_offs; +extern u32 kvm_emulate_mtsrin_reg1_offs; +extern u32 kvm_emulate_mtsrin_reg2_offs; +extern u32 kvm_emulate_mtsrin_orig_ins_offs; +extern u32 kvm_emulate_mtsrin_len; +extern u32 kvm_emulate_mtsrin[]; + +static void kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb) +{ + u32 *p; + int distance_start; + int distance_end; + ulong next_inst; + + p = kvm_alloc(kvm_emulate_mtsrin_len * 4); + if (!p) + return; + + /* Find out where we are and put everything there */ + distance_start = (ulong)p - (ulong)inst; + next_inst = ((ulong)inst + 4); + distance_end = next_inst - (ulong)&p[kvm_emulate_mtsrin_branch_offs]; + + /* Make sure we only write valid b instructions */ + if (distance_start > KVM_INST_B_MAX) { + kvm_patching_worked = false; + return; + } + + /* Modify the chunk to fit the invocation */ + memcpy(p, kvm_emulate_mtsrin, kvm_emulate_mtsrin_len * 4); + p[kvm_emulate_mtsrin_branch_offs] |= distance_end & KVM_INST_B_MASK; + p[kvm_emulate_mtsrin_reg1_offs] |= (rb << 10); + p[kvm_emulate_mtsrin_reg2_offs] |= rt; + p[kvm_emulate_mtsrin_orig_ins_offs] = *inst; + flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtsrin_len * 4); + + /* Patch the invocation */ + kvm_patch_ins_b(inst, distance_start); +} + +#endif + +static void kvm_map_magic_page(void *data) +{ + u32 *features = data; + + ulong in[8]; + ulong out[8]; + + in[0] = KVM_MAGIC_PAGE; + in[1] = KVM_MAGIC_PAGE; + + kvm_hypercall(in, out, HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE); + + *features = out[0]; +} + +static void kvm_check_ins(u32 *inst, u32 features) +{ + u32 _inst = *inst; + u32 inst_no_rt = _inst & ~KVM_MASK_RT; + u32 inst_rt = _inst & KVM_MASK_RT; + + switch (inst_no_rt) { + /* Loads */ + case KVM_INST_MFMSR: + kvm_patch_ins_ld(inst, magic_var(msr), inst_rt); + break; + case KVM_INST_MFSPR_SPRG0: + kvm_patch_ins_ld(inst, magic_var(sprg0), inst_rt); + break; + case KVM_INST_MFSPR_SPRG1: + kvm_patch_ins_ld(inst, magic_var(sprg1), inst_rt); + break; + case KVM_INST_MFSPR_SPRG2: + kvm_patch_ins_ld(inst, magic_var(sprg2), inst_rt); + break; + case KVM_INST_MFSPR_SPRG3: + kvm_patch_ins_ld(inst, magic_var(sprg3), inst_rt); + break; + case KVM_INST_MFSPR_SRR0: + kvm_patch_ins_ld(inst, magic_var(srr0), inst_rt); + break; + case KVM_INST_MFSPR_SRR1: + kvm_patch_ins_ld(inst, magic_var(srr1), inst_rt); + break; + case KVM_INST_MFSPR_DAR: + kvm_patch_ins_ld(inst, magic_var(dar), inst_rt); + break; + case KVM_INST_MFSPR_DSISR: + kvm_patch_ins_lwz(inst, magic_var(dsisr), inst_rt); + break; + + /* Stores */ + case KVM_INST_MTSPR_SPRG0: + kvm_patch_ins_std(inst, magic_var(sprg0), inst_rt); + break; + case KVM_INST_MTSPR_SPRG1: + kvm_patch_ins_std(inst, magic_var(sprg1), inst_rt); + break; + case KVM_INST_MTSPR_SPRG2: + kvm_patch_ins_std(inst, magic_var(sprg2), inst_rt); + break; + case KVM_INST_MTSPR_SPRG3: + kvm_patch_ins_std(inst, magic_var(sprg3), inst_rt); + break; + case KVM_INST_MTSPR_SRR0: + kvm_patch_ins_std(inst, magic_var(srr0), inst_rt); + break; + case KVM_INST_MTSPR_SRR1: + kvm_patch_ins_std(inst, magic_var(srr1), inst_rt); + break; + case KVM_INST_MTSPR_DAR: + kvm_patch_ins_std(inst, magic_var(dar), inst_rt); + break; + case KVM_INST_MTSPR_DSISR: + kvm_patch_ins_stw(inst, magic_var(dsisr), inst_rt); + break; + + /* Nops */ + case KVM_INST_TLBSYNC: + kvm_patch_ins_nop(inst); + break; + + /* Rewrites */ + case KVM_INST_MTMSRD_L1: + kvm_patch_ins_mtmsrd(inst, inst_rt); + break; + case KVM_INST_MTMSR: + case KVM_INST_MTMSRD_L0: + kvm_patch_ins_mtmsr(inst, inst_rt); + break; + } + + switch (inst_no_rt & ~KVM_MASK_RB) { +#ifdef CONFIG_PPC_BOOK3S_32 + case KVM_INST_MTSRIN: + if (features & KVM_MAGIC_FEAT_SR) { + u32 inst_rb = _inst & KVM_MASK_RB; + kvm_patch_ins_mtsrin(inst, inst_rt, inst_rb); + } + break; + break; +#endif + } + + switch (_inst) { +#ifdef CONFIG_BOOKE + case KVM_INST_WRTEEI_0: + case KVM_INST_WRTEEI_1: + kvm_patch_ins_wrteei(inst); + break; +#endif + } +} + +static void kvm_use_magic_page(void) +{ + u32 *p; + u32 *start, *end; + u32 tmp; + u32 features; + + /* Tell the host to map the magic page to -4096 on all CPUs */ + on_each_cpu(kvm_map_magic_page, &features, 1); + + /* Quick self-test to see if the mapping works */ + if (__get_user(tmp, (u32*)KVM_MAGIC_PAGE)) { + kvm_patching_worked = false; + return; + } + + /* Now loop through all code and find instructions */ + start = (void*)_stext; + end = (void*)_etext; + + for (p = start; p < end; p++) + kvm_check_ins(p, features); + + printk(KERN_INFO "KVM: Live patching for a fast VM %s\n", + kvm_patching_worked ? "worked" : "failed"); +} + +unsigned long kvm_hypercall(unsigned long *in, + unsigned long *out, + unsigned long nr) +{ + unsigned long register r0 asm("r0"); + unsigned long register r3 asm("r3") = in[0]; + unsigned long register r4 asm("r4") = in[1]; + unsigned long register r5 asm("r5") = in[2]; + unsigned long register r6 asm("r6") = in[3]; + unsigned long register r7 asm("r7") = in[4]; + unsigned long register r8 asm("r8") = in[5]; + unsigned long register r9 asm("r9") = in[6]; + unsigned long register r10 asm("r10") = in[7]; + unsigned long register r11 asm("r11") = nr; + unsigned long register r12 asm("r12"); + + asm volatile("bl kvm_hypercall_start" + : "=r"(r0), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6), + "=r"(r7), "=r"(r8), "=r"(r9), "=r"(r10), "=r"(r11), + "=r"(r12) + : "r"(r3), "r"(r4), "r"(r5), "r"(r6), "r"(r7), "r"(r8), + "r"(r9), "r"(r10), "r"(r11) + : "memory", "cc", "xer", "ctr", "lr"); + + out[0] = r4; + out[1] = r5; + out[2] = r6; + out[3] = r7; + out[4] = r8; + out[5] = r9; + out[6] = r10; + out[7] = r11; + + return r3; +} +EXPORT_SYMBOL_GPL(kvm_hypercall); + +static int kvm_para_setup(void) +{ + extern u32 kvm_hypercall_start; + struct device_node *hyper_node; + u32 *insts; + int len, i; + + hyper_node = of_find_node_by_path("/hypervisor"); + if (!hyper_node) + return -1; + + insts = (u32*)of_get_property(hyper_node, "hcall-instructions", &len); + if (len % 4) + return -1; + if (len > (4 * 4)) + return -1; + + for (i = 0; i < (len / 4); i++) + kvm_patch_ins(&(&kvm_hypercall_start)[i], insts[i]); + + return 0; +} + +static __init void kvm_free_tmp(void) +{ + unsigned long start, end; + + start = (ulong)&kvm_tmp[kvm_tmp_index + (PAGE_SIZE - 1)] & PAGE_MASK; + end = (ulong)&kvm_tmp[ARRAY_SIZE(kvm_tmp)] & PAGE_MASK; + + /* Free the tmp space we don't need */ + for (; start < end; start += PAGE_SIZE) { + ClearPageReserved(virt_to_page(start)); + init_page_count(virt_to_page(start)); + free_page(start); + totalram_pages++; + } +} + +static int __init kvm_guest_init(void) +{ + if (!kvm_para_available()) + goto free_tmp; + + if (kvm_para_setup()) + goto free_tmp; + + if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE)) + kvm_use_magic_page(); + +#ifdef CONFIG_PPC_BOOK3S_64 + /* Enable napping */ + powersave_nap = 1; +#endif + +free_tmp: + kvm_free_tmp(); + + return 0; +} + +postcore_initcall(kvm_guest_init); diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S new file mode 100644 index 0000000..f2b1b25 --- /dev/null +++ b/arch/powerpc/kernel/kvm_emul.S @@ -0,0 +1,302 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright SUSE Linux Products GmbH 2010 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#include <asm/ppc_asm.h> +#include <asm/kvm_asm.h> +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/asm-offsets.h> + +/* Hypercall entry point. Will be patched with device tree instructions. */ + +.global kvm_hypercall_start +kvm_hypercall_start: + li r3, -1 + nop + nop + nop + blr + +#define KVM_MAGIC_PAGE (-4096) + +#ifdef CONFIG_64BIT +#define LL64(reg, offs, reg2) ld reg, (offs)(reg2) +#define STL64(reg, offs, reg2) std reg, (offs)(reg2) +#else +#define LL64(reg, offs, reg2) lwz reg, (offs + 4)(reg2) +#define STL64(reg, offs, reg2) stw reg, (offs + 4)(reg2) +#endif + +#define SCRATCH_SAVE \ + /* Enable critical section. We are critical if \ + shared->critical == r1 */ \ + STL64(r1, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0); \ + \ + /* Save state */ \ + PPC_STL r31, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH1)(0); \ + PPC_STL r30, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH2)(0); \ + mfcr r31; \ + stw r31, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH3)(0); + +#define SCRATCH_RESTORE \ + /* Restore state */ \ + PPC_LL r31, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH1)(0); \ + lwz r30, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH3)(0); \ + mtcr r30; \ + PPC_LL r30, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH2)(0); \ + \ + /* Disable critical section. We are critical if \ + shared->critical == r1 and r2 is always != r1 */ \ + STL64(r2, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0); + +.global kvm_emulate_mtmsrd +kvm_emulate_mtmsrd: + + SCRATCH_SAVE + + /* Put MSR & ~(MSR_EE|MSR_RI) in r31 */ + LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) + lis r30, (~(MSR_EE | MSR_RI))@h + ori r30, r30, (~(MSR_EE | MSR_RI))@l + and r31, r31, r30 + + /* OR the register's (MSR_EE|MSR_RI) on MSR */ +kvm_emulate_mtmsrd_reg: + ori r30, r0, 0 + andi. r30, r30, (MSR_EE|MSR_RI) + or r31, r31, r30 + + /* Put MSR back into magic page */ + STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) + + /* Check if we have to fetch an interrupt */ + lwz r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0) + cmpwi r31, 0 + beq+ no_check + + /* Check if we may trigger an interrupt */ + andi. r30, r30, MSR_EE + beq no_check + + SCRATCH_RESTORE + + /* Nag hypervisor */ +kvm_emulate_mtmsrd_orig_ins: + tlbsync + + b kvm_emulate_mtmsrd_branch + +no_check: + + SCRATCH_RESTORE + + /* Go back to caller */ +kvm_emulate_mtmsrd_branch: + b . +kvm_emulate_mtmsrd_end: + +.global kvm_emulate_mtmsrd_branch_offs +kvm_emulate_mtmsrd_branch_offs: + .long (kvm_emulate_mtmsrd_branch - kvm_emulate_mtmsrd) / 4 + +.global kvm_emulate_mtmsrd_reg_offs +kvm_emulate_mtmsrd_reg_offs: + .long (kvm_emulate_mtmsrd_reg - kvm_emulate_mtmsrd) / 4 + +.global kvm_emulate_mtmsrd_orig_ins_offs +kvm_emulate_mtmsrd_orig_ins_offs: + .long (kvm_emulate_mtmsrd_orig_ins - kvm_emulate_mtmsrd) / 4 + +.global kvm_emulate_mtmsrd_len +kvm_emulate_mtmsrd_len: + .long (kvm_emulate_mtmsrd_end - kvm_emulate_mtmsrd) / 4 + + +#define MSR_SAFE_BITS (MSR_EE | MSR_CE | MSR_ME | MSR_RI) +#define MSR_CRITICAL_BITS ~MSR_SAFE_BITS + +.global kvm_emulate_mtmsr +kvm_emulate_mtmsr: + + SCRATCH_SAVE + + /* Fetch old MSR in r31 */ + LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) + + /* Find the changed bits between old and new MSR */ +kvm_emulate_mtmsr_reg1: + ori r30, r0, 0 + xor r31, r30, r31 + + /* Check if we need to really do mtmsr */ + LOAD_REG_IMMEDIATE(r30, MSR_CRITICAL_BITS) + and. r31, r31, r30 + + /* No critical bits changed? Maybe we can stay in the guest. */ + beq maybe_stay_in_guest + +do_mtmsr: + + SCRATCH_RESTORE + + /* Just fire off the mtmsr if it's critical */ +kvm_emulate_mtmsr_orig_ins: + mtmsr r0 + + b kvm_emulate_mtmsr_branch + +maybe_stay_in_guest: + + /* Get the target register in r30 */ +kvm_emulate_mtmsr_reg2: + ori r30, r0, 0 + + /* Check if we have to fetch an interrupt */ + lwz r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0) + cmpwi r31, 0 + beq+ no_mtmsr + + /* Check if we may trigger an interrupt */ + andi. r31, r30, MSR_EE + beq no_mtmsr + + b do_mtmsr + +no_mtmsr: + + /* Put MSR into magic page because we don't call mtmsr */ + STL64(r30, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) + + SCRATCH_RESTORE + + /* Go back to caller */ +kvm_emulate_mtmsr_branch: + b . +kvm_emulate_mtmsr_end: + +.global kvm_emulate_mtmsr_branch_offs +kvm_emulate_mtmsr_branch_offs: + .long (kvm_emulate_mtmsr_branch - kvm_emulate_mtmsr) / 4 + +.global kvm_emulate_mtmsr_reg1_offs +kvm_emulate_mtmsr_reg1_offs: + .long (kvm_emulate_mtmsr_reg1 - kvm_emulate_mtmsr) / 4 + +.global kvm_emulate_mtmsr_reg2_offs +kvm_emulate_mtmsr_reg2_offs: + .long (kvm_emulate_mtmsr_reg2 - kvm_emulate_mtmsr) / 4 + +.global kvm_emulate_mtmsr_orig_ins_offs +kvm_emulate_mtmsr_orig_ins_offs: + .long (kvm_emulate_mtmsr_orig_ins - kvm_emulate_mtmsr) / 4 + +.global kvm_emulate_mtmsr_len +kvm_emulate_mtmsr_len: + .long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4 + + + +.global kvm_emulate_wrteei +kvm_emulate_wrteei: + + SCRATCH_SAVE + + /* Fetch old MSR in r31 */ + LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) + + /* Remove MSR_EE from old MSR */ + li r30, 0 + ori r30, r30, MSR_EE + andc r31, r31, r30 + + /* OR new MSR_EE onto the old MSR */ +kvm_emulate_wrteei_ee: + ori r31, r31, 0 + + /* Write new MSR value back */ + STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) + + SCRATCH_RESTORE + + /* Go back to caller */ +kvm_emulate_wrteei_branch: + b . +kvm_emulate_wrteei_end: + +.global kvm_emulate_wrteei_branch_offs +kvm_emulate_wrteei_branch_offs: + .long (kvm_emulate_wrteei_branch - kvm_emulate_wrteei) / 4 + +.global kvm_emulate_wrteei_ee_offs +kvm_emulate_wrteei_ee_offs: + .long (kvm_emulate_wrteei_ee - kvm_emulate_wrteei) / 4 + +.global kvm_emulate_wrteei_len +kvm_emulate_wrteei_len: + .long (kvm_emulate_wrteei_end - kvm_emulate_wrteei) / 4 + + +.global kvm_emulate_mtsrin +kvm_emulate_mtsrin: + + SCRATCH_SAVE + + LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) + andi. r31, r31, MSR_DR | MSR_IR + beq kvm_emulate_mtsrin_reg1 + + SCRATCH_RESTORE + +kvm_emulate_mtsrin_orig_ins: + nop + b kvm_emulate_mtsrin_branch + +kvm_emulate_mtsrin_reg1: + /* rX >> 26 */ + rlwinm r30,r0,6,26,29 + +kvm_emulate_mtsrin_reg2: + stw r0, (KVM_MAGIC_PAGE + KVM_MAGIC_SR)(r30) + + SCRATCH_RESTORE + + /* Go back to caller */ +kvm_emulate_mtsrin_branch: + b . +kvm_emulate_mtsrin_end: + +.global kvm_emulate_mtsrin_branch_offs +kvm_emulate_mtsrin_branch_offs: + .long (kvm_emulate_mtsrin_branch - kvm_emulate_mtsrin) / 4 + +.global kvm_emulate_mtsrin_reg1_offs +kvm_emulate_mtsrin_reg1_offs: + .long (kvm_emulate_mtsrin_reg1 - kvm_emulate_mtsrin) / 4 + +.global kvm_emulate_mtsrin_reg2_offs +kvm_emulate_mtsrin_reg2_offs: + .long (kvm_emulate_mtsrin_reg2 - kvm_emulate_mtsrin) / 4 + +.global kvm_emulate_mtsrin_orig_ins_offs +kvm_emulate_mtsrin_orig_ins_offs: + .long (kvm_emulate_mtsrin_orig_ins - kvm_emulate_mtsrin) / 4 + +.global kvm_emulate_mtsrin_len +kvm_emulate_mtsrin_len: + .long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4 diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index 73c0a3f..74d0e74 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -43,7 +43,7 @@ int kvmppc_core_check_processor_compat(void) { int r; - if (strcmp(cur_cpu_spec->platform, "ppc440") == 0) + if (strncmp(cur_cpu_spec->platform, "ppc440", 6) == 0) r = 0; else r = -ENOTSUPP; @@ -72,6 +72,7 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) /* Since the guest can directly access the timebase, it must know the * real timebase frequency. Accordingly, it must see the state of * CCR1[TCS]. */ + /* XXX CCR1 doesn't exist on all 440 SoCs. */ vcpu->arch.ccr1 = mfspr(SPRN_CCR1); for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++) @@ -123,8 +124,14 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) if (err) goto free_vcpu; + vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO); + if (!vcpu->arch.shared) + goto uninit_vcpu; + return vcpu; +uninit_vcpu: + kvm_vcpu_uninit(vcpu); free_vcpu: kmem_cache_free(kvm_vcpu_cache, vcpu_44x); out: @@ -135,6 +142,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu); + free_page((unsigned long)vcpu->arch.shared); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vcpu_44x); } diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index 9b9b5cd..5f3cff8 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -47,6 +47,7 @@ #ifdef DEBUG void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu) { + struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu); struct kvmppc_44x_tlbe *tlbe; int i; @@ -221,14 +222,14 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int gtlb_index, int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) { - unsigned int as = !!(vcpu->arch.msr & MSR_IS); + unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS); return kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as); } int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) { - unsigned int as = !!(vcpu->arch.msr & MSR_DS); + unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS); return kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as); } @@ -354,7 +355,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, stlbe.word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf); stlbe.word2 = kvmppc_44x_tlb_shadow_attrib(flags, - vcpu->arch.msr & MSR_PR); + vcpu->arch.shared->msr & MSR_PR); stlbe.tid = !(asid & 0xff); /* Keep track of the reference so we can properly release it later. */ @@ -423,7 +424,7 @@ static int tlbe_is_host_safe(const struct kvm_vcpu *vcpu, /* Does it match current guest AS? */ /* XXX what about IS != DS? */ - if (get_tlb_ts(tlbe) != !!(vcpu->arch.msr & MSR_IS)) + if (get_tlb_ts(tlbe) != !!(vcpu->arch.shared->msr & MSR_IS)) return 0; gpa = get_tlb_raddr(tlbe); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index a3cef30..e316847 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -17,6 +17,7 @@ #include <linux/kvm_host.h> #include <linux/err.h> #include <linux/slab.h> +#include "trace.h" #include <asm/reg.h> #include <asm/cputable.h> @@ -35,7 +36,6 @@ #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU /* #define EXIT_DEBUG */ -/* #define EXIT_DEBUG_SIMPLE */ /* #define DEBUG_EXT */ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, @@ -105,65 +105,71 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) kvmppc_giveup_ext(vcpu, MSR_VSX); } -#if defined(EXIT_DEBUG) -static u32 kvmppc_get_dec(struct kvm_vcpu *vcpu) -{ - u64 jd = mftb() - vcpu->arch.dec_jiffies; - return vcpu->arch.dec - jd; -} -#endif - static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) { - vcpu->arch.shadow_msr = vcpu->arch.msr; + ulong smsr = vcpu->arch.shared->msr; + /* Guest MSR values */ - vcpu->arch.shadow_msr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | - MSR_BE | MSR_DE; + smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE; /* Process MSR values */ - vcpu->arch.shadow_msr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | - MSR_EE; + smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE; /* External providers the guest reserved */ - vcpu->arch.shadow_msr |= (vcpu->arch.msr & vcpu->arch.guest_owned_ext); + smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext); /* 64-bit Process MSR values */ #ifdef CONFIG_PPC_BOOK3S_64 - vcpu->arch.shadow_msr |= MSR_ISF | MSR_HV; + smsr |= MSR_ISF | MSR_HV; #endif + vcpu->arch.shadow_msr = smsr; } void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) { - ulong old_msr = vcpu->arch.msr; + ulong old_msr = vcpu->arch.shared->msr; #ifdef EXIT_DEBUG printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr); #endif msr &= to_book3s(vcpu)->msr_mask; - vcpu->arch.msr = msr; + vcpu->arch.shared->msr = msr; kvmppc_recalc_shadow_msr(vcpu); - if (msr & (MSR_WE|MSR_POW)) { + if (msr & MSR_POW) { if (!vcpu->arch.pending_exceptions) { kvm_vcpu_block(vcpu); vcpu->stat.halt_wakeup++; + + /* Unset POW bit after we woke up */ + msr &= ~MSR_POW; + vcpu->arch.shared->msr = msr; } } - if ((vcpu->arch.msr & (MSR_PR|MSR_IR|MSR_DR)) != + if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) != (old_msr & (MSR_PR|MSR_IR|MSR_DR))) { kvmppc_mmu_flush_segments(vcpu); kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); + + /* Preload magic page segment when in kernel mode */ + if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) { + struct kvm_vcpu_arch *a = &vcpu->arch; + + if (msr & MSR_DR) + kvmppc_mmu_map_segment(vcpu, a->magic_page_ea); + else + kvmppc_mmu_map_segment(vcpu, a->magic_page_pa); + } } /* Preload FPU if it's enabled */ - if (vcpu->arch.msr & MSR_FP) + if (vcpu->arch.shared->msr & MSR_FP) kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); } void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags) { - vcpu->arch.srr0 = kvmppc_get_pc(vcpu); - vcpu->arch.srr1 = vcpu->arch.msr | flags; + vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu); + vcpu->arch.shared->srr1 = vcpu->arch.shared->msr | flags; kvmppc_set_pc(vcpu, to_book3s(vcpu)->hior + vec); vcpu->arch.mmu.reset_msr(vcpu); } @@ -180,6 +186,7 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec) case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE; break; case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT; break; case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL; break; + case 0x501: prio = BOOK3S_IRQPRIO_EXTERNAL_LEVEL; break; case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT; break; case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM; break; case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL; break; @@ -199,6 +206,9 @@ static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, { clear_bit(kvmppc_book3s_vec2irqprio(vec), &vcpu->arch.pending_exceptions); + + if (!vcpu->arch.pending_exceptions) + vcpu->arch.shared->int_pending = 0; } void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) @@ -237,13 +247,19 @@ void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu) void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { - kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL); + unsigned int vec = BOOK3S_INTERRUPT_EXTERNAL; + + if (irq->irq == KVM_INTERRUPT_SET_LEVEL) + vec = BOOK3S_INTERRUPT_EXTERNAL_LEVEL; + + kvmppc_book3s_queue_irqprio(vcpu, vec); } void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL); + kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL); } int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) @@ -251,14 +267,29 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) int deliver = 1; int vec = 0; ulong flags = 0ULL; + ulong crit_raw = vcpu->arch.shared->critical; + ulong crit_r1 = kvmppc_get_gpr(vcpu, 1); + bool crit; + + /* Truncate crit indicators in 32 bit mode */ + if (!(vcpu->arch.shared->msr & MSR_SF)) { + crit_raw &= 0xffffffff; + crit_r1 &= 0xffffffff; + } + + /* Critical section when crit == r1 */ + crit = (crit_raw == crit_r1); + /* ... and we're in supervisor mode */ + crit = crit && !(vcpu->arch.shared->msr & MSR_PR); switch (priority) { case BOOK3S_IRQPRIO_DECREMENTER: - deliver = vcpu->arch.msr & MSR_EE; + deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit; vec = BOOK3S_INTERRUPT_DECREMENTER; break; case BOOK3S_IRQPRIO_EXTERNAL: - deliver = vcpu->arch.msr & MSR_EE; + case BOOK3S_IRQPRIO_EXTERNAL_LEVEL: + deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit; vec = BOOK3S_INTERRUPT_EXTERNAL; break; case BOOK3S_IRQPRIO_SYSTEM_RESET: @@ -320,9 +351,27 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) return deliver; } +/* + * This function determines if an irqprio should be cleared once issued. + */ +static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority) +{ + switch (priority) { + case BOOK3S_IRQPRIO_DECREMENTER: + /* DEC interrupts get cleared by mtdec */ + return false; + case BOOK3S_IRQPRIO_EXTERNAL_LEVEL: + /* External interrupts get cleared by userspace */ + return false; + } + + return true; +} + void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) { unsigned long *pending = &vcpu->arch.pending_exceptions; + unsigned long old_pending = vcpu->arch.pending_exceptions; unsigned int priority; #ifdef EXIT_DEBUG @@ -332,8 +381,7 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) priority = __ffs(*pending); while (priority < BOOK3S_IRQPRIO_MAX) { if (kvmppc_book3s_irqprio_deliver(vcpu, priority) && - (priority != BOOK3S_IRQPRIO_DECREMENTER)) { - /* DEC interrupts get cleared by mtdec */ + clear_irqprio(vcpu, priority)) { clear_bit(priority, &vcpu->arch.pending_exceptions); break; } @@ -342,6 +390,12 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) BITS_PER_BYTE * sizeof(*pending), priority + 1); } + + /* Tell the guest about our interrupt status */ + if (*pending) + vcpu->arch.shared->int_pending = 1; + else if (old_pending) + vcpu->arch.shared->int_pending = 0; } void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) @@ -398,6 +452,25 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) } } +pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + ulong mp_pa = vcpu->arch.magic_page_pa; + + /* Magic page override */ + if (unlikely(mp_pa) && + unlikely(((gfn << PAGE_SHIFT) & KVM_PAM) == + ((mp_pa & PAGE_MASK) & KVM_PAM))) { + ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; + pfn_t pfn; + + pfn = (pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT; + get_page(pfn_to_page(pfn)); + return pfn; + } + + return gfn_to_pfn(vcpu->kvm, gfn); +} + /* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to * emulate 32 bytes dcbz length. @@ -415,8 +488,10 @@ static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) int i; hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT); - if (is_error_page(hpage)) + if (is_error_page(hpage)) { + kvm_release_page_clean(hpage); return; + } hpage_offset = pte->raddr & ~PAGE_MASK; hpage_offset &= ~0xFFFULL; @@ -437,14 +512,14 @@ static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data, struct kvmppc_pte *pte) { - int relocated = (vcpu->arch.msr & (data ? MSR_DR : MSR_IR)); + int relocated = (vcpu->arch.shared->msr & (data ? MSR_DR : MSR_IR)); int r; if (relocated) { r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data); } else { pte->eaddr = eaddr; - pte->raddr = eaddr & 0xffffffff; + pte->raddr = eaddr & KVM_PAM; pte->vpage = VSID_REAL | eaddr >> 12; pte->may_read = true; pte->may_write = true; @@ -533,6 +608,13 @@ mmio: static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) { + ulong mp_pa = vcpu->arch.magic_page_pa; + + if (unlikely(mp_pa) && + unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) { + return 1; + } + return kvm_is_visible_gfn(vcpu->kvm, gfn); } @@ -545,8 +627,8 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, int page_found = 0; struct kvmppc_pte pte; bool is_mmio = false; - bool dr = (vcpu->arch.msr & MSR_DR) ? true : false; - bool ir = (vcpu->arch.msr & MSR_IR) ? true : false; + bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false; + bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false; u64 vsid; relocated = data ? dr : ir; @@ -558,12 +640,12 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, pte.may_execute = true; pte.may_read = true; pte.may_write = true; - pte.raddr = eaddr & 0xffffffff; + pte.raddr = eaddr & KVM_PAM; pte.eaddr = eaddr; pte.vpage = eaddr >> 12; } - switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) { + switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { case 0: pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12)); break; @@ -571,7 +653,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, case MSR_IR: vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid); - if ((vcpu->arch.msr & (MSR_DR|MSR_IR)) == MSR_DR) + if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR) pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12)); else pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12)); @@ -594,20 +676,23 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, if (page_found == -ENOENT) { /* Page not found in guest PTE entries */ - vcpu->arch.dear = kvmppc_get_fault_dar(vcpu); - to_book3s(vcpu)->dsisr = to_svcpu(vcpu)->fault_dsisr; - vcpu->arch.msr |= (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); + vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); + vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; + vcpu->arch.shared->msr |= + (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EPERM) { /* Storage protection */ - vcpu->arch.dear = kvmppc_get_fault_dar(vcpu); - to_book3s(vcpu)->dsisr = to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE; - to_book3s(vcpu)->dsisr |= DSISR_PROTFAULT; - vcpu->arch.msr |= (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); + vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); + vcpu->arch.shared->dsisr = + to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE; + vcpu->arch.shared->dsisr |= DSISR_PROTFAULT; + vcpu->arch.shared->msr |= + (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EINVAL) { /* Page not found in guest SLB */ - vcpu->arch.dear = kvmppc_get_fault_dar(vcpu); + vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80); } else if (!is_mmio && kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) { @@ -695,9 +780,11 @@ static int kvmppc_read_inst(struct kvm_vcpu *vcpu) ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false); if (ret == -ENOENT) { - vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 33, 33, 1); - vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 34, 36, 0); - vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 42, 47, 0); + ulong msr = vcpu->arch.shared->msr; + + msr = kvmppc_set_field(msr, 33, 33, 1); + msr = kvmppc_set_field(msr, 34, 36, 0); + vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0); kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE); return EMULATE_AGAIN; } @@ -736,7 +823,7 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE) return RESUME_GUEST; - if (!(vcpu->arch.msr & msr)) { + if (!(vcpu->arch.shared->msr & msr)) { kvmppc_book3s_queue_irqprio(vcpu, exit_nr); return RESUME_GUEST; } @@ -796,16 +883,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, run->exit_reason = KVM_EXIT_UNKNOWN; run->ready_for_interrupt_injection = 1; -#ifdef EXIT_DEBUG - printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | dar=0x%lx | dec=0x%x | msr=0x%lx\n", - exit_nr, kvmppc_get_pc(vcpu), kvmppc_get_fault_dar(vcpu), - kvmppc_get_dec(vcpu), to_svcpu(vcpu)->shadow_srr1); -#elif defined (EXIT_DEBUG_SIMPLE) - if ((exit_nr != 0x900) && (exit_nr != 0x500)) - printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | dar=0x%lx | msr=0x%lx\n", - exit_nr, kvmppc_get_pc(vcpu), kvmppc_get_fault_dar(vcpu), - vcpu->arch.msr); -#endif + + trace_kvm_book3s_exit(exit_nr, vcpu); kvm_resched(vcpu); switch (exit_nr) { case BOOK3S_INTERRUPT_INST_STORAGE: @@ -836,9 +915,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL); r = RESUME_GUEST; } else { - vcpu->arch.msr |= to_svcpu(vcpu)->shadow_srr1 & 0x58000000; + vcpu->arch.shared->msr |= + to_svcpu(vcpu)->shadow_srr1 & 0x58000000; kvmppc_book3s_queue_irqprio(vcpu, exit_nr); - kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL); r = RESUME_GUEST; } break; @@ -861,17 +940,16 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) { r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr); } else { - vcpu->arch.dear = dar; - to_book3s(vcpu)->dsisr = to_svcpu(vcpu)->fault_dsisr; + vcpu->arch.shared->dar = dar; + vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; kvmppc_book3s_queue_irqprio(vcpu, exit_nr); - kvmppc_mmu_pte_flush(vcpu, vcpu->arch.dear, ~0xFFFUL); r = RESUME_GUEST; } break; } case BOOK3S_INTERRUPT_DATA_SEGMENT: if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) { - vcpu->arch.dear = kvmppc_get_fault_dar(vcpu); + vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_SEGMENT); } @@ -904,7 +982,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, program_interrupt: flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull; - if (vcpu->arch.msr & MSR_PR) { + if (vcpu->arch.shared->msr & MSR_PR) { #ifdef EXIT_DEBUG printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu)); #endif @@ -941,10 +1019,10 @@ program_interrupt: break; } case BOOK3S_INTERRUPT_SYSCALL: - // XXX make user settable if (vcpu->arch.osi_enabled && (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) && (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) { + /* MOL hypercalls */ u64 *gprs = run->osi.gprs; int i; @@ -953,8 +1031,13 @@ program_interrupt: gprs[i] = kvmppc_get_gpr(vcpu, i); vcpu->arch.osi_needed = 1; r = RESUME_HOST_NV; - + } else if (!(vcpu->arch.shared->msr & MSR_PR) && + (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) { + /* KVM PV hypercalls */ + kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); + r = RESUME_GUEST; } else { + /* Guest syscalls */ vcpu->stat.syscall_exits++; kvmppc_book3s_queue_irqprio(vcpu, exit_nr); r = RESUME_GUEST; @@ -989,9 +1072,9 @@ program_interrupt: } case BOOK3S_INTERRUPT_ALIGNMENT: if (kvmppc_read_inst(vcpu) == EMULATE_DONE) { - to_book3s(vcpu)->dsisr = kvmppc_alignment_dsisr(vcpu, + vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu, kvmppc_get_last_inst(vcpu)); - vcpu->arch.dear = kvmppc_alignment_dar(vcpu, + vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu, kvmppc_get_last_inst(vcpu)); kvmppc_book3s_queue_irqprio(vcpu, exit_nr); } @@ -1031,9 +1114,7 @@ program_interrupt: } } -#ifdef EXIT_DEBUG - printk(KERN_EMERG "KVM exit: vcpu=0x%p pc=0x%lx r=0x%x\n", vcpu, kvmppc_get_pc(vcpu), r); -#endif + trace_kvm_book3s_reenter(r, vcpu); return r; } @@ -1052,14 +1133,14 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->ctr = kvmppc_get_ctr(vcpu); regs->lr = kvmppc_get_lr(vcpu); regs->xer = kvmppc_get_xer(vcpu); - regs->msr = vcpu->arch.msr; - regs->srr0 = vcpu->arch.srr0; - regs->srr1 = vcpu->arch.srr1; + regs->msr = vcpu->arch.shared->msr; + regs->srr0 = vcpu->arch.shared->srr0; + regs->srr1 = vcpu->arch.shared->srr1; regs->pid = vcpu->arch.pid; - regs->sprg0 = vcpu->arch.sprg0; - regs->sprg1 = vcpu->arch.sprg1; - regs->sprg2 = vcpu->arch.sprg2; - regs->sprg3 = vcpu->arch.sprg3; + regs->sprg0 = vcpu->arch.shared->sprg0; + regs->sprg1 = vcpu->arch.shared->sprg1; + regs->sprg2 = vcpu->arch.shared->sprg2; + regs->sprg3 = vcpu->arch.shared->sprg3; regs->sprg5 = vcpu->arch.sprg4; regs->sprg6 = vcpu->arch.sprg5; regs->sprg7 = vcpu->arch.sprg6; @@ -1080,12 +1161,12 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) kvmppc_set_lr(vcpu, regs->lr); kvmppc_set_xer(vcpu, regs->xer); kvmppc_set_msr(vcpu, regs->msr); - vcpu->arch.srr0 = regs->srr0; - vcpu->arch.srr1 = regs->srr1; - vcpu->arch.sprg0 = regs->sprg0; - vcpu->arch.sprg1 = regs->sprg1; - vcpu->arch.sprg2 = regs->sprg2; - vcpu->arch.sprg3 = regs->sprg3; + vcpu->arch.shared->srr0 = regs->srr0; + vcpu->arch.shared->srr1 = regs->srr1; + vcpu->arch.shared->sprg0 = regs->sprg0; + vcpu->arch.shared->sprg1 = regs->sprg1; + vcpu->arch.shared->sprg2 = regs->sprg2; + vcpu->arch.shared->sprg3 = regs->sprg3; vcpu->arch.sprg5 = regs->sprg4; vcpu->arch.sprg6 = regs->sprg5; vcpu->arch.sprg7 = regs->sprg6; @@ -1111,10 +1192,9 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->u.s.ppc64.slb[i].slbv = vcpu3s->slb[i].origv; } } else { - for (i = 0; i < 16; i++) { - sregs->u.s.ppc32.sr[i] = vcpu3s->sr[i].raw; - sregs->u.s.ppc32.sr[i] = vcpu3s->sr[i].raw; - } + for (i = 0; i < 16; i++) + sregs->u.s.ppc32.sr[i] = vcpu->arch.shared->sr[i]; + for (i = 0; i < 8; i++) { sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw; sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw; @@ -1225,6 +1305,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) struct kvmppc_vcpu_book3s *vcpu_book3s; struct kvm_vcpu *vcpu; int err = -ENOMEM; + unsigned long p; vcpu_book3s = vmalloc(sizeof(struct kvmppc_vcpu_book3s)); if (!vcpu_book3s) @@ -1242,6 +1323,12 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) if (err) goto free_shadow_vcpu; + p = __get_free_page(GFP_KERNEL|__GFP_ZERO); + /* the real shared page fills the last 4k of our page */ + vcpu->arch.shared = (void*)(p + PAGE_SIZE - 4096); + if (!p) + goto uninit_vcpu; + vcpu->arch.host_retip = kvm_return_point; vcpu->arch.host_msr = mfmsr(); #ifdef CONFIG_PPC_BOOK3S_64 @@ -1268,10 +1355,12 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) err = kvmppc_mmu_init(vcpu); if (err < 0) - goto free_shadow_vcpu; + goto uninit_vcpu; return vcpu; +uninit_vcpu: + kvm_vcpu_uninit(vcpu); free_shadow_vcpu: kfree(vcpu_book3s->shadow_vcpu); free_vcpu: @@ -1284,6 +1373,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + free_page((unsigned long)vcpu->arch.shared & PAGE_MASK); kvm_vcpu_uninit(vcpu); kfree(vcpu_book3s->shadow_vcpu); vfree(vcpu_book3s); @@ -1346,7 +1436,7 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) local_irq_enable(); /* Preload FPU if it's enabled */ - if (vcpu->arch.msr & MSR_FP) + if (vcpu->arch.shared->msr & MSR_FP) kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); ret = __kvmppc_vcpu_entry(kvm_run, vcpu); diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c index 3292d76..c8cefdd 100644 --- a/arch/powerpc/kvm/book3s_32_mmu.c +++ b/arch/powerpc/kvm/book3s_32_mmu.c @@ -58,14 +58,39 @@ static inline bool check_debug_ip(struct kvm_vcpu *vcpu) #endif } +static inline u32 sr_vsid(u32 sr_raw) +{ + return sr_raw & 0x0fffffff; +} + +static inline bool sr_valid(u32 sr_raw) +{ + return (sr_raw & 0x80000000) ? false : true; +} + +static inline bool sr_ks(u32 sr_raw) +{ + return (sr_raw & 0x40000000) ? true: false; +} + +static inline bool sr_kp(u32 sr_raw) +{ + return (sr_raw & 0x20000000) ? true: false; +} + +static inline bool sr_nx(u32 sr_raw) +{ + return (sr_raw & 0x10000000) ? true: false; +} + static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *pte, bool data); static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, u64 *vsid); -static struct kvmppc_sr *find_sr(struct kvmppc_vcpu_book3s *vcpu_book3s, gva_t eaddr) +static u32 find_sr(struct kvm_vcpu *vcpu, gva_t eaddr) { - return &vcpu_book3s->sr[(eaddr >> 28) & 0xf]; + return vcpu->arch.shared->sr[(eaddr >> 28) & 0xf]; } static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, @@ -87,7 +112,7 @@ static void kvmppc_mmu_book3s_32_reset_msr(struct kvm_vcpu *vcpu) } static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3s, - struct kvmppc_sr *sre, gva_t eaddr, + u32 sre, gva_t eaddr, bool primary) { u32 page, hash, pteg, htabmask; @@ -96,7 +121,7 @@ static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3 page = (eaddr & 0x0FFFFFFF) >> 12; htabmask = ((vcpu_book3s->sdr1 & 0x1FF) << 16) | 0xFFC0; - hash = ((sre->vsid ^ page) << 6); + hash = ((sr_vsid(sre) ^ page) << 6); if (!primary) hash = ~hash; hash &= htabmask; @@ -104,8 +129,8 @@ static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3 pteg = (vcpu_book3s->sdr1 & 0xffff0000) | hash; dprintk("MMU: pc=0x%lx eaddr=0x%lx sdr1=0x%llx pteg=0x%x vsid=0x%x\n", - vcpu_book3s->vcpu.arch.pc, eaddr, vcpu_book3s->sdr1, pteg, - sre->vsid); + kvmppc_get_pc(&vcpu_book3s->vcpu), eaddr, vcpu_book3s->sdr1, pteg, + sr_vsid(sre)); r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT); if (kvm_is_error_hva(r)) @@ -113,10 +138,9 @@ static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3 return r | (pteg & ~PAGE_MASK); } -static u32 kvmppc_mmu_book3s_32_get_ptem(struct kvmppc_sr *sre, gva_t eaddr, - bool primary) +static u32 kvmppc_mmu_book3s_32_get_ptem(u32 sre, gva_t eaddr, bool primary) { - return ((eaddr & 0x0fffffff) >> 22) | (sre->vsid << 7) | + return ((eaddr & 0x0fffffff) >> 22) | (sr_vsid(sre) << 7) | (primary ? 0 : 0x40) | 0x80000000; } @@ -133,7 +157,7 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, else bat = &vcpu_book3s->ibat[i]; - if (vcpu->arch.msr & MSR_PR) { + if (vcpu->arch.shared->msr & MSR_PR) { if (!bat->vp) continue; } else { @@ -180,17 +204,17 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, bool primary) { struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); - struct kvmppc_sr *sre; + u32 sre; hva_t ptegp; u32 pteg[16]; u32 ptem = 0; int i; int found = 0; - sre = find_sr(vcpu_book3s, eaddr); + sre = find_sr(vcpu, eaddr); dprintk_pte("SR 0x%lx: vsid=0x%x, raw=0x%x\n", eaddr >> 28, - sre->vsid, sre->raw); + sr_vsid(sre), sre); pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data); @@ -214,8 +238,8 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, pte->raddr = (pteg[i+1] & ~(0xFFFULL)) | (eaddr & 0xFFF); pp = pteg[i+1] & 3; - if ((sre->Kp && (vcpu->arch.msr & MSR_PR)) || - (sre->Ks && !(vcpu->arch.msr & MSR_PR))) + if ((sr_kp(sre) && (vcpu->arch.shared->msr & MSR_PR)) || + (sr_ks(sre) && !(vcpu->arch.shared->msr & MSR_PR))) pp |= 4; pte->may_write = false; @@ -269,7 +293,7 @@ no_page_found: dprintk_pte("KVM MMU: No PTE found (sdr1=0x%llx ptegp=0x%lx)\n", to_book3s(vcpu)->sdr1, ptegp); for (i=0; i<16; i+=2) { - dprintk_pte(" %02d: 0x%x - 0x%x (0x%llx)\n", + dprintk_pte(" %02d: 0x%x - 0x%x (0x%x)\n", i, pteg[i], pteg[i+1], ptem); } } @@ -281,8 +305,24 @@ static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *pte, bool data) { int r; + ulong mp_ea = vcpu->arch.magic_page_ea; pte->eaddr = eaddr; + + /* Magic page override */ + if (unlikely(mp_ea) && + unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) && + !(vcpu->arch.shared->msr & MSR_PR)) { + pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data); + pte->raddr = vcpu->arch.magic_page_pa | (pte->raddr & 0xfff); + pte->raddr &= KVM_PAM; + pte->may_execute = true; + pte->may_read = true; + pte->may_write = true; + + return 0; + } + r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data); if (r < 0) r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, data, true); @@ -295,30 +335,13 @@ static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, static u32 kvmppc_mmu_book3s_32_mfsrin(struct kvm_vcpu *vcpu, u32 srnum) { - return to_book3s(vcpu)->sr[srnum].raw; + return vcpu->arch.shared->sr[srnum]; } static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum, ulong value) { - struct kvmppc_sr *sre; - - sre = &to_book3s(vcpu)->sr[srnum]; - - /* Flush any left-over shadows from the previous SR */ - - /* XXX Not necessary? */ - /* kvmppc_mmu_pte_flush(vcpu, ((u64)sre->vsid) << 28, 0xf0000000ULL); */ - - /* And then put in the new SR */ - sre->raw = value; - sre->vsid = (value & 0x0fffffff); - sre->valid = (value & 0x80000000) ? false : true; - sre->Ks = (value & 0x40000000) ? true : false; - sre->Kp = (value & 0x20000000) ? true : false; - sre->nx = (value & 0x10000000) ? true : false; - - /* Map the new segment */ + vcpu->arch.shared->sr[srnum] = value; kvmppc_mmu_map_segment(vcpu, srnum << SID_SHIFT); } @@ -331,19 +354,19 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, u64 *vsid) { ulong ea = esid << SID_SHIFT; - struct kvmppc_sr *sr; + u32 sr; u64 gvsid = esid; - if (vcpu->arch.msr & (MSR_DR|MSR_IR)) { - sr = find_sr(to_book3s(vcpu), ea); - if (sr->valid) - gvsid = sr->vsid; + if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { + sr = find_sr(vcpu, ea); + if (sr_valid(sr)) + gvsid = sr_vsid(sr); } /* In case we only have one of MSR_IR or MSR_DR set, let's put that in the real-mode context (and hope RM doesn't access high memory) */ - switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) { + switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { case 0: *vsid = VSID_REAL | esid; break; @@ -354,8 +377,8 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, *vsid = VSID_REAL_DR | gvsid; break; case MSR_DR|MSR_IR: - if (sr->valid) - *vsid = sr->vsid; + if (sr_valid(sr)) + *vsid = sr_vsid(sr); else *vsid = VSID_BAT | gvsid; break; @@ -363,7 +386,7 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, BUG(); } - if (vcpu->arch.msr & MSR_PR) + if (vcpu->arch.shared->msr & MSR_PR) *vsid |= VSID_PR; return 0; diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c index 0b51ef8..9fecbfb 100644 --- a/arch/powerpc/kvm/book3s_32_mmu_host.c +++ b/arch/powerpc/kvm/book3s_32_mmu_host.c @@ -19,7 +19,6 @@ */ #include <linux/kvm_host.h> -#include <linux/hash.h> #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> @@ -77,7 +76,14 @@ void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) * a hash, so we don't waste cycles on looping */ static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid) { - return hash_64(gvsid, SID_MAP_BITS); + return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK)); } @@ -86,7 +92,7 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid) struct kvmppc_sid_map *map; u16 sid_map_mask; - if (vcpu->arch.msr & MSR_PR) + if (vcpu->arch.shared->msr & MSR_PR) gvsid |= VSID_PR; sid_map_mask = kvmppc_sid_hash(vcpu, gvsid); @@ -147,8 +153,8 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) struct hpte_cache *pte; /* Get host physical address for gpa */ - hpaddr = gfn_to_pfn(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT); - if (kvm_is_error_hva(hpaddr)) { + hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); + if (is_error_pfn(hpaddr)) { printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); return -EINVAL; @@ -253,7 +259,7 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) u16 sid_map_mask; static int backwards_map = 0; - if (vcpu->arch.msr & MSR_PR) + if (vcpu->arch.shared->msr & MSR_PR) gvsid |= VSID_PR; /* We might get collisions that trap in preceding order, so let's @@ -269,18 +275,15 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) backwards_map = !backwards_map; /* Uh-oh ... out of mappings. Let's flush! */ - if (vcpu_book3s->vsid_next >= vcpu_book3s->vsid_max) { - vcpu_book3s->vsid_next = vcpu_book3s->vsid_first; + if (vcpu_book3s->vsid_next >= VSID_POOL_SIZE) { + vcpu_book3s->vsid_next = 0; memset(vcpu_book3s->sid_map, 0, sizeof(struct kvmppc_sid_map) * SID_MAP_NUM); kvmppc_mmu_pte_flush(vcpu, 0, 0); kvmppc_mmu_flush_segments(vcpu); } - map->host_vsid = vcpu_book3s->vsid_next; - - /* Would have to be 111 to be completely aligned with the rest of - Linux, but that is just way too little space! */ - vcpu_book3s->vsid_next+=1; + map->host_vsid = vcpu_book3s->vsid_pool[vcpu_book3s->vsid_next]; + vcpu_book3s->vsid_next++; map->guest_vsid = gvsid; map->valid = true; @@ -327,40 +330,38 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) { + int i; + kvmppc_mmu_hpte_destroy(vcpu); preempt_disable(); - __destroy_context(to_book3s(vcpu)->context_id); + for (i = 0; i < SID_CONTEXTS; i++) + __destroy_context(to_book3s(vcpu)->context_id[i]); preempt_enable(); } /* From mm/mmu_context_hash32.c */ -#define CTX_TO_VSID(ctx) (((ctx) * (897 * 16)) & 0xffffff) +#define CTX_TO_VSID(c, id) ((((c) * (897 * 16)) + (id * 0x111)) & 0xffffff) int kvmppc_mmu_init(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); int err; ulong sdr1; + int i; + int j; - err = __init_new_context(); - if (err < 0) - return -1; - vcpu3s->context_id = err; - - vcpu3s->vsid_max = CTX_TO_VSID(vcpu3s->context_id + 1) - 1; - vcpu3s->vsid_first = CTX_TO_VSID(vcpu3s->context_id); - -#if 0 /* XXX still doesn't guarantee uniqueness */ - /* We could collide with the Linux vsid space because the vsid - * wraps around at 24 bits. We're safe if we do our own space - * though, so let's always set the highest bit. */ + for (i = 0; i < SID_CONTEXTS; i++) { + err = __init_new_context(); + if (err < 0) + goto init_fail; + vcpu3s->context_id[i] = err; - vcpu3s->vsid_max |= 0x00800000; - vcpu3s->vsid_first |= 0x00800000; -#endif - BUG_ON(vcpu3s->vsid_max < vcpu3s->vsid_first); + /* Remember context id for this combination */ + for (j = 0; j < 16; j++) + vcpu3s->vsid_pool[(i * 16) + j] = CTX_TO_VSID(err, j); + } - vcpu3s->vsid_next = vcpu3s->vsid_first; + vcpu3s->vsid_next = 0; /* Remember where the HTAB is */ asm ( "mfsdr1 %0" : "=r"(sdr1) ); @@ -370,4 +371,14 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu) kvmppc_mmu_hpte_init(vcpu); return 0; + +init_fail: + for (j = 0; j < i; j++) { + if (!vcpu3s->context_id[j]) + continue; + + __destroy_context(to_book3s(vcpu)->context_id[j]); + } + + return -1; } diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 4025ea2..d7889ef 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -163,6 +163,22 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, bool found = false; bool perm_err = false; int second = 0; + ulong mp_ea = vcpu->arch.magic_page_ea; + + /* Magic page override */ + if (unlikely(mp_ea) && + unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) && + !(vcpu->arch.shared->msr & MSR_PR)) { + gpte->eaddr = eaddr; + gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data); + gpte->raddr = vcpu->arch.magic_page_pa | (gpte->raddr & 0xfff); + gpte->raddr &= KVM_PAM; + gpte->may_execute = true; + gpte->may_read = true; + gpte->may_write = true; + + return 0; + } slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, eaddr); if (!slbe) @@ -180,9 +196,9 @@ do_second: goto no_page_found; } - if ((vcpu->arch.msr & MSR_PR) && slbe->Kp) + if ((vcpu->arch.shared->msr & MSR_PR) && slbe->Kp) key = 4; - else if (!(vcpu->arch.msr & MSR_PR) && slbe->Ks) + else if (!(vcpu->arch.shared->msr & MSR_PR) && slbe->Ks) key = 4; for (i=0; i<16; i+=2) { @@ -381,7 +397,7 @@ static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu) for (i = 1; i < vcpu_book3s->slb_nr; i++) vcpu_book3s->slb[i].valid = false; - if (vcpu->arch.msr & MSR_IR) { + if (vcpu->arch.shared->msr & MSR_IR) { kvmppc_mmu_flush_segments(vcpu); kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); } @@ -445,14 +461,15 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, ulong ea = esid << SID_SHIFT; struct kvmppc_slb *slb; u64 gvsid = esid; + ulong mp_ea = vcpu->arch.magic_page_ea; - if (vcpu->arch.msr & (MSR_DR|MSR_IR)) { + if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), ea); if (slb) gvsid = slb->vsid; } - switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) { + switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { case 0: *vsid = VSID_REAL | esid; break; @@ -464,7 +481,7 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, break; case MSR_DR|MSR_IR: if (!slb) - return -ENOENT; + goto no_slb; *vsid = gvsid; break; @@ -473,10 +490,21 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, break; } - if (vcpu->arch.msr & MSR_PR) + if (vcpu->arch.shared->msr & MSR_PR) *vsid |= VSID_PR; return 0; + +no_slb: + /* Catch magic page case */ + if (unlikely(mp_ea) && + unlikely(esid == (mp_ea >> SID_SHIFT)) && + !(vcpu->arch.shared->msr & MSR_PR)) { + *vsid = VSID_REAL | esid; + return 0; + } + + return -EINVAL; } static bool kvmppc_mmu_book3s_64_is_dcbz32(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index 384179a..fa2f084 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -20,7 +20,6 @@ */ #include <linux/kvm_host.h> -#include <linux/hash.h> #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> @@ -28,24 +27,9 @@ #include <asm/machdep.h> #include <asm/mmu_context.h> #include <asm/hw_irq.h> +#include "trace.h" #define PTE_SIZE 12 -#define VSID_ALL 0 - -/* #define DEBUG_MMU */ -/* #define DEBUG_SLB */ - -#ifdef DEBUG_MMU -#define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__) -#else -#define dprintk_mmu(a, ...) do { } while(0) -#endif - -#ifdef DEBUG_SLB -#define dprintk_slb(a, ...) printk(KERN_INFO a, __VA_ARGS__) -#else -#define dprintk_slb(a, ...) do { } while(0) -#endif void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) { @@ -58,34 +42,39 @@ void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) * a hash, so we don't waste cycles on looping */ static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid) { - return hash_64(gvsid, SID_MAP_BITS); + return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK)); } + static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid) { struct kvmppc_sid_map *map; u16 sid_map_mask; - if (vcpu->arch.msr & MSR_PR) + if (vcpu->arch.shared->msr & MSR_PR) gvsid |= VSID_PR; sid_map_mask = kvmppc_sid_hash(vcpu, gvsid); map = &to_book3s(vcpu)->sid_map[sid_map_mask]; - if (map->guest_vsid == gvsid) { - dprintk_slb("SLB: Searching: 0x%llx -> 0x%llx\n", - gvsid, map->host_vsid); + if (map->valid && (map->guest_vsid == gvsid)) { + trace_kvm_book3s_slb_found(gvsid, map->host_vsid); return map; } map = &to_book3s(vcpu)->sid_map[SID_MAP_MASK - sid_map_mask]; - if (map->guest_vsid == gvsid) { - dprintk_slb("SLB: Searching 0x%llx -> 0x%llx\n", - gvsid, map->host_vsid); + if (map->valid && (map->guest_vsid == gvsid)) { + trace_kvm_book3s_slb_found(gvsid, map->host_vsid); return map; } - dprintk_slb("SLB: Searching %d/%d: 0x%llx -> not found\n", - sid_map_mask, SID_MAP_MASK - sid_map_mask, gvsid); + trace_kvm_book3s_slb_fail(sid_map_mask, gvsid); return NULL; } @@ -101,18 +90,13 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) struct kvmppc_sid_map *map; /* Get host physical address for gpa */ - hpaddr = gfn_to_pfn(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT); - if (kvm_is_error_hva(hpaddr)) { + hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); + if (is_error_pfn(hpaddr)) { printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); return -EINVAL; } hpaddr <<= PAGE_SHIFT; -#if PAGE_SHIFT == 12 -#elif PAGE_SHIFT == 16 - hpaddr |= orig_pte->raddr & 0xf000; -#else -#error Unknown page size -#endif + hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK); /* and write the mapping ea -> hpa into the pt */ vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid); @@ -161,10 +145,7 @@ map_again: } else { struct hpte_cache *pte = kvmppc_mmu_hpte_cache_next(vcpu); - dprintk_mmu("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx\n", - ((rflags & HPTE_R_PP) == 3) ? '-' : 'w', - (rflags & HPTE_R_N) ? '-' : 'x', - orig_pte->eaddr, hpteg, va, orig_pte->vpage, hpaddr); + trace_kvm_book3s_64_mmu_map(rflags, hpteg, va, hpaddr, orig_pte); /* The ppc_md code may give us a secondary entry even though we asked for a primary. Fix up. */ @@ -191,7 +172,7 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) u16 sid_map_mask; static int backwards_map = 0; - if (vcpu->arch.msr & MSR_PR) + if (vcpu->arch.shared->msr & MSR_PR) gvsid |= VSID_PR; /* We might get collisions that trap in preceding order, so let's @@ -219,8 +200,7 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) map->guest_vsid = gvsid; map->valid = true; - dprintk_slb("SLB: New mapping at %d: 0x%llx -> 0x%llx\n", - sid_map_mask, gvsid, map->host_vsid); + trace_kvm_book3s_slb_map(sid_map_mask, gvsid, map->host_vsid); return map; } @@ -292,7 +272,7 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) to_svcpu(vcpu)->slb[slb_index].esid = slb_esid; to_svcpu(vcpu)->slb[slb_index].vsid = slb_vsid; - dprintk_slb("slbmte %#llx, %#llx\n", slb_vsid, slb_esid); + trace_kvm_book3s_slbmte(slb_vsid, slb_esid); return 0; } @@ -306,7 +286,7 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) { kvmppc_mmu_hpte_destroy(vcpu); - __destroy_context(to_book3s(vcpu)->context_id); + __destroy_context(to_book3s(vcpu)->context_id[0]); } int kvmppc_mmu_init(struct kvm_vcpu *vcpu) @@ -317,10 +297,10 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu) err = __init_new_context(); if (err < 0) return -1; - vcpu3s->context_id = err; + vcpu3s->context_id[0] = err; - vcpu3s->vsid_max = ((vcpu3s->context_id + 1) << USER_ESID_BITS) - 1; - vcpu3s->vsid_first = vcpu3s->context_id << USER_ESID_BITS; + vcpu3s->vsid_max = ((vcpu3s->context_id[0] + 1) << USER_ESID_BITS) - 1; + vcpu3s->vsid_first = vcpu3s->context_id[0] << USER_ESID_BITS; vcpu3s->vsid_next = vcpu3s->vsid_first; kvmppc_mmu_hpte_init(vcpu); diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index c85f906..4668465 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -73,8 +73,8 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, switch (get_xop(inst)) { case OP_19_XOP_RFID: case OP_19_XOP_RFI: - kvmppc_set_pc(vcpu, vcpu->arch.srr0); - kvmppc_set_msr(vcpu, vcpu->arch.srr1); + kvmppc_set_pc(vcpu, vcpu->arch.shared->srr0); + kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1); *advance = 0; break; @@ -86,14 +86,15 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, case 31: switch (get_xop(inst)) { case OP_31_XOP_MFMSR: - kvmppc_set_gpr(vcpu, get_rt(inst), vcpu->arch.msr); + kvmppc_set_gpr(vcpu, get_rt(inst), + vcpu->arch.shared->msr); break; case OP_31_XOP_MTMSRD: { ulong rs = kvmppc_get_gpr(vcpu, get_rs(inst)); if (inst & 0x10000) { - vcpu->arch.msr &= ~(MSR_RI | MSR_EE); - vcpu->arch.msr |= rs & (MSR_RI | MSR_EE); + vcpu->arch.shared->msr &= ~(MSR_RI | MSR_EE); + vcpu->arch.shared->msr |= rs & (MSR_RI | MSR_EE); } else kvmppc_set_msr(vcpu, rs); break; @@ -204,14 +205,14 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, ra = kvmppc_get_gpr(vcpu, get_ra(inst)); addr = (ra + rb) & ~31ULL; - if (!(vcpu->arch.msr & MSR_SF)) + if (!(vcpu->arch.shared->msr & MSR_SF)) addr &= 0xffffffff; vaddr = addr; r = kvmppc_st(vcpu, &addr, 32, zeros, true); if ((r == -ENOENT) || (r == -EPERM)) { *advance = 0; - vcpu->arch.dear = vaddr; + vcpu->arch.shared->dar = vaddr; to_svcpu(vcpu)->fault_dar = vaddr; dsisr = DSISR_ISSTORE; @@ -220,7 +221,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, else if (r == -EPERM) dsisr |= DSISR_PROTFAULT; - to_book3s(vcpu)->dsisr = dsisr; + vcpu->arch.shared->dsisr = dsisr; to_svcpu(vcpu)->fault_dsisr = dsisr; kvmppc_book3s_queue_irqprio(vcpu, @@ -263,7 +264,7 @@ void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, bool upper, } } -static u32 kvmppc_read_bat(struct kvm_vcpu *vcpu, int sprn) +static struct kvmppc_bat *kvmppc_find_bat(struct kvm_vcpu *vcpu, int sprn) { struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); struct kvmppc_bat *bat; @@ -285,35 +286,7 @@ static u32 kvmppc_read_bat(struct kvm_vcpu *vcpu, int sprn) BUG(); } - if (sprn % 2) - return bat->raw >> 32; - else - return bat->raw; -} - -static void kvmppc_write_bat(struct kvm_vcpu *vcpu, int sprn, u32 val) -{ - struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); - struct kvmppc_bat *bat; - - switch (sprn) { - case SPRN_IBAT0U ... SPRN_IBAT3L: - bat = &vcpu_book3s->ibat[(sprn - SPRN_IBAT0U) / 2]; - break; - case SPRN_IBAT4U ... SPRN_IBAT7L: - bat = &vcpu_book3s->ibat[4 + ((sprn - SPRN_IBAT4U) / 2)]; - break; - case SPRN_DBAT0U ... SPRN_DBAT3L: - bat = &vcpu_book3s->dbat[(sprn - SPRN_DBAT0U) / 2]; - break; - case SPRN_DBAT4U ... SPRN_DBAT7L: - bat = &vcpu_book3s->dbat[4 + ((sprn - SPRN_DBAT4U) / 2)]; - break; - default: - BUG(); - } - - kvmppc_set_bat(vcpu, bat, !(sprn % 2), val); + return bat; } int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) @@ -326,10 +299,10 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) to_book3s(vcpu)->sdr1 = spr_val; break; case SPRN_DSISR: - to_book3s(vcpu)->dsisr = spr_val; + vcpu->arch.shared->dsisr = spr_val; break; case SPRN_DAR: - vcpu->arch.dear = spr_val; + vcpu->arch.shared->dar = spr_val; break; case SPRN_HIOR: to_book3s(vcpu)->hior = spr_val; @@ -338,12 +311,16 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_IBAT4U ... SPRN_IBAT7L: case SPRN_DBAT0U ... SPRN_DBAT3L: case SPRN_DBAT4U ... SPRN_DBAT7L: - kvmppc_write_bat(vcpu, sprn, (u32)spr_val); + { + struct kvmppc_bat *bat = kvmppc_find_bat(vcpu, sprn); + + kvmppc_set_bat(vcpu, bat, !(sprn % 2), (u32)spr_val); /* BAT writes happen so rarely that we're ok to flush * everything here */ kvmppc_mmu_pte_flush(vcpu, 0, 0); kvmppc_mmu_flush_segments(vcpu); break; + } case SPRN_HID0: to_book3s(vcpu)->hid[0] = spr_val; break; @@ -433,16 +410,24 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) case SPRN_IBAT4U ... SPRN_IBAT7L: case SPRN_DBAT0U ... SPRN_DBAT3L: case SPRN_DBAT4U ... SPRN_DBAT7L: - kvmppc_set_gpr(vcpu, rt, kvmppc_read_bat(vcpu, sprn)); + { + struct kvmppc_bat *bat = kvmppc_find_bat(vcpu, sprn); + + if (sprn % 2) + kvmppc_set_gpr(vcpu, rt, bat->raw >> 32); + else + kvmppc_set_gpr(vcpu, rt, bat->raw); + break; + } case SPRN_SDR1: kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1); break; case SPRN_DSISR: - kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->dsisr); + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dsisr); break; case SPRN_DAR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.dear); + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar); break; case SPRN_HIOR: kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hior); diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c index 4868d4a..79751d8 100644 --- a/arch/powerpc/kvm/book3s_mmu_hpte.c +++ b/arch/powerpc/kvm/book3s_mmu_hpte.c @@ -21,6 +21,7 @@ #include <linux/kvm_host.h> #include <linux/hash.h> #include <linux/slab.h> +#include "trace.h" #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> @@ -30,14 +31,6 @@ #define PTE_SIZE 12 -/* #define DEBUG_MMU */ - -#ifdef DEBUG_MMU -#define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__) -#else -#define dprintk_mmu(a, ...) do { } while(0) -#endif - static struct kmem_cache *hpte_cache; static inline u64 kvmppc_mmu_hash_pte(u64 eaddr) @@ -45,6 +38,12 @@ static inline u64 kvmppc_mmu_hash_pte(u64 eaddr) return hash_64(eaddr >> PTE_SIZE, HPTEG_HASH_BITS_PTE); } +static inline u64 kvmppc_mmu_hash_pte_long(u64 eaddr) +{ + return hash_64((eaddr & 0x0ffff000) >> PTE_SIZE, + HPTEG_HASH_BITS_PTE_LONG); +} + static inline u64 kvmppc_mmu_hash_vpte(u64 vpage) { return hash_64(vpage & 0xfffffffffULL, HPTEG_HASH_BITS_VPTE); @@ -60,77 +59,128 @@ void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte) { u64 index; + trace_kvm_book3s_mmu_map(pte); + + spin_lock(&vcpu->arch.mmu_lock); + /* Add to ePTE list */ index = kvmppc_mmu_hash_pte(pte->pte.eaddr); - hlist_add_head(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]); + hlist_add_head_rcu(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]); + + /* Add to ePTE_long list */ + index = kvmppc_mmu_hash_pte_long(pte->pte.eaddr); + hlist_add_head_rcu(&pte->list_pte_long, + &vcpu->arch.hpte_hash_pte_long[index]); /* Add to vPTE list */ index = kvmppc_mmu_hash_vpte(pte->pte.vpage); - hlist_add_head(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]); + hlist_add_head_rcu(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]); /* Add to vPTE_long list */ index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage); - hlist_add_head(&pte->list_vpte_long, - &vcpu->arch.hpte_hash_vpte_long[index]); + hlist_add_head_rcu(&pte->list_vpte_long, + &vcpu->arch.hpte_hash_vpte_long[index]); + + spin_unlock(&vcpu->arch.mmu_lock); +} + +static void free_pte_rcu(struct rcu_head *head) +{ + struct hpte_cache *pte = container_of(head, struct hpte_cache, rcu_head); + kmem_cache_free(hpte_cache, pte); } static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) { - dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n", - pte->pte.eaddr, pte->pte.vpage, pte->host_va); + trace_kvm_book3s_mmu_invalidate(pte); /* Different for 32 and 64 bit */ kvmppc_mmu_invalidate_pte(vcpu, pte); + spin_lock(&vcpu->arch.mmu_lock); + + /* pte already invalidated in between? */ + if (hlist_unhashed(&pte->list_pte)) { + spin_unlock(&vcpu->arch.mmu_lock); + return; + } + + hlist_del_init_rcu(&pte->list_pte); + hlist_del_init_rcu(&pte->list_pte_long); + hlist_del_init_rcu(&pte->list_vpte); + hlist_del_init_rcu(&pte->list_vpte_long); + if (pte->pte.may_write) kvm_release_pfn_dirty(pte->pfn); else kvm_release_pfn_clean(pte->pfn); - hlist_del(&pte->list_pte); - hlist_del(&pte->list_vpte); - hlist_del(&pte->list_vpte_long); + spin_unlock(&vcpu->arch.mmu_lock); vcpu->arch.hpte_cache_count--; - kmem_cache_free(hpte_cache, pte); + call_rcu(&pte->rcu_head, free_pte_rcu); } static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu) { struct hpte_cache *pte; - struct hlist_node *node, *tmp; + struct hlist_node *node; int i; + rcu_read_lock(); + for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i]; - hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long) + hlist_for_each_entry_rcu(pte, node, list, list_vpte_long) invalidate_pte(vcpu, pte); } + + rcu_read_unlock(); } static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea) { struct hlist_head *list; - struct hlist_node *node, *tmp; + struct hlist_node *node; struct hpte_cache *pte; /* Find the list of entries in the map */ list = &vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)]; + rcu_read_lock(); + /* Check the list for matching entries and invalidate */ - hlist_for_each_entry_safe(pte, node, tmp, list, list_pte) + hlist_for_each_entry_rcu(pte, node, list, list_pte) if ((pte->pte.eaddr & ~0xfffUL) == guest_ea) invalidate_pte(vcpu, pte); + + rcu_read_unlock(); } -void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask) +static void kvmppc_mmu_pte_flush_long(struct kvm_vcpu *vcpu, ulong guest_ea) { - u64 i; + struct hlist_head *list; + struct hlist_node *node; + struct hpte_cache *pte; - dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx & 0x%lx\n", - vcpu->arch.hpte_cache_count, guest_ea, ea_mask); + /* Find the list of entries in the map */ + list = &vcpu->arch.hpte_hash_pte_long[ + kvmppc_mmu_hash_pte_long(guest_ea)]; + rcu_read_lock(); + + /* Check the list for matching entries and invalidate */ + hlist_for_each_entry_rcu(pte, node, list, list_pte_long) + if ((pte->pte.eaddr & 0x0ffff000UL) == guest_ea) + invalidate_pte(vcpu, pte); + + rcu_read_unlock(); +} + +void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask) +{ + trace_kvm_book3s_mmu_flush("", vcpu, guest_ea, ea_mask); guest_ea &= ea_mask; switch (ea_mask) { @@ -138,9 +188,7 @@ void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask) kvmppc_mmu_pte_flush_page(vcpu, guest_ea); break; case 0x0ffff000: - /* 32-bit flush w/o segment, go through all possible segments */ - for (i = 0; i < 0x100000000ULL; i += 0x10000000ULL) - kvmppc_mmu_pte_flush(vcpu, guest_ea | i, ~0xfffUL); + kvmppc_mmu_pte_flush_long(vcpu, guest_ea); break; case 0: /* Doing a complete flush -> start from scratch */ @@ -156,39 +204,46 @@ void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask) static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp) { struct hlist_head *list; - struct hlist_node *node, *tmp; + struct hlist_node *node; struct hpte_cache *pte; u64 vp_mask = 0xfffffffffULL; list = &vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)]; + rcu_read_lock(); + /* Check the list for matching entries and invalidate */ - hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte) + hlist_for_each_entry_rcu(pte, node, list, list_vpte) if ((pte->pte.vpage & vp_mask) == guest_vp) invalidate_pte(vcpu, pte); + + rcu_read_unlock(); } /* Flush with mask 0xffffff000 */ static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp) { struct hlist_head *list; - struct hlist_node *node, *tmp; + struct hlist_node *node; struct hpte_cache *pte; u64 vp_mask = 0xffffff000ULL; list = &vcpu->arch.hpte_hash_vpte_long[ kvmppc_mmu_hash_vpte_long(guest_vp)]; + rcu_read_lock(); + /* Check the list for matching entries and invalidate */ - hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long) + hlist_for_each_entry_rcu(pte, node, list, list_vpte_long) if ((pte->pte.vpage & vp_mask) == guest_vp) invalidate_pte(vcpu, pte); + + rcu_read_unlock(); } void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask) { - dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n", - vcpu->arch.hpte_cache_count, guest_vp, vp_mask); + trace_kvm_book3s_mmu_flush("v", vcpu, guest_vp, vp_mask); guest_vp &= vp_mask; switch(vp_mask) { @@ -206,21 +261,24 @@ void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask) void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) { - struct hlist_node *node, *tmp; + struct hlist_node *node; struct hpte_cache *pte; int i; - dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx - 0x%lx\n", - vcpu->arch.hpte_cache_count, pa_start, pa_end); + trace_kvm_book3s_mmu_flush("p", vcpu, pa_start, pa_end); + + rcu_read_lock(); for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i]; - hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long) + hlist_for_each_entry_rcu(pte, node, list, list_vpte_long) if ((pte->pte.raddr >= pa_start) && (pte->pte.raddr < pa_end)) invalidate_pte(vcpu, pte); } + + rcu_read_unlock(); } struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu) @@ -254,11 +312,15 @@ int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu) /* init hpte lookup hashes */ kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte, ARRAY_SIZE(vcpu->arch.hpte_hash_pte)); + kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte_long, + ARRAY_SIZE(vcpu->arch.hpte_hash_pte_long)); kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte, ARRAY_SIZE(vcpu->arch.hpte_hash_vpte)); kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long, ARRAY_SIZE(vcpu->arch.hpte_hash_vpte_long)); + spin_lock_init(&vcpu->arch.mmu_lock); + return 0; } diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c index 35a701f..7b0ee96c 100644 --- a/arch/powerpc/kvm/book3s_paired_singles.c +++ b/arch/powerpc/kvm/book3s_paired_singles.c @@ -165,14 +165,15 @@ static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt) static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store) { u64 dsisr; + struct kvm_vcpu_arch_shared *shared = vcpu->arch.shared; - vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 33, 36, 0); - vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 42, 47, 0); - vcpu->arch.dear = eaddr; + shared->msr = kvmppc_set_field(shared->msr, 33, 36, 0); + shared->msr = kvmppc_set_field(shared->msr, 42, 47, 0); + shared->dar = eaddr; /* Page Fault */ dsisr = kvmppc_set_field(0, 33, 33, 1); if (is_store) - to_book3s(vcpu)->dsisr = kvmppc_set_field(dsisr, 38, 38, 1); + shared->dsisr = kvmppc_set_field(dsisr, 38, 38, 1); kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE); } @@ -658,7 +659,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) if (!kvmppc_inst_is_paired_single(vcpu, inst)) return EMULATE_FAIL; - if (!(vcpu->arch.msr & MSR_FP)) { + if (!(vcpu->arch.shared->msr & MSR_FP)) { kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL); return EMULATE_AGAIN; } diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index 506d5c3..2b9c908 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -202,8 +202,25 @@ _GLOBAL(kvmppc_rmcall) #if defined(CONFIG_PPC_BOOK3S_32) #define STACK_LR INT_FRAME_SIZE+4 + +/* load_up_xxx have to run with MSR_DR=0 on Book3S_32 */ +#define MSR_EXT_START \ + PPC_STL r20, _NIP(r1); \ + mfmsr r20; \ + LOAD_REG_IMMEDIATE(r3, MSR_DR|MSR_EE); \ + andc r3,r20,r3; /* Disable DR,EE */ \ + mtmsr r3; \ + sync + +#define MSR_EXT_END \ + mtmsr r20; /* Enable DR,EE */ \ + sync; \ + PPC_LL r20, _NIP(r1) + #elif defined(CONFIG_PPC_BOOK3S_64) #define STACK_LR _LINK +#define MSR_EXT_START +#define MSR_EXT_END #endif /* @@ -215,19 +232,12 @@ _GLOBAL(kvmppc_load_up_ ## what); \ PPC_STLU r1, -INT_FRAME_SIZE(r1); \ mflr r3; \ PPC_STL r3, STACK_LR(r1); \ - PPC_STL r20, _NIP(r1); \ - mfmsr r20; \ - LOAD_REG_IMMEDIATE(r3, MSR_DR|MSR_EE); \ - andc r3,r20,r3; /* Disable DR,EE */ \ - mtmsr r3; \ - sync; \ + MSR_EXT_START; \ \ bl FUNC(load_up_ ## what); \ \ - mtmsr r20; /* Enable DR,EE */ \ - sync; \ + MSR_EXT_END; \ PPC_LL r3, STACK_LR(r1); \ - PPC_LL r20, _NIP(r1); \ mtlr r3; \ addi r1, r1, INT_FRAME_SIZE; \ blr @@ -242,10 +252,10 @@ define_load_up(vsx) .global kvmppc_trampoline_lowmem kvmppc_trampoline_lowmem: - .long kvmppc_handler_lowmem_trampoline - CONFIG_KERNEL_START + PPC_LONG kvmppc_handler_lowmem_trampoline - CONFIG_KERNEL_START .global kvmppc_trampoline_enter kvmppc_trampoline_enter: - .long kvmppc_handler_trampoline_enter - CONFIG_KERNEL_START + PPC_LONG kvmppc_handler_trampoline_enter - CONFIG_KERNEL_START #include "book3s_segment.S" diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 8d4e35f..77575d0 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -62,9 +62,10 @@ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu) { int i; - printk("pc: %08lx msr: %08lx\n", vcpu->arch.pc, vcpu->arch.msr); + printk("pc: %08lx msr: %08llx\n", vcpu->arch.pc, vcpu->arch.shared->msr); printk("lr: %08lx ctr: %08lx\n", vcpu->arch.lr, vcpu->arch.ctr); - printk("srr0: %08lx srr1: %08lx\n", vcpu->arch.srr0, vcpu->arch.srr1); + printk("srr0: %08llx srr1: %08llx\n", vcpu->arch.shared->srr0, + vcpu->arch.shared->srr1); printk("exceptions: %08lx\n", vcpu->arch.pending_exceptions); @@ -130,13 +131,19 @@ void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu) void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { - kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_EXTERNAL); + unsigned int prio = BOOKE_IRQPRIO_EXTERNAL; + + if (irq->irq == KVM_INTERRUPT_SET_LEVEL) + prio = BOOKE_IRQPRIO_EXTERNAL_LEVEL; + + kvmppc_booke_queue_irqprio(vcpu, prio); } void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { clear_bit(BOOKE_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions); + clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); } /* Deliver the interrupt of the corresponding priority, if possible. */ @@ -146,6 +153,26 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, int allowed = 0; ulong uninitialized_var(msr_mask); bool update_esr = false, update_dear = false; + ulong crit_raw = vcpu->arch.shared->critical; + ulong crit_r1 = kvmppc_get_gpr(vcpu, 1); + bool crit; + bool keep_irq = false; + + /* Truncate crit indicators in 32 bit mode */ + if (!(vcpu->arch.shared->msr & MSR_SF)) { + crit_raw &= 0xffffffff; + crit_r1 &= 0xffffffff; + } + + /* Critical section when crit == r1 */ + crit = (crit_raw == crit_r1); + /* ... and we're in supervisor mode */ + crit = crit && !(vcpu->arch.shared->msr & MSR_PR); + + if (priority == BOOKE_IRQPRIO_EXTERNAL_LEVEL) { + priority = BOOKE_IRQPRIO_EXTERNAL; + keep_irq = true; + } switch (priority) { case BOOKE_IRQPRIO_DTLB_MISS: @@ -169,36 +196,38 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, break; case BOOKE_IRQPRIO_CRITICAL: case BOOKE_IRQPRIO_WATCHDOG: - allowed = vcpu->arch.msr & MSR_CE; + allowed = vcpu->arch.shared->msr & MSR_CE; msr_mask = MSR_ME; break; case BOOKE_IRQPRIO_MACHINE_CHECK: - allowed = vcpu->arch.msr & MSR_ME; + allowed = vcpu->arch.shared->msr & MSR_ME; msr_mask = 0; break; case BOOKE_IRQPRIO_EXTERNAL: case BOOKE_IRQPRIO_DECREMENTER: case BOOKE_IRQPRIO_FIT: - allowed = vcpu->arch.msr & MSR_EE; + allowed = vcpu->arch.shared->msr & MSR_EE; + allowed = allowed && !crit; msr_mask = MSR_CE|MSR_ME|MSR_DE; break; case BOOKE_IRQPRIO_DEBUG: - allowed = vcpu->arch.msr & MSR_DE; + allowed = vcpu->arch.shared->msr & MSR_DE; msr_mask = MSR_ME; break; } if (allowed) { - vcpu->arch.srr0 = vcpu->arch.pc; - vcpu->arch.srr1 = vcpu->arch.msr; + vcpu->arch.shared->srr0 = vcpu->arch.pc; + vcpu->arch.shared->srr1 = vcpu->arch.shared->msr; vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority]; if (update_esr == true) vcpu->arch.esr = vcpu->arch.queued_esr; if (update_dear == true) - vcpu->arch.dear = vcpu->arch.queued_dear; - kvmppc_set_msr(vcpu, vcpu->arch.msr & msr_mask); + vcpu->arch.shared->dar = vcpu->arch.queued_dear; + kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask); - clear_bit(priority, &vcpu->arch.pending_exceptions); + if (!keep_irq) + clear_bit(priority, &vcpu->arch.pending_exceptions); } return allowed; @@ -208,6 +237,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) { unsigned long *pending = &vcpu->arch.pending_exceptions; + unsigned long old_pending = vcpu->arch.pending_exceptions; unsigned int priority; priority = __ffs(*pending); @@ -219,6 +249,12 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) BITS_PER_BYTE * sizeof(*pending), priority + 1); } + + /* Tell the guest about our interrupt status */ + if (*pending) + vcpu->arch.shared->int_pending = 1; + else if (old_pending) + vcpu->arch.shared->int_pending = 0; } /** @@ -265,7 +301,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case BOOKE_INTERRUPT_PROGRAM: - if (vcpu->arch.msr & MSR_PR) { + if (vcpu->arch.shared->msr & MSR_PR) { /* Program traps generated by user-level software must be handled * by the guest kernel. */ kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr); @@ -337,7 +373,15 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case BOOKE_INTERRUPT_SYSCALL: - kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SYSCALL); + if (!(vcpu->arch.shared->msr & MSR_PR) && + (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) { + /* KVM PV hypercalls */ + kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); + r = RESUME_GUEST; + } else { + /* Guest syscalls */ + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SYSCALL); + } kvmppc_account_exit(vcpu, SYSCALL_EXITS); r = RESUME_GUEST; break; @@ -466,15 +510,19 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, /* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { + int i; + vcpu->arch.pc = 0; - vcpu->arch.msr = 0; + vcpu->arch.shared->msr = 0; kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ vcpu->arch.shadow_pid = 1; - /* Eye-catching number so we know if the guest takes an interrupt - * before it's programmed its own IVPR. */ + /* Eye-catching numbers so we know if the guest takes an interrupt + * before it's programmed its own IVPR/IVORs. */ vcpu->arch.ivpr = 0x55550000; + for (i = 0; i < BOOKE_IRQPRIO_MAX; i++) + vcpu->arch.ivor[i] = 0x7700 | i * 4; kvmppc_init_timing_stats(vcpu); @@ -490,14 +538,14 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->ctr = vcpu->arch.ctr; regs->lr = vcpu->arch.lr; regs->xer = kvmppc_get_xer(vcpu); - regs->msr = vcpu->arch.msr; - regs->srr0 = vcpu->arch.srr0; - regs->srr1 = vcpu->arch.srr1; + regs->msr = vcpu->arch.shared->msr; + regs->srr0 = vcpu->arch.shared->srr0; + regs->srr1 = vcpu->arch.shared->srr1; regs->pid = vcpu->arch.pid; - regs->sprg0 = vcpu->arch.sprg0; - regs->sprg1 = vcpu->arch.sprg1; - regs->sprg2 = vcpu->arch.sprg2; - regs->sprg3 = vcpu->arch.sprg3; + regs->sprg0 = vcpu->arch.shared->sprg0; + regs->sprg1 = vcpu->arch.shared->sprg1; + regs->sprg2 = vcpu->arch.shared->sprg2; + regs->sprg3 = vcpu->arch.shared->sprg3; regs->sprg5 = vcpu->arch.sprg4; regs->sprg6 = vcpu->arch.sprg5; regs->sprg7 = vcpu->arch.sprg6; @@ -518,12 +566,12 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.lr = regs->lr; kvmppc_set_xer(vcpu, regs->xer); kvmppc_set_msr(vcpu, regs->msr); - vcpu->arch.srr0 = regs->srr0; - vcpu->arch.srr1 = regs->srr1; - vcpu->arch.sprg0 = regs->sprg0; - vcpu->arch.sprg1 = regs->sprg1; - vcpu->arch.sprg2 = regs->sprg2; - vcpu->arch.sprg3 = regs->sprg3; + vcpu->arch.shared->srr0 = regs->srr0; + vcpu->arch.shared->srr1 = regs->srr1; + vcpu->arch.shared->sprg0 = regs->sprg0; + vcpu->arch.shared->sprg1 = regs->sprg1; + vcpu->arch.shared->sprg2 = regs->sprg2; + vcpu->arch.shared->sprg3 = regs->sprg3; vcpu->arch.sprg5 = regs->sprg4; vcpu->arch.sprg6 = regs->sprg5; vcpu->arch.sprg7 = regs->sprg6; diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h index d59bcca..492bb70 100644 --- a/arch/powerpc/kvm/booke.h +++ b/arch/powerpc/kvm/booke.h @@ -46,7 +46,9 @@ #define BOOKE_IRQPRIO_FIT 17 #define BOOKE_IRQPRIO_DECREMENTER 18 #define BOOKE_IRQPRIO_PERFORMANCE_MONITOR 19 -#define BOOKE_IRQPRIO_MAX 19 +/* Internal pseudo-irqprio for level triggered externals */ +#define BOOKE_IRQPRIO_EXTERNAL_LEVEL 20 +#define BOOKE_IRQPRIO_MAX 20 extern unsigned long kvmppc_booke_handlers; @@ -54,12 +56,12 @@ extern unsigned long kvmppc_booke_handlers; * changing. */ static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) { - if ((new_msr & MSR_PR) != (vcpu->arch.msr & MSR_PR)) + if ((new_msr & MSR_PR) != (vcpu->arch.shared->msr & MSR_PR)) kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR); - vcpu->arch.msr = new_msr; + vcpu->arch.shared->msr = new_msr; - if (vcpu->arch.msr & MSR_WE) { + if (vcpu->arch.shared->msr & MSR_WE) { kvm_vcpu_block(vcpu); kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); }; diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index cbc790e..1260f5f 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -31,8 +31,8 @@ static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu) { - vcpu->arch.pc = vcpu->arch.srr0; - kvmppc_set_msr(vcpu, vcpu->arch.srr1); + vcpu->arch.pc = vcpu->arch.shared->srr0; + kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1); } int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, @@ -62,7 +62,7 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, case OP_31_XOP_MFMSR: rt = get_rt(inst); - kvmppc_set_gpr(vcpu, rt, vcpu->arch.msr); + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->msr); kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS); break; @@ -74,13 +74,13 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, case OP_31_XOP_WRTEE: rs = get_rs(inst); - vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE) + vcpu->arch.shared->msr = (vcpu->arch.shared->msr & ~MSR_EE) | (kvmppc_get_gpr(vcpu, rs) & MSR_EE); kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS); break; case OP_31_XOP_WRTEEI: - vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE) + vcpu->arch.shared->msr = (vcpu->arch.shared->msr & ~MSR_EE) | (inst & MSR_EE); kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS); break; @@ -105,7 +105,7 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) switch (sprn) { case SPRN_DEAR: - vcpu->arch.dear = spr_val; break; + vcpu->arch.shared->dar = spr_val; break; case SPRN_ESR: vcpu->arch.esr = spr_val; break; case SPRN_DBCR0: @@ -200,7 +200,7 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) case SPRN_IVPR: kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivpr); break; case SPRN_DEAR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.dear); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar); break; case SPRN_ESR: kvmppc_set_gpr(vcpu, rt, vcpu->arch.esr); break; case SPRN_DBCR0: diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index 380a78c..0498469 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -415,7 +415,8 @@ lightweight_exit: lwz r8, VCPU_GPR(r8)(r4) lwz r3, VCPU_PC(r4) mtsrr0 r3 - lwz r3, VCPU_MSR(r4) + lwz r3, VCPU_SHARED(r4) + lwz r3, VCPU_SHARED_MSR(r3) oris r3, r3, KVMPPC_MSR_MASK@h ori r3, r3, KVMPPC_MSR_MASK@l mtsrr1 r3 diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index e8a00b0..71750f2 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -117,8 +117,14 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) if (err) goto uninit_vcpu; + vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO); + if (!vcpu->arch.shared) + goto uninit_tlb; + return vcpu; +uninit_tlb: + kvmppc_e500_tlb_uninit(vcpu_e500); uninit_vcpu: kvm_vcpu_uninit(vcpu); free_vcpu: @@ -131,6 +137,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + free_page((unsigned long)vcpu->arch.shared); kvmppc_e500_tlb_uninit(vcpu_e500); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vcpu_e500); diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 21011e1..d6d6d47 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -226,8 +226,7 @@ static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel); stlbe->mas1 = 0; - trace_kvm_stlb_inval(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2, - stlbe->mas3, stlbe->mas7); + trace_kvm_stlb_inval(index_of(tlbsel, esel)); } static void kvmppc_e500_tlb1_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, @@ -298,7 +297,8 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, /* Get reference to new page. */ new_page = gfn_to_page(vcpu_e500->vcpu.kvm, gfn); if (is_error_page(new_page)) { - printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn); + printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", + (long)gfn); kvm_release_page_clean(new_page); return; } @@ -314,10 +314,10 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, | MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID; stlbe->mas2 = (gvaddr & MAS2_EPN) | e500_shadow_mas2_attrib(gtlbe->mas2, - vcpu_e500->vcpu.arch.msr & MSR_PR); + vcpu_e500->vcpu.arch.shared->msr & MSR_PR); stlbe->mas3 = (hpaddr & MAS3_RPN) | e500_shadow_mas3_attrib(gtlbe->mas3, - vcpu_e500->vcpu.arch.msr & MSR_PR); + vcpu_e500->vcpu.arch.shared->msr & MSR_PR); stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN; trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2, @@ -576,28 +576,28 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) { - unsigned int as = !!(vcpu->arch.msr & MSR_IS); + unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS); return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as); } int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) { - unsigned int as = !!(vcpu->arch.msr & MSR_DS); + unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS); return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as); } void kvmppc_mmu_itlb_miss(struct kvm_vcpu *vcpu) { - unsigned int as = !!(vcpu->arch.msr & MSR_IS); + unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS); kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.pc, as); } void kvmppc_mmu_dtlb_miss(struct kvm_vcpu *vcpu) { - unsigned int as = !!(vcpu->arch.msr & MSR_DS); + unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS); kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.fault_dear, as); } diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h index d28e301..458946b 100644 --- a/arch/powerpc/kvm/e500_tlb.h +++ b/arch/powerpc/kvm/e500_tlb.h @@ -171,7 +171,7 @@ static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu, /* Does it match current guest AS? */ /* XXX what about IS != DS? */ - if (get_tlb_ts(tlbe) != !!(vcpu->arch.msr & MSR_IS)) + if (get_tlb_ts(tlbe) != !!(vcpu->arch.shared->msr & MSR_IS)) return 0; gpa = get_tlb_raddr(tlbe); diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index b83ba58..c64fd29 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -242,9 +242,11 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) switch (sprn) { case SPRN_SRR0: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.srr0); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->srr0); + break; case SPRN_SRR1: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.srr1); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->srr1); + break; case SPRN_PVR: kvmppc_set_gpr(vcpu, rt, vcpu->arch.pvr); break; case SPRN_PIR: @@ -261,13 +263,17 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) kvmppc_set_gpr(vcpu, rt, get_tb()); break; case SPRN_SPRG0: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg0); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg0); + break; case SPRN_SPRG1: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg1); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg1); + break; case SPRN_SPRG2: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg2); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg2); + break; case SPRN_SPRG3: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg3); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg3); + break; /* Note: SPRG4-7 are user-readable, so we don't get * a trap. */ @@ -320,9 +326,11 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) rs = get_rs(inst); switch (sprn) { case SPRN_SRR0: - vcpu->arch.srr0 = kvmppc_get_gpr(vcpu, rs); break; + vcpu->arch.shared->srr0 = kvmppc_get_gpr(vcpu, rs); + break; case SPRN_SRR1: - vcpu->arch.srr1 = kvmppc_get_gpr(vcpu, rs); break; + vcpu->arch.shared->srr1 = kvmppc_get_gpr(vcpu, rs); + break; /* XXX We need to context-switch the timebase for * watchdog and FIT. */ @@ -337,13 +345,17 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) break; case SPRN_SPRG0: - vcpu->arch.sprg0 = kvmppc_get_gpr(vcpu, rs); break; + vcpu->arch.shared->sprg0 = kvmppc_get_gpr(vcpu, rs); + break; case SPRN_SPRG1: - vcpu->arch.sprg1 = kvmppc_get_gpr(vcpu, rs); break; + vcpu->arch.shared->sprg1 = kvmppc_get_gpr(vcpu, rs); + break; case SPRN_SPRG2: - vcpu->arch.sprg2 = kvmppc_get_gpr(vcpu, rs); break; + vcpu->arch.shared->sprg2 = kvmppc_get_gpr(vcpu, rs); + break; case SPRN_SPRG3: - vcpu->arch.sprg3 = kvmppc_get_gpr(vcpu, rs); break; + vcpu->arch.shared->sprg3 = kvmppc_get_gpr(vcpu, rs); + break; default: emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, rs); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 72a4ad8..2f87a16 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -38,9 +38,56 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { - return !(v->arch.msr & MSR_WE) || !!(v->arch.pending_exceptions); + return !(v->arch.shared->msr & MSR_WE) || + !!(v->arch.pending_exceptions); } +int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) +{ + int nr = kvmppc_get_gpr(vcpu, 11); + int r; + unsigned long __maybe_unused param1 = kvmppc_get_gpr(vcpu, 3); + unsigned long __maybe_unused param2 = kvmppc_get_gpr(vcpu, 4); + unsigned long __maybe_unused param3 = kvmppc_get_gpr(vcpu, 5); + unsigned long __maybe_unused param4 = kvmppc_get_gpr(vcpu, 6); + unsigned long r2 = 0; + + if (!(vcpu->arch.shared->msr & MSR_SF)) { + /* 32 bit mode */ + param1 &= 0xffffffff; + param2 &= 0xffffffff; + param3 &= 0xffffffff; + param4 &= 0xffffffff; + } + + switch (nr) { + case HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE: + { + vcpu->arch.magic_page_pa = param1; + vcpu->arch.magic_page_ea = param2; + + r2 = KVM_MAGIC_FEAT_SR; + + r = HC_EV_SUCCESS; + break; + } + case HC_VENDOR_KVM | KVM_HC_FEATURES: + r = HC_EV_SUCCESS; +#if defined(CONFIG_PPC_BOOK3S) /* XXX Missing magic page on BookE */ + r2 |= (1 << KVM_FEATURE_MAGIC_PAGE); +#endif + + /* Second return value is in r4 */ + break; + default: + r = HC_EV_UNIMPLEMENTED; + break; + } + + kvmppc_set_gpr(vcpu, 4, r2); + + return r; +} int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -145,8 +192,10 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PPC_SEGSTATE: case KVM_CAP_PPC_PAIRED_SINGLES: case KVM_CAP_PPC_UNSET_IRQ: + case KVM_CAP_PPC_IRQ_LEVEL: case KVM_CAP_ENABLE_CAP: case KVM_CAP_PPC_OSI: + case KVM_CAP_PPC_GET_PVINFO: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -534,16 +583,53 @@ out: return r; } +static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) +{ + u32 inst_lis = 0x3c000000; + u32 inst_ori = 0x60000000; + u32 inst_nop = 0x60000000; + u32 inst_sc = 0x44000002; + u32 inst_imm_mask = 0xffff; + + /* + * The hypercall to get into KVM from within guest context is as + * follows: + * + * lis r0, r0, KVM_SC_MAGIC_R0@h + * ori r0, KVM_SC_MAGIC_R0@l + * sc + * nop + */ + pvinfo->hcall[0] = inst_lis | ((KVM_SC_MAGIC_R0 >> 16) & inst_imm_mask); + pvinfo->hcall[1] = inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask); + pvinfo->hcall[2] = inst_sc; + pvinfo->hcall[3] = inst_nop; + + return 0; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { + void __user *argp = (void __user *)arg; long r; switch (ioctl) { + case KVM_PPC_GET_PVINFO: { + struct kvm_ppc_pvinfo pvinfo; + r = kvm_vm_ioctl_get_pvinfo(&pvinfo); + if (copy_to_user(argp, &pvinfo, sizeof(pvinfo))) { + r = -EFAULT; + goto out; + } + + break; + } default: r = -ENOTTY; } +out: return r; } diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h index a8e8400..3aca1b0 100644 --- a/arch/powerpc/kvm/trace.h +++ b/arch/powerpc/kvm/trace.h @@ -98,6 +98,245 @@ TRACE_EVENT(kvm_gtlb_write, __entry->word1, __entry->word2) ); + +/************************************************************************* + * Book3S trace points * + *************************************************************************/ + +#ifdef CONFIG_PPC_BOOK3S + +TRACE_EVENT(kvm_book3s_exit, + TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), + TP_ARGS(exit_nr, vcpu), + + TP_STRUCT__entry( + __field( unsigned int, exit_nr ) + __field( unsigned long, pc ) + __field( unsigned long, msr ) + __field( unsigned long, dar ) + __field( unsigned long, srr1 ) + ), + + TP_fast_assign( + __entry->exit_nr = exit_nr; + __entry->pc = kvmppc_get_pc(vcpu); + __entry->dar = kvmppc_get_fault_dar(vcpu); + __entry->msr = vcpu->arch.shared->msr; + __entry->srr1 = to_svcpu(vcpu)->shadow_srr1; + ), + + TP_printk("exit=0x%x | pc=0x%lx | msr=0x%lx | dar=0x%lx | srr1=0x%lx", + __entry->exit_nr, __entry->pc, __entry->msr, __entry->dar, + __entry->srr1) +); + +TRACE_EVENT(kvm_book3s_reenter, + TP_PROTO(int r, struct kvm_vcpu *vcpu), + TP_ARGS(r, vcpu), + + TP_STRUCT__entry( + __field( unsigned int, r ) + __field( unsigned long, pc ) + ), + + TP_fast_assign( + __entry->r = r; + __entry->pc = kvmppc_get_pc(vcpu); + ), + + TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc) +); + +#ifdef CONFIG_PPC_BOOK3S_64 + +TRACE_EVENT(kvm_book3s_64_mmu_map, + TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr, + struct kvmppc_pte *orig_pte), + TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte), + + TP_STRUCT__entry( + __field( unsigned char, flag_w ) + __field( unsigned char, flag_x ) + __field( unsigned long, eaddr ) + __field( unsigned long, hpteg ) + __field( unsigned long, va ) + __field( unsigned long long, vpage ) + __field( unsigned long, hpaddr ) + ), + + TP_fast_assign( + __entry->flag_w = ((rflags & HPTE_R_PP) == 3) ? '-' : 'w'; + __entry->flag_x = (rflags & HPTE_R_N) ? '-' : 'x'; + __entry->eaddr = orig_pte->eaddr; + __entry->hpteg = hpteg; + __entry->va = va; + __entry->vpage = orig_pte->vpage; + __entry->hpaddr = hpaddr; + ), + + TP_printk("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx", + __entry->flag_w, __entry->flag_x, __entry->eaddr, + __entry->hpteg, __entry->va, __entry->vpage, __entry->hpaddr) +); + +#endif /* CONFIG_PPC_BOOK3S_64 */ + +TRACE_EVENT(kvm_book3s_mmu_map, + TP_PROTO(struct hpte_cache *pte), + TP_ARGS(pte), + + TP_STRUCT__entry( + __field( u64, host_va ) + __field( u64, pfn ) + __field( ulong, eaddr ) + __field( u64, vpage ) + __field( ulong, raddr ) + __field( int, flags ) + ), + + TP_fast_assign( + __entry->host_va = pte->host_va; + __entry->pfn = pte->pfn; + __entry->eaddr = pte->pte.eaddr; + __entry->vpage = pte->pte.vpage; + __entry->raddr = pte->pte.raddr; + __entry->flags = (pte->pte.may_read ? 0x4 : 0) | + (pte->pte.may_write ? 0x2 : 0) | + (pte->pte.may_execute ? 0x1 : 0); + ), + + TP_printk("Map: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", + __entry->host_va, __entry->pfn, __entry->eaddr, + __entry->vpage, __entry->raddr, __entry->flags) +); + +TRACE_EVENT(kvm_book3s_mmu_invalidate, + TP_PROTO(struct hpte_cache *pte), + TP_ARGS(pte), + + TP_STRUCT__entry( + __field( u64, host_va ) + __field( u64, pfn ) + __field( ulong, eaddr ) + __field( u64, vpage ) + __field( ulong, raddr ) + __field( int, flags ) + ), + + TP_fast_assign( + __entry->host_va = pte->host_va; + __entry->pfn = pte->pfn; + __entry->eaddr = pte->pte.eaddr; + __entry->vpage = pte->pte.vpage; + __entry->raddr = pte->pte.raddr; + __entry->flags = (pte->pte.may_read ? 0x4 : 0) | + (pte->pte.may_write ? 0x2 : 0) | + (pte->pte.may_execute ? 0x1 : 0); + ), + + TP_printk("Flush: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", + __entry->host_va, __entry->pfn, __entry->eaddr, + __entry->vpage, __entry->raddr, __entry->flags) +); + +TRACE_EVENT(kvm_book3s_mmu_flush, + TP_PROTO(const char *type, struct kvm_vcpu *vcpu, unsigned long long p1, + unsigned long long p2), + TP_ARGS(type, vcpu, p1, p2), + + TP_STRUCT__entry( + __field( int, count ) + __field( unsigned long long, p1 ) + __field( unsigned long long, p2 ) + __field( const char *, type ) + ), + + TP_fast_assign( + __entry->count = vcpu->arch.hpte_cache_count; + __entry->p1 = p1; + __entry->p2 = p2; + __entry->type = type; + ), + + TP_printk("Flush %d %sPTEs: %llx - %llx", + __entry->count, __entry->type, __entry->p1, __entry->p2) +); + +TRACE_EVENT(kvm_book3s_slb_found, + TP_PROTO(unsigned long long gvsid, unsigned long long hvsid), + TP_ARGS(gvsid, hvsid), + + TP_STRUCT__entry( + __field( unsigned long long, gvsid ) + __field( unsigned long long, hvsid ) + ), + + TP_fast_assign( + __entry->gvsid = gvsid; + __entry->hvsid = hvsid; + ), + + TP_printk("%llx -> %llx", __entry->gvsid, __entry->hvsid) +); + +TRACE_EVENT(kvm_book3s_slb_fail, + TP_PROTO(u16 sid_map_mask, unsigned long long gvsid), + TP_ARGS(sid_map_mask, gvsid), + + TP_STRUCT__entry( + __field( unsigned short, sid_map_mask ) + __field( unsigned long long, gvsid ) + ), + + TP_fast_assign( + __entry->sid_map_mask = sid_map_mask; + __entry->gvsid = gvsid; + ), + + TP_printk("%x/%x: %llx", __entry->sid_map_mask, + SID_MAP_MASK - __entry->sid_map_mask, __entry->gvsid) +); + +TRACE_EVENT(kvm_book3s_slb_map, + TP_PROTO(u16 sid_map_mask, unsigned long long gvsid, + unsigned long long hvsid), + TP_ARGS(sid_map_mask, gvsid, hvsid), + + TP_STRUCT__entry( + __field( unsigned short, sid_map_mask ) + __field( unsigned long long, guest_vsid ) + __field( unsigned long long, host_vsid ) + ), + + TP_fast_assign( + __entry->sid_map_mask = sid_map_mask; + __entry->guest_vsid = gvsid; + __entry->host_vsid = hvsid; + ), + + TP_printk("%x: %llx -> %llx", __entry->sid_map_mask, + __entry->guest_vsid, __entry->host_vsid) +); + +TRACE_EVENT(kvm_book3s_slbmte, + TP_PROTO(u64 slb_vsid, u64 slb_esid), + TP_ARGS(slb_vsid, slb_esid), + + TP_STRUCT__entry( + __field( u64, slb_vsid ) + __field( u64, slb_esid ) + ), + + TP_fast_assign( + __entry->slb_vsid = slb_vsid; + __entry->slb_esid = slb_esid; + ), + + TP_printk("%llx, %llx", __entry->slb_vsid, __entry->slb_esid) +); + +#endif /* CONFIG_PPC_BOOK3S */ + #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index 81c9208..956154f 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -21,6 +21,16 @@ source "arch/powerpc/platforms/44x/Kconfig" source "arch/powerpc/platforms/40x/Kconfig" source "arch/powerpc/platforms/amigaone/Kconfig" +config KVM_GUEST + bool "KVM Guest support" + default y + ---help--- + This option enables various optimizations for running under the KVM + hypervisor. Overhead for the kernel when not running inside KVM should + be minimal. + + In case of doubt, say Y + config PPC_NATIVE bool depends on 6xx || PPC64 diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 42e512b..287d7bb 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -5,6 +5,7 @@ header-y += chsc.h header-y += cmb.h header-y += dasd.h header-y += debug.h +header-y += kvm_virtio.h header-y += monwriter.h header-y += qeth.h header-y += schid.h diff --git a/arch/s390/include/asm/kvm_virtio.h b/arch/s390/include/asm/kvm_virtio.h index acdfdff..72f6141 100644 --- a/arch/s390/include/asm/kvm_virtio.h +++ b/arch/s390/include/asm/kvm_virtio.h @@ -54,4 +54,11 @@ struct kvm_vqconfig { * This is pagesize for historical reasons. */ #define KVM_S390_VIRTIO_RING_ALIGN 4096 + +/* These values are supposed to be in ext_params on an interrupt */ +#define VIRTIO_PARAM_MASK 0xff +#define VIRTIO_PARAM_VRING_INTERRUPT 0x0 +#define VIRTIO_PARAM_CONFIG_CHANGED 0x1 +#define VIRTIO_PARAM_DEV_ADD 0x2 + #endif diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 1f99ecf..b36c6b3 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -139,6 +139,7 @@ struct x86_emulate_ops { void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); + void (*get_idt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); int (*cpl)(struct kvm_vcpu *vcpu); @@ -156,7 +157,10 @@ struct operand { unsigned long orig_val; u64 orig_val64; }; - unsigned long *ptr; + union { + unsigned long *reg; + unsigned long mem; + } addr; union { unsigned long val; u64 val64; @@ -190,6 +194,7 @@ struct decode_cache { bool has_seg_override; u8 seg_override; unsigned int d; + int (*execute)(struct x86_emulate_ctxt *ctxt); unsigned long regs[NR_VCPU_REGS]; unsigned long eip; /* modrm */ @@ -197,17 +202,16 @@ struct decode_cache { u8 modrm_mod; u8 modrm_reg; u8 modrm_rm; - u8 use_modrm_ea; + u8 modrm_seg; bool rip_relative; - unsigned long modrm_ea; - void *modrm_ptr; - unsigned long modrm_val; struct fetch_cache fetch; struct read_cache io_read; struct read_cache mem_read; }; struct x86_emulate_ctxt { + struct x86_emulate_ops *ops; + /* Register state before/after emulation. */ struct kvm_vcpu *vcpu; @@ -220,12 +224,11 @@ struct x86_emulate_ctxt { /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; - bool restart; /* restart string instruction after writeback */ + bool perm_ok; /* do not check permissions if true */ int exception; /* exception that happens during emulation or -1 */ u32 error_code; /* error code for exception */ bool error_code_valid; - unsigned long cr2; /* faulted address in case of #PF */ /* decode cache */ struct decode_cache decode; @@ -249,13 +252,14 @@ struct x86_emulate_ctxt { #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 #endif -int x86_decode_insn(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops); -int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops); +int x86_decode_insn(struct x86_emulate_ctxt *ctxt); +#define EMULATION_FAILED -1 +#define EMULATION_OK 0 +#define EMULATION_RESTART 1 +int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, u16 tss_selector, int reason, bool has_error_code, u32 error_code); - +int emulate_int_real(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int irq); #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c52e2eb..9e6fe39 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -236,10 +236,14 @@ struct kvm_pio_request { */ struct kvm_mmu { void (*new_cr3)(struct kvm_vcpu *vcpu); + void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); + unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); + void (*inject_page_fault)(struct kvm_vcpu *vcpu); void (*free)(struct kvm_vcpu *vcpu); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, u32 *error); + gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); void (*prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page); int (*sync_page)(struct kvm_vcpu *vcpu, @@ -249,13 +253,18 @@ struct kvm_mmu { int root_level; int shadow_root_level; union kvm_mmu_page_role base_role; + bool direct_map; u64 *pae_root; + u64 *lm_root; u64 rsvd_bits_mask[2][4]; + + bool nx; + + u64 pdptrs[4]; /* pae */ }; struct kvm_vcpu_arch { - u64 host_tsc; /* * rip and regs accesses must go through * kvm_{register,rip}_{read,write} functions. @@ -272,7 +281,6 @@ struct kvm_vcpu_arch { unsigned long cr4_guest_owned_bits; unsigned long cr8; u32 hflags; - u64 pdptrs[4]; /* pae */ u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ @@ -282,7 +290,41 @@ struct kvm_vcpu_arch { u64 ia32_misc_enable_msr; bool tpr_access_reporting; + /* + * Paging state of the vcpu + * + * If the vcpu runs in guest mode with two level paging this still saves + * the paging mode of the l1 guest. This context is always used to + * handle faults. + */ struct kvm_mmu mmu; + + /* + * Paging state of an L2 guest (used for nested npt) + * + * This context will save all necessary information to walk page tables + * of the an L2 guest. This context is only initialized for page table + * walking and not for faulting since we never handle l2 page faults on + * the host. + */ + struct kvm_mmu nested_mmu; + + /* + * Pointer to the mmu context currently used for + * gva_to_gpa translations. + */ + struct kvm_mmu *walk_mmu; + + /* + * This struct is filled with the necessary information to propagate a + * page fault into the guest + */ + struct { + u64 address; + unsigned error_code; + bool nested; + } fault; + /* only needed in kvm_pv_mmu_op() path, but it's hot so * put it here to avoid allocation */ struct kvm_pv_mmu_op_buffer mmu_op_buffer; @@ -336,9 +378,15 @@ struct kvm_vcpu_arch { gpa_t time; struct pvclock_vcpu_time_info hv_clock; - unsigned int hv_clock_tsc_khz; + unsigned int hw_tsc_khz; unsigned int time_offset; struct page *time_page; + u64 last_host_tsc; + u64 last_guest_tsc; + u64 last_kernel_ns; + u64 last_tsc_nsec; + u64 last_tsc_write; + bool tsc_catchup; bool nmi_pending; bool nmi_injected; @@ -367,9 +415,9 @@ struct kvm_vcpu_arch { }; struct kvm_arch { - unsigned int n_free_mmu_pages; + unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; - unsigned int n_alloc_mmu_pages; + unsigned int n_max_mmu_pages; atomic_t invlpg_counter; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* @@ -394,8 +442,14 @@ struct kvm_arch { gpa_t ept_identity_map_addr; unsigned long irq_sources_bitmap; - u64 vm_init_tsc; s64 kvmclock_offset; + spinlock_t tsc_write_lock; + u64 last_tsc_nsec; + u64 last_tsc_offset; + u64 last_tsc_write; + u32 virtual_tsc_khz; + u32 virtual_tsc_mult; + s8 virtual_tsc_shift; struct kvm_xen_hvm_config xen_hvm_config; @@ -505,6 +559,7 @@ struct kvm_x86_ops { void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, bool has_error_code, u32 error_code, bool reinject); + void (*cancel_injection)(struct kvm_vcpu *vcpu); int (*interrupt_allowed)(struct kvm_vcpu *vcpu); int (*nmi_allowed)(struct kvm_vcpu *vcpu); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); @@ -517,11 +572,16 @@ struct kvm_x86_ops { u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); + void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment); + + void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); bool (*has_wbinvd_exit)(void); + void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); + const struct trace_print_flags *exit_reasons_str; }; @@ -544,7 +604,7 @@ void kvm_mmu_zap_all(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); +int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); @@ -608,8 +668,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, - u32 error_code); +void kvm_inject_page_fault(struct kvm_vcpu *vcpu); +int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gfn_t gfn, void *data, int offset, int len, + u32 access); +void kvm_propagate_fault(struct kvm_vcpu *vcpu); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); int kvm_pic_set_irq(void *opaque, int irq, int level); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 05eba5e..7b562b6 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -158,6 +158,12 @@ static inline unsigned int kvm_arch_para_features(void) return cpuid_eax(KVM_CPUID_FEATURES); } +#ifdef CONFIG_KVM_GUEST +void __init kvm_guest_init(void); +#else +#define kvm_guest_init() do { } while (0) #endif +#endif /* __KERNEL__ */ + #endif /* _ASM_X86_KVM_PARA_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 986f779..83c4bb1 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -198,6 +198,7 @@ #define MSR_IA32_TSC 0x00000010 #define MSR_IA32_PLATFORM_ID 0x00000017 #define MSR_IA32_EBL_CR_POWERON 0x0000002a +#define MSR_EBC_FREQUENCY_ID 0x0000002c #define MSR_IA32_FEATURE_CONTROL 0x0000003a #define FEATURE_CONTROL_LOCKED (1<<0) diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index cd02f32..7f7e577 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -12,4 +12,42 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall, struct pvclock_vcpu_time_info *vcpu, struct timespec *ts); +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif defined(__x86_64__) + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! +#endif + + return product; +} + #endif /* _ASM_X86_PVCLOCK_H */ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index eb9b76c..ca43ce3 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -128,13 +128,15 @@ static struct clocksource kvm_clock = { static int kvm_register_clock(char *txt) { int cpu = smp_processor_id(); - int low, high; + int low, high, ret; + low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); + ret = native_write_msr_safe(msr_kvm_system_time, low, high); printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", cpu, high, low, txt); - return native_write_msr_safe(msr_kvm_system_time, low, high); + return ret; } #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 239427c..bab3b9e 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -82,7 +82,8 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) { u64 delta = native_read_tsc() - shadow->tsc_timestamp; - return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); + return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul, + shadow->tsc_shift); } /* diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 970bbd4..ddc131f 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -64,6 +64,13 @@ config KVM_AMD To compile this as a module, choose M here: the module will be called kvm-amd. +config KVM_MMU_AUDIT + bool "Audit KVM MMU" + depends on KVM && TRACEPOINTS + ---help--- + This option adds a R/W kVM module parameter 'mmu_audit', which allows + audit KVM MMU at runtime. + # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/vhost/Kconfig diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 66ca98a..38b6e8d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -9,7 +9,7 @@ * privileged instructions: * * Copyright (C) 2006 Qumranet - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> @@ -51,13 +51,13 @@ #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ #define DstReg (2<<1) /* Register operand. */ #define DstMem (3<<1) /* Memory operand. */ -#define DstAcc (4<<1) /* Destination Accumulator */ +#define DstAcc (4<<1) /* Destination Accumulator */ #define DstDI (5<<1) /* Destination is in ES:(E)DI */ #define DstMem64 (6<<1) /* 64bit memory operand */ +#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */ #define DstMask (7<<1) /* Source operand type. */ #define SrcNone (0<<4) /* No source operand. */ -#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */ #define SrcReg (1<<4) /* Register operand. */ #define SrcMem (2<<4) /* Memory operand. */ #define SrcMem16 (3<<4) /* Memory operand (16-bit). */ @@ -71,6 +71,7 @@ #define SrcImmFAddr (0xb<<4) /* Source is immediate far address */ #define SrcMemFAddr (0xc<<4) /* Source is far address in memory */ #define SrcAcc (0xd<<4) /* Source Accumulator */ +#define SrcImmU16 (0xe<<4) /* Immediate operand, unsigned, 16 bits */ #define SrcMask (0xf<<4) /* Generic ModRM decode. */ #define ModRM (1<<8) @@ -82,8 +83,10 @@ #define Stack (1<<13) /* Stack instruction (push/pop) */ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ -#define GroupMask 0xff /* Group number stored in bits 0:7 */ /* Misc flags */ +#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ +#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ +#define Undefined (1<<25) /* No Such Instruction */ #define Lock (1<<26) /* lock prefix is allowed for the instruction */ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) @@ -92,285 +95,30 @@ #define Src2CL (1<<29) #define Src2ImmByte (2<<29) #define Src2One (3<<29) +#define Src2Imm (4<<29) #define Src2Mask (7<<29) -enum { - Group1_80, Group1_81, Group1_82, Group1_83, - Group1A, Group3_Byte, Group3, Group4, Group5, Group7, - Group8, Group9, +#define X2(x...) x, x +#define X3(x...) X2(x), x +#define X4(x...) X2(x), X2(x) +#define X5(x...) X4(x), x +#define X6(x...) X4(x), X2(x) +#define X7(x...) X4(x), X3(x) +#define X8(x...) X4(x), X4(x) +#define X16(x...) X8(x), X8(x) + +struct opcode { + u32 flags; + union { + int (*execute)(struct x86_emulate_ctxt *ctxt); + struct opcode *group; + struct group_dual *gdual; + } u; }; -static u32 opcode_table[256] = { - /* 0x00 - 0x07 */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, - /* 0x08 - 0x0F */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - ImplicitOps | Stack | No64, 0, - /* 0x10 - 0x17 */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, - /* 0x18 - 0x1F */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, - /* 0x20 - 0x27 */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, - /* 0x28 - 0x2F */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, - /* 0x30 - 0x37 */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, - /* 0x38 - 0x3F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - 0, 0, - /* 0x40 - 0x47 */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, - /* 0x48 - 0x4F */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, - /* 0x50 - 0x57 */ - SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, - SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, - /* 0x58 - 0x5F */ - DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, - DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, - /* 0x60 - 0x67 */ - ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, - 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , - 0, 0, 0, 0, - /* 0x68 - 0x6F */ - SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, - DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */ - SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */ - /* 0x70 - 0x77 */ - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - /* 0x78 - 0x7F */ - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - /* 0x80 - 0x87 */ - Group | Group1_80, Group | Group1_81, - Group | Group1_82, Group | Group1_83, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - /* 0x88 - 0x8F */ - ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, - ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstMem | SrcNone | ModRM | Mov, ModRM | DstReg, - ImplicitOps | SrcMem16 | ModRM, Group | Group1A, - /* 0x90 - 0x97 */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, - /* 0x98 - 0x9F */ - 0, 0, SrcImmFAddr | No64, 0, - ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, - /* 0xA0 - 0xA7 */ - ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs, - ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs, - ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, - ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, - /* 0xA8 - 0xAF */ - DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String, - ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, - ByteOp | DstDI | String, DstDI | String, - /* 0xB0 - 0xB7 */ - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - /* 0xB8 - 0xBF */ - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - /* 0xC0 - 0xC7 */ - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, - 0, ImplicitOps | Stack, 0, 0, - ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, - /* 0xC8 - 0xCF */ - 0, 0, 0, ImplicitOps | Stack, - ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps, - /* 0xD0 - 0xD7 */ - ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, - ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, - 0, 0, 0, 0, - /* 0xD8 - 0xDF */ - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE0 - 0xE7 */ - 0, 0, 0, 0, - ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, - ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, - /* 0xE8 - 0xEF */ - SrcImm | Stack, SrcImm | ImplicitOps, - SrcImmFAddr | No64, SrcImmByte | ImplicitOps, - SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, - SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, - /* 0xF0 - 0xF7 */ - 0, 0, 0, 0, - ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, - /* 0xF8 - 0xFF */ - ImplicitOps, 0, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, -}; - -static u32 twobyte_table[256] = { - /* 0x00 - 0x0F */ - 0, Group | GroupDual | Group7, 0, 0, - 0, ImplicitOps, ImplicitOps | Priv, 0, - ImplicitOps | Priv, ImplicitOps | Priv, 0, 0, - 0, ImplicitOps | ModRM, 0, 0, - /* 0x10 - 0x1F */ - 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, - /* 0x20 - 0x2F */ - ModRM | ImplicitOps | Priv, ModRM | Priv, - ModRM | ImplicitOps | Priv, ModRM | Priv, - 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x30 - 0x3F */ - ImplicitOps | Priv, 0, ImplicitOps | Priv, 0, - ImplicitOps, ImplicitOps | Priv, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x40 - 0x47 */ - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - /* 0x48 - 0x4F */ - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - /* 0x50 - 0x5F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x60 - 0x6F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x70 - 0x7F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x80 - 0x8F */ - SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, - SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, - /* 0x90 - 0x9F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xA0 - 0xA7 */ - ImplicitOps | Stack, ImplicitOps | Stack, - 0, DstMem | SrcReg | ModRM | BitOp, - DstMem | SrcReg | Src2ImmByte | ModRM, - DstMem | SrcReg | Src2CL | ModRM, 0, 0, - /* 0xA8 - 0xAF */ - ImplicitOps | Stack, ImplicitOps | Stack, - 0, DstMem | SrcReg | ModRM | BitOp | Lock, - DstMem | SrcReg | Src2ImmByte | ModRM, - DstMem | SrcReg | Src2CL | ModRM, - ModRM, 0, - /* 0xB0 - 0xB7 */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - 0, DstMem | SrcReg | ModRM | BitOp | Lock, - 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem16 | ModRM | Mov, - /* 0xB8 - 0xBF */ - 0, 0, - Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock, - 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem16 | ModRM | Mov, - /* 0xC0 - 0xCF */ - 0, 0, 0, DstMem | SrcReg | ModRM | Mov, - 0, 0, 0, Group | GroupDual | Group9, - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xD0 - 0xDF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE0 - 0xEF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xF0 - 0xFF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -static u32 group_table[] = { - [Group1_80*8] = - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM, - [Group1_81*8] = - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM, - [Group1_82*8] = - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64, - [Group1_83*8] = - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM, - [Group1A*8] = - DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, - [Group3_Byte*8] = - ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM, - ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, - 0, 0, 0, 0, - [Group3*8] = - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, - 0, 0, 0, 0, - [Group4*8] = - ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, - 0, 0, 0, 0, 0, 0, - [Group5*8] = - DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock, - SrcMem | ModRM | Stack, 0, - SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps, - SrcMem | ModRM | Stack, 0, - [Group7*8] = - 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, - SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv, - [Group8*8] = - 0, 0, 0, 0, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, - [Group9*8] = - 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0, -}; - -static u32 group2_table[] = { - [Group7*8] = - SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv, - SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov | Priv, 0, - [Group9*8] = - 0, 0, 0, 0, 0, 0, 0, 0, +struct group_dual { + struct opcode mod012[8]; + struct opcode mod3[8]; }; /* EFLAGS bit definitions. */ @@ -392,6 +140,9 @@ static u32 group2_table[] = { #define EFLG_PF (1<<2) #define EFLG_CF (1<<0) +#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a +#define EFLG_RESERVED_ONE_MASK 2 + /* * Instruction emulation: * Most instructions are emulated directly via a fragment of inline assembly @@ -444,13 +195,13 @@ static u32 group2_table[] = { #define ON64(x) #endif -#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \ +#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \ do { \ __asm__ __volatile__ ( \ _PRE_EFLAGS("0", "4", "2") \ _op _suffix " %"_x"3,%1; " \ _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ + : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\ "=&r" (_tmp) \ : _y ((_src).val), "i" (EFLAGS_MASK)); \ } while (0) @@ -463,13 +214,13 @@ static u32 group2_table[] = { \ switch ((_dst).bytes) { \ case 2: \ - ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \ + ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\ break; \ case 4: \ - ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \ + ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\ break; \ case 8: \ - ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \ + ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \ break; \ } \ } while (0) @@ -479,7 +230,7 @@ static u32 group2_table[] = { unsigned long _tmp; \ switch ((_dst).bytes) { \ case 1: \ - ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \ + ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \ break; \ default: \ __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ @@ -566,6 +317,74 @@ static u32 group2_table[] = { } \ } while (0) +#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \ + do { \ + unsigned long _tmp; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "4", "1") \ + _op _suffix " %5; " \ + _POST_EFLAGS("0", "4", "1") \ + : "=m" (_eflags), "=&r" (_tmp), \ + "+a" (_rax), "+d" (_rdx) \ + : "i" (EFLAGS_MASK), "m" ((_src).val), \ + "a" (_rax), "d" (_rdx)); \ + } while (0) + +#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \ + do { \ + unsigned long _tmp; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "5", "1") \ + "1: \n\t" \ + _op _suffix " %6; " \ + "2: \n\t" \ + _POST_EFLAGS("0", "5", "1") \ + ".pushsection .fixup,\"ax\" \n\t" \ + "3: movb $1, %4 \n\t" \ + "jmp 2b \n\t" \ + ".popsection \n\t" \ + _ASM_EXTABLE(1b, 3b) \ + : "=m" (_eflags), "=&r" (_tmp), \ + "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \ + : "i" (EFLAGS_MASK), "m" ((_src).val), \ + "a" (_rax), "d" (_rdx)); \ + } while (0) + +/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ +#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ + do { \ + switch((_src).bytes) { \ + case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \ + case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \ + case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \ + case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \ + } \ + } while (0) + +#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \ + do { \ + switch((_src).bytes) { \ + case 1: \ + __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ + _eflags, "b", _ex); \ + break; \ + case 2: \ + __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ + _eflags, "w", _ex); \ + break; \ + case 4: \ + __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ + _eflags, "l", _ex); \ + break; \ + case 8: ON64( \ + __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ + _eflags, "q", _ex)); \ + break; \ + } \ + } while (0) + /* Fetch next part of the instruction being emulated. */ #define insn_fetch(_type, _size, _eip) \ ({ unsigned long _x; \ @@ -661,7 +480,6 @@ static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, ctxt->exception = vec; ctxt->error_code = error; ctxt->error_code_valid = valid; - ctxt->restart = false; } static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) @@ -669,11 +487,9 @@ static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) emulate_exception(ctxt, GP_VECTOR, err, true); } -static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr, - int err) +static void emulate_pf(struct x86_emulate_ctxt *ctxt) { - ctxt->cr2 = addr; - emulate_exception(ctxt, PF_VECTOR, err, true); + emulate_exception(ctxt, PF_VECTOR, 0, true); } static void emulate_ud(struct x86_emulate_ctxt *ctxt) @@ -686,6 +502,12 @@ static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err) emulate_exception(ctxt, TS_VECTOR, err, true); } +static int emulate_de(struct x86_emulate_ctxt *ctxt) +{ + emulate_exception(ctxt, DE_VECTOR, 0, false); + return X86EMUL_PROPAGATE_FAULT; +} + static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned long eip, u8 *dest) @@ -742,7 +564,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs, static int read_descriptor(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, - void *ptr, + ulong addr, u16 *size, unsigned long *address, int op_bytes) { int rc; @@ -750,12 +572,10 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, if (op_bytes == 2) op_bytes = 3; *address = 0; - rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, - ctxt->vcpu, NULL); + rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL); if (rc != X86EMUL_CONTINUE) return rc; - rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, - ctxt->vcpu, NULL); + rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL); return rc; } @@ -794,6 +614,24 @@ static int test_cc(unsigned int condition, unsigned int flags) return (!!rc ^ (condition & 1)); } +static void fetch_register_operand(struct operand *op) +{ + switch (op->bytes) { + case 1: + op->val = *(u8 *)op->addr.reg; + break; + case 2: + op->val = *(u16 *)op->addr.reg; + break; + case 4: + op->val = *(u32 *)op->addr.reg; + break; + case 8: + op->val = *(u64 *)op->addr.reg; + break; + } +} + static void decode_register_operand(struct operand *op, struct decode_cache *c, int inhibit_bytereg) @@ -805,34 +643,25 @@ static void decode_register_operand(struct operand *op, reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); op->type = OP_REG; if ((c->d & ByteOp) && !inhibit_bytereg) { - op->ptr = decode_register(reg, c->regs, highbyte_regs); - op->val = *(u8 *)op->ptr; + op->addr.reg = decode_register(reg, c->regs, highbyte_regs); op->bytes = 1; } else { - op->ptr = decode_register(reg, c->regs, 0); + op->addr.reg = decode_register(reg, c->regs, 0); op->bytes = c->op_bytes; - switch (op->bytes) { - case 2: - op->val = *(u16 *)op->ptr; - break; - case 4: - op->val = *(u32 *)op->ptr; - break; - case 8: - op->val = *(u64 *) op->ptr; - break; - } } + fetch_register_operand(op); op->orig_val = op->val; } static int decode_modrm(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) + struct x86_emulate_ops *ops, + struct operand *op) { struct decode_cache *c = &ctxt->decode; u8 sib; int index_reg = 0, base_reg = 0, scale; int rc = X86EMUL_CONTINUE; + ulong modrm_ea = 0; if (c->rex_prefix) { c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ @@ -844,16 +673,19 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, c->modrm_mod |= (c->modrm & 0xc0) >> 6; c->modrm_reg |= (c->modrm & 0x38) >> 3; c->modrm_rm |= (c->modrm & 0x07); - c->modrm_ea = 0; - c->use_modrm_ea = 1; + c->modrm_seg = VCPU_SREG_DS; if (c->modrm_mod == 3) { - c->modrm_ptr = decode_register(c->modrm_rm, + op->type = OP_REG; + op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + op->addr.reg = decode_register(c->modrm_rm, c->regs, c->d & ByteOp); - c->modrm_val = *(unsigned long *)c->modrm_ptr; + fetch_register_operand(op); return rc; } + op->type = OP_MEM; + if (c->ad_bytes == 2) { unsigned bx = c->regs[VCPU_REGS_RBX]; unsigned bp = c->regs[VCPU_REGS_RBP]; @@ -864,47 +696,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, switch (c->modrm_mod) { case 0: if (c->modrm_rm == 6) - c->modrm_ea += insn_fetch(u16, 2, c->eip); + modrm_ea += insn_fetch(u16, 2, c->eip); break; case 1: - c->modrm_ea += insn_fetch(s8, 1, c->eip); + modrm_ea += insn_fetch(s8, 1, c->eip); break; case 2: - c->modrm_ea += insn_fetch(u16, 2, c->eip); + modrm_ea += insn_fetch(u16, 2, c->eip); break; } switch (c->modrm_rm) { case 0: - c->modrm_ea += bx + si; + modrm_ea += bx + si; break; case 1: - c->modrm_ea += bx + di; + modrm_ea += bx + di; break; case 2: - c->modrm_ea += bp + si; + modrm_ea += bp + si; break; case 3: - c->modrm_ea += bp + di; + modrm_ea += bp + di; break; case 4: - c->modrm_ea += si; + modrm_ea += si; break; case 5: - c->modrm_ea += di; + modrm_ea += di; break; case 6: if (c->modrm_mod != 0) - c->modrm_ea += bp; + modrm_ea += bp; break; case 7: - c->modrm_ea += bx; + modrm_ea += bx; break; } if (c->modrm_rm == 2 || c->modrm_rm == 3 || (c->modrm_rm == 6 && c->modrm_mod != 0)) - if (!c->has_seg_override) - set_seg_override(c, VCPU_SREG_SS); - c->modrm_ea = (u16)c->modrm_ea; + c->modrm_seg = VCPU_SREG_SS; + modrm_ea = (u16)modrm_ea; } else { /* 32/64-bit ModR/M decode. */ if ((c->modrm_rm & 7) == 4) { @@ -914,410 +745,74 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, scale = sib >> 6; if ((base_reg & 7) == 5 && c->modrm_mod == 0) - c->modrm_ea += insn_fetch(s32, 4, c->eip); + modrm_ea += insn_fetch(s32, 4, c->eip); else - c->modrm_ea += c->regs[base_reg]; + modrm_ea += c->regs[base_reg]; if (index_reg != 4) - c->modrm_ea += c->regs[index_reg] << scale; + modrm_ea += c->regs[index_reg] << scale; } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { if (ctxt->mode == X86EMUL_MODE_PROT64) c->rip_relative = 1; } else - c->modrm_ea += c->regs[c->modrm_rm]; + modrm_ea += c->regs[c->modrm_rm]; switch (c->modrm_mod) { case 0: if (c->modrm_rm == 5) - c->modrm_ea += insn_fetch(s32, 4, c->eip); + modrm_ea += insn_fetch(s32, 4, c->eip); break; case 1: - c->modrm_ea += insn_fetch(s8, 1, c->eip); + modrm_ea += insn_fetch(s8, 1, c->eip); break; case 2: - c->modrm_ea += insn_fetch(s32, 4, c->eip); + modrm_ea += insn_fetch(s32, 4, c->eip); break; } } + op->addr.mem = modrm_ea; done: return rc; } static int decode_abs(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) + struct x86_emulate_ops *ops, + struct operand *op) { struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; + op->type = OP_MEM; switch (c->ad_bytes) { case 2: - c->modrm_ea = insn_fetch(u16, 2, c->eip); + op->addr.mem = insn_fetch(u16, 2, c->eip); break; case 4: - c->modrm_ea = insn_fetch(u32, 4, c->eip); + op->addr.mem = insn_fetch(u32, 4, c->eip); break; case 8: - c->modrm_ea = insn_fetch(u64, 8, c->eip); + op->addr.mem = insn_fetch(u64, 8, c->eip); break; } done: return rc; } -int -x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +static void fetch_bit_operand(struct decode_cache *c) { - struct decode_cache *c = &ctxt->decode; - int rc = X86EMUL_CONTINUE; - int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes, group; - - - /* we cannot decode insn before we complete previous rep insn */ - WARN_ON(ctxt->restart); - - c->eip = ctxt->eip; - c->fetch.start = c->fetch.end = c->eip; - ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); - - switch (mode) { - case X86EMUL_MODE_REAL: - case X86EMUL_MODE_VM86: - case X86EMUL_MODE_PROT16: - def_op_bytes = def_ad_bytes = 2; - break; - case X86EMUL_MODE_PROT32: - def_op_bytes = def_ad_bytes = 4; - break; -#ifdef CONFIG_X86_64 - case X86EMUL_MODE_PROT64: - def_op_bytes = 4; - def_ad_bytes = 8; - break; -#endif - default: - return -1; - } - - c->op_bytes = def_op_bytes; - c->ad_bytes = def_ad_bytes; - - /* Legacy prefixes. */ - for (;;) { - switch (c->b = insn_fetch(u8, 1, c->eip)) { - case 0x66: /* operand-size override */ - /* switch between 2/4 bytes */ - c->op_bytes = def_op_bytes ^ 6; - break; - case 0x67: /* address-size override */ - if (mode == X86EMUL_MODE_PROT64) - /* switch between 4/8 bytes */ - c->ad_bytes = def_ad_bytes ^ 12; - else - /* switch between 2/4 bytes */ - c->ad_bytes = def_ad_bytes ^ 6; - break; - case 0x26: /* ES override */ - case 0x2e: /* CS override */ - case 0x36: /* SS override */ - case 0x3e: /* DS override */ - set_seg_override(c, (c->b >> 3) & 3); - break; - case 0x64: /* FS override */ - case 0x65: /* GS override */ - set_seg_override(c, c->b & 7); - break; - case 0x40 ... 0x4f: /* REX */ - if (mode != X86EMUL_MODE_PROT64) - goto done_prefixes; - c->rex_prefix = c->b; - continue; - case 0xf0: /* LOCK */ - c->lock_prefix = 1; - break; - case 0xf2: /* REPNE/REPNZ */ - c->rep_prefix = REPNE_PREFIX; - break; - case 0xf3: /* REP/REPE/REPZ */ - c->rep_prefix = REPE_PREFIX; - break; - default: - goto done_prefixes; - } - - /* Any legacy prefix after a REX prefix nullifies its effect. */ - - c->rex_prefix = 0; - } - -done_prefixes: - - /* REX prefix. */ - if (c->rex_prefix) - if (c->rex_prefix & 8) - c->op_bytes = 8; /* REX.W */ - - /* Opcode byte(s). */ - c->d = opcode_table[c->b]; - if (c->d == 0) { - /* Two-byte opcode? */ - if (c->b == 0x0f) { - c->twobyte = 1; - c->b = insn_fetch(u8, 1, c->eip); - c->d = twobyte_table[c->b]; - } - } - - if (c->d & Group) { - group = c->d & GroupMask; - c->modrm = insn_fetch(u8, 1, c->eip); - --c->eip; - - group = (group << 3) + ((c->modrm >> 3) & 7); - if ((c->d & GroupDual) && (c->modrm >> 6) == 3) - c->d = group2_table[group]; - else - c->d = group_table[group]; - } - - /* Unrecognised? */ - if (c->d == 0) { - DPRINTF("Cannot emulate %02x\n", c->b); - return -1; - } - - if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) - c->op_bytes = 8; - - /* ModRM and SIB bytes. */ - if (c->d & ModRM) - rc = decode_modrm(ctxt, ops); - else if (c->d & MemAbs) - rc = decode_abs(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; - - if (!c->has_seg_override) - set_seg_override(c, VCPU_SREG_DS); - - if (!(!c->twobyte && c->b == 0x8d)) - c->modrm_ea += seg_override_base(ctxt, ops, c); - - if (c->ad_bytes != 8) - c->modrm_ea = (u32)c->modrm_ea; - - if (c->rip_relative) - c->modrm_ea += c->eip; - - /* - * Decode and fetch the source operand: register, memory - * or immediate. - */ - switch (c->d & SrcMask) { - case SrcNone: - break; - case SrcReg: - decode_register_operand(&c->src, c, 0); - break; - case SrcMem16: - c->src.bytes = 2; - goto srcmem_common; - case SrcMem32: - c->src.bytes = 4; - goto srcmem_common; - case SrcMem: - c->src.bytes = (c->d & ByteOp) ? 1 : - c->op_bytes; - /* Don't fetch the address for invlpg: it could be unmapped. */ - if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) - break; - srcmem_common: - /* - * For instructions with a ModR/M byte, switch to register - * access if Mod = 3. - */ - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->src.type = OP_REG; - c->src.val = c->modrm_val; - c->src.ptr = c->modrm_ptr; - break; - } - c->src.type = OP_MEM; - c->src.ptr = (unsigned long *)c->modrm_ea; - c->src.val = 0; - break; - case SrcImm: - case SrcImmU: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - if (c->src.bytes == 8) - c->src.bytes = 4; - /* NB. Immediates are sign-extended as necessary. */ - switch (c->src.bytes) { - case 1: - c->src.val = insn_fetch(s8, 1, c->eip); - break; - case 2: - c->src.val = insn_fetch(s16, 2, c->eip); - break; - case 4: - c->src.val = insn_fetch(s32, 4, c->eip); - break; - } - if ((c->d & SrcMask) == SrcImmU) { - switch (c->src.bytes) { - case 1: - c->src.val &= 0xff; - break; - case 2: - c->src.val &= 0xffff; - break; - case 4: - c->src.val &= 0xffffffff; - break; - } - } - break; - case SrcImmByte: - case SrcImmUByte: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = 1; - if ((c->d & SrcMask) == SrcImmByte) - c->src.val = insn_fetch(s8, 1, c->eip); - else - c->src.val = insn_fetch(u8, 1, c->eip); - break; - case SrcAcc: - c->src.type = OP_REG; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->src.ptr = &c->regs[VCPU_REGS_RAX]; - switch (c->src.bytes) { - case 1: - c->src.val = *(u8 *)c->src.ptr; - break; - case 2: - c->src.val = *(u16 *)c->src.ptr; - break; - case 4: - c->src.val = *(u32 *)c->src.ptr; - break; - case 8: - c->src.val = *(u64 *)c->src.ptr; - break; - } - break; - case SrcOne: - c->src.bytes = 1; - c->src.val = 1; - break; - case SrcSI: - c->src.type = OP_MEM; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->src.ptr = (unsigned long *) - register_address(c, seg_override_base(ctxt, ops, c), - c->regs[VCPU_REGS_RSI]); - c->src.val = 0; - break; - case SrcImmFAddr: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = c->op_bytes + 2; - insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); - break; - case SrcMemFAddr: - c->src.type = OP_MEM; - c->src.ptr = (unsigned long *)c->modrm_ea; - c->src.bytes = c->op_bytes + 2; - break; - } + long sv = 0, mask; - /* - * Decode and fetch the second source operand: register, memory - * or immediate. - */ - switch (c->d & Src2Mask) { - case Src2None: - break; - case Src2CL: - c->src2.bytes = 1; - c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; - break; - case Src2ImmByte: - c->src2.type = OP_IMM; - c->src2.ptr = (unsigned long *)c->eip; - c->src2.bytes = 1; - c->src2.val = insn_fetch(u8, 1, c->eip); - break; - case Src2One: - c->src2.bytes = 1; - c->src2.val = 1; - break; - } + if (c->dst.type == OP_MEM && c->src.type == OP_REG) { + mask = ~(c->dst.bytes * 8 - 1); - /* Decode and fetch the destination operand: register or memory. */ - switch (c->d & DstMask) { - case ImplicitOps: - /* Special instructions do their own operand decoding. */ - return 0; - case DstReg: - decode_register_operand(&c->dst, c, - c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); - break; - case DstMem: - case DstMem64: - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.type = OP_REG; - c->dst.val = c->dst.orig_val = c->modrm_val; - c->dst.ptr = c->modrm_ptr; - break; - } - c->dst.type = OP_MEM; - c->dst.ptr = (unsigned long *)c->modrm_ea; - if ((c->d & DstMask) == DstMem64) - c->dst.bytes = 8; - else - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.val = 0; - if (c->d & BitOp) { - unsigned long mask = ~(c->dst.bytes * 8 - 1); + if (c->src.bytes == 2) + sv = (s16)c->src.val & (s16)mask; + else if (c->src.bytes == 4) + sv = (s32)c->src.val & (s32)mask; - c->dst.ptr = (void *)c->dst.ptr + - (c->src.val & mask) / 8; - } - break; - case DstAcc: - c->dst.type = OP_REG; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - switch (c->dst.bytes) { - case 1: - c->dst.val = *(u8 *)c->dst.ptr; - break; - case 2: - c->dst.val = *(u16 *)c->dst.ptr; - break; - case 4: - c->dst.val = *(u32 *)c->dst.ptr; - break; - case 8: - c->dst.val = *(u64 *)c->dst.ptr; - break; - } - c->dst.orig_val = c->dst.val; - break; - case DstDI: - c->dst.type = OP_MEM; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *) - register_address(c, es_base(ctxt, ops), - c->regs[VCPU_REGS_RDI]); - c->dst.val = 0; - break; + c->dst.addr.mem += (sv >> 3); } -done: - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; } static int read_emulated(struct x86_emulate_ctxt *ctxt, @@ -1337,7 +832,7 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, ctxt->vcpu); if (rc == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, addr, err); + emulate_pf(ctxt); if (rc != X86EMUL_CONTINUE) return rc; mc->end += n; @@ -1424,7 +919,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, addr = dt.address + index * 8; ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, addr, err); + emulate_pf(ctxt); return ret; } @@ -1450,7 +945,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, addr = dt.address + index * 8; ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, addr, err); + emulate_pf(ctxt); return ret; } @@ -1573,6 +1068,25 @@ exception: return X86EMUL_PROPAGATE_FAULT; } +static void write_register_operand(struct operand *op) +{ + /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ + switch (op->bytes) { + case 1: + *(u8 *)op->addr.reg = (u8)op->val; + break; + case 2: + *(u16 *)op->addr.reg = (u16)op->val; + break; + case 4: + *op->addr.reg = (u32)op->val; + break; /* 64b: zero-extend */ + case 8: + *op->addr.reg = op->val; + break; + } +} + static inline int writeback(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1582,28 +1096,12 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, switch (c->dst.type) { case OP_REG: - /* The 4-byte case *is* correct: - * in 64-bit mode we zero-extend. - */ - switch (c->dst.bytes) { - case 1: - *(u8 *)c->dst.ptr = (u8)c->dst.val; - break; - case 2: - *(u16 *)c->dst.ptr = (u16)c->dst.val; - break; - case 4: - *c->dst.ptr = (u32)c->dst.val; - break; /* 64b: zero-ext */ - case 8: - *c->dst.ptr = c->dst.val; - break; - } + write_register_operand(&c->dst); break; case OP_MEM: if (c->lock_prefix) rc = ops->cmpxchg_emulated( - (unsigned long)c->dst.ptr, + c->dst.addr.mem, &c->dst.orig_val, &c->dst.val, c->dst.bytes, @@ -1611,14 +1109,13 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, ctxt->vcpu); else rc = ops->write_emulated( - (unsigned long)c->dst.ptr, + c->dst.addr.mem, &c->dst.val, c->dst.bytes, &err, ctxt->vcpu); if (rc == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, - (unsigned long)c->dst.ptr, err); + emulate_pf(ctxt); if (rc != X86EMUL_CONTINUE) return rc; break; @@ -1640,8 +1137,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt, c->dst.bytes = c->op_bytes; c->dst.val = c->src.val; register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops), - c->regs[VCPU_REGS_RSP]); + c->dst.addr.mem = register_address(c, ss_base(ctxt, ops), + c->regs[VCPU_REGS_RSP]); } static int emulate_pop(struct x86_emulate_ctxt *ctxt, @@ -1701,6 +1198,9 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, *(unsigned long *)dest = (ctxt->eflags & ~change_mask) | (val & change_mask); + if (rc == X86EMUL_PROPAGATE_FAULT) + emulate_pf(ctxt); + return rc; } @@ -1778,6 +1278,150 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt, return rc; } +int emulate_int_real(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int irq) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + struct desc_ptr dt; + gva_t cs_addr; + gva_t eip_addr; + u16 cs, eip; + u32 err; + + /* TODO: Add limit checks */ + c->src.val = ctxt->eflags; + emulate_push(ctxt, ops); + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); + + c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); + emulate_push(ctxt, ops); + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->src.val = c->eip; + emulate_push(ctxt, ops); + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->dst.type = OP_NONE; + + ops->get_idt(&dt, ctxt->vcpu); + + eip_addr = dt.address + (irq << 2); + cs_addr = dt.address + (irq << 2) + 2; + + rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err); + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err); + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->eip = eip; + + return rc; +} + +static int emulate_int(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int irq) +{ + switch(ctxt->mode) { + case X86EMUL_MODE_REAL: + return emulate_int_real(ctxt, ops, irq); + case X86EMUL_MODE_VM86: + case X86EMUL_MODE_PROT16: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT64: + default: + /* Protected mode interrupts unimplemented yet */ + return X86EMUL_UNHANDLEABLE; + } +} + +static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; + unsigned long temp_eip = 0; + unsigned long temp_eflags = 0; + unsigned long cs = 0; + unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF | + EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF | + EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */ + unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP; + + /* TODO: Add stack limit check */ + + rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes); + + if (rc != X86EMUL_CONTINUE) + return rc; + + if (temp_eip & ~0xffff) { + emulate_gp(ctxt, 0); + return X86EMUL_PROPAGATE_FAULT; + } + + rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); + + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes); + + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); + + if (rc != X86EMUL_CONTINUE) + return rc; + + c->eip = temp_eip; + + + if (c->op_bytes == 4) + ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); + else if (c->op_bytes == 2) { + ctxt->eflags &= ~0xffff; + ctxt->eflags |= temp_eflags; + } + + ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ + ctxt->eflags |= EFLG_RESERVED_ONE_MASK; + + return rc; +} + +static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops* ops) +{ + switch(ctxt->mode) { + case X86EMUL_MODE_REAL: + return emulate_iret_real(ctxt, ops); + case X86EMUL_MODE_VM86: + case X86EMUL_MODE_PROT16: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT64: + default: + /* iret from protected mode unimplemented yet */ + return X86EMUL_UNHANDLEABLE; + } +} + static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1819,6 +1463,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; + unsigned long *rax = &c->regs[VCPU_REGS_RAX]; + unsigned long *rdx = &c->regs[VCPU_REGS_RDX]; + u8 de = 0; switch (c->modrm_reg) { case 0 ... 1: /* test */ @@ -1830,10 +1477,26 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, case 3: /* neg */ emulate_1op("neg", c->dst, ctxt->eflags); break; + case 4: /* mul */ + emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags); + break; + case 5: /* imul */ + emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags); + break; + case 6: /* div */ + emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx, + ctxt->eflags, de); + break; + case 7: /* idiv */ + emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx, + ctxt->eflags, de); + break; default: - return 0; + return X86EMUL_UNHANDLEABLE; } - return 1; + if (de) + return emulate_de(ctxt); + return X86EMUL_CONTINUE; } static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, @@ -1905,6 +1568,23 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, return rc; } +static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) +{ + struct decode_cache *c = &ctxt->decode; + unsigned short sel; + int rc; + + memcpy(&sel, c->src.valptr + c->op_bytes, 2); + + rc = load_segment_descriptor(ctxt, ops, sel, seg); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->dst.val = c->src.val; + return rc; +} + static inline void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, struct desc_struct *cs, @@ -2160,9 +1840,15 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, u16 port, u16 len) { + if (ctxt->perm_ok) + return true; + if (emulator_bad_iopl(ctxt, ops)) if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) return false; + + ctxt->perm_ok = true; + return true; } @@ -2254,7 +1940,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, old_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -2264,7 +1950,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, old_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -2272,7 +1958,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, new_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -2285,7 +1971,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, new_tss_base, err); + emulate_pf(ctxt); return ret; } } @@ -2396,7 +2082,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, old_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -2406,7 +2092,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, old_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -2414,7 +2100,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, new_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -2427,7 +2113,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, new_tss_base, err); + emulate_pf(ctxt); return ret; } } @@ -2523,10 +2209,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, } int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { + struct x86_emulate_ops *ops = ctxt->ops; struct decode_cache *c = &ctxt->decode; int rc; @@ -2552,16 +2238,784 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; register_address_increment(c, &c->regs[reg], df * op->bytes); - op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]); + op->addr.mem = register_address(c, base, c->regs[reg]); +} + +static int em_push(struct x86_emulate_ctxt *ctxt) +{ + emulate_push(ctxt, ctxt->ops); + return X86EMUL_CONTINUE; +} + +static int em_das(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + u8 al, old_al; + bool af, cf, old_cf; + + cf = ctxt->eflags & X86_EFLAGS_CF; + al = c->dst.val; + + old_al = al; + old_cf = cf; + cf = false; + af = ctxt->eflags & X86_EFLAGS_AF; + if ((al & 0x0f) > 9 || af) { + al -= 6; + cf = old_cf | (al >= 250); + af = true; + } else { + af = false; + } + if (old_al > 0x99 || old_cf) { + al -= 0x60; + cf = true; + } + + c->dst.val = al; + /* Set PF, ZF, SF */ + c->src.type = OP_IMM; + c->src.val = 0; + c->src.bytes = 1; + emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); + ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); + if (cf) + ctxt->eflags |= X86_EFLAGS_CF; + if (af) + ctxt->eflags |= X86_EFLAGS_AF; + return X86EMUL_CONTINUE; +} + +static int em_call_far(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + u16 sel, old_cs; + ulong old_eip; + int rc; + + old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); + old_eip = c->eip; + + memcpy(&sel, c->src.valptr + c->op_bytes, 2); + if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS)) + return X86EMUL_CONTINUE; + + c->eip = 0; + memcpy(&c->eip, c->src.valptr, c->op_bytes); + + c->src.val = old_cs; + emulate_push(ctxt, ctxt->ops); + rc = writeback(ctxt, ctxt->ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->src.val = old_eip; + emulate_push(ctxt, ctxt->ops); + rc = writeback(ctxt, ctxt->ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->dst.type = OP_NONE; + + return X86EMUL_CONTINUE; +} + +static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + c->dst.type = OP_REG; + c->dst.addr.reg = &c->eip; + c->dst.bytes = c->op_bytes; + rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val); + return X86EMUL_CONTINUE; +} + +static int em_imul(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_imul_3op(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.val = c->src2.val; + return em_imul(ctxt); +} + +static int em_cwd(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.type = OP_REG; + c->dst.bytes = c->src.bytes; + c->dst.addr.reg = &c->regs[VCPU_REGS_RDX]; + c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1); + + return X86EMUL_CONTINUE; +} + +static int em_rdtsc(struct x86_emulate_ctxt *ctxt) +{ + unsigned cpl = ctxt->ops->cpl(ctxt->vcpu); + struct decode_cache *c = &ctxt->decode; + u64 tsc = 0; + + if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) { + emulate_gp(ctxt, 0); + return X86EMUL_PROPAGATE_FAULT; + } + ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc); + c->regs[VCPU_REGS_RAX] = (u32)tsc; + c->regs[VCPU_REGS_RDX] = tsc >> 32; + return X86EMUL_CONTINUE; +} + +static int em_mov(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + c->dst.val = c->src.val; + return X86EMUL_CONTINUE; +} + +#define D(_y) { .flags = (_y) } +#define N D(0) +#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } +#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } +#define I(_f, _e) { .flags = (_f), .u.execute = (_e) } + +#define D2bv(_f) D((_f) | ByteOp), D(_f) +#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) + +#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \ + D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \ + D2bv(((_f) & ~Lock) | DstAcc | SrcImm) + + +static struct opcode group1[] = { + X7(D(Lock)), N +}; + +static struct opcode group1A[] = { + D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, +}; + +static struct opcode group3[] = { + D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), + D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), + X4(D(SrcMem | ModRM)), +}; + +static struct opcode group4[] = { + D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), + N, N, N, N, N, N, +}; + +static struct opcode group5[] = { + D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), + D(SrcMem | ModRM | Stack), + I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far), + D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), + D(SrcMem | ModRM | Stack), N, +}; + +static struct group_dual group7 = { { + N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), + D(SrcNone | ModRM | DstMem | Mov), N, + D(SrcMem16 | ModRM | Mov | Priv), + D(SrcMem | ModRM | ByteOp | Priv | NoAccess), +}, { + D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), + D(SrcNone | ModRM | DstMem | Mov), N, + D(SrcMem16 | ModRM | Mov | Priv), N, +} }; + +static struct opcode group8[] = { + N, N, N, N, + D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), + D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), +}; + +static struct group_dual group9 = { { + N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, +}, { + N, N, N, N, N, N, N, N, +} }; + +static struct opcode group11[] = { + I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)), +}; + +static struct opcode opcode_table[256] = { + /* 0x00 - 0x07 */ + D6ALU(Lock), + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + /* 0x08 - 0x0F */ + D6ALU(Lock), + D(ImplicitOps | Stack | No64), N, + /* 0x10 - 0x17 */ + D6ALU(Lock), + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + /* 0x18 - 0x1F */ + D6ALU(Lock), + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + /* 0x20 - 0x27 */ + D6ALU(Lock), N, N, + /* 0x28 - 0x2F */ + D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das), + /* 0x30 - 0x37 */ + D6ALU(Lock), N, N, + /* 0x38 - 0x3F */ + D6ALU(0), N, N, + /* 0x40 - 0x4F */ + X16(D(DstReg)), + /* 0x50 - 0x57 */ + X8(I(SrcReg | Stack, em_push)), + /* 0x58 - 0x5F */ + X8(D(DstReg | Stack)), + /* 0x60 - 0x67 */ + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , + N, N, N, N, + /* 0x68 - 0x6F */ + I(SrcImm | Mov | Stack, em_push), + I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), + I(SrcImmByte | Mov | Stack, em_push), + I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), + D2bv(DstDI | Mov | String), /* insb, insw/insd */ + D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ + /* 0x70 - 0x7F */ + X16(D(SrcImmByte)), + /* 0x80 - 0x87 */ + G(ByteOp | DstMem | SrcImm | ModRM | Group, group1), + G(DstMem | SrcImm | ModRM | Group, group1), + G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), + G(DstMem | SrcImmByte | ModRM | Group, group1), + D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock), + /* 0x88 - 0x8F */ + I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), + I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), + D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), + D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), + /* 0x90 - 0x97 */ + X8(D(SrcAcc | DstReg)), + /* 0x98 - 0x9F */ + D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), + I(SrcImmFAddr | No64, em_call_far), N, + D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, + /* 0xA0 - 0xA7 */ + I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), + I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov), + I2bv(SrcSI | DstDI | Mov | String, em_mov), + D2bv(SrcSI | DstDI | String), + /* 0xA8 - 0xAF */ + D2bv(DstAcc | SrcImm), + I2bv(SrcAcc | DstDI | Mov | String, em_mov), + I2bv(SrcSI | DstAcc | Mov | String, em_mov), + D2bv(SrcAcc | DstDI | String), + /* 0xB0 - 0xB7 */ + X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), + /* 0xB8 - 0xBF */ + X8(I(DstReg | SrcImm | Mov, em_mov)), + /* 0xC0 - 0xC7 */ + D2bv(DstMem | SrcImmByte | ModRM), + I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), + D(ImplicitOps | Stack), + D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), + G(ByteOp, group11), G(0, group11), + /* 0xC8 - 0xCF */ + N, N, N, D(ImplicitOps | Stack), + D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), + /* 0xD0 - 0xD7 */ + D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), + N, N, N, N, + /* 0xD8 - 0xDF */ + N, N, N, N, N, N, N, N, + /* 0xE0 - 0xE7 */ + X4(D(SrcImmByte)), + D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte), + /* 0xE8 - 0xEF */ + D(SrcImm | Stack), D(SrcImm | ImplicitOps), + D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), + D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps), + /* 0xF0 - 0xF7 */ + N, N, N, N, + D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), + /* 0xF8 - 0xFF */ + D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), + D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), +}; + +static struct opcode twobyte_table[256] = { + /* 0x00 - 0x0F */ + N, GD(0, &group7), N, N, + N, D(ImplicitOps), D(ImplicitOps | Priv), N, + D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, + N, D(ImplicitOps | ModRM), N, N, + /* 0x10 - 0x1F */ + N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, + /* 0x20 - 0x2F */ + D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264), + D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264), + N, N, N, N, + N, N, N, N, N, N, N, N, + /* 0x30 - 0x3F */ + D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc), + D(ImplicitOps | Priv), N, + D(ImplicitOps), D(ImplicitOps | Priv), N, N, + N, N, N, N, N, N, N, N, + /* 0x40 - 0x4F */ + X16(D(DstReg | SrcMem | ModRM | Mov)), + /* 0x50 - 0x5F */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0x60 - 0x6F */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0x70 - 0x7F */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0x80 - 0x8F */ + X16(D(SrcImm)), + /* 0x90 - 0x9F */ + X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), + /* 0xA0 - 0xA7 */ + D(ImplicitOps | Stack), D(ImplicitOps | Stack), + N, D(DstMem | SrcReg | ModRM | BitOp), + D(DstMem | SrcReg | Src2ImmByte | ModRM), + D(DstMem | SrcReg | Src2CL | ModRM), N, N, + /* 0xA8 - 0xAF */ + D(ImplicitOps | Stack), D(ImplicitOps | Stack), + N, D(DstMem | SrcReg | ModRM | BitOp | Lock), + D(DstMem | SrcReg | Src2ImmByte | ModRM), + D(DstMem | SrcReg | Src2CL | ModRM), + D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), + /* 0xB0 - 0xB7 */ + D2bv(DstMem | SrcReg | ModRM | Lock), + D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock), + D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM), + D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), + /* 0xB8 - 0xBF */ + N, N, + G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), + D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), + /* 0xC0 - 0xCF */ + D2bv(DstMem | SrcReg | ModRM | Lock), + N, D(DstMem | SrcReg | ModRM | Mov), + N, N, N, GD(0, &group9), + N, N, N, N, N, N, N, N, + /* 0xD0 - 0xDF */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0xE0 - 0xEF */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0xF0 - 0xFF */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N +}; + +#undef D +#undef N +#undef G +#undef GD +#undef I + +#undef D2bv +#undef I2bv +#undef D6ALU + +static unsigned imm_size(struct decode_cache *c) +{ + unsigned size; + + size = (c->d & ByteOp) ? 1 : c->op_bytes; + if (size == 8) + size = 4; + return size; +} + +static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, + unsigned size, bool sign_extension) +{ + struct decode_cache *c = &ctxt->decode; + struct x86_emulate_ops *ops = ctxt->ops; + int rc = X86EMUL_CONTINUE; + + op->type = OP_IMM; + op->bytes = size; + op->addr.mem = c->eip; + /* NB. Immediates are sign-extended as necessary. */ + switch (op->bytes) { + case 1: + op->val = insn_fetch(s8, 1, c->eip); + break; + case 2: + op->val = insn_fetch(s16, 2, c->eip); + break; + case 4: + op->val = insn_fetch(s32, 4, c->eip); + break; + } + if (!sign_extension) { + switch (op->bytes) { + case 1: + op->val &= 0xff; + break; + case 2: + op->val &= 0xffff; + break; + case 4: + op->val &= 0xffffffff; + break; + } + } +done: + return rc; } int -x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +x86_decode_insn(struct x86_emulate_ctxt *ctxt) { + struct x86_emulate_ops *ops = ctxt->ops; + struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; + int mode = ctxt->mode; + int def_op_bytes, def_ad_bytes, dual, goffset; + struct opcode opcode, *g_mod012, *g_mod3; + struct operand memop = { .type = OP_NONE }; + + c->eip = ctxt->eip; + c->fetch.start = c->fetch.end = c->eip; + ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); + + switch (mode) { + case X86EMUL_MODE_REAL: + case X86EMUL_MODE_VM86: + case X86EMUL_MODE_PROT16: + def_op_bytes = def_ad_bytes = 2; + break; + case X86EMUL_MODE_PROT32: + def_op_bytes = def_ad_bytes = 4; + break; +#ifdef CONFIG_X86_64 + case X86EMUL_MODE_PROT64: + def_op_bytes = 4; + def_ad_bytes = 8; + break; +#endif + default: + return -1; + } + + c->op_bytes = def_op_bytes; + c->ad_bytes = def_ad_bytes; + + /* Legacy prefixes. */ + for (;;) { + switch (c->b = insn_fetch(u8, 1, c->eip)) { + case 0x66: /* operand-size override */ + /* switch between 2/4 bytes */ + c->op_bytes = def_op_bytes ^ 6; + break; + case 0x67: /* address-size override */ + if (mode == X86EMUL_MODE_PROT64) + /* switch between 4/8 bytes */ + c->ad_bytes = def_ad_bytes ^ 12; + else + /* switch between 2/4 bytes */ + c->ad_bytes = def_ad_bytes ^ 6; + break; + case 0x26: /* ES override */ + case 0x2e: /* CS override */ + case 0x36: /* SS override */ + case 0x3e: /* DS override */ + set_seg_override(c, (c->b >> 3) & 3); + break; + case 0x64: /* FS override */ + case 0x65: /* GS override */ + set_seg_override(c, c->b & 7); + break; + case 0x40 ... 0x4f: /* REX */ + if (mode != X86EMUL_MODE_PROT64) + goto done_prefixes; + c->rex_prefix = c->b; + continue; + case 0xf0: /* LOCK */ + c->lock_prefix = 1; + break; + case 0xf2: /* REPNE/REPNZ */ + c->rep_prefix = REPNE_PREFIX; + break; + case 0xf3: /* REP/REPE/REPZ */ + c->rep_prefix = REPE_PREFIX; + break; + default: + goto done_prefixes; + } + + /* Any legacy prefix after a REX prefix nullifies its effect. */ + + c->rex_prefix = 0; + } + +done_prefixes: + + /* REX prefix. */ + if (c->rex_prefix & 8) + c->op_bytes = 8; /* REX.W */ + + /* Opcode byte(s). */ + opcode = opcode_table[c->b]; + /* Two-byte opcode? */ + if (c->b == 0x0f) { + c->twobyte = 1; + c->b = insn_fetch(u8, 1, c->eip); + opcode = twobyte_table[c->b]; + } + c->d = opcode.flags; + + if (c->d & Group) { + dual = c->d & GroupDual; + c->modrm = insn_fetch(u8, 1, c->eip); + --c->eip; + + if (c->d & GroupDual) { + g_mod012 = opcode.u.gdual->mod012; + g_mod3 = opcode.u.gdual->mod3; + } else + g_mod012 = g_mod3 = opcode.u.group; + + c->d &= ~(Group | GroupDual); + + goffset = (c->modrm >> 3) & 7; + + if ((c->modrm >> 6) == 3) + opcode = g_mod3[goffset]; + else + opcode = g_mod012[goffset]; + c->d |= opcode.flags; + } + + c->execute = opcode.u.execute; + + /* Unrecognised? */ + if (c->d == 0 || (c->d & Undefined)) { + DPRINTF("Cannot emulate %02x\n", c->b); + return -1; + } + + if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) + c->op_bytes = 8; + + if (c->d & Op3264) { + if (mode == X86EMUL_MODE_PROT64) + c->op_bytes = 8; + else + c->op_bytes = 4; + } + + /* ModRM and SIB bytes. */ + if (c->d & ModRM) { + rc = decode_modrm(ctxt, ops, &memop); + if (!c->has_seg_override) + set_seg_override(c, c->modrm_seg); + } else if (c->d & MemAbs) + rc = decode_abs(ctxt, ops, &memop); + if (rc != X86EMUL_CONTINUE) + goto done; + + if (!c->has_seg_override) + set_seg_override(c, VCPU_SREG_DS); + + if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d)) + memop.addr.mem += seg_override_base(ctxt, ops, c); + + if (memop.type == OP_MEM && c->ad_bytes != 8) + memop.addr.mem = (u32)memop.addr.mem; + + if (memop.type == OP_MEM && c->rip_relative) + memop.addr.mem += c->eip; + + /* + * Decode and fetch the source operand: register, memory + * or immediate. + */ + switch (c->d & SrcMask) { + case SrcNone: + break; + case SrcReg: + decode_register_operand(&c->src, c, 0); + break; + case SrcMem16: + memop.bytes = 2; + goto srcmem_common; + case SrcMem32: + memop.bytes = 4; + goto srcmem_common; + case SrcMem: + memop.bytes = (c->d & ByteOp) ? 1 : + c->op_bytes; + srcmem_common: + c->src = memop; + break; + case SrcImmU16: + rc = decode_imm(ctxt, &c->src, 2, false); + break; + case SrcImm: + rc = decode_imm(ctxt, &c->src, imm_size(c), true); + break; + case SrcImmU: + rc = decode_imm(ctxt, &c->src, imm_size(c), false); + break; + case SrcImmByte: + rc = decode_imm(ctxt, &c->src, 1, true); + break; + case SrcImmUByte: + rc = decode_imm(ctxt, &c->src, 1, false); + break; + case SrcAcc: + c->src.type = OP_REG; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->src.addr.reg = &c->regs[VCPU_REGS_RAX]; + fetch_register_operand(&c->src); + break; + case SrcOne: + c->src.bytes = 1; + c->src.val = 1; + break; + case SrcSI: + c->src.type = OP_MEM; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->src.addr.mem = + register_address(c, seg_override_base(ctxt, ops, c), + c->regs[VCPU_REGS_RSI]); + c->src.val = 0; + break; + case SrcImmFAddr: + c->src.type = OP_IMM; + c->src.addr.mem = c->eip; + c->src.bytes = c->op_bytes + 2; + insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); + break; + case SrcMemFAddr: + memop.bytes = c->op_bytes + 2; + goto srcmem_common; + break; + } + + if (rc != X86EMUL_CONTINUE) + goto done; + + /* + * Decode and fetch the second source operand: register, memory + * or immediate. + */ + switch (c->d & Src2Mask) { + case Src2None: + break; + case Src2CL: + c->src2.bytes = 1; + c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; + break; + case Src2ImmByte: + rc = decode_imm(ctxt, &c->src2, 1, true); + break; + case Src2One: + c->src2.bytes = 1; + c->src2.val = 1; + break; + case Src2Imm: + rc = decode_imm(ctxt, &c->src2, imm_size(c), true); + break; + } + + if (rc != X86EMUL_CONTINUE) + goto done; + + /* Decode and fetch the destination operand: register or memory. */ + switch (c->d & DstMask) { + case DstReg: + decode_register_operand(&c->dst, c, + c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); + break; + case DstImmUByte: + c->dst.type = OP_IMM; + c->dst.addr.mem = c->eip; + c->dst.bytes = 1; + c->dst.val = insn_fetch(u8, 1, c->eip); + break; + case DstMem: + case DstMem64: + c->dst = memop; + if ((c->d & DstMask) == DstMem64) + c->dst.bytes = 8; + else + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + if (c->d & BitOp) + fetch_bit_operand(c); + c->dst.orig_val = c->dst.val; + break; + case DstAcc: + c->dst.type = OP_REG; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.addr.reg = &c->regs[VCPU_REGS_RAX]; + fetch_register_operand(&c->dst); + c->dst.orig_val = c->dst.val; + break; + case DstDI: + c->dst.type = OP_MEM; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.addr.mem = + register_address(c, es_base(ctxt, ops), + c->regs[VCPU_REGS_RDI]); + c->dst.val = 0; + break; + case ImplicitOps: + /* Special instructions do their own operand decoding. */ + default: + c->dst.type = OP_NONE; /* Disable writeback. */ + return 0; + } + +done: + return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; +} + +static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + /* The second termination condition only applies for REPE + * and REPNE. Test if the repeat string operation prefix is + * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the + * corresponding termination condition according to: + * - if REPE/REPZ and ZF = 0 then done + * - if REPNE/REPNZ and ZF = 1 then done + */ + if (((c->b == 0xa6) || (c->b == 0xa7) || + (c->b == 0xae) || (c->b == 0xaf)) + && (((c->rep_prefix == REPE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == 0)) + || ((c->rep_prefix == REPNE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) + return true; + + return false; +} + +int +x86_emulate_insn(struct x86_emulate_ctxt *ctxt) +{ + struct x86_emulate_ops *ops = ctxt->ops; u64 msr_data; struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; int saved_dst_type = c->dst.type; + int irq; /* Used for int 3, int, and into */ ctxt->decode.mem_read.pos = 0; @@ -2576,6 +3030,11 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) goto done; } + if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { + emulate_ud(ctxt); + goto done; + } + /* Privileged instruction can be executed only in CPL=0 */ if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { emulate_gp(ctxt, 0); @@ -2583,35 +3042,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) } if (c->rep_prefix && (c->d & String)) { - ctxt->restart = true; /* All REP prefixes have the same first termination condition */ if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { - string_done: - ctxt->restart = false; ctxt->eip = c->eip; goto done; } - /* The second termination condition only applies for REPE - * and REPNE. Test if the repeat string operation prefix is - * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the - * corresponding termination condition according to: - * - if REPE/REPZ and ZF = 0 then done - * - if REPNE/REPNZ and ZF = 1 then done - */ - if ((c->b == 0xa6) || (c->b == 0xa7) || - (c->b == 0xae) || (c->b == 0xaf)) { - if ((c->rep_prefix == REPE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == 0)) - goto string_done; - if ((c->rep_prefix == REPNE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) - goto string_done; - } - c->eip = ctxt->eip; } - if (c->src.type == OP_MEM) { - rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr, + if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { + rc = read_emulated(ctxt, ops, c->src.addr.mem, c->src.valptr, c->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; @@ -2619,7 +3058,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) } if (c->src2.type == OP_MEM) { - rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr, + rc = read_emulated(ctxt, ops, c->src2.addr.mem, &c->src2.val, c->src2.bytes); if (rc != X86EMUL_CONTINUE) goto done; @@ -2631,7 +3070,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { /* optimisation - avoid slow emulated read if Mov */ - rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr, + rc = read_emulated(ctxt, ops, c->dst.addr.mem, &c->dst.val, c->dst.bytes); if (rc != X86EMUL_CONTINUE) goto done; @@ -2640,6 +3079,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) special_insn: + if (c->execute) { + rc = c->execute(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + goto writeback; + } + if (c->twobyte) goto twobyte_insn; @@ -2653,8 +3099,6 @@ special_insn: break; case 0x07: /* pop es */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x08 ... 0x0d: or: /* or */ @@ -2672,8 +3116,6 @@ special_insn: break; case 0x17: /* pop ss */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x18 ... 0x1d: sbb: /* sbb */ @@ -2684,8 +3126,6 @@ special_insn: break; case 0x1f: /* pop ds */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x20 ... 0x25: and: /* and */ @@ -2709,58 +3149,29 @@ special_insn: case 0x48 ... 0x4f: /* dec r16/r32 */ emulate_1op("dec", c->dst, ctxt->eflags); break; - case 0x50 ... 0x57: /* push reg */ - emulate_push(ctxt, ops); - break; case 0x58 ... 0x5f: /* pop reg */ pop_instruction: rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x60: /* pusha */ rc = emulate_pusha(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x61: /* popa */ rc = emulate_popa(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) goto cannot_emulate; c->dst.val = (s32) c->src.val; break; - case 0x68: /* push imm */ - case 0x6a: /* push imm8 */ - emulate_push(ctxt, ops); - break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ - c->dst.bytes = min(c->dst.bytes, 4u); - if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], - c->dst.bytes)) { - emulate_gp(ctxt, 0); - goto done; - } - if (!pio_in_emulated(ctxt, ops, c->dst.bytes, - c->regs[VCPU_REGS_RDX], &c->dst.val)) - goto done; /* IO is needed, skip writeback */ - break; + c->src.val = c->regs[VCPU_REGS_RDX]; + goto do_io_in; case 0x6e: /* outsb */ case 0x6f: /* outsw/outsd */ - c->src.bytes = min(c->src.bytes, 4u); - if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], - c->src.bytes)) { - emulate_gp(ctxt, 0); - goto done; - } - ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX], - &c->src.val, 1, ctxt->vcpu); - - c->dst.type = OP_NONE; /* nothing to writeback */ + c->dst.val = c->regs[VCPU_REGS_RDX]; + goto do_io_out; break; case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(c->b, ctxt->eflags)) @@ -2793,29 +3204,15 @@ special_insn: case 0x86 ... 0x87: /* xchg */ xchg: /* Write back the register source. */ - switch (c->dst.bytes) { - case 1: - *(u8 *) c->src.ptr = (u8) c->dst.val; - break; - case 2: - *(u16 *) c->src.ptr = (u16) c->dst.val; - break; - case 4: - *c->src.ptr = (u32) c->dst.val; - break; /* 64b reg: zero-extend */ - case 8: - *c->src.ptr = c->dst.val; - break; - } + c->src.val = c->dst.val; + write_register_operand(&c->src); /* * Write back the memory destination with implicit LOCK * prefix. */ - c->dst.val = c->src.val; + c->dst.val = c->src.orig_val; c->lock_prefix = 1; break; - case 0x88 ... 0x8b: /* mov */ - goto mov; case 0x8c: /* mov r/m, sreg */ if (c->modrm_reg > VCPU_SREG_GS) { emulate_ud(ctxt); @@ -2824,7 +3221,7 @@ special_insn: c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); break; case 0x8d: /* lea r16/r32, m */ - c->dst.val = c->modrm_ea; + c->dst.val = c->src.addr.mem; break; case 0x8e: { /* mov seg, r/m16 */ uint16_t sel; @@ -2847,76 +3244,87 @@ special_insn: } case 0x8f: /* pop (sole member of Grp1a) */ rc = emulate_grp1a(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; - case 0x90: /* nop / xchg r8,rax */ - if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) { - c->dst.type = OP_NONE; /* nop */ + case 0x90 ... 0x97: /* nop / xchg reg, rax */ + if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) break; - } - case 0x91 ... 0x97: /* xchg reg,rax */ - c->src.type = OP_REG; - c->src.bytes = c->op_bytes; - c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; - c->src.val = *(c->src.ptr); goto xchg; + case 0x98: /* cbw/cwde/cdqe */ + switch (c->op_bytes) { + case 2: c->dst.val = (s8)c->dst.val; break; + case 4: c->dst.val = (s16)c->dst.val; break; + case 8: c->dst.val = (s32)c->dst.val; break; + } + break; case 0x9c: /* pushf */ c->src.val = (unsigned long) ctxt->eflags; emulate_push(ctxt, ops); break; case 0x9d: /* popf */ c->dst.type = OP_REG; - c->dst.ptr = (unsigned long *) &ctxt->eflags; + c->dst.addr.reg = &ctxt->eflags; c->dst.bytes = c->op_bytes; rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); - if (rc != X86EMUL_CONTINUE) - goto done; break; - case 0xa0 ... 0xa3: /* mov */ - case 0xa4 ... 0xa5: /* movs */ - goto mov; case 0xa6 ... 0xa7: /* cmps */ c->dst.type = OP_NONE; /* Disable writeback. */ - DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); + DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem); goto cmp; case 0xa8 ... 0xa9: /* test ax, imm */ goto test; - case 0xaa ... 0xab: /* stos */ - c->dst.val = c->regs[VCPU_REGS_RAX]; - break; - case 0xac ... 0xad: /* lods */ - goto mov; case 0xae ... 0xaf: /* scas */ - DPRINTF("Urk! I don't handle SCAS.\n"); - goto cannot_emulate; - case 0xb0 ... 0xbf: /* mov r, imm */ - goto mov; + goto cmp; case 0xc0 ... 0xc1: emulate_grp2(ctxt); break; case 0xc3: /* ret */ c->dst.type = OP_REG; - c->dst.ptr = &c->eip; + c->dst.addr.reg = &c->eip; c->dst.bytes = c->op_bytes; goto pop_instruction; - case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ - mov: - c->dst.val = c->src.val; + case 0xc4: /* les */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); + break; + case 0xc5: /* lds */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS); break; case 0xcb: /* ret far */ rc = emulate_ret_far(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; + break; + case 0xcc: /* int3 */ + irq = 3; + goto do_interrupt; + case 0xcd: /* int n */ + irq = c->src.val; + do_interrupt: + rc = emulate_int(ctxt, ops, irq); + break; + case 0xce: /* into */ + if (ctxt->eflags & EFLG_OF) { + irq = 4; + goto do_interrupt; + } + break; + case 0xcf: /* iret */ + rc = emulate_iret(ctxt, ops); break; case 0xd0 ... 0xd1: /* Grp2 */ - c->src.val = 1; emulate_grp2(ctxt); break; case 0xd2 ... 0xd3: /* Grp2 */ c->src.val = c->regs[VCPU_REGS_RCX]; emulate_grp2(ctxt); break; + case 0xe0 ... 0xe2: /* loop/loopz/loopnz */ + register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); + if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 && + (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags))) + jmp_rel(c, c->src.val); + break; + case 0xe3: /* jcxz/jecxz/jrcxz */ + if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) + jmp_rel(c, c->src.val); + break; case 0xe4: /* inb */ case 0xe5: /* in */ goto do_io_in; @@ -2964,15 +3372,16 @@ special_insn: break; case 0xee: /* out dx,al */ case 0xef: /* out dx,(e/r)ax */ - c->src.val = c->regs[VCPU_REGS_RDX]; + c->dst.val = c->regs[VCPU_REGS_RDX]; do_io_out: - c->dst.bytes = min(c->dst.bytes, 4u); - if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { + c->src.bytes = min(c->src.bytes, 4u); + if (!emulator_io_permited(ctxt, ops, c->dst.val, + c->src.bytes)) { emulate_gp(ctxt, 0); goto done; } - ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, - ctxt->vcpu); + ops->pio_out_emulated(c->src.bytes, c->dst.val, + &c->src.val, 1, ctxt->vcpu); c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf4: /* hlt */ @@ -2981,24 +3390,22 @@ special_insn: case 0xf5: /* cmc */ /* complement carry flag from eflags reg */ ctxt->eflags ^= EFLG_CF; - c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf6 ... 0xf7: /* Grp3 */ - if (!emulate_grp3(ctxt, ops)) - goto cannot_emulate; + rc = emulate_grp3(ctxt, ops); break; case 0xf8: /* clc */ ctxt->eflags &= ~EFLG_CF; - c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xf9: /* stc */ + ctxt->eflags |= EFLG_CF; break; case 0xfa: /* cli */ if (emulator_bad_iopl(ctxt, ops)) { emulate_gp(ctxt, 0); goto done; - } else { + } else ctxt->eflags &= ~X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ - } break; case 0xfb: /* sti */ if (emulator_bad_iopl(ctxt, ops)) { @@ -3007,29 +3414,29 @@ special_insn: } else { ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; ctxt->eflags |= X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ } break; case 0xfc: /* cld */ ctxt->eflags &= ~EFLG_DF; - c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfd: /* std */ ctxt->eflags |= EFLG_DF; - c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfe: /* Grp4 */ grp45: rc = emulate_grp45(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xff: /* Grp5 */ if (c->modrm_reg == 5) goto jump_far; goto grp45; + default: + goto cannot_emulate; } + if (rc != X86EMUL_CONTINUE) + goto done; + writeback: rc = writeback(ctxt, ops); if (rc != X86EMUL_CONTINUE) @@ -3050,25 +3457,32 @@ writeback: &c->dst); if (c->rep_prefix && (c->d & String)) { - struct read_cache *rc = &ctxt->decode.io_read; + struct read_cache *r = &ctxt->decode.io_read; register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); - /* - * Re-enter guest when pio read ahead buffer is empty or, - * if it is not used, after each 1024 iteration. - */ - if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) || - (rc->end != 0 && rc->end == rc->pos)) - ctxt->restart = false; + + if (!string_insn_completed(ctxt)) { + /* + * Re-enter guest when pio read ahead buffer is empty + * or, if it is not used, after each 1024 iteration. + */ + if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) && + (r->end == 0 || r->end != r->pos)) { + /* + * Reset read cache. Usually happens before + * decode, but since instruction is restarted + * we have to do it here. + */ + ctxt->decode.mem_read.end = 0; + return EMULATION_RESTART; + } + goto done; /* skip rip writeback */ + } } - /* - * reset read cache here in case string instruction is restared - * without decoding - */ - ctxt->decode.mem_read.end = 0; + ctxt->eip = c->eip; done: - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; + return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; twobyte_insn: switch (c->b) { @@ -3091,7 +3505,7 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 2: /* lgdt */ - rc = read_descriptor(ctxt, ops, c->src.ptr, + rc = read_descriptor(ctxt, ops, c->src.addr.mem, &size, &address, c->op_bytes); if (rc != X86EMUL_CONTINUE) goto done; @@ -3104,14 +3518,12 @@ twobyte_insn: switch (c->modrm_rm) { case 1: rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc != X86EMUL_CONTINUE) - goto done; break; default: goto cannot_emulate; } } else { - rc = read_descriptor(ctxt, ops, c->src.ptr, + rc = read_descriptor(ctxt, ops, c->src.addr.mem, &size, &address, c->op_bytes); if (rc != X86EMUL_CONTINUE) @@ -3126,7 +3538,7 @@ twobyte_insn: c->dst.val = ops->get_cr(0, ctxt->vcpu); break; case 6: /* lmsw */ - ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) | + ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) | (c->src.val & 0x0f), ctxt->vcpu); c->dst.type = OP_NONE; break; @@ -3134,7 +3546,7 @@ twobyte_insn: emulate_ud(ctxt); goto done; case 7: /* invlpg*/ - emulate_invlpg(ctxt->vcpu, c->modrm_ea); + emulate_invlpg(ctxt->vcpu, c->src.addr.mem); /* Disable writeback. */ c->dst.type = OP_NONE; break; @@ -3144,23 +3556,16 @@ twobyte_insn: break; case 0x05: /* syscall */ rc = emulate_syscall(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; - else - goto writeback; break; case 0x06: emulate_clts(ctxt->vcpu); - c->dst.type = OP_NONE; break; case 0x09: /* wbinvd */ kvm_emulate_wbinvd(ctxt->vcpu); - c->dst.type = OP_NONE; break; case 0x08: /* invd */ case 0x0d: /* GrpP (prefetch) */ case 0x18: /* Grp16 (prefetch/nop) */ - c->dst.type = OP_NONE; break; case 0x20: /* mov cr, reg */ switch (c->modrm_reg) { @@ -3170,8 +3575,7 @@ twobyte_insn: emulate_ud(ctxt); goto done; } - c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); - c->dst.type = OP_NONE; /* no writeback */ + c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); break; case 0x21: /* mov from dr to reg */ if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && @@ -3179,11 +3583,10 @@ twobyte_insn: emulate_ud(ctxt); goto done; } - ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu); - c->dst.type = OP_NONE; /* no writeback */ + ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu); break; case 0x22: /* mov reg, cr */ - if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) { + if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { emulate_gp(ctxt, 0); goto done; } @@ -3196,7 +3599,7 @@ twobyte_insn: goto done; } - if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] & + if (ops->set_dr(c->modrm_reg, c->src.val & ((ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U), ctxt->vcpu) < 0) { /* #UD condition is already handled by the code above */ @@ -3215,7 +3618,6 @@ twobyte_insn: goto done; } rc = X86EMUL_CONTINUE; - c->dst.type = OP_NONE; break; case 0x32: /* rdmsr */ @@ -3227,21 +3629,12 @@ twobyte_insn: c->regs[VCPU_REGS_RDX] = msr_data >> 32; } rc = X86EMUL_CONTINUE; - c->dst.type = OP_NONE; break; case 0x34: /* sysenter */ rc = emulate_sysenter(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; - else - goto writeback; break; case 0x35: /* sysexit */ rc = emulate_sysexit(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; - else - goto writeback; break; case 0x40 ... 0x4f: /* cmov */ c->dst.val = c->dst.orig_val = c->src.val; @@ -3251,15 +3644,15 @@ twobyte_insn: case 0x80 ... 0x8f: /* jnz rel, etc*/ if (test_cc(c->b, ctxt->eflags)) jmp_rel(c, c->src.val); - c->dst.type = OP_NONE; + break; + case 0x90 ... 0x9f: /* setcc r/m8 */ + c->dst.val = test_cc(c->b, ctxt->eflags); break; case 0xa0: /* push fs */ emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); break; case 0xa1: /* pop fs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xa3: bt: /* bt */ @@ -3277,13 +3670,9 @@ twobyte_insn: break; case 0xa9: /* pop gs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xab: bts: /* bts */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); break; case 0xac: /* shrd imm8, r, r/m */ @@ -3306,15 +3695,22 @@ twobyte_insn: } else { /* Failure: write the value we saw to EAX. */ c->dst.type = OP_REG; - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; + c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX]; } break; + case 0xb2: /* lss */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS); + break; case 0xb3: btr: /* btr */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); break; + case 0xb4: /* lfs */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS); + break; + case 0xb5: /* lgs */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS); + break; case 0xb6 ... 0xb7: /* movzx */ c->dst.bytes = c->op_bytes; c->dst.val = (c->d & ByteOp) ? (u8) c->src.val @@ -3334,15 +3730,43 @@ twobyte_insn: break; case 0xbb: btc: /* btc */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); break; + case 0xbc: { /* bsf */ + u8 zf; + __asm__ ("bsf %2, %0; setz %1" + : "=r"(c->dst.val), "=q"(zf) + : "r"(c->src.val)); + ctxt->eflags &= ~X86_EFLAGS_ZF; + if (zf) { + ctxt->eflags |= X86_EFLAGS_ZF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } + break; + } + case 0xbd: { /* bsr */ + u8 zf; + __asm__ ("bsr %2, %0; setz %1" + : "=r"(c->dst.val), "=q"(zf) + : "r"(c->src.val)); + ctxt->eflags &= ~X86_EFLAGS_ZF; + if (zf) { + ctxt->eflags |= X86_EFLAGS_ZF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } + break; + } case 0xbe ... 0xbf: /* movsx */ c->dst.bytes = c->op_bytes; c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : (s16) c->src.val; break; + case 0xc0 ... 0xc1: /* xadd */ + emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); + /* Write back the register source. */ + c->src.val = c->dst.orig_val; + write_register_operand(&c->src); + break; case 0xc3: /* movnti */ c->dst.bytes = c->op_bytes; c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : @@ -3350,10 +3774,14 @@ twobyte_insn: break; case 0xc7: /* Grp9 (cmpxchg8b) */ rc = emulate_grp9(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; + default: + goto cannot_emulate; } + + if (rc != X86EMUL_CONTINUE) + goto done; + goto writeback; cannot_emulate: diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index ddeb231..efad723 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -5,7 +5,7 @@ * Copyright (c) 2006 Intel Corporation * Copyright (c) 2007 Keir Fraser, XenSource Inc * Copyright (c) 2008 Intel Corporation - * Copyright 2009 Red Hat, Inc. and/or its affilates. + * Copyright 2009 Red Hat, Inc. and/or its affiliates. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -232,15 +232,6 @@ static void pit_latch_status(struct kvm *kvm, int channel) } } -int pit_has_pending_timer(struct kvm_vcpu *vcpu) -{ - struct kvm_pit *pit = vcpu->kvm->arch.vpit; - - if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack) - return atomic_read(&pit->pit_state.pit_timer.pending); - return 0; -} - static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 4b7b73c..f628234 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -3,7 +3,7 @@ * * Copyright (c) 2003-2004 Fabrice Bellard * Copyright (c) 2007 Intel Corporation - * Copyright 2009 Red Hat, Inc. and/or its affilates. + * Copyright 2009 Red Hat, Inc. and/or its affiliates. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -39,7 +39,7 @@ static void pic_irq_request(struct kvm *kvm, int level); static void pic_lock(struct kvm_pic *s) __acquires(&s->lock) { - raw_spin_lock(&s->lock); + spin_lock(&s->lock); } static void pic_unlock(struct kvm_pic *s) @@ -51,7 +51,7 @@ static void pic_unlock(struct kvm_pic *s) s->wakeup_needed = false; - raw_spin_unlock(&s->lock); + spin_unlock(&s->lock); if (wakeup) { kvm_for_each_vcpu(i, vcpu, s->kvm) { @@ -67,6 +67,7 @@ static void pic_unlock(struct kvm_pic *s) if (!found) return; + kvm_make_request(KVM_REQ_EVENT, found); kvm_vcpu_kick(found); } } @@ -308,13 +309,17 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) addr &= 1; if (addr == 0) { if (val & 0x10) { - kvm_pic_reset(s); /* init */ - /* - * deassert a pending interrupt - */ - pic_irq_request(s->pics_state->kvm, 0); - s->init_state = 1; s->init4 = val & 1; + s->last_irr = 0; + s->imr = 0; + s->priority_add = 0; + s->special_mask = 0; + s->read_reg_select = 0; + if (!s->init4) { + s->special_fully_nested_mode = 0; + s->auto_eoi = 0; + } + s->init_state = 1; if (val & 0x02) printk(KERN_ERR "single mode not supported"); if (val & 0x08) @@ -564,7 +569,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) return NULL; - raw_spin_lock_init(&s->lock); + spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 2095a04..7e06ba1 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -1,7 +1,7 @@ /* * irq.c: API for in kernel interrupt controller * Copyright (c) 2007, Intel Corporation. - * Copyright 2009 Red Hat, Inc. and/or its affilates. + * Copyright 2009 Red Hat, Inc. and/or its affiliates. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -33,12 +33,7 @@ */ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { - int ret; - - ret = pit_has_pending_timer(vcpu); - ret |= apic_has_pending_timer(vcpu); - - return ret; + return apic_has_pending_timer(vcpu); } EXPORT_SYMBOL(kvm_cpu_has_pending_timer); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 63c3145..ba910d1 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -60,7 +60,7 @@ struct kvm_kpic_state { }; struct kvm_pic { - raw_spinlock_t lock; + spinlock_t lock; bool wakeup_needed; unsigned pending_acks; struct kvm *kvm; diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 6491ac8..975bb45 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -42,7 +42,14 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) (unsigned long *)&vcpu->arch.regs_avail)) kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); - return vcpu->arch.pdptrs[index]; + return vcpu->arch.walk_mmu->pdptrs[index]; +} + +static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index) +{ + load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu)); + + return mmu->pdptrs[index]; } static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 22b06f7..413f897 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -5,7 +5,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2007 Novell * Copyright (C) 2007 Intel - * Copyright 2009 Red Hat, Inc. and/or its affilates. + * Copyright 2009 Red Hat, Inc. and/or its affiliates. * * Authors: * Dor Laor <dor.laor@qumranet.com> @@ -259,9 +259,10 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic) static void apic_update_ppr(struct kvm_lapic *apic) { - u32 tpr, isrv, ppr; + u32 tpr, isrv, ppr, old_ppr; int isr; + old_ppr = apic_get_reg(apic, APIC_PROCPRI); tpr = apic_get_reg(apic, APIC_TASKPRI); isr = apic_find_highest_isr(apic); isrv = (isr != -1) ? isr : 0; @@ -274,7 +275,10 @@ static void apic_update_ppr(struct kvm_lapic *apic) apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", apic, ppr, isr, isrv); - apic_set_reg(apic, APIC_PROCPRI, ppr); + if (old_ppr != ppr) { + apic_set_reg(apic, APIC_PROCPRI, ppr); + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); + } } static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) @@ -391,6 +395,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; } + kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); break; @@ -416,6 +421,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, "INIT on a runnable vcpu %d\n", vcpu->vcpu_id); vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } else { apic_debug("Ignoring de-assert INIT to vcpu %d\n", @@ -430,6 +436,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, result = 1; vcpu->arch.sipi_vector = vector; vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; + kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } break; @@ -475,6 +482,7 @@ static void apic_set_eoi(struct kvm_lapic *apic) trigger_mode = IOAPIC_EDGE_TRIG; if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } static void apic_send_ipi(struct kvm_lapic *apic) @@ -1151,6 +1159,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) update_divide_count(apic); start_apic_timer(apic); apic->irr_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 311f6da..908ea54 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -7,7 +7,7 @@ * MMU support * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -49,15 +49,25 @@ */ bool tdp_enabled = false; -#undef MMU_DEBUG +enum { + AUDIT_PRE_PAGE_FAULT, + AUDIT_POST_PAGE_FAULT, + AUDIT_PRE_PTE_WRITE, + AUDIT_POST_PTE_WRITE, + AUDIT_PRE_SYNC, + AUDIT_POST_SYNC +}; -#undef AUDIT +char *audit_point_name[] = { + "pre page fault", + "post page fault", + "pre pte write", + "post pte write", + "pre sync", + "post sync" +}; -#ifdef AUDIT -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); -#else -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} -#endif +#undef MMU_DEBUG #ifdef MMU_DEBUG @@ -71,7 +81,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} #endif -#if defined(MMU_DEBUG) || defined(AUDIT) +#ifdef MMU_DEBUG static int dbg = 0; module_param(dbg, bool, 0644); #endif @@ -89,6 +99,8 @@ module_param(oos_shadow, bool, 0644); } #endif +#define PTE_PREFETCH_NUM 8 + #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 @@ -178,6 +190,7 @@ typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; static struct kmem_cache *mmu_page_header_cache; +static struct percpu_counter kvm_total_used_mmu_pages; static u64 __read_mostly shadow_trap_nonpresent_pte; static u64 __read_mostly shadow_notrap_nonpresent_pte; @@ -299,18 +312,50 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte) #endif } +static bool spte_has_volatile_bits(u64 spte) +{ + if (!shadow_accessed_mask) + return false; + + if (!is_shadow_present_pte(spte)) + return false; + + if ((spte & shadow_accessed_mask) && + (!is_writable_pte(spte) || (spte & shadow_dirty_mask))) + return false; + + return true; +} + +static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) +{ + return (old_spte & bit_mask) && !(new_spte & bit_mask); +} + static void update_spte(u64 *sptep, u64 new_spte) { - u64 old_spte; + u64 mask, old_spte = *sptep; + + WARN_ON(!is_rmap_spte(new_spte)); + + new_spte |= old_spte & shadow_dirty_mask; - if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || - !is_rmap_spte(*sptep)) + mask = shadow_accessed_mask; + if (is_writable_pte(old_spte)) + mask |= shadow_dirty_mask; + + if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) __set_spte(sptep, new_spte); - else { + else old_spte = __xchg_spte(sptep, new_spte); - if (old_spte & shadow_accessed_mask) - mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); - } + + if (!shadow_accessed_mask) + return; + + if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) + kvm_set_pfn_accessed(spte_to_pfn(old_spte)); + if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) + kvm_set_pfn_dirty(spte_to_pfn(old_spte)); } static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, @@ -367,7 +412,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) if (r) goto out; r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, - rmap_desc_cache, 4); + rmap_desc_cache, 4 + PTE_PREFETCH_NUM); if (r) goto out; r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); @@ -591,6 +636,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) desc->sptes[0] = (u64 *)*rmapp; desc->sptes[1] = spte; *rmapp = (unsigned long)desc | 1; + ++count; } else { rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); @@ -603,7 +649,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) desc = desc->more; } for (i = 0; desc->sptes[i]; ++i) - ; + ++count; desc->sptes[i] = spte; } return count; @@ -645,18 +691,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); if (!*rmapp) { - printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); + printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte); BUG(); } else if (!(*rmapp & 1)) { - rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); + rmap_printk("rmap_remove: %p 1->0\n", spte); if ((u64 *)*rmapp != spte) { - printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", - spte, *spte); + printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte); BUG(); } *rmapp = 0; } else { - rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); + rmap_printk("rmap_remove: %p many->many\n", spte); desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); prev_desc = NULL; while (desc) { @@ -670,7 +715,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) prev_desc = desc; desc = desc->more; } - pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); + pr_err("rmap_remove: %p many->many\n", spte); BUG(); } } @@ -680,18 +725,18 @@ static void set_spte_track_bits(u64 *sptep, u64 new_spte) pfn_t pfn; u64 old_spte = *sptep; - if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || - old_spte & shadow_accessed_mask) { + if (!spte_has_volatile_bits(old_spte)) __set_spte(sptep, new_spte); - } else + else old_spte = __xchg_spte(sptep, new_spte); if (!is_rmap_spte(old_spte)) return; + pfn = spte_to_pfn(old_spte); if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); - if (is_writable_pte(old_spte)) + if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) kvm_set_pfn_dirty(pfn); } @@ -746,13 +791,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) } spte = rmap_next(kvm, rmapp, spte); } - if (write_protected) { - pfn_t pfn; - - spte = rmap_next(kvm, rmapp, NULL); - pfn = spte_to_pfn(*spte); - kvm_set_pfn_dirty(pfn); - } /* check for huge page mappings */ for (i = PT_DIRECTORY_LEVEL; @@ -947,6 +985,18 @@ static int is_empty_shadow_page(u64 *spt) } #endif +/* + * This value is the sum of all of the kvm instances's + * kvm->arch.n_used_mmu_pages values. We need a global, + * aggregate version in order to make the slab shrinker + * faster + */ +static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) +{ + kvm->arch.n_used_mmu_pages += nr; + percpu_counter_add(&kvm_total_used_mmu_pages, nr); +} + static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) { ASSERT(is_empty_shadow_page(sp->spt)); @@ -956,7 +1006,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) if (!sp->role.direct) __free_page(virt_to_page(sp->gfns)); kmem_cache_free(mmu_page_header_cache, sp); - ++kvm->arch.n_free_mmu_pages; + kvm_mod_used_mmu_pages(kvm, -1); } static unsigned kvm_page_table_hashfn(gfn_t gfn) @@ -979,7 +1029,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); sp->multimapped = 0; sp->parent_pte = parent_pte; - --vcpu->kvm->arch.n_free_mmu_pages; + kvm_mod_used_mmu_pages(vcpu->kvm, +1); return sp; } @@ -1403,7 +1453,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (role.direct) role.cr4_pae = 0; role.access = access; - if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { + if (!vcpu->arch.mmu.direct_map + && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; @@ -1458,6 +1509,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, iterator->addr = addr; iterator->shadow_addr = vcpu->arch.mmu.root_hpa; iterator->level = vcpu->arch.mmu.shadow_root_level; + + if (iterator->level == PT64_ROOT_LEVEL && + vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL && + !vcpu->arch.mmu.direct_map) + --iterator->level; + if (iterator->level == PT32E_ROOT_LEVEL) { iterator->shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; @@ -1665,41 +1722,31 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, /* * Changing the number of mmu pages allocated to the vm - * Note: if kvm_nr_mmu_pages is too small, you will get dead lock + * Note: if goal_nr_mmu_pages is too small, you will get dead lock */ -void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) +void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) { - int used_pages; LIST_HEAD(invalid_list); - - used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; - used_pages = max(0, used_pages); - /* * If we set the number of mmu pages to be smaller be than the * number of actived pages , we must to free some mmu pages before we * change the value */ - if (used_pages > kvm_nr_mmu_pages) { - while (used_pages > kvm_nr_mmu_pages && + if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { + while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && !list_empty(&kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - used_pages -= kvm_mmu_prepare_zap_page(kvm, page, - &invalid_list); + kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); + kvm_mmu_commit_zap_page(kvm, &invalid_list); } - kvm_mmu_commit_zap_page(kvm, &invalid_list); - kvm_nr_mmu_pages = used_pages; - kvm->arch.n_free_mmu_pages = 0; + goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; } - else - kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages - - kvm->arch.n_alloc_mmu_pages; - kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; + kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; } static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) @@ -1709,11 +1756,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) LIST_HEAD(invalid_list); int r; - pgprintk("%s: looking for gfn %lx\n", __func__, gfn); + pgprintk("%s: looking for gfn %llx\n", __func__, gfn); r = 0; for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { - pgprintk("%s: gfn %lx role %x\n", __func__, gfn, + pgprintk("%s: gfn %llx role %x\n", __func__, gfn, sp->role.word); r = 1; kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); @@ -1729,7 +1776,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) LIST_HEAD(invalid_list); for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { - pgprintk("%s: zap %lx %x\n", + pgprintk("%s: zap %llx %x\n", __func__, gfn, sp->role.word); kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } @@ -1925,7 +1972,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * whether the guest actually used the pte (in order to detect * demand paging). */ - spte = shadow_base_present_pte | shadow_dirty_mask; + spte = shadow_base_present_pte; if (!speculative) spte |= shadow_accessed_mask; if (!dirty) @@ -1948,8 +1995,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= (u64)pfn << PAGE_SHIFT; if ((pte_access & ACC_WRITE_MASK) - || (!tdp_enabled && write_fault && !is_write_protection(vcpu) - && !user_fault)) { + || (!vcpu->arch.mmu.direct_map && write_fault + && !is_write_protection(vcpu) && !user_fault)) { if (level > PT_PAGE_TABLE_LEVEL && has_wrprotected_page(vcpu->kvm, gfn, level)) { @@ -1960,7 +2007,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= PT_WRITABLE_MASK; - if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK)) + if (!vcpu->arch.mmu.direct_map + && !(pte_access & ACC_WRITE_MASK)) spte &= ~PT_USER_MASK; /* @@ -1973,7 +2021,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, goto set_pte; if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { - pgprintk("%s: found shadow page for %lx, marking ro\n", + pgprintk("%s: found shadow page for %llx, marking ro\n", __func__, gfn); ret = 1; pte_access &= ~ACC_WRITE_MASK; @@ -1986,8 +2034,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, mark_page_dirty(vcpu->kvm, gfn); set_pte: - if (is_writable_pte(*sptep) && !is_writable_pte(spte)) - kvm_set_pfn_dirty(pfn); update_spte(sptep, spte); done: return ret; @@ -2004,7 +2050,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" - " user_fault %d gfn %lx\n", + " user_fault %d gfn %llx\n", __func__, *sptep, pt_access, write_fault, user_fault, gfn); @@ -2023,7 +2069,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, __set_spte(sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); } else if (pfn != spte_to_pfn(*sptep)) { - pgprintk("hfn old %lx new %lx\n", + pgprintk("hfn old %llx new %llx\n", spte_to_pfn(*sptep), pfn); drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); @@ -2040,7 +2086,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } pgprintk("%s: setting spte %llx\n", __func__, *sptep); - pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", + pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", is_large_pte(*sptep)? "2MB" : "4kB", *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, *sptep, sptep); @@ -2064,6 +2110,105 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } +static struct kvm_memory_slot * +pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) +{ + struct kvm_memory_slot *slot; + + slot = gfn_to_memslot(vcpu->kvm, gfn); + if (!slot || slot->flags & KVM_MEMSLOT_INVALID || + (no_dirty_log && slot->dirty_bitmap)) + slot = NULL; + + return slot; +} + +static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, + bool no_dirty_log) +{ + struct kvm_memory_slot *slot; + unsigned long hva; + + slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log); + if (!slot) { + get_page(bad_page); + return page_to_pfn(bad_page); + } + + hva = gfn_to_hva_memslot(slot, gfn); + + return hva_to_pfn_atomic(vcpu->kvm, hva); +} + +static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, + u64 *start, u64 *end) +{ + struct page *pages[PTE_PREFETCH_NUM]; + unsigned access = sp->role.access; + int i, ret; + gfn_t gfn; + + gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); + if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK)) + return -1; + + ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); + if (ret <= 0) + return -1; + + for (i = 0; i < ret; i++, gfn++, start++) + mmu_set_spte(vcpu, start, ACC_ALL, + access, 0, 0, 1, NULL, + sp->role.level, gfn, + page_to_pfn(pages[i]), true, true); + + return 0; +} + +static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *sptep) +{ + u64 *spte, *start = NULL; + int i; + + WARN_ON(!sp->role.direct); + + i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + spte = sp->spt + i; + + for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { + if (*spte != shadow_trap_nonpresent_pte || spte == sptep) { + if (!start) + continue; + if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) + break; + start = NULL; + } else if (!start) + start = spte; + } +} + +static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) +{ + struct kvm_mmu_page *sp; + + /* + * Since it's no accessed bit on EPT, it's no way to + * distinguish between actually accessed translations + * and prefetched, so disable pte prefetch if EPT is + * enabled. + */ + if (!shadow_accessed_mask) + return; + + sp = page_header(__pa(sptep)); + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + return; + + __direct_pte_prefetch(vcpu, sp, sptep); +} + static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, int level, gfn_t gfn, pfn_t pfn) { @@ -2077,6 +2222,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 0, write, 1, &pt_write, level, gfn, pfn, false, true); + direct_pte_prefetch(vcpu, iterator.sptep); ++vcpu->stat.pf_fixed; break; } @@ -2098,28 +2244,31 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, __set_spte(iterator.sptep, __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask); + | shadow_user_mask | shadow_x_mask + | shadow_accessed_mask); } } return pt_write; } -static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) +static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) { - char buf[1]; - void __user *hva; - int r; + siginfo_t info; + + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_MCEERR_AR; + info.si_addr = (void __user *)address; + info.si_addr_lsb = PAGE_SHIFT; - /* Touch the page, so send SIGBUS */ - hva = (void __user *)gfn_to_hva(kvm, gfn); - r = copy_from_user(buf, hva, 1); + send_sig_info(SIGBUS, &info, tsk); } static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) { kvm_release_pfn_clean(pfn); if (is_hwpoison_pfn(pfn)) { - kvm_send_hwpoison_signal(kvm, gfn); + kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current); return 0; } else if (is_fault_pfn(pfn)) return -EFAULT; @@ -2179,7 +2328,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; spin_lock(&vcpu->kvm->mmu_lock); - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && + (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || + vcpu->arch.mmu.direct_map)) { hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(root); @@ -2222,80 +2373,158 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) return ret; } -static int mmu_alloc_roots(struct kvm_vcpu *vcpu) +static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) { - int i; - gfn_t root_gfn; struct kvm_mmu_page *sp; - int direct = 0; - u64 pdptr; - - root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; + unsigned i; if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, + 1, ACC_ALL, NULL); + ++sp->root_count; + spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.root_hpa = __pa(sp->spt); + } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) { + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + ASSERT(!VALID_PAGE(root)); + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + sp = kvm_mmu_get_page(vcpu, i << 30, i << 30, + PT32_ROOT_LEVEL, 1, ACC_ALL, + NULL); + root = __pa(sp->spt); + ++sp->root_count; + spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; + } + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + } else + BUG(); + + return 0; +} + +static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *sp; + u64 pdptr, pm_mask; + gfn_t root_gfn; + int i; + + root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT; + + if (mmu_check_root(vcpu, root_gfn)) + return 1; + + /* + * Do we shadow a long mode page table? If so we need to + * write-protect the guests page table root. + */ + if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; ASSERT(!VALID_PAGE(root)); - if (mmu_check_root(vcpu, root_gfn)) - return 1; - if (tdp_enabled) { - direct = 1; - root_gfn = 0; - } + spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); - sp = kvm_mmu_get_page(vcpu, root_gfn, 0, - PT64_ROOT_LEVEL, direct, - ACC_ALL, NULL); + sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, + 0, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = root; return 0; } - direct = !is_paging(vcpu); + + /* + * We shadow a 32 bit page table. This may be a legacy 2-level + * or a PAE 3-level page table. In either case we need to be aware that + * the shadow page table may be a PAE or a long mode page table. + */ + pm_mask = PT_PRESENT_MASK; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) + pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; + for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; ASSERT(!VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { - pdptr = kvm_pdptr_read(vcpu, i); + pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i); if (!is_present_gpte(pdptr)) { vcpu->arch.mmu.pae_root[i] = 0; continue; } root_gfn = pdptr >> PAGE_SHIFT; - } else if (vcpu->arch.mmu.root_level == 0) - root_gfn = 0; - if (mmu_check_root(vcpu, root_gfn)) - return 1; - if (tdp_enabled) { - direct = 1; - root_gfn = i << 30; + if (mmu_check_root(vcpu, root_gfn)) + return 1; } spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, direct, + PT32_ROOT_LEVEL, 0, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; + vcpu->arch.mmu.pae_root[i] = root | pm_mask; } vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + + /* + * If we shadow a 32 bit page table with a long mode page + * table we enter this path. + */ + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + if (vcpu->arch.mmu.lm_root == NULL) { + /* + * The additional page necessary for this is only + * allocated on demand. + */ + + u64 *lm_root; + + lm_root = (void*)get_zeroed_page(GFP_KERNEL); + if (lm_root == NULL) + return 1; + + lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask; + + vcpu->arch.mmu.lm_root = lm_root; + } + + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root); + } + return 0; } +static int mmu_alloc_roots(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.mmu.direct_map) + return mmu_alloc_direct_roots(vcpu); + else + return mmu_alloc_shadow_roots(vcpu); +} + static void mmu_sync_roots(struct kvm_vcpu *vcpu) { int i; struct kvm_mmu_page *sp; + if (vcpu->arch.mmu.direct_map) + return; + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + + trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); + if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(root); mmu_sync_children(vcpu, sp); @@ -2310,6 +2539,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) mmu_sync_children(vcpu, sp); } } + trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); } void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) @@ -2327,6 +2557,14 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, return vaddr; } +static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, u32 *error) +{ + if (error) + *error = 0; + return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); +} + static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code) { @@ -2393,10 +2631,9 @@ static void nonpaging_free(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } -static int nonpaging_init_context(struct kvm_vcpu *vcpu) +static int nonpaging_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - struct kvm_mmu *context = &vcpu->arch.mmu; - context->new_cr3 = nonpaging_new_cr3; context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; @@ -2407,6 +2644,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; + context->direct_map = true; + context->nx = false; return 0; } @@ -2422,11 +2661,14 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } -static void inject_page_fault(struct kvm_vcpu *vcpu, - u64 addr, - u32 err_code) +static unsigned long get_cr3(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.cr3; +} + +static void inject_page_fault(struct kvm_vcpu *vcpu) { - kvm_inject_page_fault(vcpu, addr, err_code); + vcpu->arch.mmu.inject_page_fault(vcpu); } static void paging_free(struct kvm_vcpu *vcpu) @@ -2434,12 +2676,12 @@ static void paging_free(struct kvm_vcpu *vcpu) nonpaging_free(vcpu); } -static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) +static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) { int bit7; bit7 = (gpte >> 7) & 1; - return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; + return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; } #define PTTYPE 64 @@ -2450,13 +2692,14 @@ static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) #include "paging_tmpl.h" #undef PTTYPE -static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) +static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, + int level) { - struct kvm_mmu *context = &vcpu->arch.mmu; int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; - if (!is_nx(vcpu)) + if (!context->nx) exb_bit_rsvd = rsvd_bits(63, 63); switch (level) { case PT32_ROOT_LEVEL: @@ -2511,9 +2754,13 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) } } -static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) +static int paging64_init_context_common(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, + int level) { - struct kvm_mmu *context = &vcpu->arch.mmu; + context->nx = is_nx(vcpu); + + reset_rsvds_bits_mask(vcpu, context, level); ASSERT(is_pae(vcpu)); context->new_cr3 = paging_new_cr3; @@ -2526,20 +2773,23 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) context->root_level = level; context->shadow_root_level = level; context->root_hpa = INVALID_PAGE; + context->direct_map = false; return 0; } -static int paging64_init_context(struct kvm_vcpu *vcpu) +static int paging64_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); - return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); + return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); } -static int paging32_init_context(struct kvm_vcpu *vcpu) +static int paging32_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - struct kvm_mmu *context = &vcpu->arch.mmu; + context->nx = false; + + reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); - reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; @@ -2550,18 +2800,19 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; + context->direct_map = false; return 0; } -static int paging32E_init_context(struct kvm_vcpu *vcpu) +static int paging32E_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); - return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); + return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); } static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = &vcpu->arch.mmu; + struct kvm_mmu *context = vcpu->arch.walk_mmu; context->new_cr3 = nonpaging_new_cr3; context->page_fault = tdp_page_fault; @@ -2571,20 +2822,29 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->invlpg = nonpaging_invlpg; context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; + context->direct_map = true; + context->set_cr3 = kvm_x86_ops->set_tdp_cr3; + context->get_cr3 = get_cr3; + context->inject_page_fault = kvm_inject_page_fault; + context->nx = is_nx(vcpu); if (!is_paging(vcpu)) { + context->nx = false; context->gva_to_gpa = nonpaging_gva_to_gpa; context->root_level = 0; } else if (is_long_mode(vcpu)) { - reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); + context->nx = is_nx(vcpu); + reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT64_ROOT_LEVEL; } else if (is_pae(vcpu)) { - reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); + context->nx = is_nx(vcpu); + reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT32E_ROOT_LEVEL; } else { - reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); + context->nx = false; + reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); context->gva_to_gpa = paging32_gva_to_gpa; context->root_level = PT32_ROOT_LEVEL; } @@ -2592,33 +2852,83 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) return 0; } -static int init_kvm_softmmu(struct kvm_vcpu *vcpu) +int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { int r; - ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (!is_paging(vcpu)) - r = nonpaging_init_context(vcpu); + r = nonpaging_init_context(vcpu, context); else if (is_long_mode(vcpu)) - r = paging64_init_context(vcpu); + r = paging64_init_context(vcpu, context); else if (is_pae(vcpu)) - r = paging32E_init_context(vcpu); + r = paging32E_init_context(vcpu, context); else - r = paging32_init_context(vcpu); + r = paging32_init_context(vcpu, context); vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); - vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); + vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); return r; } +EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); + +static int init_kvm_softmmu(struct kvm_vcpu *vcpu) +{ + int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); + + vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; + vcpu->arch.walk_mmu->get_cr3 = get_cr3; + vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; + + return r; +} + +static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; + + g_context->get_cr3 = get_cr3; + g_context->inject_page_fault = kvm_inject_page_fault; + + /* + * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The + * translation of l2_gpa to l1_gpa addresses is done using the + * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa + * functions between mmu and nested_mmu are swapped. + */ + if (!is_paging(vcpu)) { + g_context->nx = false; + g_context->root_level = 0; + g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; + } else if (is_long_mode(vcpu)) { + g_context->nx = is_nx(vcpu); + reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL); + g_context->root_level = PT64_ROOT_LEVEL; + g_context->gva_to_gpa = paging64_gva_to_gpa_nested; + } else if (is_pae(vcpu)) { + g_context->nx = is_nx(vcpu); + reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL); + g_context->root_level = PT32E_ROOT_LEVEL; + g_context->gva_to_gpa = paging64_gva_to_gpa_nested; + } else { + g_context->nx = false; + reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL); + g_context->root_level = PT32_ROOT_LEVEL; + g_context->gva_to_gpa = paging32_gva_to_gpa_nested; + } + + return 0; +} static int init_kvm_mmu(struct kvm_vcpu *vcpu) { vcpu->arch.update_pte.pfn = bad_pfn; - if (tdp_enabled) + if (mmu_is_nested(vcpu)) + return init_kvm_nested_mmu(vcpu); + else if (tdp_enabled) return init_kvm_tdp_mmu(vcpu); else return init_kvm_softmmu(vcpu); @@ -2653,7 +2963,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) if (r) goto out; /* set_cr3() should ensure TLB has been flushed */ - kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); + vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); out: return r; } @@ -2663,6 +2973,7 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu) { mmu_free_roots(vcpu); } +EXPORT_SYMBOL_GPL(kvm_mmu_unload); static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -2695,7 +3006,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, return; } - if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) + if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) return; ++vcpu->kvm->stat.mmu_pte_updated; @@ -2837,7 +3148,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_mmu_access_page(vcpu, gfn); kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; - kvm_mmu_audit(vcpu, "pre pte write"); + trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); if (guest_initiated) { if (gfn == vcpu->arch.last_pt_write_gfn && !last_updated_pte_accessed(vcpu)) { @@ -2910,7 +3221,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - kvm_mmu_audit(vcpu, "post pte write"); + trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); spin_unlock(&vcpu->kvm->mmu_lock); if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); @@ -2923,7 +3234,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) gpa_t gpa; int r; - if (tdp_enabled) + if (vcpu->arch.mmu.direct_map) return 0; gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); @@ -2937,21 +3248,18 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - int free_pages; LIST_HEAD(invalid_list); - free_pages = vcpu->kvm->arch.n_free_mmu_pages; - while (free_pages < KVM_REFILL_PAGES && + while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES && !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *sp; sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, - &invalid_list); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); ++vcpu->kvm->stat.mmu_recycled; } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) @@ -3013,6 +3321,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp); static void free_mmu_pages(struct kvm_vcpu *vcpu) { free_page((unsigned long)vcpu->arch.mmu.pae_root); + if (vcpu->arch.mmu.lm_root != NULL) + free_page((unsigned long)vcpu->arch.mmu.lm_root); } static int alloc_mmu_pages(struct kvm_vcpu *vcpu) @@ -3054,15 +3364,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) return init_kvm_mmu(vcpu); } -void kvm_mmu_destroy(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - - destroy_kvm_mmu(vcpu); - free_mmu_pages(vcpu); - mmu_free_memory_caches(vcpu); -} - void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) { struct kvm_mmu_page *sp; @@ -3112,23 +3413,22 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { struct kvm *kvm; struct kvm *kvm_freed = NULL; - int cache_count = 0; + + if (nr_to_scan == 0) + goto out; spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - int npages, idx, freed_pages; + int idx, freed_pages; LIST_HEAD(invalid_list); idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - npages = kvm->arch.n_alloc_mmu_pages - - kvm->arch.n_free_mmu_pages; - cache_count += npages; - if (!kvm_freed && nr_to_scan > 0 && npages > 0) { + if (!kvm_freed && nr_to_scan > 0 && + kvm->arch.n_used_mmu_pages > 0) { freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list); - cache_count -= freed_pages; kvm_freed = kvm; } nr_to_scan--; @@ -3142,7 +3442,8 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) spin_unlock(&kvm_lock); - return cache_count; +out: + return percpu_counter_read_positive(&kvm_total_used_mmu_pages); } static struct shrinker mmu_shrinker = { @@ -3163,6 +3464,7 @@ static void mmu_destroy_caches(void) void kvm_mmu_module_exit(void) { mmu_destroy_caches(); + percpu_counter_destroy(&kvm_total_used_mmu_pages); unregister_shrinker(&mmu_shrinker); } @@ -3185,6 +3487,9 @@ int kvm_mmu_module_init(void) if (!mmu_page_header_cache) goto nomem; + if (percpu_counter_init(&kvm_total_used_mmu_pages, 0)) + goto nomem; + register_shrinker(&mmu_shrinker); return 0; @@ -3355,271 +3660,18 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) } EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); -#ifdef AUDIT - -static const char *audit_msg; - -static gva_t canonicalize(gva_t gva) -{ -#ifdef CONFIG_X86_64 - gva = (long long)(gva << 16) >> 16; +#ifdef CONFIG_KVM_MMU_AUDIT +#include "mmu_audit.c" +#else +static void mmu_audit_disable(void) { } #endif - return gva; -} - - -typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep); - -static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, - inspect_spte_fn fn) -{ - int i; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - u64 ent = sp->spt[i]; - - if (is_shadow_present_pte(ent)) { - if (!is_last_spte(ent, sp->role.level)) { - struct kvm_mmu_page *child; - child = page_header(ent & PT64_BASE_ADDR_MASK); - __mmu_spte_walk(kvm, child, fn); - } else - fn(kvm, &sp->spt[i]); - } - } -} - -static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) -{ - int i; - struct kvm_mmu_page *sp; - - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return; - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; - sp = page_header(root); - __mmu_spte_walk(vcpu->kvm, sp, fn); - return; - } - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; - - if (root && VALID_PAGE(root)) { - root &= PT64_BASE_ADDR_MASK; - sp = page_header(root); - __mmu_spte_walk(vcpu->kvm, sp, fn); - } - } - return; -} - -static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, - gva_t va, int level) -{ - u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); - int i; - gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { - u64 ent = pt[i]; - - if (ent == shadow_trap_nonpresent_pte) - continue; - - va = canonicalize(va); - if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) - audit_mappings_page(vcpu, ent, va, level - 1); - else { - gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL); - gfn_t gfn = gpa >> PAGE_SHIFT; - pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); - hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - continue; - } - - if (is_shadow_present_pte(ent) - && (ent & PT64_BASE_ADDR_MASK) != hpa) - printk(KERN_ERR "xx audit error: (%s) levels %d" - " gva %lx gpa %llx hpa %llx ent %llx %d\n", - audit_msg, vcpu->arch.mmu.root_level, - va, gpa, hpa, ent, - is_shadow_present_pte(ent)); - else if (ent == shadow_notrap_nonpresent_pte - && !is_error_hpa(hpa)) - printk(KERN_ERR "audit: (%s) notrap shadow," - " valid guest gva %lx\n", audit_msg, va); - kvm_release_pfn_clean(pfn); - - } - } -} - -static void audit_mappings(struct kvm_vcpu *vcpu) -{ - unsigned i; - - if (vcpu->arch.mmu.root_level == 4) - audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); - else - for (i = 0; i < 4; ++i) - if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) - audit_mappings_page(vcpu, - vcpu->arch.mmu.pae_root[i], - i << 30, - 2); -} - -static int count_rmaps(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = vcpu->kvm; - struct kvm_memslots *slots; - int nmaps = 0; - int i, j, k, idx; - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); - for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *m = &slots->memslots[i]; - struct kvm_rmap_desc *d; - - for (j = 0; j < m->npages; ++j) { - unsigned long *rmapp = &m->rmap[j]; - - if (!*rmapp) - continue; - if (!(*rmapp & 1)) { - ++nmaps; - continue; - } - d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - while (d) { - for (k = 0; k < RMAP_EXT; ++k) - if (d->sptes[k]) - ++nmaps; - else - break; - d = d->more; - } - } - } - srcu_read_unlock(&kvm->srcu, idx); - return nmaps; -} - -void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) -{ - unsigned long *rmapp; - struct kvm_mmu_page *rev_sp; - gfn_t gfn; - - if (is_writable_pte(*sptep)) { - rev_sp = page_header(__pa(sptep)); - gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); - - if (!gfn_to_memslot(kvm, gfn)) { - if (!printk_ratelimit()) - return; - printk(KERN_ERR "%s: no memslot for gfn %ld\n", - audit_msg, gfn); - printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", - audit_msg, (long int)(sptep - rev_sp->spt), - rev_sp->gfn); - dump_stack(); - return; - } - - rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); - if (!*rmapp) { - if (!printk_ratelimit()) - return; - printk(KERN_ERR "%s: no rmap for writable spte %llx\n", - audit_msg, *sptep); - dump_stack(); - } - } - -} - -void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu) -{ - mmu_spte_walk(vcpu, inspect_spte_has_rmap); -} - -static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *sp; - int i; - - list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - u64 *pt = sp->spt; - - if (sp->role.level != PT_PAGE_TABLE_LEVEL) - continue; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - u64 ent = pt[i]; - - if (!(ent & PT_PRESENT_MASK)) - continue; - if (!is_writable_pte(ent)) - continue; - inspect_spte_has_rmap(vcpu->kvm, &pt[i]); - } - } - return; -} - -static void audit_rmap(struct kvm_vcpu *vcpu) -{ - check_writable_mappings_rmap(vcpu); - count_rmaps(vcpu); -} - -static void audit_write_protection(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *sp; - struct kvm_memory_slot *slot; - unsigned long *rmapp; - u64 *spte; - gfn_t gfn; - - list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - if (sp->role.direct) - continue; - if (sp->unsync) - continue; - - slot = gfn_to_memslot(vcpu->kvm, sp->gfn); - rmapp = &slot->rmap[gfn - slot->base_gfn]; - - spte = rmap_next(vcpu->kvm, rmapp, NULL); - while (spte) { - if (is_writable_pte(*spte)) - printk(KERN_ERR "%s: (%s) shadow page has " - "writable mappings: gfn %lx role %x\n", - __func__, audit_msg, sp->gfn, - sp->role.word); - spte = rmap_next(vcpu->kvm, rmapp, spte); - } - } -} - -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) +void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { - int olddbg = dbg; + ASSERT(vcpu); - dbg = 0; - audit_msg = msg; - audit_rmap(vcpu); - audit_write_protection(vcpu); - if (strcmp("pre pte write", audit_msg) != 0) - audit_mappings(vcpu); - audit_writable_sptes_have_rmaps(vcpu); - dbg = olddbg; + destroy_kvm_mmu(vcpu); + free_mmu_pages(vcpu); + mmu_free_memory_caches(vcpu); + mmu_audit_disable(); } - -#endif diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index be66759..7086ca8 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -49,10 +49,17 @@ #define PFERR_FETCH_MASK (1U << 4) int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); +int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); + +static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) +{ + return kvm->arch.n_max_mmu_pages - + kvm->arch.n_used_mmu_pages; +} static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) + if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES)) __kvm_mmu_free_some_pages(vcpu); } diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c new file mode 100644 index 0000000..ba2bcdd --- /dev/null +++ b/arch/x86/kvm/mmu_audit.c @@ -0,0 +1,299 @@ +/* + * mmu_audit.c: + * + * Audit code for KVM MMU + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Yaniv Kamay <yaniv@qumranet.com> + * Avi Kivity <avi@qumranet.com> + * Marcelo Tosatti <mtosatti@redhat.com> + * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include <linux/ratelimit.h> + +static int audit_point; + +#define audit_printk(fmt, args...) \ + printk(KERN_ERR "audit: (%s) error: " \ + fmt, audit_point_name[audit_point], ##args) + +typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); + +static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + inspect_spte_fn fn, int level) +{ + int i; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + u64 *ent = sp->spt; + + fn(vcpu, ent + i, level); + + if (is_shadow_present_pte(ent[i]) && + !is_last_spte(ent[i], level)) { + struct kvm_mmu_page *child; + + child = page_header(ent[i] & PT64_BASE_ADDR_MASK); + __mmu_spte_walk(vcpu, child, fn, level - 1); + } + } +} + +static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + + if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + + sp = page_header(root); + __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL); + return; + } + + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root && VALID_PAGE(root)) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + __mmu_spte_walk(vcpu, sp, fn, 2); + } + } + + return; +} + +typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp); + +static void walk_all_active_sps(struct kvm *kvm, sp_handler fn) +{ + struct kvm_mmu_page *sp; + + list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) + fn(kvm, sp); +} + +static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) +{ + struct kvm_mmu_page *sp; + gfn_t gfn; + pfn_t pfn; + hpa_t hpa; + + sp = page_header(__pa(sptep)); + + if (sp->unsync) { + if (level != PT_PAGE_TABLE_LEVEL) { + audit_printk("unsync sp: %p level = %d\n", sp, level); + return; + } + + if (*sptep == shadow_notrap_nonpresent_pte) { + audit_printk("notrap spte in unsync sp: %p\n", sp); + return; + } + } + + if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { + audit_printk("notrap spte in direct sp: %p\n", sp); + return; + } + + if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level)) + return; + + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); + + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + return; + } + + hpa = pfn << PAGE_SHIFT; + if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) + audit_printk("levels %d pfn %llx hpa %llx ent %llxn", + vcpu->arch.mmu.root_level, pfn, hpa, *sptep); +} + +static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) +{ + unsigned long *rmapp; + struct kvm_mmu_page *rev_sp; + gfn_t gfn; + + + rev_sp = page_header(__pa(sptep)); + gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); + + if (!gfn_to_memslot(kvm, gfn)) { + if (!printk_ratelimit()) + return; + audit_printk("no memslot for gfn %llx\n", gfn); + audit_printk("index %ld of sp (gfn=%llx)\n", + (long int)(sptep - rev_sp->spt), rev_sp->gfn); + dump_stack(); + return; + } + + rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); + if (!*rmapp) { + if (!printk_ratelimit()) + return; + audit_printk("no rmap for writable spte %llx\n", *sptep); + dump_stack(); + } +} + +static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level) +{ + if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level)) + inspect_spte_has_rmap(vcpu->kvm, sptep); +} + +static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level) +{ + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + + if (audit_point == AUDIT_POST_SYNC && sp->unsync) + audit_printk("meet unsync sp(%p) after sync root.\n", sp); +} + +static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + int i; + + if (sp->role.level != PT_PAGE_TABLE_LEVEL) + return; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (!is_rmap_spte(sp->spt[i])) + continue; + + inspect_spte_has_rmap(kvm, sp->spt + i); + } +} + +static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + struct kvm_memory_slot *slot; + unsigned long *rmapp; + u64 *spte; + + if (sp->role.direct || sp->unsync || sp->role.invalid) + return; + + slot = gfn_to_memslot(kvm, sp->gfn); + rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + if (is_writable_pte(*spte)) + audit_printk("shadow page has writable mappings: gfn " + "%llx role %x\n", sp->gfn, sp->role.word); + spte = rmap_next(kvm, rmapp, spte); + } +} + +static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + check_mappings_rmap(kvm, sp); + audit_write_protection(kvm, sp); +} + +static void audit_all_active_sps(struct kvm *kvm) +{ + walk_all_active_sps(kvm, audit_sp); +} + +static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level) +{ + audit_sptes_have_rmaps(vcpu, sptep, level); + audit_mappings(vcpu, sptep, level); + audit_spte_after_sync(vcpu, sptep, level); +} + +static void audit_vcpu_spte(struct kvm_vcpu *vcpu) +{ + mmu_spte_walk(vcpu, audit_spte); +} + +static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point) +{ + static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); + + if (!__ratelimit(&ratelimit_state)) + return; + + audit_point = point; + audit_all_active_sps(vcpu->kvm); + audit_vcpu_spte(vcpu); +} + +static bool mmu_audit; + +static void mmu_audit_enable(void) +{ + int ret; + + if (mmu_audit) + return; + + ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); + WARN_ON(ret); + + mmu_audit = true; +} + +static void mmu_audit_disable(void) +{ + if (!mmu_audit) + return; + + unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); + tracepoint_synchronize_unregister(); + mmu_audit = false; +} + +static int mmu_audit_set(const char *val, const struct kernel_param *kp) +{ + int ret; + unsigned long enable; + + ret = strict_strtoul(val, 10, &enable); + if (ret < 0) + return -EINVAL; + + switch (enable) { + case 0: + mmu_audit_disable(); + break; + case 1: + mmu_audit_enable(); + break; + default: + return -EINVAL; + } + + return 0; +} + +static struct kernel_param_ops audit_param_ops = { + .set = mmu_audit_set, + .get = param_get_bool, +}; + +module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 3aab0f0..b60b4fd 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -195,6 +195,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, TP_ARGS(sp) ); + +TRACE_EVENT( + kvm_mmu_audit, + TP_PROTO(struct kvm_vcpu *vcpu, int audit_point), + TP_ARGS(vcpu, audit_point), + + TP_STRUCT__entry( + __field(struct kvm_vcpu *, vcpu) + __field(int, audit_point) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->audit_point = audit_point; + ), + + TP_printk("vcpu:%d %s", __entry->vcpu->cpu, + audit_point_name[__entry->audit_point]) +); #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 51ef909..cd7a833 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -7,7 +7,7 @@ * MMU support * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -67,6 +67,7 @@ struct guest_walker { int level; gfn_t table_gfn[PT_MAX_FULL_LEVELS]; pt_element_t ptes[PT_MAX_FULL_LEVELS]; + pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; unsigned pt_access; unsigned pte_access; @@ -104,7 +105,7 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; #if PTTYPE == 64 - if (is_nx(vcpu)) + if (vcpu->arch.mmu.nx) access &= ~(gpte >> PT64_NX_SHIFT); #endif return access; @@ -113,26 +114,32 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) /* * Fetch a guest pte for a guest virtual address */ -static int FNAME(walk_addr)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr, - int write_fault, int user_fault, int fetch_fault) +static int FNAME(walk_addr_generic)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gva_t addr, u32 access) { pt_element_t pte; gfn_t table_gfn; unsigned index, pt_access, uninitialized_var(pte_access); gpa_t pte_gpa; bool eperm, present, rsvd_fault; + int offset, write_fault, user_fault, fetch_fault; + + write_fault = access & PFERR_WRITE_MASK; + user_fault = access & PFERR_USER_MASK; + fetch_fault = access & PFERR_FETCH_MASK; trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, fetch_fault); walk: present = true; eperm = rsvd_fault = false; - walker->level = vcpu->arch.mmu.root_level; - pte = vcpu->arch.cr3; + walker->level = mmu->root_level; + pte = mmu->get_cr3(vcpu); + #if PTTYPE == 64 - if (!is_long_mode(vcpu)) { - pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); + if (walker->level == PT32E_ROOT_LEVEL) { + pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); if (!is_present_gpte(pte)) { present = false; @@ -142,7 +149,7 @@ walk: } #endif ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || - (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0); + (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); pt_access = ACC_ALL; @@ -150,12 +157,14 @@ walk: index = PT_INDEX(addr, walker->level); table_gfn = gpte_to_gfn(pte); - pte_gpa = gfn_to_gpa(table_gfn); - pte_gpa += index * sizeof(pt_element_t); + offset = index * sizeof(pt_element_t); + pte_gpa = gfn_to_gpa(table_gfn) + offset; walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) { + if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte, + offset, sizeof(pte), + PFERR_USER_MASK|PFERR_WRITE_MASK)) { present = false; break; } @@ -167,7 +176,7 @@ walk: break; } - if (is_rsvd_bits_set(vcpu, pte, walker->level)) { + if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) { rsvd_fault = true; break; } @@ -204,17 +213,28 @@ walk: (PTTYPE == 64 || is_pse(vcpu))) || ((walker->level == PT_PDPE_LEVEL) && is_large_pte(pte) && - is_long_mode(vcpu))) { + mmu->root_level == PT64_ROOT_LEVEL)) { int lvl = walker->level; + gpa_t real_gpa; + gfn_t gfn; + u32 ac; - walker->gfn = gpte_to_gfn_lvl(pte, lvl); - walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) - >> PAGE_SHIFT; + gfn = gpte_to_gfn_lvl(pte, lvl); + gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36()) - walker->gfn += pse36_gfn_delta(pte); + gfn += pse36_gfn_delta(pte); + + ac = write_fault | fetch_fault | user_fault; + + real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), + ac); + if (real_gpa == UNMAPPED_GVA) + return 0; + + walker->gfn = real_gpa >> PAGE_SHIFT; break; } @@ -249,18 +269,36 @@ error: walker->error_code = 0; if (present) walker->error_code |= PFERR_PRESENT_MASK; - if (write_fault) - walker->error_code |= PFERR_WRITE_MASK; - if (user_fault) - walker->error_code |= PFERR_USER_MASK; - if (fetch_fault && is_nx(vcpu)) + + walker->error_code |= write_fault | user_fault; + + if (fetch_fault && mmu->nx) walker->error_code |= PFERR_FETCH_MASK; if (rsvd_fault) walker->error_code |= PFERR_RSVD_MASK; + + vcpu->arch.fault.address = addr; + vcpu->arch.fault.error_code = walker->error_code; + trace_kvm_mmu_walker_error(walker->error_code); return 0; } +static int FNAME(walk_addr)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, gva_t addr, u32 access) +{ + return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr, + access); +} + +static int FNAME(walk_addr_nested)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, gva_t addr, + u32 access) +{ + return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, + addr, access); +} + static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte) { @@ -302,14 +340,87 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, struct guest_walker *gw, int level) { - int r; pt_element_t curr_pte; - - r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1], + gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1]; + u64 mask; + int r, index; + + if (level == PT_PAGE_TABLE_LEVEL) { + mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1; + base_gpa = pte_gpa & ~mask; + index = (pte_gpa - base_gpa) / sizeof(pt_element_t); + + r = kvm_read_guest_atomic(vcpu->kvm, base_gpa, + gw->prefetch_ptes, sizeof(gw->prefetch_ptes)); + curr_pte = gw->prefetch_ptes[index]; + } else + r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &curr_pte, sizeof(curr_pte)); + return r || curr_pte != gw->ptes[level - 1]; } +static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, + u64 *sptep) +{ + struct kvm_mmu_page *sp; + struct kvm_mmu *mmu = &vcpu->arch.mmu; + pt_element_t *gptep = gw->prefetch_ptes; + u64 *spte; + int i; + + sp = page_header(__pa(sptep)); + + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + return; + + if (sp->role.direct) + return __direct_pte_prefetch(vcpu, sp, sptep); + + i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + spte = sp->spt + i; + + for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { + pt_element_t gpte; + unsigned pte_access; + gfn_t gfn; + pfn_t pfn; + bool dirty; + + if (spte == sptep) + continue; + + if (*spte != shadow_trap_nonpresent_pte) + continue; + + gpte = gptep[i]; + + if (!is_present_gpte(gpte) || + is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) { + if (!sp->unsync) + __set_spte(spte, shadow_notrap_nonpresent_pte); + continue; + } + + if (!(gpte & PT_ACCESSED_MASK)) + continue; + + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + gfn = gpte_to_gfn(gpte); + dirty = is_dirty_gpte(gpte); + pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, + (pte_access & ACC_WRITE_MASK) && dirty); + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + break; + } + + mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, + dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn, + pfn, true, true); + } +} + /* * Fetch a shadow pte for a specific level in the paging hierarchy. */ @@ -391,6 +502,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, user_fault, write_fault, dirty, ptwrite, it.level, gw->gfn, pfn, false, true); + FNAME(pte_prefetch)(vcpu, gw, it.sptep); return it.sptep; @@ -420,7 +532,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, { int write_fault = error_code & PFERR_WRITE_MASK; int user_fault = error_code & PFERR_USER_MASK; - int fetch_fault = error_code & PFERR_FETCH_MASK; struct guest_walker walker; u64 *sptep; int write_pt = 0; @@ -430,7 +541,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, unsigned long mmu_seq; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); - kvm_mmu_audit(vcpu, "pre page fault"); r = mmu_topup_memory_caches(vcpu); if (r) @@ -439,15 +549,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, /* * Look up the guest pte for the faulting address. */ - r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, - fetch_fault); + r = FNAME(walk_addr)(&walker, vcpu, addr, error_code); /* * The page is not mapped by the guest. Let the guest handle it. */ if (!r) { pgprintk("%s: guest page fault\n", __func__); - inject_page_fault(vcpu, addr, walker.error_code); + inject_page_fault(vcpu); vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ return 0; } @@ -468,6 +577,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; + + trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); kvm_mmu_free_some_pages(vcpu); sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, level, &write_pt, pfn); @@ -479,7 +590,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ ++vcpu->stat.pf_fixed; - kvm_mmu_audit(vcpu, "post page fault (fixed)"); + trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); spin_unlock(&vcpu->kvm->mmu_lock); return write_pt; @@ -556,10 +667,25 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr)(&walker, vcpu, vaddr, - !!(access & PFERR_WRITE_MASK), - !!(access & PFERR_USER_MASK), - !!(access & PFERR_FETCH_MASK)); + r = FNAME(walk_addr)(&walker, vcpu, vaddr, access); + + if (r) { + gpa = gfn_to_gpa(walker.gfn); + gpa |= vaddr & ~PAGE_MASK; + } else if (error) + *error = walker.error_code; + + return gpa; +} + +static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, u32 *error) +{ + struct guest_walker walker; + gpa_t gpa = UNMAPPED_GVA; + int r; + + r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access); if (r) { gpa = gfn_to_gpa(walker.gfn); @@ -638,7 +764,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return -EINVAL; gfn = gpte_to_gfn(gpte); - if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL) + if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL) || gfn != sp->gfns[i] || !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) { u64 nonpresent; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8a3f9f6..82e144a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4,7 +4,7 @@ * AMD SVM support * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -88,6 +88,14 @@ struct nested_state { /* A VMEXIT is required but not yet emulated */ bool exit_required; + /* + * If we vmexit during an instruction emulation we need this to restore + * the l1 guest rip after the emulation + */ + unsigned long vmexit_rip; + unsigned long vmexit_rsp; + unsigned long vmexit_rax; + /* cache for intercepts of the guest */ u16 intercept_cr_read; u16 intercept_cr_write; @@ -96,6 +104,8 @@ struct nested_state { u32 intercept_exceptions; u64 intercept; + /* Nested Paging related state */ + u64 nested_cr3; }; #define MSRPM_OFFSETS 16 @@ -284,6 +294,15 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) force_new_asid(vcpu); } +static int get_npt_level(void) +{ +#ifdef CONFIG_X86_64 + return PT64_ROOT_LEVEL; +#else + return PT32E_ROOT_LEVEL; +#endif +} + static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { vcpu->arch.efer = efer; @@ -701,6 +720,29 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } +static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 g_tsc_offset = 0; + + if (is_nested(svm)) { + g_tsc_offset = svm->vmcb->control.tsc_offset - + svm->nested.hsave->control.tsc_offset; + svm->nested.hsave->control.tsc_offset = offset; + } + + svm->vmcb->control.tsc_offset = offset + g_tsc_offset; +} + +static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.tsc_offset += adjustment; + if (is_nested(svm)) + svm->nested.hsave->control.tsc_offset += adjustment; +} + static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -793,7 +835,7 @@ static void init_vmcb(struct vcpu_svm *svm) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - save->efer = EFER_SVME; + svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; save->dr7 = 0x400; save->rflags = 2; @@ -804,8 +846,8 @@ static void init_vmcb(struct vcpu_svm *svm) * This is the guest-visible cr0 value. * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. */ - svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); + svm->vcpu.arch.cr0 = 0; + (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); save->cr4 = X86_CR4_PAE; /* rdx = ?? */ @@ -901,7 +943,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; init_vmcb(svm); - svm->vmcb->control.tsc_offset = 0-native_read_tsc(); + kvm_write_tsc(&svm->vcpu, 0); err = fx_init(&svm->vcpu); if (err) @@ -947,20 +989,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) int i; if (unlikely(cpu != vcpu->cpu)) { - u64 delta; - - if (check_tsc_unstable()) { - /* - * Make sure that the guest sees a monotonically - * increasing TSC. - */ - delta = vcpu->arch.host_tsc - native_read_tsc(); - svm->vmcb->control.tsc_offset += delta; - if (is_nested(svm)) - svm->nested.hsave->control.tsc_offset += delta; - } - vcpu->cpu = cpu; - kvm_migrate_timers(vcpu); svm->asid_generation = 0; } @@ -976,8 +1004,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) ++vcpu->stat.host_state_reload; for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); - - vcpu->arch.host_tsc = native_read_tsc(); } static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) @@ -995,7 +1021,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) switch (reg) { case VCPU_EXREG_PDPTR: BUG_ON(!npt_enabled); - load_pdptrs(vcpu, vcpu->arch.cr3); + load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); break; default: BUG(); @@ -1206,8 +1232,12 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (old == new) { /* cr0 write with ts and mp unchanged */ svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; - if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) + if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) { + svm->nested.vmexit_rip = kvm_rip_read(vcpu); + svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); + svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX); return; + } } } @@ -1581,6 +1611,54 @@ static int vmmcall_interception(struct vcpu_svm *svm) return 1; } +static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + return svm->nested.nested_cr3; +} + +static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, + unsigned long root) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.nested_cr3 = root; + force_new_asid(vcpu); +} + +static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.exit_code = SVM_EXIT_NPF; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code; + svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address; + + nested_svm_vmexit(svm); +} + +static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) +{ + int r; + + r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); + + vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; + vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; + vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; + vcpu->arch.mmu.shadow_root_level = get_npt_level(); + vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; + + return r; +} + +static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) +{ + vcpu->arch.walk_mmu = &vcpu->arch.mmu; +} + static int nested_svm_check_permissions(struct vcpu_svm *svm) { if (!(svm->vcpu.arch.efer & EFER_SVME) @@ -1629,6 +1707,14 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm) if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) return false; + /* + * if vmexit was already requested (by intercepted exception + * for instance) do not overwrite it with "external interrupt" + * vmexit. + */ + if (svm->nested.exit_required) + return false; + svm->vmcb->control.exit_code = SVM_EXIT_INTR; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; @@ -1896,6 +1982,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->save.ds = vmcb->save.ds; nested_vmcb->save.gdtr = vmcb->save.gdtr; nested_vmcb->save.idtr = vmcb->save.idtr; + nested_vmcb->save.efer = svm->vcpu.arch.efer; nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; nested_vmcb->save.cr2 = vmcb->save.cr2; @@ -1917,6 +2004,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; + nested_vmcb->control.next_rip = vmcb->control.next_rip; /* * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have @@ -1947,6 +2035,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) kvm_clear_exception_queue(&svm->vcpu); kvm_clear_interrupt_queue(&svm->vcpu); + svm->nested.nested_cr3 = 0; + /* Restore selected save entries */ svm->vmcb->save.es = hsave->save.es; svm->vmcb->save.cs = hsave->save.cs; @@ -1973,6 +2063,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_svm_unmap(page); + nested_svm_uninit_mmu_context(&svm->vcpu); kvm_mmu_reset_context(&svm->vcpu); kvm_mmu_load(&svm->vcpu); @@ -2012,6 +2103,20 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) return true; } +static bool nested_vmcb_checks(struct vmcb *vmcb) +{ + if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0) + return false; + + if (vmcb->control.asid == 0) + return false; + + if (vmcb->control.nested_ctl && !npt_enabled) + return false; + + return true; +} + static bool nested_svm_vmrun(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; @@ -2026,7 +2131,18 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) if (!nested_vmcb) return false; - trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa, + if (!nested_vmcb_checks(nested_vmcb)) { + nested_vmcb->control.exit_code = SVM_EXIT_ERR; + nested_vmcb->control.exit_code_hi = 0; + nested_vmcb->control.exit_info_1 = 0; + nested_vmcb->control.exit_info_2 = 0; + + nested_svm_unmap(page); + + return false; + } + + trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, nested_vmcb->save.rip, nested_vmcb->control.int_ctl, nested_vmcb->control.event_inj, @@ -2055,7 +2171,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); hsave->save.cr4 = svm->vcpu.arch.cr4; hsave->save.rflags = vmcb->save.rflags; - hsave->save.rip = svm->next_rip; + hsave->save.rip = kvm_rip_read(&svm->vcpu); hsave->save.rsp = vmcb->save.rsp; hsave->save.rax = vmcb->save.rax; if (npt_enabled) @@ -2070,6 +2186,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) else svm->vcpu.arch.hflags &= ~HF_HIF_MASK; + if (nested_vmcb->control.nested_ctl) { + kvm_mmu_unload(&svm->vcpu); + svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3; + nested_svm_init_mmu_context(&svm->vcpu); + } + /* Load the nested guest state */ svm->vmcb->save.es = nested_vmcb->save.es; svm->vmcb->save.cs = nested_vmcb->save.cs; @@ -2227,8 +2349,8 @@ static int vmrun_interception(struct vcpu_svm *svm) if (nested_svm_check_permissions(svm)) return 1; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + /* Save rip after vmrun instruction */ + kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3); if (!nested_svm_vmrun(svm)) return 1; @@ -2257,6 +2379,7 @@ static int stgi_interception(struct vcpu_svm *svm) svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); enable_gif(svm); @@ -2399,6 +2522,23 @@ static int emulate_on_interception(struct vcpu_svm *svm) return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; } +static int cr0_write_interception(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + int r; + + r = emulate_instruction(&svm->vcpu, 0, 0, 0); + + if (svm->nested.vmexit_rip) { + kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip); + kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp); + kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax); + svm->nested.vmexit_rip = 0; + } + + return r == EMULATE_DONE; +} + static int cr8_write_interception(struct vcpu_svm *svm) { struct kvm_run *kvm_run = svm->vcpu.run; @@ -2542,20 +2682,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) struct vcpu_svm *svm = to_svm(vcpu); switch (ecx) { - case MSR_IA32_TSC: { - u64 tsc_offset = data - native_read_tsc(); - u64 g_tsc_offset = 0; - - if (is_nested(svm)) { - g_tsc_offset = svm->vmcb->control.tsc_offset - - svm->nested.hsave->control.tsc_offset; - svm->nested.hsave->control.tsc_offset = tsc_offset; - } - - svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset; - + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, data); break; - } case MSR_STAR: svm->vmcb->save.star = data; break; @@ -2643,6 +2772,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm) { struct kvm_run *kvm_run = svm->vcpu.run; + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; /* @@ -2672,7 +2802,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR4] = emulate_on_interception, [SVM_EXIT_READ_CR8] = emulate_on_interception, [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, - [SVM_EXIT_WRITE_CR0] = emulate_on_interception, + [SVM_EXIT_WRITE_CR0] = cr0_write_interception, [SVM_EXIT_WRITE_CR3] = emulate_on_interception, [SVM_EXIT_WRITE_CR4] = emulate_on_interception, [SVM_EXIT_WRITE_CR8] = cr8_write_interception, @@ -2871,7 +3001,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) if (is_external_interrupt(svm->vmcb->control.exit_int_info) && exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && - exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH) + exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && + exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " "exit_code 0x%x\n", __func__, svm->vmcb->control.exit_int_info, @@ -3088,8 +3219,10 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) svm->int3_injected = 0; - if (svm->vcpu.arch.hflags & HF_IRET_MASK) + if (svm->vcpu.arch.hflags & HF_IRET_MASK) { svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + } svm->vcpu.arch.nmi_injected = false; kvm_clear_exception_queue(&svm->vcpu); @@ -3098,6 +3231,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) if (!(exitintinfo & SVM_EXITINTINFO_VALID)) return; + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; @@ -3134,6 +3269,17 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) } } +static void svm_cancel_injection(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + + control->exit_int_info = control->event_inj; + control->exit_int_info_err = control->event_inj_err; + control->event_inj = 0; + svm_complete_interrupts(svm); +} + #ifdef CONFIG_X86_64 #define R "r" #else @@ -3167,9 +3313,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) savesegment(gs, gs_selector); ldt_selector = kvm_read_ldt(); svm->vmcb->save.cr2 = vcpu->arch.cr2; - /* required for live migration with NPT */ - if (npt_enabled) - svm->vmcb->save.cr3 = vcpu->arch.cr3; clgi(); @@ -3291,16 +3434,22 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); - if (npt_enabled) { - svm->vmcb->control.nested_cr3 = root; - force_new_asid(vcpu); - return; - } - svm->vmcb->save.cr3 = root; force_new_asid(vcpu); } +static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.nested_cr3 = root; + + /* Also sync guest cr3 here in case we live migrate */ + svm->vmcb->save.cr3 = vcpu->arch.cr3; + + force_new_asid(vcpu); +} + static int is_disabled(void) { u64 vm_cr; @@ -3333,15 +3482,6 @@ static bool svm_cpu_has_accelerated_tpr(void) return false; } -static int get_npt_level(void) -{ -#ifdef CONFIG_X86_64 - return PT64_ROOT_LEVEL; -#else - return PT32E_ROOT_LEVEL; -#endif -} - static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { return 0; @@ -3354,12 +3494,25 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { switch (func) { + case 0x80000001: + if (nested) + entry->ecx |= (1 << 2); /* Set SVM bit */ + break; case 0x8000000A: entry->eax = 1; /* SVM revision 1 */ entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper ASID emulation to nested SVM */ entry->ecx = 0; /* Reserved */ - entry->edx = 0; /* Do not support any additional features */ + entry->edx = 0; /* Per default do not support any + additional features */ + + /* Support next_rip if host supports it */ + if (svm_has(SVM_FEATURE_NRIP)) + entry->edx |= SVM_FEATURE_NRIP; + + /* Support NPT for the guest if enabled */ + if (npt_enabled) + entry->edx |= SVM_FEATURE_NPT; break; } @@ -3497,6 +3650,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_irq = svm_set_irq, .set_nmi = svm_inject_nmi, .queue_exception = svm_queue_exception, + .cancel_injection = svm_cancel_injection, .interrupt_allowed = svm_interrupt_allowed, .nmi_allowed = svm_nmi_allowed, .get_nmi_mask = svm_get_nmi_mask, @@ -3519,6 +3673,11 @@ static struct kvm_x86_ops svm_x86_ops = { .set_supported_cpuid = svm_set_supported_cpuid, .has_wbinvd_exit = svm_has_wbinvd_exit, + + .write_tsc_offset = svm_write_tsc_offset, + .adjust_tsc_offset = svm_adjust_tsc_offset, + + .set_tdp_cr3 = set_tdp_cr3, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index e16a0db..fc7a101 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -6,7 +6,7 @@ * * timer support * - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7bddfab..8da0e45 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5,7 +5,7 @@ * machines without emulation or binary translation. * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity <avi@qumranet.com> @@ -125,6 +125,7 @@ struct vcpu_vmx { unsigned long host_rsp; int launched; u8 fail; + u32 exit_intr_info; u32 idt_vectoring_info; struct shared_msr_entry *guest_msrs; int nmsrs; @@ -154,11 +155,6 @@ struct vcpu_vmx { u32 limit; u32 ar; } tr, es, ds, fs, gs; - struct { - bool pending; - u8 vector; - unsigned rip; - } irq; } rmode; int vpid; bool emulation_required; @@ -505,7 +501,6 @@ static void __vcpu_clear(void *arg) vmcs_clear(vmx->vmcs); if (per_cpu(current_vmcs, cpu) == vmx->vmcs) per_cpu(current_vmcs, cpu) = NULL; - rdtscll(vmx->vcpu.arch.host_tsc); list_del(&vmx->local_vcpus_link); vmx->vcpu.cpu = -1; vmx->launched = 0; @@ -706,11 +701,10 @@ static void reload_tss(void) /* * VT restores TR but not its size. Useless. */ - struct desc_ptr gdt; + struct desc_ptr *gdt = &__get_cpu_var(host_gdt); struct desc_struct *descs; - native_store_gdt(&gdt); - descs = (void *)gdt.address; + descs = (void *)gdt->address; descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ load_TR_desc(); } @@ -753,7 +747,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) static unsigned long segment_base(u16 selector) { - struct desc_ptr gdt; + struct desc_ptr *gdt = &__get_cpu_var(host_gdt); struct desc_struct *d; unsigned long table_base; unsigned long v; @@ -761,8 +755,7 @@ static unsigned long segment_base(u16 selector) if (!(selector & ~3)) return 0; - native_store_gdt(&gdt); - table_base = gdt.address; + table_base = gdt->address; if (selector & 4) { /* from ldt */ u16 ldt_selector = kvm_read_ldt(); @@ -883,7 +876,6 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 tsc_this, delta, new_offset; u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); if (!vmm_exclusive) @@ -897,37 +889,24 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (vcpu->cpu != cpu) { - struct desc_ptr dt; + struct desc_ptr *gdt = &__get_cpu_var(host_gdt); unsigned long sysenter_esp; - kvm_migrate_timers(vcpu); kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); local_irq_disable(); list_add(&vmx->local_vcpus_link, &per_cpu(vcpus_on_cpu, cpu)); local_irq_enable(); - vcpu->cpu = cpu; /* * Linux uses per-cpu TSS and GDT, so set these when switching * processors. */ vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ - native_store_gdt(&dt); - vmcs_writel(HOST_GDTR_BASE, dt.address); /* 22.2.4 */ + vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */ rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ - - /* - * Make sure the time stamp counter is monotonous. - */ - rdtscll(tsc_this); - if (tsc_this < vcpu->arch.host_tsc) { - delta = vcpu->arch.host_tsc - tsc_this; - new_offset = vmcs_read64(TSC_OFFSET) + delta; - vmcs_write64(TSC_OFFSET, new_offset); - } } } @@ -1044,16 +1023,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, } if (vmx->rmode.vm86_active) { - vmx->rmode.irq.pending = true; - vmx->rmode.irq.vector = nr; - vmx->rmode.irq.rip = kvm_rip_read(vcpu); - if (kvm_exception_is_soft(nr)) - vmx->rmode.irq.rip += - vmx->vcpu.arch.event_exit_inst_len; - intr_info |= INTR_TYPE_SOFT_INTR; - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); - kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); + if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } @@ -1149,12 +1120,17 @@ static u64 guest_read_tsc(void) } /* - * writes 'guest_tsc' into guest's timestamp counter "register" - * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc + * writes 'offset' into guest's timestamp counter offset register */ -static void guest_write_tsc(u64 guest_tsc, u64 host_tsc) +static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +{ + vmcs_write64(TSC_OFFSET, offset); +} + +static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) { - vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); + u64 offset = vmcs_read64(TSC_OFFSET); + vmcs_write64(TSC_OFFSET, offset + adjustment); } /* @@ -1227,7 +1203,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct shared_msr_entry *msr; - u64 host_tsc; int ret = 0; switch (msr_index) { @@ -1257,8 +1232,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_TSC: - rdtscll(host_tsc); - guest_write_tsc(data, host_tsc); + kvm_write_tsc(vcpu, data); break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { @@ -1856,20 +1830,20 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu) return; if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); - vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); - vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); - vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]); + vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]); + vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]); + vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]); + vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]); } } static void ept_save_pdptrs(struct kvm_vcpu *vcpu) { if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); - vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); - vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); - vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); + vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); + vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); + vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); + vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); } __set_bit(VCPU_EXREG_PDPTR, @@ -2515,7 +2489,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) { u32 host_sysenter_cs, msr_low, msr_high; u32 junk; - u64 host_pat, tsc_this, tsc_base; + u64 host_pat; unsigned long a; struct desc_ptr dt; int i; @@ -2656,12 +2630,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); - tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; - rdtscll(tsc_this); - if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc) - tsc_base = tsc_this; - - guest_write_tsc(0, tsc_base); + kvm_write_tsc(&vmx->vcpu, 0); return 0; } @@ -2834,16 +2803,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { - vmx->rmode.irq.pending = true; - vmx->rmode.irq.vector = irq; - vmx->rmode.irq.rip = kvm_rip_read(vcpu); - if (vcpu->arch.interrupt.soft) - vmx->rmode.irq.rip += - vmx->vcpu.arch.event_exit_inst_len; - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); - kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); + if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } intr = irq | INTR_INFO_VALID_MASK; @@ -2875,14 +2836,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) ++vcpu->stat.nmi_injections; if (vmx->rmode.vm86_active) { - vmx->rmode.irq.pending = true; - vmx->rmode.irq.vector = NMI_VECTOR; - vmx->rmode.irq.rip = kvm_rip_read(vcpu); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - NMI_VECTOR | INTR_TYPE_SOFT_INTR | - INTR_INFO_VALID_MASK); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); - kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); + if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, @@ -3346,6 +3301,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { + kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; } @@ -3358,6 +3314,8 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + kvm_make_request(KVM_REQ_EVENT, vcpu); + ++vcpu->stat.irq_window_exits; /* @@ -3614,6 +3572,7 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); ++vcpu->stat.nmi_window_exits; + kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; } @@ -3623,8 +3582,17 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); enum emulation_result err = EMULATE_DONE; int ret = 1; + u32 cpu_exec_ctrl; + bool intr_window_requested; + + cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; while (!guest_state_valid(vcpu)) { + if (intr_window_requested + && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) + return handle_interrupt_window(&vmx->vcpu); + err = emulate_instruction(vcpu, 0, 0, 0); if (err == EMULATE_DO_MMIO) { @@ -3790,18 +3758,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) vmcs_write32(TPR_THRESHOLD, irr); } -static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { - u32 exit_intr_info; - u32 idt_vectoring_info = vmx->idt_vectoring_info; - bool unblock_nmi; - u8 vector; - int type; - bool idtv_info_valid; - - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + u32 exit_intr_info = vmx->exit_intr_info; /* Handle machine checks before interrupts are enabled */ if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) @@ -3816,8 +3775,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) asm("int $2"); kvm_after_handle_nmi(&vmx->vcpu); } +} - idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; +static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) +{ + u32 exit_intr_info = vmx->exit_intr_info; + bool unblock_nmi; + u8 vector; + bool idtv_info_valid; + + idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; if (cpu_has_virtual_nmis()) { unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; @@ -3839,6 +3806,18 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) } else if (unlikely(vmx->soft_vnmi_blocked)) vmx->vnmi_blocked_time += ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); +} + +static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, + u32 idt_vectoring_info, + int instr_len_field, + int error_code_field) +{ + u8 vector; + int type; + bool idtv_info_valid; + + idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; vmx->vcpu.arch.nmi_injected = false; kvm_clear_exception_queue(&vmx->vcpu); @@ -3847,6 +3826,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) if (!idtv_info_valid) return; + kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); + vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; @@ -3863,18 +3844,18 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) break; case INTR_TYPE_SOFT_EXCEPTION: vmx->vcpu.arch.event_exit_inst_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs_read32(instr_len_field); /* fall through */ case INTR_TYPE_HARD_EXCEPTION: if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { - u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE); + u32 err = vmcs_read32(error_code_field); kvm_queue_exception_e(&vmx->vcpu, vector, err); } else kvm_queue_exception(&vmx->vcpu, vector); break; case INTR_TYPE_SOFT_INTR: vmx->vcpu.arch.event_exit_inst_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs_read32(instr_len_field); /* fall through */ case INTR_TYPE_EXT_INTR: kvm_queue_interrupt(&vmx->vcpu, vector, @@ -3885,27 +3866,21 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) } } -/* - * Failure to inject an interrupt should give us the information - * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs - * when fetching the interrupt redirection bitmap in the real-mode - * tss, this doesn't happen. So we do it ourselves. - */ -static void fixup_rmode_irq(struct vcpu_vmx *vmx) +static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { - vmx->rmode.irq.pending = 0; - if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) - return; - kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); - if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { - vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; - vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; - return; - } - vmx->idt_vectoring_info = - VECTORING_INFO_VALID_MASK - | INTR_TYPE_EXT_INTR - | vmx->rmode.irq.vector; + __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, + VM_EXIT_INSTRUCTION_LEN, + IDT_VECTORING_ERROR_CODE); +} + +static void vmx_cancel_injection(struct kvm_vcpu *vcpu) +{ + __vmx_complete_interrupts(to_vmx(vcpu), + vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), + VM_ENTRY_INSTRUCTION_LEN, + VM_ENTRY_EXCEPTION_ERROR_CODE); + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); } #ifdef CONFIG_X86_64 @@ -4032,7 +4007,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) #endif [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) : "cc", "memory" - , R"bx", R"di", R"si" + , R"ax", R"bx", R"di", R"si" #ifdef CONFIG_X86_64 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #endif @@ -4043,12 +4018,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs_dirty = 0; vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - if (vmx->rmode.irq.pending) - fixup_rmode_irq(vmx); asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->launched = 1; + vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + vmx_complete_atomic_exit(vmx); + vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); } @@ -4119,6 +4097,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); + vmx->vcpu.cpu = cpu; err = vmx_vcpu_setup(vmx); vmx_vcpu_put(&vmx->vcpu); put_cpu(); @@ -4334,6 +4313,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_irq = vmx_inject_irq, .set_nmi = vmx_inject_nmi, .queue_exception = vmx_queue_exception, + .cancel_injection = vmx_cancel_injection, .interrupt_allowed = vmx_interrupt_allowed, .nmi_allowed = vmx_nmi_allowed, .get_nmi_mask = vmx_get_nmi_mask, @@ -4356,6 +4336,11 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_supported_cpuid = vmx_set_supported_cpuid, .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, + + .write_tsc_offset = vmx_write_tsc_offset, + .adjust_tsc_offset = vmx_adjust_tsc_offset, + + .set_tdp_cr3 = vmx_set_cr3, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6c2ecf0..2288ad8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6,7 +6,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2008 Qumranet, Inc. * Copyright IBM Corporation, 2008 - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity <avi@qumranet.com> @@ -55,6 +55,8 @@ #include <asm/mce.h> #include <asm/i387.h> #include <asm/xcr.h> +#include <asm/pvclock.h> +#include <asm/div64.h> #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -71,7 +73,7 @@ #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) #define KVM_MAX_MCE_BANKS 32 -#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P +#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) /* EFER defaults: * - enable syscall per default because its emulated by KVM @@ -282,6 +284,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, u32 prev_nr; int class1, class2; + kvm_make_request(KVM_REQ_EVENT, vcpu); + if (!vcpu->arch.exception.pending) { queue: vcpu->arch.exception.pending = true; @@ -327,16 +331,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) } EXPORT_SYMBOL_GPL(kvm_requeue_exception); -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, - u32 error_code) +void kvm_inject_page_fault(struct kvm_vcpu *vcpu) { + unsigned error_code = vcpu->arch.fault.error_code; + ++vcpu->stat.pf_guest; - vcpu->arch.cr2 = addr; + vcpu->arch.cr2 = vcpu->arch.fault.address; kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } +void kvm_propagate_fault(struct kvm_vcpu *vcpu) +{ + if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested) + vcpu->arch.nested_mmu.inject_page_fault(vcpu); + else + vcpu->arch.mmu.inject_page_fault(vcpu); + + vcpu->arch.fault.nested = false; +} + void kvm_inject_nmi(struct kvm_vcpu *vcpu) { + kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.nmi_pending = 1; } EXPORT_SYMBOL_GPL(kvm_inject_nmi); @@ -367,18 +383,49 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) EXPORT_SYMBOL_GPL(kvm_require_cpl); /* + * This function will be used to read from the physical memory of the currently + * running guest. The difference to kvm_read_guest_page is that this function + * can read from guest physical or from the guest's guest physical memory. + */ +int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gfn_t ngfn, void *data, int offset, int len, + u32 access) +{ + gfn_t real_gfn; + gpa_t ngpa; + + ngpa = gfn_to_gpa(ngfn); + real_gfn = mmu->translate_gpa(vcpu, ngpa, access); + if (real_gfn == UNMAPPED_GVA) + return -EFAULT; + + real_gfn = gpa_to_gfn(real_gfn); + + return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len); +} +EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); + +int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, + void *data, int offset, int len, u32 access) +{ + return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, + data, offset, len, access); +} + +/* * Load the pae pdptrs. Return true is they are all valid. */ -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) +int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) { gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; int i; int ret; - u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; - ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, - offset * sizeof(u64), sizeof(pdpte)); + ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, + offset * sizeof(u64), sizeof(pdpte), + PFERR_USER_MASK|PFERR_WRITE_MASK); if (ret < 0) { ret = 0; goto out; @@ -392,7 +439,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) } ret = 1; - memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); + memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); __set_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_avail); __set_bit(VCPU_EXREG_PDPTR, @@ -405,8 +452,10 @@ EXPORT_SYMBOL_GPL(load_pdptrs); static bool pdptrs_changed(struct kvm_vcpu *vcpu) { - u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; bool changed = true; + int offset; + gfn_t gfn; int r; if (is_long_mode(vcpu) || !is_pae(vcpu)) @@ -416,10 +465,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) (unsigned long *)&vcpu->arch.regs_avail)) return true; - r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); + gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT; + offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1); + r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), + PFERR_USER_MASK | PFERR_WRITE_MASK); if (r < 0) goto out; - changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; + changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; out: return changed; @@ -458,7 +510,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) return 1; } else #endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, + vcpu->arch.cr3)) return 1; } @@ -547,7 +600,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) - && !load_pdptrs(vcpu, vcpu->arch.cr3)) + && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)) return 1; if (cr4 & X86_CR4_VMXE) @@ -580,7 +633,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (is_pae(vcpu)) { if (cr3 & CR3_PAE_RESERVED_BITS) return 1; - if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) + if (is_paging(vcpu) && + !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; } /* @@ -737,7 +791,7 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA }; static unsigned num_msrs_to_save; @@ -838,7 +892,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) /* * The guest calculates current wall clock time by adding - * system time (updated by kvm_write_guest_time below) to the + * system time (updated by kvm_guest_time_update below) to the * wall clock specified here. guest system time equals host * system time for us, thus we must fill in host boot time here. */ @@ -866,65 +920,229 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor) return quotient; } -static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) +static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, + s8 *pshift, u32 *pmultiplier) { - uint64_t nsecs = 1000000000LL; + uint64_t scaled64; int32_t shift = 0; uint64_t tps64; uint32_t tps32; - tps64 = tsc_khz * 1000LL; - while (tps64 > nsecs*2) { + tps64 = base_khz * 1000LL; + scaled64 = scaled_khz * 1000LL; + while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { tps64 >>= 1; shift--; } tps32 = (uint32_t)tps64; - while (tps32 <= (uint32_t)nsecs) { - tps32 <<= 1; + while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { + if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) + scaled64 >>= 1; + else + tps32 <<= 1; shift++; } - hv_clock->tsc_shift = shift; - hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); + *pshift = shift; + *pmultiplier = div_frac(scaled64, tps32); - pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", - __func__, tsc_khz, hv_clock->tsc_shift, - hv_clock->tsc_to_system_mul); + pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n", + __func__, base_khz, scaled_khz, shift, *pmultiplier); +} + +static inline u64 get_kernel_ns(void) +{ + struct timespec ts; + + WARN_ON(preemptible()); + ktime_get_ts(&ts); + monotonic_to_bootbased(&ts); + return timespec_to_ns(&ts); } static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); +unsigned long max_tsc_khz; -static void kvm_write_guest_time(struct kvm_vcpu *v) +static inline int kvm_tsc_changes_freq(void) +{ + int cpu = get_cpu(); + int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && + cpufreq_quick_get(cpu) != 0; + put_cpu(); + return ret; +} + +static inline u64 nsec_to_cycles(u64 nsec) +{ + u64 ret; + + WARN_ON(preemptible()); + if (kvm_tsc_changes_freq()) + printk_once(KERN_WARNING + "kvm: unreliable cycle conversion on adjustable rate TSC\n"); + ret = nsec * __get_cpu_var(cpu_tsc_khz); + do_div(ret, USEC_PER_SEC); + return ret; +} + +static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz) +{ + /* Compute a scale to convert nanoseconds in TSC cycles */ + kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, + &kvm->arch.virtual_tsc_shift, + &kvm->arch.virtual_tsc_mult); + kvm->arch.virtual_tsc_khz = this_tsc_khz; +} + +static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) +{ + u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, + vcpu->kvm->arch.virtual_tsc_mult, + vcpu->kvm->arch.virtual_tsc_shift); + tsc += vcpu->arch.last_tsc_write; + return tsc; +} + +void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + u64 offset, ns, elapsed; + unsigned long flags; + s64 sdiff; + + spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); + offset = data - native_read_tsc(); + ns = get_kernel_ns(); + elapsed = ns - kvm->arch.last_tsc_nsec; + sdiff = data - kvm->arch.last_tsc_write; + if (sdiff < 0) + sdiff = -sdiff; + + /* + * Special case: close write to TSC within 5 seconds of + * another CPU is interpreted as an attempt to synchronize + * The 5 seconds is to accomodate host load / swapping as + * well as any reset of TSC during the boot process. + * + * In that case, for a reliable TSC, we can match TSC offsets, + * or make a best guest using elapsed value. + */ + if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) && + elapsed < 5ULL * NSEC_PER_SEC) { + if (!check_tsc_unstable()) { + offset = kvm->arch.last_tsc_offset; + pr_debug("kvm: matched tsc offset for %llu\n", data); + } else { + u64 delta = nsec_to_cycles(elapsed); + offset += delta; + pr_debug("kvm: adjusted tsc offset by %llu\n", delta); + } + ns = kvm->arch.last_tsc_nsec; + } + kvm->arch.last_tsc_nsec = ns; + kvm->arch.last_tsc_write = data; + kvm->arch.last_tsc_offset = offset; + kvm_x86_ops->write_tsc_offset(vcpu, offset); + spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + + /* Reset of TSC must disable overshoot protection below */ + vcpu->arch.hv_clock.tsc_timestamp = 0; + vcpu->arch.last_tsc_write = data; + vcpu->arch.last_tsc_nsec = ns; +} +EXPORT_SYMBOL_GPL(kvm_write_tsc); + +static int kvm_guest_time_update(struct kvm_vcpu *v) { - struct timespec ts; unsigned long flags; struct kvm_vcpu_arch *vcpu = &v->arch; void *shared_kaddr; unsigned long this_tsc_khz; + s64 kernel_ns, max_kernel_ns; + u64 tsc_timestamp; - if ((!vcpu->time_page)) - return; + /* Keep irq disabled to prevent changes to the clock */ + local_irq_save(flags); + kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); + kernel_ns = get_kernel_ns(); + this_tsc_khz = __get_cpu_var(cpu_tsc_khz); - this_tsc_khz = get_cpu_var(cpu_tsc_khz); - if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { - kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); - vcpu->hv_clock_tsc_khz = this_tsc_khz; + if (unlikely(this_tsc_khz == 0)) { + local_irq_restore(flags); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); + return 1; + } + + /* + * We may have to catch up the TSC to match elapsed wall clock + * time for two reasons, even if kvmclock is used. + * 1) CPU could have been running below the maximum TSC rate + * 2) Broken TSC compensation resets the base at each VCPU + * entry to avoid unknown leaps of TSC even when running + * again on the same CPU. This may cause apparent elapsed + * time to disappear, and the guest to stand still or run + * very slowly. + */ + if (vcpu->tsc_catchup) { + u64 tsc = compute_guest_tsc(v, kernel_ns); + if (tsc > tsc_timestamp) { + kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp); + tsc_timestamp = tsc; + } } - put_cpu_var(cpu_tsc_khz); - /* Keep irq disabled to prevent changes to the clock */ - local_irq_save(flags); - kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); - ktime_get_ts(&ts); - monotonic_to_bootbased(&ts); local_irq_restore(flags); - /* With all the info we got, fill in the values */ + if (!vcpu->time_page) + return 0; - vcpu->hv_clock.system_time = ts.tv_nsec + - (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; + /* + * Time as measured by the TSC may go backwards when resetting the base + * tsc_timestamp. The reason for this is that the TSC resolution is + * higher than the resolution of the other clock scales. Thus, many + * possible measurments of the TSC correspond to one measurement of any + * other clock, and so a spread of values is possible. This is not a + * problem for the computation of the nanosecond clock; with TSC rates + * around 1GHZ, there can only be a few cycles which correspond to one + * nanosecond value, and any path through this code will inevitably + * take longer than that. However, with the kernel_ns value itself, + * the precision may be much lower, down to HZ granularity. If the + * first sampling of TSC against kernel_ns ends in the low part of the + * range, and the second in the high end of the range, we can get: + * + * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new + * + * As the sampling errors potentially range in the thousands of cycles, + * it is possible such a time value has already been observed by the + * guest. To protect against this, we must compute the system time as + * observed by the guest and ensure the new system time is greater. + */ + max_kernel_ns = 0; + if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { + max_kernel_ns = vcpu->last_guest_tsc - + vcpu->hv_clock.tsc_timestamp; + max_kernel_ns = pvclock_scale_delta(max_kernel_ns, + vcpu->hv_clock.tsc_to_system_mul, + vcpu->hv_clock.tsc_shift); + max_kernel_ns += vcpu->last_kernel_ns; + } + if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { + kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, + &vcpu->hv_clock.tsc_shift, + &vcpu->hv_clock.tsc_to_system_mul); + vcpu->hw_tsc_khz = this_tsc_khz; + } + + if (max_kernel_ns > kernel_ns) + kernel_ns = max_kernel_ns; + + /* With all the info we got, fill in the values */ + vcpu->hv_clock.tsc_timestamp = tsc_timestamp; + vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; + vcpu->last_kernel_ns = kernel_ns; + vcpu->last_guest_tsc = tsc_timestamp; vcpu->hv_clock.flags = 0; /* @@ -942,16 +1160,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) kunmap_atomic(shared_kaddr, KM_USER0); mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); -} - -static int kvm_request_guest_time_update(struct kvm_vcpu *v) -{ - struct kvm_vcpu_arch *vcpu = &v->arch; - - if (!vcpu->time_page) - return 0; - kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v); - return 1; + return 0; } static bool msr_mtrr_valid(unsigned msr) @@ -1277,6 +1486,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) } vcpu->arch.time = data; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... */ if (!(data & 1)) @@ -1292,8 +1502,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) kvm_release_page_clean(vcpu->arch.time_page); vcpu->arch.time_page = NULL; } - - kvm_request_guest_time_update(vcpu); break; } case MSR_IA32_MCG_CTL: @@ -1330,6 +1538,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " "0x%x data 0x%llx\n", msr, data); break; + case MSR_K7_CLK_CTL: + /* + * Ignore all writes to this no longer documented MSR. + * Writes are only relevant for old K7 processors, + * all pre-dating SVM, but a recommended workaround from + * AMD for these chips. It is possible to speicify the + * affected processor models on the command line, hence + * the need to ignore the workaround. + */ + break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: if (kvm_hv_msr_partition_wide(msr)) { int r; @@ -1522,6 +1740,20 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case 0xcd: /* fsb frequency */ data = 3; break; + /* + * MSR_EBC_FREQUENCY_ID + * Conservative value valid for even the basic CPU models. + * Models 0,1: 000 in bits 23:21 indicating a bus speed of + * 100MHz, model 2 000 in bits 18:16 indicating 100MHz, + * and 266MHz for model 3, or 4. Set Core Clock + * Frequency to System Bus Frequency Ratio to 1 (bits + * 31:24) even though these are only valid for CPU + * models > 2, however guests may end up dividing or + * multiplying by zero otherwise. + */ + case MSR_EBC_FREQUENCY_ID: + data = 1 << 24; + break; case MSR_IA32_APICBASE: data = kvm_get_apic_base(vcpu); break; @@ -1555,6 +1787,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: return get_msr_mce(vcpu, msr, pdata); + case MSR_K7_CLK_CTL: + /* + * Provide expected ramp-up count for K7. All other + * are set to zero, indicating minimum divisors for + * every field. + * + * This prevents guest kernels on AMD host with CPU + * type 6, model 8 and higher from exploding due to + * the rdmsr failing. + */ + data = 0x20000000; + break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: if (kvm_hv_msr_partition_wide(msr)) { int r; @@ -1808,19 +2052,28 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } kvm_x86_ops->vcpu_load(vcpu, cpu); - if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { - unsigned long khz = cpufreq_quick_get(cpu); - if (!khz) - khz = tsc_khz; - per_cpu(cpu_tsc_khz, cpu) = khz; + if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { + /* Make sure TSC doesn't go backwards */ + s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : + native_read_tsc() - vcpu->arch.last_host_tsc; + if (tsc_delta < 0) + mark_tsc_unstable("KVM discovered backwards TSC"); + if (check_tsc_unstable()) { + kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); + vcpu->arch.tsc_catchup = 1; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + } + if (vcpu->cpu != cpu) + kvm_migrate_timers(vcpu); + vcpu->cpu = cpu; } - kvm_request_guest_time_update(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); + vcpu->arch.last_host_tsc = native_read_tsc(); } static int is_efer_nx(void) @@ -1995,7 +2248,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(F16C); /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = - F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | + F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); @@ -2204,6 +2457,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return -ENXIO; kvm_queue_interrupt(vcpu, irq->irq, false); + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } @@ -2357,6 +2611,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) vcpu->arch.sipi_vector = events->sipi_vector; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; } @@ -2760,7 +3016,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) { - return kvm->arch.n_alloc_mmu_pages; + return kvm->arch.n_max_mmu_pages; } static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) @@ -2796,18 +3052,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - raw_spin_lock(&pic_irqchip(kvm)->lock); + spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); - raw_spin_unlock(&pic_irqchip(kvm)->lock); + spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_PIC_SLAVE: - raw_spin_lock(&pic_irqchip(kvm)->lock); + spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); - raw_spin_unlock(&pic_irqchip(kvm)->lock); + spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_IOAPIC: r = kvm_set_ioapic(kvm, &chip->chip.ioapic); @@ -3201,7 +3457,6 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } case KVM_SET_CLOCK: { - struct timespec now; struct kvm_clock_data user_ns; u64 now_ns; s64 delta; @@ -3215,20 +3470,21 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = 0; - ktime_get_ts(&now); - now_ns = timespec_to_ns(&now); + local_irq_disable(); + now_ns = get_kernel_ns(); delta = user_ns.clock - now_ns; + local_irq_enable(); kvm->arch.kvmclock_offset = delta; break; } case KVM_GET_CLOCK: { - struct timespec now; struct kvm_clock_data user_ns; u64 now_ns; - ktime_get_ts(&now); - now_ns = timespec_to_ns(&now); + local_irq_disable(); + now_ns = get_kernel_ns(); user_ns.clock = kvm->arch.kvmclock_offset + now_ns; + local_irq_enable(); user_ns.flags = 0; r = -EFAULT; @@ -3292,30 +3548,51 @@ void kvm_get_segment(struct kvm_vcpu *vcpu, kvm_x86_ops->get_segment(vcpu, var, seg); } +static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) +{ + return gpa; +} + +static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) +{ + gpa_t t_gpa; + u32 error; + + BUG_ON(!mmu_is_nested(vcpu)); + + /* NPT walks are always user-walks */ + access |= PFERR_USER_MASK; + t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error); + if (t_gpa == UNMAPPED_GVA) + vcpu->arch.fault.nested = true; + + return t_gpa; +} + gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; - return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); } gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_FETCH_MASK; - return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); } gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; - return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); } /* uses this to access any guest's mapped memory without checking CPL */ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { - return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error); } static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, @@ -3326,7 +3603,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); + gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, + error); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -3381,8 +3659,9 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val, int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, - PFERR_WRITE_MASK, error); + gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, + PFERR_WRITE_MASK, + error); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -3624,7 +3903,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val, if (vcpu->arch.pio.count) goto data_avail; - trace_kvm_pio(1, port, size, 1); + trace_kvm_pio(0, port, size, 1); vcpu->arch.pio.port = port; vcpu->arch.pio.in = 1; @@ -3652,7 +3931,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port, const void *val, unsigned int count, struct kvm_vcpu *vcpu) { - trace_kvm_pio(0, port, size, 1); + trace_kvm_pio(1, port, size, 1); vcpu->arch.pio.port = port; vcpu->arch.pio.in = 0; @@ -3791,6 +4070,11 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) kvm_x86_ops->get_gdt(vcpu, dt); } +static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->get_idt(vcpu, dt); +} + static unsigned long emulator_get_cached_segment_base(int seg, struct kvm_vcpu *vcpu) { @@ -3884,6 +4168,7 @@ static struct x86_emulate_ops emulate_ops = { .set_segment_selector = emulator_set_segment_selector, .get_cached_segment_base = emulator_get_cached_segment_base, .get_gdt = emulator_get_gdt, + .get_idt = emulator_get_idt, .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, .cpl = emulator_get_cpl, @@ -3919,13 +4204,64 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; if (ctxt->exception == PF_VECTOR) - kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code); + kvm_propagate_fault(vcpu); else if (ctxt->error_code_valid) kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); else kvm_queue_exception(vcpu, ctxt->exception); } +static void init_emulate_ctxt(struct kvm_vcpu *vcpu) +{ + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + int cs_db, cs_l; + + cache_all_regs(vcpu); + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + + vcpu->arch.emulate_ctxt.vcpu = vcpu; + vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); + vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); + vcpu->arch.emulate_ctxt.mode = + (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : + (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) + ? X86EMUL_MODE_VM86 : cs_l + ? X86EMUL_MODE_PROT64 : cs_db + ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + memset(c, 0, sizeof(struct decode_cache)); + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); +} + +int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) +{ + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + int ret; + + init_emulate_ctxt(vcpu); + + vcpu->arch.emulate_ctxt.decode.op_bytes = 2; + vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; + vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip; + ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq); + + if (ret != X86EMUL_CONTINUE) + return EMULATE_FAIL; + + vcpu->arch.emulate_ctxt.eip = c->eip; + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + + if (irq == NMI_VECTOR) + vcpu->arch.nmi_pending = false; + else + vcpu->arch.interrupt.pending = false; + + return EMULATE_DONE; +} +EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); + static int handle_emulation_failure(struct kvm_vcpu *vcpu) { ++vcpu->stat.insn_emulation_fail; @@ -3982,24 +4318,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu, cache_all_regs(vcpu); if (!(emulation_type & EMULTYPE_NO_DECODE)) { - int cs_db, cs_l; - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - - vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); - vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); - vcpu->arch.emulate_ctxt.mode = - (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : - (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_VM86 : cs_l - ? X86EMUL_MODE_PROT64 : cs_db - ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - memset(c, 0, sizeof(struct decode_cache)); - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + init_emulate_ctxt(vcpu); vcpu->arch.emulate_ctxt.interruptibility = 0; vcpu->arch.emulate_ctxt.exception = -1; + vcpu->arch.emulate_ctxt.perm_ok = false; + + r = x86_decode_insn(&vcpu->arch.emulate_ctxt); + if (r == X86EMUL_PROPAGATE_FAULT) + goto done; - r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); trace_kvm_emulate_insn_start(vcpu); /* Only allow emulation of specific instructions on #UD @@ -4049,41 +4376,39 @@ int emulate_instruction(struct kvm_vcpu *vcpu, memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); restart: - r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); - if (r) { /* emulation failed */ + if (r == EMULATION_FAILED) { if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; return handle_emulation_failure(vcpu); } - toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); - +done: if (vcpu->arch.emulate_ctxt.exception >= 0) { inject_emulated_exception(vcpu); - return EMULATE_DONE; - } - - if (vcpu->arch.pio.count) { + r = EMULATE_DONE; + } else if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) vcpu->arch.pio.count = 0; - return EMULATE_DO_MMIO; - } - - if (vcpu->mmio_needed) { + r = EMULATE_DO_MMIO; + } else if (vcpu->mmio_needed) { if (vcpu->mmio_is_write) vcpu->mmio_needed = 0; - return EMULATE_DO_MMIO; - } - - if (vcpu->arch.emulate_ctxt.restart) + r = EMULATE_DO_MMIO; + } else if (r == EMULATION_RESTART) goto restart; + else + r = EMULATE_DONE; - return EMULATE_DONE; + toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + + return r; } EXPORT_SYMBOL_GPL(emulate_instruction); @@ -4097,9 +4422,23 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) } EXPORT_SYMBOL_GPL(kvm_fast_pio_out); -static void bounce_off(void *info) +static void tsc_bad(void *info) +{ + __get_cpu_var(cpu_tsc_khz) = 0; +} + +static void tsc_khz_changed(void *data) { - /* nothing */ + struct cpufreq_freqs *freq = data; + unsigned long khz = 0; + + if (data) + khz = freq->new; + else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + khz = cpufreq_quick_get(raw_smp_processor_id()); + if (!khz) + khz = tsc_khz; + __get_cpu_var(cpu_tsc_khz) = khz; } static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, @@ -4110,21 +4449,60 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va struct kvm_vcpu *vcpu; int i, send_ipi = 0; + /* + * We allow guests to temporarily run on slowing clocks, + * provided we notify them after, or to run on accelerating + * clocks, provided we notify them before. Thus time never + * goes backwards. + * + * However, we have a problem. We can't atomically update + * the frequency of a given CPU from this function; it is + * merely a notifier, which can be called from any CPU. + * Changing the TSC frequency at arbitrary points in time + * requires a recomputation of local variables related to + * the TSC for each VCPU. We must flag these local variables + * to be updated and be sure the update takes place with the + * new frequency before any guests proceed. + * + * Unfortunately, the combination of hotplug CPU and frequency + * change creates an intractable locking scenario; the order + * of when these callouts happen is undefined with respect to + * CPU hotplug, and they can race with each other. As such, + * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is + * undefined; you can actually have a CPU frequency change take + * place in between the computation of X and the setting of the + * variable. To protect against this problem, all updates of + * the per_cpu tsc_khz variable are done in an interrupt + * protected IPI, and all callers wishing to update the value + * must wait for a synchronous IPI to complete (which is trivial + * if the caller is on the CPU already). This establishes the + * necessary total order on variable updates. + * + * Note that because a guest time update may take place + * anytime after the setting of the VCPU's request bit, the + * correct TSC value must be set before the request. However, + * to ensure the update actually makes it to any guest which + * starts running in hardware virtualization between the set + * and the acquisition of the spinlock, we must also ping the + * CPU after setting the request bit. + * + */ + if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) return 0; if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) return 0; - per_cpu(cpu_tsc_khz, freq->cpu) = freq->new; + + smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) continue; - if (!kvm_request_guest_time_update(vcpu)) - continue; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); if (vcpu->cpu != smp_processor_id()) - send_ipi++; + send_ipi = 1; } } spin_unlock(&kvm_lock); @@ -4142,32 +4520,57 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va * guest context is entered kvmclock will be updated, * so the guest will not see stale values. */ - smp_call_function_single(freq->cpu, bounce_off, NULL, 1); + smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); } return 0; } static struct notifier_block kvmclock_cpufreq_notifier_block = { - .notifier_call = kvmclock_cpufreq_notifier + .notifier_call = kvmclock_cpufreq_notifier +}; + +static int kvmclock_cpu_notifier(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); + break; + case CPU_DOWN_PREPARE: + smp_call_function_single(cpu, tsc_bad, NULL, 1); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block kvmclock_cpu_notifier_block = { + .notifier_call = kvmclock_cpu_notifier, + .priority = -INT_MAX }; static void kvm_timer_init(void) { int cpu; + max_tsc_khz = tsc_khz; + register_hotcpu_notifier(&kvmclock_cpu_notifier_block); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { +#ifdef CONFIG_CPU_FREQ + struct cpufreq_policy policy; + memset(&policy, 0, sizeof(policy)); + cpufreq_get_policy(&policy, get_cpu()); + if (policy.cpuinfo.max_freq) + max_tsc_khz = policy.cpuinfo.max_freq; +#endif cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); - for_each_online_cpu(cpu) { - unsigned long khz = cpufreq_get(cpu); - if (!khz) - khz = tsc_khz; - per_cpu(cpu_tsc_khz, cpu) = khz; - } - } else { - for_each_possible_cpu(cpu) - per_cpu(cpu_tsc_khz, cpu) = tsc_khz; } + pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); + for_each_online_cpu(cpu) + smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); } static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); @@ -4269,6 +4672,7 @@ void kvm_arch_exit(void) if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); + unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); kvm_x86_ops = NULL; kvm_mmu_module_exit(); } @@ -4684,8 +5088,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); - if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) - kvm_write_guest_time(vcpu); + if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { + r = kvm_guest_time_update(vcpu); + if (unlikely(r)) + goto out; + } if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) @@ -4710,6 +5117,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (unlikely(r)) goto out; + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { + inject_pending_event(vcpu); + + /* enable NMI/IRQ window open exits if needed */ + if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + kvm_x86_ops->enable_irq_window(vcpu); + + if (kvm_lapic_enabled(vcpu)) { + update_cr8_intercept(vcpu); + kvm_lapic_sync_to_vapic(vcpu); + } + } + preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); @@ -4728,23 +5150,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) smp_wmb(); local_irq_enable(); preempt_enable(); + kvm_x86_ops->cancel_injection(vcpu); r = 1; goto out; } - inject_pending_event(vcpu); - - /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) - kvm_x86_ops->enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) - kvm_x86_ops->enable_irq_window(vcpu); - - if (kvm_lapic_enabled(vcpu)) { - update_cr8_intercept(vcpu); - kvm_lapic_sync_to_vapic(vcpu); - } - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); kvm_guest_enter(); @@ -4770,6 +5180,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); + kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); + atomic_set(&vcpu->guest_mode, 0); smp_wmb(); local_irq_enable(); @@ -4899,8 +5311,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (!irqchip_in_kernel(vcpu->kvm)) kvm_set_cr8(vcpu, kvm_run->cr8); - if (vcpu->arch.pio.count || vcpu->mmio_needed || - vcpu->arch.emulate_ctxt.restart) { + if (vcpu->arch.pio.count || vcpu->mmio_needed) { if (vcpu->mmio_needed) { memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); vcpu->mmio_read_completed = 1; @@ -4981,6 +5392,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.exception.pending = false; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; } @@ -5044,6 +5457,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { vcpu->arch.mp_state = mp_state->mp_state; + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } @@ -5051,24 +5465,11 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; - int cs_db, cs_l, ret; - cache_all_regs(vcpu); - - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + int ret; - vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); - vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); - vcpu->arch.emulate_ctxt.mode = - (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : - (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_VM86 : cs_l - ? X86EMUL_MODE_PROT64 : cs_db - ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - memset(c, 0, sizeof(struct decode_cache)); - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + init_emulate_ctxt(vcpu); - ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, + ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, tss_selector, reason, has_error_code, error_code); @@ -5078,6 +5479,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); return EMULATE_DONE; } EXPORT_SYMBOL_GPL(kvm_task_switch); @@ -5113,7 +5515,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (!is_long_mode(vcpu) && is_pae(vcpu)) { - load_pdptrs(vcpu, vcpu->arch.cr3); + load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); mmu_reset_needed = 1; } @@ -5148,6 +5550,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; } @@ -5334,6 +5738,10 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { + if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) + printk_once(KERN_WARNING + "kvm: SMP vm created on host with unstable TSC; " + "guest TSC will not be reliable\n"); return kvm_x86_ops->vcpu_create(kvm, id); } @@ -5376,22 +5784,22 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) vcpu->arch.dr6 = DR6_FIXED_1; vcpu->arch.dr7 = DR7_FIXED_1; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return kvm_x86_ops->vcpu_reset(vcpu); } int kvm_arch_hardware_enable(void *garbage) { - /* - * Since this may be called from a hotplug notifcation, - * we can't get the CPU frequency directly. - */ - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { - int cpu = raw_smp_processor_id(); - per_cpu(cpu_tsc_khz, cpu) = 0; - } + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i; kvm_shared_msr_cpu_online(); - + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_for_each_vcpu(i, vcpu, kvm) + if (vcpu->cpu == smp_processor_id()) + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); return kvm_x86_ops->hardware_enable(garbage); } @@ -5425,7 +5833,11 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) BUG_ON(vcpu->kvm == NULL); kvm = vcpu->kvm; + vcpu->arch.emulate_ctxt.ops = &emulate_ops; + vcpu->arch.walk_mmu = &vcpu->arch.mmu; vcpu->arch.mmu.root_hpa = INVALID_PAGE; + vcpu->arch.mmu.translate_gpa = translate_gpa; + vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else @@ -5438,6 +5850,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.pio_data = page_address(page); + if (!kvm->arch.virtual_tsc_khz) + kvm_arch_set_tsc_khz(kvm, max_tsc_khz); + r = kvm_mmu_create(vcpu); if (r < 0) goto fail_free_pio_data; @@ -5497,7 +5912,7 @@ struct kvm *kvm_arch_create_vm(void) /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); - rdtscll(kvm->arch.vm_init_tsc); + spin_lock_init(&kvm->arch.tsc_write_lock); return kvm; } @@ -5684,6 +6099,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) rflags |= X86_EFLAGS_TF; kvm_x86_ops->set_rflags(vcpu, rflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); } EXPORT_SYMBOL_GPL(kvm_set_rflags); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index b7a4047..2cea414 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -50,6 +50,11 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu) #endif } +static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; +} + static inline int is_pae(struct kvm_vcpu *vcpu) { return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); @@ -67,5 +72,8 @@ static inline int is_paging(struct kvm_vcpu *vcpu) void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); +int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq); + +void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); #endif |