diff options
63 files changed, 3432 insertions, 1232 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 57975bd..52702b0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2448,7 +2448,14 @@ S: Supported KERNEL VIRTUAL MACHINE (KVM) P: Avi Kivity -M: avi@qumranet.com +M: avi@redhat.com +L: kvm@vger.kernel.org +W: http://kvm.qumranet.com +S: Supported + +KERNEL VIRTUAL MACHINE (KVM) FOR AMD-V +P: Joerg Roedel +M: joerg.roedel@amd.com L: kvm@vger.kernel.org W: http://kvm.qumranet.com S: Supported diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 1efe513..85db124 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -132,7 +132,7 @@ #define GPFN_IOSAPIC (4UL << 60) /* IOSAPIC base */ #define GPFN_LEGACY_IO (5UL << 60) /* Legacy I/O base */ #define GPFN_GFW (6UL << 60) /* Guest Firmware */ -#define GPFN_HIGH_MMIO (7UL << 60) /* High MMIO range */ +#define GPFN_PHYS_MMIO (7UL << 60) /* Directed MMIO Range */ #define GPFN_IO_MASK (7UL << 60) /* Guest pfn is I/O type */ #define GPFN_INV_MASK (1UL << 63) /* Guest pfn is invalid */ @@ -413,6 +413,10 @@ struct kvm_arch { struct kvm_ioapic *vioapic; struct kvm_vm_stat stat; struct kvm_sal_data rdv_sal_data; + + struct list_head assigned_dev_head; + struct dmar_domain *intel_iommu_domain; + struct hlist_head irq_ack_notifier_list; }; union cpuid3_t { diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig index 7914e48..8e99fed 100644 --- a/arch/ia64/kvm/Kconfig +++ b/arch/ia64/kvm/Kconfig @@ -46,4 +46,6 @@ config KVM_INTEL config KVM_TRACE bool +source drivers/virtio/Kconfig + endif # VIRTUALIZATION diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile index bf22fb9..cf37f8f 100644 --- a/arch/ia64/kvm/Makefile +++ b/arch/ia64/kvm/Makefile @@ -44,7 +44,11 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/ EXTRA_AFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ - coalesced_mmio.o) + coalesced_mmio.o irq_comm.o) + +ifeq ($(CONFIG_DMAR),y) +common-objs += $(addprefix ../../../virt/kvm/, vtd.o) +endif kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/ia64/kvm/irq.h b/arch/ia64/kvm/irq.h new file mode 100644 index 0000000..c6786e8 --- /dev/null +++ b/arch/ia64/kvm/irq.h @@ -0,0 +1,31 @@ +/* + * irq.h: In-kernel interrupt controller related definitions + * Copyright (c) 2008, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Authors: + * Xiantao Zhang <xiantao.zhang@intel.com> + * + */ + +#ifndef __IRQ_H +#define __IRQ_H + +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + return 1; +} + +#endif diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index cd0d1a7..c0699f0 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -31,6 +31,7 @@ #include <linux/bitops.h> #include <linux/hrtimer.h> #include <linux/uaccess.h> +#include <linux/intel-iommu.h> #include <asm/pgtable.h> #include <asm/gcc_intrin.h> @@ -45,6 +46,7 @@ #include "iodev.h" #include "ioapic.h" #include "lapic.h" +#include "irq.h" static unsigned long kvm_vmm_base; static unsigned long kvm_vsa_base; @@ -179,12 +181,16 @@ int kvm_dev_ioctl_check_extension(long ext) switch (ext) { case KVM_CAP_IRQCHIP: case KVM_CAP_USER_MEMORY: + case KVM_CAP_MP_STATE: r = 1; break; case KVM_CAP_COALESCED_MMIO: r = KVM_COALESCED_MMIO_PAGE_OFFSET; break; + case KVM_CAP_IOMMU: + r = intel_iommu_found(); + break; default: r = 0; } @@ -771,6 +777,7 @@ static void kvm_init_vm(struct kvm *kvm) */ kvm_build_io_pmt(kvm); + INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); } struct kvm *kvm_arch_create_vm(void) @@ -1334,6 +1341,10 @@ static void kvm_release_vm_pages(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { + kvm_iommu_unmap_guest(kvm); +#ifdef KVM_CAP_DEVICE_ASSIGNMENT + kvm_free_all_assigned_devices(kvm); +#endif kfree(kvm->arch.vioapic); kvm_release_vm_pages(kvm); kvm_free_physmem(kvm); @@ -1435,17 +1446,24 @@ int kvm_arch_set_memory_region(struct kvm *kvm, int user_alloc) { unsigned long i; - struct page *page; + unsigned long pfn; int npages = mem->memory_size >> PAGE_SHIFT; struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; unsigned long base_gfn = memslot->base_gfn; for (i = 0; i < npages; i++) { - page = gfn_to_page(kvm, base_gfn + i); - kvm_set_pmt_entry(kvm, base_gfn + i, - page_to_pfn(page) << PAGE_SHIFT, - _PAGE_AR_RWX|_PAGE_MA_WB); - memslot->rmap[i] = (unsigned long)page; + pfn = gfn_to_pfn(kvm, base_gfn + i); + if (!kvm_is_mmio_pfn(pfn)) { + kvm_set_pmt_entry(kvm, base_gfn + i, + pfn << PAGE_SHIFT, + _PAGE_AR_RWX | _PAGE_MA_WB); + memslot->rmap[i] = (unsigned long)pfn_to_page(pfn); + } else { + kvm_set_pmt_entry(kvm, base_gfn + i, + GPFN_PHYS_MMIO | (pfn << PAGE_SHIFT), + _PAGE_MA_UC); + memslot->rmap[i] = 0; + } } return 0; @@ -1789,11 +1807,43 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - return -EINVAL; + vcpu_load(vcpu); + mp_state->mp_state = vcpu->arch.mp_state; + vcpu_put(vcpu); + return 0; +} + +static int vcpu_reset(struct kvm_vcpu *vcpu) +{ + int r; + long psr; + local_irq_save(psr); + r = kvm_insert_vmm_mapping(vcpu); + if (r) + goto fail; + + vcpu->arch.launched = 0; + kvm_arch_vcpu_uninit(vcpu); + r = kvm_arch_vcpu_init(vcpu); + if (r) + goto fail; + + kvm_purge_vmm_mapping(vcpu); + r = 0; +fail: + local_irq_restore(psr); + return r; } int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - return -EINVAL; + int r = 0; + + vcpu_load(vcpu); + vcpu->arch.mp_state = mp_state->mp_state; + if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED) + r = vcpu_reset(vcpu); + vcpu_put(vcpu); + return r; } diff --git a/arch/ia64/kvm/kvm_minstate.h b/arch/ia64/kvm/kvm_minstate.h index 13980d9..2cc41d1 100644 --- a/arch/ia64/kvm/kvm_minstate.h +++ b/arch/ia64/kvm/kvm_minstate.h @@ -50,27 +50,18 @@ #define PAL_VSA_SYNC_READ \ /* begin to call pal vps sync_read */ \ +{.mii; \ add r25 = VMM_VPD_BASE_OFFSET, r21; \ - adds r20 = VMM_VCPU_VSA_BASE_OFFSET, r21; /* entry point */ \ + nop 0x0; \ + mov r24=ip; \ ;; \ +} \ +{.mmb \ + add r24=0x20, r24; \ ld8 r25 = [r25]; /* read vpd base */ \ - ld8 r20 = [r20]; \ - ;; \ - add r20 = PAL_VPS_SYNC_READ,r20; \ - ;; \ -{ .mii; \ - nop 0x0; \ - mov r24 = ip; \ - mov b0 = r20; \ + br.cond.sptk kvm_vps_sync_read; /*call the service*/ \ ;; \ }; \ -{ .mmb; \ - add r24 = 0x20, r24; \ - nop 0x0; \ - br.cond.sptk b0; /* call the service */ \ - ;; \ -}; - #define KVM_MINSTATE_GET_CURRENT(reg) mov reg=r21 diff --git a/arch/ia64/kvm/optvfault.S b/arch/ia64/kvm/optvfault.S index e4f15d6..634abad 100644 --- a/arch/ia64/kvm/optvfault.S +++ b/arch/ia64/kvm/optvfault.S @@ -1,9 +1,12 @@ /* - * arch/ia64/vmx/optvfault.S + * arch/ia64/kvm/optvfault.S * optimize virtualization fault handler * * Copyright (C) 2006 Intel Co * Xuefei Xu (Anthony Xu) <anthony.xu@intel.com> + * Copyright (C) 2008 Intel Co + * Add the support for Tukwila processors. + * Xiantao Zhang <xiantao.zhang@intel.com> */ #include <asm/asmmacro.h> @@ -20,6 +23,98 @@ #define ACCE_MOV_TO_PSR #define ACCE_THASH +#define VMX_VPS_SYNC_READ \ + add r16=VMM_VPD_BASE_OFFSET,r21; \ + mov r17 = b0; \ + mov r18 = r24; \ + mov r19 = r25; \ + mov r20 = r31; \ + ;; \ +{.mii; \ + ld8 r16 = [r16]; \ + nop 0x0; \ + mov r24 = ip; \ + ;; \ +}; \ +{.mmb; \ + add r24=0x20, r24; \ + mov r25 =r16; \ + br.sptk.many kvm_vps_sync_read; \ +}; \ + mov b0 = r17; \ + mov r24 = r18; \ + mov r25 = r19; \ + mov r31 = r20 + +ENTRY(kvm_vps_entry) + adds r29 = VMM_VCPU_VSA_BASE_OFFSET,r21 + ;; + ld8 r29 = [r29] + ;; + add r29 = r29, r30 + ;; + mov b0 = r29 + br.sptk.many b0 +END(kvm_vps_entry) + +/* + * Inputs: + * r24 : return address + * r25 : vpd + * r29 : scratch + * + */ +GLOBAL_ENTRY(kvm_vps_sync_read) + movl r30 = PAL_VPS_SYNC_READ + ;; + br.sptk.many kvm_vps_entry +END(kvm_vps_sync_read) + +/* + * Inputs: + * r24 : return address + * r25 : vpd + * r29 : scratch + * + */ +GLOBAL_ENTRY(kvm_vps_sync_write) + movl r30 = PAL_VPS_SYNC_WRITE + ;; + br.sptk.many kvm_vps_entry +END(kvm_vps_sync_write) + +/* + * Inputs: + * r23 : pr + * r24 : guest b0 + * r25 : vpd + * + */ +GLOBAL_ENTRY(kvm_vps_resume_normal) + movl r30 = PAL_VPS_RESUME_NORMAL + ;; + mov pr=r23,-2 + br.sptk.many kvm_vps_entry +END(kvm_vps_resume_normal) + +/* + * Inputs: + * r23 : pr + * r24 : guest b0 + * r25 : vpd + * r17 : isr + */ +GLOBAL_ENTRY(kvm_vps_resume_handler) + movl r30 = PAL_VPS_RESUME_HANDLER + ;; + ld8 r27=[r25] + shr r17=r17,IA64_ISR_IR_BIT + ;; + dep r27=r17,r27,63,1 // bit 63 of r27 indicate whether enable CFLE + mov pr=r23,-2 + br.sptk.many kvm_vps_entry +END(kvm_vps_resume_handler) + //mov r1=ar3 GLOBAL_ENTRY(kvm_asm_mov_from_ar) #ifndef ACCE_MOV_FROM_AR @@ -157,11 +252,11 @@ GLOBAL_ENTRY(kvm_asm_rsm) #ifndef ACCE_RSM br.many kvm_virtualization_fault_back #endif - add r16=VMM_VPD_BASE_OFFSET,r21 + VMX_VPS_SYNC_READ + ;; extr.u r26=r25,6,21 extr.u r27=r25,31,2 ;; - ld8 r16=[r16] extr.u r28=r25,36,1 dep r26=r27,r26,21,2 ;; @@ -196,7 +291,7 @@ GLOBAL_ENTRY(kvm_asm_rsm) tbit.nz p6,p0=r23,0 ;; tbit.z.or p6,p0=r26,IA64_PSR_DT_BIT - (p6) br.dptk kvm_resume_to_guest + (p6) br.dptk kvm_resume_to_guest_with_sync ;; add r26=VMM_VCPU_META_RR0_OFFSET,r21 add r27=VMM_VCPU_META_RR0_OFFSET+8,r21 @@ -212,7 +307,7 @@ GLOBAL_ENTRY(kvm_asm_rsm) mov rr[r28]=r27 ;; srlz.d - br.many kvm_resume_to_guest + br.many kvm_resume_to_guest_with_sync END(kvm_asm_rsm) @@ -221,11 +316,11 @@ GLOBAL_ENTRY(kvm_asm_ssm) #ifndef ACCE_SSM br.many kvm_virtualization_fault_back #endif - add r16=VMM_VPD_BASE_OFFSET,r21 + VMX_VPS_SYNC_READ + ;; extr.u r26=r25,6,21 extr.u r27=r25,31,2 ;; - ld8 r16=[r16] extr.u r28=r25,36,1 dep r26=r27,r26,21,2 ;; //r26 is imm24 @@ -271,7 +366,7 @@ kvm_asm_ssm_1: tbit.nz p6,p0=r29,IA64_PSR_I_BIT ;; tbit.z.or p6,p0=r19,IA64_PSR_I_BIT - (p6) br.dptk kvm_resume_to_guest + (p6) br.dptk kvm_resume_to_guest_with_sync ;; add r29=VPD_VTPR_START_OFFSET,r16 add r30=VPD_VHPI_START_OFFSET,r16 @@ -286,7 +381,7 @@ kvm_asm_ssm_1: ;; cmp.gt p6,p0=r30,r17 (p6) br.dpnt.few kvm_asm_dispatch_vexirq - br.many kvm_resume_to_guest + br.many kvm_resume_to_guest_with_sync END(kvm_asm_ssm) @@ -295,10 +390,9 @@ GLOBAL_ENTRY(kvm_asm_mov_to_psr) #ifndef ACCE_MOV_TO_PSR br.many kvm_virtualization_fault_back #endif - add r16=VMM_VPD_BASE_OFFSET,r21 - extr.u r26=r25,13,7 //r2 + VMX_VPS_SYNC_READ ;; - ld8 r16=[r16] + extr.u r26=r25,13,7 //r2 addl r20=@gprel(asm_mov_from_reg),gp ;; adds r30=kvm_asm_mov_to_psr_back-asm_mov_from_reg,r20 @@ -374,7 +468,7 @@ kvm_asm_mov_to_psr_1: ;; tbit.nz.or p6,p0=r17,IA64_PSR_I_BIT tbit.z.or p6,p0=r30,IA64_PSR_I_BIT - (p6) br.dpnt.few kvm_resume_to_guest + (p6) br.dpnt.few kvm_resume_to_guest_with_sync ;; add r29=VPD_VTPR_START_OFFSET,r16 add r30=VPD_VHPI_START_OFFSET,r16 @@ -389,13 +483,29 @@ kvm_asm_mov_to_psr_1: ;; cmp.gt p6,p0=r30,r17 (p6) br.dpnt.few kvm_asm_dispatch_vexirq - br.many kvm_resume_to_guest + br.many kvm_resume_to_guest_with_sync END(kvm_asm_mov_to_psr) ENTRY(kvm_asm_dispatch_vexirq) //increment iip + mov r17 = b0 + mov r18 = r31 +{.mii + add r25=VMM_VPD_BASE_OFFSET,r21 + nop 0x0 + mov r24 = ip + ;; +} +{.mmb + add r24 = 0x20, r24 + ld8 r25 = [r25] + br.sptk.many kvm_vps_sync_write +} + mov b0 =r17 mov r16=cr.ipsr + mov r31 = r18 + mov r19 = 37 ;; extr.u r17=r16,IA64_PSR_RI_BIT,2 tbit.nz p6,p7=r16,IA64_PSR_RI_BIT+1 @@ -435,25 +545,31 @@ GLOBAL_ENTRY(kvm_asm_thash) ;; kvm_asm_thash_back1: shr.u r23=r19,61 // get RR number - adds r25=VMM_VCPU_VRR0_OFFSET,r21 // get vcpu->arch.vrr[0]'s addr + adds r28=VMM_VCPU_VRR0_OFFSET,r21 // get vcpu->arch.vrr[0]'s addr adds r16=VMM_VPD_VPTA_OFFSET,r16 // get vpta ;; - shladd r27=r23,3,r25 // get vcpu->arch.vrr[r23]'s addr + shladd r27=r23,3,r28 // get vcpu->arch.vrr[r23]'s addr ld8 r17=[r16] // get PTA mov r26=1 ;; - extr.u r29=r17,2,6 // get pta.size - ld8 r25=[r27] // get vcpu->arch.vrr[r23]'s value + extr.u r29=r17,2,6 // get pta.size + ld8 r28=[r27] // get vcpu->arch.vrr[r23]'s value ;; - extr.u r25=r25,2,6 // get rr.ps + mov b0=r24 + //Fallback to C if pta.vf is set + tbit.nz p6,p0=r17, 8 + ;; + (p6) mov r24=EVENT_THASH + (p6) br.cond.dpnt.many kvm_virtualization_fault_back + extr.u r28=r28,2,6 // get rr.ps shl r22=r26,r29 // 1UL << pta.size ;; - shr.u r23=r19,r25 // vaddr >> rr.ps + shr.u r23=r19,r28 // vaddr >> rr.ps adds r26=3,r29 // pta.size + 3 shl r27=r17,3 // pta << 3 ;; shl r23=r23,3 // (vaddr >> rr.ps) << 3 - shr.u r27=r27,r26 // (pta << 3) >> (pta.size+3) + shr.u r27=r27,r26 // (pta << 3) >> (pta.size+3) movl r16=7<<61 ;; adds r22=-1,r22 // (1UL << pta.size) - 1 @@ -724,6 +840,29 @@ END(asm_mov_from_reg) * r31: pr * r24: b0 */ +ENTRY(kvm_resume_to_guest_with_sync) + adds r19=VMM_VPD_BASE_OFFSET,r21 + mov r16 = r31 + mov r17 = r24 + ;; +{.mii + ld8 r25 =[r19] + nop 0x0 + mov r24 = ip + ;; +} +{.mmb + add r24 =0x20, r24 + nop 0x0 + br.sptk.many kvm_vps_sync_write +} + + mov r31 = r16 + mov r24 =r17 + ;; + br.sptk.many kvm_resume_to_guest +END(kvm_resume_to_guest_with_sync) + ENTRY(kvm_resume_to_guest) adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21 ;; diff --git a/arch/ia64/kvm/process.c b/arch/ia64/kvm/process.c index 5a33f7e..3417783 100644 --- a/arch/ia64/kvm/process.c +++ b/arch/ia64/kvm/process.c @@ -962,9 +962,9 @@ static void kvm_do_resume_op(struct kvm_vcpu *vcpu) void vmm_transition(struct kvm_vcpu *vcpu) { ia64_call_vsa(PAL_VPS_SAVE, (unsigned long)vcpu->arch.vpd, - 0, 0, 0, 0, 0, 0); + 1, 0, 0, 0, 0, 0); vmm_trampoline(&vcpu->arch.guest, &vcpu->arch.host); ia64_call_vsa(PAL_VPS_RESTORE, (unsigned long)vcpu->arch.vpd, - 0, 0, 0, 0, 0, 0); + 1, 0, 0, 0, 0, 0); kvm_do_resume_op(vcpu); } diff --git a/arch/ia64/kvm/vcpu.h b/arch/ia64/kvm/vcpu.h index b0fcfb6..341e3fe 100644 --- a/arch/ia64/kvm/vcpu.h +++ b/arch/ia64/kvm/vcpu.h @@ -313,21 +313,21 @@ static inline void vcpu_set_tr(struct thash_data *trp, u64 pte, u64 itir, trp->rid = rid; } -extern u64 kvm_lookup_mpa(u64 gpfn); -extern u64 kvm_gpa_to_mpa(u64 gpa); - -/* Return I/O type if trye */ -#define __gpfn_is_io(gpfn) \ - ({ \ - u64 pte, ret = 0; \ - pte = kvm_lookup_mpa(gpfn); \ - if (!(pte & GPFN_INV_MASK)) \ - ret = pte & GPFN_IO_MASK; \ - ret; \ - }) +extern u64 kvm_get_mpt_entry(u64 gpfn); +/* Return I/ */ +static inline u64 __gpfn_is_io(u64 gpfn) +{ + u64 pte; + pte = kvm_get_mpt_entry(gpfn); + if (!(pte & GPFN_INV_MASK)) { + pte = pte & GPFN_IO_MASK; + if (pte != GPFN_PHYS_MMIO) + return pte; + } + return 0; +} #endif - #define IA64_NO_FAULT 0 #define IA64_FAULT 1 diff --git a/arch/ia64/kvm/vmm_ivt.S b/arch/ia64/kvm/vmm_ivt.S index 3ee5f48..c1d7251 100644 --- a/arch/ia64/kvm/vmm_ivt.S +++ b/arch/ia64/kvm/vmm_ivt.S @@ -1261,11 +1261,6 @@ kvm_rse_clear_invalid: adds r19=VMM_VPD_VPSR_OFFSET,r18 ;; ld8 r19=[r19] //vpsr - adds r20=VMM_VCPU_VSA_BASE_OFFSET,r21 - ;; - ld8 r20=[r20] - ;; -//vsa_sync_write_start mov r25=r18 adds r16= VMM_VCPU_GP_OFFSET,r21 ;; @@ -1274,10 +1269,7 @@ kvm_rse_clear_invalid: ;; add r24=r24,r16 ;; - add r16=PAL_VPS_SYNC_WRITE,r20 - ;; - mov b0=r16 - br.cond.sptk b0 // call the service + br.sptk.many kvm_vps_sync_write // call the service ;; END(ia64_leave_hypervisor) // fall through @@ -1288,28 +1280,15 @@ GLOBAL_ENTRY(ia64_vmm_entry) * r17:cr.isr * r18:vpd * r19:vpsr - * r20:__vsa_base * r22:b0 * r23:predicate */ mov r24=r22 mov r25=r18 tbit.nz p1,p2 = r19,IA64_PSR_IC_BIT // p1=vpsr.ic + (p1) br.cond.sptk.few kvm_vps_resume_normal + (p2) br.cond.sptk.many kvm_vps_resume_handler ;; - (p1) add r29=PAL_VPS_RESUME_NORMAL,r20 - (p1) br.sptk.many ia64_vmm_entry_out - ;; - tbit.nz p1,p2 = r17,IA64_ISR_IR_BIT //p1=cr.isr.ir - ;; - (p1) add r29=PAL_VPS_RESUME_NORMAL,r20 - (p2) add r29=PAL_VPS_RESUME_HANDLER,r20 - (p2) ld8 r26=[r25] - ;; -ia64_vmm_entry_out: - mov pr=r23,-2 - mov b0=r29 - ;; - br.cond.sptk b0 // call pal service END(ia64_vmm_entry) @@ -1376,6 +1355,9 @@ GLOBAL_ENTRY(vmm_reset_entry) //set up ipsr, iip, vpd.vpsr, dcr // For IPSR: it/dt/rt=1, i/ic=1, si=1, vm/bn=1 // For DCR: all bits 0 + bsw.0 + ;; + mov r21 =r13 adds r14=-VMM_PT_REGS_SIZE, r12 ;; movl r6=0x501008826000 // IPSR dt/rt/it:1;i/ic:1, si:1, vm/bn:1 @@ -1387,12 +1369,6 @@ GLOBAL_ENTRY(vmm_reset_entry) ;; srlz.i ;; - bsw.0 - ;; - mov r21 =r13 - ;; - bsw.1 - ;; mov ar.rsc = 0 ;; flushrs @@ -1406,12 +1382,9 @@ GLOBAL_ENTRY(vmm_reset_entry) ld8 r1 = [r20] ;; mov cr.iip=r4 - ;; adds r16=VMM_VPD_BASE_OFFSET,r13 - adds r20=VMM_VCPU_VSA_BASE_OFFSET,r13 ;; ld8 r18=[r16] - ld8 r20=[r20] ;; adds r19=VMM_VPD_VPSR_OFFSET,r18 ;; diff --git a/arch/ia64/kvm/vtlb.c b/arch/ia64/kvm/vtlb.c index def4576..e22b933 100644 --- a/arch/ia64/kvm/vtlb.c +++ b/arch/ia64/kvm/vtlb.c @@ -390,7 +390,7 @@ void thash_purge_entries_remote(struct kvm_vcpu *v, u64 va, u64 ps) u64 translate_phy_pte(u64 *pte, u64 itir, u64 va) { - u64 ps, ps_mask, paddr, maddr; + u64 ps, ps_mask, paddr, maddr, io_mask; union pte_flags phy_pte; ps = itir_ps(itir); @@ -398,8 +398,9 @@ u64 translate_phy_pte(u64 *pte, u64 itir, u64 va) phy_pte.val = *pte; paddr = *pte; paddr = ((paddr & _PAGE_PPN_MASK) & ps_mask) | (va & ~ps_mask); - maddr = kvm_lookup_mpa(paddr >> PAGE_SHIFT); - if (maddr & GPFN_IO_MASK) { + maddr = kvm_get_mpt_entry(paddr >> PAGE_SHIFT); + io_mask = maddr & GPFN_IO_MASK; + if (io_mask && (io_mask != GPFN_PHYS_MMIO)) { *pte |= VTLB_PTE_IO; return -1; } @@ -418,7 +419,7 @@ int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir, u64 ifa, int type) { u64 ps; - u64 phy_pte; + u64 phy_pte, io_mask, index; union ia64_rr vrr, mrr; int ret = 0; @@ -426,13 +427,16 @@ int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir, vrr.val = vcpu_get_rr(v, ifa); mrr.val = ia64_get_rr(ifa); + index = (pte & _PAGE_PPN_MASK) >> PAGE_SHIFT; + io_mask = kvm_get_mpt_entry(index) & GPFN_IO_MASK; phy_pte = translate_phy_pte(&pte, itir, ifa); /* Ensure WB attribute if pte is related to a normal mem page, * which is required by vga acceleration since qemu maps shared * vram buffer with WB. */ - if (!(pte & VTLB_PTE_IO) && ((pte & _PAGE_MA_MASK) != _PAGE_MA_NAT)) { + if (!(pte & VTLB_PTE_IO) && ((pte & _PAGE_MA_MASK) != _PAGE_MA_NAT) && + io_mask != GPFN_PHYS_MMIO) { pte &= ~_PAGE_MA_MASK; phy_pte &= ~_PAGE_MA_MASK; } @@ -566,12 +570,19 @@ void thash_init(struct thash_cb *hcb, u64 sz) } } -u64 kvm_lookup_mpa(u64 gpfn) +u64 kvm_get_mpt_entry(u64 gpfn) { u64 *base = (u64 *) KVM_P2M_BASE; return *(base + gpfn); } +u64 kvm_lookup_mpa(u64 gpfn) +{ + u64 maddr; + maddr = kvm_get_mpt_entry(gpfn); + return maddr&_PAGE_PPN_MASK; +} + u64 kvm_gpa_to_mpa(u64 gpa) { u64 pte = kvm_lookup_mpa(gpa >> PAGE_SHIFT); diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 2655e2a..34b52b7 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -81,11 +81,17 @@ struct kvm_vcpu_arch { struct tlbe shadow_tlb[PPC44x_TLB_SIZE]; /* Pages which are referenced in the shadow TLB. */ struct page *shadow_pages[PPC44x_TLB_SIZE]; - /* Copy of the host's TLB. */ - struct tlbe host_tlb[PPC44x_TLB_SIZE]; + + /* Track which TLB entries we've modified in the current exit. */ + u8 shadow_tlb_mod[PPC44x_TLB_SIZE]; u32 host_stack; u32 host_pid; + u32 host_dbcr0; + u32 host_dbcr1; + u32 host_dbcr2; + u32 host_iac[4]; + u32 host_msr; u64 fpr[32]; u32 gpr[32]; @@ -123,7 +129,11 @@ struct kvm_vcpu_arch { u32 ivor[16]; u32 ivpr; u32 pir; + + u32 shadow_pid; u32 pid; + u32 swap_pid; + u32 pvr; u32 ccr0; u32 ccr1; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index a8b0687..8931ba7 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -64,6 +64,10 @@ extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, extern void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr, gva_t eend, u32 asid); extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode); +extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid); + +/* XXX Book E specific */ +extern void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i); extern void kvmppc_check_and_deliver_interrupts(struct kvm_vcpu *vcpu); @@ -92,4 +96,12 @@ static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) kvm_vcpu_block(vcpu); } +static inline void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid) +{ + if (vcpu->arch.pid != new_pid) { + vcpu->arch.pid = new_pid; + vcpu->arch.swap_pid = 1; + } +} + #endif /* __POWERPC_KVM_PPC_H__ */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 09febc5..75c5dd0 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -359,8 +359,8 @@ int main(void) DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack)); DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid)); - DEFINE(VCPU_HOST_TLB, offsetof(struct kvm_vcpu, arch.host_tlb)); DEFINE(VCPU_SHADOW_TLB, offsetof(struct kvm_vcpu, arch.shadow_tlb)); + DEFINE(VCPU_SHADOW_MOD, offsetof(struct kvm_vcpu, arch.shadow_tlb_mod)); DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr)); DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr)); DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr)); @@ -372,7 +372,7 @@ int main(void) DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5)); DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6)); DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7)); - DEFINE(VCPU_PID, offsetof(struct kvm_vcpu, arch.pid)); + DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid)); DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear)); diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index 5a5602d..2e227a4 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -19,6 +19,7 @@ #include <linux/types.h> #include <linux/string.h> +#include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/highmem.h> #include <asm/mmu-44x.h> @@ -109,7 +110,6 @@ static int kvmppc_44x_tlbe_is_writable(struct tlbe *tlbe) return tlbe->word2 & (PPC44x_TLB_SW|PPC44x_TLB_UW); } -/* Must be called with mmap_sem locked for writing. */ static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu, unsigned int index) { @@ -124,6 +124,11 @@ static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu, } } +void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i) +{ + vcpu->arch.shadow_tlb_mod[i] = 1; +} + /* Caller must ensure that the specified guest TLB entry is safe to insert into * the shadow TLB. */ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid, @@ -142,19 +147,16 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid, stlbe = &vcpu->arch.shadow_tlb[victim]; /* Get reference to new page. */ - down_read(¤t->mm->mmap_sem); new_page = gfn_to_page(vcpu->kvm, gfn); if (is_error_page(new_page)) { printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn); kvm_release_page_clean(new_page); - up_read(¤t->mm->mmap_sem); return; } hpaddr = page_to_phys(new_page); /* Drop reference to old page. */ kvmppc_44x_shadow_release(vcpu, victim); - up_read(¤t->mm->mmap_sem); vcpu->arch.shadow_pages[victim] = new_page; @@ -164,27 +166,30 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid, /* XXX what about AS? */ - stlbe->tid = asid & 0xff; + stlbe->tid = !(asid & 0xff); /* Force TS=1 for all guest mappings. */ /* For now we hardcode 4KB mappings, but it will be important to * use host large pages in the future. */ stlbe->word0 = (gvaddr & PAGE_MASK) | PPC44x_TLB_VALID | PPC44x_TLB_TS | PPC44x_TLB_4K; - stlbe->word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf); stlbe->word2 = kvmppc_44x_tlb_shadow_attrib(flags, vcpu->arch.msr & MSR_PR); + kvmppc_tlbe_set_modified(vcpu, victim); + + KVMTRACE_5D(STLB_WRITE, vcpu, victim, + stlbe->tid, stlbe->word0, stlbe->word1, stlbe->word2, + handler); } void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr, gva_t eend, u32 asid) { - unsigned int pid = asid & 0xff; + unsigned int pid = !(asid & 0xff); int i; /* XXX Replace loop with fancy data structures. */ - down_write(¤t->mm->mmap_sem); for (i = 0; i <= tlb_44x_hwater; i++) { struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i]; unsigned int tid; @@ -204,21 +209,35 @@ void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr, kvmppc_44x_shadow_release(vcpu, i); stlbe->word0 = 0; + kvmppc_tlbe_set_modified(vcpu, i); + KVMTRACE_5D(STLB_INVAL, vcpu, i, + stlbe->tid, stlbe->word0, stlbe->word1, + stlbe->word2, handler); } - up_write(¤t->mm->mmap_sem); } -/* Invalidate all mappings, so that when they fault back in they will get the - * proper permission bits. */ +/* Invalidate all mappings on the privilege switch after PID has been changed. + * The guest always runs with PID=1, so we must clear the entire TLB when + * switching address spaces. */ void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode) { int i; - /* XXX Replace loop with fancy data structures. */ - down_write(¤t->mm->mmap_sem); - for (i = 0; i <= tlb_44x_hwater; i++) { - kvmppc_44x_shadow_release(vcpu, i); - vcpu->arch.shadow_tlb[i].word0 = 0; + if (vcpu->arch.swap_pid) { + /* XXX Replace loop with fancy data structures. */ + for (i = 0; i <= tlb_44x_hwater; i++) { + struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i]; + + /* Future optimization: clear only userspace mappings. */ + kvmppc_44x_shadow_release(vcpu, i); + stlbe->word0 = 0; + kvmppc_tlbe_set_modified(vcpu, i); + KVMTRACE_5D(STLB_INVAL, vcpu, i, + stlbe->tid, stlbe->word0, stlbe->word1, + stlbe->word2, handler); + } + vcpu->arch.swap_pid = 0; } - up_write(¤t->mm->mmap_sem); + + vcpu->arch.shadow_pid = !usermode; } diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 6b07601..53aaa66 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -37,6 +37,17 @@ config KVM_BOOKE_HOST Provides host support for KVM on Book E PowerPC processors. Currently this works on 440 processors only. +config KVM_TRACE + bool "KVM trace support" + depends on KVM && MARKERS && SYSFS + select RELAY + select DEBUG_FS + default n + ---help--- + This option allows reading a trace of kvm-related events through + relayfs. Note the ABI is not considered stable and will be + modified in future updates. + source drivers/virtio/Kconfig endif # VIRTUALIZATION diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 04e3449..2a5d439 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -4,9 +4,11 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/powerpc/kvm -common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o) +common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o) -kvm-objs := $(common-objs) powerpc.o emulate.o booke_guest.o +common-objs-$(CONFIG_KVM_TRACE) += $(addprefix ../../../virt/kvm/, kvm_trace.o) + +kvm-objs := $(common-objs-y) powerpc.o emulate.o booke_guest.o obj-$(CONFIG_KVM) += kvm.o AFLAGS_booke_interrupts.o := -I$(obj) diff --git a/arch/powerpc/kvm/booke_guest.c b/arch/powerpc/kvm/booke_guest.c index 9c8ad85..7b2591e 100644 --- a/arch/powerpc/kvm/booke_guest.c +++ b/arch/powerpc/kvm/booke_guest.c @@ -410,6 +410,21 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } + case BOOKE_INTERRUPT_DEBUG: { + u32 dbsr; + + vcpu->arch.pc = mfspr(SPRN_CSRR0); + + /* clear IAC events in DBSR register */ + dbsr = mfspr(SPRN_DBSR); + dbsr &= DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4; + mtspr(SPRN_DBSR, dbsr); + + run->exit_reason = KVM_EXIT_DEBUG; + r = RESUME_HOST; + break; + } + default: printk(KERN_EMERG "exit_nr %d\n", exit_nr); BUG(); @@ -471,6 +486,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.msr = 0; vcpu->arch.gpr[1] = (16<<20) - 8; /* -8 for the callee-save LR slot */ + vcpu->arch.shadow_pid = 1; + /* Eye-catching number so we know if the guest takes an interrupt * before it's programmed its own IVPR. */ vcpu->arch.ivpr = 0x55550000; diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index 3b653b5..95e165b 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -42,7 +42,8 @@ #define HOST_STACK_LR (HOST_STACK_SIZE + 4) /* In caller stack frame. */ #define NEED_INST_MASK ((1<<BOOKE_INTERRUPT_PROGRAM) | \ - (1<<BOOKE_INTERRUPT_DTLB_MISS)) + (1<<BOOKE_INTERRUPT_DTLB_MISS) | \ + (1<<BOOKE_INTERRUPT_DEBUG)) #define NEED_DEAR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \ (1<<BOOKE_INTERRUPT_DTLB_MISS)) @@ -331,51 +332,57 @@ lightweight_exit: mfspr r3, SPRN_PID stw r3, VCPU_HOST_PID(r4) - lwz r3, VCPU_PID(r4) + lwz r3, VCPU_SHADOW_PID(r4) mtspr SPRN_PID, r3 - /* Prevent all TLB updates. */ + /* Prevent all asynchronous TLB updates. */ mfmsr r5 lis r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@h ori r6, r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@l andc r6, r5, r6 mtmsr r6 - /* Save the host's non-pinned TLB mappings, and load the guest mappings - * over them. Leave the host's "pinned" kernel mappings in place. */ - /* XXX optimization: use generation count to avoid swapping unmodified - * entries. */ + /* Load the guest mappings, leaving the host's "pinned" kernel mappings + * in place. */ mfspr r10, SPRN_MMUCR /* Save host MMUCR. */ - lis r8, tlb_44x_hwater@ha - lwz r8, tlb_44x_hwater@l(r8) - addi r3, r4, VCPU_HOST_TLB - 4 - addi r9, r4, VCPU_SHADOW_TLB - 4 - li r6, 0 + li r5, PPC44x_TLB_SIZE + lis r5, tlb_44x_hwater@ha + lwz r5, tlb_44x_hwater@l(r5) + mtctr r5 + addi r9, r4, VCPU_SHADOW_TLB + addi r5, r4, VCPU_SHADOW_MOD + li r3, 0 1: - /* Save host entry. */ - tlbre r7, r6, PPC44x_TLB_PAGEID - mfspr r5, SPRN_MMUCR - stwu r5, 4(r3) - stwu r7, 4(r3) - tlbre r7, r6, PPC44x_TLB_XLAT - stwu r7, 4(r3) - tlbre r7, r6, PPC44x_TLB_ATTRIB - stwu r7, 4(r3) + lbzx r7, r3, r5 + cmpwi r7, 0 + beq 3f + /* Load guest entry. */ - lwzu r7, 4(r9) + mulli r11, r3, TLBE_BYTES + add r11, r11, r9 + lwz r7, 0(r11) mtspr SPRN_MMUCR, r7 - lwzu r7, 4(r9) - tlbwe r7, r6, PPC44x_TLB_PAGEID - lwzu r7, 4(r9) - tlbwe r7, r6, PPC44x_TLB_XLAT - lwzu r7, 4(r9) - tlbwe r7, r6, PPC44x_TLB_ATTRIB - /* Increment index. */ - addi r6, r6, 1 - cmpw r6, r8 - blt 1b + lwz r7, 4(r11) + tlbwe r7, r3, PPC44x_TLB_PAGEID + lwz r7, 8(r11) + tlbwe r7, r3, PPC44x_TLB_XLAT + lwz r7, 12(r11) + tlbwe r7, r3, PPC44x_TLB_ATTRIB +3: + addi r3, r3, 1 /* Increment index. */ + bdnz 1b + mtspr SPRN_MMUCR, r10 /* Restore host MMUCR. */ + /* Clear bitmap of modified TLB entries */ + li r5, PPC44x_TLB_SIZE>>2 + mtctr r5 + addi r5, r4, VCPU_SHADOW_MOD - 4 + li r6, 0 +1: + stwu r6, 4(r5) + bdnz 1b + iccci 0, 0 /* XXX hack */ /* Load some guest volatiles. */ @@ -431,6 +438,14 @@ lightweight_exit: oris r3, r3, KVMPPC_MSR_MASK@h ori r3, r3, KVMPPC_MSR_MASK@l mtsrr1 r3 + + /* Clear any debug events which occurred since we disabled MSR[DE]. + * XXX This gives us a 3-instruction window in which a breakpoint + * intended for guest context could fire in the host instead. */ + lis r3, 0xffff + ori r3, r3, 0xffff + mtspr SPRN_DBSR, r3 + lwz r3, VCPU_GPR(r3)(r4) lwz r4, VCPU_GPR(r4)(r4) rfi diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index 8c605d0..0fce4fb 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -170,6 +170,10 @@ static int kvmppc_emul_tlbwe(struct kvm_vcpu *vcpu, u32 inst) kvmppc_mmu_map(vcpu, eaddr, raddr >> PAGE_SHIFT, asid, flags); } + KVMTRACE_5D(GTLB_WRITE, vcpu, index, + tlbe->tid, tlbe->word0, tlbe->word1, tlbe->word2, + handler); + return EMULATE_DONE; } @@ -504,7 +508,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case SPRN_MMUCR: vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break; case SPRN_PID: - vcpu->arch.pid = vcpu->arch.gpr[rs]; break; + kvmppc_set_pid(vcpu, vcpu->arch.gpr[rs]); break; case SPRN_CCR0: vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break; case SPRN_CCR1: @@ -765,6 +769,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) break; } + KVMTRACE_3D(PPC_INSTR, vcpu, inst, vcpu->arch.pc, emulated, entryexit); + if (advance) vcpu->arch.pc += 4; /* Advance past emulated instruction. */ diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 53826a5..90a6fc4 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -27,6 +27,7 @@ #include <asm/cputable.h> #include <asm/uaccess.h> #include <asm/kvm_ppc.h> +#include <asm/tlbflush.h> gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) @@ -239,18 +240,114 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { } +/* Note: clearing MSR[DE] just means that the debug interrupt will not be + * delivered *immediately*. Instead, it simply sets the appropriate DBSR bits. + * If those DBSR bits are still set when MSR[DE] is re-enabled, the interrupt + * will be delivered as an "imprecise debug event" (which is indicated by + * DBSR[IDE]. + */ +static void kvmppc_disable_debug_interrupts(void) +{ + mtmsr(mfmsr() & ~MSR_DE); +} + +static void kvmppc_restore_host_debug_state(struct kvm_vcpu *vcpu) +{ + kvmppc_disable_debug_interrupts(); + + mtspr(SPRN_IAC1, vcpu->arch.host_iac[0]); + mtspr(SPRN_IAC2, vcpu->arch.host_iac[1]); + mtspr(SPRN_IAC3, vcpu->arch.host_iac[2]); + mtspr(SPRN_IAC4, vcpu->arch.host_iac[3]); + mtspr(SPRN_DBCR1, vcpu->arch.host_dbcr1); + mtspr(SPRN_DBCR2, vcpu->arch.host_dbcr2); + mtspr(SPRN_DBCR0, vcpu->arch.host_dbcr0); + mtmsr(vcpu->arch.host_msr); +} + +static void kvmppc_load_guest_debug_registers(struct kvm_vcpu *vcpu) +{ + struct kvm_guest_debug *dbg = &vcpu->guest_debug; + u32 dbcr0 = 0; + + vcpu->arch.host_msr = mfmsr(); + kvmppc_disable_debug_interrupts(); + + /* Save host debug register state. */ + vcpu->arch.host_iac[0] = mfspr(SPRN_IAC1); + vcpu->arch.host_iac[1] = mfspr(SPRN_IAC2); + vcpu->arch.host_iac[2] = mfspr(SPRN_IAC3); + vcpu->arch.host_iac[3] = mfspr(SPRN_IAC4); + vcpu->arch.host_dbcr0 = mfspr(SPRN_DBCR0); + vcpu->arch.host_dbcr1 = mfspr(SPRN_DBCR1); + vcpu->arch.host_dbcr2 = mfspr(SPRN_DBCR2); + + /* set registers up for guest */ + + if (dbg->bp[0]) { + mtspr(SPRN_IAC1, dbg->bp[0]); + dbcr0 |= DBCR0_IAC1 | DBCR0_IDM; + } + if (dbg->bp[1]) { + mtspr(SPRN_IAC2, dbg->bp[1]); + dbcr0 |= DBCR0_IAC2 | DBCR0_IDM; + } + if (dbg->bp[2]) { + mtspr(SPRN_IAC3, dbg->bp[2]); + dbcr0 |= DBCR0_IAC3 | DBCR0_IDM; + } + if (dbg->bp[3]) { + mtspr(SPRN_IAC4, dbg->bp[3]); + dbcr0 |= DBCR0_IAC4 | DBCR0_IDM; + } + + mtspr(SPRN_DBCR0, dbcr0); + mtspr(SPRN_DBCR1, 0); + mtspr(SPRN_DBCR2, 0); +} + void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { + int i; + + if (vcpu->guest_debug.enabled) + kvmppc_load_guest_debug_registers(vcpu); + + /* Mark every guest entry in the shadow TLB entry modified, so that they + * will all be reloaded on the next vcpu run (instead of being + * demand-faulted). */ + for (i = 0; i <= tlb_44x_hwater; i++) + kvmppc_tlbe_set_modified(vcpu, i); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + if (vcpu->guest_debug.enabled) + kvmppc_restore_host_debug_state(vcpu); + + /* Don't leave guest TLB entries resident when being de-scheduled. */ + /* XXX It would be nice to differentiate between heavyweight exit and + * sched_out here, since we could avoid the TLB flush for heavyweight + * exits. */ + _tlbia(); } int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) { - return -ENOTSUPP; + int i; + + vcpu->guest_debug.enabled = dbg->enabled; + if (vcpu->guest_debug.enabled) { + for (i=0; i < ARRAY_SIZE(vcpu->guest_debug.bp); i++) { + if (dbg->breakpoints[i].enabled) + vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address; + else + vcpu->guest_debug.bp[i] = 0; + } + } + + return 0; } static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu, diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 4c03049..bc581d8 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -565,13 +565,16 @@ config ZFCPDUMP Refer to <file:Documentation/s390/zfcpdump.txt> for more details on this. config S390_GUEST -bool "s390 guest support (EXPERIMENTAL)" +bool "s390 guest support for KVM (EXPERIMENTAL)" depends on 64BIT && EXPERIMENTAL select VIRTIO select VIRTIO_RING select VIRTIO_CONSOLE help - Select this option if you want to run the kernel under s390 linux + Select this option if you want to run the kernel as a guest under + the KVM hypervisor. This will add detection for KVM as well as a + virtio transport. If KVM is detected, the virtio console will be + the default console. endmenu source "net/Kconfig" diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index d1faf5c..cce40ff 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -157,8 +157,8 @@ static int handle_stfl(struct kvm_vcpu *vcpu) int rc; vcpu->stat.instruction_stfl++; - facility_list &= ~(1UL<<24); /* no stfle */ - facility_list &= ~(1UL<<23); /* no large pages */ + /* only pass the facility bits, which we can handle */ + facility_list &= 0xfe00fff3; rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list), &facility_list, sizeof(facility_list)); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index d02def0..774ac49 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void) return ret; } +/* + * If we don't do that, there is the possibility that the guest + * will calibrate under heavy load - thus, getting a lower lpj - + * and execute the delays themselves without load. This is wrong, + * because no delay loop can finish beforehand. + * Any heuristics is subject to fail, because ultimately, a large + * poll of guests can be running and trouble each other. So we preset + * lpj here + */ +static unsigned long kvm_get_tsc_khz(void) +{ + return preset_lpj; +} + +static void kvm_get_preset_lpj(void) +{ + struct pvclock_vcpu_time_info *src; + unsigned long khz; + u64 lpj; + + src = &per_cpu(hv_clock, 0); + khz = pvclock_tsc_khz(src); + + lpj = ((u64)khz * 1000); + do_div(lpj, HZ); + preset_lpj = lpj; +} + static struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_read, @@ -153,6 +181,7 @@ void __init kvmclock_init(void) pv_time_ops.get_wallclock = kvm_get_wallclock; pv_time_ops.set_wallclock = kvm_set_wallclock; pv_time_ops.sched_clock = kvm_clock_read; + pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; #ifdef CONFIG_X86_LOCAL_APIC pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; #endif @@ -163,6 +192,7 @@ void __init kvmclock_init(void) #ifdef CONFIG_KEXEC machine_ops.crash_shutdown = kvm_crash_shutdown; #endif + kvm_get_preset_lpj(); clocksource_register(&kvm_clock); } } diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 05fbe9a..4f9c55f 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, return dst->version; } +unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) +{ + u64 pv_tsc_khz = 1000000ULL << 32; + + do_div(pv_tsc_khz, src->tsc_to_system_mul); + if (src->tsc_shift < 0) + pv_tsc_khz <<= -src->tsc_shift; + else + pv_tsc_khz >>= src->tsc_shift; + return pv_tsc_khz; +} + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) { struct pvclock_shadow_time shadow; diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index d0e940b..c023435 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -3,10 +3,13 @@ # common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ - coalesced_mmio.o) + coalesced_mmio.o irq_comm.o) ifeq ($(CONFIG_KVM_TRACE),y) common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) endif +ifeq ($(CONFIG_DMAR),y) +common-objs += $(addprefix ../../../virt/kvm/, vtd.o) +endif EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index c0f7872..634132a 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -200,13 +200,14 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps) if (!atomic_inc_and_test(&pt->pending)) set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); - if (vcpu0 && waitqueue_active(&vcpu0->wq)) { - vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; + + if (vcpu0 && waitqueue_active(&vcpu0->wq)) wake_up_interruptible(&vcpu0->wq); - } pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); pt->scheduled = ktime_to_ns(pt->timer.expires); + if (pt->period) + ps->channels[0].count_load_time = pt->timer.expires; return (pt->period == 0 ? 0 : 1); } @@ -215,12 +216,22 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu) { struct kvm_pit *pit = vcpu->kvm->arch.vpit; - if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending) + if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack) return atomic_read(&pit->pit_state.pit_timer.pending); - return 0; } +static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) +{ + struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, + irq_ack_notifier); + spin_lock(&ps->inject_lock); + if (atomic_dec_return(&ps->pit_timer.pending) < 0) + atomic_inc(&ps->pit_timer.pending); + ps->irq_ack = 1; + spin_unlock(&ps->inject_lock); +} + static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) { struct kvm_kpit_state *ps; @@ -255,8 +266,9 @@ static void destroy_pit_timer(struct kvm_kpit_timer *pt) hrtimer_cancel(&pt->timer); } -static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period) +static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) { + struct kvm_kpit_timer *pt = &ps->pit_timer; s64 interval; interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); @@ -268,6 +280,7 @@ static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period) pt->period = (is_period == 0) ? 0 : interval; pt->timer.function = pit_timer_fn; atomic_set(&pt->pending, 0); + ps->irq_ack = 1; hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), HRTIMER_MODE_ABS); @@ -302,11 +315,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) case 1: /* FIXME: enhance mode 4 precision */ case 4: - create_pit_timer(&ps->pit_timer, val, 0); + create_pit_timer(ps, val, 0); break; case 2: case 3: - create_pit_timer(&ps->pit_timer, val, 1); + create_pit_timer(ps, val, 1); break; default: destroy_pit_timer(&ps->pit_timer); @@ -520,7 +533,7 @@ void kvm_pit_reset(struct kvm_pit *pit) mutex_unlock(&pit->pit_state.lock); atomic_set(&pit->pit_state.pit_timer.pending, 0); - pit->pit_state.inject_pending = 1; + pit->pit_state.irq_ack = 1; } struct kvm_pit *kvm_create_pit(struct kvm *kvm) @@ -534,6 +547,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) mutex_init(&pit->pit_state.lock); mutex_lock(&pit->pit_state.lock); + spin_lock_init(&pit->pit_state.inject_lock); /* Initialize PIO device */ pit->dev.read = pit_ioport_read; @@ -555,6 +569,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) pit_state->pit = pit; hrtimer_init(&pit_state->pit_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + pit_state->irq_ack_notifier.gsi = 0; + pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; + kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); mutex_unlock(&pit->pit_state.lock); kvm_pit_reset(pit); @@ -578,10 +595,8 @@ void kvm_free_pit(struct kvm *kvm) static void __inject_pit_timer_intr(struct kvm *kvm) { mutex_lock(&kvm->lock); - kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1); - kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0); - kvm_pic_set_irq(pic_irqchip(kvm), 0, 1); - kvm_pic_set_irq(pic_irqchip(kvm), 0, 0); + kvm_set_irq(kvm, 0, 1); + kvm_set_irq(kvm, 0, 0); mutex_unlock(&kvm->lock); } @@ -592,37 +607,19 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) struct kvm_kpit_state *ps; if (vcpu && pit) { + int inject = 0; ps = &pit->pit_state; - /* Try to inject pending interrupts when: - * 1. Pending exists - * 2. Last interrupt was accepted or waited for too long time*/ - if (atomic_read(&ps->pit_timer.pending) && - (ps->inject_pending || - (jiffies - ps->last_injected_time - >= KVM_MAX_PIT_INTR_INTERVAL))) { - ps->inject_pending = 0; - __inject_pit_timer_intr(kvm); - ps->last_injected_time = jiffies; - } - } -} - -void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec) -{ - struct kvm_arch *arch = &vcpu->kvm->arch; - struct kvm_kpit_state *ps; - - if (vcpu && arch->vpit) { - ps = &arch->vpit->pit_state; - if (atomic_read(&ps->pit_timer.pending) && - (((arch->vpic->pics[0].imr & 1) == 0 && - arch->vpic->pics[0].irq_base == vec) || - (arch->vioapic->redirtbl[0].fields.vector == vec && - arch->vioapic->redirtbl[0].fields.mask != 1))) { - ps->inject_pending = 1; - atomic_dec(&ps->pit_timer.pending); - ps->channels[0].count_load_time = ktime_get(); + /* Try to inject pending interrupts when + * last one has been acked. + */ + spin_lock(&ps->inject_lock); + if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { + ps->irq_ack = 0; + inject = 1; } + spin_unlock(&ps->inject_lock); + if (inject) + __inject_pit_timer_intr(kvm); } } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index db25c2a..e436d49 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -8,7 +8,6 @@ struct kvm_kpit_timer { int irq; s64 period; /* unit: ns */ s64 scheduled; - ktime_t last_update; atomic_t pending; }; @@ -34,8 +33,9 @@ struct kvm_kpit_state { u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; - bool inject_pending; /* if inject pending interrupts */ - unsigned long last_injected_time; + spinlock_t inject_lock; + unsigned long irq_ack; + struct kvm_irq_ack_notifier irq_ack_notifier; }; struct kvm_pit { @@ -54,7 +54,6 @@ struct kvm_pit { #define KVM_PIT_CHANNEL_MASK 0x3 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); -void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); struct kvm_pit *kvm_create_pit(struct kvm *kvm); void kvm_free_pit(struct kvm *kvm); diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index c31164e..17e41e1 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -30,6 +30,19 @@ #include <linux/kvm_host.h> +static void pic_clear_isr(struct kvm_kpic_state *s, int irq) +{ + s->isr &= ~(1 << irq); + s->isr_ack |= (1 << irq); +} + +void kvm_pic_clear_isr_ack(struct kvm *kvm) +{ + struct kvm_pic *s = pic_irqchip(kvm); + s->pics[0].isr_ack = 0xff; + s->pics[1].isr_ack = 0xff; +} + /* * set irq level. If an edge is detected, then the IRR is set to 1 */ @@ -141,11 +154,12 @@ void kvm_pic_set_irq(void *opaque, int irq, int level) */ static inline void pic_intack(struct kvm_kpic_state *s, int irq) { + s->isr |= 1 << irq; if (s->auto_eoi) { if (s->rotate_on_auto_eoi) s->priority_add = (irq + 1) & 7; - } else - s->isr |= (1 << irq); + pic_clear_isr(s, irq); + } /* * We don't clear a level sensitive interrupt here */ @@ -153,9 +167,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq) s->irr &= ~(1 << irq); } -int kvm_pic_read_irq(struct kvm_pic *s) +int kvm_pic_read_irq(struct kvm *kvm) { int irq, irq2, intno; + struct kvm_pic *s = pic_irqchip(kvm); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { @@ -181,16 +196,32 @@ int kvm_pic_read_irq(struct kvm_pic *s) intno = s->pics[0].irq_base + irq; } pic_update_irq(s); + kvm_notify_acked_irq(kvm, irq); return intno; } void kvm_pic_reset(struct kvm_kpic_state *s) { + int irq, irqbase; + struct kvm *kvm = s->pics_state->irq_request_opaque; + struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; + + if (s == &s->pics_state->pics[0]) + irqbase = 0; + else + irqbase = 8; + + for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { + if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) + if (s->irr & (1 << irq) || s->isr & (1 << irq)) + kvm_notify_acked_irq(kvm, irq+irqbase); + } s->last_irr = 0; s->irr = 0; s->imr = 0; s->isr = 0; + s->isr_ack = 0xff; s->priority_add = 0; s->irq_base = 0; s->read_reg_select = 0; @@ -243,7 +274,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) priority = get_priority(s, s->isr); if (priority != 8) { irq = (priority + s->priority_add) & 7; - s->isr &= ~(1 << irq); + pic_clear_isr(s, irq); if (cmd == 5) s->priority_add = (irq + 1) & 7; pic_update_irq(s->pics_state); @@ -251,7 +282,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) break; case 3: irq = val & 7; - s->isr &= ~(1 << irq); + pic_clear_isr(s, irq); pic_update_irq(s->pics_state); break; case 6: @@ -260,8 +291,8 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) break; case 7: irq = val & 7; - s->isr &= ~(1 << irq); s->priority_add = (irq + 1) & 7; + pic_clear_isr(s, irq); pic_update_irq(s->pics_state); break; default: @@ -303,7 +334,7 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) s->pics_state->pics[0].irr &= ~(1 << 2); } s->irr &= ~(1 << ret); - s->isr &= ~(1 << ret); + pic_clear_isr(s, ret); if (addr1 >> 7 || ret != 2) pic_update_irq(s->pics_state); } else { @@ -422,10 +453,14 @@ static void pic_irq_request(void *opaque, int level) { struct kvm *kvm = opaque; struct kvm_vcpu *vcpu = kvm->vcpus[0]; + struct kvm_pic *s = pic_irqchip(kvm); + int irq = pic_get_irq(&s->pics[0]); - pic_irqchip(kvm)->output = level; - if (vcpu) + s->output = level; + if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { + s->pics[0].isr_ack &= ~(1 << irq); kvm_vcpu_kick(vcpu); + } } struct kvm_pic *kvm_create_pic(struct kvm *kvm) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 76d736b..c019b8e 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) if (kvm_apic_accept_pic_intr(v)) { s = pic_irqchip(v->kvm); s->output = 0; /* PIC */ - vector = kvm_pic_read_irq(s); + vector = kvm_pic_read_irq(v->kvm); } } return vector; @@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) { kvm_apic_timer_intr_post(vcpu, vec); - kvm_pit_timer_intr_post(vcpu, vec); /* TODO: PIT, RTC etc. */ } EXPORT_SYMBOL_GPL(kvm_timer_intr_post); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 7ca47cb..f17c8f5 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -42,6 +42,7 @@ struct kvm_kpic_state { u8 irr; /* interrupt request register */ u8 imr; /* interrupt mask register */ u8 isr; /* interrupt service register */ + u8 isr_ack; /* interrupt ack detection */ u8 priority_add; /* highest irq priority */ u8 irq_base; u8 read_reg_select; @@ -63,12 +64,13 @@ struct kvm_pic { void *irq_request_opaque; int output; /* intr from master PIC */ struct kvm_io_device dev; + void (*ack_notifier)(void *opaque, int irq); }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); -void kvm_pic_set_irq(void *opaque, int irq, int level); -int kvm_pic_read_irq(struct kvm_pic *s); +int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); +void kvm_pic_clear_isr_ack(struct kvm *kvm); static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) { diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h new file mode 100644 index 0000000..1ff819d --- /dev/null +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -0,0 +1,32 @@ +#ifndef ASM_KVM_CACHE_REGS_H +#define ASM_KVM_CACHE_REGS_H + +static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail)) + kvm_x86_ops->cache_reg(vcpu, reg); + + return vcpu->arch.regs[reg]; +} + +static inline void kvm_register_write(struct kvm_vcpu *vcpu, + enum kvm_reg reg, + unsigned long val) +{ + vcpu->arch.regs[reg] = val; + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); +} + +static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu) +{ + return kvm_register_read(vcpu, VCPU_REGS_RIP); +} + +static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) +{ + kvm_register_write(vcpu, VCPU_REGS_RIP, val); +} + +#endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 73f43de..6571926 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -32,6 +32,7 @@ #include <asm/current.h> #include <asm/apicdef.h> #include <asm/atomic.h> +#include "kvm_cache_regs.h" #include "irq.h" #define PRId64 "d" @@ -338,13 +339,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, } else apic_clear_vector(vector, apic->regs + APIC_TMR); - if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) - kvm_vcpu_kick(vcpu); - else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) { - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - if (waitqueue_active(&vcpu->wq)) - wake_up_interruptible(&vcpu->wq); - } + kvm_vcpu_kick(vcpu); result = (orig_irr == 0); break; @@ -370,21 +365,18 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; kvm_vcpu_kick(vcpu); } else { - printk(KERN_DEBUG - "Ignoring de-assert INIT to vcpu %d\n", - vcpu->vcpu_id); + apic_debug("Ignoring de-assert INIT to vcpu %d\n", + vcpu->vcpu_id); } - break; case APIC_DM_STARTUP: - printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", - vcpu->vcpu_id, vector); + apic_debug("SIPI to vcpu %d vector 0x%02x\n", + vcpu->vcpu_id, vector); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { vcpu->arch.sipi_vector = vector; vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; - if (waitqueue_active(&vcpu->wq)) - wake_up_interruptible(&vcpu->wq); + kvm_vcpu_kick(vcpu); } break; @@ -438,7 +430,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, static void apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); - + int trigger_mode; /* * Not every write EOI will has corresponding ISR, * one example is when Kernel check timer on setup_IO_APIC @@ -450,7 +442,10 @@ static void apic_set_eoi(struct kvm_lapic *apic) apic_update_ppr(apic); if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) - kvm_ioapic_update_eoi(apic->vcpu->kvm, vector); + trigger_mode = IOAPIC_LEVEL_TRIG; + else + trigger_mode = IOAPIC_EDGE_TRIG; + kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); } static void apic_send_ipi(struct kvm_lapic *apic) @@ -558,8 +553,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write) struct kvm_run *run = vcpu->run; set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); - kvm_x86_ops->cache_regs(vcpu); - run->tpr_access.rip = vcpu->arch.rip; + run->tpr_access.rip = kvm_rip_read(vcpu); run->tpr_access.is_write = write; } @@ -683,9 +677,9 @@ static void apic_mmio_write(struct kvm_io_device *this, * Refer SDM 8.4.1 */ if (len != 4 || alignment) { - if (printk_ratelimit()) - printk(KERN_ERR "apic write: bad size=%d %lx\n", - len, (long)address); + /* Don't shout loud, $infamous_os would cause only noise. */ + apic_debug("apic write: bad size=%d %lx\n", + len, (long)address); return; } @@ -947,10 +941,9 @@ static int __apic_timer_fn(struct kvm_lapic *apic) if(!atomic_inc_and_test(&apic->timer.pending)) set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); - if (waitqueue_active(q)) { - apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + if (waitqueue_active(q)) wake_up_interruptible(q); - } + if (apic_lvtt_period(apic)) { result = 1; apic->timer.dev.expires = ktime_add_ns( diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3da2508..99c239c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -70,6 +70,9 @@ static int dbg = 0; module_param(dbg, bool, 0644); #endif +static int oos_shadow = 1; +module_param(oos_shadow, bool, 0644); + #ifndef MMU_DEBUG #define ASSERT(x) do { } while (0) #else @@ -135,18 +138,24 @@ module_param(dbg, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) -struct kvm_pv_mmu_op_buffer { - void *ptr; - unsigned len; - unsigned processed; - char buf[512] __aligned(sizeof(long)); -}; +#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) struct kvm_rmap_desc { u64 *shadow_ptes[RMAP_EXT]; struct kvm_rmap_desc *more; }; +struct kvm_shadow_walk { + int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu, + u64 addr, u64 *spte, int level); +}; + +struct kvm_unsync_walk { + int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk); +}; + +typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); + static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; static struct kmem_cache *mmu_page_header_cache; @@ -405,16 +414,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn) { struct vm_area_struct *vma; unsigned long addr; + int ret = 0; addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) - return 0; + return ret; + down_read(¤t->mm->mmap_sem); vma = find_vma(current->mm, addr); if (vma && is_vm_hugetlb_page(vma)) - return 1; + ret = 1; + up_read(¤t->mm->mmap_sem); - return 0; + return ret; } static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) @@ -649,8 +661,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) if (write_protected) kvm_flush_remote_tlbs(kvm); - - account_shadowed(kvm, gfn); } static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) @@ -859,6 +869,77 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, BUG(); } + +static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + mmu_parent_walk_fn fn) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + struct kvm_mmu_page *parent_sp; + int i; + + if (!sp->multimapped && sp->parent_pte) { + parent_sp = page_header(__pa(sp->parent_pte)); + fn(vcpu, parent_sp); + mmu_parent_walk(vcpu, parent_sp, fn); + return; + } + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); + fn(vcpu, parent_sp); + mmu_parent_walk(vcpu, parent_sp, fn); + } +} + +static void kvm_mmu_update_unsync_bitmap(u64 *spte) +{ + unsigned int index; + struct kvm_mmu_page *sp = page_header(__pa(spte)); + + index = spte - sp->spt; + __set_bit(index, sp->unsync_child_bitmap); + sp->unsync_children = 1; +} + +static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + int i; + + if (!sp->parent_pte) + return; + + if (!sp->multimapped) { + kvm_mmu_update_unsync_bitmap(sp->parent_pte); + return; + } + + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]); + } +} + +static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + sp->unsync_children = 1; + kvm_mmu_update_parents_unsync(sp); + return 1; +} + +static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + mmu_parent_walk(vcpu, sp, unsync_walk_fn); + kvm_mmu_update_parents_unsync(sp); +} + static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -868,6 +949,58 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, sp->spt[i] = shadow_trap_nonpresent_pte; } +static int nonpaging_sync_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + return 1; +} + +static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +{ +} + +#define for_each_unsync_children(bitmap, idx) \ + for (idx = find_first_bit(bitmap, 512); \ + idx < 512; \ + idx = find_next_bit(bitmap, 512, idx+1)) + +static int mmu_unsync_walk(struct kvm_mmu_page *sp, + struct kvm_unsync_walk *walker) +{ + int i, ret; + + if (!sp->unsync_children) + return 0; + + for_each_unsync_children(sp->unsync_child_bitmap, i) { + u64 ent = sp->spt[i]; + + if (is_shadow_present_pte(ent)) { + struct kvm_mmu_page *child; + child = page_header(ent & PT64_BASE_ADDR_MASK); + + if (child->unsync_children) { + ret = mmu_unsync_walk(child, walker); + if (ret) + return ret; + __clear_bit(i, sp->unsync_child_bitmap); + } + + if (child->unsync) { + ret = walker->entry(child, walker); + __clear_bit(i, sp->unsync_child_bitmap); + if (ret) + return ret; + } + } + } + + if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) + sp->unsync_children = 0; + + return 0; +} + static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) { unsigned index; @@ -888,6 +1021,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) return NULL; } +static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + WARN_ON(!sp->unsync); + sp->unsync = 0; + --kvm->stat.mmu_unsync; +} + +static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); + +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + if (sp->role.glevels != vcpu->arch.mmu.root_level) { + kvm_mmu_zap_page(vcpu->kvm, sp); + return 1; + } + + rmap_write_protect(vcpu->kvm, sp->gfn); + if (vcpu->arch.mmu.sync_page(vcpu, sp)) { + kvm_mmu_zap_page(vcpu->kvm, sp); + return 1; + } + + kvm_mmu_flush_tlb(vcpu); + kvm_unlink_unsync_page(vcpu->kvm, sp); + return 0; +} + +struct sync_walker { + struct kvm_vcpu *vcpu; + struct kvm_unsync_walk walker; +}; + +static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) +{ + struct sync_walker *sync_walk = container_of(walk, struct sync_walker, + walker); + struct kvm_vcpu *vcpu = sync_walk->vcpu; + + kvm_sync_page(vcpu, sp); + return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)); +} + +static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + struct sync_walker walker = { + .walker = { .entry = mmu_sync_fn, }, + .vcpu = vcpu, + }; + + while (mmu_unsync_walk(sp, &walker.walker)) + cond_resched_lock(&vcpu->kvm->mmu_lock); +} + static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, @@ -901,7 +1087,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, unsigned quadrant; struct hlist_head *bucket; struct kvm_mmu_page *sp; - struct hlist_node *node; + struct hlist_node *node, *tmp; role.word = 0; role.glevels = vcpu->arch.mmu.root_level; @@ -917,9 +1103,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn, role.word); index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && sp->role.word == role.word) { + hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) + if (sp->gfn == gfn) { + if (sp->unsync) + if (kvm_sync_page(vcpu, sp)) + continue; + + if (sp->role.word != role.word) + continue; + mmu_page_add_parent_pte(vcpu, sp, parent_pte); + if (sp->unsync_children) { + set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); + kvm_mmu_mark_parents_unsync(vcpu, sp); + } pgprintk("%s: found\n", __func__); return sp; } @@ -931,8 +1128,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, bucket); - if (!metaphysical) + if (!metaphysical) { rmap_write_protect(vcpu->kvm, gfn); + account_shadowed(vcpu->kvm, gfn); + } if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) vcpu->arch.mmu.prefetch_page(vcpu, sp); else @@ -940,6 +1139,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, return sp; } +static int walk_shadow(struct kvm_shadow_walk *walker, + struct kvm_vcpu *vcpu, u64 addr) +{ + hpa_t shadow_addr; + int level; + int r; + u64 *sptep; + unsigned index; + + shadow_addr = vcpu->arch.mmu.root_hpa; + level = vcpu->arch.mmu.shadow_root_level; + if (level == PT32E_ROOT_LEVEL) { + shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; + shadow_addr &= PT64_BASE_ADDR_MASK; + --level; + } + + while (level >= PT_PAGE_TABLE_LEVEL) { + index = SHADOW_PT_INDEX(addr, level); + sptep = ((u64 *)__va(shadow_addr)) + index; + r = walker->entry(walker, vcpu, addr, sptep, level); + if (r) + return r; + shadow_addr = *sptep & PT64_BASE_ADDR_MASK; + --level; + } + return 0; +} + static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { @@ -955,7 +1183,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, rmap_remove(kvm, &pt[i]); pt[i] = shadow_trap_nonpresent_pte; } - kvm_flush_remote_tlbs(kvm); return; } @@ -974,7 +1201,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, } pt[i] = shadow_trap_nonpresent_pte; } - kvm_flush_remote_tlbs(kvm); } static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) @@ -991,11 +1217,10 @@ static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) kvm->vcpus[i]->arch.last_pte_updated = NULL; } -static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { u64 *parent_pte; - ++kvm->stat.mmu_shadow_zapped; while (sp->multimapped || sp->parent_pte) { if (!sp->multimapped) parent_pte = sp->parent_pte; @@ -1010,21 +1235,59 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_put_page(sp, parent_pte); set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); } +} + +struct zap_walker { + struct kvm_unsync_walk walker; + struct kvm *kvm; + int zapped; +}; + +static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) +{ + struct zap_walker *zap_walk = container_of(walk, struct zap_walker, + walker); + kvm_mmu_zap_page(zap_walk->kvm, sp); + zap_walk->zapped = 1; + return 0; +} + +static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + struct zap_walker walker = { + .walker = { .entry = mmu_zap_fn, }, + .kvm = kvm, + .zapped = 0, + }; + + if (sp->role.level == PT_PAGE_TABLE_LEVEL) + return 0; + mmu_unsync_walk(sp, &walker.walker); + return walker.zapped; +} + +static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + int ret; + ++kvm->stat.mmu_shadow_zapped; + ret = mmu_zap_unsync_children(kvm, sp); kvm_mmu_page_unlink_children(kvm, sp); + kvm_mmu_unlink_parents(kvm, sp); + kvm_flush_remote_tlbs(kvm); + if (!sp->role.invalid && !sp->role.metaphysical) + unaccount_shadowed(kvm, sp->gfn); + if (sp->unsync) + kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { - if (!sp->role.metaphysical && !sp->role.invalid) - unaccount_shadowed(kvm, sp->gfn); hlist_del(&sp->hash_link); kvm_mmu_free_page(kvm, sp); } else { - int invalid = sp->role.invalid; - list_move(&sp->link, &kvm->arch.active_mmu_pages); sp->role.invalid = 1; + list_move(&sp->link, &kvm->arch.active_mmu_pages); kvm_reload_remote_mmus(kvm); - if (!sp->role.metaphysical && !invalid) - unaccount_shadowed(kvm, sp->gfn); } kvm_mmu_reset_last_pte_updated(kvm); + return ret; } /* @@ -1077,8 +1340,9 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) if (sp->gfn == gfn && !sp->role.metaphysical) { pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); - kvm_mmu_zap_page(kvm, sp); r = 1; + if (kvm_mmu_zap_page(kvm, sp)) + n = bucket->first; } return r; } @@ -1101,6 +1365,20 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) __set_bit(slot, &sp->slot_bitmap); } +static void mmu_convert_notrap(struct kvm_mmu_page *sp) +{ + int i; + u64 *pt = sp->spt; + + if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte) + return; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (pt[i] == shadow_notrap_nonpresent_pte) + set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte); + } +} + struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) { struct page *page; @@ -1110,51 +1388,60 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) if (gpa == UNMAPPED_GVA) return NULL; - down_read(¤t->mm->mmap_sem); page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); return page; } -static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, - unsigned pt_access, unsigned pte_access, - int user_fault, int write_fault, int dirty, - int *ptwrite, int largepage, gfn_t gfn, - pfn_t pfn, bool speculative) +static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - u64 spte; - int was_rmapped = 0; - int was_writeble = is_writeble_pte(*shadow_pte); + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *s; + struct hlist_node *node, *n; - pgprintk("%s: spte %llx access %x write_fault %d" - " user_fault %d gfn %lx\n", - __func__, *shadow_pte, pt_access, - write_fault, user_fault, gfn); + index = kvm_page_table_hashfn(sp->gfn); + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + /* don't unsync if pagetable is shadowed with multiple roles */ + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { + if (s->gfn != sp->gfn || s->role.metaphysical) + continue; + if (s->role.word != sp->role.word) + return 1; + } + kvm_mmu_mark_parents_unsync(vcpu, sp); + ++vcpu->kvm->stat.mmu_unsync; + sp->unsync = 1; + mmu_convert_notrap(sp); + return 0; +} - if (is_rmap_pte(*shadow_pte)) { - /* - * If we overwrite a PTE page pointer with a 2MB PMD, unlink - * the parent of the now unreachable PTE. - */ - if (largepage && !is_large_pte(*shadow_pte)) { - struct kvm_mmu_page *child; - u64 pte = *shadow_pte; +static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync) +{ + struct kvm_mmu_page *shadow; - child = page_header(pte & PT64_BASE_ADDR_MASK); - mmu_page_remove_parent_pte(child, shadow_pte); - } else if (pfn != spte_to_pfn(*shadow_pte)) { - pgprintk("hfn old %lx new %lx\n", - spte_to_pfn(*shadow_pte), pfn); - rmap_remove(vcpu->kvm, shadow_pte); - } else { - if (largepage) - was_rmapped = is_large_pte(*shadow_pte); - else - was_rmapped = 1; - } + shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); + if (shadow) { + if (shadow->role.level != PT_PAGE_TABLE_LEVEL) + return 1; + if (shadow->unsync) + return 0; + if (can_unsync && oos_shadow) + return kvm_unsync_page(vcpu, shadow); + return 1; } + return 0; +} +static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, + unsigned pte_access, int user_fault, + int write_fault, int dirty, int largepage, + gfn_t gfn, pfn_t pfn, bool speculative, + bool can_unsync) +{ + u64 spte; + int ret = 0; /* * We don't set the accessed bit, since we sometimes want to see * whether the guest actually used the pte (in order to detect @@ -1162,7 +1449,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, */ spte = shadow_base_present_pte | shadow_dirty_mask; if (!speculative) - pte_access |= PT_ACCESSED_MASK; + spte |= shadow_accessed_mask; if (!dirty) pte_access &= ~ACC_WRITE_MASK; if (pte_access & ACC_EXEC_MASK) @@ -1178,35 +1465,82 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, if ((pte_access & ACC_WRITE_MASK) || (write_fault && !is_write_protection(vcpu) && !user_fault)) { - struct kvm_mmu_page *shadow; + + if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) { + ret = 1; + spte = shadow_trap_nonpresent_pte; + goto set_pte; + } spte |= PT_WRITABLE_MASK; - shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); - if (shadow || - (largepage && has_wrprotected_page(vcpu->kvm, gfn))) { + if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { pgprintk("%s: found shadow page for %lx, marking ro\n", __func__, gfn); + ret = 1; pte_access &= ~ACC_WRITE_MASK; - if (is_writeble_pte(spte)) { + if (is_writeble_pte(spte)) spte &= ~PT_WRITABLE_MASK; - kvm_x86_ops->tlb_flush(vcpu); - } - if (write_fault) - *ptwrite = 1; } } if (pte_access & ACC_WRITE_MASK) mark_page_dirty(vcpu->kvm, gfn); - pgprintk("%s: setting spte %llx\n", __func__, spte); - pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", - (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB", - (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte); +set_pte: set_shadow_pte(shadow_pte, spte); - if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK) - && (spte & PT_PRESENT_MASK)) + return ret; +} + +static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, + unsigned pt_access, unsigned pte_access, + int user_fault, int write_fault, int dirty, + int *ptwrite, int largepage, gfn_t gfn, + pfn_t pfn, bool speculative) +{ + int was_rmapped = 0; + int was_writeble = is_writeble_pte(*shadow_pte); + + pgprintk("%s: spte %llx access %x write_fault %d" + " user_fault %d gfn %lx\n", + __func__, *shadow_pte, pt_access, + write_fault, user_fault, gfn); + + if (is_rmap_pte(*shadow_pte)) { + /* + * If we overwrite a PTE page pointer with a 2MB PMD, unlink + * the parent of the now unreachable PTE. + */ + if (largepage && !is_large_pte(*shadow_pte)) { + struct kvm_mmu_page *child; + u64 pte = *shadow_pte; + + child = page_header(pte & PT64_BASE_ADDR_MASK); + mmu_page_remove_parent_pte(child, shadow_pte); + } else if (pfn != spte_to_pfn(*shadow_pte)) { + pgprintk("hfn old %lx new %lx\n", + spte_to_pfn(*shadow_pte), pfn); + rmap_remove(vcpu->kvm, shadow_pte); + } else { + if (largepage) + was_rmapped = is_large_pte(*shadow_pte); + else + was_rmapped = 1; + } + } + if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, + dirty, largepage, gfn, pfn, speculative, true)) { + if (write_fault) + *ptwrite = 1; + kvm_x86_ops->tlb_flush(vcpu); + } + + pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte); + pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", + is_large_pte(*shadow_pte)? "2MB" : "4kB", + is_present_pte(*shadow_pte)?"RW":"R", gfn, + *shadow_pte, shadow_pte); + if (!was_rmapped && is_large_pte(*shadow_pte)) ++vcpu->kvm->stat.lpages; page_header_update_slot(vcpu->kvm, shadow_pte, gfn); @@ -1230,54 +1564,67 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int largepage, gfn_t gfn, pfn_t pfn, - int level) -{ - hpa_t table_addr = vcpu->arch.mmu.root_hpa; - int pt_write = 0; - - for (; ; level--) { - u32 index = PT64_INDEX(v, level); - u64 *table; - - ASSERT(VALID_PAGE(table_addr)); - table = __va(table_addr); +struct direct_shadow_walk { + struct kvm_shadow_walk walker; + pfn_t pfn; + int write; + int largepage; + int pt_write; +}; - if (level == 1) { - mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, - 0, write, 1, &pt_write, 0, gfn, pfn, false); - return pt_write; - } +static int direct_map_entry(struct kvm_shadow_walk *_walk, + struct kvm_vcpu *vcpu, + u64 addr, u64 *sptep, int level) +{ + struct direct_shadow_walk *walk = + container_of(_walk, struct direct_shadow_walk, walker); + struct kvm_mmu_page *sp; + gfn_t pseudo_gfn; + gfn_t gfn = addr >> PAGE_SHIFT; + + if (level == PT_PAGE_TABLE_LEVEL + || (walk->largepage && level == PT_DIRECTORY_LEVEL)) { + mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, + 0, walk->write, 1, &walk->pt_write, + walk->largepage, gfn, walk->pfn, false); + ++vcpu->stat.pf_fixed; + return 1; + } - if (largepage && level == 2) { - mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, - 0, write, 1, &pt_write, 1, gfn, pfn, false); - return pt_write; + if (*sptep == shadow_trap_nonpresent_pte) { + pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; + sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1, + 1, ACC_ALL, sptep); + if (!sp) { + pgprintk("nonpaging_map: ENOMEM\n"); + kvm_release_pfn_clean(walk->pfn); + return -ENOMEM; } - if (table[index] == shadow_trap_nonpresent_pte) { - struct kvm_mmu_page *new_table; - gfn_t pseudo_gfn; - - pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) - >> PAGE_SHIFT; - new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, - v, level - 1, - 1, ACC_ALL, &table[index]); - if (!new_table) { - pgprintk("nonpaging_map: ENOMEM\n"); - kvm_release_pfn_clean(pfn); - return -ENOMEM; - } - - set_shadow_pte(&table[index], - __pa(new_table->spt) - | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask); - } - table_addr = table[index] & PT64_BASE_ADDR_MASK; + set_shadow_pte(sptep, + __pa(sp->spt) + | PT_PRESENT_MASK | PT_WRITABLE_MASK + | shadow_user_mask | shadow_x_mask); } + return 0; +} + +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, + int largepage, gfn_t gfn, pfn_t pfn) +{ + int r; + struct direct_shadow_walk walker = { + .walker = { .entry = direct_map_entry, }, + .pfn = pfn, + .largepage = largepage, + .write = write, + .pt_write = 0, + }; + + r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT); + if (r < 0) + return r; + return walker.pt_write; } static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) @@ -1287,16 +1634,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) pfn_t pfn; unsigned long mmu_seq; - down_read(¤t->mm->mmap_sem); if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { gfn &= ~(KVM_PAGES_PER_HPAGE-1); largepage = 1; } mmu_seq = vcpu->kvm->mmu_notifier_seq; - /* implicit mb(), we'll read before PT lock is unlocked */ + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - up_read(¤t->mm->mmap_sem); /* mmio */ if (is_error_pfn(pfn)) { @@ -1308,8 +1653,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); - r = __direct_map(vcpu, v, write, largepage, gfn, pfn, - PT32E_ROOT_LEVEL); + r = __direct_map(vcpu, v, write, largepage, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); @@ -1405,6 +1749,37 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); } +static void mmu_sync_roots(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); + mmu_sync_children(vcpu, sp); + return; + } + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + mmu_sync_children(vcpu, sp); + } + } +} + +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->kvm->mmu_lock); + mmu_sync_roots(vcpu); + spin_unlock(&vcpu->kvm->mmu_lock); +} + static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) { return vaddr; @@ -1446,15 +1821,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, if (r) return r; - down_read(¤t->mm->mmap_sem); if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { gfn &= ~(KVM_PAGES_PER_HPAGE-1); largepage = 1; } mmu_seq = vcpu->kvm->mmu_notifier_seq; - /* implicit mb(), we'll read before PT lock is unlocked */ + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - up_read(¤t->mm->mmap_sem); if (is_error_pfn(pfn)) { kvm_release_pfn_clean(pfn); return 1; @@ -1464,7 +1837,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, goto out_unlock; kvm_mmu_free_some_pages(vcpu); r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, - largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); + largepage, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); return r; @@ -1489,6 +1862,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) context->gva_to_gpa = nonpaging_gva_to_gpa; context->free = nonpaging_free; context->prefetch_page = nonpaging_prefetch_page; + context->sync_page = nonpaging_sync_page; + context->invlpg = nonpaging_invlpg; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -1536,6 +1911,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->prefetch_page = paging64_prefetch_page; + context->sync_page = paging64_sync_page; + context->invlpg = paging64_invlpg; context->free = paging_free; context->root_level = level; context->shadow_root_level = level; @@ -1557,6 +1934,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) context->gva_to_gpa = paging32_gva_to_gpa; context->free = paging_free; context->prefetch_page = paging32_prefetch_page; + context->sync_page = paging32_sync_page; + context->invlpg = paging32_invlpg; context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -1576,6 +1955,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->page_fault = tdp_page_fault; context->free = nonpaging_free; context->prefetch_page = nonpaging_prefetch_page; + context->sync_page = nonpaging_sync_page; + context->invlpg = nonpaging_invlpg; context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; @@ -1647,6 +2028,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); mmu_alloc_roots(vcpu); + mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); kvm_mmu_flush_tlb(vcpu); @@ -1767,15 +2149,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, return; gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; - down_read(¤t->mm->mmap_sem); if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { gfn &= ~(KVM_PAGES_PER_HPAGE-1); vcpu->arch.update_pte.largepage = 1; } vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; - /* implicit mb(), we'll read before PT lock is unlocked */ + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - up_read(¤t->mm->mmap_sem); if (is_error_pfn(pfn)) { kvm_release_pfn_clean(pfn); @@ -1837,7 +2217,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { - if (sp->gfn != gfn || sp->role.metaphysical) + if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid) continue; pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); @@ -1855,7 +2235,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, */ pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); - kvm_mmu_zap_page(vcpu->kvm, sp); + if (kvm_mmu_zap_page(vcpu->kvm, sp)) + n = bucket->first; ++vcpu->kvm->stat.mmu_flooded; continue; } @@ -1969,6 +2350,16 @@ out: } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); +void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +{ + spin_lock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.invlpg(vcpu, gva); + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_mmu_flush_tlb(vcpu); + ++vcpu->stat.invlpg; +} +EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); + void kvm_enable_tdp(void) { tdp_enabled = true; @@ -2055,6 +2446,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) { struct kvm_mmu_page *sp; + spin_lock(&kvm->mmu_lock); list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { int i; u64 *pt; @@ -2068,6 +2460,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) if (pt[i] & PT_WRITABLE_MASK) pt[i] &= ~PT_WRITABLE_MASK; } + kvm_flush_remote_tlbs(kvm); + spin_unlock(&kvm->mmu_lock); } void kvm_mmu_zap_all(struct kvm *kvm) @@ -2076,7 +2470,9 @@ void kvm_mmu_zap_all(struct kvm *kvm) spin_lock(&kvm->mmu_lock); list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) - kvm_mmu_zap_page(kvm, sp); + if (kvm_mmu_zap_page(kvm, sp)) + node = container_of(kvm->arch.active_mmu_pages.next, + struct kvm_mmu_page, link); spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); @@ -2291,18 +2687,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, gpa_t addr, unsigned long *ret) { int r; - struct kvm_pv_mmu_op_buffer buffer; + struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer; - buffer.ptr = buffer.buf; - buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf); - buffer.processed = 0; + buffer->ptr = buffer->buf; + buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf); + buffer->processed = 0; - r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len); + r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len); if (r) goto out; - while (buffer.len) { - r = kvm_pv_mmu_op_one(vcpu, &buffer); + while (buffer->len) { + r = kvm_pv_mmu_op_one(vcpu, buffer); if (r < 0) goto out; if (r == 0) @@ -2311,7 +2707,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, r = 1; out: - *ret = buffer.processed; + *ret = buffer->processed; return r; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 4a814bf..613ec9a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -25,11 +25,11 @@ #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 + #define shadow_walker shadow_walker64 #define FNAME(name) paging##64_##name #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) #define PT_LEVEL_BITS PT64_LEVEL_BITS #ifdef CONFIG_X86_64 @@ -42,11 +42,11 @@ #elif PTTYPE == 32 #define pt_element_t u32 #define guest_walker guest_walker32 + #define shadow_walker shadow_walker32 #define FNAME(name) paging##32_##name #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK #define PT_INDEX(addr, level) PT32_INDEX(addr, level) - #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 @@ -73,6 +73,17 @@ struct guest_walker { u32 error_code; }; +struct shadow_walker { + struct kvm_shadow_walk walker; + struct guest_walker *guest_walker; + int user_fault; + int write_fault; + int largepage; + int *ptwrite; + pfn_t pfn; + u64 *sptep; +}; + static gfn_t gpte_to_gfn(pt_element_t gpte) { return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -91,14 +102,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, pt_element_t *table; struct page *page; - down_read(¤t->mm->mmap_sem); page = gfn_to_page(kvm, table_gfn); - up_read(¤t->mm->mmap_sem); table = kmap_atomic(page, KM_USER0); - ret = CMPXCHG(&table[index], orig_pte, new_pte); - kunmap_atomic(table, KM_USER0); kvm_release_page_dirty(page); @@ -274,86 +281,89 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, /* * Fetch a shadow pte for a specific level in the paging hierarchy. */ -static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, - struct guest_walker *walker, - int user_fault, int write_fault, int largepage, - int *ptwrite, pfn_t pfn) +static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, + struct kvm_vcpu *vcpu, u64 addr, + u64 *sptep, int level) { - hpa_t shadow_addr; - int level; - u64 *shadow_ent; - unsigned access = walker->pt_access; - - if (!is_present_pte(walker->ptes[walker->level - 1])) - return NULL; - - shadow_addr = vcpu->arch.mmu.root_hpa; - level = vcpu->arch.mmu.shadow_root_level; - if (level == PT32E_ROOT_LEVEL) { - shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; - shadow_addr &= PT64_BASE_ADDR_MASK; - --level; + struct shadow_walker *sw = + container_of(_sw, struct shadow_walker, walker); + struct guest_walker *gw = sw->guest_walker; + unsigned access = gw->pt_access; + struct kvm_mmu_page *shadow_page; + u64 spte; + int metaphysical; + gfn_t table_gfn; + int r; + pt_element_t curr_pte; + + if (level == PT_PAGE_TABLE_LEVEL + || (sw->largepage && level == PT_DIRECTORY_LEVEL)) { + mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, + sw->user_fault, sw->write_fault, + gw->ptes[gw->level-1] & PT_DIRTY_MASK, + sw->ptwrite, sw->largepage, gw->gfn, sw->pfn, + false); + sw->sptep = sptep; + return 1; } - for (; ; level--) { - u32 index = SHADOW_PT_INDEX(addr, level); - struct kvm_mmu_page *shadow_page; - u64 shadow_pte; - int metaphysical; - gfn_t table_gfn; - - shadow_ent = ((u64 *)__va(shadow_addr)) + index; - if (level == PT_PAGE_TABLE_LEVEL) - break; - - if (largepage && level == PT_DIRECTORY_LEVEL) - break; + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) + return 0; - if (is_shadow_present_pte(*shadow_ent) - && !is_large_pte(*shadow_ent)) { - shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; - continue; - } + if (is_large_pte(*sptep)) { + set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + rmap_remove(vcpu->kvm, sptep); + } - if (is_large_pte(*shadow_ent)) - rmap_remove(vcpu->kvm, shadow_ent); - - if (level - 1 == PT_PAGE_TABLE_LEVEL - && walker->level == PT_DIRECTORY_LEVEL) { - metaphysical = 1; - if (!is_dirty_pte(walker->ptes[level - 1])) - access &= ~ACC_WRITE_MASK; - table_gfn = gpte_to_gfn(walker->ptes[level - 1]); - } else { - metaphysical = 0; - table_gfn = walker->table_gfn[level - 2]; - } - shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - metaphysical, access, - shadow_ent); - if (!metaphysical) { - int r; - pt_element_t curr_pte; - r = kvm_read_guest_atomic(vcpu->kvm, - walker->pte_gpa[level - 2], - &curr_pte, sizeof(curr_pte)); - if (r || curr_pte != walker->ptes[level - 2]) { - kvm_release_pfn_clean(pfn); - return NULL; - } + if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) { + metaphysical = 1; + if (!is_dirty_pte(gw->ptes[level - 1])) + access &= ~ACC_WRITE_MASK; + table_gfn = gpte_to_gfn(gw->ptes[level - 1]); + } else { + metaphysical = 0; + table_gfn = gw->table_gfn[level - 2]; + } + shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1, + metaphysical, access, sptep); + if (!metaphysical) { + r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2], + &curr_pte, sizeof(curr_pte)); + if (r || curr_pte != gw->ptes[level - 2]) { + kvm_release_pfn_clean(sw->pfn); + sw->sptep = NULL; + return 1; } - shadow_addr = __pa(shadow_page->spt); - shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - set_shadow_pte(shadow_ent, shadow_pte); } - mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, - user_fault, write_fault, - walker->ptes[walker->level-1] & PT_DIRTY_MASK, - ptwrite, largepage, walker->gfn, pfn, false); + spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + *sptep = spte; + return 0; +} + +static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + struct guest_walker *guest_walker, + int user_fault, int write_fault, int largepage, + int *ptwrite, pfn_t pfn) +{ + struct shadow_walker walker = { + .walker = { .entry = FNAME(shadow_walk_entry), }, + .guest_walker = guest_walker, + .user_fault = user_fault, + .write_fault = write_fault, + .largepage = largepage, + .ptwrite = ptwrite, + .pfn = pfn, + }; + + if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1])) + return NULL; + + walk_shadow(&walker.walker, vcpu, addr); - return shadow_ent; + return walker.sptep; } /* @@ -407,7 +417,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, return 0; } - down_read(¤t->mm->mmap_sem); if (walker.level == PT_DIRECTORY_LEVEL) { gfn_t large_gfn; large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); @@ -417,9 +426,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, } } mmu_seq = vcpu->kvm->mmu_notifier_seq; - /* implicit mb(), we'll read before PT lock is unlocked */ + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); - up_read(¤t->mm->mmap_sem); /* mmio */ if (is_error_pfn(pfn)) { @@ -453,6 +461,31 @@ out_unlock: return 0; } +static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw, + struct kvm_vcpu *vcpu, u64 addr, + u64 *sptep, int level) +{ + + if (level == PT_PAGE_TABLE_LEVEL) { + if (is_shadow_present_pte(*sptep)) + rmap_remove(vcpu->kvm, sptep); + set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + return 1; + } + if (!is_shadow_present_pte(*sptep)) + return 1; + return 0; +} + +static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) +{ + struct shadow_walker walker = { + .walker = { .entry = FNAME(shadow_invlpg_entry), }, + }; + + walk_shadow(&walker.walker, vcpu, gva); +} + static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) { struct guest_walker walker; @@ -499,12 +532,66 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, } } +/* + * Using the cached information from sp->gfns is safe because: + * - The spte has a reference to the struct page, so the pfn for a given gfn + * can't change unless all sptes pointing to it are nuked first. + * - Alias changes zap the entire shadow cache. + */ +static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + int i, offset, nr_present; + + offset = nr_present = 0; + + if (PTTYPE == 32) + offset = sp->role.quadrant << PT64_LEVEL_BITS; + + for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + unsigned pte_access; + pt_element_t gpte; + gpa_t pte_gpa; + gfn_t gfn = sp->gfns[i]; + + if (!is_shadow_present_pte(sp->spt[i])) + continue; + + pte_gpa = gfn_to_gpa(sp->gfn); + pte_gpa += (i+offset) * sizeof(pt_element_t); + + if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, + sizeof(pt_element_t))) + return -EINVAL; + + if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) || + !(gpte & PT_ACCESSED_MASK)) { + u64 nonpresent; + + rmap_remove(vcpu->kvm, &sp->spt[i]); + if (is_present_pte(gpte)) + nonpresent = shadow_trap_nonpresent_pte; + else + nonpresent = shadow_notrap_nonpresent_pte; + set_shadow_pte(&sp->spt[i], nonpresent); + continue; + } + + nr_present++; + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, + is_dirty_pte(gpte), 0, gfn, + spte_to_pfn(sp->spt[i]), true, false); + } + + return !nr_present; +} + #undef pt_element_t #undef guest_walker +#undef shadow_walker #undef FNAME #undef PT_BASE_ADDR_MASK #undef PT_INDEX -#undef SHADOW_PT_INDEX #undef PT_LEVEL_MASK #undef PT_DIR_BASE_ADDR_MASK #undef PT_LEVEL_BITS diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8233b86..9c4ce65 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -18,6 +18,7 @@ #include "kvm_svm.h" #include "irq.h" #include "mmu.h" +#include "kvm_cache_regs.h" #include <linux/module.h> #include <linux/kernel.h> @@ -35,10 +36,6 @@ MODULE_LICENSE("GPL"); #define IOPM_ALLOC_ORDER 2 #define MSRPM_ALLOC_ORDER 1 -#define DB_VECTOR 1 -#define UD_VECTOR 6 -#define GP_VECTOR 13 - #define DR7_GD_MASK (1 << 13) #define DR6_BD_MASK (1 << 13) @@ -47,7 +44,7 @@ MODULE_LICENSE("GPL"); #define SVM_FEATURE_NPT (1 << 0) #define SVM_FEATURE_LBRV (1 << 1) -#define SVM_DEATURE_SVML (1 << 2) +#define SVM_FEATURE_SVML (1 << 2) #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) @@ -236,13 +233,11 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) printk(KERN_DEBUG "%s: NOP\n", __func__); return; } - if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) - printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", - __func__, - svm->vmcb->save.rip, - svm->next_rip); + if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) + printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n", + __func__, kvm_rip_read(vcpu), svm->next_rip); - vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; + kvm_rip_write(vcpu, svm->next_rip); svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; vcpu->arch.interrupt_window_open = 1; @@ -530,6 +525,7 @@ static void init_vmcb(struct vcpu_svm *svm) (1ULL << INTERCEPT_CPUID) | (1ULL << INTERCEPT_INVD) | (1ULL << INTERCEPT_HLT) | + (1ULL << INTERCEPT_INVLPG) | (1ULL << INTERCEPT_INVLPGA) | (1ULL << INTERCEPT_IOIO_PROT) | (1ULL << INTERCEPT_MSR_PROT) | @@ -581,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm) save->dr7 = 0x400; save->rflags = 2; save->rip = 0x0000fff0; + svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; /* * cr0 val on cpu init should be 0x60000010, we enable cpu @@ -593,7 +590,8 @@ static void init_vmcb(struct vcpu_svm *svm) if (npt_enabled) { /* Setup VMCB for Nested Paging */ control->nested_ctl = 1; - control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH); + control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | + (1ULL << INTERCEPT_INVLPG)); control->intercept_exceptions &= ~(1 << PF_VECTOR); control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| INTERCEPT_CR3_MASK); @@ -615,10 +613,12 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) init_vmcb(svm); if (vcpu->vcpu_id != 0) { - svm->vmcb->save.rip = 0; + kvm_rip_write(vcpu, 0); svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; } + vcpu->arch.regs_avail = ~0; + vcpu->arch.regs_dirty = ~0; return 0; } @@ -721,23 +721,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) rdtscll(vcpu->arch.host_tsc); } -static void svm_cache_regs(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; - vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; - vcpu->arch.rip = svm->vmcb->save.rip; -} - -static void svm_decache_regs(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; - svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - svm->vmcb->save.rip = vcpu->arch.rip; -} - static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) { return to_svm(vcpu)->vmcb->save.rflags; @@ -1040,7 +1023,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) if (npt_enabled) svm_flush_tlb(&svm->vcpu); - if (event_injection) + if (!npt_enabled && event_injection) kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); } @@ -1139,14 +1122,14 @@ static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - svm->next_rip = svm->vmcb->save.rip + 1; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; skip_emulated_instruction(&svm->vcpu); return kvm_emulate_halt(&svm->vcpu); } static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - svm->next_rip = svm->vmcb->save.rip + 3; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); kvm_emulate_hypercall(&svm->vcpu); return 1; @@ -1178,11 +1161,18 @@ static int task_switch_interception(struct vcpu_svm *svm, static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - svm->next_rip = svm->vmcb->save.rip + 2; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; kvm_emulate_cpuid(&svm->vcpu); return 1; } +static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) + pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); + return 1; +} + static int emulate_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { @@ -1273,9 +1263,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), handler); - svm->vmcb->save.rax = data & 0xffffffff; + svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; - svm->next_rip = svm->vmcb->save.rip + 2; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; skip_emulated_instruction(&svm->vcpu); } return 1; @@ -1359,13 +1349,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; - u64 data = (svm->vmcb->save.rax & -1u) + u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), handler); - svm->next_rip = svm->vmcb->save.rip + 2; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; if (svm_set_msr(&svm->vcpu, ecx, data)) kvm_inject_gp(&svm->vcpu, 0); else @@ -1436,7 +1426,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_CPUID] = cpuid_interception, [SVM_EXIT_INVD] = emulate_on_interception, [SVM_EXIT_HLT] = halt_interception, - [SVM_EXIT_INVLPG] = emulate_on_interception, + [SVM_EXIT_INVLPG] = invlpg_interception, [SVM_EXIT_INVLPGA] = invalid_op_interception, [SVM_EXIT_IOIO] = io_interception, [SVM_EXIT_MSR] = msr_interception, @@ -1538,6 +1528,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); + ++svm->vcpu.stat.irq_injections; control = &svm->vmcb->control; control->int_vector = irq; control->int_ctl &= ~V_INTR_PRIO_MASK; @@ -1716,6 +1707,12 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; } +#ifdef CONFIG_X86_64 +#define R "r" +#else +#define R "e" +#endif + static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1723,6 +1720,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) u16 gs_selector; u16 ldt_selector; + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; + svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; + pre_svm_run(svm); sync_lapic_to_cr8(vcpu); @@ -1750,19 +1751,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) local_irq_enable(); asm volatile ( + "push %%"R"bp; \n\t" + "mov %c[rbx](%[svm]), %%"R"bx \n\t" + "mov %c[rcx](%[svm]), %%"R"cx \n\t" + "mov %c[rdx](%[svm]), %%"R"dx \n\t" + "mov %c[rsi](%[svm]), %%"R"si \n\t" + "mov %c[rdi](%[svm]), %%"R"di \n\t" + "mov %c[rbp](%[svm]), %%"R"bp \n\t" #ifdef CONFIG_X86_64 - "push %%rbp; \n\t" -#else - "push %%ebp; \n\t" -#endif - -#ifdef CONFIG_X86_64 - "mov %c[rbx](%[svm]), %%rbx \n\t" - "mov %c[rcx](%[svm]), %%rcx \n\t" - "mov %c[rdx](%[svm]), %%rdx \n\t" - "mov %c[rsi](%[svm]), %%rsi \n\t" - "mov %c[rdi](%[svm]), %%rdi \n\t" - "mov %c[rbp](%[svm]), %%rbp \n\t" "mov %c[r8](%[svm]), %%r8 \n\t" "mov %c[r9](%[svm]), %%r9 \n\t" "mov %c[r10](%[svm]), %%r10 \n\t" @@ -1771,41 +1767,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %c[r13](%[svm]), %%r13 \n\t" "mov %c[r14](%[svm]), %%r14 \n\t" "mov %c[r15](%[svm]), %%r15 \n\t" -#else - "mov %c[rbx](%[svm]), %%ebx \n\t" - "mov %c[rcx](%[svm]), %%ecx \n\t" - "mov %c[rdx](%[svm]), %%edx \n\t" - "mov %c[rsi](%[svm]), %%esi \n\t" - "mov %c[rdi](%[svm]), %%edi \n\t" - "mov %c[rbp](%[svm]), %%ebp \n\t" #endif -#ifdef CONFIG_X86_64 - /* Enter guest mode */ - "push %%rax \n\t" - "mov %c[vmcb](%[svm]), %%rax \n\t" - __ex(SVM_VMLOAD) "\n\t" - __ex(SVM_VMRUN) "\n\t" - __ex(SVM_VMSAVE) "\n\t" - "pop %%rax \n\t" -#else /* Enter guest mode */ - "push %%eax \n\t" - "mov %c[vmcb](%[svm]), %%eax \n\t" + "push %%"R"ax \n\t" + "mov %c[vmcb](%[svm]), %%"R"ax \n\t" __ex(SVM_VMLOAD) "\n\t" __ex(SVM_VMRUN) "\n\t" __ex(SVM_VMSAVE) "\n\t" - "pop %%eax \n\t" -#endif + "pop %%"R"ax \n\t" /* Save guest registers, load host registers */ + "mov %%"R"bx, %c[rbx](%[svm]) \n\t" + "mov %%"R"cx, %c[rcx](%[svm]) \n\t" + "mov %%"R"dx, %c[rdx](%[svm]) \n\t" + "mov %%"R"si, %c[rsi](%[svm]) \n\t" + "mov %%"R"di, %c[rdi](%[svm]) \n\t" + "mov %%"R"bp, %c[rbp](%[svm]) \n\t" #ifdef CONFIG_X86_64 - "mov %%rbx, %c[rbx](%[svm]) \n\t" - "mov %%rcx, %c[rcx](%[svm]) \n\t" - "mov %%rdx, %c[rdx](%[svm]) \n\t" - "mov %%rsi, %c[rsi](%[svm]) \n\t" - "mov %%rdi, %c[rdi](%[svm]) \n\t" - "mov %%rbp, %c[rbp](%[svm]) \n\t" "mov %%r8, %c[r8](%[svm]) \n\t" "mov %%r9, %c[r9](%[svm]) \n\t" "mov %%r10, %c[r10](%[svm]) \n\t" @@ -1814,18 +1793,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %%r13, %c[r13](%[svm]) \n\t" "mov %%r14, %c[r14](%[svm]) \n\t" "mov %%r15, %c[r15](%[svm]) \n\t" - - "pop %%rbp; \n\t" -#else - "mov %%ebx, %c[rbx](%[svm]) \n\t" - "mov %%ecx, %c[rcx](%[svm]) \n\t" - "mov %%edx, %c[rdx](%[svm]) \n\t" - "mov %%esi, %c[rsi](%[svm]) \n\t" - "mov %%edi, %c[rdi](%[svm]) \n\t" - "mov %%ebp, %c[rbp](%[svm]) \n\t" - - "pop %%ebp; \n\t" #endif + "pop %%"R"bp" : : [svm]"a"(svm), [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), @@ -1846,11 +1815,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) #endif : "cc", "memory" + , R"bx", R"cx", R"dx", R"si", R"di" #ifdef CONFIG_X86_64 - , "rbx", "rcx", "rdx", "rsi", "rdi" , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" -#else - , "ebx", "ecx", "edx" , "esi", "edi" #endif ); @@ -1858,6 +1825,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) load_db_regs(svm->host_db_regs); vcpu->arch.cr2 = svm->vmcb->save.cr2; + vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; + vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; + vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; write_dr6(svm->host_dr6); write_dr7(svm->host_dr7); @@ -1879,6 +1849,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) svm->next_rip = 0; } +#undef R + static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1977,8 +1949,6 @@ static struct kvm_x86_ops svm_x86_ops = { .set_gdt = svm_set_gdt, .get_dr = svm_get_dr, .set_dr = svm_set_dr, - .cache_regs = svm_cache_regs, - .decache_regs = svm_decache_regs, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7041cc5..2643b43 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -26,6 +26,8 @@ #include <linux/highmem.h> #include <linux/sched.h> #include <linux/moduleparam.h> +#include "kvm_cache_regs.h" +#include "x86.h" #include <asm/io.h> #include <asm/desc.h> @@ -47,6 +49,9 @@ module_param(flexpriority_enabled, bool, 0); static int enable_ept = 1; module_param(enable_ept, bool, 0); +static int emulate_invalid_guest_state = 0; +module_param(emulate_invalid_guest_state, bool, 0); + struct vmcs { u32 revision_id; u32 abort; @@ -56,6 +61,7 @@ struct vmcs { struct vcpu_vmx { struct kvm_vcpu vcpu; struct list_head local_vcpus_link; + unsigned long host_rsp; int launched; u8 fail; u32 idt_vectoring_info; @@ -83,6 +89,7 @@ struct vcpu_vmx { } irq; } rmode; int vpid; + bool emulation_required; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -468,7 +475,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) if (!vcpu->fpu_active) eb |= 1u << NM_VECTOR; if (vcpu->guest_debug.enabled) - eb |= 1u << 1; + eb |= 1u << DB_VECTOR; if (vcpu->arch.rmode.active) eb = ~0; if (vm_need_ept()) @@ -715,9 +722,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) unsigned long rip; u32 interruptibility; - rip = vmcs_readl(GUEST_RIP); + rip = kvm_rip_read(vcpu); rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - vmcs_writel(GUEST_RIP, rip); + kvm_rip_write(vcpu, rip); /* * We emulated an instruction, so temporary interrupt blocking @@ -733,19 +740,35 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error_code, u32 error_code) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (has_error_code) + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); + + if (vcpu->arch.rmode.active) { + vmx->rmode.irq.pending = true; + vmx->rmode.irq.vector = nr; + vmx->rmode.irq.rip = kvm_rip_read(vcpu); + if (nr == BP_VECTOR) + vmx->rmode.irq.rip++; + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + nr | INTR_TYPE_SOFT_INTR + | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) + | INTR_INFO_VALID_MASK); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); + kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); + return; + } + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, nr | INTR_TYPE_EXCEPTION | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) | INTR_INFO_VALID_MASK); - if (has_error_code) - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); } static bool vmx_exception_injected(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + return false; } /* @@ -947,24 +970,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) return ret; } -/* - * Sync the rsp and rip registers into the vcpu structure. This allows - * registers to be accessed by indexing vcpu->arch.regs. - */ -static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) -{ - vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); - vcpu->arch.rip = vmcs_readl(GUEST_RIP); -} - -/* - * Syncs rsp and rip back into the vmcs. Should be called after possible - * modification. - */ -static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) +static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { - vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); - vmcs_writel(GUEST_RIP, vcpu->arch.rip); + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + switch (reg) { + case VCPU_REGS_RSP: + vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); + break; + case VCPU_REGS_RIP: + vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); + break; + default: + break; + } } static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) @@ -1007,17 +1025,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) static int vmx_get_irq(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 idtv_info_field; - - idtv_info_field = vmx->idt_vectoring_info; - if (idtv_info_field & INTR_INFO_VALID_MASK) { - if (is_external_interrupt(idtv_info_field)) - return idtv_info_field & VECTORING_INFO_VECTOR_MASK; - else - printk(KERN_DEBUG "pending exception: not handled yet\n"); - } - return -1; + if (!vcpu->arch.interrupt.pending) + return -1; + return vcpu->arch.interrupt.nr; } static __init int cpu_has_kvm_support(void) @@ -1031,9 +1041,9 @@ static __init int vmx_disabled_by_bios(void) u64 msr; rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); - return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) - == MSR_IA32_FEATURE_CONTROL_LOCKED; + return (msr & (FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED)) + == FEATURE_CONTROL_LOCKED; /* locked but not enabled */ } @@ -1045,14 +1055,14 @@ static void hardware_enable(void *garbage) INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); rdmsrl(MSR_IA32_FEATURE_CONTROL, old); - if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) - != (MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) + if ((old & (FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED)) + != (FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED)) /* enable and lock */ wrmsrl(MSR_IA32_FEATURE_CONTROL, old | - MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); + FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED); write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) @@ -1120,7 +1130,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_CR3_STORE_EXITING | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | - CPU_BASED_USE_TSC_OFFSETING; + CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_INVLPG_EXITING; opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; @@ -1149,9 +1160,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; #endif if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { - /* CR3 accesses don't need to cause VM Exits when EPT enabled */ + /* CR3 accesses and invlpg don't need to cause VM Exits when EPT + enabled */ min &= ~(CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING); + CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_INVLPG_EXITING); if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &_cpu_based_exec_control) < 0) return -EIO; @@ -1288,7 +1301,9 @@ static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) static void enter_pmode(struct kvm_vcpu *vcpu) { unsigned long flags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + vmx->emulation_required = 1; vcpu->arch.rmode.active = 0; vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); @@ -1305,6 +1320,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu) update_exception_bitmap(vcpu); + if (emulate_invalid_guest_state) + return; + fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); @@ -1345,7 +1363,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save) static void enter_rmode(struct kvm_vcpu *vcpu) { unsigned long flags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + vmx->emulation_required = 1; vcpu->arch.rmode.active = 1; vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); @@ -1367,6 +1387,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); update_exception_bitmap(vcpu); + if (emulate_invalid_guest_state) + goto continue_rmode; + vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); vmcs_write32(GUEST_SS_LIMIT, 0xffff); vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); @@ -1382,6 +1405,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); +continue_rmode: kvm_mmu_reset_context(vcpu); init_rmode(vcpu->kvm); } @@ -1715,6 +1739,186 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) vmcs_writel(GUEST_GDTR_BASE, dt->base); } +static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment var; + u32 ar; + + vmx_get_segment(vcpu, &var, seg); + ar = vmx_segment_access_rights(&var); + + if (var.base != (var.selector << 4)) + return false; + if (var.limit != 0xffff) + return false; + if (ar != 0xf3) + return false; + + return true; +} + +static bool code_segment_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs; + unsigned int cs_rpl; + + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); + cs_rpl = cs.selector & SELECTOR_RPL_MASK; + + if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK)) + return false; + if (!cs.s) + return false; + if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) { + if (cs.dpl > cs_rpl) + return false; + } else if (cs.type & AR_TYPE_CODE_MASK) { + if (cs.dpl != cs_rpl) + return false; + } + if (!cs.present) + return false; + + /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ + return true; +} + +static bool stack_segment_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment ss; + unsigned int ss_rpl; + + vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); + ss_rpl = ss.selector & SELECTOR_RPL_MASK; + + if ((ss.type != 3) || (ss.type != 7)) + return false; + if (!ss.s) + return false; + if (ss.dpl != ss_rpl) /* DPL != RPL */ + return false; + if (!ss.present) + return false; + + return true; +} + +static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment var; + unsigned int rpl; + + vmx_get_segment(vcpu, &var, seg); + rpl = var.selector & SELECTOR_RPL_MASK; + + if (!var.s) + return false; + if (!var.present) + return false; + if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) { + if (var.dpl < rpl) /* DPL < RPL */ + return false; + } + + /* TODO: Add other members to kvm_segment_field to allow checking for other access + * rights flags + */ + return true; +} + +static bool tr_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment tr; + + vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); + + if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */ + return false; + if ((tr.type != 3) || (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */ + return false; + if (!tr.present) + return false; + + return true; +} + +static bool ldtr_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment ldtr; + + vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); + + if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */ + return false; + if (ldtr.type != 2) + return false; + if (!ldtr.present) + return false; + + return true; +} + +static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs, ss; + + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); + vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); + + return ((cs.selector & SELECTOR_RPL_MASK) == + (ss.selector & SELECTOR_RPL_MASK)); +} + +/* + * Check if guest state is valid. Returns true if valid, false if + * not. + * We assume that registers are always usable + */ +static bool guest_state_valid(struct kvm_vcpu *vcpu) +{ + /* real mode guest state checks */ + if (!(vcpu->arch.cr0 & X86_CR0_PE)) { + if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) + return false; + } else { + /* protected mode guest state checks */ + if (!cs_ss_rpl_check(vcpu)) + return false; + if (!code_segment_valid(vcpu)) + return false; + if (!stack_segment_valid(vcpu)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_DS)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_ES)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_FS)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_GS)) + return false; + if (!tr_valid(vcpu)) + return false; + if (!ldtr_valid(vcpu)) + return false; + } + /* TODO: + * - Add checks on RIP + * - Add checks on RFLAGS + */ + + return true; +} + static int init_rmode_tss(struct kvm *kvm) { gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; @@ -1726,7 +1930,8 @@ static int init_rmode_tss(struct kvm *kvm) if (r < 0) goto out; data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; - r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); + r = kvm_write_guest_page(kvm, fn++, &data, + TSS_IOPB_BASE_OFFSET, sizeof(u16)); if (r < 0) goto out; r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); @@ -1789,7 +1994,7 @@ static void seg_setup(int seg) vmcs_write16(sf->selector, 0); vmcs_writel(sf->base, 0); vmcs_write32(sf->limit, 0xffff); - vmcs_write32(sf->ar_bytes, 0x93); + vmcs_write32(sf->ar_bytes, 0xf3); } static int alloc_apic_access_page(struct kvm *kvm) @@ -1808,9 +2013,7 @@ static int alloc_apic_access_page(struct kvm *kvm) if (r) goto out; - down_read(¤t->mm->mmap_sem); kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); - up_read(¤t->mm->mmap_sem); out: up_write(&kvm->slots_lock); return r; @@ -1832,10 +2035,8 @@ static int alloc_identity_pagetable(struct kvm *kvm) if (r) goto out; - down_read(¤t->mm->mmap_sem); kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); out: up_write(&kvm->slots_lock); return r; @@ -1917,7 +2118,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) } if (!vm_need_ept()) exec_control |= CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_CR3_LOAD_EXITING; + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_INVLPG_EXITING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); if (cpu_has_secondary_exec_ctrls()) { @@ -2019,6 +2221,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) u64 msr; int ret; + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); down_read(&vcpu->kvm->slots_lock); if (!init_rmode(vmx->vcpu.kvm)) { ret = -ENOMEM; @@ -2036,6 +2239,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) fx_init(&vmx->vcpu); + seg_setup(VCPU_SREG_CS); /* * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. @@ -2047,8 +2251,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); } - vmcs_write32(GUEST_CS_LIMIT, 0xffff); - vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); seg_setup(VCPU_SREG_DS); seg_setup(VCPU_SREG_ES); @@ -2072,10 +2274,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RFLAGS, 0x02); if (vmx->vcpu.vcpu_id == 0) - vmcs_writel(GUEST_RIP, 0xfff0); + kvm_rip_write(vcpu, 0xfff0); else - vmcs_writel(GUEST_RIP, 0); - vmcs_writel(GUEST_RSP, 0); + kvm_rip_write(vcpu, 0); + kvm_register_write(vcpu, VCPU_REGS_RSP, 0); /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ vmcs_writel(GUEST_DR7, 0x400); @@ -2125,6 +2327,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) ret = 0; + /* HACK: Don't enable emulation on guest boot/reset */ + vmx->emulation_required = 0; + out: up_read(&vcpu->kvm->slots_lock); return ret; @@ -2136,14 +2341,15 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); + ++vcpu->stat.irq_injections; if (vcpu->arch.rmode.active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = irq; - vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); + vmx->rmode.irq.rip = kvm_rip_read(vcpu); vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); - vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); + kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); return; } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, @@ -2154,7 +2360,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) { vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); - vcpu->arch.nmi_pending = 0; } static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) @@ -2166,7 +2371,7 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); if (!vcpu->arch.irq_pending[word_index]) clear_bit(word_index, &vcpu->arch.irq_summary); - vmx_inject_irq(vcpu, irq); + kvm_queue_interrupt(vcpu, irq); } @@ -2180,13 +2385,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); if (vcpu->arch.interrupt_window_open && - vcpu->arch.irq_summary && - !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) - /* - * If interrupts enabled, and not blocked by sti or mov ss. Good. - */ + vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) kvm_do_inject_irq(vcpu); + if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending) + vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); if (!vcpu->arch.interrupt_window_open && (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) @@ -2237,9 +2441,6 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) static int handle_rmode_exception(struct kvm_vcpu *vcpu, int vec, u32 err_code) { - if (!vcpu->arch.rmode.active) - return 0; - /* * Instruction with address size override prefix opcode 0x67 * Cause the #SS fault with 0 error code in VM86 mode. @@ -2247,6 +2448,25 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) return 1; + /* + * Forward all other exceptions that are valid in real mode. + * FIXME: Breaks guest debugging in real mode, needs to be fixed with + * the required debugging infrastructure rework. + */ + switch (vec) { + case DE_VECTOR: + case DB_VECTOR: + case BP_VECTOR: + case OF_VECTOR: + case BR_VECTOR: + case UD_VECTOR: + case DF_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + case MF_VECTOR: + kvm_queue_exception(vcpu, vec); + return 1; + } return 0; } @@ -2288,7 +2508,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } error_code = 0; - rip = vmcs_readl(GUEST_RIP); + rip = kvm_rip_read(vcpu); if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { @@ -2298,7 +2518,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) cr2 = vmcs_readl(EXIT_QUALIFICATION); KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, (u32)((u64)cr2 >> 32), handler); - if (vect_info & VECTORING_INFO_VALID_MASK) + if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending) kvm_mmu_unprotect_page_virt(vcpu, cr2); return kvm_mmu_page_fault(vcpu, cr2, error_code); } @@ -2386,27 +2606,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ - KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg], - (u32)((u64)vcpu->arch.regs[reg] >> 32), handler); + KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, + (u32)kvm_register_read(vcpu, reg), + (u32)((u64)kvm_register_read(vcpu, reg) >> 32), + handler); switch (cr) { case 0: - vcpu_load_rsp_rip(vcpu); - kvm_set_cr0(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); return 1; case 3: - vcpu_load_rsp_rip(vcpu); - kvm_set_cr3(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); return 1; case 4: - vcpu_load_rsp_rip(vcpu); - kvm_set_cr4(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); return 1; case 8: - vcpu_load_rsp_rip(vcpu); - kvm_set_cr8(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); if (irqchip_in_kernel(vcpu->kvm)) return 1; @@ -2415,7 +2633,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) }; break; case 2: /* clts */ - vcpu_load_rsp_rip(vcpu); vmx_fpu_deactivate(vcpu); vcpu->arch.cr0 &= ~X86_CR0_TS; vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); @@ -2426,21 +2643,17 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) case 1: /*mov from cr*/ switch (cr) { case 3: - vcpu_load_rsp_rip(vcpu); - vcpu->arch.regs[reg] = vcpu->arch.cr3; - vcpu_put_rsp_rip(vcpu); + kvm_register_write(vcpu, reg, vcpu->arch.cr3); KVMTRACE_3D(CR_READ, vcpu, (u32)cr, - (u32)vcpu->arch.regs[reg], - (u32)((u64)vcpu->arch.regs[reg] >> 32), + (u32)kvm_register_read(vcpu, reg), + (u32)((u64)kvm_register_read(vcpu, reg) >> 32), handler); skip_emulated_instruction(vcpu); return 1; case 8: - vcpu_load_rsp_rip(vcpu); - vcpu->arch.regs[reg] = kvm_get_cr8(vcpu); - vcpu_put_rsp_rip(vcpu); + kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu)); KVMTRACE_2D(CR_READ, vcpu, (u32)cr, - (u32)vcpu->arch.regs[reg], handler); + (u32)kvm_register_read(vcpu, reg), handler); skip_emulated_instruction(vcpu); return 1; } @@ -2472,7 +2685,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); dr = exit_qualification & 7; reg = (exit_qualification >> 8) & 15; - vcpu_load_rsp_rip(vcpu); if (exit_qualification & 16) { /* mov from dr */ switch (dr) { @@ -2485,12 +2697,11 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) default: val = 0; } - vcpu->arch.regs[reg] = val; + kvm_register_write(vcpu, reg, val); KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); } else { /* mov to dr */ } - vcpu_put_rsp_rip(vcpu); skip_emulated_instruction(vcpu); return 1; } @@ -2583,6 +2794,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } +static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + + kvm_mmu_invlpg(vcpu, exit_qualification); + skip_emulated_instruction(vcpu); + return 1; +} + static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { skip_emulated_instruction(vcpu); @@ -2695,6 +2915,43 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } +static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int err; + + preempt_enable(); + local_irq_enable(); + + while (!guest_state_valid(vcpu)) { + err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); + + switch (err) { + case EMULATE_DONE: + break; + case EMULATE_DO_MMIO: + kvm_report_emulation_failure(vcpu, "mmio"); + /* TODO: Handle MMIO */ + return; + default: + kvm_report_emulation_failure(vcpu, "emulation failure"); + return; + } + + if (signal_pending(current)) + break; + if (need_resched()) + schedule(); + } + + local_irq_disable(); + preempt_disable(); + + /* Guest state should be valid now, no more emulation should be needed */ + vmx->emulation_required = 0; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -2714,6 +2971,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_MSR_WRITE] = handle_wrmsr, [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, [EXIT_REASON_HLT] = handle_halt, + [EXIT_REASON_INVLPG] = handle_invlpg, [EXIT_REASON_VMCALL] = handle_vmcall, [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, @@ -2735,8 +2993,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vectoring_info = vmx->idt_vectoring_info; - KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP), - (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit); + KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), + (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); /* Access CR3 don't cause VMExit in paging mode, so we need * to sync with guest real CR3. */ @@ -2829,88 +3087,92 @@ static void enable_intr_window(struct kvm_vcpu *vcpu) enable_irq_window(vcpu); } -static void vmx_intr_assist(struct kvm_vcpu *vcpu) +static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 idtv_info_field, intr_info_field, exit_intr_info_field; - int vector; + u32 exit_intr_info; + u32 idt_vectoring_info; + bool unblock_nmi; + u8 vector; + int type; + bool idtv_info_valid; + u32 error; - update_tpr_threshold(vcpu); - - intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); - exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO); - idtv_info_field = vmx->idt_vectoring_info; - if (intr_info_field & INTR_INFO_VALID_MASK) { - if (idtv_info_field & INTR_INFO_VALID_MASK) { - /* TODO: fault when IDT_Vectoring */ - if (printk_ratelimit()) - printk(KERN_ERR "Fault when IDT_Vectoring\n"); - } - enable_intr_window(vcpu); - return; + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + if (cpu_has_virtual_nmis()) { + unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + /* + * SDM 3: 25.7.1.2 + * Re-set bit "block by NMI" before VM entry if vmexit caused by + * a guest IRET fault. + */ + if (unblock_nmi && vector != DF_VECTOR) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); } - if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { - if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) - == INTR_TYPE_EXT_INTR - && vcpu->arch.rmode.active) { - u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK; - - vmx_inject_irq(vcpu, vect); - enable_intr_window(vcpu); - return; - } - - KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler); + idt_vectoring_info = vmx->idt_vectoring_info; + idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; + vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; + type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; + if (vmx->vcpu.arch.nmi_injected) { /* * SDM 3: 25.7.1.2 * Clear bit "block by NMI" before VM entry if a NMI delivery * faulted. */ - if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) - == INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis()) - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - ~GUEST_INTR_STATE_NMI); - - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field - & ~INTR_INFO_RESVD_BITS_MASK); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); - - if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK)) - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, - vmcs_read32(IDT_VECTORING_ERROR_CODE)); - enable_intr_window(vcpu); - return; + if (idtv_info_valid && type == INTR_TYPE_NMI_INTR) + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmx->vcpu.arch.nmi_injected = false; + } + kvm_clear_exception_queue(&vmx->vcpu); + if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) { + if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { + error = vmcs_read32(IDT_VECTORING_ERROR_CODE); + kvm_queue_exception_e(&vmx->vcpu, vector, error); + } else + kvm_queue_exception(&vmx->vcpu, vector); + vmx->idt_vectoring_info = 0; } + kvm_clear_interrupt_queue(&vmx->vcpu); + if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) { + kvm_queue_interrupt(&vmx->vcpu, vector); + vmx->idt_vectoring_info = 0; + } +} + +static void vmx_intr_assist(struct kvm_vcpu *vcpu) +{ + update_tpr_threshold(vcpu); + if (cpu_has_virtual_nmis()) { - /* - * SDM 3: 25.7.1.2 - * Re-set bit "block by NMI" before VM entry if vmexit caused by - * a guest IRET fault. - */ - if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) && - (exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8) - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) | - GUEST_INTR_STATE_NMI); - else if (vcpu->arch.nmi_pending) { - if (vmx_nmi_enabled(vcpu)) - vmx_inject_nmi(vcpu); + if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { + if (vmx_nmi_enabled(vcpu)) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = true; + } else { + enable_intr_window(vcpu); + return; + } + } + if (vcpu->arch.nmi_injected) { + vmx_inject_nmi(vcpu); enable_intr_window(vcpu); return; } - } - if (!kvm_cpu_has_interrupt(vcpu)) - return; - if (vmx_irq_enabled(vcpu)) { - vector = kvm_cpu_get_interrupt(vcpu); - vmx_inject_irq(vcpu, vector); - kvm_timer_intr_post(vcpu, vector); - } else - enable_irq_window(vcpu); + if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) { + if (vmx_irq_enabled(vcpu)) + kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); + else + enable_irq_window(vcpu); + } + if (vcpu->arch.interrupt.pending) { + vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); + kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr); + } } /* @@ -2922,9 +3184,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) static void fixup_rmode_irq(struct vcpu_vmx *vmx) { vmx->rmode.irq.pending = 0; - if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) + if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) return; - vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); + kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; @@ -2936,11 +3198,30 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx) | vmx->rmode.irq.vector; } +#ifdef CONFIG_X86_64 +#define R "r" +#define Q "q" +#else +#define R "e" +#define Q "l" +#endif + static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info; + /* Handle invalid guest state instead of entering VMX */ + if (vmx->emulation_required && emulate_invalid_guest_state) { + handle_invalid_guest_state(vcpu, kvm_run); + return; + } + + if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); + if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + /* * Loading guest fpu may have cleared host cr0.ts */ @@ -2948,26 +3229,25 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) asm( /* Store host registers */ -#ifdef CONFIG_X86_64 - "push %%rdx; push %%rbp;" - "push %%rcx \n\t" -#else - "push %%edx; push %%ebp;" - "push %%ecx \n\t" -#endif + "push %%"R"dx; push %%"R"bp;" + "push %%"R"cx \n\t" + "cmp %%"R"sp, %c[host_rsp](%0) \n\t" + "je 1f \n\t" + "mov %%"R"sp, %c[host_rsp](%0) \n\t" __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" + "1: \n\t" /* Check if vmlaunch of vmresume is needed */ "cmpl $0, %c[launched](%0) \n\t" /* Load guest registers. Don't clobber flags. */ + "mov %c[cr2](%0), %%"R"ax \n\t" + "mov %%"R"ax, %%cr2 \n\t" + "mov %c[rax](%0), %%"R"ax \n\t" + "mov %c[rbx](%0), %%"R"bx \n\t" + "mov %c[rdx](%0), %%"R"dx \n\t" + "mov %c[rsi](%0), %%"R"si \n\t" + "mov %c[rdi](%0), %%"R"di \n\t" + "mov %c[rbp](%0), %%"R"bp \n\t" #ifdef CONFIG_X86_64 - "mov %c[cr2](%0), %%rax \n\t" - "mov %%rax, %%cr2 \n\t" - "mov %c[rax](%0), %%rax \n\t" - "mov %c[rbx](%0), %%rbx \n\t" - "mov %c[rdx](%0), %%rdx \n\t" - "mov %c[rsi](%0), %%rsi \n\t" - "mov %c[rdi](%0), %%rdi \n\t" - "mov %c[rbp](%0), %%rbp \n\t" "mov %c[r8](%0), %%r8 \n\t" "mov %c[r9](%0), %%r9 \n\t" "mov %c[r10](%0), %%r10 \n\t" @@ -2976,18 +3256,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %c[r13](%0), %%r13 \n\t" "mov %c[r14](%0), %%r14 \n\t" "mov %c[r15](%0), %%r15 \n\t" - "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */ -#else - "mov %c[cr2](%0), %%eax \n\t" - "mov %%eax, %%cr2 \n\t" - "mov %c[rax](%0), %%eax \n\t" - "mov %c[rbx](%0), %%ebx \n\t" - "mov %c[rdx](%0), %%edx \n\t" - "mov %c[rsi](%0), %%esi \n\t" - "mov %c[rdi](%0), %%edi \n\t" - "mov %c[rbp](%0), %%ebp \n\t" - "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */ #endif + "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ + /* Enter guest mode */ "jne .Llaunched \n\t" __ex(ASM_VMX_VMLAUNCH) "\n\t" @@ -2995,15 +3266,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" ".Lkvm_vmx_return: " /* Save guest registers, load host registers, keep flags */ + "xchg %0, (%%"R"sp) \n\t" + "mov %%"R"ax, %c[rax](%0) \n\t" + "mov %%"R"bx, %c[rbx](%0) \n\t" + "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" + "mov %%"R"dx, %c[rdx](%0) \n\t" + "mov %%"R"si, %c[rsi](%0) \n\t" + "mov %%"R"di, %c[rdi](%0) \n\t" + "mov %%"R"bp, %c[rbp](%0) \n\t" #ifdef CONFIG_X86_64 - "xchg %0, (%%rsp) \n\t" - "mov %%rax, %c[rax](%0) \n\t" - "mov %%rbx, %c[rbx](%0) \n\t" - "pushq (%%rsp); popq %c[rcx](%0) \n\t" - "mov %%rdx, %c[rdx](%0) \n\t" - "mov %%rsi, %c[rsi](%0) \n\t" - "mov %%rdi, %c[rdi](%0) \n\t" - "mov %%rbp, %c[rbp](%0) \n\t" "mov %%r8, %c[r8](%0) \n\t" "mov %%r9, %c[r9](%0) \n\t" "mov %%r10, %c[r10](%0) \n\t" @@ -3012,28 +3283,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %%r13, %c[r13](%0) \n\t" "mov %%r14, %c[r14](%0) \n\t" "mov %%r15, %c[r15](%0) \n\t" - "mov %%cr2, %%rax \n\t" - "mov %%rax, %c[cr2](%0) \n\t" - - "pop %%rbp; pop %%rbp; pop %%rdx \n\t" -#else - "xchg %0, (%%esp) \n\t" - "mov %%eax, %c[rax](%0) \n\t" - "mov %%ebx, %c[rbx](%0) \n\t" - "pushl (%%esp); popl %c[rcx](%0) \n\t" - "mov %%edx, %c[rdx](%0) \n\t" - "mov %%esi, %c[rsi](%0) \n\t" - "mov %%edi, %c[rdi](%0) \n\t" - "mov %%ebp, %c[rbp](%0) \n\t" - "mov %%cr2, %%eax \n\t" - "mov %%eax, %c[cr2](%0) \n\t" - - "pop %%ebp; pop %%ebp; pop %%edx \n\t" #endif + "mov %%cr2, %%"R"ax \n\t" + "mov %%"R"ax, %c[cr2](%0) \n\t" + + "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" "setbe %c[fail](%0) \n\t" : : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), + [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), @@ -3053,14 +3312,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) #endif [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) : "cc", "memory" + , R"bx", R"di", R"si" #ifdef CONFIG_X86_64 - , "rbx", "rdi", "rsi" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" -#else - , "ebx", "edi", "rsi" #endif ); + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); + vcpu->arch.regs_dirty = 0; + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (vmx->rmode.irq.pending) fixup_rmode_irq(vmx); @@ -3080,8 +3340,13 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) KVMTRACE_0D(NMI, vcpu, handler); asm("int $2"); } + + vmx_complete_interrupts(vmx); } +#undef R +#undef Q + static void vmx_free_vmcs(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3224,8 +3489,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, - .cache_regs = vcpu_load_rsp_rip, - .decache_regs = vcpu_put_rsp_rip, + .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index 17e2599..3e010d2 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -331,9 +331,6 @@ enum vmcs_field { #define AR_RESERVD_MASK 0xfffe0f00 -#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 -#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 - #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 19afbb6..4f0677d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4,10 +4,14 @@ * derived from drivers/kvm/kvm_main.c * * Copyright (C) 2006 Qumranet, Inc. + * Copyright (C) 2008 Qumranet, Inc. + * Copyright IBM Corporation, 2008 * * Authors: * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> + * Amit Shah <amit.shah@qumranet.com> + * Ben-Ami Yassour <benami@il.ibm.com> * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. @@ -19,14 +23,18 @@ #include "mmu.h" #include "i8254.h" #include "tss.h" +#include "kvm_cache_regs.h" +#include "x86.h" #include <linux/clocksource.h> +#include <linux/interrupt.h> #include <linux/kvm.h> #include <linux/fs.h> #include <linux/vmalloc.h> #include <linux/module.h> #include <linux/mman.h> #include <linux/highmem.h> +#include <linux/intel-iommu.h> #include <asm/uaccess.h> #include <asm/msr.h> @@ -61,6 +69,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); struct kvm_x86_ops *kvm_x86_ops; +EXPORT_SYMBOL_GPL(kvm_x86_ops); struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, @@ -83,6 +92,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "fpu_reload", VCPU_STAT(fpu_reload) }, { "insn_emulation", VCPU_STAT(insn_emulation) }, { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, + { "irq_injections", VCPU_STAT(irq_injections) }, { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, { "mmu_pte_write", VM_STAT(mmu_pte_write) }, { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, @@ -90,12 +100,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_flooded", VM_STAT(mmu_flooded) }, { "mmu_recycled", VM_STAT(mmu_recycled) }, { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, + { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages) }, { NULL } }; - unsigned long segment_base(u16 selector) { struct descriptor_table gdt; @@ -352,6 +362,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { + kvm_mmu_sync_roots(vcpu); kvm_mmu_flush_tlb(vcpu); return; } @@ -662,6 +673,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", __func__, data); break; + case MSR_IA32_DEBUGCTLMSR: + if (!data) { + /* We support the non-activated case already */ + break; + } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { + /* Values other than LBR and BTF are vendor-specific, + thus reserved and should throw a #GP */ + return 1; + } + pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", + __func__, data); + break; case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_WRITE: break; @@ -692,10 +715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* ...but clean it before doing the actual write */ vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); - down_read(¤t->mm->mmap_sem); vcpu->arch.time_page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); if (is_error_page(vcpu->arch.time_page)) { kvm_release_page_clean(vcpu->arch.time_page); @@ -752,8 +773,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MC0_MISC+8: case MSR_IA32_MC0_MISC+12: case MSR_IA32_MC0_MISC+16: + case MSR_IA32_MC0_MISC+20: case MSR_IA32_UCODE_REV: case MSR_IA32_EBL_CR_POWERON: + case MSR_IA32_DEBUGCTLMSR: + case MSR_IA32_LASTBRANCHFROMIP: + case MSR_IA32_LASTBRANCHTOIP: + case MSR_IA32_LASTINTFROMIP: + case MSR_IA32_LASTINTTOIP: data = 0; break; case MSR_MTRRcap: @@ -901,6 +928,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PV_MMU: r = !tdp_enabled; break; + case KVM_CAP_IOMMU: + r = intel_iommu_found(); + break; default: r = 0; break; @@ -1303,28 +1333,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; int r; + struct kvm_lapic_state *lapic = NULL; switch (ioctl) { case KVM_GET_LAPIC: { - struct kvm_lapic_state lapic; + lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); - memset(&lapic, 0, sizeof lapic); - r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); + r = -ENOMEM; + if (!lapic) + goto out; + r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); if (r) goto out; r = -EFAULT; - if (copy_to_user(argp, &lapic, sizeof lapic)) + if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) goto out; r = 0; break; } case KVM_SET_LAPIC: { - struct kvm_lapic_state lapic; - + lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + r = -ENOMEM; + if (!lapic) + goto out; r = -EFAULT; - if (copy_from_user(&lapic, argp, sizeof lapic)) + if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) goto out; - r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; + r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); if (r) goto out; r = 0; @@ -1422,6 +1457,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; } out: + if (lapic) + kfree(lapic); return r; } @@ -1630,6 +1667,15 @@ long kvm_arch_vm_ioctl(struct file *filp, struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; int r = -EINVAL; + /* + * This union makes it completely explicit to gcc-3.x + * that these two variables' stack usage should be + * combined, not added together. + */ + union { + struct kvm_pit_state ps; + struct kvm_memory_alias alias; + } u; switch (ioctl) { case KVM_SET_TSS_ADDR: @@ -1661,17 +1707,14 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_GET_NR_MMU_PAGES: r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; - case KVM_SET_MEMORY_ALIAS: { - struct kvm_memory_alias alias; - + case KVM_SET_MEMORY_ALIAS: r = -EFAULT; - if (copy_from_user(&alias, argp, sizeof alias)) + if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) goto out; - r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); + r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); if (r) goto out; break; - } case KVM_CREATE_IRQCHIP: r = -ENOMEM; kvm->arch.vpic = kvm_create_pic(kvm); @@ -1699,13 +1742,7 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; if (irqchip_in_kernel(kvm)) { mutex_lock(&kvm->lock); - if (irq_event.irq < 16) - kvm_pic_set_irq(pic_irqchip(kvm), - irq_event.irq, - irq_event.level); - kvm_ioapic_set_irq(kvm->arch.vioapic, - irq_event.irq, - irq_event.level); + kvm_set_irq(kvm, irq_event.irq, irq_event.level); mutex_unlock(&kvm->lock); r = 0; } @@ -1713,65 +1750,77 @@ long kvm_arch_vm_ioctl(struct file *filp, } case KVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip chip; + struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); - r = -EFAULT; - if (copy_from_user(&chip, argp, sizeof chip)) + r = -ENOMEM; + if (!chip) goto out; + r = -EFAULT; + if (copy_from_user(chip, argp, sizeof *chip)) + goto get_irqchip_out; r = -ENXIO; if (!irqchip_in_kernel(kvm)) - goto out; - r = kvm_vm_ioctl_get_irqchip(kvm, &chip); + goto get_irqchip_out; + r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) - goto out; + goto get_irqchip_out; r = -EFAULT; - if (copy_to_user(argp, &chip, sizeof chip)) - goto out; + if (copy_to_user(argp, chip, sizeof *chip)) + goto get_irqchip_out; r = 0; + get_irqchip_out: + kfree(chip); + if (r) + goto out; break; } case KVM_SET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip chip; + struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); - r = -EFAULT; - if (copy_from_user(&chip, argp, sizeof chip)) + r = -ENOMEM; + if (!chip) goto out; + r = -EFAULT; + if (copy_from_user(chip, argp, sizeof *chip)) + goto set_irqchip_out; r = -ENXIO; if (!irqchip_in_kernel(kvm)) - goto out; - r = kvm_vm_ioctl_set_irqchip(kvm, &chip); + goto set_irqchip_out; + r = kvm_vm_ioctl_set_irqchip(kvm, chip); if (r) - goto out; + goto set_irqchip_out; r = 0; + set_irqchip_out: + kfree(chip); + if (r) + goto out; break; } case KVM_GET_PIT: { - struct kvm_pit_state ps; r = -EFAULT; - if (copy_from_user(&ps, argp, sizeof ps)) + if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) goto out; r = -ENXIO; if (!kvm->arch.vpit) goto out; - r = kvm_vm_ioctl_get_pit(kvm, &ps); + r = kvm_vm_ioctl_get_pit(kvm, &u.ps); if (r) goto out; r = -EFAULT; - if (copy_to_user(argp, &ps, sizeof ps)) + if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) goto out; r = 0; break; } case KVM_SET_PIT: { - struct kvm_pit_state ps; r = -EFAULT; - if (copy_from_user(&ps, argp, sizeof ps)) + if (copy_from_user(&u.ps, argp, sizeof u.ps)) goto out; r = -ENXIO; if (!kvm->arch.vpit) goto out; - r = kvm_vm_ioctl_set_pit(kvm, &ps); + r = kvm_vm_ioctl_set_pit(kvm, &u.ps); if (r) goto out; r = 0; @@ -2018,9 +2067,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, val = *(u64 *)new; - down_read(¤t->mm->mmap_sem); page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); kaddr = kmap_atomic(page, KM_USER0); set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); @@ -2040,6 +2087,7 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) { + kvm_mmu_invlpg(vcpu, address); return X86EMUL_CONTINUE; } @@ -2080,7 +2128,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) { u8 opcodes[4]; - unsigned long rip = vcpu->arch.rip; + unsigned long rip = kvm_rip_read(vcpu); unsigned long rip_linear; if (!printk_ratelimit()) @@ -2102,6 +2150,14 @@ static struct x86_emulate_ops emulate_ops = { .cmpxchg_emulated = emulator_cmpxchg_emulated, }; +static void cache_all_regs(struct kvm_vcpu *vcpu) +{ + kvm_register_read(vcpu, VCPU_REGS_RAX); + kvm_register_read(vcpu, VCPU_REGS_RSP); + kvm_register_read(vcpu, VCPU_REGS_RIP); + vcpu->arch.regs_dirty = ~0; +} + int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, unsigned long cr2, @@ -2111,8 +2167,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu, int r; struct decode_cache *c; + kvm_clear_exception_queue(vcpu); vcpu->arch.mmio_fault_cr2 = cr2; - kvm_x86_ops->cache_regs(vcpu); + /* + * TODO: fix x86_emulate.c to use guest_read/write_register + * instead of direct ->regs accesses, can save hundred cycles + * on Intel for instructions that don't read/change RSP, for + * for example. + */ + cache_all_regs(vcpu); vcpu->mmio_is_write = 0; vcpu->arch.pio.string = 0; @@ -2172,7 +2235,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DO_MMIO; } - kvm_x86_ops->decache_regs(vcpu); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); if (vcpu->mmio_is_write) { @@ -2225,20 +2287,19 @@ int complete_pio(struct kvm_vcpu *vcpu) struct kvm_pio_request *io = &vcpu->arch.pio; long delta; int r; - - kvm_x86_ops->cache_regs(vcpu); + unsigned long val; if (!io->string) { - if (io->in) - memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, - io->size); + if (io->in) { + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(&val, vcpu->arch.pio_data, io->size); + kvm_register_write(vcpu, VCPU_REGS_RAX, val); + } } else { if (io->in) { r = pio_copy_data(vcpu); - if (r) { - kvm_x86_ops->cache_regs(vcpu); + if (r) return r; - } } delta = 1; @@ -2248,19 +2309,24 @@ int complete_pio(struct kvm_vcpu *vcpu) * The size of the register should really depend on * current address size. */ - vcpu->arch.regs[VCPU_REGS_RCX] -= delta; + val = kvm_register_read(vcpu, VCPU_REGS_RCX); + val -= delta; + kvm_register_write(vcpu, VCPU_REGS_RCX, val); } if (io->down) delta = -delta; delta *= io->size; - if (io->in) - vcpu->arch.regs[VCPU_REGS_RDI] += delta; - else - vcpu->arch.regs[VCPU_REGS_RSI] += delta; + if (io->in) { + val = kvm_register_read(vcpu, VCPU_REGS_RDI); + val += delta; + kvm_register_write(vcpu, VCPU_REGS_RDI, val); + } else { + val = kvm_register_read(vcpu, VCPU_REGS_RSI); + val += delta; + kvm_register_write(vcpu, VCPU_REGS_RSI, val); + } } - kvm_x86_ops->decache_regs(vcpu); - io->count -= io->cur_count; io->cur_count = 0; @@ -2313,6 +2379,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int size, unsigned port) { struct kvm_io_device *pio_dev; + unsigned long val; vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; @@ -2333,8 +2400,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, handler); - kvm_x86_ops->cache_regs(vcpu); - memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(vcpu->arch.pio_data, &val, 4); kvm_x86_ops->skip_emulated_instruction(vcpu); @@ -2492,11 +2559,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) KVMTRACE_0D(HLT, vcpu, handler); if (irqchip_in_kernel(vcpu->kvm)) { vcpu->arch.mp_state = KVM_MP_STATE_HALTED; - up_read(&vcpu->kvm->slots_lock); - kvm_vcpu_block(vcpu); - down_read(&vcpu->kvm->slots_lock); - if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) - return -EINTR; return 1; } else { vcpu->run->exit_reason = KVM_EXIT_HLT; @@ -2519,13 +2581,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) unsigned long nr, a0, a1, a2, a3, ret; int r = 1; - kvm_x86_ops->cache_regs(vcpu); - - nr = vcpu->arch.regs[VCPU_REGS_RAX]; - a0 = vcpu->arch.regs[VCPU_REGS_RBX]; - a1 = vcpu->arch.regs[VCPU_REGS_RCX]; - a2 = vcpu->arch.regs[VCPU_REGS_RDX]; - a3 = vcpu->arch.regs[VCPU_REGS_RSI]; + nr = kvm_register_read(vcpu, VCPU_REGS_RAX); + a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); + a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); + a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); + a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); @@ -2548,8 +2608,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) ret = -KVM_ENOSYS; break; } - vcpu->arch.regs[VCPU_REGS_RAX] = ret; - kvm_x86_ops->decache_regs(vcpu); + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); ++vcpu->stat.hypercalls; return r; } @@ -2559,6 +2618,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) { char instruction[3]; int ret = 0; + unsigned long rip = kvm_rip_read(vcpu); /* @@ -2568,9 +2628,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) */ kvm_mmu_zap_all(vcpu->kvm); - kvm_x86_ops->cache_regs(vcpu); kvm_x86_ops->patch_hypercall(vcpu, instruction); - if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) + if (emulator_write_emulated(rip, instruction, 3, vcpu) != X86EMUL_CONTINUE) ret = -EFAULT; @@ -2700,13 +2759,12 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) u32 function, index; struct kvm_cpuid_entry2 *e, *best; - kvm_x86_ops->cache_regs(vcpu); - function = vcpu->arch.regs[VCPU_REGS_RAX]; - index = vcpu->arch.regs[VCPU_REGS_RCX]; - vcpu->arch.regs[VCPU_REGS_RAX] = 0; - vcpu->arch.regs[VCPU_REGS_RBX] = 0; - vcpu->arch.regs[VCPU_REGS_RCX] = 0; - vcpu->arch.regs[VCPU_REGS_RDX] = 0; + function = kvm_register_read(vcpu, VCPU_REGS_RAX); + index = kvm_register_read(vcpu, VCPU_REGS_RCX); + kvm_register_write(vcpu, VCPU_REGS_RAX, 0); + kvm_register_write(vcpu, VCPU_REGS_RBX, 0); + kvm_register_write(vcpu, VCPU_REGS_RCX, 0); + kvm_register_write(vcpu, VCPU_REGS_RDX, 0); best = NULL; for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { e = &vcpu->arch.cpuid_entries[i]; @@ -2724,18 +2782,17 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) best = e; } if (best) { - vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; - vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; - vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; - vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; + kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); + kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); + kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); } - kvm_x86_ops->decache_regs(vcpu); kvm_x86_ops->skip_emulated_instruction(vcpu); KVMTRACE_5D(CPUID, vcpu, function, - (u32)vcpu->arch.regs[VCPU_REGS_RAX], - (u32)vcpu->arch.regs[VCPU_REGS_RBX], - (u32)vcpu->arch.regs[VCPU_REGS_RCX], - (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler); + (u32)kvm_register_read(vcpu, VCPU_REGS_RAX), + (u32)kvm_register_read(vcpu, VCPU_REGS_RBX), + (u32)kvm_register_read(vcpu, VCPU_REGS_RCX), + (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); @@ -2776,9 +2833,7 @@ static void vapic_enter(struct kvm_vcpu *vcpu) if (!apic || !apic->vapic_addr) return; - down_read(¤t->mm->mmap_sem); page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); vcpu->arch.apic->vapic_page = page; } @@ -2796,28 +2851,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu) up_read(&vcpu->kvm->slots_lock); } -static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; - if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { - pr_debug("vcpu %d received sipi with vector # %x\n", - vcpu->vcpu_id, vcpu->arch.sipi_vector); - kvm_lapic_reset(vcpu); - r = kvm_x86_ops->vcpu_reset(vcpu); - if (r) - return r; - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - } - - down_read(&vcpu->kvm->slots_lock); - vapic_enter(vcpu); - -preempted: - if (vcpu->guest_debug.enabled) - kvm_x86_ops->guest_debug_pre(vcpu); - -again: if (vcpu->requests) if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) kvm_mmu_unload(vcpu); @@ -2829,6 +2866,8 @@ again: if (vcpu->requests) { if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) __kvm_migrate_timers(vcpu); + if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) + kvm_mmu_sync_roots(vcpu); if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) kvm_x86_ops->tlb_flush(vcpu); if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, @@ -2854,21 +2893,15 @@ again: local_irq_disable(); - if (vcpu->requests || need_resched()) { + if (vcpu->requests || need_resched() || signal_pending(current)) { local_irq_enable(); preempt_enable(); r = 1; goto out; } - if (signal_pending(current)) { - local_irq_enable(); - preempt_enable(); - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; - goto out; - } + if (vcpu->guest_debug.enabled) + kvm_x86_ops->guest_debug_pre(vcpu); vcpu->guest_mode = 1; /* @@ -2917,8 +2950,8 @@ again: * Profile KVM exit RIPs: */ if (unlikely(prof_on == KVM_PROFILING)) { - kvm_x86_ops->cache_regs(vcpu); - profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); + unsigned long rip = kvm_rip_read(vcpu); + profile_hit(KVM_PROFILING, (void *)rip); } if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) @@ -2927,26 +2960,63 @@ again: kvm_lapic_sync_from_vapic(vcpu); r = kvm_x86_ops->handle_exit(kvm_run, vcpu); +out: + return r; +} - if (r > 0) { - if (dm_request_for_irq_injection(vcpu, kvm_run)) { - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.request_irq_exits; - goto out; - } - if (!need_resched()) - goto again; +static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + int r; + + if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { + pr_debug("vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, vcpu->arch.sipi_vector); + kvm_lapic_reset(vcpu); + r = kvm_x86_ops->vcpu_reset(vcpu); + if (r) + return r; + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } -out: - up_read(&vcpu->kvm->slots_lock); - if (r > 0) { - kvm_resched(vcpu); - down_read(&vcpu->kvm->slots_lock); - goto preempted; + down_read(&vcpu->kvm->slots_lock); + vapic_enter(vcpu); + + r = 1; + while (r > 0) { + if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) + r = vcpu_enter_guest(vcpu, kvm_run); + else { + up_read(&vcpu->kvm->slots_lock); + kvm_vcpu_block(vcpu); + down_read(&vcpu->kvm->slots_lock); + if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) + vcpu->arch.mp_state = + KVM_MP_STATE_RUNNABLE; + if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) + r = -EINTR; + } + + if (r > 0) { + if (dm_request_for_irq_injection(vcpu, kvm_run)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.request_irq_exits; + } + if (signal_pending(current)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + } + if (need_resched()) { + up_read(&vcpu->kvm->slots_lock); + kvm_resched(vcpu); + down_read(&vcpu->kvm->slots_lock); + } + } } + up_read(&vcpu->kvm->slots_lock); post_kvm_run_save(vcpu, kvm_run); vapic_exit(vcpu); @@ -2966,6 +3036,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { kvm_vcpu_block(vcpu); + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); r = -EAGAIN; goto out; } @@ -2999,11 +3070,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } } #endif - if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { - kvm_x86_ops->cache_regs(vcpu); - vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; - kvm_x86_ops->decache_regs(vcpu); - } + if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) + kvm_register_write(vcpu, VCPU_REGS_RAX, + kvm_run->hypercall.ret); r = __vcpu_run(vcpu, kvm_run); @@ -3019,28 +3088,26 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { vcpu_load(vcpu); - kvm_x86_ops->cache_regs(vcpu); - - regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; - regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; - regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; - regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; - regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; - regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; - regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP]; + regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); + regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); + regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); + regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); + regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); + regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); + regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); + regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); #ifdef CONFIG_X86_64 - regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; - regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; - regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; - regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; - regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; - regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; - regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; - regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; + regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); + regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); + regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); + regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); + regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); + regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); + regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); + regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); #endif - regs->rip = vcpu->arch.rip; + regs->rip = kvm_rip_read(vcpu); regs->rflags = kvm_x86_ops->get_rflags(vcpu); /* @@ -3058,29 +3125,29 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { vcpu_load(vcpu); - vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; - vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; - vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; - vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; - vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; - vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; - vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; - vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; + kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); + kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); + kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); + kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); + kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); + kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); + kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); + kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); #ifdef CONFIG_X86_64 - vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; - vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; - vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; - vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; - vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; - vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; - vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; - vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; + kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); + kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); + kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); + kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); + kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); + kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); + kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); + kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); + #endif - vcpu->arch.rip = regs->rip; + kvm_rip_write(vcpu, regs->rip); kvm_x86_ops->set_rflags(vcpu, regs->rflags); - kvm_x86_ops->decache_regs(vcpu); vcpu->arch.exception.pending = false; @@ -3294,11 +3361,33 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, return 0; } +static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) +{ + struct kvm_segment segvar = { + .base = selector << 4, + .limit = 0xffff, + .selector = selector, + .type = 3, + .present = 1, + .dpl = 3, + .db = 0, + .s = 1, + .l = 0, + .g = 0, + .avl = 0, + .unusable = 0, + }; + kvm_x86_ops->set_segment(vcpu, &segvar, seg); + return 0; +} + int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int type_bits, int seg) { struct kvm_segment kvm_seg; + if (!(vcpu->arch.cr0 & X86_CR0_PE)) + return kvm_load_realmode_segment(vcpu, selector, seg); if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) return 1; kvm_seg.type |= type_bits; @@ -3316,17 +3405,16 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss) { tss->cr3 = vcpu->arch.cr3; - tss->eip = vcpu->arch.rip; + tss->eip = kvm_rip_read(vcpu); tss->eflags = kvm_x86_ops->get_rflags(vcpu); - tss->eax = vcpu->arch.regs[VCPU_REGS_RAX]; - tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - tss->edx = vcpu->arch.regs[VCPU_REGS_RDX]; - tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX]; - tss->esp = vcpu->arch.regs[VCPU_REGS_RSP]; - tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP]; - tss->esi = vcpu->arch.regs[VCPU_REGS_RSI]; - tss->edi = vcpu->arch.regs[VCPU_REGS_RDI]; - + tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); + tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); + tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); + tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); + tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); + tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); + tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); @@ -3342,17 +3430,17 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, { kvm_set_cr3(vcpu, tss->cr3); - vcpu->arch.rip = tss->eip; + kvm_rip_write(vcpu, tss->eip); kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); - vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax; - vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx; - vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx; - vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx; - vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp; - vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp; - vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; - vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; + kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); + kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); + kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); + kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); + kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); + kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); + kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) return 1; @@ -3380,16 +3468,16 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, static void save_state_to_tss16(struct kvm_vcpu *vcpu, struct tss_segment_16 *tss) { - tss->ip = vcpu->arch.rip; + tss->ip = kvm_rip_read(vcpu); tss->flag = kvm_x86_ops->get_rflags(vcpu); - tss->ax = vcpu->arch.regs[VCPU_REGS_RAX]; - tss->cx = vcpu->arch.regs[VCPU_REGS_RCX]; - tss->dx = vcpu->arch.regs[VCPU_REGS_RDX]; - tss->bx = vcpu->arch.regs[VCPU_REGS_RBX]; - tss->sp = vcpu->arch.regs[VCPU_REGS_RSP]; - tss->bp = vcpu->arch.regs[VCPU_REGS_RBP]; - tss->si = vcpu->arch.regs[VCPU_REGS_RSI]; - tss->di = vcpu->arch.regs[VCPU_REGS_RDI]; + tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); + tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); + tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); + tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); + tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); + tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); + tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); + tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); @@ -3402,16 +3490,16 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu, static int load_state_from_tss16(struct kvm_vcpu *vcpu, struct tss_segment_16 *tss) { - vcpu->arch.rip = tss->ip; + kvm_rip_write(vcpu, tss->ip); kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); - vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax; - vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx; - vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx; - vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx; - vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp; - vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp; - vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; - vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; + kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); + kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); + kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); + kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); + kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); + kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); + kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); + kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) return 1; @@ -3534,7 +3622,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) } kvm_x86_ops->skip_emulated_instruction(vcpu); - kvm_x86_ops->cache_regs(vcpu); if (nseg_desc.type & 8) ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, @@ -3559,7 +3646,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) tr_seg.type = 11; kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); out: - kvm_x86_ops->decache_regs(vcpu); return ret; } EXPORT_SYMBOL_GPL(kvm_task_switch); @@ -3622,6 +3708,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, pr_debug("Set back pending irq %d\n", pending_vec); } + kvm_pic_clear_isr_ack(vcpu->kvm); } kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); @@ -3634,6 +3721,12 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + /* Older userspace won't unhalt the vcpu on reset. */ + if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 && + sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && + !(vcpu->arch.cr0 & X86_CR0_PE)) + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + vcpu_put(vcpu); return 0; @@ -3918,6 +4011,7 @@ struct kvm *kvm_arch_create_vm(void) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); return kvm; } @@ -3950,6 +4044,8 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { + kvm_iommu_unmap_guest(kvm); + kvm_free_all_assigned_devices(kvm); kvm_free_pit(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); @@ -3981,7 +4077,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, userspace_addr = do_mmap(NULL, 0, npages * PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, + MAP_PRIVATE | MAP_ANONYMOUS, 0); up_write(¤t->mm->mmap_sem); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h new file mode 100644 index 0000000..6a4be78 --- /dev/null +++ b/arch/x86/kvm/x86.h @@ -0,0 +1,22 @@ +#ifndef ARCH_X86_KVM_X86_H +#define ARCH_X86_KVM_X86_H + +#include <linux/kvm_host.h> + +static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) +{ + vcpu->arch.exception.pending = false; +} + +static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector) +{ + vcpu->arch.interrupt.pending = true; + vcpu->arch.interrupt.nr = vector; +} + +static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu) +{ + vcpu->arch.interrupt.pending = false; +} + +#endif diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index f2f9046..ea05117 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -26,6 +26,7 @@ #define DPRINTF(_f, _a ...) printf(_f , ## _a) #else #include <linux/kvm_host.h> +#include "kvm_cache_regs.h" #define DPRINTF(x...) do {} while (0) #endif #include <linux/module.h> @@ -46,25 +47,26 @@ #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ #define DstReg (2<<1) /* Register operand. */ #define DstMem (3<<1) /* Memory operand. */ -#define DstMask (3<<1) +#define DstAcc (4<<1) /* Destination Accumulator */ +#define DstMask (7<<1) /* Source operand type. */ -#define SrcNone (0<<3) /* No source operand. */ -#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ -#define SrcReg (1<<3) /* Register operand. */ -#define SrcMem (2<<3) /* Memory operand. */ -#define SrcMem16 (3<<3) /* Memory operand (16-bit). */ -#define SrcMem32 (4<<3) /* Memory operand (32-bit). */ -#define SrcImm (5<<3) /* Immediate operand. */ -#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ -#define SrcMask (7<<3) +#define SrcNone (0<<4) /* No source operand. */ +#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */ +#define SrcReg (1<<4) /* Register operand. */ +#define SrcMem (2<<4) /* Memory operand. */ +#define SrcMem16 (3<<4) /* Memory operand (16-bit). */ +#define SrcMem32 (4<<4) /* Memory operand (32-bit). */ +#define SrcImm (5<<4) /* Immediate operand. */ +#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ +#define SrcMask (7<<4) /* Generic ModRM decode. */ -#define ModRM (1<<6) +#define ModRM (1<<7) /* Destination is only written; never read. */ -#define Mov (1<<7) -#define BitOp (1<<8) -#define MemAbs (1<<9) /* Memory operand is absolute displacement */ -#define String (1<<10) /* String instruction (rep capable) */ -#define Stack (1<<11) /* Stack instruction (push/pop) */ +#define Mov (1<<8) +#define BitOp (1<<9) +#define MemAbs (1<<10) /* Memory operand is absolute displacement */ +#define String (1<<12) /* String instruction (rep capable) */ +#define Stack (1<<13) /* Stack instruction (push/pop) */ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */ @@ -94,7 +96,7 @@ static u16 opcode_table[256] = { /* 0x20 - 0x27 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - SrcImmByte, SrcImm, 0, 0, + DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x28 - 0x2F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, @@ -106,7 +108,8 @@ static u16 opcode_table[256] = { /* 0x38 - 0x3F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, + 0, 0, /* 0x40 - 0x47 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, /* 0x48 - 0x4F */ @@ -153,9 +156,16 @@ static u16 opcode_table[256] = { 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, ByteOp | ImplicitOps | String, ImplicitOps | String, - /* 0xB0 - 0xBF */ - 0, 0, 0, 0, 0, 0, 0, 0, - DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0, + /* 0xB0 - 0xB7 */ + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + /* 0xB8 - 0xBF */ + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, /* 0xC0 - 0xC7 */ ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 0, ImplicitOps | Stack, 0, 0, @@ -169,17 +179,20 @@ static u16 opcode_table[256] = { /* 0xD8 - 0xDF */ 0, 0, 0, 0, 0, 0, 0, 0, /* 0xE0 - 0xE7 */ - 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xE8 - 0xEF */ ImplicitOps | Stack, SrcImm | ImplicitOps, ImplicitOps, SrcImmByte | ImplicitOps, - 0, 0, 0, 0, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, /* 0xF8 - 0xFF */ ImplicitOps, 0, ImplicitOps, ImplicitOps, - 0, 0, Group | Group4, Group | Group5, + ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, }; static u16 twobyte_table[256] = { @@ -268,15 +281,16 @@ static u16 group_table[] = { ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group3*8] = - DstMem | SrcImm | ModRM | SrcImm, 0, - DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + DstMem | SrcImm | ModRM, 0, + DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group4*8] = ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 0, 0, 0, 0, 0, 0, [Group5*8] = - DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, - SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0, + DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + SrcMem | ModRM | Stack, 0, + SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, [Group7*8] = 0, 0, ModRM | SrcMem, ModRM | SrcMem, SrcNone | ModRM | DstMem | Mov, 0, @@ -839,7 +853,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) /* Shadow copy of register state. Committed on successful emulation. */ memset(c, 0, sizeof(struct decode_cache)); - c->eip = ctxt->vcpu->arch.rip; + c->eip = kvm_rip_read(ctxt->vcpu); ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); @@ -1048,6 +1062,23 @@ done_prefixes: } c->dst.type = OP_MEM; break; + case DstAcc: + c->dst.type = OP_REG; + c->dst.bytes = c->op_bytes; + c->dst.ptr = &c->regs[VCPU_REGS_RAX]; + switch (c->op_bytes) { + case 1: + c->dst.val = *(u8 *)c->dst.ptr; + break; + case 2: + c->dst.val = *(u16 *)c->dst.ptr; + break; + case 4: + c->dst.val = *(u32 *)c->dst.ptr; + break; + } + c->dst.orig_val = c->dst.val; + break; } if (c->rip_relative) @@ -1151,6 +1182,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, case 1: /* dec */ emulate_1op("dec", c->dst, ctxt->eflags); break; + case 2: /* call near abs */ { + long int old_eip; + old_eip = c->eip; + c->eip = c->src.val; + c->src.val = old_eip; + emulate_push(ctxt); + break; + } case 4: /* jmp abs */ c->eip = c->src.val; break; @@ -1251,6 +1290,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) u64 msr_data; unsigned long saved_eip = 0; struct decode_cache *c = &ctxt->decode; + unsigned int port; + int io_dir_in; int rc = 0; /* Shadow copy of register state. Committed on successful emulation. @@ -1267,7 +1308,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) if (c->rep_prefix && (c->d & String)) { /* All REP prefixes have the same first termination condition */ if (c->regs[VCPU_REGS_RCX] == 0) { - ctxt->vcpu->arch.rip = c->eip; + kvm_rip_write(ctxt->vcpu, c->eip); goto done; } /* The second termination condition only applies for REPE @@ -1281,17 +1322,17 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) (c->b == 0xae) || (c->b == 0xaf)) { if ((c->rep_prefix == REPE_PREFIX) && ((ctxt->eflags & EFLG_ZF) == 0)) { - ctxt->vcpu->arch.rip = c->eip; + kvm_rip_write(ctxt->vcpu, c->eip); goto done; } if ((c->rep_prefix == REPNE_PREFIX) && ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { - ctxt->vcpu->arch.rip = c->eip; + kvm_rip_write(ctxt->vcpu, c->eip); goto done; } } c->regs[VCPU_REGS_RCX]--; - c->eip = ctxt->vcpu->arch.rip; + c->eip = kvm_rip_read(ctxt->vcpu); } if (c->src.type == OP_MEM) { @@ -1351,27 +1392,10 @@ special_insn: sbb: /* sbb */ emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); break; - case 0x20 ... 0x23: + case 0x20 ... 0x25: and: /* and */ emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); break; - case 0x24: /* and al imm8 */ - c->dst.type = OP_REG; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - c->dst.val = *(u8 *)c->dst.ptr; - c->dst.bytes = 1; - c->dst.orig_val = c->dst.val; - goto and; - case 0x25: /* and ax imm16, or eax imm32 */ - c->dst.type = OP_REG; - c->dst.bytes = c->op_bytes; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - if (c->op_bytes == 2) - c->dst.val = *(u16 *)c->dst.ptr; - else - c->dst.val = *(u32 *)c->dst.ptr; - c->dst.orig_val = c->dst.val; - goto and; case 0x28 ... 0x2d: sub: /* sub */ emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); @@ -1659,7 +1683,7 @@ special_insn: case 0xae ... 0xaf: /* scas */ DPRINTF("Urk! I don't handle SCAS.\n"); goto cannot_emulate; - case 0xb8: /* mov r, imm */ + case 0xb0 ... 0xbf: /* mov r, imm */ goto mov; case 0xc0 ... 0xc1: emulate_grp2(ctxt); @@ -1679,6 +1703,16 @@ special_insn: c->src.val = c->regs[VCPU_REGS_RCX]; emulate_grp2(ctxt); break; + case 0xe4: /* inb */ + case 0xe5: /* in */ + port = insn_fetch(u8, 1, c->eip); + io_dir_in = 1; + goto do_io; + case 0xe6: /* outb */ + case 0xe7: /* out */ + port = insn_fetch(u8, 1, c->eip); + io_dir_in = 0; + goto do_io; case 0xe8: /* call (near) */ { long int rel; switch (c->op_bytes) { @@ -1729,6 +1763,22 @@ special_insn: jmp_rel(c, c->src.val); c->dst.type = OP_NONE; /* Disable writeback. */ break; + case 0xec: /* in al,dx */ + case 0xed: /* in (e/r)ax,dx */ + port = c->regs[VCPU_REGS_RDX]; + io_dir_in = 1; + goto do_io; + case 0xee: /* out al,dx */ + case 0xef: /* out (e/r)ax,dx */ + port = c->regs[VCPU_REGS_RDX]; + io_dir_in = 0; + do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, + (c->d & ByteOp) ? 1 : c->op_bytes, + port) != 0) { + c->eip = saved_eip; + goto cannot_emulate; + } + return 0; case 0xf4: /* hlt */ ctxt->vcpu->arch.halt_request = 1; break; @@ -1754,6 +1804,14 @@ special_insn: ctxt->eflags |= X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. */ break; + case 0xfc: /* cld */ + ctxt->eflags &= ~EFLG_DF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xfd: /* std */ + ctxt->eflags |= EFLG_DF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; case 0xfe ... 0xff: /* Grp4/Grp5 */ rc = emulate_grp45(ctxt, ops); if (rc != 0) @@ -1768,7 +1826,7 @@ writeback: /* Commit shadow register state. */ memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); - ctxt->vcpu->arch.rip = c->eip; + kvm_rip_write(ctxt->vcpu, c->eip); done: if (rc == X86EMUL_UNHANDLEABLE) { @@ -1793,7 +1851,7 @@ twobyte_insn: goto done; /* Let the processor re-execute the fixed hypercall */ - c->eip = ctxt->vcpu->arch.rip; + c->eip = kvm_rip_read(ctxt->vcpu); /* Disable writeback. */ c->dst.type = OP_NONE; break; @@ -1889,7 +1947,7 @@ twobyte_insn: rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); if (rc) { kvm_inject_gp(ctxt->vcpu, 0); - c->eip = ctxt->vcpu->arch.rip; + c->eip = kvm_rip_read(ctxt->vcpu); } rc = X86EMUL_CONTINUE; c->dst.type = OP_NONE; @@ -1899,7 +1957,7 @@ twobyte_insn: rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); if (rc) { kvm_inject_gp(ctxt->vcpu, 0); - c->eip = ctxt->vcpu->arch.rip; + c->eip = kvm_rip_read(ctxt->vcpu); } else { c->regs[VCPU_REGS_RAX] = (u32)msr_data; c->regs[VCPU_REGS_RDX] = msr_data >> 32; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 004ba86..c9f7cda 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -198,17 +198,10 @@ unsigned long long xen_sched_clock(void) /* Get the TSC speed from Xen */ unsigned long xen_tsc_khz(void) { - u64 xen_khz = 1000000ULL << 32; - const struct pvclock_vcpu_time_info *info = + struct pvclock_vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_info[0].time; - do_div(xen_khz, info->tsc_to_system_mul); - if (info->tsc_shift < 0) - xen_khz <<= -info->tsc_shift; - else - xen_khz >>= info->tsc_shift; - - return xen_khz; + return pvclock_tsc_khz(info); } cycle_t xen_clocksource_read(void) diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index bd2c016..e842e75 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -28,9 +28,9 @@ #include <linux/pci.h> #include <linux/dmar.h> +#include <linux/iova.h> +#include <linux/intel-iommu.h> #include <linux/timer.h> -#include "iova.h" -#include "intel-iommu.h" #undef PREFIX #define PREFIX "DMAR:" diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 389fdd6..fc5f2db 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -33,8 +33,8 @@ #include <linux/dma-mapping.h> #include <linux/mempool.h> #include <linux/timer.h> -#include "iova.h" -#include "intel-iommu.h" +#include <linux/iova.h> +#include <linux/intel-iommu.h> #include <asm/proto.h> /* force_iommu in this header in x86-64*/ #include <asm/cacheflush.h> #include <asm/iommu.h> @@ -156,7 +156,7 @@ static inline void *alloc_domain_mem(void) return iommu_kmem_cache_alloc(iommu_domain_cache); } -static inline void free_domain_mem(void *vaddr) +static void free_domain_mem(void *vaddr) { kmem_cache_free(iommu_domain_cache, vaddr); } @@ -1341,7 +1341,7 @@ static void domain_remove_dev_info(struct dmar_domain *domain) * find_domain * Note: we use struct pci_dev->dev.archdata.iommu stores the info */ -struct dmar_domain * +static struct dmar_domain * find_domain(struct pci_dev *pdev) { struct device_domain_info *info; @@ -2318,3 +2318,111 @@ int __init intel_iommu_init(void) return 0; } +void intel_iommu_domain_exit(struct dmar_domain *domain) +{ + u64 end; + + /* Domain 0 is reserved, so dont process it */ + if (!domain) + return; + + end = DOMAIN_MAX_ADDR(domain->gaw); + end = end & (~PAGE_MASK_4K); + + /* clear ptes */ + dma_pte_clear_range(domain, 0, end); + + /* free page tables */ + dma_pte_free_pagetable(domain, 0, end); + + iommu_free_domain(domain); + free_domain_mem(domain); +} +EXPORT_SYMBOL_GPL(intel_iommu_domain_exit); + +struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev) +{ + struct dmar_drhd_unit *drhd; + struct dmar_domain *domain; + struct intel_iommu *iommu; + + drhd = dmar_find_matched_drhd_unit(pdev); + if (!drhd) { + printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n"); + return NULL; + } + + iommu = drhd->iommu; + if (!iommu) { + printk(KERN_ERR + "intel_iommu_domain_alloc: iommu == NULL\n"); + return NULL; + } + domain = iommu_alloc_domain(iommu); + if (!domain) { + printk(KERN_ERR + "intel_iommu_domain_alloc: domain == NULL\n"); + return NULL; + } + if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { + printk(KERN_ERR + "intel_iommu_domain_alloc: domain_init() failed\n"); + intel_iommu_domain_exit(domain); + return NULL; + } + return domain; +} +EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc); + +int intel_iommu_context_mapping( + struct dmar_domain *domain, struct pci_dev *pdev) +{ + int rc; + rc = domain_context_mapping(domain, pdev); + return rc; +} +EXPORT_SYMBOL_GPL(intel_iommu_context_mapping); + +int intel_iommu_page_mapping( + struct dmar_domain *domain, dma_addr_t iova, + u64 hpa, size_t size, int prot) +{ + int rc; + rc = domain_page_mapping(domain, iova, hpa, size, prot); + return rc; +} +EXPORT_SYMBOL_GPL(intel_iommu_page_mapping); + +void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn) +{ + detach_domain_for_dev(domain, bus, devfn); +} +EXPORT_SYMBOL_GPL(intel_iommu_detach_dev); + +struct dmar_domain * +intel_iommu_find_domain(struct pci_dev *pdev) +{ + return find_domain(pdev); +} +EXPORT_SYMBOL_GPL(intel_iommu_find_domain); + +int intel_iommu_found(void) +{ + return g_num_of_iommus; +} +EXPORT_SYMBOL_GPL(intel_iommu_found); + +u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova) +{ + struct dma_pte *pte; + u64 pfn; + + pfn = 0; + pte = addr_to_dma_pte(domain, iova); + + if (pte) + pfn = dma_pte_addr(*pte); + + return pfn >> PAGE_SHIFT_4K; +} +EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn); diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index bb642cc..738d4c8 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -4,7 +4,7 @@ #include <linux/pci.h> #include <linux/irq.h> #include <asm/io_apic.h> -#include "intel-iommu.h" +#include <linux/intel-iommu.h> #include "intr_remapping.h" static struct ioapic_scope ir_ioapic[MAX_IO_APICS]; diff --git a/drivers/pci/intr_remapping.h b/drivers/pci/intr_remapping.h index 05f2635..ca48f0d 100644 --- a/drivers/pci/intr_remapping.h +++ b/drivers/pci/intr_remapping.h @@ -1,4 +1,4 @@ -#include "intel-iommu.h" +#include <linux/intel-iommu.h> struct ioapic_scope { struct intel_iommu *iommu; diff --git a/drivers/pci/iova.c b/drivers/pci/iova.c index 3ef4ac0..2287116 100644 --- a/drivers/pci/iova.c +++ b/drivers/pci/iova.c @@ -7,7 +7,7 @@ * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com> */ -#include "iova.h" +#include <linux/iova.h> void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit) diff --git a/include/asm-x86/kvm.h b/include/asm-x86/kvm.h index 78e954d..ba0dd79 100644 --- a/include/asm-x86/kvm.h +++ b/include/asm-x86/kvm.h @@ -208,26 +208,4 @@ struct kvm_pit_channel_state { struct kvm_pit_state { struct kvm_pit_channel_state channels[3]; }; - -#define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02) -#define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03) -#define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04) -#define KVM_TRC_IO_READ (KVM_TRC_HANDLER + 0x05) -#define KVM_TRC_IO_WRITE (KVM_TRC_HANDLER + 0x06) -#define KVM_TRC_CR_READ (KVM_TRC_HANDLER + 0x07) -#define KVM_TRC_CR_WRITE (KVM_TRC_HANDLER + 0x08) -#define KVM_TRC_DR_READ (KVM_TRC_HANDLER + 0x09) -#define KVM_TRC_DR_WRITE (KVM_TRC_HANDLER + 0x0A) -#define KVM_TRC_MSR_READ (KVM_TRC_HANDLER + 0x0B) -#define KVM_TRC_MSR_WRITE (KVM_TRC_HANDLER + 0x0C) -#define KVM_TRC_CPUID (KVM_TRC_HANDLER + 0x0D) -#define KVM_TRC_INTR (KVM_TRC_HANDLER + 0x0E) -#define KVM_TRC_NMI (KVM_TRC_HANDLER + 0x0F) -#define KVM_TRC_VMMCALL (KVM_TRC_HANDLER + 0x10) -#define KVM_TRC_HLT (KVM_TRC_HANDLER + 0x11) -#define KVM_TRC_CLTS (KVM_TRC_HANDLER + 0x12) -#define KVM_TRC_LMSW (KVM_TRC_HANDLER + 0x13) -#define KVM_TRC_APIC_ACCESS (KVM_TRC_HANDLER + 0x14) -#define KVM_TRC_TDP_FAULT (KVM_TRC_HANDLER + 0x15) - #endif /* ASM_X86__KVM_H */ diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 6979454..411fb8c 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -57,6 +57,10 @@ #define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE) #define DE_VECTOR 0 +#define DB_VECTOR 1 +#define BP_VECTOR 3 +#define OF_VECTOR 4 +#define BR_VECTOR 5 #define UD_VECTOR 6 #define NM_VECTOR 7 #define DF_VECTOR 8 @@ -65,6 +69,7 @@ #define SS_VECTOR 12 #define GP_VECTOR 13 #define PF_VECTOR 14 +#define MF_VECTOR 16 #define MC_VECTOR 18 #define SELECTOR_TI_MASK (1 << 2) @@ -89,7 +94,7 @@ extern struct list_head vm_list; struct kvm_vcpu; struct kvm; -enum { +enum kvm_reg { VCPU_REGS_RAX = 0, VCPU_REGS_RCX = 1, VCPU_REGS_RDX = 2, @@ -108,6 +113,7 @@ enum { VCPU_REGS_R14 = 14, VCPU_REGS_R15 = 15, #endif + VCPU_REGS_RIP, NR_VCPU_REGS }; @@ -189,10 +195,20 @@ struct kvm_mmu_page { */ int multimapped; /* More than one parent_pte? */ int root_count; /* Currently serving as active root */ + bool unsync; + bool unsync_children; union { u64 *parent_pte; /* !multimapped */ struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ }; + DECLARE_BITMAP(unsync_child_bitmap, 512); +}; + +struct kvm_pv_mmu_op_buffer { + void *ptr; + unsigned len; + unsigned processed; + char buf[512] __aligned(sizeof(long)); }; /* @@ -207,6 +223,9 @@ struct kvm_mmu { gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); void (*prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page); + int (*sync_page)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp); + void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); hpa_t root_hpa; int root_level; int shadow_root_level; @@ -219,8 +238,13 @@ struct kvm_vcpu_arch { int interrupt_window_open; unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); - unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ - unsigned long rip; /* needs vcpu_load_rsp_rip() */ + /* + * rip and regs accesses must go through + * kvm_{register,rip}_{read,write} functions. + */ + unsigned long regs[NR_VCPU_REGS]; + u32 regs_avail; + u32 regs_dirty; unsigned long cr0; unsigned long cr2; @@ -237,6 +261,9 @@ struct kvm_vcpu_arch { bool tpr_access_reporting; struct kvm_mmu mmu; + /* only needed in kvm_pv_mmu_op() path, but it's hot so + * put it here to avoid allocation */ + struct kvm_pv_mmu_op_buffer mmu_op_buffer; struct kvm_mmu_memory_cache mmu_pte_chain_cache; struct kvm_mmu_memory_cache mmu_rmap_desc_cache; @@ -269,6 +296,11 @@ struct kvm_vcpu_arch { u32 error_code; } exception; + struct kvm_queued_interrupt { + bool pending; + u8 nr; + } interrupt; + struct { int active; u8 save_iopl; @@ -294,6 +326,7 @@ struct kvm_vcpu_arch { struct page *time_page; bool nmi_pending; + bool nmi_injected; u64 mtrr[0x100]; }; @@ -316,9 +349,12 @@ struct kvm_arch{ * Hash table of struct kvm_mmu_page. */ struct list_head active_mmu_pages; + struct list_head assigned_dev_head; + struct dmar_domain *intel_iommu_domain; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; + struct hlist_head irq_ack_notifier_list; int round_robin_prev_vcpu; unsigned int tss_addr; @@ -338,6 +374,7 @@ struct kvm_vm_stat { u32 mmu_flooded; u32 mmu_recycled; u32 mmu_cache_miss; + u32 mmu_unsync; u32 remote_tlb_flush; u32 lpages; }; @@ -364,6 +401,7 @@ struct kvm_vcpu_stat { u32 insn_emulation; u32 insn_emulation_fail; u32 hypercalls; + u32 irq_injections; }; struct descriptor_table { @@ -414,8 +452,7 @@ struct kvm_x86_ops { unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr); void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value, int *exception); - void (*cache_regs)(struct kvm_vcpu *vcpu); - void (*decache_regs)(struct kvm_vcpu *vcpu); + void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); @@ -528,6 +565,8 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, u32 error_code); +void kvm_pic_set_irq(void *opaque, int irq, int level); + void kvm_inject_nmi(struct kvm_vcpu *vcpu); void fx_init(struct kvm_vcpu *vcpu); @@ -550,12 +589,14 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_fix_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); +void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_enable_tdp(void); void kvm_disable_tdp(void); @@ -686,33 +727,6 @@ enum { TASK_SWITCH_GATE = 3, }; -#define KVMTRACE_5D(evt, vcpu, d1, d2, d3, d4, d5, name) \ - trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ - vcpu, 5, d1, d2, d3, d4, d5) -#define KVMTRACE_4D(evt, vcpu, d1, d2, d3, d4, name) \ - trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ - vcpu, 4, d1, d2, d3, d4, 0) -#define KVMTRACE_3D(evt, vcpu, d1, d2, d3, name) \ - trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ - vcpu, 3, d1, d2, d3, 0, 0) -#define KVMTRACE_2D(evt, vcpu, d1, d2, name) \ - trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ - vcpu, 2, d1, d2, 0, 0, 0) -#define KVMTRACE_1D(evt, vcpu, d1, name) \ - trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ - vcpu, 1, d1, 0, 0, 0, 0) -#define KVMTRACE_0D(evt, vcpu, name) \ - trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ - vcpu, 0, 0, 0, 0, 0, 0) - -#ifdef CONFIG_64BIT -# define KVM_EX_ENTRY ".quad" -# define KVM_EX_PUSH "pushq" -#else -# define KVM_EX_ENTRY ".long" -# define KVM_EX_PUSH "pushl" -#endif - /* * Hardware virtualization extension instructions may fault if a * reboot turns off virtualization while processes are running. @@ -724,11 +738,11 @@ asmlinkage void kvm_handle_fault_on_reboot(void); "666: " insn "\n\t" \ ".pushsection .fixup, \"ax\" \n" \ "667: \n\t" \ - KVM_EX_PUSH " $666b \n\t" \ + __ASM_SIZE(push) " $666b \n\t" \ "jmp kvm_handle_fault_on_reboot \n\t" \ ".popsection \n\t" \ ".pushsection __ex_table, \"a\" \n\t" \ - KVM_EX_ENTRY " 666b, 667b \n\t" \ + _ASM_PTR " 666b, 667b \n\t" \ ".popsection" #define KVM_ARCH_WANT_MMU_NOTIFIER diff --git a/include/asm-x86/msr-index.h b/include/asm-x86/msr-index.h index 0bb4330..dabd10f 100644 --- a/include/asm-x86/msr-index.h +++ b/include/asm-x86/msr-index.h @@ -178,6 +178,9 @@ #define MSR_IA32_EBL_CR_POWERON 0x0000002a #define MSR_IA32_FEATURE_CONTROL 0x0000003a +#define FEATURE_CONTROL_LOCKED (1<<0) +#define FEATURE_CONTROL_VMXON_ENABLED (1<<2) + #define MSR_IA32_APICBASE 0x0000001b #define MSR_IA32_APICBASE_BSP (1<<8) #define MSR_IA32_APICBASE_ENABLE (1<<11) diff --git a/include/asm-x86/pvclock.h b/include/asm-x86/pvclock.h index 1a38f68..ad29e27 100644 --- a/include/asm-x86/pvclock.h +++ b/include/asm-x86/pvclock.h @@ -6,6 +6,7 @@ /* some helper functions for xen and kvm pv clock sources */ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); +unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); void pvclock_read_wallclock(struct pvclock_wall_clock *wall, struct pvclock_vcpu_time_info *vcpu, struct timespec *ts); diff --git a/drivers/pci/dma_remapping.h b/include/linux/dma_remapping.h index bff5c65..bff5c65 100644 --- a/drivers/pci/dma_remapping.h +++ b/include/linux/dma_remapping.h diff --git a/drivers/pci/intel-iommu.h b/include/linux/intel-iommu.h index 2142c01..2e117f3 100644 --- a/drivers/pci/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -25,10 +25,10 @@ #include <linux/types.h> #include <linux/msi.h> #include <linux/sysdev.h> -#include "iova.h" +#include <linux/iova.h> #include <linux/io.h> +#include <linux/dma_remapping.h> #include <asm/cacheflush.h> -#include "dma_remapping.h" /* * Intel IOMMU register specification per version 1.0 public spec. @@ -304,4 +304,24 @@ extern int dmar_enable_qi(struct intel_iommu *iommu); extern void qi_global_iec(struct intel_iommu *iommu); extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); + +void intel_iommu_domain_exit(struct dmar_domain *domain); +struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev); +int intel_iommu_context_mapping(struct dmar_domain *domain, + struct pci_dev *pdev); +int intel_iommu_page_mapping(struct dmar_domain *domain, dma_addr_t iova, + u64 hpa, size_t size, int prot); +void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn); +struct dmar_domain *intel_iommu_find_domain(struct pci_dev *pdev); +u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova); + +#ifdef CONFIG_DMAR +int intel_iommu_found(void); +#else /* CONFIG_DMAR */ +static inline int intel_iommu_found(void) +{ + return 0; +} +#endif /* CONFIG_DMAR */ + #endif diff --git a/drivers/pci/iova.h b/include/linux/iova.h index 228f6c9..228f6c9 100644 --- a/drivers/pci/iova.h +++ b/include/linux/iova.h diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 70a3065..797fcd7 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -311,22 +311,33 @@ struct kvm_s390_interrupt { /* This structure represents a single trace buffer record. */ struct kvm_trace_rec { - __u32 event:28; - __u32 extra_u32:3; - __u32 cycle_in:1; + /* variable rec_val + * is split into: + * bits 0 - 27 -> event id + * bits 28 -30 -> number of extra data args of size u32 + * bits 31 -> binary indicator for if tsc is in record + */ + __u32 rec_val; __u32 pid; __u32 vcpu_id; union { struct { - __u64 cycle_u64; + __u64 timestamp; __u32 extra_u32[KVM_TRC_EXTRA_MAX]; - } __attribute__((packed)) cycle; + } __attribute__((packed)) timestamp; struct { __u32 extra_u32[KVM_TRC_EXTRA_MAX]; - } nocycle; + } notimestamp; } u; }; +#define TRACE_REC_EVENT_ID(val) \ + (0x0fffffff & (val)) +#define TRACE_REC_NUM_DATA_ARGS(val) \ + (0x70000000 & ((val) << 28)) +#define TRACE_REC_TCS(val) \ + (0x80000000 & ((val) << 31)) + #define KVMIO 0xAE /* @@ -372,6 +383,10 @@ struct kvm_trace_rec { #define KVM_CAP_MP_STATE 14 #define KVM_CAP_COALESCED_MMIO 15 #define KVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */ +#if defined(CONFIG_X86)||defined(CONFIG_IA64) +#define KVM_CAP_DEVICE_ASSIGNMENT 17 +#endif +#define KVM_CAP_IOMMU 18 /* * ioctls for VM fds @@ -401,6 +416,10 @@ struct kvm_trace_rec { _IOW(KVMIO, 0x67, struct kvm_coalesced_mmio_zone) #define KVM_UNREGISTER_COALESCED_MMIO \ _IOW(KVMIO, 0x68, struct kvm_coalesced_mmio_zone) +#define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \ + struct kvm_assigned_pci_dev) +#define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \ + struct kvm_assigned_irq) /* * ioctls for vcpu fds @@ -440,4 +459,45 @@ struct kvm_trace_rec { #define KVM_GET_MP_STATE _IOR(KVMIO, 0x98, struct kvm_mp_state) #define KVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state) +#define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02) +#define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03) +#define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04) +#define KVM_TRC_IO_READ (KVM_TRC_HANDLER + 0x05) +#define KVM_TRC_IO_WRITE (KVM_TRC_HANDLER + 0x06) +#define KVM_TRC_CR_READ (KVM_TRC_HANDLER + 0x07) +#define KVM_TRC_CR_WRITE (KVM_TRC_HANDLER + 0x08) +#define KVM_TRC_DR_READ (KVM_TRC_HANDLER + 0x09) +#define KVM_TRC_DR_WRITE (KVM_TRC_HANDLER + 0x0A) +#define KVM_TRC_MSR_READ (KVM_TRC_HANDLER + 0x0B) +#define KVM_TRC_MSR_WRITE (KVM_TRC_HANDLER + 0x0C) +#define KVM_TRC_CPUID (KVM_TRC_HANDLER + 0x0D) +#define KVM_TRC_INTR (KVM_TRC_HANDLER + 0x0E) +#define KVM_TRC_NMI (KVM_TRC_HANDLER + 0x0F) +#define KVM_TRC_VMMCALL (KVM_TRC_HANDLER + 0x10) +#define KVM_TRC_HLT (KVM_TRC_HANDLER + 0x11) +#define KVM_TRC_CLTS (KVM_TRC_HANDLER + 0x12) +#define KVM_TRC_LMSW (KVM_TRC_HANDLER + 0x13) +#define KVM_TRC_APIC_ACCESS (KVM_TRC_HANDLER + 0x14) +#define KVM_TRC_TDP_FAULT (KVM_TRC_HANDLER + 0x15) +#define KVM_TRC_GTLB_WRITE (KVM_TRC_HANDLER + 0x16) +#define KVM_TRC_STLB_WRITE (KVM_TRC_HANDLER + 0x17) +#define KVM_TRC_STLB_INVAL (KVM_TRC_HANDLER + 0x18) +#define KVM_TRC_PPC_INSTR (KVM_TRC_HANDLER + 0x19) + +struct kvm_assigned_pci_dev { + __u32 assigned_dev_id; + __u32 busnr; + __u32 devfn; + __u32 flags; +}; + +struct kvm_assigned_irq { + __u32 assigned_dev_id; + __u32 host_irq; + __u32 guest_irq; + __u32 flags; +}; + +#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) + #endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8525afc..3833c48 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -34,6 +34,8 @@ #define KVM_REQ_MMU_RELOAD 3 #define KVM_REQ_TRIPLE_FAULT 4 #define KVM_REQ_PENDING_TIMER 5 +#define KVM_REQ_UNHALT 6 +#define KVM_REQ_MMU_SYNC 7 struct kvm_vcpu; extern struct kmem_cache *kvm_vcpu_cache; @@ -279,12 +281,68 @@ void kvm_free_physmem(struct kvm *kvm); struct kvm *kvm_arch_create_vm(void); void kvm_arch_destroy_vm(struct kvm *kvm); +void kvm_free_all_assigned_devices(struct kvm *kvm); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *v); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); +int kvm_is_mmio_pfn(pfn_t pfn); + +struct kvm_irq_ack_notifier { + struct hlist_node link; + unsigned gsi; + void (*irq_acked)(struct kvm_irq_ack_notifier *kian); +}; + +struct kvm_assigned_dev_kernel { + struct kvm_irq_ack_notifier ack_notifier; + struct work_struct interrupt_work; + struct list_head list; + int assigned_dev_id; + int host_busnr; + int host_devfn; + int host_irq; + int guest_irq; + int irq_requested; + struct pci_dev *dev; + struct kvm *kvm; +}; +void kvm_set_irq(struct kvm *kvm, int irq, int level); +void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi); +void kvm_register_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian); +void kvm_unregister_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian); + +#ifdef CONFIG_DMAR +int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn, + unsigned long npages); +int kvm_iommu_map_guest(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev); +int kvm_iommu_unmap_guest(struct kvm *kvm); +#else /* CONFIG_DMAR */ +static inline int kvm_iommu_map_pages(struct kvm *kvm, + gfn_t base_gfn, + unsigned long npages) +{ + return 0; +} + +static inline int kvm_iommu_map_guest(struct kvm *kvm, + struct kvm_assigned_dev_kernel + *assigned_dev) +{ + return -ENODEV; +} + +static inline int kvm_iommu_unmap_guest(struct kvm *kvm) +{ + return 0; +} +#endif /* CONFIG_DMAR */ + static inline void kvm_guest_enter(void) { account_system_vtime(current); @@ -307,6 +365,11 @@ static inline gpa_t gfn_to_gpa(gfn_t gfn) return (gpa_t)gfn << PAGE_SHIFT; } +static inline hpa_t pfn_to_hpa(pfn_t pfn) +{ + return (hpa_t)pfn << PAGE_SHIFT; +} + static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu) { set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); @@ -326,6 +389,25 @@ struct kvm_stats_debugfs_item { extern struct kvm_stats_debugfs_item debugfs_entries[]; extern struct dentry *kvm_debugfs_dir; +#define KVMTRACE_5D(evt, vcpu, d1, d2, d3, d4, d5, name) \ + trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ + vcpu, 5, d1, d2, d3, d4, d5) +#define KVMTRACE_4D(evt, vcpu, d1, d2, d3, d4, name) \ + trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ + vcpu, 4, d1, d2, d3, d4, 0) +#define KVMTRACE_3D(evt, vcpu, d1, d2, d3, name) \ + trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ + vcpu, 3, d1, d2, d3, 0, 0) +#define KVMTRACE_2D(evt, vcpu, d1, d2, name) \ + trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ + vcpu, 2, d1, d2, 0, 0, 0) +#define KVMTRACE_1D(evt, vcpu, d1, name) \ + trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ + vcpu, 1, d1, 0, 0, 0, 0) +#define KVMTRACE_0D(evt, vcpu, name) \ + trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \ + vcpu, 0, 0, 0, 0, 0, 0) + #ifdef CONFIG_KVM_TRACE int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg); void kvm_trace_cleanup(void); diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index c0d2287..53772bb 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -39,6 +39,7 @@ #include "ioapic.h" #include "lapic.h" +#include "irq.h" #if 0 #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) @@ -285,26 +286,31 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) } } -static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi) +static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi, + int trigger_mode) { union ioapic_redir_entry *ent; ent = &ioapic->redirtbl[gsi]; - ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); - ent->fields.remote_irr = 0; - if (!ent->fields.mask && (ioapic->irr & (1 << gsi))) - ioapic_service(ioapic, gsi); + kvm_notify_acked_irq(ioapic->kvm, gsi); + + if (trigger_mode == IOAPIC_LEVEL_TRIG) { + ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); + ent->fields.remote_irr = 0; + if (!ent->fields.mask && (ioapic->irr & (1 << gsi))) + ioapic_service(ioapic, gsi); + } } -void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) +void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; int i; for (i = 0; i < IOAPIC_NUM_PINS; i++) if (ioapic->redirtbl[i].fields.vector == vector) - __kvm_ioapic_update_eoi(ioapic, i); + __kvm_ioapic_update_eoi(ioapic, i, trigger_mode); } static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr, @@ -380,7 +386,7 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, break; #ifdef CONFIG_IA64 case IOAPIC_REG_EOI: - kvm_ioapic_update_eoi(ioapic->kvm, data); + kvm_ioapic_update_eoi(ioapic->kvm, data, IOAPIC_LEVEL_TRIG); break; #endif diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 7f16675..cd7ae76 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -58,6 +58,7 @@ struct kvm_ioapic { } redirtbl[IOAPIC_NUM_PINS]; struct kvm_io_device dev; struct kvm *kvm; + void (*ack_notifier)(void *opaque, int irq); }; #ifdef DEBUG @@ -78,16 +79,9 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) return kvm->arch.vioapic; } -#ifdef CONFIG_IA64 -static inline int irqchip_in_kernel(struct kvm *kvm) -{ - return 1; -} -#endif - struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, unsigned long bitmap); -void kvm_ioapic_update_eoi(struct kvm *kvm, int vector); +void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); int kvm_ioapic_init(struct kvm *kvm); void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); void kvm_ioapic_reset(struct kvm_ioapic *ioapic); diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c new file mode 100644 index 0000000..d0169f5 --- /dev/null +++ b/virt/kvm/irq_comm.c @@ -0,0 +1,60 @@ +/* + * irq_comm.c: Common API for in kernel interrupt controller + * Copyright (c) 2007, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * Authors: + * Yaozu (Eddie) Dong <Eddie.dong@intel.com> + * + */ + +#include <linux/kvm_host.h> +#include "irq.h" + +#include "ioapic.h" + +/* This should be called with the kvm->lock mutex held */ +void kvm_set_irq(struct kvm *kvm, int irq, int level) +{ + /* Not possible to detect if the guest uses the PIC or the + * IOAPIC. So set the bit in both. The guest will ignore + * writes to the unused one. + */ + kvm_ioapic_set_irq(kvm->arch.vioapic, irq, level); +#ifdef CONFIG_X86 + kvm_pic_set_irq(pic_irqchip(kvm), irq, level); +#endif +} + +void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi) +{ + struct kvm_irq_ack_notifier *kian; + struct hlist_node *n; + + hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link) + if (kian->gsi == gsi) + kian->irq_acked(kian); +} + +void kvm_register_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian) +{ + hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list); +} + +void kvm_unregister_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian) +{ + hlist_del(&kian->link); +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7dd9b0b..cf0ab8e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -51,6 +51,12 @@ #include "coalesced_mmio.h" #endif +#ifdef KVM_CAP_DEVICE_ASSIGNMENT +#include <linux/pci.h> +#include <linux/interrupt.h> +#include "irq.h" +#endif + MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -71,11 +77,253 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, bool kvm_rebooting; +#ifdef KVM_CAP_DEVICE_ASSIGNMENT +static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, + int assigned_dev_id) +{ + struct list_head *ptr; + struct kvm_assigned_dev_kernel *match; + + list_for_each(ptr, head) { + match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); + if (match->assigned_dev_id == assigned_dev_id) + return match; + } + return NULL; +} + +static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) +{ + struct kvm_assigned_dev_kernel *assigned_dev; + + assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, + interrupt_work); + + /* This is taken to safely inject irq inside the guest. When + * the interrupt injection (or the ioapic code) uses a + * finer-grained lock, update this + */ + mutex_lock(&assigned_dev->kvm->lock); + kvm_set_irq(assigned_dev->kvm, + assigned_dev->guest_irq, 1); + mutex_unlock(&assigned_dev->kvm->lock); + kvm_put_kvm(assigned_dev->kvm); +} + +/* FIXME: Implement the OR logic needed to make shared interrupts on + * this line behave properly + */ +static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = + (struct kvm_assigned_dev_kernel *) dev_id; + + kvm_get_kvm(assigned_dev->kvm); + schedule_work(&assigned_dev->interrupt_work); + disable_irq_nosync(irq); + return IRQ_HANDLED; +} + +/* Ack the irq line for an assigned device */ +static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) +{ + struct kvm_assigned_dev_kernel *dev; + + if (kian->gsi == -1) + return; + + dev = container_of(kian, struct kvm_assigned_dev_kernel, + ack_notifier); + kvm_set_irq(dev->kvm, dev->guest_irq, 0); + enable_irq(dev->host_irq); +} + +static void kvm_free_assigned_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel + *assigned_dev) +{ + if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested) + free_irq(assigned_dev->host_irq, (void *)assigned_dev); + + kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); + + if (cancel_work_sync(&assigned_dev->interrupt_work)) + /* We had pending work. That means we will have to take + * care of kvm_put_kvm. + */ + kvm_put_kvm(kvm); + + pci_release_regions(assigned_dev->dev); + pci_disable_device(assigned_dev->dev); + pci_dev_put(assigned_dev->dev); + + list_del(&assigned_dev->list); + kfree(assigned_dev); +} + +void kvm_free_all_assigned_devices(struct kvm *kvm) +{ + struct list_head *ptr, *ptr2; + struct kvm_assigned_dev_kernel *assigned_dev; + + list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { + assigned_dev = list_entry(ptr, + struct kvm_assigned_dev_kernel, + list); + + kvm_free_assigned_device(kvm, assigned_dev); + } +} + +static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, + struct kvm_assigned_irq + *assigned_irq) +{ + int r = 0; + struct kvm_assigned_dev_kernel *match; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_irq->assigned_dev_id); + if (!match) { + mutex_unlock(&kvm->lock); + return -EINVAL; + } + + if (match->irq_requested) { + match->guest_irq = assigned_irq->guest_irq; + match->ack_notifier.gsi = assigned_irq->guest_irq; + mutex_unlock(&kvm->lock); + return 0; + } + + INIT_WORK(&match->interrupt_work, + kvm_assigned_dev_interrupt_work_handler); + + if (irqchip_in_kernel(kvm)) { + if (!capable(CAP_SYS_RAWIO)) { + r = -EPERM; + goto out_release; + } + + if (assigned_irq->host_irq) + match->host_irq = assigned_irq->host_irq; + else + match->host_irq = match->dev->irq; + match->guest_irq = assigned_irq->guest_irq; + match->ack_notifier.gsi = assigned_irq->guest_irq; + match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; + kvm_register_irq_ack_notifier(kvm, &match->ack_notifier); + + /* Even though this is PCI, we don't want to use shared + * interrupts. Sharing host devices with guest-assigned devices + * on the same interrupt line is not a happy situation: there + * are going to be long delays in accepting, acking, etc. + */ + if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0, + "kvm_assigned_device", (void *)match)) { + r = -EIO; + goto out_release; + } + } + + match->irq_requested = true; + mutex_unlock(&kvm->lock); + return r; +out_release: + mutex_unlock(&kvm->lock); + kvm_free_assigned_device(kvm, match); + return r; +} + +static int kvm_vm_ioctl_assign_device(struct kvm *kvm, + struct kvm_assigned_pci_dev *assigned_dev) +{ + int r = 0; + struct kvm_assigned_dev_kernel *match; + struct pci_dev *dev; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev->assigned_dev_id); + if (match) { + /* device already assigned */ + r = -EINVAL; + goto out; + } + + match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); + if (match == NULL) { + printk(KERN_INFO "%s: Couldn't allocate memory\n", + __func__); + r = -ENOMEM; + goto out; + } + dev = pci_get_bus_and_slot(assigned_dev->busnr, + assigned_dev->devfn); + if (!dev) { + printk(KERN_INFO "%s: host device not found\n", __func__); + r = -EINVAL; + goto out_free; + } + if (pci_enable_device(dev)) { + printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); + r = -EBUSY; + goto out_put; + } + r = pci_request_regions(dev, "kvm_assigned_device"); + if (r) { + printk(KERN_INFO "%s: Could not get access to device regions\n", + __func__); + goto out_disable; + } + match->assigned_dev_id = assigned_dev->assigned_dev_id; + match->host_busnr = assigned_dev->busnr; + match->host_devfn = assigned_dev->devfn; + match->dev = dev; + + match->kvm = kvm; + + list_add(&match->list, &kvm->arch.assigned_dev_head); + + if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { + r = kvm_iommu_map_guest(kvm, match); + if (r) + goto out_list_del; + } + +out: + mutex_unlock(&kvm->lock); + return r; +out_list_del: + list_del(&match->list); + pci_release_regions(dev); +out_disable: + pci_disable_device(dev); +out_put: + pci_dev_put(dev); +out_free: + kfree(match); + mutex_unlock(&kvm->lock); + return r; +} +#endif + static inline int valid_vcpu(int n) { return likely(n >= 0 && n < KVM_MAX_VCPUS); } +inline int kvm_is_mmio_pfn(pfn_t pfn) +{ + if (pfn_valid(pfn)) + return PageReserved(pfn_to_page(pfn)); + + return true; +} + /* * Switches to specified vcpu, until a matching vcpu_put() */ @@ -570,6 +818,12 @@ int __kvm_set_memory_region(struct kvm *kvm, } kvm_free_physmem_slot(&old, &new); +#ifdef CONFIG_DMAR + /* map the pages in iommu page table */ + r = kvm_iommu_map_pages(kvm, base_gfn, npages); + if (r) + goto out; +#endif return 0; out_free: @@ -708,9 +962,6 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_hva); -/* - * Requires current->mm->mmap_sem to be held - */ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) { struct page *page[1]; @@ -726,21 +977,24 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) return page_to_pfn(bad_page); } - npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page, - NULL); + npages = get_user_pages_fast(addr, 1, 1, page); if (unlikely(npages != 1)) { struct vm_area_struct *vma; + down_read(¤t->mm->mmap_sem); vma = find_vma(current->mm, addr); + if (vma == NULL || addr < vma->vm_start || !(vma->vm_flags & VM_PFNMAP)) { + up_read(¤t->mm->mmap_sem); get_page(bad_page); return page_to_pfn(bad_page); } pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - BUG_ON(pfn_valid(pfn)); + up_read(¤t->mm->mmap_sem); + BUG_ON(!kvm_is_mmio_pfn(pfn)); } else pfn = page_to_pfn(page[0]); @@ -754,10 +1008,10 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) pfn_t pfn; pfn = gfn_to_pfn(kvm, gfn); - if (pfn_valid(pfn)) + if (!kvm_is_mmio_pfn(pfn)) return pfn_to_page(pfn); - WARN_ON(!pfn_valid(pfn)); + WARN_ON(kvm_is_mmio_pfn(pfn)); get_page(bad_page); return bad_page; @@ -773,7 +1027,7 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean); void kvm_release_pfn_clean(pfn_t pfn) { - if (pfn_valid(pfn)) + if (!kvm_is_mmio_pfn(pfn)) put_page(pfn_to_page(pfn)); } EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); @@ -799,7 +1053,7 @@ EXPORT_SYMBOL_GPL(kvm_set_page_dirty); void kvm_set_pfn_dirty(pfn_t pfn) { - if (pfn_valid(pfn)) { + if (!kvm_is_mmio_pfn(pfn)) { struct page *page = pfn_to_page(pfn); if (!PageReserved(page)) SetPageDirty(page); @@ -809,14 +1063,14 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); void kvm_set_pfn_accessed(pfn_t pfn) { - if (pfn_valid(pfn)) + if (!kvm_is_mmio_pfn(pfn)) mark_page_accessed(pfn_to_page(pfn)); } EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); void kvm_get_pfn(pfn_t pfn) { - if (pfn_valid(pfn)) + if (!kvm_is_mmio_pfn(pfn)) get_page(pfn_to_page(pfn)); } EXPORT_SYMBOL_GPL(kvm_get_pfn); @@ -972,12 +1226,12 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) for (;;) { prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); - if (kvm_cpu_has_interrupt(vcpu)) - break; - if (kvm_cpu_has_pending_timer(vcpu)) - break; - if (kvm_arch_vcpu_runnable(vcpu)) + if (kvm_cpu_has_interrupt(vcpu) || + kvm_cpu_has_pending_timer(vcpu) || + kvm_arch_vcpu_runnable(vcpu)) { + set_bit(KVM_REQ_UNHALT, &vcpu->requests); break; + } if (signal_pending(current)) break; @@ -1074,12 +1328,11 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) r = kvm_arch_vcpu_setup(vcpu); if (r) - goto vcpu_destroy; + return r; mutex_lock(&kvm->lock); if (kvm->vcpus[n]) { r = -EEXIST; - mutex_unlock(&kvm->lock); goto vcpu_destroy; } kvm->vcpus[n] = vcpu; @@ -1095,8 +1348,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) unlink: mutex_lock(&kvm->lock); kvm->vcpus[n] = NULL; - mutex_unlock(&kvm->lock); vcpu_destroy: + mutex_unlock(&kvm->lock); kvm_arch_vcpu_destroy(vcpu); return r; } @@ -1118,6 +1371,8 @@ static long kvm_vcpu_ioctl(struct file *filp, struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; int r; + struct kvm_fpu *fpu = NULL; + struct kvm_sregs *kvm_sregs = NULL; if (vcpu->kvm->mm != current->mm) return -EIO; @@ -1165,25 +1420,28 @@ out_free2: break; } case KVM_GET_SREGS: { - struct kvm_sregs kvm_sregs; - - memset(&kvm_sregs, 0, sizeof kvm_sregs); - r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs); + kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); + r = -ENOMEM; + if (!kvm_sregs) + goto out; + r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); if (r) goto out; r = -EFAULT; - if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs)) + if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) goto out; r = 0; break; } case KVM_SET_SREGS: { - struct kvm_sregs kvm_sregs; - + kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL); + r = -ENOMEM; + if (!kvm_sregs) + goto out; r = -EFAULT; - if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs)) + if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs))) goto out; - r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs); + r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); if (r) goto out; r = 0; @@ -1264,25 +1522,28 @@ out_free2: break; } case KVM_GET_FPU: { - struct kvm_fpu fpu; - - memset(&fpu, 0, sizeof fpu); - r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu); + fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); + r = -ENOMEM; + if (!fpu) + goto out; + r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); if (r) goto out; r = -EFAULT; - if (copy_to_user(argp, &fpu, sizeof fpu)) + if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) goto out; r = 0; break; } case KVM_SET_FPU: { - struct kvm_fpu fpu; - + fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL); + r = -ENOMEM; + if (!fpu) + goto out; r = -EFAULT; - if (copy_from_user(&fpu, argp, sizeof fpu)) + if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu))) goto out; - r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu); + r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); if (r) goto out; r = 0; @@ -1292,6 +1553,8 @@ out_free2: r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); } out: + kfree(fpu); + kfree(kvm_sregs); return r; } @@ -1360,6 +1623,30 @@ static long kvm_vm_ioctl(struct file *filp, break; } #endif +#ifdef KVM_CAP_DEVICE_ASSIGNMENT + case KVM_ASSIGN_PCI_DEVICE: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) + goto out; + r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); + if (r) + goto out; + break; + } + case KVM_ASSIGN_IRQ: { + struct kvm_assigned_irq assigned_irq; + + r = -EFAULT; + if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) + goto out; + r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); + if (r) + goto out; + break; + } +#endif default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); } @@ -1369,17 +1656,22 @@ out: static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page[1]; + unsigned long addr; + int npages; + gfn_t gfn = vmf->pgoff; struct kvm *kvm = vma->vm_file->private_data; - struct page *page; - if (!kvm_is_visible_gfn(kvm, vmf->pgoff)) + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) return VM_FAULT_SIGBUS; - page = gfn_to_page(kvm, vmf->pgoff); - if (is_error_page(page)) { - kvm_release_page_clean(page); + + npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page, + NULL); + if (unlikely(npages != 1)) return VM_FAULT_SIGBUS; - } - vmf->page = page; + + vmf->page = page[0]; return 0; } diff --git a/virt/kvm/kvm_trace.c b/virt/kvm/kvm_trace.c index 58141f3..41dcc84 100644 --- a/virt/kvm/kvm_trace.c +++ b/virt/kvm/kvm_trace.c @@ -17,6 +17,7 @@ #include <linux/module.h> #include <linux/relay.h> #include <linux/debugfs.h> +#include <linux/ktime.h> #include <linux/kvm_host.h> @@ -35,16 +36,16 @@ static struct kvm_trace *kvm_trace; struct kvm_trace_probe { const char *name; const char *format; - u32 cycle_in; + u32 timestamp_in; marker_probe_func *probe_func; }; -static inline int calc_rec_size(int cycle, int extra) +static inline int calc_rec_size(int timestamp, int extra) { int rec_size = KVM_TRC_HEAD_SIZE; rec_size += extra; - return cycle ? rec_size += KVM_TRC_CYCLE_SIZE : rec_size; + return timestamp ? rec_size += KVM_TRC_CYCLE_SIZE : rec_size; } static void kvm_add_trace(void *probe_private, void *call_data, @@ -54,12 +55,13 @@ static void kvm_add_trace(void *probe_private, void *call_data, struct kvm_trace *kt = kvm_trace; struct kvm_trace_rec rec; struct kvm_vcpu *vcpu; - int i, extra, size; + int i, size; + u32 extra; if (unlikely(kt->trace_state != KVM_TRACE_STATE_RUNNING)) return; - rec.event = va_arg(*args, u32); + rec.rec_val = TRACE_REC_EVENT_ID(va_arg(*args, u32)); vcpu = va_arg(*args, struct kvm_vcpu *); rec.pid = current->tgid; rec.vcpu_id = vcpu->vcpu_id; @@ -67,21 +69,21 @@ static void kvm_add_trace(void *probe_private, void *call_data, extra = va_arg(*args, u32); WARN_ON(!(extra <= KVM_TRC_EXTRA_MAX)); extra = min_t(u32, extra, KVM_TRC_EXTRA_MAX); - rec.extra_u32 = extra; - rec.cycle_in = p->cycle_in; + rec.rec_val |= TRACE_REC_TCS(p->timestamp_in) + | TRACE_REC_NUM_DATA_ARGS(extra); - if (rec.cycle_in) { - rec.u.cycle.cycle_u64 = get_cycles(); + if (p->timestamp_in) { + rec.u.timestamp.timestamp = ktime_to_ns(ktime_get()); - for (i = 0; i < rec.extra_u32; i++) - rec.u.cycle.extra_u32[i] = va_arg(*args, u32); + for (i = 0; i < extra; i++) + rec.u.timestamp.extra_u32[i] = va_arg(*args, u32); } else { - for (i = 0; i < rec.extra_u32; i++) - rec.u.nocycle.extra_u32[i] = va_arg(*args, u32); + for (i = 0; i < extra; i++) + rec.u.notimestamp.extra_u32[i] = va_arg(*args, u32); } - size = calc_rec_size(rec.cycle_in, rec.extra_u32 * sizeof(u32)); + size = calc_rec_size(p->timestamp_in, extra * sizeof(u32)); relay_write(kt->rchan, &rec, size); } diff --git a/virt/kvm/vtd.c b/virt/kvm/vtd.c new file mode 100644 index 0000000..a770874 --- /dev/null +++ b/virt/kvm/vtd.c @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) 2006-2008 Intel Corporation + * Copyright IBM Corporation, 2008 + * Author: Allen M. Kay <allen.m.kay@intel.com> + * Author: Weidong Han <weidong.han@intel.com> + * Author: Ben-Ami Yassour <benami@il.ibm.com> + */ + +#include <linux/list.h> +#include <linux/kvm_host.h> +#include <linux/pci.h> +#include <linux/dmar.h> +#include <linux/intel-iommu.h> + +static int kvm_iommu_unmap_memslots(struct kvm *kvm); +static void kvm_iommu_put_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages); + +int kvm_iommu_map_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages) +{ + gfn_t gfn = base_gfn; + pfn_t pfn; + int i, r = 0; + struct dmar_domain *domain = kvm->arch.intel_iommu_domain; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + for (i = 0; i < npages; i++) { + /* check if already mapped */ + pfn = (pfn_t)intel_iommu_iova_to_pfn(domain, + gfn_to_gpa(gfn)); + if (pfn) + continue; + + pfn = gfn_to_pfn(kvm, gfn); + r = intel_iommu_page_mapping(domain, + gfn_to_gpa(gfn), + pfn_to_hpa(pfn), + PAGE_SIZE, + DMA_PTE_READ | + DMA_PTE_WRITE); + if (r) { + printk(KERN_ERR "kvm_iommu_map_pages:" + "iommu failed to map pfn=%lx\n", pfn); + goto unmap_pages; + } + gfn++; + } + return 0; + +unmap_pages: + kvm_iommu_put_pages(kvm, base_gfn, i); + return r; +} + +static int kvm_iommu_map_memslots(struct kvm *kvm) +{ + int i, r; + + down_read(&kvm->slots_lock); + for (i = 0; i < kvm->nmemslots; i++) { + r = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn, + kvm->memslots[i].npages); + if (r) + break; + } + up_read(&kvm->slots_lock); + return r; +} + +int kvm_iommu_map_guest(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + struct pci_dev *pdev = NULL; + int r; + + if (!intel_iommu_found()) { + printk(KERN_ERR "%s: intel iommu not found\n", __func__); + return -ENODEV; + } + + printk(KERN_DEBUG "VT-d direct map: host bdf = %x:%x:%x\n", + assigned_dev->host_busnr, + PCI_SLOT(assigned_dev->host_devfn), + PCI_FUNC(assigned_dev->host_devfn)); + + pdev = assigned_dev->dev; + + if (pdev == NULL) { + if (kvm->arch.intel_iommu_domain) { + intel_iommu_domain_exit(kvm->arch.intel_iommu_domain); + kvm->arch.intel_iommu_domain = NULL; + } + return -ENODEV; + } + + kvm->arch.intel_iommu_domain = intel_iommu_domain_alloc(pdev); + if (!kvm->arch.intel_iommu_domain) + return -ENODEV; + + r = kvm_iommu_map_memslots(kvm); + if (r) + goto out_unmap; + + intel_iommu_detach_dev(kvm->arch.intel_iommu_domain, + pdev->bus->number, pdev->devfn); + + r = intel_iommu_context_mapping(kvm->arch.intel_iommu_domain, + pdev); + if (r) { + printk(KERN_ERR "Domain context map for %s failed", + pci_name(pdev)); + goto out_unmap; + } + return 0; + +out_unmap: + kvm_iommu_unmap_memslots(kvm); + return r; +} + +static void kvm_iommu_put_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages) +{ + gfn_t gfn = base_gfn; + pfn_t pfn; + struct dmar_domain *domain = kvm->arch.intel_iommu_domain; + int i; + + for (i = 0; i < npages; i++) { + pfn = (pfn_t)intel_iommu_iova_to_pfn(domain, + gfn_to_gpa(gfn)); + kvm_release_pfn_clean(pfn); + gfn++; + } +} + +static int kvm_iommu_unmap_memslots(struct kvm *kvm) +{ + int i; + down_read(&kvm->slots_lock); + for (i = 0; i < kvm->nmemslots; i++) { + kvm_iommu_put_pages(kvm, kvm->memslots[i].base_gfn, + kvm->memslots[i].npages); + } + up_read(&kvm->slots_lock); + + return 0; +} + +int kvm_iommu_unmap_guest(struct kvm *kvm) +{ + struct kvm_assigned_dev_kernel *entry; + struct dmar_domain *domain = kvm->arch.intel_iommu_domain; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + list_for_each_entry(entry, &kvm->arch.assigned_dev_head, list) { + printk(KERN_DEBUG "VT-d unmap: host bdf = %x:%x:%x\n", + entry->host_busnr, + PCI_SLOT(entry->host_devfn), + PCI_FUNC(entry->host_devfn)); + + /* detach kvm dmar domain */ + intel_iommu_detach_dev(domain, entry->host_busnr, + entry->host_devfn); + } + kvm_iommu_unmap_memslots(kvm); + intel_iommu_domain_exit(domain); + return 0; +} |