From e13dcc1ab593452b99c39de2cb588c39f6d6a7e9 Mon Sep 17 00:00:00 2001 From: Stuart Yoder Date: Tue, 3 Jul 2012 05:48:49 +0000 Subject: PPC: epapr: create define for return code value of success Signed-off-by: Stuart Yoder Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h index bf2c06c..c0c7adc 100644 --- a/arch/powerpc/include/asm/epapr_hcalls.h +++ b/arch/powerpc/include/asm/epapr_hcalls.h @@ -88,7 +88,8 @@ #define _EV_HCALL_TOKEN(id, num) (((id) << 16) | (num)) #define EV_HCALL_TOKEN(hcall_num) _EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, hcall_num) -/* epapr error codes */ +/* epapr return codes */ +#define EV_SUCCESS 0 #define EV_EPERM 1 /* Operation not permitted */ #define EV_ENOENT 2 /* Entry Not Found */ #define EV_EIO 3 /* I/O error occured */ -- cgit v0.10.2 From fdcf8bd7e711d4c0fe3ef624cfb5e3808149ff7f Mon Sep 17 00:00:00 2001 From: Stuart Yoder Date: Tue, 3 Jul 2012 05:48:50 +0000 Subject: KVM: PPC: use definitions in epapr header for hcalls Signed-off-by: Stuart Yoder Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h index c18916b..a168ce3 100644 --- a/arch/powerpc/include/asm/kvm_para.h +++ b/arch/powerpc/include/asm/kvm_para.h @@ -75,9 +75,10 @@ struct kvm_vcpu_arch_shared { }; #define KVM_SC_MAGIC_R0 0x4b564d21 /* "KVM!" */ -#define HC_VENDOR_KVM (42 << 16) -#define HC_EV_SUCCESS 0 -#define HC_EV_UNIMPLEMENTED 12 + +#define KVM_HCALL_TOKEN(num) _EV_HCALL_TOKEN(EV_KVM_VENDOR_ID, num) + +#include #define KVM_FEATURE_MAGIC_PAGE 1 @@ -121,7 +122,7 @@ static unsigned long kvm_hypercall(unsigned long *in, unsigned long *out, unsigned long nr) { - return HC_EV_UNIMPLEMENTED; + return EV_UNIMPLEMENTED; } #endif @@ -132,7 +133,7 @@ static inline long kvm_hypercall0_1(unsigned int nr, unsigned long *r2) unsigned long out[8]; unsigned long r; - r = kvm_hypercall(in, out, nr | HC_VENDOR_KVM); + r = kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr)); *r2 = out[0]; return r; @@ -143,7 +144,7 @@ static inline long kvm_hypercall0(unsigned int nr) unsigned long in[8]; unsigned long out[8]; - return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); + return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr)); } static inline long kvm_hypercall1(unsigned int nr, unsigned long p1) @@ -152,7 +153,7 @@ static inline long kvm_hypercall1(unsigned int nr, unsigned long p1) unsigned long out[8]; in[0] = p1; - return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); + return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr)); } static inline long kvm_hypercall2(unsigned int nr, unsigned long p1, @@ -163,7 +164,7 @@ static inline long kvm_hypercall2(unsigned int nr, unsigned long p1, in[0] = p1; in[1] = p2; - return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); + return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr)); } static inline long kvm_hypercall3(unsigned int nr, unsigned long p1, @@ -175,7 +176,7 @@ static inline long kvm_hypercall3(unsigned int nr, unsigned long p1, in[0] = p1; in[1] = p2; in[2] = p3; - return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); + return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr)); } static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, @@ -189,7 +190,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, in[1] = p2; in[2] = p3; in[3] = p4; - return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); + return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr)); } diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 867db1d..a61b133 100644 
--- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -419,7 +419,7 @@ static void kvm_map_magic_page(void *data) in[0] = KVM_MAGIC_PAGE; in[1] = KVM_MAGIC_PAGE; - kvm_hypercall(in, out, HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE); + kvm_hypercall(in, out, KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE)); *features = out[0]; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 4d213b8..0368a93 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -67,18 +67,18 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) } switch (nr) { - case HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE: + case KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE): { vcpu->arch.magic_page_pa = param1; vcpu->arch.magic_page_ea = param2; r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7; - r = HC_EV_SUCCESS; + r = EV_SUCCESS; break; } - case HC_VENDOR_KVM | KVM_HC_FEATURES: - r = HC_EV_SUCCESS; + case KVM_HCALL_TOKEN(KVM_HC_FEATURES): + r = EV_SUCCESS; #if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500V2) /* XXX Missing magic page on 44x */ r2 |= (1 << KVM_FEATURE_MAGIC_PAGE); @@ -87,7 +87,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) /* Second return value is in r4 */ break; default: - r = HC_EV_UNIMPLEMENTED; + r = EV_UNIMPLEMENTED; break; } -- cgit v0.10.2 From 784bafac79e7646e56f40998a6dde0e1ed5595f8 Mon Sep 17 00:00:00 2001 From: Stuart Yoder Date: Tue, 3 Jul 2012 05:48:51 +0000 Subject: KVM: PPC: add pvinfo for hcall opcodes on e500mc/e5500 Signed-off-by: Liu Yu [stuart: factored this out from idle hcall support in host patch] Signed-off-by: Stuart Yoder Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 0368a93..a478e66 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -751,9 +751,16 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) { + u32 inst_nop = 0x60000000; +#ifdef CONFIG_KVM_BOOKE_HV + u32 inst_sc1 = 0x44000022; + pvinfo->hcall[0] = inst_sc1; + pvinfo->hcall[1] = inst_nop; + pvinfo->hcall[2] = inst_nop; + pvinfo->hcall[3] = inst_nop; +#else u32 inst_lis = 0x3c000000; u32 inst_ori = 0x60000000; - u32 inst_nop = 0x60000000; u32 inst_sc = 0x44000002; u32 inst_imm_mask = 0xffff; @@ -770,6 +777,7 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) pvinfo->hcall[1] = inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask); pvinfo->hcall[2] = inst_sc; pvinfo->hcall[3] = inst_nop; +#endif return 0; } -- cgit v0.10.2 From 9202e07636f0c4858ba6c30773a3f160b2b5659a Mon Sep 17 00:00:00 2001 From: Liu Yu-B13201 Date: Tue, 3 Jul 2012 05:48:52 +0000 Subject: KVM: PPC: Add support for ePAPR idle hcall in host kernel And add a new flag definition in kvm_ppc_pvinfo to indicate whether the host supports the EV_IDLE hcall. Signed-off-by: Liu Yu [stuart.yoder@freescale.com: cleanup,fixes for conditions allowing idle] Signed-off-by: Stuart Yoder [agraf: fix typo] Signed-off-by: Alexander Graf diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index f6ec3a9..170afb1 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1194,12 +1194,15 @@ struct kvm_ppc_pvinfo { This ioctl fetches PV specific information that need to be passed to the guest using the device tree or other means from vm context. -For now the only implemented piece of information distributed here is an array -of 4 instructions that make up a hypercall. 
+The hcall array defines 4 instructions that make up a hypercall. If any additional field gets added to this structure later on, a bit for that additional piece of information will be set in the flags bitmap. +The flags bitmap is defined as: + + /* the host supports the ePAPR idle hcall + #define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) 4.48 KVM_ASSIGN_PCI_DEVICE diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index 7e313f1..13d6b7bf 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -34,5 +34,6 @@ header-y += termios.h header-y += types.h header-y += ucontext.h header-y += unistd.h +header-y += epapr_hcalls.h generic-y += rwsem.h diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index a478e66..dbf56e1 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -38,8 +38,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { - return !(v->arch.shared->msr & MSR_WE) || - !!(v->arch.pending_exceptions) || + return !!(v->arch.pending_exceptions) || v->requests; } @@ -86,6 +85,11 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) /* Second return value is in r4 */ break; + case EV_HCALL_TOKEN(EV_IDLE): + r = EV_SUCCESS; + kvm_vcpu_block(vcpu); + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + break; default: r = EV_UNIMPLEMENTED; break; @@ -779,6 +783,8 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) pvinfo->hcall[3] = inst_nop; #endif + pvinfo->flags = KVM_PPC_PVINFO_FLAGS_EV_IDLE; + return 0; } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 0a6d6ba..4cb3761 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -477,6 +477,8 @@ struct kvm_ppc_smmu_info { struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; }; +#define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) + #define KVMIO 0xAE /* machine type bits, to be used as argument to KVM_CREATE_VM */ -- cgit v0.10.2 From 2f979de8a716bdbdc9f4db532652fbca08ed710c Mon Sep 17 00:00:00 2001 From: Liu Yu-B13201 Date: Tue, 3 Jul 2012 05:48:53 +0000 Subject: KVM: PPC: ev_idle hcall support for e500 guests Signed-off-by: Liu Yu [varun: 64-bit changes] Signed-off-by: Varun Sethi Signed-off-by: Stuart Yoder Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h index c0c7adc..833ce2c 100644 --- a/arch/powerpc/include/asm/epapr_hcalls.h +++ b/arch/powerpc/include/asm/epapr_hcalls.h @@ -50,10 +50,6 @@ #ifndef _EPAPR_HCALLS_H #define _EPAPR_HCALLS_H -#include -#include -#include - #define EV_BYTE_CHANNEL_SEND 1 #define EV_BYTE_CHANNEL_RECEIVE 2 #define EV_BYTE_CHANNEL_POLL 3 @@ -109,6 +105,11 @@ #define EV_UNIMPLEMENTED 12 /* Unimplemented hypercall */ #define EV_BUFFER_OVERFLOW 13 /* Caller-supplied buffer too small */ +#ifndef __ASSEMBLY__ +#include +#include +#include + /* * Hypercall register clobber list * @@ -506,5 +507,5 @@ static inline unsigned int ev_idle(void) return r3; } - +#endif /* !__ASSEMBLY__ */ #endif diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S index 697b390..62c0dc2 100644 --- a/arch/powerpc/kernel/epapr_hcalls.S +++ b/arch/powerpc/kernel/epapr_hcalls.S @@ -8,13 +8,41 @@ */ #include +#include #include #include #include #include #include +#include #include +/* epapr_ev_idle() was derived from e500_idle() */ +_GLOBAL(epapr_ev_idle) + CURRENT_THREAD_INFO(r3, r1) + PPC_LL r4, TI_LOCAL_FLAGS(r3) /* set napping bit */ + ori r4, r4,_TLF_NAPPING /* so when we take an exception */ + PPC_STL r4, 
TI_LOCAL_FLAGS(r3) /* it will return to our caller */ + + wrteei 1 + +idle_loop: + LOAD_REG_IMMEDIATE(r11, EV_HCALL_TOKEN(EV_IDLE)) + +.global epapr_ev_idle_start +epapr_ev_idle_start: + li r3, -1 + nop + nop + nop + + /* + * Guard against spurious wakeups from a hypervisor -- + * only interrupt will cause us to return to LR due to + * _TLF_NAPPING. + */ + b idle_loop + /* Hypercall entry point. Will be patched with device tree instructions. */ .global epapr_hypercall_start epapr_hypercall_start: diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c index 028aeae..f3eab85 100644 --- a/arch/powerpc/kernel/epapr_paravirt.c +++ b/arch/powerpc/kernel/epapr_paravirt.c @@ -21,6 +21,10 @@ #include #include #include +#include + +extern void epapr_ev_idle(void); +extern u32 epapr_ev_idle_start[]; bool epapr_paravirt_enabled; @@ -41,8 +45,13 @@ static int __init epapr_paravirt_init(void) if (len % 4 || len > (4 * 4)) return -ENODEV; - for (i = 0; i < (len / 4); i++) + for (i = 0; i < (len / 4); i++) { patch_instruction(epapr_hypercall_start + i, insts[i]); + patch_instruction(epapr_ev_idle_start + i, insts[i]); + } + + if (of_get_property(hyper_node, "has-idle", NULL)) + ppc_md.power_save = epapr_ev_idle; epapr_paravirt_enabled = true; -- cgit v0.10.2 From 40656397241860bb21f2802af17ac1de607fb7a9 Mon Sep 17 00:00:00 2001 From: Stuart Yoder Date: Tue, 3 Jul 2012 05:48:54 +0000 Subject: PPC: select EPAPR_PARAVIRT for all users of epapr hcalls Signed-off-by: Stuart Yoder Signed-off-by: Alexander Graf diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index e7a896a..48a920d 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -90,6 +90,7 @@ config MPIC config PPC_EPAPR_HV_PIC bool default n + select EPAPR_PARAVIRT config MPIC_WEIRD bool diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig index 830cd62..aa99cd2 100644 --- a/drivers/tty/Kconfig +++ b/drivers/tty/Kconfig @@ -358,6 +358,7 @@ config TRACE_SINK config PPC_EPAPR_HV_BYTECHAN tristate "ePAPR hypervisor byte channel driver" depends on PPC + select EPAPR_PARAVIRT help This driver creates /dev entries for each ePAPR hypervisor byte channel, thereby allowing applications to communicate with byte diff --git a/drivers/virt/Kconfig b/drivers/virt/Kconfig index 2dcdbc9..99ebdde 100644 --- a/drivers/virt/Kconfig +++ b/drivers/virt/Kconfig @@ -15,6 +15,7 @@ if VIRT_DRIVERS config FSL_HV_MANAGER tristate "Freescale hypervisor management driver" depends on FSL_SOC + select EPAPR_PARAVIRT help The Freescale hypervisor management driver provides several services to drivers and applications related to the Freescale hypervisor: -- cgit v0.10.2 From 305bcf26128e380bb1296d2802387659ab8b038e Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Tue, 3 Jul 2012 05:48:55 +0000 Subject: powerpc/fsl-soc: use CONFIG_EPAPR_PARAVIRT for hcalls Signed-off-by: Scott Wood Signed-off-by: Stuart Yoder Signed-off-by: Alexander Graf diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c index 6e097de..7e2b2f2 100644 --- a/arch/powerpc/sysdev/fsl_msi.c +++ b/arch/powerpc/sysdev/fsl_msi.c @@ -236,7 +236,6 @@ static void fsl_msi_cascade(unsigned int irq, struct irq_desc *desc) u32 intr_index; u32 have_shift = 0; struct fsl_msi_cascade_data *cascade_data; - unsigned int ret; cascade_data = irq_get_handler_data(irq); msi_data = cascade_data->msi_data; @@ -268,7 +267,9 @@ static void fsl_msi_cascade(unsigned int irq, struct irq_desc *desc) case FSL_PIC_IP_IPIC: msir_value = 
fsl_msi_read(msi_data->msi_regs, msir_index * 0x4); break; - case FSL_PIC_IP_VMPIC: +#ifdef CONFIG_EPAPR_PARAVIRT + case FSL_PIC_IP_VMPIC: { + unsigned int ret; ret = fh_vmpic_get_msir(virq_to_hw(irq), &msir_value); if (ret) { pr_err("fsl-msi: fh_vmpic_get_msir() failed for " @@ -277,6 +278,8 @@ static void fsl_msi_cascade(unsigned int irq, struct irq_desc *desc) } break; } +#endif + } while (msir_value) { intr_index = ffs(msir_value) - 1; @@ -508,10 +511,12 @@ static const struct of_device_id fsl_of_msi_ids[] = { .compatible = "fsl,ipic-msi", .data = (void *)&ipic_msi_feature, }, +#ifdef CONFIG_EPAPR_PARAVIRT { .compatible = "fsl,vmpic-msi", .data = (void *)&vmpic_msi_feature, }, +#endif {} }; diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c index c449dbd..97118dc 100644 --- a/arch/powerpc/sysdev/fsl_soc.c +++ b/arch/powerpc/sysdev/fsl_soc.c @@ -253,6 +253,7 @@ struct platform_diu_data_ops diu_ops; EXPORT_SYMBOL(diu_ops); #endif +#ifdef CONFIG_EPAPR_PARAVIRT /* * Restart the current partition * @@ -278,3 +279,4 @@ void fsl_hv_halt(void) pr_info("hv exit\n"); fh_partition_stop(-1); } +#endif -- cgit v0.10.2 From 8e525d59d024f54b88a038faac38f76b9094774e Mon Sep 17 00:00:00 2001 From: Liu Yu-B13201 Date: Tue, 3 Jul 2012 05:48:56 +0000 Subject: PPC: Don't use hardcoded opcode for ePAPR hcall invocation Signed-off-by: Liu Yu Signed-off-by: Stuart Yoder Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h index 833ce2c..b8d9445 100644 --- a/arch/powerpc/include/asm/epapr_hcalls.h +++ b/arch/powerpc/include/asm/epapr_hcalls.h @@ -195,7 +195,7 @@ static inline unsigned int ev_int_set_config(unsigned int interrupt, r5 = priority; r6 = destination; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6) : : EV_HCALL_CLOBBERS4 ); @@ -224,7 +224,7 @@ static inline unsigned int ev_int_get_config(unsigned int interrupt, r11 = EV_HCALL_TOKEN(EV_INT_GET_CONFIG); r3 = interrupt; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5), "=r" (r6) : : EV_HCALL_CLOBBERS4 ); @@ -254,7 +254,7 @@ static inline unsigned int ev_int_set_mask(unsigned int interrupt, r3 = interrupt; r4 = mask; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3), "+r" (r4) : : EV_HCALL_CLOBBERS2 ); @@ -279,7 +279,7 @@ static inline unsigned int ev_int_get_mask(unsigned int interrupt, r11 = EV_HCALL_TOKEN(EV_INT_GET_MASK); r3 = interrupt; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3), "=r" (r4) : : EV_HCALL_CLOBBERS2 ); @@ -307,7 +307,7 @@ static inline unsigned int ev_int_eoi(unsigned int interrupt) r11 = EV_HCALL_TOKEN(EV_INT_EOI); r3 = interrupt; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3) : : EV_HCALL_CLOBBERS1 ); @@ -346,7 +346,7 @@ static inline unsigned int ev_byte_channel_send(unsigned int handle, r7 = be32_to_cpu(p[2]); r8 = be32_to_cpu(p[3]); - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7), "+r" (r8) : : EV_HCALL_CLOBBERS6 @@ -385,7 +385,7 @@ static inline unsigned int ev_byte_channel_receive(unsigned int handle, r3 = handle; r4 = *count; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3), "+r" (r4), "=r" (r5), 
"=r" (r6), "=r" (r7), "=r" (r8) : : EV_HCALL_CLOBBERS6 @@ -423,7 +423,7 @@ static inline unsigned int ev_byte_channel_poll(unsigned int handle, r11 = EV_HCALL_TOKEN(EV_BYTE_CHANNEL_POLL); r3 = handle; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5) : : EV_HCALL_CLOBBERS3 ); @@ -456,7 +456,7 @@ static inline unsigned int ev_int_iack(unsigned int handle, r11 = EV_HCALL_TOKEN(EV_INT_IACK); r3 = handle; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3), "=r" (r4) : : EV_HCALL_CLOBBERS2 ); @@ -480,7 +480,7 @@ static inline unsigned int ev_doorbell_send(unsigned int handle) r11 = EV_HCALL_TOKEN(EV_DOORBELL_SEND); r3 = handle; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3) : : EV_HCALL_CLOBBERS1 ); @@ -500,7 +500,7 @@ static inline unsigned int ev_idle(void) r11 = EV_HCALL_TOKEN(EV_IDLE); - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "=r" (r3) : : EV_HCALL_CLOBBERS1 ); diff --git a/arch/powerpc/include/asm/fsl_hcalls.h b/arch/powerpc/include/asm/fsl_hcalls.h index 922d9b5..3abb583 100644 --- a/arch/powerpc/include/asm/fsl_hcalls.h +++ b/arch/powerpc/include/asm/fsl_hcalls.h @@ -96,7 +96,7 @@ static inline unsigned int fh_send_nmi(unsigned int vcpu_mask) r11 = FH_HCALL_TOKEN(FH_SEND_NMI); r3 = vcpu_mask; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3) : : EV_HCALL_CLOBBERS1 ); @@ -151,7 +151,7 @@ static inline unsigned int fh_partition_get_dtprop(int handle, r9 = (uint32_t)propvalue_addr; r10 = *propvalue_len; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7), "+r" (r8), "+r" (r9), "+r" (r10) @@ -205,7 +205,7 @@ static inline unsigned int fh_partition_set_dtprop(int handle, r9 = (uint32_t)propvalue_addr; r10 = propvalue_len; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7), "+r" (r8), "+r" (r9), "+r" (r10) @@ -229,7 +229,7 @@ static inline unsigned int fh_partition_restart(unsigned int partition) r11 = FH_HCALL_TOKEN(FH_PARTITION_RESTART); r3 = partition; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3) : : EV_HCALL_CLOBBERS1 ); @@ -262,7 +262,7 @@ static inline unsigned int fh_partition_get_status(unsigned int partition, r11 = FH_HCALL_TOKEN(FH_PARTITION_GET_STATUS); r3 = partition; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3), "=r" (r4) : : EV_HCALL_CLOBBERS2 ); @@ -295,7 +295,7 @@ static inline unsigned int fh_partition_start(unsigned int partition, r4 = entry_point; r5 = load; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5) : : EV_HCALL_CLOBBERS3 ); @@ -317,7 +317,7 @@ static inline unsigned int fh_partition_stop(unsigned int partition) r11 = FH_HCALL_TOKEN(FH_PARTITION_STOP); r3 = partition; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3) : : EV_HCALL_CLOBBERS1 ); @@ -376,7 +376,7 @@ static inline unsigned int fh_partition_memcpy(unsigned int source, #endif r7 = count; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7) : : EV_HCALL_CLOBBERS5 @@ -399,7 
+399,7 @@ static inline unsigned int fh_dma_enable(unsigned int liodn) r11 = FH_HCALL_TOKEN(FH_DMA_ENABLE); r3 = liodn; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3) : : EV_HCALL_CLOBBERS1 ); @@ -421,7 +421,7 @@ static inline unsigned int fh_dma_disable(unsigned int liodn) r11 = FH_HCALL_TOKEN(FH_DMA_DISABLE); r3 = liodn; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3) : : EV_HCALL_CLOBBERS1 ); @@ -447,7 +447,7 @@ static inline unsigned int fh_vmpic_get_msir(unsigned int interrupt, r11 = FH_HCALL_TOKEN(FH_VMPIC_GET_MSIR); r3 = interrupt; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3), "=r" (r4) : : EV_HCALL_CLOBBERS2 ); @@ -469,7 +469,7 @@ static inline unsigned int fh_system_reset(void) r11 = FH_HCALL_TOKEN(FH_SYSTEM_RESET); - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "=r" (r3) : : EV_HCALL_CLOBBERS1 ); @@ -506,7 +506,7 @@ static inline unsigned int fh_err_get_info(int queue, uint32_t *bufsize, r6 = addr_lo; r7 = peek; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7) : : EV_HCALL_CLOBBERS5 @@ -542,7 +542,7 @@ static inline unsigned int fh_get_core_state(unsigned int handle, r3 = handle; r4 = vcpu; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3), "+r" (r4) : : EV_HCALL_CLOBBERS2 ); @@ -572,7 +572,7 @@ static inline unsigned int fh_enter_nap(unsigned int handle, unsigned int vcpu) r3 = handle; r4 = vcpu; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3), "+r" (r4) : : EV_HCALL_CLOBBERS2 ); @@ -597,7 +597,7 @@ static inline unsigned int fh_exit_nap(unsigned int handle, unsigned int vcpu) r3 = handle; r4 = vcpu; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3), "+r" (r4) : : EV_HCALL_CLOBBERS2 ); @@ -618,7 +618,7 @@ static inline unsigned int fh_claim_device(unsigned int handle) r11 = FH_HCALL_TOKEN(FH_CLAIM_DEVICE); r3 = handle; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3) : : EV_HCALL_CLOBBERS1 ); @@ -645,7 +645,7 @@ static inline unsigned int fh_partition_stop_dma(unsigned int handle) r11 = FH_HCALL_TOKEN(FH_PARTITION_STOP_DMA); r3 = handle; - __asm__ __volatile__ ("sc 1" + asm volatile("bl epapr_hypercall_start" : "+r" (r11), "+r" (r3) : : EV_HCALL_CLOBBERS1 ); -- cgit v0.10.2 From 97c95059848358f1577f471ec47cf68690f996e4 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Thu, 2 Aug 2012 15:10:00 +0200 Subject: KVM: PPC: PR: Use generic tracepoint for guest exit We want to have tracing information on guest exits for booke as well as book3s. Since most information is identical, use a common trace point. 
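A rough sketch of the tracepoint-sharing pattern this relies on (illustrative only: trace_kvm_exit() and the trace.h/booke.c/book3s_pr.c names come from the diff below, while the instantiation site and the wrapper function are assumptions):

    /* one translation unit in the directory (assumed: powerpc.c) emits the
     * event bodies exactly once */
    #define CREATE_TRACE_POINTS
    #include "trace.h"

    /* every other user -- booke.c and book3s_pr.c after this patch -- only
     * includes the header and calls the generated helper */
    #include "trace.h"

    static void example_report_exit(unsigned int exit_nr, struct kvm_vcpu *vcpu)
    {
            trace_kvm_exit(exit_nr, vcpu);  /* same event from both cores */
    }
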
Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 05c28f5..7f0fe6f 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -549,7 +549,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, /* We get here with MSR.EE=0, so enable it to be a nice citizen */ __hard_irq_enable(); - trace_kvm_book3s_exit(exit_nr, vcpu); + trace_kvm_exit(exit_nr, vcpu); preempt_enable(); kvm_resched(vcpu); switch (exit_nr) { diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index d25a097..7ce2ed0 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -39,6 +39,7 @@ #include "timing.h" #include "booke.h" +#include "trace.h" unsigned long kvmppc_booke_handlers; @@ -677,6 +678,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, local_irq_enable(); + trace_kvm_exit(exit_nr, vcpu); + run->exit_reason = KVM_EXIT_UNKNOWN; run->ready_for_interrupt_injection = 1; diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h index 877186b..9fab6ed 100644 --- a/arch/powerpc/kvm/trace.h +++ b/arch/powerpc/kvm/trace.h @@ -31,6 +31,57 @@ TRACE_EVENT(kvm_ppc_instr, __entry->inst, __entry->pc, __entry->emulate) ); +TRACE_EVENT(kvm_exit, + TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), + TP_ARGS(exit_nr, vcpu), + + TP_STRUCT__entry( + __field( unsigned int, exit_nr ) + __field( unsigned long, pc ) + __field( unsigned long, msr ) + __field( unsigned long, dar ) +#ifdef CONFIG_KVM_BOOK3S_PR + __field( unsigned long, srr1 ) +#endif + __field( unsigned long, last_inst ) + ), + + TP_fast_assign( +#ifdef CONFIG_KVM_BOOK3S_PR + struct kvmppc_book3s_shadow_vcpu *svcpu; +#endif + __entry->exit_nr = exit_nr; + __entry->pc = kvmppc_get_pc(vcpu); + __entry->dar = kvmppc_get_fault_dar(vcpu); + __entry->msr = vcpu->arch.shared->msr; +#ifdef CONFIG_KVM_BOOK3S_PR + svcpu = svcpu_get(vcpu); + __entry->srr1 = svcpu->shadow_srr1; + svcpu_put(svcpu); +#endif + __entry->last_inst = vcpu->arch.last_inst; + ), + + TP_printk("exit=0x%x" + " | pc=0x%lx" + " | msr=0x%lx" + " | dar=0x%lx" +#ifdef CONFIG_KVM_BOOK3S_PR + " | srr1=0x%lx" +#endif + " | last_inst=0x%lx" + , + __entry->exit_nr, + __entry->pc, + __entry->msr, + __entry->dar, +#ifdef CONFIG_KVM_BOOK3S_PR + __entry->srr1, +#endif + __entry->last_inst + ) +); + TRACE_EVENT(kvm_stlb_inval, TP_PROTO(unsigned int stlb_index), TP_ARGS(stlb_index), @@ -105,34 +156,6 @@ TRACE_EVENT(kvm_gtlb_write, #ifdef CONFIG_KVM_BOOK3S_PR -TRACE_EVENT(kvm_book3s_exit, - TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), - TP_ARGS(exit_nr, vcpu), - - TP_STRUCT__entry( - __field( unsigned int, exit_nr ) - __field( unsigned long, pc ) - __field( unsigned long, msr ) - __field( unsigned long, dar ) - __field( unsigned long, srr1 ) - ), - - TP_fast_assign( - struct kvmppc_book3s_shadow_vcpu *svcpu; - __entry->exit_nr = exit_nr; - __entry->pc = kvmppc_get_pc(vcpu); - __entry->dar = kvmppc_get_fault_dar(vcpu); - __entry->msr = vcpu->arch.shared->msr; - svcpu = svcpu_get(vcpu); - __entry->srr1 = svcpu->shadow_srr1; - svcpu_put(svcpu); - ), - - TP_printk("exit=0x%x | pc=0x%lx | msr=0x%lx | dar=0x%lx | srr1=0x%lx", - __entry->exit_nr, __entry->pc, __entry->msr, __entry->dar, - __entry->srr1) -); - TRACE_EVENT(kvm_book3s_reenter, TP_PROTO(int r, struct kvm_vcpu *vcpu), TP_ARGS(r, vcpu), -- cgit v0.10.2 From f4800b1f4d23156e9080a08d6114e5d8bb767964 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 7 Aug 2012 10:24:14 +0200 Subject: KVM: PPC: Expose SYNC cap 
based on mmu notifiers Semantically, the "SYNC" cap means that we have mmu notifiers available. Express this in our #ifdef'ery around the feature, so that we can be sure we don't miss out on ppc targets when they get their implementation. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index dbf56e1..45fe433 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -264,10 +264,16 @@ int kvm_dev_ioctl_check_extension(long ext) if (cpu_has_feature(CPU_FTR_ARCH_201)) r = 2; break; +#endif case KVM_CAP_SYNC_MMU: +#ifdef CONFIG_KVM_BOOK3S_64_HV r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0; - break; +#elif defined(KVM_ARCH_WANT_MMU_NOTIFIER) + r = 1; +#else + r = 0; #endif + break; case KVM_CAP_NR_VCPUS: /* * Recommending a number of CPUs is somewhat arbitrary; we -- cgit v0.10.2 From cf1c5ca47319d9eb49166859921822fea354d4b3 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Wed, 1 Aug 2012 12:56:51 +0200 Subject: KVM: PPC: BookE: Expose remote TLB flushes in debugfs We're already counting remote TLB flushes in a variable, but don't export it to user space yet. Do so, so we know what's going on. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 7ce2ed0..1d4ce9a 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -63,6 +63,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "doorbell", VCPU_STAT(dbell_exits) }, { "guest doorbell", VCPU_STAT(gdbell_exits) }, + { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { NULL } }; -- cgit v0.10.2 From 2bb890f5ee79c85b9d3b7df37ecb639d8d4b961e Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Thu, 2 Aug 2012 13:38:49 +0200 Subject: KVM: PPC: E500: Fix clear_tlb_refs Our mapping code assumes that TLB0 entries are always mapped. However, after calling clear_tlb_refs() this is no longer the case. Map them dynamically if we find an entry unmapped in TLB0. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index ff38b66..b56b6e1 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -1039,8 +1039,12 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, sesel = 0; /* unused */ priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; - kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K, - &priv->ref, eaddr, &stlbe); + /* Only triggers after clear_tlb_refs */ + if (unlikely(!(priv->ref.flags & E500_TLB_VALID))) + kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe); + else + kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K, + &priv->ref, eaddr, &stlbe); break; case 1: { -- cgit v0.10.2 From 1340f3e8871b9f35b39c33d0140383c6c6c1f005 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 6 Aug 2012 00:04:14 +0000 Subject: KVM: PPC: Quieten message about allocating linear regions This is printed once for every RMA or HPT region that get preallocated. If one preallocates hundreds of such regions (in order to run hundreds of KVM guests), that gets rather painful, so make it a bit quieter. 
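One practical note, hedged since it depends on kernel configuration: pr_debug() is compiled out unless the file defines DEBUG or CONFIG_DYNAMIC_DEBUG is enabled, so the message is not lost, merely off by default. A minimal sketch of how to get it back:

    /*
     * Option 1 (build time, assumed convention): define DEBUG at the top of
     * the file, before the printk headers, and pr_debug() prints again.
     */
    #define DEBUG
    #include <linux/kernel.h>

    /*
     * Option 2 (run time, with CONFIG_DYNAMIC_DEBUG): enable the call site
     * via debugfs, e.g.
     *   echo 'file book3s_hv_builtin.c +p' > /sys/kernel/debug/dynamic_debug/control
     */
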
Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index fb4eac2..ec0a9e5 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -157,8 +157,8 @@ static void __init kvm_linear_init_one(ulong size, int count, int type) linear_info = alloc_bootmem(count * sizeof(struct kvmppc_linear_info)); for (i = 0; i < count; ++i) { linear = alloc_bootmem_align(size, size); - pr_info("Allocated KVM %s at %p (%ld MB)\n", typestr, linear, - size >> 20); + pr_debug("Allocated KVM %s at %p (%ld MB)\n", typestr, linear, + size >> 20); linear_info[i].base_virt = linear; linear_info[i].base_pfn = __pa(linear) >> PAGE_SHIFT; linear_info[i].npages = npages; -- cgit v0.10.2 From 8043e494da644ec174f7df0b67f88ccf8777a1ce Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Fri, 10 Aug 2012 12:21:21 +0000 Subject: powerpc/epapr: export epapr_hypercall_start This fixes breakage introduced by the following commit: commit 6d2d82627f4f1e96a33664ace494fa363e0495cb Author: Liu Yu-B13201 Date: Tue Jul 3 05:48:56 2012 +0000 PPC: Don't use hardcoded opcode for ePAPR hcall invocation when a driver that uses ePAPR hypercalls is built as a module. Reported-by: Geert Uytterhoeven Signed-off-by: Scott Wood Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index 3e40315..e597dde 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -43,6 +43,7 @@ #include #include #include +#include #ifdef CONFIG_PPC32 extern void transfer_to_handler(void); @@ -192,3 +193,7 @@ EXPORT_SYMBOL(__arch_hweight64); #ifdef CONFIG_PPC_BOOK3S_64 EXPORT_SYMBOL_GPL(mmu_psize_defs); #endif + +#ifdef CONFIG_EPAPR_PARAVIRT +EXPORT_SYMBOL(epapr_hypercall_start); +#endif -- cgit v0.10.2 From 4ffc6356ec690f77f65b7b78e0047a3fe8316371 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Wed, 8 Aug 2012 20:31:13 +0200 Subject: KVM: PPC: BookE: Add check_requests helper function We need a central place to check for pending requests in. Add one that only does the timer check we already do in a different place. Later, this central function can be extended by more checks. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 1d4ce9a..bcf87fe 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -419,13 +419,6 @@ static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) unsigned long *pending = &vcpu->arch.pending_exceptions; unsigned int priority; - if (vcpu->requests) { - if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) { - smp_mb(); - update_timer_ints(vcpu); - } - } - priority = __ffs(*pending); while (priority < BOOKE_IRQPRIO_MAX) { if (kvmppc_booke_irqprio_deliver(vcpu, priority)) @@ -461,6 +454,14 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) return r; } +static void kvmppc_check_requests(struct kvm_vcpu *vcpu) +{ + if (vcpu->requests) { + if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) + update_timer_ints(vcpu); + } +} + /* * Common checks before entering the guest world. Call with interrupts * disabled. 
@@ -485,6 +486,15 @@ static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) break; } + smp_mb(); + if (vcpu->requests) { + /* Make sure we process requests preemptable */ + local_irq_enable(); + kvmppc_check_requests(vcpu); + local_irq_disable(); + continue; + } + if (kvmppc_core_prepare_to_enter(vcpu)) { /* interrupts got enabled in between, so we are back at square 1 */ -- cgit v0.10.2 From d69c6436443c05a64452054f51a79316297755f4 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Wed, 8 Aug 2012 20:44:20 +0200 Subject: KVM: PPC: BookE: Add support for vcpu->mode Generic KVM code might want to know whether we are inside guest context or outside. It also wants to be able to push us out of guest context. Add support to the BookE code for the generic vcpu->mode field that describes the above states. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index bcf87fe..70a86c0 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -501,6 +501,15 @@ static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) continue; } + if (vcpu->mode == EXITING_GUEST_MODE) { + r = 1; + break; + } + + /* Going into guest context! Yay! */ + vcpu->mode = IN_GUEST_MODE; + smp_wmb(); + break; } @@ -572,6 +581,8 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvm_guest_exit(); out: + vcpu->mode = OUTSIDE_GUEST_MODE; + smp_wmb(); local_irq_enable(); return ret; } -- cgit v0.10.2 From 862d31f788f9a249f7656d02d8d4006e306108ce Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 31 Jul 2012 00:19:50 +0200 Subject: KVM: PPC: E500: Implement MMU notifiers The e500 target has lived without mmu notifiers ever since it got introduced, but fails for the user space check on them with hugetlbfs. So in order to get that one working, implement mmu notifiers in a reasonably dumb fashion and be happy. On embedded hardware, we almost never end up with mmu notifier calls, since most people don't overcommit. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 28e8f5e..cea9d3a 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -46,7 +46,8 @@ #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #endif -#ifdef CONFIG_KVM_BOOK3S_64_HV +#if defined(CONFIG_KVM_BOOK3S_64_HV) || defined(CONFIG_KVM_E500V2) || \ + defined(CONFIG_KVM_E500MC) #include #define KVM_ARCH_WANT_MMU_NOTIFIER diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index e006f0b..88de314 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -104,6 +104,7 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); +extern void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu); extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int op, int *advance); diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index f4dacb9..40cad8c 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -123,6 +123,7 @@ config KVM_E500V2 depends on EXPERIMENTAL && E500 && !PPC_E500MC select KVM select KVM_MMIO + select MMU_NOTIFIER ---help--- Support running unmodified E500 guest kernels in virtual machines on E500v2 host processors. 
@@ -138,6 +139,7 @@ config KVM_E500MC select KVM select KVM_MMIO select KVM_BOOKE_HV + select MMU_NOTIFIER ---help--- Support running unmodified E500MC/E5500 (32-bit) guest kernels in virtual machines on E500MC/E5500 host processors. diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 70a86c0..52f6cbb 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -459,6 +459,10 @@ static void kvmppc_check_requests(struct kvm_vcpu *vcpu) if (vcpu->requests) { if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) update_timer_ints(vcpu); +#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + kvmppc_core_flush_tlb(vcpu); +#endif } } @@ -579,6 +583,8 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) #endif kvm_guest_exit(); + vcpu->mode = OUTSIDE_GUEST_MODE; + smp_wmb(); out: vcpu->mode = OUTSIDE_GUEST_MODE; diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index b56b6e1..de8ea29 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -303,18 +303,15 @@ static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, ref->pfn = pfn; ref->flags = E500_TLB_VALID; - if (tlbe_is_writable(gtlbe)) + if (tlbe_is_writable(gtlbe)) { ref->flags |= E500_TLB_DIRTY; + kvm_set_pfn_dirty(pfn); + } } static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) { if (ref->flags & E500_TLB_VALID) { - if (ref->flags & E500_TLB_DIRTY) - kvm_release_pfn_dirty(ref->pfn); - else - kvm_release_pfn_clean(ref->pfn); - ref->flags = 0; } } @@ -357,6 +354,13 @@ static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500) clear_tlb_privs(vcpu_e500); } +void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + clear_tlb_refs(vcpu_e500); + clear_tlb1_bitmap(vcpu_e500); +} + static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, unsigned int eaddr, int as) { @@ -541,6 +545,9 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, /* Clear i-cache for new pages */ kvmppc_mmu_flush_icache(pfn); + + /* Drop refcount on page, so that mmu notifiers can clear it */ + kvm_release_pfn_clean(pfn); } /* XXX only map the one-one case, for now use TLB0 */ @@ -1064,6 +1071,47 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel); } +/************* MMU Notifiers *************/ + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + /* + * Flush all shadow tlb entries everywhere. 
This is slow, but + * we are 100% sure that we catch the to be unmapped page + */ + kvm_flush_remote_tlbs(kvm); + + return 0; +} + +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +{ + /* kvm_unmap_hva flushes everything anyways */ + kvm_unmap_hva(kvm, start); + + return 0; +} + +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + /* XXX could be more clever ;) */ + return 0; +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + /* XXX could be more clever ;) */ + return 0; +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + /* The page will get remapped properly on its next fault */ + kvm_unmap_hva(kvm, hva); +} + +/*****************************************/ + static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500) { int i; -- cgit v0.10.2 From 6346046c3a69edc9149311473b940f3af7c93752 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Wed, 8 Aug 2012 00:44:52 +0200 Subject: KVM: PPC: BookE: Add some more trace points Without trace points, debugging what exactly is going on inside guest code can be very tricky. Add a few more trace points at places that hopefully tell us more when things go wrong. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 52f6cbb..00bcc57 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -143,6 +143,7 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int priority) { + trace_kvm_booke_queue_irqprio(vcpu, priority); set_bit(priority, &vcpu->arch.pending_exceptions); } @@ -457,6 +458,8 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) static void kvmppc_check_requests(struct kvm_vcpu *vcpu) { if (vcpu->requests) { + trace_kvm_check_requests(vcpu); + if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) update_timer_ints(vcpu); #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index de8ea29..1af6fab 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -312,6 +312,7 @@ static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) { if (ref->flags & E500_TLB_VALID) { + trace_kvm_booke206_ref_release(ref->pfn, ref->flags); ref->flags = 0; } } @@ -1075,6 +1076,8 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) { + trace_kvm_unmap_hva(hva); + /* * Flush all shadow tlb entries everywhere. 
This is slow, but * we are 100% sure that we catch the to be unmapped page diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h index 9fab6ed..cb2780a 100644 --- a/arch/powerpc/kvm/trace.h +++ b/arch/powerpc/kvm/trace.h @@ -82,6 +82,21 @@ TRACE_EVENT(kvm_exit, ) ); +TRACE_EVENT(kvm_unmap_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("unmap hva 0x%lx\n", __entry->hva) +); + TRACE_EVENT(kvm_stlb_inval, TP_PROTO(unsigned int stlb_index), TP_ARGS(stlb_index), @@ -149,6 +164,24 @@ TRACE_EVENT(kvm_gtlb_write, __entry->word1, __entry->word2) ); +TRACE_EVENT(kvm_check_requests, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), + + TP_STRUCT__entry( + __field( __u32, cpu_nr ) + __field( __u32, requests ) + ), + + TP_fast_assign( + __entry->cpu_nr = vcpu->vcpu_id; + __entry->requests = vcpu->requests; + ), + + TP_printk("vcpu=%x requests=%x", + __entry->cpu_nr, __entry->requests) +); + /************************************************************************* * Book3S trace points * @@ -418,6 +451,44 @@ TRACE_EVENT(kvm_booke206_gtlb_write, __entry->mas2, __entry->mas7_3) ); +TRACE_EVENT(kvm_booke206_ref_release, + TP_PROTO(__u64 pfn, __u32 flags), + TP_ARGS(pfn, flags), + + TP_STRUCT__entry( + __field( __u64, pfn ) + __field( __u32, flags ) + ), + + TP_fast_assign( + __entry->pfn = pfn; + __entry->flags = flags; + ), + + TP_printk("pfn=%llx flags=%x", + __entry->pfn, __entry->flags) +); + +TRACE_EVENT(kvm_booke_queue_irqprio, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority), + TP_ARGS(vcpu, priority), + + TP_STRUCT__entry( + __field( __u32, cpu_nr ) + __field( __u32, priority ) + __field( unsigned long, pending ) + ), + + TP_fast_assign( + __entry->cpu_nr = vcpu->vcpu_id; + __entry->priority = priority; + __entry->pending = vcpu->arch.pending_exceptions; + ), + + TP_printk("vcpu=%x prio=%x pending=%lx", + __entry->cpu_nr, __entry->priority, __entry->pending) +); + #endif #endif /* _TRACE_KVM_H */ -- cgit v0.10.2 From 2d8185d4ee22f425001d28d1817fc8d478e6fa02 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 10 Aug 2012 12:31:12 +0200 Subject: KVM: PPC: BookE: No duplicate request != 0 check We only call kvmppc_check_requests() when vcpu->requests != 0, so drop the redundant check in the function itself Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 00bcc57..683cbd6 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -457,16 +457,14 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) static void kvmppc_check_requests(struct kvm_vcpu *vcpu) { - if (vcpu->requests) { - trace_kvm_check_requests(vcpu); + trace_kvm_check_requests(vcpu); - if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) - update_timer_ints(vcpu); + if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) + update_timer_ints(vcpu); #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) - if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) - kvmppc_core_flush_tlb(vcpu); + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + kvmppc_core_flush_tlb(vcpu); #endif - } } /* -- cgit v0.10.2 From 03d25c5bd5c3125055bd36f4813ddb817def19dd Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 10 Aug 2012 12:28:50 +0200 Subject: KVM: PPC: Use same kvmppc_prepare_to_enter code for booke and book3s_pr We need to do the same things when preparing to enter a guest for booke and book3s_pr cores. 
Fold the generic code into a generic function that both call. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 88de314..59b7c87 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -112,6 +112,7 @@ extern int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong val); extern int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *val); +extern void kvmppc_core_check_requests(struct kvm_vcpu *vcpu); extern int kvmppc_booke_init(void); extern void kvmppc_booke_exit(void); @@ -150,6 +151,8 @@ extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, extern int kvmppc_bookehv_init(void); extern void kvmppc_bookehv_exit(void); +extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu); + /* * Cuts out inst bits with ordering according to spec. * That means the leftmost bit is zero. All given bits are included. diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 7f0fe6f..cae2def 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -88,6 +88,10 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) kvmppc_giveup_ext(vcpu, MSR_VSX); } +void kvmppc_core_check_requests(struct kvm_vcpu *vcpu) +{ +} + static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) { ulong smsr = vcpu->arch.shared->msr; @@ -815,19 +819,9 @@ program_interrupt: * again due to a host external interrupt. */ __hard_irq_disable(); - if (signal_pending(current)) { - __hard_irq_enable(); -#ifdef EXIT_DEBUG - printk(KERN_EMERG "KVM: Going back to host\n"); -#endif - vcpu->stat.signal_exits++; + if (kvmppc_prepare_to_enter(vcpu)) { run->exit_reason = KVM_EXIT_INTR; r = -EINTR; - } else { - /* In case an interrupt came in that was triggered - * from userspace (like DEC), we need to check what - * to inject now! */ - kvmppc_core_prepare_to_enter(vcpu); } } @@ -1029,8 +1023,6 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) goto out; } - kvmppc_core_prepare_to_enter(vcpu); - /* * Interrupts could be timers for the guest which we have to inject * again, so let's postpone them until we're in the guest and if we @@ -1038,9 +1030,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) * a host external interrupt. */ __hard_irq_disable(); - - /* No need to go into the guest when all we do is going out */ - if (signal_pending(current)) { + if (kvmppc_prepare_to_enter(vcpu)) { __hard_irq_enable(); kvm_run->exit_reason = KVM_EXIT_INTR; ret = -EINTR; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 683cbd6..4652e0b 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -455,10 +455,8 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) return r; } -static void kvmppc_check_requests(struct kvm_vcpu *vcpu) +void kvmppc_core_check_requests(struct kvm_vcpu *vcpu) { - trace_kvm_check_requests(vcpu); - if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) update_timer_ints(vcpu); #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) @@ -467,60 +465,6 @@ static void kvmppc_check_requests(struct kvm_vcpu *vcpu) #endif } -/* - * Common checks before entering the guest world. Call with interrupts - * disabled. 
- * - * returns !0 if a signal is pending and check_signal is true - */ -static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) -{ - int r = 0; - - WARN_ON_ONCE(!irqs_disabled()); - while (true) { - if (need_resched()) { - local_irq_enable(); - cond_resched(); - local_irq_disable(); - continue; - } - - if (signal_pending(current)) { - r = 1; - break; - } - - smp_mb(); - if (vcpu->requests) { - /* Make sure we process requests preemptable */ - local_irq_enable(); - kvmppc_check_requests(vcpu); - local_irq_disable(); - continue; - } - - if (kvmppc_core_prepare_to_enter(vcpu)) { - /* interrupts got enabled in between, so we - are back at square 1 */ - continue; - } - - if (vcpu->mode == EXITING_GUEST_MODE) { - r = 1; - break; - } - - /* Going into guest context! Yay! */ - vcpu->mode = IN_GUEST_MODE; - smp_wmb(); - - break; - } - - return r; -} - int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int ret; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 45fe433..153a26a 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -47,6 +47,63 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return 1; } +#ifndef CONFIG_KVM_BOOK3S_64_HV +/* + * Common checks before entering the guest world. Call with interrupts + * disabled. + * + * returns !0 if a signal is pending and check_signal is true + */ +int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) +{ + int r = 0; + + WARN_ON_ONCE(!irqs_disabled()); + while (true) { + if (need_resched()) { + local_irq_enable(); + cond_resched(); + local_irq_disable(); + continue; + } + + if (signal_pending(current)) { + r = 1; + break; + } + + smp_mb(); + if (vcpu->requests) { + /* Make sure we process requests preemptable */ + local_irq_enable(); + trace_kvm_check_requests(vcpu); + kvmppc_core_check_requests(vcpu); + local_irq_disable(); + continue; + } + + if (kvmppc_core_prepare_to_enter(vcpu)) { + /* interrupts got enabled in between, so we + are back at square 1 */ + continue; + } + + if (vcpu->mode == EXITING_GUEST_MODE) { + r = 1; + break; + } + + /* Going into guest context! Yay! */ + vcpu->mode = IN_GUEST_MODE; + smp_wmb(); + + break; + } + + return r; +} +#endif /* CONFIG_KVM_BOOK3S_64_HV */ + int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) { int nr = kvmppc_get_gpr(vcpu, 11); -- cgit v0.10.2 From 9b0cb3c808fef0d75d6f79ab9684246e6879f9c1 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 10 Aug 2012 13:23:55 +0200 Subject: KVM: PPC: Book3s: PR: Add (dumb) MMU Notifier support Now that we have very simple MMU Notifier support for e500 in place, also add the same simple support to book3s. It gets us one step closer to actual fast support. 
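The lifetime rule behind the kvm_release_pfn_clean() movement in the diff below, restated as a condensed sketch (the reasoning is inferred and hedged; the hook name is real, the body is simplified):

    int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
    {
            /*
             * The host is about to unmap hva, so drop every shadow mapping
             * first; flushing everything is slow but always correct. Because
             * this runs before the host page can go away, the mapping path no
             * longer needs to hold a page reference and may release it right
             * after installing the shadow PTE.
             */
            kvm_flush_remote_tlbs(kvm);
            return 0;
    }
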
Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index cea9d3a..4a5ec8f 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -46,8 +46,7 @@ #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #endif -#if defined(CONFIG_KVM_BOOK3S_64_HV) || defined(CONFIG_KVM_E500V2) || \ - defined(CONFIG_KVM_E500MC) +#if !defined(CONFIG_KVM_440) #include #define KVM_ARCH_WANT_MMU_NOTIFIER diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 40cad8c..71f0cd9 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -36,6 +36,7 @@ config KVM_BOOK3S_64_HANDLER config KVM_BOOK3S_PR bool select KVM_MMIO + select MMU_NOTIFIER config KVM_BOOK3S_32 tristate "KVM support for PowerPC book3s_32 processors" diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c index 837f13e..9fac010 100644 --- a/arch/powerpc/kvm/book3s_32_mmu_host.c +++ b/arch/powerpc/kvm/book3s_32_mmu_host.c @@ -254,6 +254,7 @@ next_pteg: kvmppc_mmu_hpte_cache_map(vcpu, pte); + kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT); out: return r; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index 0688b6b..6b2c80e 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -168,6 +168,7 @@ map_again: kvmppc_mmu_hpte_cache_map(vcpu, pte); } + kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT); out: return r; diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c index 41cb001..2c86b0d 100644 --- a/arch/powerpc/kvm/book3s_mmu_hpte.c +++ b/arch/powerpc/kvm/book3s_mmu_hpte.c @@ -114,11 +114,6 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) hlist_del_init_rcu(&pte->list_vpte); hlist_del_init_rcu(&pte->list_vpte_long); - if (pte->pte.may_write) - kvm_release_pfn_dirty(pte->pfn); - else - kvm_release_pfn_clean(pte->pfn); - spin_unlock(&vcpu3s->mmu_lock); vcpu3s->hpte_cache_count--; diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index cae2def..10f8217 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -90,8 +90,55 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) void kvmppc_core_check_requests(struct kvm_vcpu *vcpu) { + /* We misuse TLB_FLUSH to indicate that we want to clear + all shadow cache entries */ + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + kvmppc_mmu_pte_flush(vcpu, 0, 0); } +/************* MMU Notifiers *************/ + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + trace_kvm_unmap_hva(hva); + + /* + * Flush all shadow tlb entries everywhere. 
This is slow, but + * we are 100% sure that we catch the to be unmapped page + */ + kvm_flush_remote_tlbs(kvm); + + return 0; +} + +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +{ + /* kvm_unmap_hva flushes everything anyways */ + kvm_unmap_hva(kvm, start); + + return 0; +} + +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + /* XXX could be more clever ;) */ + return 0; +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + /* XXX could be more clever ;) */ + return 0; +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + /* The page will get remapped properly on its next fault */ + kvm_unmap_hva(kvm, hva); +} + +/*****************************************/ + static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) { ulong smsr = vcpu->arch.shared->msr; -- cgit v0.10.2 From e85ad380c6bf6dcd4776d313c81d16a6293db136 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Sun, 12 Aug 2012 11:13:25 +0200 Subject: KVM: PPC: BookE: Drop redundant vcpu->mode set We only need to set vcpu->mode to outside once. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 4652e0b..492c343 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -528,8 +528,6 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) #endif kvm_guest_exit(); - vcpu->mode = OUTSIDE_GUEST_MODE; - smp_wmb(); out: vcpu->mode = OUTSIDE_GUEST_MODE; -- cgit v0.10.2 From c63ddcb4540db95e5a4223cfa8cdbe6efbd5e386 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Sun, 12 Aug 2012 11:27:49 +0200 Subject: KVM: PPC: Book3S: PR: Only do resched check once per exit Now that we use our generic exit helper, we can safely drop our previous kvm_resched that we used to trigger at the beginning of the exit handler function. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 10f8217..2c268a1 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -602,7 +602,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, trace_kvm_exit(exit_nr, vcpu); preempt_enable(); - kvm_resched(vcpu); + switch (exit_nr) { case BOOK3S_INTERRUPT_INST_STORAGE: { -- cgit v0.10.2 From 706fb730cb4f9db2e3de33391475dd0616c2c935 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Sun, 12 Aug 2012 11:29:09 +0200 Subject: KVM: PPC: Exit guest context while handling exit The x86 implementation of KVM accounts for host time while processing guest exits. Do the same for us. 
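Condensed sketch of the accounting bracket involved (the handler shape below is hypothetical; kvm_guest_exit()/kvm_guest_enter() are the real helpers used in the diff): time between kvm_guest_enter() and kvm_guest_exit() is charged as guest time, so calling kvm_guest_exit() at the top of the exit handler makes the handling work show up as host time, as on x86.

    static int example_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                   unsigned int exit_nr)
    {
            kvm_guest_exit();       /* from here on, CPU time counts as host time */

            /* ... decode and handle exit_nr ... */

            kvm_guest_enter();      /* about to go back in: guest time again */
            return RESUME_GUEST;
    }
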
Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 2c268a1..b4ae11e 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -601,6 +601,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, __hard_irq_enable(); trace_kvm_exit(exit_nr, vcpu); + kvm_guest_exit(); preempt_enable(); switch (exit_nr) { @@ -872,6 +873,7 @@ program_interrupt: } } + kvm_guest_enter(); trace_kvm_book3s_reenter(r, vcpu); return r; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 492c343..887c7cc 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -650,6 +650,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, local_irq_enable(); trace_kvm_exit(exit_nr, vcpu); + kvm_guest_exit(); run->exit_reason = KVM_EXIT_UNKNOWN; run->ready_for_interrupt_injection = 1; @@ -952,6 +953,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, } } + kvm_guest_enter(); + return r; } -- cgit v0.10.2 From 0652eaaebea0995b3236e51dec727d62264f4248 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Sun, 12 Aug 2012 11:34:21 +0200 Subject: KVM: PPC: Book3S: PR: Indicate we're out of guest mode When going out of guest mode, indicate that we are in vcpu->mode. That way requests from other CPUs don't needlessly need to kick us to process them, because it'll just happen next time we enter the guest. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index b4ae11e..9430a36 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1152,6 +1152,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) #endif out: + vcpu->mode = OUTSIDE_GUEST_MODE; preempt_enable(); return ret; } -- cgit v0.10.2 From 24afa37b9c8f035d2fe2028e4824bc4e49bafe73 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Sun, 12 Aug 2012 12:42:30 +0200 Subject: KVM: PPC: Consistentify vcpu exit path When getting out of __vcpu_run, let's be consistent about the state we return in. We want to always * have IRQs enabled * have called kvm_guest_exit before Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 9430a36..3dec346 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -868,12 +868,15 @@ program_interrupt: */ __hard_irq_disable(); if (kvmppc_prepare_to_enter(vcpu)) { + /* local_irq_enable(); */ run->exit_reason = KVM_EXIT_INTR; r = -EINTR; + } else { + /* Going back to guest */ + kvm_guest_enter(); } } - kvm_guest_enter(); trace_kvm_book3s_reenter(r, vcpu); return r; @@ -1123,7 +1126,8 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) ret = __kvmppc_vcpu_run(kvm_run, vcpu); - kvm_guest_exit(); + /* No need for kvm_guest_exit. It's done in handle_exit. + We also get here with interrupts enabled. */ current->thread.regs->msr = ext_msr; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 887c7cc..aae535f 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -481,6 +481,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) local_irq_disable(); if (kvmppc_prepare_to_enter(vcpu)) { + local_irq_enable(); kvm_run->exit_reason = KVM_EXIT_INTR; ret = -EINTR; goto out; @@ -512,6 +513,9 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) ret = __kvmppc_vcpu_run(kvm_run, vcpu); + /* No need for kvm_guest_exit. It's done in handle_exit. 
+ We also get here with interrupts enabled. */ + #ifdef CONFIG_PPC_FPU kvmppc_save_guest_fp(vcpu); @@ -527,12 +531,9 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) current->thread.fpexc_mode = fpexc_mode; #endif - kvm_guest_exit(); - out: vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - local_irq_enable(); return ret; } @@ -947,14 +948,16 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, if (!(r & RESUME_HOST)) { local_irq_disable(); if (kvmppc_prepare_to_enter(vcpu)) { + local_irq_enable(); run->exit_reason = KVM_EXIT_INTR; r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); kvmppc_account_exit(vcpu, SIGNAL_EXITS); + } else { + /* Going back to guest */ + kvm_guest_enter(); } } - kvm_guest_enter(); - return r; } -- cgit v0.10.2 From bd2be6836ee493d41fe42367a2b129aa771185c1 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 13 Aug 2012 01:04:19 +0200 Subject: KVM: PPC: Book3S: PR: Rework irq disabling Today, we disable preemption while inside guest context, because we need to expose to the world that we are not in a preemptible context. However, during that time we already have interrupts disabled, which would indicate that we are in a non-preemptible context. The reason the checks for irqs_disabled() fail for us though is that we manually control hard IRQs and ignore all the lazy EE framework. Let's stop doing that. Instead, let's always use lazy EE to indicate when we want to disable IRQs, but do a special final switch that gets us into EE disabled, but soft enabled state. That way when we get back out of guest state, we are immediately ready to process interrupts. This simplifies the code drastically and reduces the time that we appear as preempt disabled. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 59b7c87..5459364 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -234,5 +234,15 @@ static inline void kvmppc_mmu_flush_icache(pfn_t pfn) } } +/* Please call after prepare_to_enter. This function puts the lazy ee state + back to normal mode, without actually enabling interrupts. 
*/ +static inline void kvmppc_lazy_ee_enable(void) +{ +#ifdef CONFIG_PPC64 + /* Only need to enable IRQs by hard enabling them after this */ + local_paca->irq_happened = 0; + local_paca->soft_enabled = 1; +#endif +} #endif /* __POWERPC_KVM_PPC_H__ */ diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 3dec346..e737db8 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -52,8 +52,6 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, #define MSR_USER32 MSR_USER #define MSR_USER64 MSR_USER #define HW_PAGE_SIZE PAGE_SIZE -#define __hard_irq_disable local_irq_disable -#define __hard_irq_enable local_irq_enable #endif void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -597,12 +595,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, run->exit_reason = KVM_EXIT_UNKNOWN; run->ready_for_interrupt_injection = 1; - /* We get here with MSR.EE=0, so enable it to be a nice citizen */ - __hard_irq_enable(); + /* We get here with MSR.EE=1 */ trace_kvm_exit(exit_nr, vcpu); kvm_guest_exit(); - preempt_enable(); switch (exit_nr) { case BOOK3S_INTERRUPT_INST_STORAGE: @@ -854,7 +850,6 @@ program_interrupt: } } - preempt_disable(); if (!(r & RESUME_HOST)) { /* To avoid clobbering exit_reason, only check for signals if * we aren't already exiting to userspace for some other @@ -866,14 +861,15 @@ program_interrupt: * and if we really did time things so badly, then we just exit * again due to a host external interrupt. */ - __hard_irq_disable(); + local_irq_disable(); if (kvmppc_prepare_to_enter(vcpu)) { - /* local_irq_enable(); */ + local_irq_enable(); run->exit_reason = KVM_EXIT_INTR; r = -EINTR; } else { /* Going back to guest */ kvm_guest_enter(); + kvmppc_lazy_ee_enable(); } } @@ -1066,8 +1062,6 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) #endif ulong ext_msr; - preempt_disable(); - /* Check if we can run the vcpu at all */ if (!vcpu->arch.sane) { kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -1081,9 +1075,9 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) * really did time things so badly, then we just exit again due to * a host external interrupt. 
*/ - __hard_irq_disable(); + local_irq_disable(); if (kvmppc_prepare_to_enter(vcpu)) { - __hard_irq_enable(); + local_irq_enable(); kvm_run->exit_reason = KVM_EXIT_INTR; ret = -EINTR; goto out; @@ -1122,7 +1116,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if (vcpu->arch.shared->msr & MSR_FP) kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); - kvm_guest_enter(); + kvmppc_lazy_ee_enable(); ret = __kvmppc_vcpu_run(kvm_run, vcpu); @@ -1157,7 +1151,6 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) out: vcpu->mode = OUTSIDE_GUEST_MODE; - preempt_enable(); return ret; } diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index 9ecf6e3..b2f8258 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -170,20 +170,21 @@ kvmppc_handler_skip_ins: * Call kvmppc_handler_trampoline_enter in real mode * * On entry, r4 contains the guest shadow MSR + * MSR.EE has to be 0 when calling this function */ _GLOBAL(kvmppc_entry_trampoline) mfmsr r5 LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter) toreal(r7) - li r9, MSR_RI - ori r9, r9, MSR_EE - andc r9, r5, r9 /* Clear EE and RI in MSR value */ li r6, MSR_IR | MSR_DR - ori r6, r6, MSR_EE - andc r6, r5, r6 /* Clear EE, DR and IR in MSR value */ - MTMSR_EERI(r9) /* Clear EE and RI in MSR */ - mtsrr0 r7 /* before we set srr0/1 */ + andc r6, r5, r6 /* Clear DR and IR in MSR value */ + /* + * Set EE in HOST_MSR so that it's enabled when we get into our + * C exit handler function + */ + ori r5, r5, MSR_EE + mtsrr0 r7 mtsrr1 r6 RFI diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index aae535f..2bd190c 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -486,6 +486,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) ret = -EINTR; goto out; } + kvmppc_lazy_ee_enable(); kvm_guest_enter(); @@ -955,6 +956,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, } else { /* Going back to guest */ kvm_guest_enter(); + kvmppc_lazy_ee_enable(); } } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 153a26a..2665499 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "timing.h" #include "../mm/mmu_decl.h" @@ -93,6 +94,19 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) break; } +#ifdef CONFIG_PPC64 + /* lazy EE magic */ + hard_irq_disable(); + if (lazy_irq_pending()) { + /* Got an interrupt in between, try again */ + local_irq_enable(); + local_irq_disable(); + continue; + } + + trace_hardirqs_on(); +#endif + /* Going into guest context! Yay! */ vcpu->mode = IN_GUEST_MODE; smp_wmb(); -- cgit v0.10.2 From 3766a4c693358cff33441310413e3776dbbf8ef0 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 13 Aug 2012 01:24:01 +0200 Subject: KVM: PPC: Move kvm_guest_enter call into generic code We need to call kvm_guest_enter in booke and book3s, so move its call to generic code. 
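As a minimal sketch (condensed from the powerpc.c hunk below), the tail of the retry loop in kvmppc_prepare_to_enter() now owns the guest-enter accounting for both booke and book3s, right before the vcpu is marked as being in guest mode:

    /* Sketch of the loop tail in kvmppc_prepare_to_enter() after this patch. */
    #ifdef CONFIG_PPC64
            hard_irq_disable();
            if (lazy_irq_pending()) {
                    /* An interrupt sneaked in; drop back out and retry */
                    local_irq_enable();
                    local_irq_disable();
                    kvm_guest_exit();
                    continue;
            }
            trace_hardirqs_on();
    #endif

            kvm_guest_enter();

            vcpu->mode = IN_GUEST_MODE;
            smp_wmb();
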
Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index e737db8..1ff0d6c 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -867,8 +867,6 @@ program_interrupt: run->exit_reason = KVM_EXIT_INTR; r = -EINTR; } else { - /* Going back to guest */ - kvm_guest_enter(); kvmppc_lazy_ee_enable(); } } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 2bd190c..5e8dc19 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -954,8 +954,6 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); kvmppc_account_exit(vcpu, SIGNAL_EXITS); } else { - /* Going back to guest */ - kvm_guest_enter(); kvmppc_lazy_ee_enable(); } } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2665499..6646574 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -101,12 +101,15 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) /* Got an interrupt in between, try again */ local_irq_enable(); local_irq_disable(); + kvm_guest_exit(); continue; } trace_hardirqs_on(); #endif + kvm_guest_enter(); + /* Going into guest context! Yay! */ vcpu->mode = IN_GUEST_MODE; smp_wmb(); -- cgit v0.10.2 From 206c2ed7f1ea55222bde2954ee3d65c2e9cfb750 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 13 Aug 2012 12:43:33 +0200 Subject: KVM: PPC: Ignore EXITING_GUEST_MODE mode We don't need to do anything when mode is EXITING_GUEST_MODE, because we essentially are outside of guest mode and did everything it asked us to do by the time we check it. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 6646574..dc86371 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -89,11 +89,6 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) continue; } - if (vcpu->mode == EXITING_GUEST_MODE) { - r = 1; - break; - } - #ifdef CONFIG_PPC64 /* lazy EE magic */ hard_irq_disable(); -- cgit v0.10.2 From 7ee788556bf395a8ef413bea33494df29a3409e0 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 13 Aug 2012 12:44:41 +0200 Subject: KVM: PPC: Add return value in prepare_to_enter Our prepare_to_enter helper wants to be able to return in more circumstances to the host than only when an interrupt is pending. Broaden the interface a bit and move even more generic code to the generic helper. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 1ff0d6c..71fa0f1 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -589,6 +589,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int exit_nr) { int r = RESUME_HOST; + int s; vcpu->stat.sum_exits++; @@ -862,10 +863,10 @@ program_interrupt: * again due to a host external interrupt. */ local_irq_disable(); - if (kvmppc_prepare_to_enter(vcpu)) { + s = kvmppc_prepare_to_enter(vcpu); + if (s <= 0) { local_irq_enable(); - run->exit_reason = KVM_EXIT_INTR; - r = -EINTR; + r = s; } else { kvmppc_lazy_ee_enable(); } @@ -1074,10 +1075,9 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) * a host external interrupt. 
*/ local_irq_disable(); - if (kvmppc_prepare_to_enter(vcpu)) { + ret = kvmppc_prepare_to_enter(vcpu); + if (ret <= 0) { local_irq_enable(); - kvm_run->exit_reason = KVM_EXIT_INTR; - ret = -EINTR; goto out; } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 5e8dc19..1917802 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -467,7 +467,7 @@ void kvmppc_core_check_requests(struct kvm_vcpu *vcpu) int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { - int ret; + int ret, s; #ifdef CONFIG_PPC_FPU unsigned int fpscr; int fpexc_mode; @@ -480,10 +480,10 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } local_irq_disable(); - if (kvmppc_prepare_to_enter(vcpu)) { + s = kvmppc_prepare_to_enter(vcpu); + if (s <= 0) { local_irq_enable(); - kvm_run->exit_reason = KVM_EXIT_INTR; - ret = -EINTR; + ret = s; goto out; } kvmppc_lazy_ee_enable(); @@ -642,6 +642,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int exit_nr) { int r = RESUME_HOST; + int s; /* update before a new last_exit_type is rewritten */ kvmppc_update_timing_stats(vcpu); @@ -948,11 +949,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, */ if (!(r & RESUME_HOST)) { local_irq_disable(); - if (kvmppc_prepare_to_enter(vcpu)) { + s = kvmppc_prepare_to_enter(vcpu); + if (s <= 0) { local_irq_enable(); - run->exit_reason = KVM_EXIT_INTR; - r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); - kvmppc_account_exit(vcpu, SIGNAL_EXITS); + r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); } else { kvmppc_lazy_ee_enable(); } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index dc86371..0e2a98a 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -53,11 +53,14 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) * Common checks before entering the guest world. Call with interrupts * disabled. * - * returns !0 if a signal is pending and check_signal is true + * returns: + * + * == 1 if we're ready to go into guest state + * <= 0 if we need to go back to the host with return value */ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) { - int r = 0; + int r = 1; WARN_ON_ONCE(!irqs_disabled()); while (true) { @@ -69,7 +72,9 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) } if (signal_pending(current)) { - r = 1; + kvmppc_account_exit(vcpu, SIGNAL_EXITS); + vcpu->run->exit_reason = KVM_EXIT_INTR; + r = -EINTR; break; } -- cgit v0.10.2 From 7c973a2ebb8fb9c8ee2ae9647f9ad7b0ad58a3e6 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 13 Aug 2012 12:50:35 +0200 Subject: KVM: PPC: Add return value to core_check_requests Requests may want to tell us that we need to go back into host state, so add a return value for the checks. 
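A minimal sketch of the new contract (names are taken from the hunks that follow; the watchdog case is borrowed from a later patch in this series purely for illustration): a backend returns a positive value when reentering the guest is fine, and a non-positive value, with run->exit_reason already set, when the caller must bail out to the host.

    /* Sketch: booke-style backend using the new return value. */
    int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
    {
            int r = 1;      /* default: OK to go back into the guest */

            if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu))
                    update_timer_ints(vcpu);

            /* Example of a request that needs userspace (added later in this series): */
            if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu)) {
                    vcpu->run->exit_reason = KVM_EXIT_WATCHDOG;
                    r = 0;
            }

            return r;       /* callers treat r <= 0 as "return to the host" */
    }
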
Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 5459364..3dfc437 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -112,7 +112,7 @@ extern int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong val); extern int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *val); -extern void kvmppc_core_check_requests(struct kvm_vcpu *vcpu); +extern int kvmppc_core_check_requests(struct kvm_vcpu *vcpu); extern int kvmppc_booke_init(void); extern void kvmppc_booke_exit(void); diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 71fa0f1..b3c584f 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -86,12 +86,16 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) kvmppc_giveup_ext(vcpu, MSR_VSX); } -void kvmppc_core_check_requests(struct kvm_vcpu *vcpu) +int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) { + int r = 1; /* Indicate we want to get back into the guest */ + /* We misuse TLB_FLUSH to indicate that we want to clear all shadow cache entries */ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) kvmppc_mmu_pte_flush(vcpu, 0, 0); + + return r; } /************* MMU Notifiers *************/ diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 1917802..c364930 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -455,14 +455,18 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) return r; } -void kvmppc_core_check_requests(struct kvm_vcpu *vcpu) +int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) { + int r = 1; /* Indicate we want to get back into the guest */ + if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) update_timer_ints(vcpu); #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) kvmppc_core_flush_tlb(vcpu); #endif + + return r; } int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 0e2a98a..54b12af 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -83,9 +83,11 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) /* Make sure we process requests preemptable */ local_irq_enable(); trace_kvm_check_requests(vcpu); - kvmppc_core_check_requests(vcpu); + r = kvmppc_core_check_requests(vcpu); local_irq_disable(); - continue; + if (r > 0) + continue; + break; } if (kvmppc_core_prepare_to_enter(vcpu)) { -- cgit v0.10.2 From f61c94bb99ca4253ac5dd57750e1af209a4beb7a Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 8 Aug 2012 20:38:19 +0000 Subject: KVM: PPC: booke: Add watchdog emulation This patch adds the watchdog emulation in KVM. The watchdog emulation is enabled by KVM_ENABLE_CAP(KVM_CAP_PPC_BOOKE_WATCHDOG) ioctl. The kernel timer are used for watchdog emulation and emulates h/w watchdog state machine. On watchdog timer expiry, it exit to QEMU if TCR.WRC is non ZERO. QEMU can reset/shutdown etc depending upon how it is configured. 
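For illustration only, a small userspace sketch of how a VMM such as QEMU would switch this on (vcpu_fd and run are assumed to exist already, and the headers are assumed to come from a kernel carrying this patch): the capability is enabled per vcpu via KVM_ENABLE_CAP, and the final watchdog expiry then surfaces as a KVM_EXIT_WATCHDOG exit from KVM_RUN.

    /* Hypothetical userspace sketch; error handling omitted. */
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int enable_booke_watchdog(int vcpu_fd)
    {
            struct kvm_enable_cap cap;

            memset(&cap, 0, sizeof(cap));
            cap.cap = KVM_CAP_PPC_BOOKE_WATCHDOG;
            return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);    /* 0 on success */
    }

    /* After KVM_RUN returns:
     *   if (run->exit_reason == KVM_EXIT_WATCHDOG)
     *           -> reset or shut down the guest, as configured.
     */
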
Signed-off-by: Liu Yu Signed-off-by: Scott Wood [bharat.bhushan@freescale.com: reworked patch] Signed-off-by: Bharat Bhushan [agraf: adjust to new request framework] Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 4a5ec8f..51b0ccd 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -471,6 +471,8 @@ struct kvm_vcpu_arch { ulong fault_esr; ulong queued_dear; ulong queued_esr; + spinlock_t wdt_lock; + struct timer_list wdt_timer; u32 tlbcfg[4]; u32 mmucfg; u32 epr; @@ -486,6 +488,7 @@ struct kvm_vcpu_arch { u8 osi_needed; u8 osi_enabled; u8 papr_enabled; + u8 watchdog_enabled; u8 sane; u8 cpu_type; u8 hcall_needed; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 3dfc437..c06a64b 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -68,6 +68,8 @@ extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); extern void kvmppc_decrementer_func(unsigned long data); extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); +extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu); +extern void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu); /* Core-specific hooks */ diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index 2d916c4..e07e6af 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -539,6 +539,13 @@ #define TCR_FIE 0x00800000 /* FIT Interrupt Enable */ #define TCR_ARE 0x00400000 /* Auto Reload Enable */ +#ifdef CONFIG_E500 +#define TCR_GET_WP(tcr) ((((tcr) & 0xC0000000) >> 30) | \ + (((tcr) & 0x1E0000) >> 15)) +#else +#define TCR_GET_WP(tcr) (((tcr) & 0xC0000000) >> 30) +#endif + /* Bit definitions for the TSR. 
*/ #define TSR_ENW 0x80000000 /* Enable Next Watchdog */ #define TSR_WIS 0x40000000 /* WDT Interrupt Status */ diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 3f2a836..e946665 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -411,6 +411,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) +{ + return 0; +} + +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ +} + int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index c364930..09e8bf3 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -209,6 +209,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); } +static void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu) +{ + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG); +} + +static void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu) +{ + clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions); +} + static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) { #ifdef CONFIG_KVM_BOOKE_HV @@ -328,6 +338,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, msr_mask = MSR_CE | MSR_ME | MSR_DE; int_class = INT_CLASS_NONCRIT; break; + case BOOKE_IRQPRIO_WATCHDOG: case BOOKE_IRQPRIO_CRITICAL: case BOOKE_IRQPRIO_DBELL_CRIT: allowed = vcpu->arch.shared->msr & MSR_CE; @@ -407,12 +418,121 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, return allowed; } +/* + * Return the number of jiffies until the next timeout. If the timeout is + * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA + * because the larger value can break the timer APIs. + */ +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu) +{ + u64 tb, wdt_tb, wdt_ticks = 0; + u64 nr_jiffies = 0; + u32 period = TCR_GET_WP(vcpu->arch.tcr); + + wdt_tb = 1ULL << (63 - period); + tb = get_tb(); + /* + * The watchdog timeout will hapeen when TB bit corresponding + * to watchdog will toggle from 0 to 1. + */ + if (tb & wdt_tb) + wdt_ticks = wdt_tb; + + wdt_ticks += wdt_tb - (tb & (wdt_tb - 1)); + + /* Convert timebase ticks to jiffies */ + nr_jiffies = wdt_ticks; + + if (do_div(nr_jiffies, tb_ticks_per_jiffy)) + nr_jiffies++; + + return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA); +} + +static void arm_next_watchdog(struct kvm_vcpu *vcpu) +{ + unsigned long nr_jiffies; + unsigned long flags; + + /* + * If TSR_ENW and TSR_WIS are not set then no need to exit to + * userspace, so clear the KVM_REQ_WATCHDOG request. + */ + if ((vcpu->arch.tsr & (TSR_ENW | TSR_WIS)) != (TSR_ENW | TSR_WIS)) + clear_bit(KVM_REQ_WATCHDOG, &vcpu->requests); + + spin_lock_irqsave(&vcpu->arch.wdt_lock, flags); + nr_jiffies = watchdog_next_timeout(vcpu); + /* + * If the number of jiffies of watchdog timer >= NEXT_TIMER_MAX_DELTA + * then do not run the watchdog timer as this can break timer APIs. 
+ */ + if (nr_jiffies < NEXT_TIMER_MAX_DELTA) + mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies); + else + del_timer(&vcpu->arch.wdt_timer); + spin_unlock_irqrestore(&vcpu->arch.wdt_lock, flags); +} + +void kvmppc_watchdog_func(unsigned long data) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; + u32 tsr, new_tsr; + int final; + + do { + new_tsr = tsr = vcpu->arch.tsr; + final = 0; + + /* Time out event */ + if (tsr & TSR_ENW) { + if (tsr & TSR_WIS) + final = 1; + else + new_tsr = tsr | TSR_WIS; + } else { + new_tsr = tsr | TSR_ENW; + } + } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr); + + if (new_tsr & TSR_WIS) { + smp_wmb(); + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_vcpu_kick(vcpu); + } + + /* + * If this is final watchdog expiry and some action is required + * then exit to userspace. + */ + if (final && (vcpu->arch.tcr & TCR_WRC_MASK) && + vcpu->arch.watchdog_enabled) { + smp_wmb(); + kvm_make_request(KVM_REQ_WATCHDOG, vcpu); + kvm_vcpu_kick(vcpu); + } + + /* + * Stop running the watchdog timer after final expiration to + * prevent the host from being flooded with timers if the + * guest sets a short period. + * Timers will resume when TSR/TCR is updated next time. + */ + if (!final) + arm_next_watchdog(vcpu); +} + static void update_timer_ints(struct kvm_vcpu *vcpu) { if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS)) kvmppc_core_queue_dec(vcpu); else kvmppc_core_dequeue_dec(vcpu); + + if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS)) + kvmppc_core_queue_watchdog(vcpu); + else + kvmppc_core_dequeue_watchdog(vcpu); } static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) @@ -466,6 +586,11 @@ int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) kvmppc_core_flush_tlb(vcpu); #endif + if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_WATCHDOG; + r = 0; + } + return r; } @@ -995,6 +1120,21 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return r; } +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) +{ + /* setup watchdog timer once */ + spin_lock_init(&vcpu->arch.wdt_lock); + setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func, + (unsigned long)vcpu); + + return 0; +} + +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + del_timer_sync(&vcpu->arch.wdt_timer); +} + int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; @@ -1090,7 +1230,13 @@ static int set_sregs_base(struct kvm_vcpu *vcpu, } if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { + u32 old_tsr = vcpu->arch.tsr; + vcpu->arch.tsr = sregs->u.e.tsr; + + if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS)) + arm_next_watchdog(vcpu); + update_timer_ints(vcpu); } @@ -1251,6 +1397,7 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm, void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr) { vcpu->arch.tcr = new_tcr; + arm_next_watchdog(vcpu); update_timer_ints(vcpu); } @@ -1265,6 +1412,14 @@ void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) { clear_bits(tsr_bits, &vcpu->arch.tsr); + + /* + * We may have stopped the watchdog due to + * being stuck on final expiration. 
+ */ + if (tsr_bits & (TSR_ENW | TSR_WIS)) + arm_next_watchdog(vcpu); + update_timer_ints(vcpu); } diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index 12834bb..5a66ade 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -145,6 +145,14 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) kvmppc_clr_tsr_bits(vcpu, spr_val); break; case SPRN_TCR: + /* + * WRC is a 2-bit field that is supposed to preserve its + * value once written to non-zero. + */ + if (vcpu->arch.tcr & TCR_WRC_MASK) { + spr_val &= ~TCR_WRC_MASK; + spr_val |= vcpu->arch.tcr & TCR_WRC_MASK; + } kvmppc_set_tcr(vcpu, spr_val); break; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 54b12af..0ffd7d1 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -300,6 +300,7 @@ int kvm_dev_ioctl_check_extension(long ext) switch (ext) { #ifdef CONFIG_BOOKE case KVM_CAP_PPC_BOOKE_SREGS: + case KVM_CAP_PPC_BOOKE_WATCHDOG: #else case KVM_CAP_PPC_SEGSTATE: case KVM_CAP_PPC_HIOR: @@ -476,6 +477,8 @@ enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer) int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { + int ret; + hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu); vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; @@ -484,13 +487,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) #ifdef CONFIG_KVM_EXIT_TIMING mutex_init(&vcpu->arch.exit_timing_lock); #endif - - return 0; + ret = kvmppc_subarch_vcpu_init(vcpu); + return ret; } void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { kvmppc_mmu_destroy(vcpu); + kvmppc_subarch_vcpu_uninit(vcpu); } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -735,6 +739,12 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, r = 0; vcpu->arch.papr_enabled = true; break; +#ifdef CONFIG_BOOKE + case KVM_CAP_PPC_BOOKE_WATCHDOG: + r = 0; + vcpu->arch.watchdog_enabled = true; + break; +#endif #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) case KVM_CAP_SW_TLB: { struct kvm_config_tlb cfg; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 4cb3761..1649d4b 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -167,6 +167,7 @@ struct kvm_pit_config { #define KVM_EXIT_OSI 18 #define KVM_EXIT_PAPR_HCALL 19 #define KVM_EXIT_S390_UCONTROL 20 +#define KVM_EXIT_WATCHDOG 21 /* For KVM_EXIT_INTERNAL_ERROR */ #define KVM_INTERNAL_ERROR_EMULATION 1 @@ -628,6 +629,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_READONLY_MEM 81 #endif #define KVM_CAP_IRQFD_RESAMPLE 82 +#define KVM_CAP_PPC_BOOKE_WATCHDOG 83 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2850656e..0ca3663 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -118,6 +118,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_IMMEDIATE_EXIT 15 #define KVM_REQ_PMU 16 #define KVM_REQ_PMI 17 +#define KVM_REQ_WATCHDOG 18 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 -- cgit v0.10.2 From 6df8d3fc58dde84fc82a9ec2581440e54dfd3d14 Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 8 Aug 2012 21:17:55 +0000 Subject: booke: Added ONE_REG interface for IAC/DAC debug registers IAC/DAC are defined as 32 bit while they are 64 bit wide. So ONE_REG interface is added to set/get them. 
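For illustration, a userspace sketch of writing one of these registers through ONE_REG (vcpu_fd is assumed to exist; the register IDs are the ones defined in the kvm.h hunk below, and KVM_SET_ONE_REG/KVM_GET_ONE_REG are the existing generic ioctls): the full 64-bit value sits in user memory and kvm_one_reg.addr points at it.

    /* Hypothetical userspace sketch; error handling omitted. */
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int set_iac1(int vcpu_fd, uint64_t val)
    {
            struct kvm_one_reg reg = {
                    .id   = KVM_REG_PPC_IAC1,
                    .addr = (uintptr_t)&val,
            };

            return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);   /* 0 on success */
    }
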
Signed-off-by: Bharat Bhushan Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index 1bea4d8..3c14202 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -221,6 +221,12 @@ struct kvm_sregs { __u32 dbsr; /* KVM_SREGS_E_UPDATE_DBSR */ __u32 dbcr[3]; + /* + * iac/dac registers are 64bit wide, while this API + * interface provides only lower 32 bits on 64 bit + * processors. ONE_REG interface is added for 64bit + * iac/dac registers. + */ __u32 iac[4]; __u32 dac[2]; __u32 dvc[2]; @@ -326,5 +332,11 @@ struct kvm_book3e_206_tlb_params { }; #define KVM_REG_PPC_HIOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1) +#define KVM_REG_PPC_IAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x2) +#define KVM_REG_PPC_IAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3) +#define KVM_REG_PPC_IAC3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x4) +#define KVM_REG_PPC_IAC4 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x5) +#define KVM_REG_PPC_DAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x6) +#define KVM_REG_PPC_DAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x7) #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 51b0ccd..f20a5ef 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -346,6 +346,27 @@ struct kvmppc_slb { bool class : 1; }; +# ifdef CONFIG_PPC_FSL_BOOK3E +#define KVMPPC_BOOKE_IAC_NUM 2 +#define KVMPPC_BOOKE_DAC_NUM 2 +# else +#define KVMPPC_BOOKE_IAC_NUM 4 +#define KVMPPC_BOOKE_DAC_NUM 2 +# endif +#define KVMPPC_BOOKE_MAX_IAC 4 +#define KVMPPC_BOOKE_MAX_DAC 2 + +struct kvmppc_booke_debug_reg { + u32 dbcr0; + u32 dbcr1; + u32 dbcr2; +#ifdef CONFIG_KVM_E500MC + u32 dbcr4; +#endif + u64 iac[KVMPPC_BOOKE_MAX_IAC]; + u64 dac[KVMPPC_BOOKE_MAX_DAC]; +}; + struct kvm_vcpu_arch { ulong host_stack; u32 host_pid; @@ -440,8 +461,6 @@ struct kvm_vcpu_arch { u32 ccr0; u32 ccr1; - u32 dbcr0; - u32 dbcr1; u32 dbsr; u64 mmcr[3]; @@ -476,6 +495,7 @@ struct kvm_vcpu_arch { u32 tlbcfg[4]; u32 mmucfg; u32 epr; + struct kvmppc_booke_debug_reg dbg_reg; #endif gpa_t paddr_accessed; gva_t vaddr_accessed; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 09e8bf3..959aae9 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1351,12 +1351,56 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) { - return -EINVAL; + int r = -EINVAL; + + switch (reg->id) { + case KVM_REG_PPC_IAC1: + case KVM_REG_PPC_IAC2: + case KVM_REG_PPC_IAC3: + case KVM_REG_PPC_IAC4: { + int iac = reg->id - KVM_REG_PPC_IAC1; + r = copy_to_user((u64 __user *)(long)reg->addr, + &vcpu->arch.dbg_reg.iac[iac], sizeof(u64)); + break; + } + case KVM_REG_PPC_DAC1: + case KVM_REG_PPC_DAC2: { + int dac = reg->id - KVM_REG_PPC_DAC1; + r = copy_to_user((u64 __user *)(long)reg->addr, + &vcpu->arch.dbg_reg.dac[dac], sizeof(u64)); + break; + } + default: + break; + } + return r; } int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) { - return -EINVAL; + int r = -EINVAL; + + switch (reg->id) { + case KVM_REG_PPC_IAC1: + case KVM_REG_PPC_IAC2: + case KVM_REG_PPC_IAC3: + case KVM_REG_PPC_IAC4: { + int iac = reg->id - KVM_REG_PPC_IAC1; + r = copy_from_user(&vcpu->arch.dbg_reg.iac[iac], + (u64 __user *)(long)reg->addr, sizeof(u64)); + break; + } + case KVM_REG_PPC_DAC1: + case KVM_REG_PPC_DAC2: { + int dac = reg->id - KVM_REG_PPC_DAC1; + r = 
copy_from_user(&vcpu->arch.dbg_reg.dac[dac], + (u64 __user *)(long)reg->addr, sizeof(u64)); + break; + } + default: + break; + } + return r; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index 5a66ade..cc99a0b 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -133,10 +133,10 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) vcpu->arch.csrr1 = spr_val; break; case SPRN_DBCR0: - vcpu->arch.dbcr0 = spr_val; + vcpu->arch.dbg_reg.dbcr0 = spr_val; break; case SPRN_DBCR1: - vcpu->arch.dbcr1 = spr_val; + vcpu->arch.dbg_reg.dbcr1 = spr_val; break; case SPRN_DBSR: vcpu->arch.dbsr &= ~spr_val; @@ -266,10 +266,10 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) *spr_val = vcpu->arch.csrr1; break; case SPRN_DBCR0: - *spr_val = vcpu->arch.dbcr0; + *spr_val = vcpu->arch.dbg_reg.dbcr0; break; case SPRN_DBCR1: - *spr_val = vcpu->arch.dbcr1; + *spr_val = vcpu->arch.dbg_reg.dbcr1; break; case SPRN_DBSR: *spr_val = vcpu->arch.dbsr; -- cgit v0.10.2 From 491dd5b8a4926393308172da80c73faf242a4057 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 13 Aug 2012 14:40:29 +0200 Subject: KVM: PPC: 44x: Initialize PVR We need to make sure that vcpu->arch.pvr is initialized to a sane value, so let's just take the host PVR. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index 50e7dbc..3d7fd21 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -83,6 +83,7 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) vcpu_44x->shadow_refs[i].gtlb_index = -1; vcpu->arch.cpu_type = KVM_CPU_440; + vcpu->arch.pvr = mfspr(SPRN_PVR); return 0; } -- cgit v0.10.2 From 50c871edf59b4585fd2c17acfe4e7cd3752418b7 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 13 Aug 2012 14:50:54 +0200 Subject: KVM: PPC: BookE: Add MCSR SPR support Add support for the MCSR SPR. This only implements the SPR storage bits, not actual machine checks. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index cc99a0b..514790f 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -237,6 +237,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) case SPRN_IVOR15: vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = spr_val; break; + case SPRN_MCSR: + vcpu->arch.mcsr &= ~spr_val; + break; default: emulated = EMULATE_FAIL; @@ -329,6 +332,9 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) case SPRN_IVOR15: *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; break; + case SPRN_MCSR: + *spr_val = vcpu->arch.mcsr; + break; default: emulated = EMULATE_FAIL; -- cgit v0.10.2 From 166a2b7000c388aee81168987ce2eddb6783f550 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Wed, 15 Aug 2012 01:38:43 +0200 Subject: KVM: PPC: Use symbols for exit trace Exit traces are a lot easier to read when you don't have to remember cryptic numbers for guest exit reasons. Symbolify them in our trace output. 
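As an illustration (the field values below are placeholders, and the exact field list depends on the subarch and config), a Book3S decrementer exit that used to be logged as exit=0x900 now shows up in the trace as something like:

    kvm_exit: exit=DECREMENTER | pc=0x... | msr=0x... | dar=0x... | last_inst=0x...

The 0x900 to DECREMENTER mapping comes from the symbol table added below and is applied by __print_symbolic() in the TP_printk format.
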
Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h index cb2780a..519aba8 100644 --- a/arch/powerpc/kvm/trace.h +++ b/arch/powerpc/kvm/trace.h @@ -31,6 +31,60 @@ TRACE_EVENT(kvm_ppc_instr, __entry->inst, __entry->pc, __entry->emulate) ); +#ifdef CONFIG_PPC_BOOK3S +#define kvm_trace_symbol_exit \ + {0x100, "SYSTEM_RESET"}, \ + {0x200, "MACHINE_CHECK"}, \ + {0x300, "DATA_STORAGE"}, \ + {0x380, "DATA_SEGMENT"}, \ + {0x400, "INST_STORAGE"}, \ + {0x480, "INST_SEGMENT"}, \ + {0x500, "EXTERNAL"}, \ + {0x501, "EXTERNAL_LEVEL"}, \ + {0x502, "EXTERNAL_HV"}, \ + {0x600, "ALIGNMENT"}, \ + {0x700, "PROGRAM"}, \ + {0x800, "FP_UNAVAIL"}, \ + {0x900, "DECREMENTER"}, \ + {0x980, "HV_DECREMENTER"}, \ + {0xc00, "SYSCALL"}, \ + {0xd00, "TRACE"}, \ + {0xe00, "H_DATA_STORAGE"}, \ + {0xe20, "H_INST_STORAGE"}, \ + {0xe40, "H_EMUL_ASSIST"}, \ + {0xf00, "PERFMON"}, \ + {0xf20, "ALTIVEC"}, \ + {0xf40, "VSX"} +#else +#define kvm_trace_symbol_exit \ + {0, "CRITICAL"}, \ + {1, "MACHINE_CHECK"}, \ + {2, "DATA_STORAGE"}, \ + {3, "INST_STORAGE"}, \ + {4, "EXTERNAL"}, \ + {5, "ALIGNMENT"}, \ + {6, "PROGRAM"}, \ + {7, "FP_UNAVAIL"}, \ + {8, "SYSCALL"}, \ + {9, "AP_UNAVAIL"}, \ + {10, "DECREMENTER"}, \ + {11, "FIT"}, \ + {12, "WATCHDOG"}, \ + {13, "DTLB_MISS"}, \ + {14, "ITLB_MISS"}, \ + {15, "DEBUG"}, \ + {32, "SPE_UNAVAIL"}, \ + {33, "SPE_FP_DATA"}, \ + {34, "SPE_FP_ROUND"}, \ + {35, "PERFORMANCE_MONITOR"}, \ + {36, "DOORBELL"}, \ + {37, "DOORBELL_CRITICAL"}, \ + {38, "GUEST_DBELL"}, \ + {39, "GUEST_DBELL_CRIT"}, \ + {40, "HV_SYSCALL"}, \ + {41, "HV_PRIV"} +#endif + TRACE_EVENT(kvm_exit, TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), TP_ARGS(exit_nr, vcpu), @@ -62,7 +116,7 @@ TRACE_EVENT(kvm_exit, __entry->last_inst = vcpu->arch.last_inst; ), - TP_printk("exit=0x%x" + TP_printk("exit=%s" " | pc=0x%lx" " | msr=0x%lx" " | dar=0x%lx" @@ -71,7 +125,7 @@ TRACE_EVENT(kvm_exit, #endif " | last_inst=0x%lx" , - __entry->exit_nr, + __print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit), __entry->pc, __entry->msr, __entry->dar, -- cgit v0.10.2 From 430c7ff52ffb902e1e08b255b93c28fcad8cb9ef Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Wed, 15 Aug 2012 11:42:07 +0200 Subject: KVM: PPC: E500: Remove E500_TLB_DIRTY flag Since we always mark pages as dirty immediately when mapping them read/write now, there's no need for the dirty flag in our cache. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index aa8b814..d162286 100644 --- a/arch/powerpc/kvm/e500.h +++ b/arch/powerpc/kvm/e500.h @@ -27,8 +27,7 @@ #define E500_TLB_NUM 2 #define E500_TLB_VALID 1 -#define E500_TLB_DIRTY 2 -#define E500_TLB_BITMAP 4 +#define E500_TLB_BITMAP 2 struct tlbe_ref { pfn_t pfn; diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 1af6fab..43489a8 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -303,10 +303,8 @@ static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, ref->pfn = pfn; ref->flags = E500_TLB_VALID; - if (tlbe_is_writable(gtlbe)) { - ref->flags |= E500_TLB_DIRTY; + if (tlbe_is_writable(gtlbe)) kvm_set_pfn_dirty(pfn); - } } static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) -- cgit v0.10.2 From 2e2327023fe090d68aacad395178298645eaffea Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 15 Aug 2012 17:37:13 +0000 Subject: Document IACx/DACx registers access using ONE_REG API Patch to access the debug registers (IACx/DACx) using ONE_REG api was sent earlier. 
But that missed the respective documentation. Also corrected the index number referencing in section 4.69 Signed-off-by: Bharat Bhushan Signed-off-by: Alexander Graf diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 170afb1..c84739e 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1734,7 +1734,12 @@ registers, find a list below: Arch | Register | Width (bits) | | PPC | KVM_REG_PPC_HIOR | 64 - + PPC | KVM_REG_PPC_IAC1 | 64 + PPC | KVM_REG_PPC_IAC2 | 64 + PPC | KVM_REG_PPC_IAC3 | 64 + PPC | KVM_REG_PPC_IAC4 | 64 + PPC | KVM_REG_PPC_DAC1 | 64 + PPC | KVM_REG_PPC_DAC2 | 64 4.69 KVM_GET_ONE_REG @@ -1750,7 +1755,7 @@ kvm_one_reg struct passed in. On success, the register value can be found at the memory location pointed to by "addr". The list of registers accessible using this interface is identical to the -list in 4.64. +list in 4.68. 4.70 KVM_KVMCLOCK_CTRL -- cgit v0.10.2 From e4dcfe88fb30bcedda80c151018086fffb8280e6 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Thu, 16 Aug 2012 00:28:09 +0200 Subject: KVM: PPC: 440: Implement mtdcrx We need mtdcrx to execute properly on 460 cores. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c index c8c6157..3843a75 100644 --- a/arch/powerpc/kvm/44x_emulate.c +++ b/arch/powerpc/kvm/44x_emulate.c @@ -28,11 +28,29 @@ #include "44x_tlb.h" #define XOP_MFDCR 323 +#define XOP_MTDCRX 387 #define XOP_MTDCR 451 #define XOP_TLBSX 914 #define XOP_ICCCI 966 #define XOP_TLBWE 978 +static int emulate_mtdcr(struct kvm_vcpu *vcpu, int rs, int dcrn) +{ + /* emulate some access in kernel */ + switch (dcrn) { + case DCRN_CPR0_CONFIG_ADDR: + vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs); + return EMULATE_DONE; + default: + vcpu->run->dcr.dcrn = dcrn; + vcpu->run->dcr.data = kvmppc_get_gpr(vcpu, rs); + vcpu->run->dcr.is_write = 1; + vcpu->arch.dcr_needed = 1; + kvmppc_account_exit(vcpu, DCR_EXITS); + return EMULATE_DO_DCR; + } +} + int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance) { @@ -85,20 +103,12 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case XOP_MTDCR: - /* emulate some access in kernel */ - switch (dcrn) { - case DCRN_CPR0_CONFIG_ADDR: - vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs); - break; - default: - run->dcr.dcrn = dcrn; - run->dcr.data = kvmppc_get_gpr(vcpu, rs); - run->dcr.is_write = 1; - vcpu->arch.dcr_needed = 1; - kvmppc_account_exit(vcpu, DCR_EXITS); - emulated = EMULATE_DO_DCR; - } + emulated = emulate_mtdcr(vcpu, rs, dcrn); + break; + case XOP_MTDCRX: + emulated = emulate_mtdcr(vcpu, rs, + kvmppc_get_gpr(vcpu, ra)); break; case XOP_TLBWE: -- cgit v0.10.2 From ceb985f9d18cba2efdef08b8d31751c2c2b20d77 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Thu, 16 Aug 2012 00:34:58 +0200 Subject: KVM: PPC: 440: Implement mfdcrx We need mfdcrx to execute properly on 460 cores. 
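Condensed from the 44x_emulate.c hunk below: the only difference between the two forms is where the DCR number comes from, the immediate dcrn field for mtdcr versus GPR ra for mtdcrx, so both now funnel into the same helper.

    /* Sketch of the emulation dispatch after this patch. */
    case XOP_MTDCR:
            emulated = emulate_mtdcr(vcpu, rs, dcrn);                       /* DCRN from the instruction */
            break;
    case XOP_MTDCRX:
            emulated = emulate_mtdcr(vcpu, rs, kvmppc_get_gpr(vcpu, ra));   /* DCRN from register ra */
            break;
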
Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c index 3843a75..1a793c4 100644 --- a/arch/powerpc/kvm/44x_emulate.c +++ b/arch/powerpc/kvm/44x_emulate.c @@ -27,6 +27,7 @@ #include "booke.h" #include "44x_tlb.h" +#define XOP_MFDCRX 259 #define XOP_MFDCR 323 #define XOP_MTDCRX 387 #define XOP_MTDCR 451 @@ -51,6 +52,43 @@ static int emulate_mtdcr(struct kvm_vcpu *vcpu, int rs, int dcrn) } } +static int emulate_mfdcr(struct kvm_vcpu *vcpu, int rt, int dcrn) +{ + /* The guest may access CPR0 registers to determine the timebase + * frequency, and it must know the real host frequency because it + * can directly access the timebase registers. + * + * It would be possible to emulate those accesses in userspace, + * but userspace can really only figure out the end frequency. + * We could decompose that into the factors that compute it, but + * that's tricky math, and it's easier to just report the real + * CPR0 values. + */ + switch (dcrn) { + case DCRN_CPR0_CONFIG_ADDR: + kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr); + break; + case DCRN_CPR0_CONFIG_DATA: + local_irq_disable(); + mtdcr(DCRN_CPR0_CONFIG_ADDR, + vcpu->arch.cpr0_cfgaddr); + kvmppc_set_gpr(vcpu, rt, + mfdcr(DCRN_CPR0_CONFIG_DATA)); + local_irq_enable(); + break; + default: + vcpu->run->dcr.dcrn = dcrn; + vcpu->run->dcr.data = 0; + vcpu->run->dcr.is_write = 0; + vcpu->arch.io_gpr = rt; + vcpu->arch.dcr_needed = 1; + kvmppc_account_exit(vcpu, DCR_EXITS); + return EMULATE_DO_DCR; + } + + return EMULATE_DONE; +} + int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance) { @@ -68,38 +106,12 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, switch (get_xop(inst)) { case XOP_MFDCR: - /* The guest may access CPR0 registers to determine the timebase - * frequency, and it must know the real host frequency because it - * can directly access the timebase registers. - * - * It would be possible to emulate those accesses in userspace, - * but userspace can really only figure out the end frequency. - * We could decompose that into the factors that compute it, but - * that's tricky math, and it's easier to just report the real - * CPR0 values. - */ - switch (dcrn) { - case DCRN_CPR0_CONFIG_ADDR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr); - break; - case DCRN_CPR0_CONFIG_DATA: - local_irq_disable(); - mtdcr(DCRN_CPR0_CONFIG_ADDR, - vcpu->arch.cpr0_cfgaddr); - kvmppc_set_gpr(vcpu, rt, - mfdcr(DCRN_CPR0_CONFIG_DATA)); - local_irq_enable(); - break; - default: - run->dcr.dcrn = dcrn; - run->dcr.data = 0; - run->dcr.is_write = 0; - vcpu->arch.io_gpr = rt; - vcpu->arch.dcr_needed = 1; - kvmppc_account_exit(vcpu, DCR_EXITS); - emulated = EMULATE_DO_DCR; - } + emulated = emulate_mfdcr(vcpu, rt, dcrn); + break; + case XOP_MFDCRX: + emulated = emulate_mfdcr(vcpu, rt, + kvmppc_get_gpr(vcpu, ra)); break; case XOP_MTDCR: -- cgit v0.10.2 From 7a08c2740f07fb8c3769d1f137721835ead7652f Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Thu, 16 Aug 2012 13:10:16 +0200 Subject: KVM: PPC: BookE: Support FPU on non-hv systems When running on HV aware hosts, we can not trap when the guest sets the FP bit, so we just let it do so when it wants to, because it has full access to MSR. For non-HV aware hosts with an FPU (like 440), we need to also adjust the shadow MSR though. Otherwise the guest gets an FP unavailable trap even when it really enabled the FP bit in MSR. 
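A condensed sketch of the mechanism (it mirrors the booke.c hunk below rather than adding anything new): on non-HV hosts with an FPU, the guest-visible MSR[FP] bit is copied into the shadow MSR that the host really loads, and the sync runs from kvmppc_set_msr() alongside the existing SPE sync, so a guest enabling FP no longer traps spuriously.

    /* Sketch: keep the shadow MSR's FP bit in step with the guest's MSR. */
    static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu)
    {
    #if defined(CONFIG_PPC_FPU) && !defined(CONFIG_KVM_BOOKE_HV)
            vcpu->arch.shadow_msr &= ~MSR_FP;
            vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_FP;
    #endif
    }
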
Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 959aae9..5f0476a 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -122,6 +122,16 @@ static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu) } #endif +static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu) +{ +#if defined(CONFIG_PPC_FPU) && !defined(CONFIG_KVM_BOOKE_HV) + /* We always treat the FP bit as enabled from the host + perspective, so only need to adjust the shadow MSR */ + vcpu->arch.shadow_msr &= ~MSR_FP; + vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_FP; +#endif +} + /* * Helper function for "full" MSR writes. No need to call this if only * EE/CE/ME/DE/RI are changing. @@ -138,6 +148,7 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) kvmppc_mmu_msr_notify(vcpu, old_msr); kvmppc_vcpu_sync_spe(vcpu); + kvmppc_vcpu_sync_fpu(vcpu); } static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu, -- cgit v0.10.2 From d61966fc08b84857b697ebae4489c652dd87e48a Mon Sep 17 00:00:00 2001 From: Mihai Caraman Date: Wed, 12 Sep 2012 03:18:14 +0000 Subject: KVM: PPC: bookehv: Allow duplicate calls of DO_KVM macro The current form of DO_KVM macro restricts its use to one call per input parameter set. This is caused by kvmppc_resume_\intno\()_\srr1 symbol definition. Duplicate calls of DO_KVM are required by distinct implementations of exeption handlers which are delegated at runtime. Use a rare label number to avoid conflicts with the calling contexts. Signed-off-by: Mihai Caraman Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_booke_hv_asm.h b/arch/powerpc/include/asm/kvm_booke_hv_asm.h index 30a600f..a37a12a 100644 --- a/arch/powerpc/include/asm/kvm_booke_hv_asm.h +++ b/arch/powerpc/include/asm/kvm_booke_hv_asm.h @@ -38,9 +38,9 @@ #ifdef CONFIG_KVM_BOOKE_HV BEGIN_FTR_SECTION mtocrf 0x80, r11 /* check MSR[GS] without clobbering reg */ - bf 3, kvmppc_resume_\intno\()_\srr1 + bf 3, 1975f b kvmppc_handler_\intno\()_\srr1 -kvmppc_resume_\intno\()_\srr1: +1975: END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) #endif .endm -- cgit v0.10.2 From 2c9097e4c1340208ef93371abd4b3bd7e989381b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 11 Sep 2012 13:27:01 +0000 Subject: KVM: PPC: Book3S HV: Take the SRCU read lock before looking up memslots The generic KVM code uses SRCU (sleeping RCU) to protect accesses to the memslots data structures against updates due to userspace adding, modifying or removing memory slots. We need to do that too, both to avoid accessing stale copies of the memslots and to avoid lockdep warnings. This therefore adds srcu_read_lock/unlock pairs around code that accesses and uses memslots. Since the real-mode handlers for H_ENTER, H_REMOVE and H_BULK_REMOVE need to access the memslots, and we don't want to call the SRCU code in real mode (since we have no assurance that it would only access the linear mapping), we hold the SRCU read lock for the VM while in the guest. This does mean that adding or removing memory slots while some vcpus are executing in the guest will block for up to two jiffies. This tradeoff is acceptable since adding/removing memory slots only happens rarely, while H_ENTER/H_REMOVE/H_BULK_REMOVE are performance-critical hot paths. 
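A minimal sketch of the locking pattern this adds (names as in the hunks below): every process-context memslot lookup is now bracketed by an SRCU read-side critical section, and kvmppc_run_core() holds the same read lock across guest execution so the real-mode H_ENTER/H_REMOVE handlers stay covered.

    /* Sketch of the read-side pattern used around memslot accesses. */
    int srcu_idx;

    srcu_idx = srcu_read_lock(&kvm->srcu);
    memslot = gfn_to_memslot(kvm, gfn);
    /* ... translate, pin or fault in the page via the memslot ... */
    srcu_read_unlock(&kvm->srcu, srcu_idx);
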
Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index d95d113..0f031c0 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -1057,20 +1058,22 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, unsigned long hva, psize, offset; unsigned long pa; unsigned long *physp; + int srcu_idx; + srcu_idx = srcu_read_lock(&kvm->srcu); memslot = gfn_to_memslot(kvm, gfn); if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) - return NULL; + goto err; if (!kvm->arch.using_mmu_notifiers) { physp = kvm->arch.slot_phys[memslot->id]; if (!physp) - return NULL; + goto err; physp += gfn - memslot->base_gfn; pa = *physp; if (!pa) { if (kvmppc_get_guest_page(kvm, gfn, memslot, PAGE_SIZE) < 0) - return NULL; + goto err; pa = *physp; } page = pfn_to_page(pa >> PAGE_SHIFT); @@ -1079,9 +1082,11 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, hva = gfn_to_hva_memslot(memslot, gfn); npages = get_user_pages_fast(hva, 1, 1, pages); if (npages < 1) - return NULL; + goto err; page = pages[0]; } + srcu_read_unlock(&kvm->srcu, srcu_idx); + psize = PAGE_SIZE; if (PageHuge(page)) { page = compound_head(page); @@ -1091,6 +1096,10 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, if (nb_ret) *nb_ret = psize - offset; return page_address(page) + offset; + + err: + srcu_read_unlock(&kvm->srcu, srcu_idx); + return NULL; } void kvmppc_unpin_guest_page(struct kvm *kvm, void *va) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 83e929e..48b0d4a 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -366,13 +367,16 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) unsigned long req = kvmppc_get_gpr(vcpu, 3); unsigned long target, ret = H_SUCCESS; struct kvm_vcpu *tvcpu; + int idx; switch (req) { case H_ENTER: + idx = srcu_read_lock(&vcpu->kvm->srcu); ret = kvmppc_virtmode_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4), kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6), kvmppc_get_gpr(vcpu, 7)); + srcu_read_unlock(&vcpu->kvm->srcu, idx); break; case H_CEDE: break; @@ -411,6 +415,7 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, struct task_struct *tsk) { int r = RESUME_HOST; + int srcu_idx; vcpu->stat.sum_exits++; @@ -470,12 +475,16 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, * have been handled already. */ case BOOK3S_INTERRUPT_H_DATA_STORAGE: + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvmppc_book3s_hv_page_fault(run, vcpu, vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); break; case BOOK3S_INTERRUPT_H_INST_STORAGE: + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvmppc_book3s_hv_page_fault(run, vcpu, kvmppc_get_pc(vcpu), 0); + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); break; /* * This occurs if the guest executes an illegal instruction. 
@@ -820,6 +829,7 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) long ret; u64 now; int ptid, i, need_vpa_update; + int srcu_idx; /* don't start if any threads have a signal pending */ need_vpa_update = 0; @@ -898,6 +908,9 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) spin_unlock(&vc->lock); kvm_guest_enter(); + + srcu_idx = srcu_read_lock(&vcpu0->kvm->srcu); + __kvmppc_vcore_entry(NULL, vcpu0); for (i = 0; i < threads_per_core; ++i) kvmppc_release_hwthread(vc->pcpu + i); @@ -913,6 +926,8 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) vc->vcore_state = VCORE_EXITING; spin_unlock(&vc->lock); + srcu_read_unlock(&vcpu0->kvm->srcu, srcu_idx); + /* make sure updates to secondary vcpu structs are visible now */ smp_mb(); kvm_guest_exit(); @@ -1362,6 +1377,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) unsigned long rmls; unsigned long *physp; unsigned long i, npages; + int srcu_idx; mutex_lock(&kvm->lock); if (kvm->arch.rma_setup_done) @@ -1377,12 +1393,13 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) } /* Look up the memslot for guest physical address 0 */ + srcu_idx = srcu_read_lock(&kvm->srcu); memslot = gfn_to_memslot(kvm, 0); /* We must have some memory at 0 by now */ err = -EINVAL; if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) - goto out; + goto out_srcu; /* Look up the VMA for the start of this memory slot */ hva = memslot->userspace_addr; @@ -1406,14 +1423,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) err = -EPERM; if (cpu_has_feature(CPU_FTR_ARCH_201)) { pr_err("KVM: CPU requires an RMO\n"); - goto out; + goto out_srcu; } /* We can handle 4k, 64k or 16M pages in the VRMA */ err = -EINVAL; if (!(psize == 0x1000 || psize == 0x10000 || psize == 0x1000000)) - goto out; + goto out_srcu; /* Update VRMASD field in the LPCR */ senc = slb_pgsize_encoding(psize); @@ -1436,7 +1453,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) err = -EINVAL; if (rmls < 0) { pr_err("KVM: Can't use RMA of 0x%lx bytes\n", rma_size); - goto out; + goto out_srcu; } atomic_inc(&ri->use_count); kvm->arch.rma = ri; @@ -1476,6 +1493,8 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) smp_wmb(); kvm->arch.rma_setup_done = 1; err = 0; + out_srcu: + srcu_read_unlock(&kvm->srcu, srcu_idx); out: mutex_unlock(&kvm->lock); return err; -- cgit v0.10.2 From a66b48c3a39fa1c4223d4f847fdc7a04ed1618de Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 11 Sep 2012 13:27:46 +0000 Subject: KVM: PPC: Move kvm->arch.slot_phys into memslot.arch Now that we have an architecture-specific field in the kvm_memory_slot structure, we can use it to store the array of page physical addresses that we need for Book3S HV KVM on PPC970 processors. This reduces the size of struct kvm_arch for Book3S HV, and also reduces the size of struct kvm_arch_memory_slot for other PPC KVM variants since the fields in it are now only compiled in for Book3S HV. This necessitates making the kvm_arch_create_memslot and kvm_arch_free_memslot operations specific to each PPC KVM variant. That in turn means that we now don't allocate the rmap arrays on Book3S PR and Book E. Since we now unpin pages and free the slot_phys array in kvmppc_core_free_memslot, we no longer need to do it in kvmppc_core_destroy_vm, since the generic code takes care to free all the memslots when destroying a VM. We now need the new memslot to be passed in to kvmppc_core_prepare_memory_region, since we need to initialize its arch.slot_phys member on Book3S HV. 
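For orientation, a condensed sketch of the new layout and lookup (taken from the kvm_host.h and book3s_64_mmu_hv.c hunks below): the physical-address array hangs off each memslot instead of a kvm->arch table indexed by slot id, so freeing a memslot can unpin its pages in one place.

    /* Sketch: per-memslot storage and a gfn lookup after this patch. */
    struct kvm_arch_memory_slot {
    #ifdef CONFIG_KVM_BOOK3S_64_HV
            unsigned long *rmap;
            unsigned long *slot_phys;
    #endif
    };

    /* e.g. in kvmppc_get_guest_page(): */
    unsigned long *physp = memslot->arch.slot_phys;
    unsigned long pa = physp[gfn - memslot->base_gfn];
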
Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index f20a5ef..68f5a30 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -204,7 +204,7 @@ struct revmap_entry { }; /* - * We use the top bit of each memslot->rmap entry as a lock bit, + * We use the top bit of each memslot->arch.rmap entry as a lock bit, * and bit 32 as a present flag. The bottom 32 bits are the * index in the guest HPT of a HPTE that points to the page. */ @@ -215,14 +215,17 @@ struct revmap_entry { #define KVMPPC_RMAP_PRESENT 0x100000000ul #define KVMPPC_RMAP_INDEX 0xfffffffful -/* Low-order bits in kvm->arch.slot_phys[][] */ +/* Low-order bits in memslot->arch.slot_phys[] */ #define KVMPPC_PAGE_ORDER_MASK 0x1f #define KVMPPC_PAGE_NO_CACHE HPTE_R_I /* 0x20 */ #define KVMPPC_PAGE_WRITETHRU HPTE_R_W /* 0x40 */ #define KVMPPC_GOT_PAGE 0x80 struct kvm_arch_memory_slot { +#ifdef CONFIG_KVM_BOOK3S_64_HV unsigned long *rmap; + unsigned long *slot_phys; +#endif /* CONFIG_KVM_BOOK3S_64_HV */ }; struct kvm_arch { @@ -246,8 +249,6 @@ struct kvm_arch { unsigned long hpt_npte; unsigned long hpt_mask; spinlock_t slot_phys_lock; - unsigned long *slot_phys[KVM_MEM_SLOTS_NUM]; - int slot_npages[KVM_MEM_SLOTS_NUM]; unsigned short last_vcpu[NR_CPUS]; struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; struct kvmppc_linear_info *hpt_li; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index c06a64b..41a00ea 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -143,7 +143,12 @@ extern struct kvmppc_linear_info *kvm_alloc_hpt(void); extern void kvm_release_hpt(struct kvmppc_linear_info *li); extern int kvmppc_core_init_vm(struct kvm *kvm); extern void kvmppc_core_destroy_vm(struct kvm *kvm); +extern void kvmppc_core_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont); +extern int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, + unsigned long npages); extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, struct kvm_userspace_memory_region *mem); extern void kvmppc_core_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 0f031c0..a389cc6 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -261,7 +261,7 @@ static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) /* * This is called to get a reference to a guest page if there isn't - * one already in the kvm->arch.slot_phys[][] arrays. + * one already in the memslot->arch.slot_phys[] array. 
*/ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, struct kvm_memory_slot *memslot, @@ -276,7 +276,7 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, struct vm_area_struct *vma; unsigned long pfn, i, npages; - physp = kvm->arch.slot_phys[memslot->id]; + physp = memslot->arch.slot_phys; if (!physp) return -EINVAL; if (physp[gfn - memslot->base_gfn]) @@ -1065,7 +1065,7 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) goto err; if (!kvm->arch.using_mmu_notifiers) { - physp = kvm->arch.slot_phys[memslot->id]; + physp = memslot->arch.slot_phys; if (!physp) goto err; physp += gfn - memslot->base_gfn; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 48b0d4a..817837d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1314,48 +1314,67 @@ static unsigned long slb_pgsize_encoding(unsigned long psize) return senc; } -int kvmppc_core_prepare_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) +static void unpin_slot(struct kvm_memory_slot *memslot) { - unsigned long npages; - unsigned long *phys; + unsigned long *physp; + unsigned long j, npages, pfn; + struct page *page; - /* Allocate a slot_phys array */ - phys = kvm->arch.slot_phys[mem->slot]; - if (!kvm->arch.using_mmu_notifiers && !phys) { - npages = mem->memory_size >> PAGE_SHIFT; - phys = vzalloc(npages * sizeof(unsigned long)); - if (!phys) - return -ENOMEM; - kvm->arch.slot_phys[mem->slot] = phys; - kvm->arch.slot_npages[mem->slot] = npages; + physp = memslot->arch.slot_phys; + npages = memslot->npages; + if (!physp) + return; + for (j = 0; j < npages; j++) { + if (!(physp[j] & KVMPPC_GOT_PAGE)) + continue; + pfn = physp[j] >> PAGE_SHIFT; + page = pfn_to_page(pfn); + SetPageDirty(page); + put_page(page); + } +} + +void kvmppc_core_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + if (!dont || free->arch.rmap != dont->arch.rmap) { + vfree(free->arch.rmap); + free->arch.rmap = NULL; } + if (!dont || free->arch.slot_phys != dont->arch.slot_phys) { + unpin_slot(free); + vfree(free->arch.slot_phys); + free->arch.slot_phys = NULL; + } +} + +int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, + unsigned long npages) +{ + slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); + if (!slot->arch.rmap) + return -ENOMEM; + slot->arch.slot_phys = NULL; return 0; } -static void unpin_slot(struct kvm *kvm, int slot_id) +int kvmppc_core_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem) { - unsigned long *physp; - unsigned long j, npages, pfn; - struct page *page; + unsigned long *phys; - physp = kvm->arch.slot_phys[slot_id]; - npages = kvm->arch.slot_npages[slot_id]; - if (physp) { - spin_lock(&kvm->arch.slot_phys_lock); - for (j = 0; j < npages; j++) { - if (!(physp[j] & KVMPPC_GOT_PAGE)) - continue; - pfn = physp[j] >> PAGE_SHIFT; - page = pfn_to_page(pfn); - SetPageDirty(page); - put_page(page); - } - kvm->arch.slot_phys[slot_id] = NULL; - spin_unlock(&kvm->arch.slot_phys_lock); - vfree(physp); + /* Allocate a slot_phys array if needed */ + phys = memslot->arch.slot_phys; + if (!kvm->arch.using_mmu_notifiers && !phys && memslot->npages) { + phys = vzalloc(memslot->npages * sizeof(unsigned long)); + if (!phys) + return -ENOMEM; + memslot->arch.slot_phys = phys; } + + return 0; } void kvmppc_core_commit_memory_region(struct kvm *kvm, @@ -1482,11 
+1501,16 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) /* Initialize phys addrs of pages in RMO */ npages = ri->npages; porder = __ilog2(npages); - physp = kvm->arch.slot_phys[memslot->id]; - spin_lock(&kvm->arch.slot_phys_lock); - for (i = 0; i < npages; ++i) - physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) + porder; - spin_unlock(&kvm->arch.slot_phys_lock); + physp = memslot->arch.slot_phys; + if (physp) { + if (npages > memslot->npages) + npages = memslot->npages; + spin_lock(&kvm->arch.slot_phys_lock); + for (i = 0; i < npages; ++i) + physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) + + porder; + spin_unlock(&kvm->arch.slot_phys_lock); + } } /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */ @@ -1547,12 +1571,6 @@ int kvmppc_core_init_vm(struct kvm *kvm) void kvmppc_core_destroy_vm(struct kvm *kvm) { - unsigned long i; - - if (!kvm->arch.using_mmu_notifiers) - for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) - unpin_slot(kvm, i); - if (kvm->arch.rma) { kvm_release_rma(kvm->arch.rma); kvm->arch.rma = NULL; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index fb0e821..63eb94e 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -183,7 +183,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, rmap = &memslot->arch.rmap[slot_fn]; if (!kvm->arch.using_mmu_notifiers) { - physp = kvm->arch.slot_phys[memslot->id]; + physp = memslot->arch.slot_phys; if (!physp) return H_PARAMETER; physp += slot_fn; diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index b3c584f..fdadc9e 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1220,7 +1220,19 @@ int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info) } #endif /* CONFIG_PPC64 */ +void kvmppc_core_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, + unsigned long npages) +{ + return 0; +} + int kvmppc_core_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, struct kvm_userspace_memory_region *mem) { return 0; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 5f0476a..5144057 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1438,7 +1438,19 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) return -ENOTSUPP; } +void kvmppc_core_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, + unsigned long npages) +{ + return 0; +} + int kvmppc_core_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, struct kvm_userspace_memory_region *mem) { return 0; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 0ffd7d1..33122dd 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -389,19 +389,12 @@ long kvm_arch_dev_ioctl(struct file *filp, void kvm_arch_free_memslot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { - if (!dont || free->arch.rmap != dont->arch.rmap) { - vfree(free->arch.rmap); - free->arch.rmap = NULL; - } + kvmppc_core_free_memslot(free, dont); } int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) { - slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); - if (!slot->arch.rmap) - return -ENOMEM; - - return 0; + return kvmppc_core_create_memslot(slot, npages); } int 
kvm_arch_prepare_memory_region(struct kvm *kvm, @@ -410,7 +403,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, int user_alloc) { - return kvmppc_core_prepare_memory_region(kvm, mem); + return kvmppc_core_prepare_memory_region(kvm, memslot, mem); } void kvm_arch_commit_memory_region(struct kvm *kvm, -- cgit v0.10.2 From dfe49dbd1fc7310a4e0e2f83ae737cd7d34fa0cd Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 11 Sep 2012 13:28:18 +0000 Subject: KVM: PPC: Book3S HV: Handle memory slot deletion and modification correctly This adds an implementation of kvm_arch_flush_shadow_memslot for Book3S HV, and arranges for kvmppc_core_commit_memory_region to flush the dirty log when modifying an existing slot. With this, we can handle deletion and modification of memory slots. kvm_arch_flush_shadow_memslot calls kvmppc_core_flush_memslot, which on Book3S HV now traverses the reverse map chains to remove any HPT (hashed page table) entries referring to pages in the memslot. This gets called by generic code whenever deleting a memslot or changing the guest physical address for a memslot. We flush the dirty log in kvmppc_core_commit_memory_region for consistency with what x86 does. We only need to flush when an existing memslot is being modified, because for a new memslot the rmap array (which stores the dirty bits) is all zero, meaning that every page is considered clean already, and when deleting a memslot we obviously don't care about the dirty bits any more. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index f0e0c6a..ab73800 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -160,7 +160,7 @@ extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, extern long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel); extern long kvmppc_hv_get_dirty_log(struct kvm *kvm, - struct kvm_memory_slot *memslot); + struct kvm_memory_slot *memslot, unsigned long *map); extern void kvmppc_entry_trampoline(void); extern void kvmppc_hv_entry_trampoline(void); diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 41a00ea..3fb980d 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -151,9 +151,12 @@ extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_userspace_memory_region *mem); extern void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem); + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old); extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info); +extern void kvmppc_core_flush_memslot(struct kvm *kvm, + struct kvm_memory_slot *memslot); extern int kvmppc_bookehv_init(void); extern void kvmppc_bookehv_exit(void); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index a389cc6..f598366 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -851,7 +851,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, psize = hpte_page_size(hptep[0], ptel); if ((hptep[0] & HPTE_V_VALID) && hpte_rpn(ptel, psize) == gfn) { - hptep[0] |= HPTE_V_ABSENT; + if (kvm->arch.using_mmu_notifiers) + hptep[0] |= HPTE_V_ABSENT; 
kvmppc_invalidate_hpte(kvm, hptep, i); /* Harvest R and C */ rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C); @@ -878,6 +879,28 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) return 0; } +void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) +{ + unsigned long *rmapp; + unsigned long gfn; + unsigned long n; + + rmapp = memslot->arch.rmap; + gfn = memslot->base_gfn; + for (n = memslot->npages; n; --n) { + /* + * Testing the present bit without locking is OK because + * the memslot has been marked invalid already, and hence + * no new HPTEs referencing this page can be created, + * thus the present bit can't go from 0 to 1. + */ + if (*rmapp & KVMPPC_RMAP_PRESENT) + kvm_unmap_rmapp(kvm, rmapp, gfn); + ++rmapp; + ++gfn; + } +} + static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long gfn) { @@ -1031,16 +1054,16 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp) return ret; } -long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) +long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long *map) { unsigned long i; - unsigned long *rmapp, *map; + unsigned long *rmapp; preempt_disable(); rmapp = memslot->arch.rmap; - map = memslot->dirty_bitmap; for (i = 0; i < memslot->npages; ++i) { - if (kvm_test_clear_dirty(kvm, rmapp)) + if (kvm_test_clear_dirty(kvm, rmapp) && map) __set_bit_le(i, map); ++rmapp; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 817837d..38c7f1b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1288,7 +1288,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) n = kvm_dirty_bitmap_bytes(memslot); memset(memslot->dirty_bitmap, 0, n); - r = kvmppc_hv_get_dirty_log(kvm, memslot); + r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap); if (r) goto out; @@ -1378,8 +1378,22 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, } void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old) { + unsigned long npages = mem->memory_size >> PAGE_SHIFT; + struct kvm_memory_slot *memslot; + + if (npages && old.npages) { + /* + * If modifying a memslot, reset all the rmap dirty bits. + * If this is a new memslot, we don't need to do anything + * since the rmap array starts out as all zeroes, + * i.e. no pages are dirty. 
+ */ + memslot = id_to_memslot(kvm->memslots, mem->slot); + kvmppc_hv_get_dirty_log(kvm, memslot, NULL); + } } static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 63eb94e..9955216 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -81,7 +81,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, ptel = rev->guest_rpte |= rcbits; gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel)); memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); - if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) + if (!memslot) return; rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]); diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index fdadc9e..4d0667a 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1239,7 +1239,12 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, } void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old) +{ +} + +void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 5144057..3a6490f 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1457,7 +1457,12 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, } void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old) +{ +} + +void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 33122dd..8443e23 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -411,7 +411,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_memory_slot old, int user_alloc) { - kvmppc_core_commit_memory_region(kvm, mem); + kvmppc_core_commit_memory_region(kvm, mem, old); } void kvm_arch_flush_shadow_all(struct kvm *kvm) @@ -421,6 +421,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { + kvmppc_core_flush_memslot(kvm, slot); } struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) -- cgit v0.10.2 From ed7a8d7a3c5dd4adedb271112b6faba45c7037f9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 13 Sep 2012 21:44:30 +0000 Subject: KVM: Move some PPC ioctl definitions to the correct place This moves the definitions of KVM_CREATE_SPAPR_TCE and KVM_ALLOCATE_RMA in include/linux/kvm.h from the section listing the vcpu ioctls to the section listing VM ioctls, as these are both implemented and documented as VM ioctls. Fortunately there is no actual collision of ioctl numbers at this point. Moving these to the correct section will reduce the probability of a future collision. This does not change the user/kernel ABI at all. 
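For illustration only (not part of this change): since both of these are VM ioctls, userspace issues them on the VM file descriptor rather than on a vcpu file descriptor. A minimal sketch, assuming vm_fd is an open KVM VM fd and with error handling omitted:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: exercise one of the relocated VM ioctls on the VM fd.
 * On success the kernel returns a new fd for the RMA and fills in
 * rma->rma_size; this is an illustrative helper, not kernel code. */
static int allocate_rma(int vm_fd, struct kvm_allocate_rma *rma)
{
	return ioctl(vm_fd, KVM_ALLOCATE_RMA, rma);
}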
Signed-off-by: Paul Mackerras Acked-by: Alexander Graf Signed-off-by: Alexander Graf diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 1649d4b..65ad5c6 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -852,6 +852,9 @@ struct kvm_s390_ucas_mapping { #define KVM_PPC_GET_SMMU_INFO _IOR(KVMIO, 0xa6, struct kvm_ppc_smmu_info) /* Available with KVM_CAP_PPC_ALLOC_HTAB */ #define KVM_PPC_ALLOCATE_HTAB _IOWR(KVMIO, 0xa7, __u32) +#define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce) +/* Available with KVM_CAP_RMA */ +#define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma) /* * ioctls for vcpu fds @@ -915,9 +918,6 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_XCRS */ #define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) #define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) -#define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce) -/* Available with KVM_CAP_RMA */ -#define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma) /* Available with KVM_CAP_SW_TLB */ #define KVM_DIRTY_TLB _IOW(KVMIO, 0xaa, struct kvm_dirty_tlb) /* Available with KVM_CAP_ONE_REG */ -- cgit v0.10.2 From a47d72f3613d5edfd8e752c9b804d7df35810649 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 20 Sep 2012 19:35:51 +0000 Subject: KVM: PPC: Book3S HV: Fix updates of vcpu->cpu This removes the powerpc "generic" updates of vcpu->cpu in load and put, and moves them to the various backends. The reason is that "HV" KVM does its own sauce with that field and the generic updates might corrupt it. The field contains the CPU# of the -first- HW CPU of the core always for all the VCPU threads of a core (the one that's online from a host Linux perspective). However, the preempt notifiers are going to be called on the threads VCPUs when they are running (due to them sleeping on our private waitqueue) causing unload to be called, potentially clobbering the value. 
Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 4d0667a..bf3ec5d 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -64,7 +64,7 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max; svcpu_put(svcpu); #endif - + vcpu->cpu = smp_processor_id(); #ifdef CONFIG_PPC_BOOK3S_32 current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu; #endif @@ -84,6 +84,7 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) kvmppc_giveup_ext(vcpu, MSR_FP); kvmppc_giveup_ext(vcpu, MSR_VEC); kvmppc_giveup_ext(vcpu, MSR_VSX); + vcpu->cpu = -1; } int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 3a6490f..69d047c 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1509,12 +1509,14 @@ void kvmppc_decrementer_func(unsigned long data) void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { + vcpu->cpu = smp_processor_id(); current->thread.kvm_vcpu = vcpu; } void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu) { current->thread.kvm_vcpu = NULL; + vcpu->cpu = -1; } int __init kvmppc_booke_init(void) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 8443e23..6002ea9 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -504,7 +504,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); #endif kvmppc_core_vcpu_load(vcpu, cpu); - vcpu->cpu = smp_processor_id(); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -513,7 +512,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) #ifdef CONFIG_BOOKE vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); #endif - vcpu->cpu = -1; } int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, -- cgit v0.10.2 From 964ee98ccde0534548565a201827cf06d813180f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 20 Sep 2012 19:36:32 +0000 Subject: KVM: PPC: Book3S HV: Remove bogus update of physical thread IDs When making a vcpu non-runnable we incorrectly changed the thread IDs of all other threads on the core, just remove that code. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 38c7f1b..c9ae314 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -706,17 +706,11 @@ extern void xics_wake_cpu(int cpu); static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, struct kvm_vcpu *vcpu) { - struct kvm_vcpu *v; - if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) return; vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; --vc->n_runnable; ++vc->n_busy; - /* decrement the physical thread id of each following vcpu */ - v = vcpu; - list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list) - --v->arch.ptid; list_del(&vcpu->arch.run_list); } -- cgit v0.10.2 From 70bddfefbdcdbfdebd81d8b59ff8a7fa5d450ccc Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 20 Sep 2012 19:39:21 +0000 Subject: KVM: PPC: Book3S HV: Fix calculation of guest phys address for MMIO emulation In the case where the host kernel is using a 64kB base page size and the guest uses a 4k HPTE (hashed page table entry) to map an emulated MMIO device, we were calculating the guest physical address wrongly. 
We were calculating a gfn as the guest physical address shifted right 16 bits (PAGE_SHIFT) but then only adding back in 12 bits from the effective address, since the HPTE had a 4k page size. Thus the gpa reported to userspace was missing 4 bits. Instead, we now compute the guest physical address from the HPTE without reference to the host page size, and then compute the gfn by shifting the gpa right PAGE_SHIFT bits. Reported-by: Alexey Kardashevskiy Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index f598366..7a4aae9 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -571,7 +571,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, struct kvm *kvm = vcpu->kvm; unsigned long *hptep, hpte[3], r; unsigned long mmu_seq, psize, pte_size; - unsigned long gfn, hva, pfn; + unsigned long gpa, gfn, hva, pfn; struct kvm_memory_slot *memslot; unsigned long *rmap; struct revmap_entry *rev; @@ -609,15 +609,14 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* Translate the logical address and get the page */ psize = hpte_page_size(hpte[0], r); - gfn = hpte_rpn(r, psize); + gpa = (r & HPTE_R_RPN & ~(psize - 1)) | (ea & (psize - 1)); + gfn = gpa >> PAGE_SHIFT; memslot = gfn_to_memslot(kvm, gfn); /* No memslot means it's an emulated MMIO region */ - if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { - unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1)); + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, dsisr & DSISR_ISSTORE); - } if (!kvm->arch.using_mmu_notifiers) return -EFAULT; /* should never get here */ -- cgit v0.10.2 From e400e72f250d2567e89c9bafb47ab91e8d9a15a2 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Wed, 22 Aug 2012 15:04:23 +0000 Subject: KVM: PPC: e500: fix allocation size error on g2h_tlb1_map We were only allocating half the bytes we need, which was made more obvious by a recent fix to the memset in clear_tlb1_bitmap(). Signed-off-by: Scott Wood Signed-off-by: Alexander Graf Cc: stable@vger.kernel.org diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 43489a8..a27d134 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -1385,7 +1385,7 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) if (!vcpu_e500->gtlb_priv[1]) goto err; - vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(unsigned int) * + vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(u64) * vcpu_e500->gtlb_params[1].entries, GFP_KERNEL); if (!vcpu_e500->g2h_tlb1_map) -- cgit v0.10.2 From adbb48a854bf8dee556dc42b96dd61503351a82d Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Wed, 22 Aug 2012 15:04:24 +0000 Subject: KVM: PPC: e500: MMU API: fix leak of shared_tlb_pages This was found by kmemleak. 
Signed-off-by: Scott Wood Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index a27d134..641f978 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -1134,6 +1134,8 @@ static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500) } vcpu_e500->num_shared_tlb_pages = 0; + + kfree(vcpu_e500->shared_tlb_pages); vcpu_e500->shared_tlb_pages = NULL; } else { kfree(vcpu_e500->gtlb_arch); -- cgit v0.10.2 From 5bd1cf118533aba41b3fbd4834e6362a9237db71 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Wed, 22 Aug 2012 15:03:50 +0000 Subject: KVM: PPC: set IN_GUEST_MODE before checking requests Avoid a race as described in the code comment. Also remove a related smp_wmb() from booke's kvmppc_prepare_to_enter(). I can't see any reason for it, and the book3s_pr version doesn't have it. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 69d047c..3d1f35d 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -674,7 +674,6 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) out: vcpu->mode = OUTSIDE_GUEST_MODE; - smp_wmb(); return ret; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 6002ea9..deb0d59 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -78,7 +78,16 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) break; } + vcpu->mode = IN_GUEST_MODE; + + /* + * Reading vcpu->requests must happen after setting vcpu->mode, + * so we don't miss a request because the requester sees + * OUTSIDE_GUEST_MODE and assumes we'll be checking requests + * before next entering the guest (and thus doesn't IPI). + */ smp_mb(); + if (vcpu->requests) { /* Make sure we process requests preemptable */ local_irq_enable(); @@ -111,11 +120,6 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) #endif kvm_guest_enter(); - - /* Going into guest context! Yay! */ - vcpu->mode = IN_GUEST_MODE; - smp_wmb(); - break; } -- cgit v0.10.2 From a136a8bdc02fc14625ac45ee846cc646fc46597e Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 25 Sep 2012 20:31:56 +0000 Subject: KVM: PPC: Book3S: Get/set guest SPRs using the GET/SET_ONE_REG interface This enables userspace to get and set various SPRs (special-purpose registers) using the KVM_[GS]ET_ONE_REG ioctls. With this, userspace can get and set all the SPRs that are part of the guest state, either through the KVM_[GS]ET_REGS ioctls, the KVM_[GS]ET_SREGS ioctls, or the KVM_[GS]ET_ONE_REG ioctls. The SPRs that are added here are: - DABR: Data address breakpoint register - DSCR: Data stream control register - PURR: Processor utilization of resources register - SPURR: Scaled PURR - DAR: Data address register - DSISR: Data storage interrupt status register - AMR: Authority mask register - UAMOR: User authority mask override register - MMCR0, MMCR1, MMCRA: Performance monitor unit control registers - PMC1..PMC8: Performance monitor unit counter registers In order to reduce code duplication between PR and HV KVM code, this moves the kvm_vcpu_ioctl_[gs]et_one_reg functions into book3s.c and centralizes the copying between user and kernel space there. The registers that are handled differently between PR and HV, and those that exist only in one flavor, are handled in kvmppc_[gs]et_one_reg() functions that are specific to each flavor. 
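As a rough illustration only (not part of this patch), userspace can drive the new interface along the following lines; this is a minimal sketch that assumes vcpu_fd is an open vcpu file descriptor and that the header providing the KVM_REG_PPC_* ids above is in use:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Read the guest DSCR via KVM_GET_ONE_REG (sketch, error handling omitted). */
static int get_guest_dscr(int vcpu_fd, uint64_t *dscr)
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_DSCR,	/* 64-bit SPR id defined above */
		.addr = (uintptr_t)dscr,	/* userspace buffer the kernel fills */
	};

	return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
}

/* Write it back via KVM_SET_ONE_REG. */
static int set_guest_dscr(int vcpu_fd, uint64_t dscr)
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_DSCR,
		.addr = (uintptr_t)&dscr,
	};

	return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
}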
Signed-off-by: Paul Mackerras [agraf: minimal style fixes] Signed-off-by: Alexander Graf diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index c84739e..af9b7a0 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1740,6 +1740,25 @@ registers, find a list below: PPC | KVM_REG_PPC_IAC4 | 64 PPC | KVM_REG_PPC_DAC1 | 64 PPC | KVM_REG_PPC_DAC2 | 64 + PPC | KVM_REG_PPC_DABR | 64 + PPC | KVM_REG_PPC_DSCR | 64 + PPC | KVM_REG_PPC_PURR | 64 + PPC | KVM_REG_PPC_SPURR | 64 + PPC | KVM_REG_PPC_DAR | 64 + PPC | KVM_REG_PPC_DSISR | 32 + PPC | KVM_REG_PPC_AMR | 64 + PPC | KVM_REG_PPC_UAMOR | 64 + PPC | KVM_REG_PPC_MMCR0 | 64 + PPC | KVM_REG_PPC_MMCR1 | 64 + PPC | KVM_REG_PPC_MMCRA | 64 + PPC | KVM_REG_PPC_PMC1 | 32 + PPC | KVM_REG_PPC_PMC2 | 32 + PPC | KVM_REG_PPC_PMC3 | 32 + PPC | KVM_REG_PPC_PMC4 | 32 + PPC | KVM_REG_PPC_PMC5 | 32 + PPC | KVM_REG_PPC_PMC6 | 32 + PPC | KVM_REG_PPC_PMC7 | 32 + PPC | KVM_REG_PPC_PMC8 | 32 4.69 KVM_GET_ONE_REG diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index 3c14202..9557576 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -338,5 +338,26 @@ struct kvm_book3e_206_tlb_params { #define KVM_REG_PPC_IAC4 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x5) #define KVM_REG_PPC_DAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x6) #define KVM_REG_PPC_DAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x7) +#define KVM_REG_PPC_DABR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8) +#define KVM_REG_PPC_DSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x9) +#define KVM_REG_PPC_PURR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa) +#define KVM_REG_PPC_SPURR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb) +#define KVM_REG_PPC_DAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc) +#define KVM_REG_PPC_DSISR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xd) +#define KVM_REG_PPC_AMR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xe) +#define KVM_REG_PPC_UAMOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xf) + +#define KVM_REG_PPC_MMCR0 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x10) +#define KVM_REG_PPC_MMCR1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x11) +#define KVM_REG_PPC_MMCRA (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12) + +#define KVM_REG_PPC_PMC1 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x18) +#define KVM_REG_PPC_PMC2 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x19) +#define KVM_REG_PPC_PMC3 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1a) +#define KVM_REG_PPC_PMC4 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1b) +#define KVM_REG_PPC_PMC5 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1c) +#define KVM_REG_PPC_PMC6 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1d) +#define KVM_REG_PPC_PMC7 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1e) +#define KVM_REG_PPC_PMC8 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1f) #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 3fb980d..709f0dd 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -28,6 +28,7 @@ #include #include #include +#include #ifdef CONFIG_PPC_BOOK3S #include #else @@ -196,6 +197,35 @@ static inline u32 kvmppc_set_field(u64 inst, int msb, int lsb, int value) return r; } +union kvmppc_one_reg { + u32 wval; + u64 dval; +}; + +#define one_reg_size(id) \ + (1ul << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) + +#define get_reg_val(id, reg) ({ \ + union kvmppc_one_reg __u; \ + switch (one_reg_size(id)) { \ + case 4: __u.wval = (reg); break; \ + case 8: __u.dval = (reg); break; \ + default: BUG(); \ + } \ + __u; \ +}) + + +#define set_reg_val(id, val) ({ \ + u64 __v; \ + switch 
(one_reg_size(id)) { \ + case 4: __v = (val).wval; break; \ + case 8: __v = (val).dval; break; \ + default: BUG(); \ + } \ + __v; \ +}) + void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); @@ -204,6 +234,8 @@ int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg); int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg); +int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *); +int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *); void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index e946665..a5af28f 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -485,6 +485,74 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return -ENOTSUPP; } +int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r; + union kvmppc_one_reg val; + int size; + + size = one_reg_size(reg->id); + if (size > sizeof(val)) + return -EINVAL; + + r = kvmppc_get_one_reg(vcpu, reg->id, &val); + + if (r == -EINVAL) { + r = 0; + switch (reg->id) { + case KVM_REG_PPC_DAR: + val = get_reg_val(reg->id, vcpu->arch.shared->dar); + break; + case KVM_REG_PPC_DSISR: + val = get_reg_val(reg->id, vcpu->arch.shared->dsisr); + break; + default: + r = -EINVAL; + break; + } + } + if (r) + return r; + + if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size)) + r = -EFAULT; + + return r; +} + +int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r; + union kvmppc_one_reg val; + int size; + + size = one_reg_size(reg->id); + if (size > sizeof(val)) + return -EINVAL; + + if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size)) + return -EFAULT; + + r = kvmppc_set_one_reg(vcpu, reg->id, &val); + + if (r == -EINVAL) { + r = 0; + switch (reg->id) { + case KVM_REG_PPC_DAR: + vcpu->arch.shared->dar = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_DSISR: + vcpu->arch.shared->dsisr = set_reg_val(reg->id, val); + break; + default: + r = -EINVAL; + break; + } + } + + return r; +} + int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr) { diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index c9ae314..1cc6b77 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -544,36 +544,88 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) { - int r = -EINVAL; + int r = 0; + long int i; - switch (reg->id) { + switch (id) { case KVM_REG_PPC_HIOR: - r = put_user(0, (u64 __user *)reg->addr); + *val = get_reg_val(id, 0); + break; + case KVM_REG_PPC_DABR: + *val = get_reg_val(id, vcpu->arch.dabr); + break; + case KVM_REG_PPC_DSCR: + *val = get_reg_val(id, vcpu->arch.dscr); + break; + case KVM_REG_PPC_PURR: + *val = get_reg_val(id, vcpu->arch.purr); + break; + case KVM_REG_PPC_SPURR: + *val = get_reg_val(id, vcpu->arch.spurr); + break; + case KVM_REG_PPC_AMR: + *val = get_reg_val(id, vcpu->arch.amr); + break; + case KVM_REG_PPC_UAMOR: + *val = get_reg_val(id, vcpu->arch.uamor); + break; + case KVM_REG_PPC_MMCR0 ... 
KVM_REG_PPC_MMCRA: + i = id - KVM_REG_PPC_MMCR0; + *val = get_reg_val(id, vcpu->arch.mmcr[i]); + break; + case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8: + i = id - KVM_REG_PPC_PMC1; + *val = get_reg_val(id, vcpu->arch.pmc[i]); break; default: + r = -EINVAL; break; } return r; } -int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) { - int r = -EINVAL; + int r = 0; + long int i; - switch (reg->id) { + switch (id) { case KVM_REG_PPC_HIOR: - { - u64 hior; /* Only allow this to be set to zero */ - r = get_user(hior, (u64 __user *)reg->addr); - if (!r && (hior != 0)) + if (set_reg_val(id, *val)) r = -EINVAL; break; - } + case KVM_REG_PPC_DABR: + vcpu->arch.dabr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_DSCR: + vcpu->arch.dscr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_PURR: + vcpu->arch.purr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_SPURR: + vcpu->arch.spurr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_AMR: + vcpu->arch.amr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_UAMOR: + vcpu->arch.uamor = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRA: + i = id - KVM_REG_PPC_MMCR0; + vcpu->arch.mmcr[i] = set_reg_val(id, *val); + break; + case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8: + i = id - KVM_REG_PPC_PMC1; + vcpu->arch.pmc[i] = set_reg_val(id, *val); + break; default: + r = -EINVAL; break; } diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index bf3ec5d..c81109f 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -945,34 +945,33 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) { - int r = -EINVAL; + int r = 0; - switch (reg->id) { + switch (id) { case KVM_REG_PPC_HIOR: - r = copy_to_user((u64 __user *)(long)reg->addr, - &to_book3s(vcpu)->hior, sizeof(u64)); + *val = get_reg_val(id, to_book3s(vcpu)->hior); break; default: + r = -EINVAL; break; } return r; } -int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) { - int r = -EINVAL; + int r = 0; - switch (reg->id) { + switch (id) { case KVM_REG_PPC_HIOR: - r = copy_from_user(&to_book3s(vcpu)->hior, - (u64 __user *)(long)reg->addr, sizeof(u64)); - if (!r) - to_book3s(vcpu)->hior_explicit = true; + to_book3s(vcpu)->hior = set_reg_val(id, *val); + to_book3s(vcpu)->hior_explicit = true; break; default: + r = -EINVAL; break; } -- cgit v0.10.2 From a8bd19ef4dd49f0eef86a4a8eb43d60f967236b8 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 25 Sep 2012 20:32:30 +0000 Subject: KVM: PPC: Book3S: Get/set guest FP regs using the GET/SET_ONE_REG interface This enables userspace to get and set all the guest floating-point state using the KVM_[GS]ET_ONE_REG ioctls. The floating-point state includes all of the traditional floating-point registers and the FPSCR (floating point status/control register), all the VMX/Altivec vector registers and the VSCR (vector status/control register), and on POWER7, the vector-scalar registers (note that each FP register is the high-order half of the corresponding VSR). Most of these are implemented in common Book 3S code, except for VSX on POWER7. 
Because HV and PR differ in how they store the FP and VSX registers on POWER7, the code for these cases is not common. On POWER7, the FP registers are the upper halves of the VSX registers vsr0 - vsr31. PR KVM stores vsr0 - vsr31 in two halves, with the upper halves in the arch.fpr[] array and the lower halves in the arch.vsr[] array, whereas HV KVM on POWER7 stores the whole VSX register in arch.vsr[]. Signed-off-by: Paul Mackerras [agraf: fix whitespace, vsx compilation] Signed-off-by: Alexander Graf diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index af9b7a0..71dc7e2 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1759,6 +1759,17 @@ registers, find a list below: PPC | KVM_REG_PPC_PMC6 | 32 PPC | KVM_REG_PPC_PMC7 | 32 PPC | KVM_REG_PPC_PMC8 | 32 + PPC | KVM_REG_PPC_FPR0 | 64 + ... + PPC | KVM_REG_PPC_FPR31 | 64 + PPC | KVM_REG_PPC_VR0 | 128 + ... + PPC | KVM_REG_PPC_VR31 | 128 + PPC | KVM_REG_PPC_VSR0 | 128 + ... + PPC | KVM_REG_PPC_VSR31 | 128 + PPC | KVM_REG_PPC_FPSCR | 64 + PPC | KVM_REG_PPC_VSCR | 32 4.69 KVM_GET_ONE_REG diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index 9557576..1466975 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -360,4 +360,24 @@ struct kvm_book3e_206_tlb_params { #define KVM_REG_PPC_PMC7 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1e) #define KVM_REG_PPC_PMC8 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1f) +/* 32 floating-point registers */ +#define KVM_REG_PPC_FPR0 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x20) +#define KVM_REG_PPC_FPR(n) (KVM_REG_PPC_FPR0 + (n)) +#define KVM_REG_PPC_FPR31 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3f) + +/* 32 VMX/Altivec vector registers */ +#define KVM_REG_PPC_VR0 (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x40) +#define KVM_REG_PPC_VR(n) (KVM_REG_PPC_VR0 + (n)) +#define KVM_REG_PPC_VR31 (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x5f) + +/* 32 double-width FP registers for VSX */ +/* High-order halves overlap with FP regs */ +#define KVM_REG_PPC_VSR0 (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x60) +#define KVM_REG_PPC_VSR(n) (KVM_REG_PPC_VSR0 + (n)) +#define KVM_REG_PPC_VSR31 (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x7f) + +/* FP and vector status/control registers */ +#define KVM_REG_PPC_FPSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x80) +#define KVM_REG_PPC_VSCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x81) + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 709f0dd..51604a1 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -200,6 +200,8 @@ static inline u32 kvmppc_set_field(u64 inst, int msb, int lsb, int value) union kvmppc_one_reg { u32 wval; u64 dval; + vector128 vval; + u64 vsxval[2]; }; #define one_reg_size(id) \ diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index a5af28f..a4b6452 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -490,6 +490,7 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) int r; union kvmppc_one_reg val; int size; + long int i; size = one_reg_size(reg->id); if (size > sizeof(val)) @@ -506,6 +507,29 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) case KVM_REG_PPC_DSISR: val = get_reg_val(reg->id, vcpu->arch.shared->dsisr); break; + case KVM_REG_PPC_FPR0 ... 
KVM_REG_PPC_FPR31: + i = reg->id - KVM_REG_PPC_FPR0; + val = get_reg_val(reg->id, vcpu->arch.fpr[i]); + break; + case KVM_REG_PPC_FPSCR: + val = get_reg_val(reg->id, vcpu->arch.fpscr); + break; +#ifdef CONFIG_ALTIVEC + case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + val.vval = vcpu->arch.vr[reg->id - KVM_REG_PPC_VR0]; + break; + case KVM_REG_PPC_VSCR: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + val = get_reg_val(reg->id, vcpu->arch.vscr.u[3]); + break; +#endif /* CONFIG_ALTIVEC */ default: r = -EINVAL; break; @@ -525,6 +549,7 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) int r; union kvmppc_one_reg val; int size; + long int i; size = one_reg_size(reg->id); if (size > sizeof(val)) @@ -544,6 +569,29 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) case KVM_REG_PPC_DSISR: vcpu->arch.shared->dsisr = set_reg_val(reg->id, val); break; + case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: + i = reg->id - KVM_REG_PPC_FPR0; + vcpu->arch.fpr[i] = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_FPSCR: + vcpu->arch.fpscr = set_reg_val(reg->id, val); + break; +#ifdef CONFIG_ALTIVEC + case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + vcpu->arch.vr[reg->id - KVM_REG_PPC_VR0] = val.vval; + break; + case KVM_REG_PPC_VSCR: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + vcpu->arch.vscr.u[3] = set_reg_val(reg->id, val); + break; +#endif /* CONFIG_ALTIVEC */ default: r = -EINVAL; break; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 1cc6b77..94ec0e3 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -579,6 +579,27 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) i = id - KVM_REG_PPC_PMC1; *val = get_reg_val(id, vcpu->arch.pmc[i]); break; +#ifdef CONFIG_VSX + case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: + if (cpu_has_feature(CPU_FTR_VSX)) { + /* VSX => FP reg i is stored in arch.vsr[2*i] */ + long int i = id - KVM_REG_PPC_FPR0; + *val = get_reg_val(id, vcpu->arch.vsr[2 * i]); + } else { + /* let generic code handle it */ + r = -EINVAL; + } + break; + case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: + if (cpu_has_feature(CPU_FTR_VSX)) { + long int i = id - KVM_REG_PPC_VSR0; + val->vsxval[0] = vcpu->arch.vsr[2 * i]; + val->vsxval[1] = vcpu->arch.vsr[2 * i + 1]; + } else { + r = -ENXIO; + } + break; +#endif /* CONFIG_VSX */ default: r = -EINVAL; break; @@ -624,6 +645,27 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) i = id - KVM_REG_PPC_PMC1; vcpu->arch.pmc[i] = set_reg_val(id, *val); break; +#ifdef CONFIG_VSX + case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: + if (cpu_has_feature(CPU_FTR_VSX)) { + /* VSX => FP reg i is stored in arch.vsr[2*i] */ + long int i = id - KVM_REG_PPC_FPR0; + vcpu->arch.vsr[2 * i] = set_reg_val(id, *val); + } else { + /* let generic code handle it */ + r = -EINVAL; + } + break; + case KVM_REG_PPC_VSR0 ... 
KVM_REG_PPC_VSR31: + if (cpu_has_feature(CPU_FTR_VSX)) { + long int i = id - KVM_REG_PPC_VSR0; + vcpu->arch.vsr[2 * i] = val->vsxval[0]; + vcpu->arch.vsr[2 * i + 1] = val->vsxval[1]; + } else { + r = -ENXIO; + } + break; +#endif /* CONFIG_VSX */ default: r = -EINVAL; break; diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index c81109f..b853696 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -953,6 +953,19 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) case KVM_REG_PPC_HIOR: *val = get_reg_val(id, to_book3s(vcpu)->hior); break; +#ifdef CONFIG_VSX + case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: { + long int i = id - KVM_REG_PPC_VSR0; + + if (!cpu_has_feature(CPU_FTR_VSX)) { + r = -ENXIO; + break; + } + val->vsxval[0] = vcpu->arch.fpr[i]; + val->vsxval[1] = vcpu->arch.vsr[i]; + break; + } +#endif /* CONFIG_VSX */ default: r = -EINVAL; break; @@ -970,6 +983,19 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) to_book3s(vcpu)->hior = set_reg_val(id, *val); to_book3s(vcpu)->hior_explicit = true; break; +#ifdef CONFIG_VSX + case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: { + long int i = id - KVM_REG_PPC_VSR0; + + if (!cpu_has_feature(CPU_FTR_VSX)) { + r = -ENXIO; + break; + } + vcpu->arch.fpr[i] = val->vsxval[0]; + vcpu->arch.vsr[i] = val->vsxval[1]; + break; + } +#endif /* CONFIG_VSX */ default: r = -EINVAL; break; -- cgit v0.10.2 From 55b665b0263ae88a776071306ef1eee4b769016b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 25 Sep 2012 20:33:06 +0000 Subject: KVM: PPC: Book3S HV: Provide a way for userspace to get/set per-vCPU areas The PAPR paravirtualization interface lets guests register three different types of per-vCPU buffer areas in its memory for communication with the hypervisor. These are called virtual processor areas (VPAs). Currently the hypercalls to register and unregister VPAs are handled by KVM in the kernel, and userspace has no way to know about or save and restore these registrations across a migration. This adds "register" codes for these three areas that userspace can use with the KVM_GET/SET_ONE_REG ioctls to see what addresses have been registered, and to register or unregister them. This will be needed for guest hibernation and migration, and is also needed so that userspace can unregister them on reset (otherwise we corrupt guest memory after reboot by writing to the VPAs registered by the previous kernel). The "register" for the VPA is a 64-bit value containing the address, since the length of the VPA is fixed. The "registers" for the SLB shadow buffer and dispatch trace log (DTL) are 128 bits long, consisting of the guest physical address in the high (first) 64 bits and the length in the low 64 bits. This also fixes a bug where we were calling init_vpa unconditionally, leading to an oops when unregistering the VPA. 
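As an illustration only (not part of this patch) of how userspace might save one of the 128-bit registrations across a migration -- the layout being the guest physical address in the first 64 bits and the length in the second, as described above -- a minimal sketch, assuming vcpu_fd is an open vcpu fd:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Local helper type mirroring the 128-bit VPA register layout (sketch only). */
struct vpa_state {
	uint64_t addr;		/* guest physical address, first 64 bits */
	uint64_t length;	/* length, second 64 bits */
};

/* Fetch the current SLB shadow registration (error handling omitted). */
static int get_slb_shadow(int vcpu_fd, struct vpa_state *v)
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_VPA_SLB,	/* 128-bit register */
		.addr = (uintptr_t)v,		/* 16-byte userspace buffer */
	};

	return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
}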
Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 71dc7e2..e726d76 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1770,6 +1770,9 @@ registers, find a list below: PPC | KVM_REG_PPC_VSR31 | 128 PPC | KVM_REG_PPC_FPSCR | 64 PPC | KVM_REG_PPC_VSCR | 32 + PPC | KVM_REG_PPC_VPA_ADDR | 64 + PPC | KVM_REG_PPC_VPA_SLB | 128 + PPC | KVM_REG_PPC_VPA_DTL | 128 4.69 KVM_GET_ONE_REG diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index 1466975..b89ae4d 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -380,4 +380,10 @@ struct kvm_book3e_206_tlb_params { #define KVM_REG_PPC_FPSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x80) #define KVM_REG_PPC_VSCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x81) +/* Virtual processor areas */ +/* For SLB & DTL, address in high (first) half, length in low half */ +#define KVM_REG_PPC_VPA_ADDR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x82) +#define KVM_REG_PPC_VPA_SLB (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x83) +#define KVM_REG_PPC_VPA_DTL (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x84) + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 51604a1..609cca3 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -202,6 +202,10 @@ union kvmppc_one_reg { u64 dval; vector128 vval; u64 vsxval[2]; + struct { + u64 addr; + u64 length; + } vpaval; }; #define one_reg_size(id) \ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 94ec0e3..9a15da7 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -143,6 +143,22 @@ static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa) vpa->yield_count = 1; } +static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v, + unsigned long addr, unsigned long len) +{ + /* check address is cacheline aligned */ + if (addr & (L1_CACHE_BYTES - 1)) + return -EINVAL; + spin_lock(&vcpu->arch.vpa_update_lock); + if (v->next_gpa != addr || v->len != len) { + v->next_gpa = addr; + v->len = addr ? 
len : 0; + v->update_pending = 1; + } + spin_unlock(&vcpu->arch.vpa_update_lock); + return 0; +} + /* Length for a per-processor buffer is passed in at offset 4 in the buffer */ struct reg_vpa { u32 dummy; @@ -321,7 +337,8 @@ static void kvmppc_update_vpas(struct kvm_vcpu *vcpu) spin_lock(&vcpu->arch.vpa_update_lock); if (vcpu->arch.vpa.update_pending) { kvmppc_update_vpa(vcpu, &vcpu->arch.vpa); - init_vpa(vcpu, vcpu->arch.vpa.pinned_addr); + if (vcpu->arch.vpa.pinned_addr) + init_vpa(vcpu, vcpu->arch.vpa.pinned_addr); } if (vcpu->arch.dtl.update_pending) { kvmppc_update_vpa(vcpu, &vcpu->arch.dtl); @@ -600,6 +617,23 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) } break; #endif /* CONFIG_VSX */ + case KVM_REG_PPC_VPA_ADDR: + spin_lock(&vcpu->arch.vpa_update_lock); + *val = get_reg_val(id, vcpu->arch.vpa.next_gpa); + spin_unlock(&vcpu->arch.vpa_update_lock); + break; + case KVM_REG_PPC_VPA_SLB: + spin_lock(&vcpu->arch.vpa_update_lock); + val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa; + val->vpaval.length = vcpu->arch.slb_shadow.len; + spin_unlock(&vcpu->arch.vpa_update_lock); + break; + case KVM_REG_PPC_VPA_DTL: + spin_lock(&vcpu->arch.vpa_update_lock); + val->vpaval.addr = vcpu->arch.dtl.next_gpa; + val->vpaval.length = vcpu->arch.dtl.len; + spin_unlock(&vcpu->arch.vpa_update_lock); + break; default: r = -EINVAL; break; @@ -612,6 +646,7 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) { int r = 0; long int i; + unsigned long addr, len; switch (id) { case KVM_REG_PPC_HIOR: @@ -666,6 +701,33 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) } break; #endif /* CONFIG_VSX */ + case KVM_REG_PPC_VPA_ADDR: + addr = set_reg_val(id, *val); + r = -EINVAL; + if (!addr && (vcpu->arch.slb_shadow.next_gpa || + vcpu->arch.dtl.next_gpa)) + break; + r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca)); + break; + case KVM_REG_PPC_VPA_SLB: + addr = val->vpaval.addr; + len = val->vpaval.length; + r = -EINVAL; + if (addr && !vcpu->arch.vpa.next_gpa) + break; + r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len); + break; + case KVM_REG_PPC_VPA_DTL: + addr = val->vpaval.addr; + len = val->vpaval.length; + r = -EINVAL; + if (len < sizeof(struct dtl_entry)) + break; + if (addr && !vcpu->arch.vpa.next_gpa) + break; + len -= len % sizeof(struct dtl_entry); + r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len); + break; default: r = -EINVAL; break; -- cgit v0.10.2 From 12ecd9570d8941c15602a11725ec9b0ede48d6c2 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 4 Aug 2012 23:52:33 +0000 Subject: arch/powerpc/kvm/e500_tlb.c: fix error return code Convert a 0 error return code to a negative one, as returned elsewhere in the function. A new label is also added to avoid freeing things that are known to not yet be allocated. A simplified version of the semantic match that finds the first problem is as follows: (http://coccinelle.lip6.fr/) // @@ identifier ret; expression e,e1,e2,e3,e4,x; @@ ( if (\(ret != 0\|ret < 0\) || ...) { ... return ...; } | ret = 0 ) ... when != ret = e1 *x = \(kmalloc\|kzalloc\|kcalloc\|devm_kzalloc\|ioremap\|ioremap_nocache\|devm_ioremap\|devm_ioremap_nocache\)(...); ... when != x = e2 when != ret = e3 *if (x == NULL || ...) { ... 
when != ret = e4 * return ret; } // Signed-off-by: Julia Lawall Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 641f978..c733894 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -1233,21 +1233,27 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, } virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL); - if (!virt) + if (!virt) { + ret = -ENOMEM; goto err_put_page; + } privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0], GFP_KERNEL); privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1], GFP_KERNEL); - if (!privs[0] || !privs[1]) - goto err_put_page; + if (!privs[0] || !privs[1]) { + ret = -ENOMEM; + goto err_privs; + } g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1], GFP_KERNEL); - if (!g2h_bitmap) - goto err_put_page; + if (!g2h_bitmap) { + ret = -ENOMEM; + goto err_privs; + } free_gtlb(vcpu_e500); @@ -1287,10 +1293,11 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, kvmppc_recalc_tlb1map_range(vcpu_e500); return 0; -err_put_page: +err_privs: kfree(privs[0]); kfree(privs[1]); +err_put_page: for (i = 0; i < num_pages; i++) put_page(pages[i]); -- cgit v0.10.2 From 8b6e4547e0e4b6aac11df6d8d4e71ea2ab159b5e Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 20 Sep 2012 07:43:08 +0200 Subject: KVM: x86: Convert kvm_arch_vcpu_reset into private kvm_vcpu_reset There are no external callers of this function as there is no concept of resetting a vcpu from generic code. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1eefebe..dfed7ca 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -160,6 +160,8 @@ u64 __read_mostly host_xcr0; int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); +static int kvm_vcpu_reset(struct kvm_vcpu *vcpu); + static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) { int i; @@ -5426,7 +5428,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) pr_debug("vcpu %d received sipi with vector # %x\n", vcpu->vcpu_id, vcpu->arch.sipi_vector); kvm_lapic_reset(vcpu); - r = kvm_arch_vcpu_reset(vcpu); + r = kvm_vcpu_reset(vcpu); if (r) return r; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -6036,7 +6038,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) r = vcpu_load(vcpu); if (r) return r; - r = kvm_arch_vcpu_reset(vcpu); + r = kvm_vcpu_reset(vcpu); if (r == 0) r = kvm_mmu_setup(vcpu); vcpu_put(vcpu); @@ -6058,7 +6060,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_x86_ops->vcpu_free(vcpu); } -int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) +static int kvm_vcpu_reset(struct kvm_vcpu *vcpu) { atomic_set(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = 0; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 93bfc9f..c35b1c0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -582,7 +582,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); -int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu); int kvm_arch_hardware_enable(void *garbage); void kvm_arch_hardware_disable(void *garbage); int kvm_arch_hardware_setup(void); -- cgit v0.10.2 From b6785def8368f9a69505797fcbd9f280d91b673e Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 20 Sep 2012 07:43:17 +0200 Subject: KVM: x86: Make emulator_fix_hypercall static No users outside of kvm/x86.c. 
Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dfed7ca..841d09b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -158,7 +158,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { u64 __read_mostly host_xcr0; -int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); +static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); static int kvm_vcpu_reset(struct kvm_vcpu *vcpu); @@ -5068,7 +5068,7 @@ out: } EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); -int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) +static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); char instruction[3]; -- cgit v0.10.2 From d151f63f20133aef9dbda209faddebcddbfccf31 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 8 Oct 2012 18:36:11 -0600 Subject: KVM: change kvm_assign_device() to print return value when iommu_attach_device() fails Change existing kernel error message to include return value from iommu_attach_device() when it fails. This will help debug device assignment failures more effectively. Signed-off-by: Shuah Khan Signed-off-by: Marcelo Tosatti diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index 037cb67..18e1e30 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c @@ -168,11 +168,7 @@ int kvm_assign_device(struct kvm *kvm, r = iommu_attach_device(domain, &pdev->dev); if (r) { - printk(KERN_ERR "assign device %x:%x:%x.%x failed", - pci_domain_nr(pdev->bus), - pdev->bus->number, - PCI_SLOT(pdev->devfn), - PCI_FUNC(pdev->devfn)); + dev_err(&pdev->dev, "kvm assign device failed ret %d", r); return r; } -- cgit v0.10.2 From 82a1273713df37375a75d6a700deef70c0d21360 Mon Sep 17 00:00:00 2001 From: "Jason J. Herne" Date: Tue, 2 Oct 2012 16:25:36 +0200 Subject: s390/kvm: Interrupt injection bugfix EXTERNAL_CALL and EMERGENCY type interrupts need to preserve their interrupt code parameter when being injected from user space. Signed-off-by: Jason J. 
Herne Reviewed-by: Cornelia Huck Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index ff1e2f8..c30615e 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -629,10 +629,27 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, break; case KVM_S390_SIGP_STOP: case KVM_S390_RESTART: + VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type); + inti->type = s390int->type; + break; case KVM_S390_INT_EXTERNAL_CALL: + if (s390int->parm & 0xffff0000) { + kfree(inti); + return -EINVAL; + } + VCPU_EVENT(vcpu, 3, "inject: external call source-cpu:%u", + s390int->parm); + inti->type = s390int->type; + inti->extcall.code = s390int->parm; + break; case KVM_S390_INT_EMERGENCY: - VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type); + if (s390int->parm & 0xffff0000) { + kfree(inti); + return -EINVAL; + } + VCPU_EVENT(vcpu, 3, "inject: emergency %u\n", s390int->parm); inti->type = s390int->type; + inti->emerg.code = s390int->parm; break; case KVM_S390_INT_VIRTIO: case KVM_S390_INT_SERVICE: -- cgit v0.10.2 From 416ad65f73f8a68783c98dd734f570e4829d996b Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Tue, 2 Oct 2012 16:25:37 +0200 Subject: s390/kvm: Add documentation for KVM_S390_INTERRUPT Signed-off-by: Cornelia Huck Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index f6ec3a9..a46a416 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1997,6 +1997,39 @@ return the hash table order in the parameter. (If the guest is using the virtualized real-mode area (VRMA) facility, the kernel will re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.) +4.77 KVM_S390_INTERRUPT + +Capability: basic +Architectures: s390 +Type: vm ioctl, vcpu ioctl +Parameters: struct kvm_s390_interrupt (in) +Returns: 0 on success, -1 on error + +Allows to inject an interrupt to the guest. Interrupts can be floating +(vm ioctl) or per cpu (vcpu ioctl), depending on the interrupt type. + +Interrupt parameters are passed via kvm_s390_interrupt: + +struct kvm_s390_interrupt { + __u32 type; + __u32 parm; + __u64 parm64; +}; + +type can be one of the following: + +KVM_S390_SIGP_STOP (vcpu) - sigp restart +KVM_S390_PROGRAM_INT (vcpu) - program check; code in parm +KVM_S390_SIGP_SET_PREFIX (vcpu) - sigp set prefix; prefix address in parm +KVM_S390_RESTART (vcpu) - restart +KVM_S390_INT_VIRTIO (vm) - virtio external interrupt; external interrupt + parameters in parm and parm64 +KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm +KVM_S390_INT_EMERGENCY (vcpu) - sigp emergency; source cpu in parm +KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm + +Note that the vcpu ioctl is asynchronous to vcpu execution. + 5. The kvm_run structure ------------------------ -- cgit v0.10.2 From 87cac8f879a5ecd7109dbe688087e8810b3364eb Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 2 Oct 2012 16:25:38 +0200 Subject: s390/kvm: dont announce RRBM support Newer kernels (linux-next with the transparent huge page patches) use rrbm if the feature is announced via feature bit 66. RRBM will cause intercepts, so KVM does not handle it right now, causing an illegal instruction in the guest. The easy solution is to disable the feature bit for the guest. 
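Facility bits are numbered from the most significant bit of the first doubleword of
the facility list, so bit 66 sits in the second word that KVM copies for the guest,
and hiding it is a one-bit change to that mask. Roughly, the hex constant in the
patch below is equivalent to:

	/* clear facility bit 66 (RRBM) in the copy shown to the guest */
	facilities[1] &= ~(1ULL << (63 - (66 - 64)));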
This fixes bugs like: Kernel BUG at 0000000000124c2a [verbose debug info unavailable] illegal operation: 0001 [#1] SMP Modules linked in: virtio_balloon virtio_net ipv6 autofs4 CPU: 0 Not tainted 3.5.4 #1 Process fmempig (pid: 659, task: 000000007b712fd0, ksp: 000000007bed3670) Krnl PSW : 0704d00180000000 0000000000124c2a (pmdp_clear_flush_young+0x5e/0x80) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 EA:3 00000000003cc000 0000000000000004 0000000000000000 0000000079800000 0000000000040000 0000000000000000 000000007bed3918 000000007cf40000 0000000000000001 000003fff7f00000 000003d281a94000 000000007bed383c 000000007bed3918 00000000005ecbf8 00000000002314a6 000000007bed36e0 Krnl Code:>0000000000124c2a: b9810025 ogr %r2,%r5 0000000000124c2e: 41343000 la %r3,0(%r4,%r3) 0000000000124c32: a716fffa brct %r1,124c26 0000000000124c36: b9010022 lngr %r2,%r2 0000000000124c3a: e3d0f0800004 lg %r13,128(%r15) 0000000000124c40: eb22003f000c srlg %r2,%r2,63 [ 2150.713198] Call Trace: [ 2150.713223] ([<00000000002312c4>] page_referenced_one+0x6c/0x27c) [ 2150.713749] [<0000000000233812>] page_referenced+0x32a/0x410 [...] CC: stable@vger.kernel.org CC: Alex Graf Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ecced9d..38883f0 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -997,7 +997,7 @@ static int __init kvm_s390_init(void) } memcpy(facilities, S390_lowcore.stfle_fac_list, 16); facilities[0] &= 0xff00fff3f47c0000ULL; - facilities[1] &= 0x201c000000000000ULL; + facilities[1] &= 0x001c000000000000ULL; return 0; } -- cgit v0.10.2 From bd660776da89b031632128fc8dbf6635f94cb659 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 16 Oct 2012 20:08:01 +0800 Subject: KVM: MMU: remove mmu_is_invalid Remove mmu_is_invalid and use is_invalid_pfn instead Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d289fee..7fe1e03 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2700,11 +2700,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, } } -static bool mmu_invalid_pfn(pfn_t pfn) -{ - return unlikely(is_invalid_pfn(pfn)); -} - static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, pfn_t pfn, unsigned access, int *ret_val) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 714e2c0..045d31a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -340,7 +340,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pte_access = sp->role.access & gpte_access(vcpu, gpte); protect_clean_gpte(&pte_access, gpte); pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); - if (mmu_invalid_pfn(pfn)) + if (is_invalid_pfn(pfn)) return; /* @@ -416,7 +416,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, gfn = gpte_to_gfn(gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, pte_access & ACC_WRITE_MASK); - if (mmu_invalid_pfn(pfn)) + if (is_invalid_pfn(pfn)) break; mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, -- cgit v0.10.2 From d4878f24e32f5ea5330e6a48977c8997396bc014 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 16 Oct 2012 20:08:43 +0800 Subject: KVM: MMU: cleanup FNAME(page_fault) Let it return emulate state instead of spte like __direct_map Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/paging_tmpl.h 
b/arch/x86/kvm/paging_tmpl.h index 045d31a..c555532 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -427,21 +427,21 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, /* * Fetch a shadow pte for a specific level in the paging hierarchy. + * If the guest tries to write a write-protected page, we need to + * emulate this operation, return 1 to indicate this case. */ -static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, +static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, int user_fault, int write_fault, int hlevel, - int *emulate, pfn_t pfn, bool map_writable, - bool prefault) + pfn_t pfn, bool map_writable, bool prefault) { - unsigned access = gw->pt_access; struct kvm_mmu_page *sp = NULL; - int top_level; - unsigned direct_access; struct kvm_shadow_walk_iterator it; + unsigned direct_access, access = gw->pt_access; + int top_level, emulate = 0; if (!is_present_gpte(gw->ptes[gw->level - 1])) - return NULL; + return 0; direct_access = gw->pte_access; @@ -505,17 +505,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, clear_sp_write_flooding_count(it.sptep); mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, - user_fault, write_fault, emulate, it.level, + user_fault, write_fault, &emulate, it.level, gw->gfn, pfn, prefault, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); - return it.sptep; + return emulate; out_gpte_changed: if (sp) kvm_mmu_put_page(sp, it.sptep); kvm_release_pfn_clean(pfn); - return NULL; + return 0; } /* @@ -538,8 +538,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int write_fault = error_code & PFERR_WRITE_MASK; int user_fault = error_code & PFERR_USER_MASK; struct guest_walker walker; - u64 *sptep; - int emulate = 0; int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; @@ -601,17 +599,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, kvm_mmu_free_some_pages(vcpu); if (!force_pt_level) transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); - sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - level, &emulate, pfn, map_writable, prefault); - (void)sptep; - pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__, - sptep, *sptep, emulate); - + r = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, + level, pfn, map_writable, prefault); ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); spin_unlock(&vcpu->kvm->mmu_lock); - return emulate; + return r; out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); -- cgit v0.10.2 From a052b42b0e618f34ca891f00b4e8b8ac0e4b80c0 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 16 Oct 2012 20:09:36 +0800 Subject: KVM: MMU: move prefetch_invalid_gpte out of pagaing_tmp.h The function does not depend on guest mmu mode, move it out from paging_tmpl.h Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7fe1e03..3d5ca79 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2506,6 +2506,14 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } +static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) +{ + int bit7; + + bit7 = (gpte >> 7) & 1; + return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; +} + static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { @@ -2518,6 +2526,26 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, return 
gfn_to_pfn_memslot_atomic(slot, gfn); } +static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *spte, + u64 gpte) +{ + if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + goto no_present; + + if (!is_present_gpte(gpte)) + goto no_present; + + if (!(gpte & PT_ACCESSED_MASK)) + goto no_present; + + return false; + +no_present: + drop_spte(vcpu->kvm, spte); + return true; +} + static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *start, u64 *end) @@ -3395,14 +3423,6 @@ static void paging_free(struct kvm_vcpu *vcpu) nonpaging_free(vcpu); } -static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) -{ - int bit7; - - bit7 = (gpte >> 7) & 1; - return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; -} - static inline void protect_clean_gpte(unsigned *access, unsigned gpte) { unsigned mask; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index c555532..36a80ed 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -305,26 +305,6 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker, addr, access); } -static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *spte, - pt_element_t gpte) -{ - if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) - goto no_present; - - if (!is_present_gpte(gpte)) - goto no_present; - - if (!(gpte & PT_ACCESSED_MASK)) - goto no_present; - - return false; - -no_present: - drop_spte(vcpu->kvm, spte); - return true; -} - static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte) { @@ -333,7 +313,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pfn_t pfn; gpte = *(const pt_element_t *)pte; - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) + if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) return; pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); @@ -408,7 +388,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, gpte = gptep[i]; - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) + if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) continue; pte_access = sp->role.access & gpte_access(vcpu, gpte); @@ -751,7 +731,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sizeof(pt_element_t))) return -EINVAL; - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { + if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) { vcpu->kvm->tlbs_dirty++; continue; } -- cgit v0.10.2 From bd6360cc0a545544b5e69ae4428ac031c4e43588 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 16 Oct 2012 20:10:12 +0800 Subject: KVM: MMU: introduce FNAME(prefetch_gpte) The only difference between FNAME(update_pte) and FNAME(pte_prefetch) is that the former is allowed to prefetch gfn from dirty logged slot, so introduce a common function to prefetch spte Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 36a80ed..f887e4c 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -305,31 +305,43 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker, addr, access); } -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte) +static bool +FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + u64 *spte, pt_element_t gpte, bool no_dirty_log) { - 
pt_element_t gpte; unsigned pte_access; + gfn_t gfn; pfn_t pfn; - gpte = *(const pt_element_t *)pte; if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) - return; + return false; pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); + + gfn = gpte_to_gfn(gpte); pte_access = sp->role.access & gpte_access(vcpu, gpte); protect_clean_gpte(&pte_access, gpte); - pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); + pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, + no_dirty_log && (pte_access & ACC_WRITE_MASK)); if (is_invalid_pfn(pfn)) - return; + return false; /* - * we call mmu_set_spte() with host_writable = true because that - * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). + * we call mmu_set_spte() with host_writable = true because + * pte_prefetch_gfn_to_pfn always gets a writable pfn. */ mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, - NULL, PT_PAGE_TABLE_LEVEL, - gpte_to_gfn(gpte), pfn, true, true); + NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true); + + return true; +} + +static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + u64 *spte, const void *pte) +{ + pt_element_t gpte = *(const pt_element_t *)pte; + + FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false); } static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, @@ -375,33 +387,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { - pt_element_t gpte; - unsigned pte_access; - gfn_t gfn; - pfn_t pfn; - if (spte == sptep) continue; if (is_shadow_present_pte(*spte)) continue; - gpte = gptep[i]; - - if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) - continue; - - pte_access = sp->role.access & gpte_access(vcpu, gpte); - protect_clean_gpte(&pte_access, gpte); - gfn = gpte_to_gfn(gpte); - pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, - pte_access & ACC_WRITE_MASK); - if (is_invalid_pfn(pfn)) + if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true)) break; - - mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, - NULL, PT_PAGE_TABLE_LEVEL, gfn, - pfn, true, true); } } -- cgit v0.10.2 From 471842ec49726bc589322e8498e6620b4d723c74 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 15 Oct 2012 09:49:08 +0200 Subject: KVM: do not de-cache cr4 bits needlessly Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 841d09b..6e5f069 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -635,7 +635,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) } if (is_long_mode(vcpu)) { - if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) { + if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) { if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) return 1; } else -- cgit v0.10.2 From b9bf6882c1f9451ce1e80aaed32107673a735613 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 17 Oct 2012 13:46:52 +0800 Subject: KVM: VMX: report internal error for the unhandleable event VM exits during Event Delivery is really unexpected if it is not caused by Exceptions/EPT-VIOLATION/TASK_SWITCH, we'd better to report an internal and freeze the guest, the VMM has the chance to check the guest Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ad6b1dd..b8a0841 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5979,13 +5979,24 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) return 0; } + /* + * Note: + * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by + * delivery 
event since it indicates guest is accessing MMIO. + * The vm-exit can be triggered again after return to guest that + * will cause infinite loop. + */ if ((vectoring_info & VECTORING_INFO_VALID_MASK) && (exit_reason != EXIT_REASON_EXCEPTION_NMI && exit_reason != EXIT_REASON_EPT_VIOLATION && - exit_reason != EXIT_REASON_TASK_SWITCH)) - printk(KERN_WARNING "%s: unexpected, valid vectoring info " - "(0x%x) and exit reason is 0x%x\n", - __func__, vectoring_info, exit_reason); + exit_reason != EXIT_REASON_TASK_SWITCH)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = vectoring_info; + vcpu->run->internal.data[1] = exit_reason; + return 0; + } if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 65ad5c6..494a84c 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -170,8 +170,12 @@ struct kvm_pit_config { #define KVM_EXIT_WATCHDOG 21 /* For KVM_EXIT_INTERNAL_ERROR */ -#define KVM_INTERNAL_ERROR_EMULATION 1 -#define KVM_INTERNAL_ERROR_SIMUL_EX 2 +/* Emulate instruction failed. */ +#define KVM_INTERNAL_ERROR_EMULATION 1 +/* Encounter unexpected simultaneous exceptions. */ +#define KVM_INTERNAL_ERROR_SIMUL_EX 2 +/* Encounter unexpected vm-exit due to delivery event. */ +#define KVM_INTERNAL_ERROR_DELIVERY_EV 3 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ struct kvm_run { -- cgit v0.10.2 From bf4ca23ef597a440ca46c695d8711ec0b387f12c Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 17 Oct 2012 13:48:06 +0800 Subject: KVM: VMX: report internal error for MMIO #PF due to delivery event The #PF with PFEC.RSV = 1 indicates that the guest is accessing MMIO, we can not fix it if it is caused by delivery event. Reporting internal error for this case Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b8a0841..6599e45 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4287,16 +4287,6 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (is_machine_check(intr_info)) return handle_machine_check(vcpu); - if ((vect_info & VECTORING_INFO_VALID_MASK) && - !is_page_fault(intr_info)) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = vect_info; - vcpu->run->internal.data[1] = intr_info; - return 0; - } - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) return 1; /* already handled by vmx_vcpu_run() */ @@ -4315,6 +4305,22 @@ static int handle_exception(struct kvm_vcpu *vcpu) error_code = 0; if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + + /* + * The #PF with PFEC.RSVD = 1 indicates the guest is accessing + * MMIO, it is better to report an internal error. + * See the comments in vmx_handle_exit. 
+ */ + if ((vect_info & VECTORING_INFO_VALID_MASK) && + !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = vect_info; + vcpu->run->internal.data[1] = intr_info; + return 0; + } + if (is_page_fault(intr_info)) { /* EPT won't cause page fault directly */ BUG_ON(enable_ept); -- cgit v0.10.2 From 1f5b77f51a221f5eb10af08da9067fba360dc52f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 20 Oct 2012 20:20:04 +0200 Subject: KVM: SVM: Cleanup error statements Use __func__ instead of the function name in svm_hardware_enable since those things tend to get out of sync. This also slims down printk line length in conjunction with using pr_err. No functionality change. Cc: Joerg Roedel Cc: Avi Kivity Signed-off-by: Borislav Petkov Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d017df3..e93908f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -630,15 +630,12 @@ static int svm_hardware_enable(void *garbage) return -EBUSY; if (!has_svm()) { - printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n", - me); + pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me); return -EINVAL; } sd = per_cpu(svm_data, me); - if (!sd) { - printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n", - me); + pr_err("%s: svm_data is NULL on %d\n", __func__, me); return -EINVAL; } -- cgit v0.10.2 From 8ca40a70a70988c0bdea106c894843f763ca2989 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sun, 14 Oct 2012 23:10:18 -0400 Subject: KVM: Take kvm instead of vcpu to mmu_notifier_retry The mmu_notifier_retry is not specific to any vcpu (and never will be) so only take struct kvm as a parameter. The motivation is the ARM mmu code that needs to call this from somewhere where we long let go of the vcpu pointer. 
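For context, callers of mmu_notifier_retry() already follow the usual sequence:
snapshot mmu_notifier_seq, translate the gfn (which may sleep), then take mmu_lock
and retry if an invalidation raced in between. A condensed sketch of that fault-path
pattern after this change (error handling omitted):

	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();				/* order the seq read before the translation */
	pfn = gfn_to_pfn(kvm, gfn);		/* may sleep, called without mmu_lock */
	spin_lock(&kvm->mmu_lock);
	if (mmu_notifier_retry(kvm, mmu_seq))	/* no vcpu pointer needed any more */
		goto out_unlock;
	/* ... install the new mapping ... */
	spin_unlock(&kvm->mmu_lock);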
Signed-off-by: Christoffer Dall Signed-off-by: Avi Kivity diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 7a4aae9..2a89a36 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -710,7 +710,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* Check if we might have been invalidated; let the guest retry if so */ ret = RESUME_GUEST; - if (mmu_notifier_retry(vcpu, mmu_seq)) { + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) { unlock_rmap(rmap); goto out_unlock; } diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 9955216..5e06e31 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -297,7 +297,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, lock_rmap(rmap); /* Check for pending invalidations under the rmap chain lock */ if (kvm->arch.using_mmu_notifiers && - mmu_notifier_retry(vcpu, mmu_seq)) { + mmu_notifier_retry(vcpu->kvm, mmu_seq)) { /* inval in progress, write a non-present HPTE */ pteh |= HPTE_V_ABSENT; pteh &= ~HPTE_V_VALID; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3d5ca79..6f78fa3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2886,7 +2886,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, return r; spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); if (likely(!force_pt_level)) @@ -3355,7 +3355,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, return r; spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); if (likely(!force_pt_level)) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index f887e4c..d17deca 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -565,7 +565,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return r; spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6afc5be..82e2c78 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -841,9 +841,9 @@ extern struct kvm_stats_debugfs_item debugfs_entries[]; extern struct dentry *kvm_debugfs_dir; #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) -static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq) +static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) { - if (unlikely(vcpu->kvm->mmu_notifier_count)) + if (unlikely(kvm->mmu_notifier_count)) return 1; /* * Ensure the read of mmu_notifier_count happens before the read @@ -856,7 +856,7 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se * can't rely on kvm->mmu_lock to keep things ordered. 
*/ smp_rmb(); - if (vcpu->kvm->mmu_notifier_seq != mmu_seq) + if (kvm->mmu_notifier_seq != mmu_seq) return 1; return 0; } -- cgit v0.10.2 From 787c57c0fb393fe8a3974d300ddcfe30373386fe Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 17 Oct 2012 23:03:42 +0800 Subject: KVM: ia64: remove unused variable in kvm_release_vm_pages() The variable base_gfn is initialized but never used otherwise, so remove the unused variable. dpatch engine is used to auto generate this patch. (https://github.com/weiyj/dpatch) Acked-by: Xiantao Zhang Signed-off-by: Wei Yongjun Signed-off-by: Marcelo Tosatti diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 8b3a9c0..c71acd7 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1362,11 +1362,9 @@ static void kvm_release_vm_pages(struct kvm *kvm) struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int j; - unsigned long base_gfn; slots = kvm_memslots(kvm); kvm_for_each_memslot(memslot, slots) { - base_gfn = memslot->base_gfn; for (j = 0; j < memslot->npages; j++) { if (memslot->rmap[j]) put_page((struct page *)memslot->rmap[j]); -- cgit v0.10.2 From 81c52c56e2b43589091ee29038bcf793d3f184ab Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 16 Oct 2012 20:10:59 +0800 Subject: KVM: do not treat noslot pfn as a error pfn This patch filters noslot pfn out from error pfns based on Marcelo comment: noslot pfn is not a error pfn After this patch, - is_noslot_pfn indicates that the gfn is not in slot - is_error_pfn indicates that the gfn is in slot but the error is occurred when translate the gfn to pfn - is_error_noslot_pfn indicates that the pfn either it is error pfns or it is noslot pfn And is_invalid_pfn can be removed, it makes the code more clean Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c index d1107a9..00e619b 100644 --- a/arch/powerpc/kvm/book3s_32_mmu_host.c +++ b/arch/powerpc/kvm/book3s_32_mmu_host.c @@ -155,7 +155,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) /* Get host physical address for gpa */ hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); - if (is_error_pfn(hpaddr)) { + if (is_error_noslot_pfn(hpaddr)) { printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); r = -EINVAL; diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index d0205a5..ead58e3 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -93,7 +93,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) /* Get host physical address for gpa */ hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); - if (is_error_pfn(hpaddr)) { + if (is_error_noslot_pfn(hpaddr)) { printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); r = -EINVAL; goto out; diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index c733894..6305ee6 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -524,7 +524,7 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, if (likely(!pfnmap)) { unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); pfn = gfn_to_pfn_memslot(slot, gfn); - if (is_error_pfn(pfn)) { + if (is_error_noslot_pfn(pfn)) { printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", (long)gfn); return; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 
aabb128..b875a9e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2699,7 +2699,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, * PT_PAGE_TABLE_LEVEL and there would be no adjustment done * here. */ - if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && + if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompound(pfn_to_page(pfn)) && !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { @@ -2733,7 +2733,7 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, bool ret = true; /* The pfn is invalid, report the error! */ - if (unlikely(is_invalid_pfn(pfn))) { + if (unlikely(is_error_pfn(pfn))) { *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); goto exit; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index d17deca..891eb6d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -323,7 +323,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, protect_clean_gpte(&pte_access, gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); - if (is_invalid_pfn(pfn)) + if (is_error_pfn(pfn)) return false; /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6e5f069..49fa1f0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4504,7 +4504,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) * instruction -> ... */ pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); - if (!is_error_pfn(pfn)) { + if (!is_error_noslot_pfn(pfn)) { kvm_release_pfn_clean(pfn); return true; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 82e2c78..99a4762 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -58,28 +58,40 @@ /* * For the normal pfn, the highest 12 bits should be zero, - * so we can mask these bits to indicate the error. + * so we can mask bit 62 ~ bit 52 to indicate the error pfn, + * mask bit 63 to indicate the noslot pfn. */ -#define KVM_PFN_ERR_MASK (0xfffULL << 52) +#define KVM_PFN_ERR_MASK (0x7ffULL << 52) +#define KVM_PFN_ERR_NOSLOT_MASK (0xfffULL << 52) +#define KVM_PFN_NOSLOT (0x1ULL << 63) #define KVM_PFN_ERR_FAULT (KVM_PFN_ERR_MASK) #define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1) -#define KVM_PFN_ERR_BAD (KVM_PFN_ERR_MASK + 2) -#define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 3) +#define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2) +/* + * error pfns indicate that the gfn is in slot but faild to + * translate it to pfn on host. + */ static inline bool is_error_pfn(pfn_t pfn) { return !!(pfn & KVM_PFN_ERR_MASK); } -static inline bool is_noslot_pfn(pfn_t pfn) +/* + * error_noslot pfns indicate that the gfn can not be + * translated to pfn - it is not in slot or failed to + * translate it to pfn. + */ +static inline bool is_error_noslot_pfn(pfn_t pfn) { - return pfn == KVM_PFN_ERR_BAD; + return !!(pfn & KVM_PFN_ERR_NOSLOT_MASK); } -static inline bool is_invalid_pfn(pfn_t pfn) +/* noslot pfn indicates that the gfn is not in slot. 
*/ +static inline bool is_noslot_pfn(pfn_t pfn) { - return !is_noslot_pfn(pfn) && is_error_pfn(pfn); + return pfn == KVM_PFN_NOSLOT; } #define KVM_HVA_ERR_BAD (PAGE_OFFSET) diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index 18e1e30..4a340cb 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c @@ -52,7 +52,7 @@ static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, end_gfn = gfn + (size >> PAGE_SHIFT); gfn += 1; - if (is_error_pfn(pfn)) + if (is_error_noslot_pfn(pfn)) return pfn; while (gfn < end_gfn) @@ -106,7 +106,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) * important because we unmap and unpin in 4kb steps later. */ pfn = kvm_pin_pages(slot, gfn, page_size); - if (is_error_pfn(pfn)) { + if (is_error_noslot_pfn(pfn)) { gfn += 1; continue; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index be70035..2fb7319 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1208,7 +1208,7 @@ __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, return KVM_PFN_ERR_RO_FAULT; if (kvm_is_error_hva(addr)) - return KVM_PFN_ERR_BAD; + return KVM_PFN_NOSLOT; /* Do not map writable pfn in the readonly memslot. */ if (writable && memslot_is_readonly(slot)) { @@ -1290,7 +1290,7 @@ EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); static struct page *kvm_pfn_to_page(pfn_t pfn) { - if (is_error_pfn(pfn)) + if (is_error_noslot_pfn(pfn)) return KVM_ERR_PTR_BAD_PAGE; if (kvm_is_mmio_pfn(pfn)) { @@ -1322,7 +1322,7 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean); void kvm_release_pfn_clean(pfn_t pfn) { - if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn)) + if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn)) put_page(pfn_to_page(pfn)); } EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); -- cgit v0.10.2 From e43a028752fed049e4bd94ef895542f96d79fa74 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Sat, 6 Oct 2012 03:56:35 +0200 Subject: KVM: PPC: 44x: fix DCR read/write When remembering the direction of a DCR transaction, we should write to the same variable that we interpret on later when doing vcpu_run again. Signed-off-by: Alexander Graf Cc: stable@vger.kernel.org diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c index 1a793c4..35ec0a8 100644 --- a/arch/powerpc/kvm/44x_emulate.c +++ b/arch/powerpc/kvm/44x_emulate.c @@ -46,6 +46,7 @@ static int emulate_mtdcr(struct kvm_vcpu *vcpu, int rs, int dcrn) vcpu->run->dcr.dcrn = dcrn; vcpu->run->dcr.data = kvmppc_get_gpr(vcpu, rs); vcpu->run->dcr.is_write = 1; + vcpu->arch.dcr_is_write = 1; vcpu->arch.dcr_needed = 1; kvmppc_account_exit(vcpu, DCR_EXITS); return EMULATE_DO_DCR; @@ -80,6 +81,7 @@ static int emulate_mfdcr(struct kvm_vcpu *vcpu, int rt, int dcrn) vcpu->run->dcr.dcrn = dcrn; vcpu->run->dcr.data = 0; vcpu->run->dcr.is_write = 0; + vcpu->arch.dcr_is_write = 0; vcpu->arch.io_gpr = rt; vcpu->arch.dcr_needed = 1; kvmppc_account_exit(vcpu, DCR_EXITS); -- cgit v0.10.2 From 686de182a217d315e50fef74ba5e7bcd7ea6c246 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Sun, 7 Oct 2012 15:22:59 +0200 Subject: KVM: Documentation: Fix reentry-to-be-consistent paragraph All user space offloaded instruction emulation needs to reenter kvm to produce consistent state again. Fix the section in the documentation to mention all of them. 
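From the VMM side this boils down to: after one of these exits, service the request
but treat the guest state as consistent only once the vcpu has been re-entered. A
minimal sketch of such a run loop (vcpu_fd and the mmap'ed run structure come from
the usual KVM setup; handle_mmio() is an illustrative placeholder):

	while (ioctl(vcpu_fd, KVM_RUN, 0) == 0) {
		if (run->exit_reason == KVM_EXIT_MMIO)
			handle_mmio(run);	/* fill run->mmio.data for reads */
		/* don't inspect other guest state here; re-enter KVM_RUN first */
	}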
Signed-off-by: Alexander Graf diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 4258180..6671fdc 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2183,7 +2183,8 @@ executed a memory-mapped I/O instruction which could not be satisfied by kvm. The 'data' member contains the written data if 'is_write' is true, and should be filled by application code otherwise. -NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO and KVM_EXIT_OSI, the corresponding +NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR + and KVM_EXIT_PAPR the corresponding operations are complete (and guest state is consistent) only after userspace has re-entered the kernel with KVM_RUN. The kernel side will first finish incomplete operations and then check for pending signals. Userspace -- cgit v0.10.2 From 388cf9ee3c751c3a4cf8776987143354d6d8c797 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Sat, 6 Oct 2012 23:19:01 +0200 Subject: KVM: PPC: Move mtspr/mfspr emulation into own functions The mtspr/mfspr emulation code became quite big over time. Move it into its own function so things stay more readable. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index ee04aba..b0855e5 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -131,6 +131,125 @@ u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb) return vcpu->arch.dec - jd; } +static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +{ + enum emulation_result emulated = EMULATE_DONE; + ulong spr_val = kvmppc_get_gpr(vcpu, rs); + + switch (sprn) { + case SPRN_SRR0: + vcpu->arch.shared->srr0 = spr_val; + break; + case SPRN_SRR1: + vcpu->arch.shared->srr1 = spr_val; + break; + + /* XXX We need to context-switch the timebase for + * watchdog and FIT. */ + case SPRN_TBWL: break; + case SPRN_TBWU: break; + + case SPRN_MSSSR0: break; + + case SPRN_DEC: + vcpu->arch.dec = spr_val; + kvmppc_emulate_dec(vcpu); + break; + + case SPRN_SPRG0: + vcpu->arch.shared->sprg0 = spr_val; + break; + case SPRN_SPRG1: + vcpu->arch.shared->sprg1 = spr_val; + break; + case SPRN_SPRG2: + vcpu->arch.shared->sprg2 = spr_val; + break; + case SPRN_SPRG3: + vcpu->arch.shared->sprg3 = spr_val; + break; + + default: + emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, + spr_val); + if (emulated == EMULATE_FAIL) + printk(KERN_INFO "mtspr: unknown spr " + "0x%x\n", sprn); + break; + } + + kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS); + + return emulated; +} + +static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +{ + enum emulation_result emulated = EMULATE_DONE; + ulong spr_val = 0; + + switch (sprn) { + case SPRN_SRR0: + spr_val = vcpu->arch.shared->srr0; + break; + case SPRN_SRR1: + spr_val = vcpu->arch.shared->srr1; + break; + case SPRN_PVR: + spr_val = vcpu->arch.pvr; + break; + case SPRN_PIR: + spr_val = vcpu->vcpu_id; + break; + case SPRN_MSSSR0: + spr_val = 0; + break; + + /* Note: mftb and TBRL/TBWL are user-accessible, so + * the guest can always access the real TB anyways. + * In fact, we probably will never see these traps. 
*/ + case SPRN_TBWL: + spr_val = get_tb() >> 32; + break; + case SPRN_TBWU: + spr_val = get_tb(); + break; + + case SPRN_SPRG0: + spr_val = vcpu->arch.shared->sprg0; + break; + case SPRN_SPRG1: + spr_val = vcpu->arch.shared->sprg1; + break; + case SPRN_SPRG2: + spr_val = vcpu->arch.shared->sprg2; + break; + case SPRN_SPRG3: + spr_val = vcpu->arch.shared->sprg3; + break; + /* Note: SPRG4-7 are user-readable, so we don't get + * a trap. */ + + case SPRN_DEC: + spr_val = kvmppc_get_dec(vcpu, get_tb()); + break; + default: + emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, + &spr_val); + if (unlikely(emulated == EMULATE_FAIL)) { + printk(KERN_INFO "mfspr: unknown spr " + "0x%x\n", sprn); + } + break; + } + + if (emulated == EMULATE_DONE) + kvmppc_set_gpr(vcpu, rt, spr_val); + kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS); + + return emulated; +} + /* XXX to do: * lhax * lhaux @@ -156,7 +275,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) int sprn = get_sprn(inst); enum emulation_result emulated = EMULATE_DONE; int advance = 1; - ulong spr_val = 0; /* this default type might be overwritten by subcategories */ kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS); @@ -236,62 +354,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) break; case OP_31_XOP_MFSPR: - switch (sprn) { - case SPRN_SRR0: - spr_val = vcpu->arch.shared->srr0; - break; - case SPRN_SRR1: - spr_val = vcpu->arch.shared->srr1; - break; - case SPRN_PVR: - spr_val = vcpu->arch.pvr; - break; - case SPRN_PIR: - spr_val = vcpu->vcpu_id; - break; - case SPRN_MSSSR0: - spr_val = 0; - break; - - /* Note: mftb and TBRL/TBWL are user-accessible, so - * the guest can always access the real TB anyways. - * In fact, we probably will never see these traps. */ - case SPRN_TBWL: - spr_val = get_tb() >> 32; - break; - case SPRN_TBWU: - spr_val = get_tb(); - break; - - case SPRN_SPRG0: - spr_val = vcpu->arch.shared->sprg0; - break; - case SPRN_SPRG1: - spr_val = vcpu->arch.shared->sprg1; - break; - case SPRN_SPRG2: - spr_val = vcpu->arch.shared->sprg2; - break; - case SPRN_SPRG3: - spr_val = vcpu->arch.shared->sprg3; - break; - /* Note: SPRG4-7 are user-readable, so we don't get - * a trap. */ - - case SPRN_DEC: - spr_val = kvmppc_get_dec(vcpu, get_tb()); - break; - default: - emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, - &spr_val); - if (unlikely(emulated == EMULATE_FAIL)) { - printk(KERN_INFO "mfspr: unknown spr " - "0x%x\n", sprn); - } - break; - } - kvmppc_set_gpr(vcpu, rt, spr_val); - kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS); + emulated = kvmppc_emulate_mfspr(vcpu, sprn, rt); break; case OP_31_XOP_STHX: @@ -308,49 +371,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) break; case OP_31_XOP_MTSPR: - spr_val = kvmppc_get_gpr(vcpu, rs); - switch (sprn) { - case SPRN_SRR0: - vcpu->arch.shared->srr0 = spr_val; - break; - case SPRN_SRR1: - vcpu->arch.shared->srr1 = spr_val; - break; - - /* XXX We need to context-switch the timebase for - * watchdog and FIT. 
*/ - case SPRN_TBWL: break; - case SPRN_TBWU: break; - - case SPRN_MSSSR0: break; - - case SPRN_DEC: - vcpu->arch.dec = spr_val; - kvmppc_emulate_dec(vcpu); - break; - - case SPRN_SPRG0: - vcpu->arch.shared->sprg0 = spr_val; - break; - case SPRN_SPRG1: - vcpu->arch.shared->sprg1 = spr_val; - break; - case SPRN_SPRG2: - vcpu->arch.shared->sprg2 = spr_val; - break; - case SPRN_SPRG3: - vcpu->arch.shared->sprg3 = spr_val; - break; - - default: - emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, - spr_val); - if (emulated == EMULATE_FAIL) - printk(KERN_INFO "mtspr: unknown spr " - "0x%x\n", sprn); - break; - } - kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS); + emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs); break; case OP_31_XOP_DCBI: -- cgit v0.10.2 From c99ec973a63e2249020d6d93a46d7572432da6a2 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Sat, 27 Oct 2012 19:26:14 +0200 Subject: PPC: ePAPR: Convert header to uapi The new uapi framework splits kernel internal and user space exported bits of header files more cleanly. Adjust the ePAPR header accordingly. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index 13d6b7bf..7e313f1 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -34,6 +34,5 @@ header-y += termios.h header-y += types.h header-y += ucontext.h header-y += unistd.h -header-y += epapr_hcalls.h generic-y += rwsem.h diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h index b8d9445..58997af 100644 --- a/arch/powerpc/include/asm/epapr_hcalls.h +++ b/arch/powerpc/include/asm/epapr_hcalls.h @@ -50,60 +50,7 @@ #ifndef _EPAPR_HCALLS_H #define _EPAPR_HCALLS_H -#define EV_BYTE_CHANNEL_SEND 1 -#define EV_BYTE_CHANNEL_RECEIVE 2 -#define EV_BYTE_CHANNEL_POLL 3 -#define EV_INT_SET_CONFIG 4 -#define EV_INT_GET_CONFIG 5 -#define EV_INT_SET_MASK 6 -#define EV_INT_GET_MASK 7 -#define EV_INT_IACK 9 -#define EV_INT_EOI 10 -#define EV_INT_SEND_IPI 11 -#define EV_INT_SET_TASK_PRIORITY 12 -#define EV_INT_GET_TASK_PRIORITY 13 -#define EV_DOORBELL_SEND 14 -#define EV_MSGSND 15 -#define EV_IDLE 16 - -/* vendor ID: epapr */ -#define EV_LOCAL_VENDOR_ID 0 /* for private use */ -#define EV_EPAPR_VENDOR_ID 1 -#define EV_FSL_VENDOR_ID 2 /* Freescale Semiconductor */ -#define EV_IBM_VENDOR_ID 3 /* IBM */ -#define EV_GHS_VENDOR_ID 4 /* Green Hills Software */ -#define EV_ENEA_VENDOR_ID 5 /* Enea */ -#define EV_WR_VENDOR_ID 6 /* Wind River Systems */ -#define EV_AMCC_VENDOR_ID 7 /* Applied Micro Circuits */ -#define EV_KVM_VENDOR_ID 42 /* KVM */ - -/* The max number of bytes that a byte channel can send or receive per call */ -#define EV_BYTE_CHANNEL_MAX_BYTES 16 - - -#define _EV_HCALL_TOKEN(id, num) (((id) << 16) | (num)) -#define EV_HCALL_TOKEN(hcall_num) _EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, hcall_num) - -/* epapr return codes */ -#define EV_SUCCESS 0 -#define EV_EPERM 1 /* Operation not permitted */ -#define EV_ENOENT 2 /* Entry Not Found */ -#define EV_EIO 3 /* I/O error occured */ -#define EV_EAGAIN 4 /* The operation had insufficient - * resources to complete and should be - * retried - */ -#define EV_ENOMEM 5 /* There was insufficient memory to - * complete the operation */ -#define EV_EFAULT 6 /* Bad guest address */ -#define EV_ENODEV 7 /* No such device */ -#define EV_EINVAL 8 /* An argument supplied to the hcall - was out of range or invalid */ -#define EV_INTERNAL 9 /* An internal error occured */ -#define EV_CONFIG 10 /* A configuration error was detected */ 
-#define EV_INVALID_STATE 11 /* The object is in an invalid state */ -#define EV_UNIMPLEMENTED 12 /* Unimplemented hypercall */ -#define EV_BUFFER_OVERFLOW 13 /* Caller-supplied buffer too small */ +#include #ifndef __ASSEMBLY__ #include diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild index baebb3d..e6b5be8 100644 --- a/arch/powerpc/include/uapi/asm/Kbuild +++ b/arch/powerpc/include/uapi/asm/Kbuild @@ -1,3 +1,4 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm +header-y += epapr_hcalls.h diff --git a/arch/powerpc/include/uapi/asm/epapr_hcalls.h b/arch/powerpc/include/uapi/asm/epapr_hcalls.h new file mode 100644 index 0000000..046c793 --- /dev/null +++ b/arch/powerpc/include/uapi/asm/epapr_hcalls.h @@ -0,0 +1,98 @@ +/* + * ePAPR hcall interface + * + * Copyright 2008-2011 Freescale Semiconductor, Inc. + * + * Author: Timur Tabi + * + * This file is provided under a dual BSD/GPL license. When using or + * redistributing this file, you may do so under either license. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _UAPI__EPAPR_HCALLS_H +#define _UAPI__EPAPR_HCALLS_H + +#define EV_BYTE_CHANNEL_SEND 1 +#define EV_BYTE_CHANNEL_RECEIVE 2 +#define EV_BYTE_CHANNEL_POLL 3 +#define EV_INT_SET_CONFIG 4 +#define EV_INT_GET_CONFIG 5 +#define EV_INT_SET_MASK 6 +#define EV_INT_GET_MASK 7 +#define EV_INT_IACK 9 +#define EV_INT_EOI 10 +#define EV_INT_SEND_IPI 11 +#define EV_INT_SET_TASK_PRIORITY 12 +#define EV_INT_GET_TASK_PRIORITY 13 +#define EV_DOORBELL_SEND 14 +#define EV_MSGSND 15 +#define EV_IDLE 16 + +/* vendor ID: epapr */ +#define EV_LOCAL_VENDOR_ID 0 /* for private use */ +#define EV_EPAPR_VENDOR_ID 1 +#define EV_FSL_VENDOR_ID 2 /* Freescale Semiconductor */ +#define EV_IBM_VENDOR_ID 3 /* IBM */ +#define EV_GHS_VENDOR_ID 4 /* Green Hills Software */ +#define EV_ENEA_VENDOR_ID 5 /* Enea */ +#define EV_WR_VENDOR_ID 6 /* Wind River Systems */ +#define EV_AMCC_VENDOR_ID 7 /* Applied Micro Circuits */ +#define EV_KVM_VENDOR_ID 42 /* KVM */ + +/* The max number of bytes that a byte channel can send or receive per call */ +#define EV_BYTE_CHANNEL_MAX_BYTES 16 + + +#define _EV_HCALL_TOKEN(id, num) (((id) << 16) | (num)) +#define EV_HCALL_TOKEN(hcall_num) _EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, hcall_num) + +/* epapr return codes */ +#define EV_SUCCESS 0 +#define EV_EPERM 1 /* Operation not permitted */ +#define EV_ENOENT 2 /* Entry Not Found */ +#define EV_EIO 3 /* I/O error occured */ +#define EV_EAGAIN 4 /* The operation had insufficient + * resources to complete and should be + * retried + */ +#define EV_ENOMEM 5 /* There was insufficient memory to + * complete the operation */ +#define EV_EFAULT 6 /* Bad guest address */ +#define EV_ENODEV 7 /* No such device */ +#define EV_EINVAL 8 /* An argument supplied to the hcall + was out of range or invalid */ +#define EV_INTERNAL 9 /* An internal error occured */ +#define EV_CONFIG 10 /* A configuration error was detected */ +#define EV_INVALID_STATE 11 /* The object is in an invalid state */ +#define EV_UNIMPLEMENTED 12 /* Unimplemented hypercall */ +#define EV_BUFFER_OVERFLOW 13 /* Caller-supplied buffer too small */ + +#endif /* _UAPI__EPAPR_HCALLS_H */ -- cgit v0.10.2 From 512691d4907d7cf4b8d05c6f8572d1fa60ccec20 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Oct 2012 01:15:41 +0000 Subject: KVM: PPC: Book3S HV: Allow KVM guests to stop secondary threads coming online When a Book3S HV KVM guest is running, we need the host to be in single-thread mode, that is, all of the cores (or at least all of the cores where the KVM guest could run) to be running only one active hardware thread. This is because of the hardware restriction in POWER processors that all of the hardware threads in the core must be in the same logical partition. Complying with this restriction is much easier if, from the host kernel's point of view, only one hardware thread is active. This adds two hooks in the SMP hotplug code to allow the KVM code to make sure that secondary threads (i.e. hardware threads other than thread 0) cannot come online while any KVM guest exists. The KVM code still has to check that any core where it runs a guest has the secondary threads offline, but having done that check it can now be sure that they will not come online while the guest is running. 
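The resulting interface is a simple pairing around guest lifetime: bump a count while
any HV guest exists and have the hotplug path refuse to online non-primary threads
while that count is non-zero. Sketching the intended usage (this mirrors the
kvmppc_core_init_vm/kvmppc_core_destroy_vm hunks further down):

	inhibit_secondary_onlining();	/* at VM creation, before any vcpu runs */
	/* ... __cpu_up() now returns -EBUSY for cpu % threads_per_core != 0 ... */
	uninhibit_secondary_onlining();	/* at VM teardown */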
Signed-off-by: Paul Mackerras Acked-by: Benjamin Herrenschmidt Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index ebc24dc..b625a1a 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -66,6 +66,14 @@ void generic_cpu_die(unsigned int cpu); void generic_mach_cpu_die(void); void generic_set_cpu_dead(unsigned int cpu); int generic_check_cpu_restart(unsigned int cpu); + +extern void inhibit_secondary_onlining(void); +extern void uninhibit_secondary_onlining(void); + +#else /* HOTPLUG_CPU */ +static inline void inhibit_secondary_onlining(void) {} +static inline void uninhibit_secondary_onlining(void) {} + #endif #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 8d4214a..c4f420c 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -417,6 +417,45 @@ int generic_check_cpu_restart(unsigned int cpu) { return per_cpu(cpu_state, cpu) == CPU_UP_PREPARE; } + +static atomic_t secondary_inhibit_count; + +/* + * Don't allow secondary CPU threads to come online + */ +void inhibit_secondary_onlining(void) +{ + /* + * This makes secondary_inhibit_count stable during cpu + * online/offline operations. + */ + get_online_cpus(); + + atomic_inc(&secondary_inhibit_count); + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(inhibit_secondary_onlining); + +/* + * Allow secondary CPU threads to come online again + */ +void uninhibit_secondary_onlining(void) +{ + get_online_cpus(); + atomic_dec(&secondary_inhibit_count); + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(uninhibit_secondary_onlining); + +static int secondaries_inhibited(void) +{ + return atomic_read(&secondary_inhibit_count); +} + +#else /* HOTPLUG_CPU */ + +#define secondaries_inhibited() 0 + #endif static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle) @@ -435,6 +474,13 @@ int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle) { int rc, c; + /* + * Don't allow secondary threads to come online if inhibited + */ + if (threads_per_core > 1 && secondaries_inhibited() && + cpu % threads_per_core != 0) + return -EBUSY; + if (smp_ops == NULL || (smp_ops->cpu_bootable && !smp_ops->cpu_bootable(cpu))) return -EINVAL; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 9a15da7..c5ddf04 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -1016,8 +1017,6 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) /* * Make sure we are running on thread 0, and that * secondary threads are offline. - * XXX we should also block attempts to bring any - * secondary threads online. */ if (threads_per_core > 1 && !on_primary_thread()) { list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) @@ -1730,11 +1729,20 @@ int kvmppc_core_init_vm(struct kvm *kvm) kvm->arch.using_mmu_notifiers = !!cpu_has_feature(CPU_FTR_ARCH_206); spin_lock_init(&kvm->arch.slot_phys_lock); + + /* + * Don't allow secondary CPU threads to come online + * while any KVM VMs exist. 
+ */ + inhibit_secondary_onlining(); + return 0; } void kvmppc_core_destroy_vm(struct kvm *kvm) { + uninhibit_secondary_onlining(); + if (kvm->arch.rma) { kvm_release_rma(kvm->arch.rma); kvm->arch.rma = NULL; -- cgit v0.10.2 From 7b444c6710c6c4994e31eb19216ce055836e65c4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Oct 2012 01:16:14 +0000 Subject: KVM: PPC: Book3S HV: Fix some races in starting secondary threads Subsequent patches implementing in-kernel XICS emulation will make it possible for IPIs to arrive at secondary threads at arbitrary times. This fixes some races in how we start the secondary threads, which if not fixed could lead to occasional crashes of the host kernel. This makes sure that (a) we have grabbed all the secondary threads, and verified that they are no longer in the kernel, before we start any thread, (b) that the secondary thread loads its vcpu pointer after clearing the IPI that woke it up (so we don't miss a wakeup), and (c) that the secondary thread clears its vcpu pointer before incrementing the nap count. It also removes unnecessary setting of the vcpu and vcore pointers in the paca in kvmppc_core_vcpu_load. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index c5ddf04..77dec0f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -64,8 +64,6 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvmppc_vcore *vc = vcpu->arch.vcore; - local_paca->kvm_hstate.kvm_vcpu = vcpu; - local_paca->kvm_hstate.kvm_vcore = vc; if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) vc->stolen_tb += mftb() - vc->preempt_tb; } @@ -880,6 +878,7 @@ static int kvmppc_grab_hwthread(int cpu) /* Ensure the thread won't go into the kernel if it wakes */ tpaca->kvm_hstate.hwthread_req = 1; + tpaca->kvm_hstate.kvm_vcpu = NULL; /* * If the thread is already executing in the kernel (e.g. handling @@ -929,7 +928,6 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu) smp_wmb(); #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) if (vcpu->arch.ptid) { - kvmppc_grab_hwthread(cpu); xics_wake_cpu(cpu); ++vc->n_woken; } @@ -955,7 +953,8 @@ static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc) /* * Check that we are on thread 0 and that any other threads in - * this core are off-line. + * this core are off-line. Then grab the threads so they can't + * enter the kernel. */ static int on_primary_thread(void) { @@ -967,6 +966,17 @@ static int on_primary_thread(void) while (++thr < threads_per_core) if (cpu_online(cpu + thr)) return 0; + + /* Grab all hw threads so they can't go into the kernel */ + for (thr = 1; thr < threads_per_core; ++thr) { + if (kvmppc_grab_hwthread(cpu + thr)) { + /* Couldn't grab one; let the others go */ + do { + kvmppc_release_hwthread(cpu + thr); + } while (--thr > 0); + return 0; + } + } return 1; } @@ -1015,16 +1025,6 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) } /* - * Make sure we are running on thread 0, and that - * secondary threads are offline. - */ - if (threads_per_core > 1 && !on_primary_thread()) { - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) - vcpu->arch.ret = -EBUSY; - goto out; - } - - /* * Assign physical thread IDs, first to non-ceded vcpus * and then to ceded ones. 
*/ @@ -1043,15 +1043,22 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) if (vcpu->arch.ceded) vcpu->arch.ptid = ptid++; + /* + * Make sure we are running on thread 0, and that + * secondary threads are offline. + */ + if (threads_per_core > 1 && !on_primary_thread()) { + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + vcpu->arch.ret = -EBUSY; + goto out; + } + vc->stolen_tb += mftb() - vc->preempt_tb; vc->pcpu = smp_processor_id(); list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { kvmppc_start_thread(vcpu); kvmppc_create_dtl_entry(vcpu, vc); } - /* Grab any remaining hw threads so they can't go into the kernel */ - for (i = ptid; i < threads_per_core; ++i) - kvmppc_grab_hwthread(vc->pcpu + i); preempt_disable(); spin_unlock(&vc->lock); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 44b72fe..1e90ef6 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -134,8 +134,11 @@ kvm_start_guest: 27: /* XXX should handle hypervisor maintenance interrupts etc. here */ + /* reload vcpu pointer after clearing the IPI */ + ld r4,HSTATE_KVM_VCPU(r13) + cmpdi r4,0 /* if we have no vcpu to run, go back to sleep */ - beq cr1,kvm_no_guest + beq kvm_no_guest /* were we napping due to cede? */ lbz r0,HSTATE_NAPPING(r13) @@ -1587,6 +1590,10 @@ secondary_too_late: .endr secondary_nap: + /* Clear our vcpu pointer so we don't come back in early */ + li r0, 0 + std r0, HSTATE_KVM_VCPU(r13) + lwsync /* Clear any pending IPI - assume we're a secondary thread */ ld r5, HSTATE_XICS_PHYS(r13) li r7, XICS_XIRR @@ -1612,8 +1619,6 @@ secondary_nap: kvm_no_guest: li r0, KVM_HWTHREAD_IN_NAP stb r0, HSTATE_HWTHREAD_STATE(r13) - li r0, 0 - std r0, HSTATE_KVM_VCPU(r13) li r3, LPCR_PECE0 mfspr r4, SPRN_LPCR -- cgit v0.10.2 From 913d3ff9a3c3a13c3115eb4b3265aa35a9e0a7ad Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Oct 2012 01:16:48 +0000 Subject: KVM: PPC: Book3s HV: Don't access runnable threads list without vcore lock There were a few places where we were traversing the list of runnable threads in a virtual core, i.e. vc->runnable_threads, without holding the vcore spinlock. This extends the places where we hold the vcore spinlock to cover everywhere that we traverse that list. Since we possibly need to sleep inside kvmppc_book3s_hv_page_fault, this moves the call of it from kvmppc_handle_exit out to kvmppc_vcpu_run, where we don't hold the vcore lock. In kvmppc_vcore_blocked, we don't actually need to check whether all vcpus are ceded and don't have any pending exceptions, since the caller has already done that. The caller (kvmppc_run_vcpu) wasn't actually checking for pending exceptions, so we add that. The change of if to while in kvmppc_run_vcpu is to make sure that we never call kvmppc_remove_runnable() when the vcore state is RUNNING or EXITING. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index 76fdcfe..aabcdba 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -118,6 +118,7 @@ #define RESUME_FLAG_NV (1<<0) /* Reload guest nonvolatile state? */ #define RESUME_FLAG_HOST (1<<1) /* Resume host? 
*/ +#define RESUME_FLAG_ARCH1 (1<<2) #define RESUME_GUEST 0 #define RESUME_GUEST_NV RESUME_FLAG_NV diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 77dec0f..3a737a4 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -57,6 +57,9 @@ /* #define EXIT_DEBUG_SIMPLE */ /* #define EXIT_DEBUG_INT */ +/* Used to indicate that a guest page fault needs to be handled */ +#define RESUME_PAGE_FAULT (RESUME_GUEST | RESUME_FLAG_ARCH1) + static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); @@ -431,7 +434,6 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, struct task_struct *tsk) { int r = RESUME_HOST; - int srcu_idx; vcpu->stat.sum_exits++; @@ -491,16 +493,12 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, * have been handled already. */ case BOOK3S_INTERRUPT_H_DATA_STORAGE: - srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvmppc_book3s_hv_page_fault(run, vcpu, - vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); - srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + r = RESUME_PAGE_FAULT; break; case BOOK3S_INTERRUPT_H_INST_STORAGE: - srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvmppc_book3s_hv_page_fault(run, vcpu, - kvmppc_get_pc(vcpu), 0); - srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + vcpu->arch.fault_dar = kvmppc_get_pc(vcpu); + vcpu->arch.fault_dsisr = 0; + r = RESUME_PAGE_FAULT; break; /* * This occurs if the guest executes an illegal instruction. @@ -984,22 +982,24 @@ static int on_primary_thread(void) * Run a set of guest threads on a physical core. * Called with vc->lock held. */ -static int kvmppc_run_core(struct kvmppc_vcore *vc) +static void kvmppc_run_core(struct kvmppc_vcore *vc) { struct kvm_vcpu *vcpu, *vcpu0, *vnext; long ret; u64 now; int ptid, i, need_vpa_update; int srcu_idx; + struct kvm_vcpu *vcpus_to_update[threads_per_core]; /* don't start if any threads have a signal pending */ need_vpa_update = 0; list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { if (signal_pending(vcpu->arch.run_task)) - return 0; - need_vpa_update |= vcpu->arch.vpa.update_pending | - vcpu->arch.slb_shadow.update_pending | - vcpu->arch.dtl.update_pending; + return; + if (vcpu->arch.vpa.update_pending || + vcpu->arch.slb_shadow.update_pending || + vcpu->arch.dtl.update_pending) + vcpus_to_update[need_vpa_update++] = vcpu; } /* @@ -1019,8 +1019,8 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) */ if (need_vpa_update) { spin_unlock(&vc->lock); - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) - kvmppc_update_vpas(vcpu); + for (i = 0; i < need_vpa_update; ++i) + kvmppc_update_vpas(vcpus_to_update[i]); spin_lock(&vc->lock); } @@ -1037,8 +1037,10 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) vcpu->arch.ptid = ptid++; } } - if (!vcpu0) - return 0; /* nothing to run */ + if (!vcpu0) { + vc->vcore_state = VCORE_INACTIVE; + return; /* nothing to run; should never happen */ + } list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) if (vcpu->arch.ceded) vcpu->arch.ptid = ptid++; @@ -1091,6 +1093,7 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) preempt_enable(); kvm_resched(vcpu); + spin_lock(&vc->lock); now = get_tb(); list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { /* cancel pending dec exception if dec is positive */ @@ -1114,7 +1117,6 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) } } - spin_lock(&vc->lock); out: vc->vcore_state = VCORE_INACTIVE; 
vc->preempt_tb = mftb(); @@ -1125,8 +1127,6 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) wake_up(&vcpu->arch.cpu_run); } } - - return 1; } /* @@ -1150,20 +1150,11 @@ static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state) static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) { DEFINE_WAIT(wait); - struct kvm_vcpu *v; - int all_idle = 1; prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE); vc->vcore_state = VCORE_SLEEPING; spin_unlock(&vc->lock); - list_for_each_entry(v, &vc->runnable_threads, arch.run_list) { - if (!v->arch.ceded || v->arch.pending_exceptions) { - all_idle = 0; - break; - } - } - if (all_idle) - schedule(); + schedule(); finish_wait(&vc->wq, &wait); spin_lock(&vc->lock); vc->vcore_state = VCORE_INACTIVE; @@ -1219,7 +1210,8 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) vc->runner = vcpu; n_ceded = 0; list_for_each_entry(v, &vc->runnable_threads, arch.run_list) - n_ceded += v->arch.ceded; + if (!v->arch.pending_exceptions) + n_ceded += v->arch.ceded; if (n_ceded == vc->n_runnable) kvmppc_vcore_blocked(vc); else @@ -1240,8 +1232,9 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } if (signal_pending(current)) { - if (vc->vcore_state == VCORE_RUNNING || - vc->vcore_state == VCORE_EXITING) { + while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && + (vc->vcore_state == VCORE_RUNNING || + vc->vcore_state == VCORE_EXITING)) { spin_unlock(&vc->lock); kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE); spin_lock(&vc->lock); @@ -1261,6 +1254,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) { int r; + int srcu_idx; if (!vcpu->arch.sane) { run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -1299,6 +1293,11 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) !(vcpu->arch.shregs.msr & MSR_PR)) { r = kvmppc_pseries_do_hcall(vcpu); kvmppc_core_prepare_to_enter(vcpu); + } else if (r == RESUME_PAGE_FAULT) { + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = kvmppc_book3s_hv_page_fault(run, vcpu, + vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); } } while (r == RESUME_GUEST); -- cgit v0.10.2 From 2f12f03436847e063cda8cc4c339ad84961cbf39 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Oct 2012 01:17:17 +0000 Subject: KVM: PPC: Book3S HV: Fixes for late-joining threads If a thread in a virtual core becomes runnable while other threads in the same virtual core are already running in the guest, it is possible for the latecomer to join the others on the core without first pulling them all out of the guest. Currently this only happens rarely, when a vcpu is first started. This fixes some bugs and omissions in the code in this case. First, we need to check for VPA updates for the latecomer and make a DTL entry for it. Secondly, if it comes along while the master vcpu is doing a VPA update, we don't need to do anything since the master will pick it up in kvmppc_run_core. To handle this correctly we introduce a new vcore state, VCORE_STARTING. Thirdly, there is a race because we currently clear the hardware thread's hwthread_req before waiting to see it get to nap. A latecomer thread could have its hwthread_req cleared before it gets to test it, and therefore never increment the nap_count, leading to messages about wait_for_nap timeouts. 
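
To make the third race concrete, here is a minimal C model of the corrected ordering. The names hwthread_req and nap_count mirror the patches in this series, but the struct, the fixed thread count of 8, and the busy-wait are illustrative assumptions, not the kernel implementation; the point is only that the request flag must stay set until every woken secondary has bumped the nap counter.

    #include <stdatomic.h>

    struct core_model {
            atomic_int hwthread_req[8];     /* set while KVM wants each hw thread (assumed layout) */
            atomic_int nap_count;           /* secondaries that have acknowledged and reached nap */
            int n_woken;                    /* secondaries the primary woke up */
    };

    /* Secondary hardware thread: only acknowledges a request it actually saw. */
    static void secondary_thread(struct core_model *c, int thr)
    {
            if (atomic_load(&c->hwthread_req[thr])) {
                    /* ... run guest work, then head back towards nap ... */
                    atomic_fetch_add(&c->nap_count, 1);
            }
    }

    /* Primary thread: wait for every acknowledgement before clearing the
     * request flags. In the buggy ordering the flags were cleared before
     * this wait; a late secondary could then find its flag already clear,
     * never increment nap_count, and the wait would time out. */
    static void primary_thread(struct core_model *c)
    {
            while (atomic_load(&c->nap_count) < c->n_woken)
                    ;       /* spin; the real code naps and uses barriers instead */
            for (int thr = 1; thr < 8; ++thr)
                    atomic_store(&c->hwthread_req[thr], 0);
    }
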
Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 68f5a30..218534d 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -289,9 +289,10 @@ struct kvmppc_vcore { /* Values for vcore_state */ #define VCORE_INACTIVE 0 -#define VCORE_RUNNING 1 -#define VCORE_EXITING 2 -#define VCORE_SLEEPING 3 +#define VCORE_SLEEPING 1 +#define VCORE_STARTING 2 +#define VCORE_RUNNING 3 +#define VCORE_EXITING 4 /* * Struct used to manage memory for a virtual processor area diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 3a737a4..89995fa 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -336,6 +336,11 @@ static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap) static void kvmppc_update_vpas(struct kvm_vcpu *vcpu) { + if (!(vcpu->arch.vpa.update_pending || + vcpu->arch.slb_shadow.update_pending || + vcpu->arch.dtl.update_pending)) + return; + spin_lock(&vcpu->arch.vpa_update_lock); if (vcpu->arch.vpa.update_pending) { kvmppc_update_vpa(vcpu, &vcpu->arch.vpa); @@ -1009,7 +1014,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) vc->n_woken = 0; vc->nap_count = 0; vc->entry_exit_count = 0; - vc->vcore_state = VCORE_RUNNING; + vc->vcore_state = VCORE_STARTING; vc->in_guest = 0; vc->napping_threads = 0; @@ -1062,6 +1067,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) kvmppc_create_dtl_entry(vcpu, vc); } + vc->vcore_state = VCORE_RUNNING; preempt_disable(); spin_unlock(&vc->lock); @@ -1070,8 +1076,6 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) srcu_idx = srcu_read_lock(&vcpu0->kvm->srcu); __kvmppc_vcore_entry(NULL, vcpu0); - for (i = 0; i < threads_per_core; ++i) - kvmppc_release_hwthread(vc->pcpu + i); spin_lock(&vc->lock); /* disable sending of IPIs on virtual external irqs */ @@ -1080,6 +1084,8 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) /* wait for secondary threads to finish writing their state to memory */ if (vc->nap_count < vc->n_woken) kvmppc_wait_for_nap(vc); + for (i = 0; i < threads_per_core; ++i) + kvmppc_release_hwthread(vc->pcpu + i); /* prevent other vcpu threads from doing kvmppc_start_thread() now */ vc->vcore_state = VCORE_EXITING; spin_unlock(&vc->lock); @@ -1170,6 +1176,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvm_run->exit_reason = 0; vcpu->arch.ret = RESUME_GUEST; vcpu->arch.trap = 0; + kvmppc_update_vpas(vcpu); /* * Synchronize with other threads in this virtual core @@ -1193,6 +1200,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if (vc->vcore_state == VCORE_RUNNING && VCORE_EXIT_COUNT(vc) == 0) { vcpu->arch.ptid = vc->n_runnable - 1; + kvmppc_create_dtl_entry(vcpu, vc); kvmppc_start_thread(vcpu); } -- cgit v0.10.2 From 8455d79e2163997e479931b8d5b7e60a92cd2b86 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Oct 2012 01:17:42 +0000 Subject: KVM: PPC: Book3S HV: Run virtual core whenever any vcpus in it can run Currently the Book3S HV code implements a policy on multi-threaded processors (i.e. POWER7) that requires all of the active vcpus in a virtual core to be ready to run before we run the virtual core. However, that causes problems on reset, because reset stops all vcpus except vcpu 0, and can also reduce throughput since all four threads in a virtual core have to wait whenever any one of them hits a hypervisor page fault. 
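
As a rough illustration of the two policies being contrasted here, the sketch below shows only the gating check; n_runnable follows the vcore bookkeeping in these patches, while n_active is a hypothetical stand-in for what the existing code tracks via n_busy, and neither helper is the real kvmppc_run_vcpu() logic.

    struct vcore_model {
            int n_active;           /* vcpus active in this virtual core (hypothetical field) */
            int n_runnable;         /* vcpus currently inside the KVM_VCPU_RUN ioctl */
    };

    /* Old policy: the core runs only when every active vcpu is ready. */
    static int vcore_should_run_old(const struct vcore_model *vc)
    {
            return vc->n_runnable > 0 && vc->n_runnable == vc->n_active;
    }

    /* Relaxed policy: the core runs as soon as any vcpu is runnable. */
    static int vcore_should_run_new(const struct vcore_model *vc)
    {
            return vc->n_runnable > 0;
    }
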
This relaxes the policy, allowing the virtual core to run as soon as any vcpu in it is runnable. With this, the KVMPPC_VCPU_STOPPED state and the KVMPPC_VCPU_BUSY_IN_HOST state have been combined into a single KVMPPC_VCPU_NOTREADY state, since we no longer need to distinguish between them. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 218534d..1e8cbd1 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -563,9 +563,8 @@ struct kvm_vcpu_arch { }; /* Values for vcpu->arch.state */ -#define KVMPPC_VCPU_STOPPED 0 -#define KVMPPC_VCPU_BUSY_IN_HOST 1 -#define KVMPPC_VCPU_RUNNABLE 2 +#define KVMPPC_VCPU_NOTREADY 0 +#define KVMPPC_VCPU_RUNNABLE 1 /* Values for vcpu->arch.io_gpr */ #define KVM_MMIO_REG_MASK 0x001f diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 89995fa..61d2934 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -776,10 +776,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) kvmppc_mmu_book3s_hv_init(vcpu); - /* - * We consider the vcpu stopped until we see the first run ioctl for it. - */ - vcpu->arch.state = KVMPPC_VCPU_STOPPED; + vcpu->arch.state = KVMPPC_VCPU_NOTREADY; init_waitqueue_head(&vcpu->arch.cpu_run); @@ -866,9 +863,8 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, { if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) return; - vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; + vcpu->arch.state = KVMPPC_VCPU_NOTREADY; --vc->n_runnable; - ++vc->n_busy; list_del(&vcpu->arch.run_list); } @@ -1169,7 +1165,6 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int n_ceded; - int prev_state; struct kvmppc_vcore *vc; struct kvm_vcpu *v, *vn; @@ -1186,7 +1181,6 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) vcpu->arch.ceded = 0; vcpu->arch.run_task = current; vcpu->arch.kvm_run = kvm_run; - prev_state = vcpu->arch.state; vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads); ++vc->n_runnable; @@ -1196,35 +1190,26 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) * If the vcore is already running, we may be able to start * this thread straight away and have it join in. 
*/ - if (prev_state == KVMPPC_VCPU_STOPPED) { + if (!signal_pending(current)) { if (vc->vcore_state == VCORE_RUNNING && VCORE_EXIT_COUNT(vc) == 0) { vcpu->arch.ptid = vc->n_runnable - 1; kvmppc_create_dtl_entry(vcpu, vc); kvmppc_start_thread(vcpu); + } else if (vc->vcore_state == VCORE_SLEEPING) { + wake_up(&vc->wq); } - } else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST) - --vc->n_busy; + } while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && !signal_pending(current)) { - if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) { + if (vc->vcore_state != VCORE_INACTIVE) { spin_unlock(&vc->lock); kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE); spin_lock(&vc->lock); continue; } - vc->runner = vcpu; - n_ceded = 0; - list_for_each_entry(v, &vc->runnable_threads, arch.run_list) - if (!v->arch.pending_exceptions) - n_ceded += v->arch.ceded; - if (n_ceded == vc->n_runnable) - kvmppc_vcore_blocked(vc); - else - kvmppc_run_core(vc); - list_for_each_entry_safe(v, vn, &vc->runnable_threads, arch.run_list) { kvmppc_core_prepare_to_enter(v); @@ -1236,23 +1221,40 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) wake_up(&v->arch.cpu_run); } } + if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) + break; + vc->runner = vcpu; + n_ceded = 0; + list_for_each_entry(v, &vc->runnable_threads, arch.run_list) + if (!v->arch.pending_exceptions) + n_ceded += v->arch.ceded; + if (n_ceded == vc->n_runnable) + kvmppc_vcore_blocked(vc); + else + kvmppc_run_core(vc); vc->runner = NULL; } - if (signal_pending(current)) { - while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && - (vc->vcore_state == VCORE_RUNNING || - vc->vcore_state == VCORE_EXITING)) { - spin_unlock(&vc->lock); - kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE); - spin_lock(&vc->lock); - } - if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { - kvmppc_remove_runnable(vc, vcpu); - vcpu->stat.signal_exits++; - kvm_run->exit_reason = KVM_EXIT_INTR; - vcpu->arch.ret = -EINTR; - } + while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && + (vc->vcore_state == VCORE_RUNNING || + vc->vcore_state == VCORE_EXITING)) { + spin_unlock(&vc->lock); + kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE); + spin_lock(&vc->lock); + } + + if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { + kvmppc_remove_runnable(vc, vcpu); + vcpu->stat.signal_exits++; + kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->arch.ret = -EINTR; + } + + if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) { + /* Wake up some vcpu to run the core */ + v = list_first_entry(&vc->runnable_threads, + struct kvm_vcpu, arch.run_list); + wake_up(&v->arch.cpu_run); } spin_unlock(&vc->lock); -- cgit v0.10.2 From c7b676709c163e12ec161c0593c2c76809c25ff4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Oct 2012 01:18:07 +0000 Subject: KVM: PPC: Book3S HV: Fix accounting of stolen time Currently the code that accounts stolen time tends to overestimate the stolen time, and will sometimes report more stolen time in a DTL (dispatch trace log) entry than has elapsed since the last DTL entry. This can cause guests to underflow the user or system time measured for some tasks, leading to ridiculous CPU percentages and total runtimes being reported by top and other utilities. In addition, the current code was designed for the previous policy where a vcore would only run when all the vcpus in it were runnable, and so only counted stolen time on a per-vcore basis. Now that a vcore can run while some of the vcpus in it are doing other things in the kernel (e.g. 
handling a page fault), we need to count the time when a vcpu task is preempted while it is not running as part of a vcore as stolen also. To do this, we bring back the BUSY_IN_HOST vcpu state and extend the vcpu_load/put functions to count preemption time while the vcpu is in that state. Handling the transitions between the RUNNING and BUSY_IN_HOST states requires checking and updating two variables (accumulated time stolen and time last preempted), so we add a new spinlock, vcpu->arch.tbacct_lock. This protects both the per-vcpu stolen/preempt-time variables, and the per-vcore variables while this vcpu is running the vcore. Finally, we now don't count time spent in userspace as stolen time. The task could be executing in userspace on behalf of the vcpu, or it could be preempted, or the vcpu could be genuinely stopped. Since we have no way of dividing up the time between these cases, we don't count any of it as stolen. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 1e8cbd1..3093896 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -559,12 +559,17 @@ struct kvm_vcpu_arch { unsigned long dtl_index; u64 stolen_logged; struct kvmppc_vpa slb_shadow; + + spinlock_t tbacct_lock; + u64 busy_stolen; + u64 busy_preempt; #endif }; /* Values for vcpu->arch.state */ #define KVMPPC_VCPU_NOTREADY 0 #define KVMPPC_VCPU_RUNNABLE 1 +#define KVMPPC_VCPU_BUSY_IN_HOST 2 /* Values for vcpu->arch.io_gpr */ #define KVM_MMIO_REG_MASK 0x001f diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 61d2934..8b3c470 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -60,23 +60,74 @@ /* Used to indicate that a guest page fault needs to be handled */ #define RESUME_PAGE_FAULT (RESUME_GUEST | RESUME_FLAG_ARCH1) +/* Used as a "null" value for timebase values */ +#define TB_NIL (~(u64)0) + static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); +/* + * We use the vcpu_load/put functions to measure stolen time. + * Stolen time is counted as time when either the vcpu is able to + * run as part of a virtual core, but the task running the vcore + * is preempted or sleeping, or when the vcpu needs something done + * in the kernel by the task running the vcpu, but that task is + * preempted or sleeping. Those two things have to be counted + * separately, since one of the vcpu tasks will take on the job + * of running the core, and the other vcpu tasks in the vcore will + * sleep waiting for it to do that, but that sleep shouldn't count + * as stolen time. + * + * Hence we accumulate stolen time when the vcpu can run as part of + * a vcore using vc->stolen_tb, and the stolen time when the vcpu + * needs its task to do other things in the kernel (for example, + * service a page fault) in busy_stolen. We don't accumulate + * stolen time for a vcore when it is inactive, or for a vcpu + * when it is in state RUNNING or NOTREADY. NOTREADY is a bit of + * a misnomer; it means that the vcpu task is not executing in + * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in + * the kernel. We don't have any way of dividing up that time + * between time that the vcpu is genuinely stopped, time that + * the task is actively working on behalf of the vcpu, and time + * that the task is preempted, so we don't count any of it as + * stolen. 
+ * + * Updates to busy_stolen are protected by arch.tbacct_lock; + * updates to vc->stolen_tb are protected by the arch.tbacct_lock + * of the vcpu that has taken responsibility for running the vcore + * (i.e. vc->runner). The stolen times are measured in units of + * timebase ticks. (Note that the != TB_NIL checks below are + * purely defensive; they should never fail.) + */ + void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvmppc_vcore *vc = vcpu->arch.vcore; - if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) + spin_lock(&vcpu->arch.tbacct_lock); + if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE && + vc->preempt_tb != TB_NIL) { vc->stolen_tb += mftb() - vc->preempt_tb; + vc->preempt_tb = TB_NIL; + } + if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST && + vcpu->arch.busy_preempt != TB_NIL) { + vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt; + vcpu->arch.busy_preempt = TB_NIL; + } + spin_unlock(&vcpu->arch.tbacct_lock); } void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) { struct kvmppc_vcore *vc = vcpu->arch.vcore; + spin_lock(&vcpu->arch.tbacct_lock); if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) vc->preempt_tb = mftb(); + if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST) + vcpu->arch.busy_preempt = mftb(); + spin_unlock(&vcpu->arch.tbacct_lock); } void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) @@ -357,24 +408,61 @@ static void kvmppc_update_vpas(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->arch.vpa_update_lock); } +/* + * Return the accumulated stolen time for the vcore up until `now'. + * The caller should hold the vcore lock. + */ +static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now) +{ + u64 p; + + /* + * If we are the task running the vcore, then since we hold + * the vcore lock, we can't be preempted, so stolen_tb/preempt_tb + * can't be updated, so we don't need the tbacct_lock. + * If the vcore is inactive, it can't become active (since we + * hold the vcore lock), so the vcpu load/put functions won't + * update stolen_tb/preempt_tb, and we don't need tbacct_lock. 
+ */ + if (vc->vcore_state != VCORE_INACTIVE && + vc->runner->arch.run_task != current) { + spin_lock(&vc->runner->arch.tbacct_lock); + p = vc->stolen_tb; + if (vc->preempt_tb != TB_NIL) + p += now - vc->preempt_tb; + spin_unlock(&vc->runner->arch.tbacct_lock); + } else { + p = vc->stolen_tb; + } + return p; +} + static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) { struct dtl_entry *dt; struct lppaca *vpa; - unsigned long old_stolen; + unsigned long stolen; + unsigned long core_stolen; + u64 now; dt = vcpu->arch.dtl_ptr; vpa = vcpu->arch.vpa.pinned_addr; - old_stolen = vcpu->arch.stolen_logged; - vcpu->arch.stolen_logged = vc->stolen_tb; + now = mftb(); + core_stolen = vcore_stolen_time(vc, now); + stolen = core_stolen - vcpu->arch.stolen_logged; + vcpu->arch.stolen_logged = core_stolen; + spin_lock(&vcpu->arch.tbacct_lock); + stolen += vcpu->arch.busy_stolen; + vcpu->arch.busy_stolen = 0; + spin_unlock(&vcpu->arch.tbacct_lock); if (!dt || !vpa) return; memset(dt, 0, sizeof(struct dtl_entry)); dt->dispatch_reason = 7; dt->processor_id = vc->pcpu + vcpu->arch.ptid; - dt->timebase = mftb(); - dt->enqueue_to_dispatch_time = vc->stolen_tb - old_stolen; + dt->timebase = now; + dt->enqueue_to_dispatch_time = stolen; dt->srr0 = kvmppc_get_pc(vcpu); dt->srr1 = vcpu->arch.shregs.msr; ++dt; @@ -773,6 +861,8 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) vcpu->arch.pvr = mfspr(SPRN_PVR); kvmppc_set_pvr(vcpu, vcpu->arch.pvr); spin_lock_init(&vcpu->arch.vpa_update_lock); + spin_lock_init(&vcpu->arch.tbacct_lock); + vcpu->arch.busy_preempt = TB_NIL; kvmppc_mmu_book3s_hv_init(vcpu); @@ -788,7 +878,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) INIT_LIST_HEAD(&vcore->runnable_threads); spin_lock_init(&vcore->lock); init_waitqueue_head(&vcore->wq); - vcore->preempt_tb = mftb(); + vcore->preempt_tb = TB_NIL; } kvm->arch.vcores[core] = vcore; } @@ -801,7 +891,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) ++vcore->num_threads; spin_unlock(&vcore->lock); vcpu->arch.vcore = vcore; - vcpu->arch.stolen_logged = vcore->stolen_tb; vcpu->arch.cpu_type = KVM_CPU_3S_64; kvmppc_sanity_check(vcpu); @@ -861,9 +950,17 @@ extern void xics_wake_cpu(int cpu); static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, struct kvm_vcpu *vcpu) { + u64 now; + if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) return; - vcpu->arch.state = KVMPPC_VCPU_NOTREADY; + spin_lock(&vcpu->arch.tbacct_lock); + now = mftb(); + vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) - + vcpu->arch.stolen_logged; + vcpu->arch.busy_preempt = now; + vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; + spin_unlock(&vcpu->arch.tbacct_lock); --vc->n_runnable; list_del(&vcpu->arch.run_list); } @@ -1038,10 +1135,8 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) vcpu->arch.ptid = ptid++; } } - if (!vcpu0) { - vc->vcore_state = VCORE_INACTIVE; - return; /* nothing to run; should never happen */ - } + if (!vcpu0) + goto out; /* nothing to run; should never happen */ list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) if (vcpu->arch.ceded) vcpu->arch.ptid = ptid++; @@ -1056,7 +1151,6 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) goto out; } - vc->stolen_tb += mftb() - vc->preempt_tb; vc->pcpu = smp_processor_id(); list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { kvmppc_start_thread(vcpu); @@ -1121,7 +1215,6 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) out: vc->vcore_state = 
VCORE_INACTIVE; - vc->preempt_tb = mftb(); list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, arch.run_list) { if (vcpu->arch.ret != RESUME_GUEST) { @@ -1181,7 +1274,9 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) vcpu->arch.ceded = 0; vcpu->arch.run_task = current; vcpu->arch.kvm_run = kvm_run; + vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb()); vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; + vcpu->arch.busy_preempt = TB_NIL; list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads); ++vc->n_runnable; @@ -1295,6 +1390,7 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) flush_vsx_to_thread(current); vcpu->arch.wqp = &vcpu->arch.vcore->wq; vcpu->arch.pgdir = current->mm->pgd; + vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; do { r = kvmppc_run_vcpu(run, vcpu); @@ -1312,6 +1408,7 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) } while (r == RESUME_GUEST); out: + vcpu->arch.state = KVMPPC_VCPU_NOTREADY; atomic_dec(&vcpu->kvm->arch.vcpus_running); return r; } -- cgit v0.10.2 From 9f8c8c7812976fbfaf4bb30aaf8f6b001864f20a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Oct 2012 01:18:37 +0000 Subject: KVM: PPC: Book3S HV: Allow DTL to be set to address 0, length 0 Commit 55b665b026 ("KVM: PPC: Book3S HV: Provide a way for userspace to get/set per-vCPU areas") includes a check on the length of the dispatch trace log (DTL) to make sure the buffer is at least one entry long. This is appropriate when registering a buffer, but the interface also allows for any existing buffer to be unregistered by specifying a zero address. In this case the length check is not appropriate. This makes the check conditional on the address being non-zero. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 8b3c470..812764c 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -811,9 +811,8 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) addr = val->vpaval.addr; len = val->vpaval.length; r = -EINVAL; - if (len < sizeof(struct dtl_entry)) - break; - if (addr && !vcpu->arch.vpa.next_gpa) + if (addr && (len < sizeof(struct dtl_entry) || + !vcpu->arch.vpa.next_gpa)) break; len -= len % sizeof(struct dtl_entry); r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len); -- cgit v0.10.2 From 8b5869ad85f703ffeb25e656eab826f6b85b984c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Oct 2012 01:20:50 +0000 Subject: KVM: PPC: Book3S HV: Fix thinko in try_lock_hpte() This fixes an error in the inline asm in try_lock_hpte() where we were erroneously using a register number as an immediate operand. The bug only affects an error path, and in fact the code will still work as long as the compiler chooses some register other than r0 for the "bits" variable. Nevertheless it should still be fixed. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 0dd1d86..1472a5b 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -60,7 +60,7 @@ static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits) " ori %0,%0,%4\n" " stdcx. 
%0,0,%2\n" " beq+ 2f\n" - " li %1,%3\n" + " mr %1,%3\n" "2: isync" : "=&r" (tmp), "=&r" (old) : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK) -- cgit v0.10.2 From 63a1909190a3baa0abb9463ef28c8d8c969be951 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Wed, 31 Oct 2012 13:37:59 +0100 Subject: PPC: ePAPR: Convert hcall header to uapi (round 2) The new uapi framework splits kernel internal and user space exported bits of header files more cleanly. Adjust the ePAPR header accordingly. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h new file mode 100644 index 0000000..d3d6342 --- /dev/null +++ b/arch/powerpc/include/asm/epapr_hcalls.h @@ -0,0 +1,458 @@ +/* + * ePAPR hcall interface + * + * Copyright 2008-2011 Freescale Semiconductor, Inc. + * + * Author: Timur Tabi + * + * This file is provided under a dual BSD/GPL license. When using or + * redistributing this file, you may do so under either license. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* A "hypercall" is an "sc 1" instruction. This header file file provides C + * wrapper functions for the ePAPR hypervisor interface. It is inteded + * for use by Linux device drivers and other operating systems. + * + * The hypercalls are implemented as inline assembly, rather than assembly + * language functions in a .S file, for optimization. It allows + * the caller to issue the hypercall instruction directly, improving both + * performance and memory footprint. + */ + +#ifndef _EPAPR_HCALLS_H +#define _EPAPR_HCALLS_H + +#include + +#ifndef __ASSEMBLY__ +#include +#include +#include + +/* + * Hypercall register clobber list + * + * These macros are used to define the list of clobbered registers during a + * hypercall. 
Technically, registers r0 and r3-r12 are always clobbered, + * but the gcc inline assembly syntax does not allow us to specify registers + * on the clobber list that are also on the input/output list. Therefore, + * the lists of clobbered registers depends on the number of register + * parmeters ("+r" and "=r") passed to the hypercall. + * + * Each assembly block should use one of the HCALL_CLOBBERSx macros. As a + * general rule, 'x' is the number of parameters passed to the assembly + * block *except* for r11. + * + * If you're not sure, just use the smallest value of 'x' that does not + * generate a compilation error. Because these are static inline functions, + * the compiler will only check the clobber list for a function if you + * compile code that calls that function. + * + * r3 and r11 are not included in any clobbers list because they are always + * listed as output registers. + * + * XER, CTR, and LR are currently listed as clobbers because it's uncertain + * whether they will be clobbered. + * + * Note that r11 can be used as an output parameter. + * + * The "memory" clobber is only necessary for hcalls where the Hypervisor + * will read or write guest memory. However, we add it to all hcalls because + * the impact is minimal, and we want to ensure that it's present for the + * hcalls that need it. +*/ + +/* List of common clobbered registers. Do not use this macro. */ +#define EV_HCALL_CLOBBERS "r0", "r12", "xer", "ctr", "lr", "cc", "memory" + +#define EV_HCALL_CLOBBERS8 EV_HCALL_CLOBBERS +#define EV_HCALL_CLOBBERS7 EV_HCALL_CLOBBERS8, "r10" +#define EV_HCALL_CLOBBERS6 EV_HCALL_CLOBBERS7, "r9" +#define EV_HCALL_CLOBBERS5 EV_HCALL_CLOBBERS6, "r8" +#define EV_HCALL_CLOBBERS4 EV_HCALL_CLOBBERS5, "r7" +#define EV_HCALL_CLOBBERS3 EV_HCALL_CLOBBERS4, "r6" +#define EV_HCALL_CLOBBERS2 EV_HCALL_CLOBBERS3, "r5" +#define EV_HCALL_CLOBBERS1 EV_HCALL_CLOBBERS2, "r4" + +extern bool epapr_paravirt_enabled; +extern u32 epapr_hypercall_start[]; + +/* + * We use "uintptr_t" to define a register because it's guaranteed to be a + * 32-bit integer on a 32-bit platform, and a 64-bit integer on a 64-bit + * platform. + * + * All registers are either input/output or output only. Registers that are + * initialized before making the hypercall are input/output. All + * input/output registers are represented with "+r". Output-only registers + * are represented with "=r". Do not specify any unused registers. The + * clobber list will tell the compiler that the hypercall modifies those + * registers, which is good enough. + */ + +/** + * ev_int_set_config - configure the specified interrupt + * @interrupt: the interrupt number + * @config: configuration for this interrupt + * @priority: interrupt priority + * @destination: destination CPU number + * + * Returns 0 for success, or an error code. 
+ */ +static inline unsigned int ev_int_set_config(unsigned int interrupt, + uint32_t config, unsigned int priority, uint32_t destination) +{ + register uintptr_t r11 __asm__("r11"); + register uintptr_t r3 __asm__("r3"); + register uintptr_t r4 __asm__("r4"); + register uintptr_t r5 __asm__("r5"); + register uintptr_t r6 __asm__("r6"); + + r11 = EV_HCALL_TOKEN(EV_INT_SET_CONFIG); + r3 = interrupt; + r4 = config; + r5 = priority; + r6 = destination; + + asm volatile("bl epapr_hypercall_start" + : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6) + : : EV_HCALL_CLOBBERS4 + ); + + return r3; +} + +/** + * ev_int_get_config - return the config of the specified interrupt + * @interrupt: the interrupt number + * @config: returned configuration for this interrupt + * @priority: returned interrupt priority + * @destination: returned destination CPU number + * + * Returns 0 for success, or an error code. + */ +static inline unsigned int ev_int_get_config(unsigned int interrupt, + uint32_t *config, unsigned int *priority, uint32_t *destination) +{ + register uintptr_t r11 __asm__("r11"); + register uintptr_t r3 __asm__("r3"); + register uintptr_t r4 __asm__("r4"); + register uintptr_t r5 __asm__("r5"); + register uintptr_t r6 __asm__("r6"); + + r11 = EV_HCALL_TOKEN(EV_INT_GET_CONFIG); + r3 = interrupt; + + asm volatile("bl epapr_hypercall_start" + : "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5), "=r" (r6) + : : EV_HCALL_CLOBBERS4 + ); + + *config = r4; + *priority = r5; + *destination = r6; + + return r3; +} + +/** + * ev_int_set_mask - sets the mask for the specified interrupt source + * @interrupt: the interrupt number + * @mask: 0=enable interrupts, 1=disable interrupts + * + * Returns 0 for success, or an error code. + */ +static inline unsigned int ev_int_set_mask(unsigned int interrupt, + unsigned int mask) +{ + register uintptr_t r11 __asm__("r11"); + register uintptr_t r3 __asm__("r3"); + register uintptr_t r4 __asm__("r4"); + + r11 = EV_HCALL_TOKEN(EV_INT_SET_MASK); + r3 = interrupt; + r4 = mask; + + asm volatile("bl epapr_hypercall_start" + : "+r" (r11), "+r" (r3), "+r" (r4) + : : EV_HCALL_CLOBBERS2 + ); + + return r3; +} + +/** + * ev_int_get_mask - returns the mask for the specified interrupt source + * @interrupt: the interrupt number + * @mask: returned mask for this interrupt (0=enabled, 1=disabled) + * + * Returns 0 for success, or an error code. + */ +static inline unsigned int ev_int_get_mask(unsigned int interrupt, + unsigned int *mask) +{ + register uintptr_t r11 __asm__("r11"); + register uintptr_t r3 __asm__("r3"); + register uintptr_t r4 __asm__("r4"); + + r11 = EV_HCALL_TOKEN(EV_INT_GET_MASK); + r3 = interrupt; + + asm volatile("bl epapr_hypercall_start" + : "+r" (r11), "+r" (r3), "=r" (r4) + : : EV_HCALL_CLOBBERS2 + ); + + *mask = r4; + + return r3; +} + +/** + * ev_int_eoi - signal the end of interrupt processing + * @interrupt: the interrupt number + * + * This function signals the end of processing for the the specified + * interrupt, which must be the interrupt currently in service. By + * definition, this is also the highest-priority interrupt. + * + * Returns 0 for success, or an error code. 
+ */ +static inline unsigned int ev_int_eoi(unsigned int interrupt) +{ + register uintptr_t r11 __asm__("r11"); + register uintptr_t r3 __asm__("r3"); + + r11 = EV_HCALL_TOKEN(EV_INT_EOI); + r3 = interrupt; + + asm volatile("bl epapr_hypercall_start" + : "+r" (r11), "+r" (r3) + : : EV_HCALL_CLOBBERS1 + ); + + return r3; +} + +/** + * ev_byte_channel_send - send characters to a byte stream + * @handle: byte stream handle + * @count: (input) num of chars to send, (output) num chars sent + * @buffer: pointer to a 16-byte buffer + * + * @buffer must be at least 16 bytes long, because all 16 bytes will be + * read from memory into registers, even if count < 16. + * + * Returns 0 for success, or an error code. + */ +static inline unsigned int ev_byte_channel_send(unsigned int handle, + unsigned int *count, const char buffer[EV_BYTE_CHANNEL_MAX_BYTES]) +{ + register uintptr_t r11 __asm__("r11"); + register uintptr_t r3 __asm__("r3"); + register uintptr_t r4 __asm__("r4"); + register uintptr_t r5 __asm__("r5"); + register uintptr_t r6 __asm__("r6"); + register uintptr_t r7 __asm__("r7"); + register uintptr_t r8 __asm__("r8"); + const uint32_t *p = (const uint32_t *) buffer; + + r11 = EV_HCALL_TOKEN(EV_BYTE_CHANNEL_SEND); + r3 = handle; + r4 = *count; + r5 = be32_to_cpu(p[0]); + r6 = be32_to_cpu(p[1]); + r7 = be32_to_cpu(p[2]); + r8 = be32_to_cpu(p[3]); + + asm volatile("bl epapr_hypercall_start" + : "+r" (r11), "+r" (r3), + "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7), "+r" (r8) + : : EV_HCALL_CLOBBERS6 + ); + + *count = r4; + + return r3; +} + +/** + * ev_byte_channel_receive - fetch characters from a byte channel + * @handle: byte channel handle + * @count: (input) max num of chars to receive, (output) num chars received + * @buffer: pointer to a 16-byte buffer + * + * The size of @buffer must be at least 16 bytes, even if you request fewer + * than 16 characters, because we always write 16 bytes to @buffer. This is + * for performance reasons. + * + * Returns 0 for success, or an error code. + */ +static inline unsigned int ev_byte_channel_receive(unsigned int handle, + unsigned int *count, char buffer[EV_BYTE_CHANNEL_MAX_BYTES]) +{ + register uintptr_t r11 __asm__("r11"); + register uintptr_t r3 __asm__("r3"); + register uintptr_t r4 __asm__("r4"); + register uintptr_t r5 __asm__("r5"); + register uintptr_t r6 __asm__("r6"); + register uintptr_t r7 __asm__("r7"); + register uintptr_t r8 __asm__("r8"); + uint32_t *p = (uint32_t *) buffer; + + r11 = EV_HCALL_TOKEN(EV_BYTE_CHANNEL_RECEIVE); + r3 = handle; + r4 = *count; + + asm volatile("bl epapr_hypercall_start" + : "+r" (r11), "+r" (r3), "+r" (r4), + "=r" (r5), "=r" (r6), "=r" (r7), "=r" (r8) + : : EV_HCALL_CLOBBERS6 + ); + + *count = r4; + p[0] = cpu_to_be32(r5); + p[1] = cpu_to_be32(r6); + p[2] = cpu_to_be32(r7); + p[3] = cpu_to_be32(r8); + + return r3; +} + +/** + * ev_byte_channel_poll - returns the status of the byte channel buffers + * @handle: byte channel handle + * @rx_count: returned count of bytes in receive queue + * @tx_count: returned count of free space in transmit queue + * + * This function reports the amount of data in the receive queue (i.e. the + * number of bytes you can read), and the amount of free space in the transmit + * queue (i.e. the number of bytes you can write). + * + * Returns 0 for success, or an error code. 
+ */ +static inline unsigned int ev_byte_channel_poll(unsigned int handle, + unsigned int *rx_count, unsigned int *tx_count) +{ + register uintptr_t r11 __asm__("r11"); + register uintptr_t r3 __asm__("r3"); + register uintptr_t r4 __asm__("r4"); + register uintptr_t r5 __asm__("r5"); + + r11 = EV_HCALL_TOKEN(EV_BYTE_CHANNEL_POLL); + r3 = handle; + + asm volatile("bl epapr_hypercall_start" + : "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5) + : : EV_HCALL_CLOBBERS3 + ); + + *rx_count = r4; + *tx_count = r5; + + return r3; +} + +/** + * ev_int_iack - acknowledge an interrupt + * @handle: handle to the target interrupt controller + * @vector: returned interrupt vector + * + * If handle is zero, the function returns the next interrupt source + * number to be handled irrespective of the hierarchy or cascading + * of interrupt controllers. If non-zero, specifies a handle to the + * interrupt controller that is the target of the acknowledge. + * + * Returns 0 for success, or an error code. + */ +static inline unsigned int ev_int_iack(unsigned int handle, + unsigned int *vector) +{ + register uintptr_t r11 __asm__("r11"); + register uintptr_t r3 __asm__("r3"); + register uintptr_t r4 __asm__("r4"); + + r11 = EV_HCALL_TOKEN(EV_INT_IACK); + r3 = handle; + + asm volatile("bl epapr_hypercall_start" + : "+r" (r11), "+r" (r3), "=r" (r4) + : : EV_HCALL_CLOBBERS2 + ); + + *vector = r4; + + return r3; +} + +/** + * ev_doorbell_send - send a doorbell to another partition + * @handle: doorbell send handle + * + * Returns 0 for success, or an error code. + */ +static inline unsigned int ev_doorbell_send(unsigned int handle) +{ + register uintptr_t r11 __asm__("r11"); + register uintptr_t r3 __asm__("r3"); + + r11 = EV_HCALL_TOKEN(EV_DOORBELL_SEND); + r3 = handle; + + asm volatile("bl epapr_hypercall_start" + : "+r" (r11), "+r" (r3) + : : EV_HCALL_CLOBBERS1 + ); + + return r3; +} + +/** + * ev_idle -- wait for next interrupt on this core + * + * Returns 0 for success, or an error code. + */ +static inline unsigned int ev_idle(void) +{ + register uintptr_t r11 __asm__("r11"); + register uintptr_t r3 __asm__("r3"); + + r11 = EV_HCALL_TOKEN(EV_IDLE); + + asm volatile("bl epapr_hypercall_start" + : "+r" (r11), "=r" (r3) + : : EV_HCALL_CLOBBERS1 + ); + + return r3; +} +#endif /* !__ASSEMBLY__ */ +#endif /* _EPAPR_HCALLS_H */ diff --git a/arch/powerpc/include/uapi/asm/epapr_hcalls.h b/arch/powerpc/include/uapi/asm/epapr_hcalls.h index 58997af..7f9c74b 100644 --- a/arch/powerpc/include/uapi/asm/epapr_hcalls.h +++ b/arch/powerpc/include/uapi/asm/epapr_hcalls.h @@ -37,422 +37,62 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -/* A "hypercall" is an "sc 1" instruction. This header file file provides C - * wrapper functions for the ePAPR hypervisor interface. It is inteded - * for use by Linux device drivers and other operating systems. - * - * The hypercalls are implemented as inline assembly, rather than assembly - * language functions in a .S file, for optimization. It allows - * the caller to issue the hypercall instruction directly, improving both - * performance and memory footprint. - */ - -#ifndef _EPAPR_HCALLS_H -#define _EPAPR_HCALLS_H - -#include - -#ifndef __ASSEMBLY__ -#include -#include -#include - -/* - * Hypercall register clobber list - * - * These macros are used to define the list of clobbered registers during a - * hypercall. 
Technically, registers r0 and r3-r12 are always clobbered, - * but the gcc inline assembly syntax does not allow us to specify registers - * on the clobber list that are also on the input/output list. Therefore, - * the lists of clobbered registers depends on the number of register - * parmeters ("+r" and "=r") passed to the hypercall. - * - * Each assembly block should use one of the HCALL_CLOBBERSx macros. As a - * general rule, 'x' is the number of parameters passed to the assembly - * block *except* for r11. - * - * If you're not sure, just use the smallest value of 'x' that does not - * generate a compilation error. Because these are static inline functions, - * the compiler will only check the clobber list for a function if you - * compile code that calls that function. - * - * r3 and r11 are not included in any clobbers list because they are always - * listed as output registers. - * - * XER, CTR, and LR are currently listed as clobbers because it's uncertain - * whether they will be clobbered. - * - * Note that r11 can be used as an output parameter. - * - * The "memory" clobber is only necessary for hcalls where the Hypervisor - * will read or write guest memory. However, we add it to all hcalls because - * the impact is minimal, and we want to ensure that it's present for the - * hcalls that need it. -*/ - -/* List of common clobbered registers. Do not use this macro. */ -#define EV_HCALL_CLOBBERS "r0", "r12", "xer", "ctr", "lr", "cc", "memory" - -#define EV_HCALL_CLOBBERS8 EV_HCALL_CLOBBERS -#define EV_HCALL_CLOBBERS7 EV_HCALL_CLOBBERS8, "r10" -#define EV_HCALL_CLOBBERS6 EV_HCALL_CLOBBERS7, "r9" -#define EV_HCALL_CLOBBERS5 EV_HCALL_CLOBBERS6, "r8" -#define EV_HCALL_CLOBBERS4 EV_HCALL_CLOBBERS5, "r7" -#define EV_HCALL_CLOBBERS3 EV_HCALL_CLOBBERS4, "r6" -#define EV_HCALL_CLOBBERS2 EV_HCALL_CLOBBERS3, "r5" -#define EV_HCALL_CLOBBERS1 EV_HCALL_CLOBBERS2, "r4" - -extern bool epapr_paravirt_enabled; -extern u32 epapr_hypercall_start[]; - -/* - * We use "uintptr_t" to define a register because it's guaranteed to be a - * 32-bit integer on a 32-bit platform, and a 64-bit integer on a 64-bit - * platform. - * - * All registers are either input/output or output only. Registers that are - * initialized before making the hypercall are input/output. All - * input/output registers are represented with "+r". Output-only registers - * are represented with "=r". Do not specify any unused registers. The - * clobber list will tell the compiler that the hypercall modifies those - * registers, which is good enough. - */ - -/** - * ev_int_set_config - configure the specified interrupt - * @interrupt: the interrupt number - * @config: configuration for this interrupt - * @priority: interrupt priority - * @destination: destination CPU number - * - * Returns 0 for success, or an error code. 
- */ -static inline unsigned int ev_int_set_config(unsigned int interrupt, - uint32_t config, unsigned int priority, uint32_t destination) -{ - register uintptr_t r11 __asm__("r11"); - register uintptr_t r3 __asm__("r3"); - register uintptr_t r4 __asm__("r4"); - register uintptr_t r5 __asm__("r5"); - register uintptr_t r6 __asm__("r6"); - - r11 = EV_HCALL_TOKEN(EV_INT_SET_CONFIG); - r3 = interrupt; - r4 = config; - r5 = priority; - r6 = destination; - - asm volatile("bl epapr_hypercall_start" - : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6) - : : EV_HCALL_CLOBBERS4 - ); - - return r3; -} - -/** - * ev_int_get_config - return the config of the specified interrupt - * @interrupt: the interrupt number - * @config: returned configuration for this interrupt - * @priority: returned interrupt priority - * @destination: returned destination CPU number - * - * Returns 0 for success, or an error code. - */ -static inline unsigned int ev_int_get_config(unsigned int interrupt, - uint32_t *config, unsigned int *priority, uint32_t *destination) -{ - register uintptr_t r11 __asm__("r11"); - register uintptr_t r3 __asm__("r3"); - register uintptr_t r4 __asm__("r4"); - register uintptr_t r5 __asm__("r5"); - register uintptr_t r6 __asm__("r6"); - - r11 = EV_HCALL_TOKEN(EV_INT_GET_CONFIG); - r3 = interrupt; - - asm volatile("bl epapr_hypercall_start" - : "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5), "=r" (r6) - : : EV_HCALL_CLOBBERS4 - ); - - *config = r4; - *priority = r5; - *destination = r6; - - return r3; -} - -/** - * ev_int_set_mask - sets the mask for the specified interrupt source - * @interrupt: the interrupt number - * @mask: 0=enable interrupts, 1=disable interrupts - * - * Returns 0 for success, or an error code. - */ -static inline unsigned int ev_int_set_mask(unsigned int interrupt, - unsigned int mask) -{ - register uintptr_t r11 __asm__("r11"); - register uintptr_t r3 __asm__("r3"); - register uintptr_t r4 __asm__("r4"); - - r11 = EV_HCALL_TOKEN(EV_INT_SET_MASK); - r3 = interrupt; - r4 = mask; - - asm volatile("bl epapr_hypercall_start" - : "+r" (r11), "+r" (r3), "+r" (r4) - : : EV_HCALL_CLOBBERS2 - ); - - return r3; -} - -/** - * ev_int_get_mask - returns the mask for the specified interrupt source - * @interrupt: the interrupt number - * @mask: returned mask for this interrupt (0=enabled, 1=disabled) - * - * Returns 0 for success, or an error code. - */ -static inline unsigned int ev_int_get_mask(unsigned int interrupt, - unsigned int *mask) -{ - register uintptr_t r11 __asm__("r11"); - register uintptr_t r3 __asm__("r3"); - register uintptr_t r4 __asm__("r4"); - - r11 = EV_HCALL_TOKEN(EV_INT_GET_MASK); - r3 = interrupt; - - asm volatile("bl epapr_hypercall_start" - : "+r" (r11), "+r" (r3), "=r" (r4) - : : EV_HCALL_CLOBBERS2 - ); - - *mask = r4; - - return r3; -} - -/** - * ev_int_eoi - signal the end of interrupt processing - * @interrupt: the interrupt number - * - * This function signals the end of processing for the the specified - * interrupt, which must be the interrupt currently in service. By - * definition, this is also the highest-priority interrupt. - * - * Returns 0 for success, or an error code. 
- */ -static inline unsigned int ev_int_eoi(unsigned int interrupt) -{ - register uintptr_t r11 __asm__("r11"); - register uintptr_t r3 __asm__("r3"); - - r11 = EV_HCALL_TOKEN(EV_INT_EOI); - r3 = interrupt; - - asm volatile("bl epapr_hypercall_start" - : "+r" (r11), "+r" (r3) - : : EV_HCALL_CLOBBERS1 - ); - - return r3; -} - -/** - * ev_byte_channel_send - send characters to a byte stream - * @handle: byte stream handle - * @count: (input) num of chars to send, (output) num chars sent - * @buffer: pointer to a 16-byte buffer - * - * @buffer must be at least 16 bytes long, because all 16 bytes will be - * read from memory into registers, even if count < 16. - * - * Returns 0 for success, or an error code. - */ -static inline unsigned int ev_byte_channel_send(unsigned int handle, - unsigned int *count, const char buffer[EV_BYTE_CHANNEL_MAX_BYTES]) -{ - register uintptr_t r11 __asm__("r11"); - register uintptr_t r3 __asm__("r3"); - register uintptr_t r4 __asm__("r4"); - register uintptr_t r5 __asm__("r5"); - register uintptr_t r6 __asm__("r6"); - register uintptr_t r7 __asm__("r7"); - register uintptr_t r8 __asm__("r8"); - const uint32_t *p = (const uint32_t *) buffer; - - r11 = EV_HCALL_TOKEN(EV_BYTE_CHANNEL_SEND); - r3 = handle; - r4 = *count; - r5 = be32_to_cpu(p[0]); - r6 = be32_to_cpu(p[1]); - r7 = be32_to_cpu(p[2]); - r8 = be32_to_cpu(p[3]); - - asm volatile("bl epapr_hypercall_start" - : "+r" (r11), "+r" (r3), - "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7), "+r" (r8) - : : EV_HCALL_CLOBBERS6 - ); - - *count = r4; - - return r3; -} - -/** - * ev_byte_channel_receive - fetch characters from a byte channel - * @handle: byte channel handle - * @count: (input) max num of chars to receive, (output) num chars received - * @buffer: pointer to a 16-byte buffer - * - * The size of @buffer must be at least 16 bytes, even if you request fewer - * than 16 characters, because we always write 16 bytes to @buffer. This is - * for performance reasons. - * - * Returns 0 for success, or an error code. - */ -static inline unsigned int ev_byte_channel_receive(unsigned int handle, - unsigned int *count, char buffer[EV_BYTE_CHANNEL_MAX_BYTES]) -{ - register uintptr_t r11 __asm__("r11"); - register uintptr_t r3 __asm__("r3"); - register uintptr_t r4 __asm__("r4"); - register uintptr_t r5 __asm__("r5"); - register uintptr_t r6 __asm__("r6"); - register uintptr_t r7 __asm__("r7"); - register uintptr_t r8 __asm__("r8"); - uint32_t *p = (uint32_t *) buffer; - - r11 = EV_HCALL_TOKEN(EV_BYTE_CHANNEL_RECEIVE); - r3 = handle; - r4 = *count; - - asm volatile("bl epapr_hypercall_start" - : "+r" (r11), "+r" (r3), "+r" (r4), - "=r" (r5), "=r" (r6), "=r" (r7), "=r" (r8) - : : EV_HCALL_CLOBBERS6 - ); - - *count = r4; - p[0] = cpu_to_be32(r5); - p[1] = cpu_to_be32(r6); - p[2] = cpu_to_be32(r7); - p[3] = cpu_to_be32(r8); - - return r3; -} - -/** - * ev_byte_channel_poll - returns the status of the byte channel buffers - * @handle: byte channel handle - * @rx_count: returned count of bytes in receive queue - * @tx_count: returned count of free space in transmit queue - * - * This function reports the amount of data in the receive queue (i.e. the - * number of bytes you can read), and the amount of free space in the transmit - * queue (i.e. the number of bytes you can write). - * - * Returns 0 for success, or an error code. 
- */ -static inline unsigned int ev_byte_channel_poll(unsigned int handle, - unsigned int *rx_count, unsigned int *tx_count) -{ - register uintptr_t r11 __asm__("r11"); - register uintptr_t r3 __asm__("r3"); - register uintptr_t r4 __asm__("r4"); - register uintptr_t r5 __asm__("r5"); - - r11 = EV_HCALL_TOKEN(EV_BYTE_CHANNEL_POLL); - r3 = handle; - - asm volatile("bl epapr_hypercall_start" - : "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5) - : : EV_HCALL_CLOBBERS3 - ); - - *rx_count = r4; - *tx_count = r5; - - return r3; -} - -/** - * ev_int_iack - acknowledge an interrupt - * @handle: handle to the target interrupt controller - * @vector: returned interrupt vector - * - * If handle is zero, the function returns the next interrupt source - * number to be handled irrespective of the hierarchy or cascading - * of interrupt controllers. If non-zero, specifies a handle to the - * interrupt controller that is the target of the acknowledge. - * - * Returns 0 for success, or an error code. - */ -static inline unsigned int ev_int_iack(unsigned int handle, - unsigned int *vector) -{ - register uintptr_t r11 __asm__("r11"); - register uintptr_t r3 __asm__("r3"); - register uintptr_t r4 __asm__("r4"); - - r11 = EV_HCALL_TOKEN(EV_INT_IACK); - r3 = handle; - - asm volatile("bl epapr_hypercall_start" - : "+r" (r11), "+r" (r3), "=r" (r4) - : : EV_HCALL_CLOBBERS2 - ); - - *vector = r4; - - return r3; -} - -/** - * ev_doorbell_send - send a doorbell to another partition - * @handle: doorbell send handle - * - * Returns 0 for success, or an error code. - */ -static inline unsigned int ev_doorbell_send(unsigned int handle) -{ - register uintptr_t r11 __asm__("r11"); - register uintptr_t r3 __asm__("r3"); - - r11 = EV_HCALL_TOKEN(EV_DOORBELL_SEND); - r3 = handle; - - asm volatile("bl epapr_hypercall_start" - : "+r" (r11), "+r" (r3) - : : EV_HCALL_CLOBBERS1 - ); - - return r3; -} - -/** - * ev_idle -- wait for next interrupt on this core - * - * Returns 0 for success, or an error code. 
- */ -static inline unsigned int ev_idle(void) -{ - register uintptr_t r11 __asm__("r11"); - register uintptr_t r3 __asm__("r3"); - - r11 = EV_HCALL_TOKEN(EV_IDLE); - - asm volatile("bl epapr_hypercall_start" - : "+r" (r11), "=r" (r3) - : : EV_HCALL_CLOBBERS1 - ); - - return r3; -} -#endif /* !__ASSEMBLY__ */ -#endif +#ifndef _UAPI_ASM_POWERPC_EPAPR_HCALLS_H +#define _UAPI_ASM_POWERPC_EPAPR_HCALLS_H + +#define EV_BYTE_CHANNEL_SEND 1 +#define EV_BYTE_CHANNEL_RECEIVE 2 +#define EV_BYTE_CHANNEL_POLL 3 +#define EV_INT_SET_CONFIG 4 +#define EV_INT_GET_CONFIG 5 +#define EV_INT_SET_MASK 6 +#define EV_INT_GET_MASK 7 +#define EV_INT_IACK 9 +#define EV_INT_EOI 10 +#define EV_INT_SEND_IPI 11 +#define EV_INT_SET_TASK_PRIORITY 12 +#define EV_INT_GET_TASK_PRIORITY 13 +#define EV_DOORBELL_SEND 14 +#define EV_MSGSND 15 +#define EV_IDLE 16 + +/* vendor ID: epapr */ +#define EV_LOCAL_VENDOR_ID 0 /* for private use */ +#define EV_EPAPR_VENDOR_ID 1 +#define EV_FSL_VENDOR_ID 2 /* Freescale Semiconductor */ +#define EV_IBM_VENDOR_ID 3 /* IBM */ +#define EV_GHS_VENDOR_ID 4 /* Green Hills Software */ +#define EV_ENEA_VENDOR_ID 5 /* Enea */ +#define EV_WR_VENDOR_ID 6 /* Wind River Systems */ +#define EV_AMCC_VENDOR_ID 7 /* Applied Micro Circuits */ +#define EV_KVM_VENDOR_ID 42 /* KVM */ + +/* The max number of bytes that a byte channel can send or receive per call */ +#define EV_BYTE_CHANNEL_MAX_BYTES 16 + + +#define _EV_HCALL_TOKEN(id, num) (((id) << 16) | (num)) +#define EV_HCALL_TOKEN(hcall_num) _EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, hcall_num) + +/* epapr return codes */ +#define EV_SUCCESS 0 +#define EV_EPERM 1 /* Operation not permitted */ +#define EV_ENOENT 2 /* Entry Not Found */ +#define EV_EIO 3 /* I/O error occured */ +#define EV_EAGAIN 4 /* The operation had insufficient + * resources to complete and should be + * retried + */ +#define EV_ENOMEM 5 /* There was insufficient memory to + * complete the operation */ +#define EV_EFAULT 6 /* Bad guest address */ +#define EV_ENODEV 7 /* No such device */ +#define EV_EINVAL 8 /* An argument supplied to the hcall + was out of range or invalid */ +#define EV_INTERNAL 9 /* An internal error occured */ +#define EV_CONFIG 10 /* A configuration error was detected */ +#define EV_INVALID_STATE 11 /* The object is in an invalid state */ +#define EV_UNIMPLEMENTED 12 /* Unimplemented hypercall */ +#define EV_BUFFER_OVERFLOW 13 /* Caller-supplied buffer too small */ + +#endif /* _UAPI_ASM_POWERPC_EPAPR_HCALLS_H */ -- cgit v0.10.2 From 7de609c86722fb1cbf7132889691cfcccfa0ac12 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 29 Oct 2012 19:08:21 +0100 Subject: KVM: SVM: update MAINTAINERS entry I have no access to my AMD email address anymore. Update entry in MAINTAINERS to the new address. Cc: Avi Kivity Cc: Marcelo Tosatti Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti diff --git a/MAINTAINERS b/MAINTAINERS index 1fa9074..4376c52 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4242,10 +4242,10 @@ F: include/linux/kvm* F: virt/kvm/ KERNEL VIRTUAL MACHINE (KVM) FOR AMD-V -M: Joerg Roedel +M: Joerg Roedel L: kvm@vger.kernel.org W: http://kvm.qumranet.com -S: Supported +S: Maintained F: arch/x86/include/asm/svm.h F: arch/x86/kvm/svm.c -- cgit v0.10.2 From 18595411a7146330ec19adf0b9db8e6736c84a4e Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Fri, 2 Nov 2012 18:33:21 +0800 Subject: KVM: do not kfree error pointer We should avoid kfree()ing error pointer in kvm_vcpu_ioctl() and kvm_arch_vcpu_ioctl(). 
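The rule being enforced: memdup_user() returns an ERR_PTR() value on failure, never NULL, and passing such a pointer to kfree() is a bug. A minimal sketch of the two safe idioms used below (use_buffer() is a stand-in for the real handler):

	buf = memdup_user(argp, size);
	if (IS_ERR(buf))
		return PTR_ERR(buf);	/* nothing was allocated, just bail */

	r = use_buffer(vcpu, buf);
	kfree(buf);			/* safe: buf is a real allocation */

	/* or, where a shared 'out:' label does the freeing: */
	kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
	if (IS_ERR(kvm_sregs)) {
		r = PTR_ERR(kvm_sregs);
		kvm_sregs = NULL;	/* so the later kfree() is a no-op */
		goto out;
	}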
Signed-off-by: Guo Chao Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 49fa1f0..40905ce 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2689,14 +2689,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_LAPIC: { - r = -EINVAL; if (!vcpu->arch.apic) goto out; u.lapic = memdup_user(argp, sizeof(*u.lapic)); - if (IS_ERR(u.lapic)) { - r = PTR_ERR(u.lapic); - goto out; - } + if (IS_ERR(u.lapic)) + return PTR_ERR(u.lapic); r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); if (r) @@ -2877,10 +2874,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } case KVM_SET_XSAVE: { u.xsave = memdup_user(argp, sizeof(*u.xsave)); - if (IS_ERR(u.xsave)) { - r = PTR_ERR(u.xsave); - goto out; - } + if (IS_ERR(u.xsave)) + return PTR_ERR(u.xsave); r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; @@ -2902,10 +2897,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } case KVM_SET_XCRS: { u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); - if (IS_ERR(u.xcrs)) { - r = PTR_ERR(u.xcrs); - goto out; - } + if (IS_ERR(u.xcrs)) + return PTR_ERR(u.xcrs); r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); break; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2fb7319..316d76a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1954,6 +1954,7 @@ out_free2: kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); if (IS_ERR(kvm_sregs)) { r = PTR_ERR(kvm_sregs); + kvm_sregs = NULL; goto out; } r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); @@ -2054,6 +2055,7 @@ out_free2: fpu = memdup_user(argp, sizeof(*fpu)); if (IS_ERR(fpu)) { r = PTR_ERR(fpu); + fpu = NULL; goto out; } r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); -- cgit v0.10.2 From 951179ce86f5599e2dfb9de254056e91bd865f15 Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Fri, 2 Nov 2012 18:33:22 +0800 Subject: KVM: x86: fix return value of kvm_vm_ioctl_set_tss_addr() Return value of this function will be that of ioctl(). #include #include int main () { int fd; fd = open ("/dev/kvm", 0); fd = ioctl (fd, KVM_CREATE_VM, 0); ioctl (fd, KVM_SET_TSS_ADDR, 0xfffff000); perror (""); return 0; } Output is "Operation not permitted". That's not what we want. Return -EINVAL in this case. Signed-off-by: Guo Chao Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 40905ce..a25b3df 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2946,7 +2946,7 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) int ret; if (addr > (unsigned int)(-3 * PAGE_SIZE)) - return -1; + return -EINVAL; ret = kvm_x86_ops->set_tss_addr(kvm, addr); return ret; } -- cgit v0.10.2 From 807f12e57c9783458b8c07f63eff3c3e1df8ab5d Mon Sep 17 00:00:00 2001 From: Guo Chao Date: Fri, 2 Nov 2012 18:33:23 +0800 Subject: KVM: remove unnecessary return value check No need to check return value before breaking switch. 
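Every hunk below removes the same shape of dead code: the handler's return value is already in r, and the goto and the break end up in the same place, so the extra check and the r = 0 store change nothing. Schematically:

	r = kvm_vcpu_ioctl_nmi(vcpu);
	if (r)
		goto out;	/* redundant: 'break' reaches 'out' as well */
	r = 0;			/* dead store: r can only be 0 here */
	break;

After the patch only the call and the break remain, and r propagates whatever the handler returned.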
Signed-off-by: Guo Chao Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a25b3df..c31f75d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2696,9 +2696,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, return PTR_ERR(u.lapic); r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); - if (r) - goto out; - r = 0; break; } case KVM_INTERRUPT: { @@ -2708,16 +2705,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&irq, argp, sizeof irq)) goto out; r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); - if (r) - goto out; - r = 0; break; } case KVM_NMI: { r = kvm_vcpu_ioctl_nmi(vcpu); - if (r) - goto out; - r = 0; break; } case KVM_SET_CPUID: { @@ -2728,8 +2719,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); - if (r) - goto out; break; } case KVM_SET_CPUID2: { @@ -2741,8 +2730,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, goto out; r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, cpuid_arg->entries); - if (r) - goto out; break; } case KVM_GET_CPUID2: { @@ -3207,8 +3194,6 @@ long kvm_arch_vm_ioctl(struct file *filp, switch (ioctl) { case KVM_SET_TSS_ADDR: r = kvm_vm_ioctl_set_tss_addr(kvm, arg); - if (r < 0) - goto out; break; case KVM_SET_IDENTITY_MAP_ADDR: { u64 ident_addr; @@ -3217,14 +3202,10 @@ long kvm_arch_vm_ioctl(struct file *filp, if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) goto out; r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); - if (r < 0) - goto out; break; } case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); - if (r) - goto out; break; case KVM_GET_NR_MMU_PAGES: r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); @@ -3315,8 +3296,6 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; get_irqchip_out: kfree(chip); - if (r) - goto out; break; } case KVM_SET_IRQCHIP: { @@ -3338,8 +3317,6 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; set_irqchip_out: kfree(chip); - if (r) - goto out; break; } case KVM_GET_PIT: { @@ -3366,9 +3343,6 @@ long kvm_arch_vm_ioctl(struct file *filp, if (!kvm->arch.vpit) goto out; r = kvm_vm_ioctl_set_pit(kvm, &u.ps); - if (r) - goto out; - r = 0; break; } case KVM_GET_PIT2: { @@ -3392,9 +3366,6 @@ long kvm_arch_vm_ioctl(struct file *filp, if (!kvm->arch.vpit) goto out; r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); - if (r) - goto out; - r = 0; break; } case KVM_REINJECT_CONTROL: { @@ -3403,9 +3374,6 @@ long kvm_arch_vm_ioctl(struct file *filp, if (copy_from_user(&control, argp, sizeof(control))) goto out; r = kvm_vm_ioctl_reinject(kvm, &control); - if (r) - goto out; - r = 0; break; } case KVM_XEN_HVM_CONFIG: { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 316d76a..e3f5b14 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1929,10 +1929,6 @@ out_free1: goto out; } r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); - if (r) - goto out_free2; - r = 0; -out_free2: kfree(kvm_regs); break; } @@ -1958,9 +1954,6 @@ out_free2: goto out; } r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); - if (r) - goto out; - r = 0; break; } case KVM_GET_MP_STATE: { @@ -1982,9 +1975,6 @@ out_free2: if (copy_from_user(&mp_state, argp, sizeof mp_state)) goto out; r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); - if (r) - goto out; - r = 0; break; } case KVM_TRANSLATE: { @@ -2009,9 +1999,6 @@ out_free2: if (copy_from_user(&dbg, argp, sizeof dbg)) goto out; r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); - if (r) - goto out; - r = 0; break; } case 
KVM_SET_SIGNAL_MASK: { @@ -2059,9 +2046,6 @@ out_free2: goto out; } r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); - if (r) - goto out; - r = 0; break; } default: @@ -2131,8 +2115,6 @@ static long kvm_vm_ioctl(struct file *filp, switch (ioctl) { case KVM_CREATE_VCPU: r = kvm_vm_ioctl_create_vcpu(kvm, arg); - if (r < 0) - goto out; break; case KVM_SET_USER_MEMORY_REGION: { struct kvm_userspace_memory_region kvm_userspace_mem; @@ -2143,8 +2125,6 @@ static long kvm_vm_ioctl(struct file *filp, goto out; r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); - if (r) - goto out; break; } case KVM_GET_DIRTY_LOG: { @@ -2154,8 +2134,6 @@ static long kvm_vm_ioctl(struct file *filp, if (copy_from_user(&log, argp, sizeof log)) goto out; r = kvm_vm_ioctl_get_dirty_log(kvm, &log); - if (r) - goto out; break; } #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET @@ -2165,9 +2143,6 @@ static long kvm_vm_ioctl(struct file *filp, if (copy_from_user(&zone, argp, sizeof zone)) goto out; r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); - if (r) - goto out; - r = 0; break; } case KVM_UNREGISTER_COALESCED_MMIO: { @@ -2176,9 +2151,6 @@ static long kvm_vm_ioctl(struct file *filp, if (copy_from_user(&zone, argp, sizeof zone)) goto out; r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); - if (r) - goto out; - r = 0; break; } #endif @@ -2287,8 +2259,6 @@ static long kvm_vm_compat_ioctl(struct file *filp, log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); r = kvm_vm_ioctl_get_dirty_log(kvm, &log); - if (r) - goto out; break; } default: -- cgit v0.10.2 From 78c0337a38450f809113dd46fe038874b93909f1 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Nov 2012 23:28:47 -0200 Subject: KVM: x86: retain pvclock guest stopped bit in guest memory Otherwise its possible for an unrelated KVM_REQ_UPDATE_CLOCK (such as due to CPU migration) to clear the bit. Noticed by Paolo Bonzini. 
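Condensed, the fix seeds the flag word from the guest-visible copy instead of starting from zero, so a PVCLOCK_GUEST_STOPPED bit that was already published to the guest is not silently dropped by an unrelated clock update (this mirrors the hunk below):

	/* re-read the guest copy; keep GUEST_STOPPED if it is already set */
	pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);

	if (vcpu->pvclock_set_guest_stopped_request) {
		pvclock_flags |= PVCLOCK_GUEST_STOPPED;
		vcpu->pvclock_set_guest_stopped_request = false;
	}

	vcpu->hv_clock.flags = pvclock_flags;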
Reviewed-by: Gleb Natapov Reviewed-by: Glauber Costa Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c31f75d..1dfe9d3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1145,6 +1145,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) unsigned long this_tsc_khz; s64 kernel_ns, max_kernel_ns; u64 tsc_timestamp; + struct pvclock_vcpu_time_info *guest_hv_clock; u8 pvclock_flags; /* Keep irq disabled to prevent changes to the clock */ @@ -1228,13 +1229,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->last_kernel_ns = kernel_ns; vcpu->last_guest_tsc = tsc_timestamp; - pvclock_flags = 0; - if (vcpu->pvclock_set_guest_stopped_request) { - pvclock_flags |= PVCLOCK_GUEST_STOPPED; - vcpu->pvclock_set_guest_stopped_request = false; - } - - vcpu->hv_clock.flags = pvclock_flags; /* * The interface expects us to write an even number signaling that the @@ -1245,6 +1239,18 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) shared_kaddr = kmap_atomic(vcpu->time_page); + guest_hv_clock = shared_kaddr + vcpu->time_offset; + + /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ + pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); + + if (vcpu->pvclock_set_guest_stopped_request) { + pvclock_flags |= PVCLOCK_GUEST_STOPPED; + vcpu->pvclock_set_guest_stopped_request = false; + } + + vcpu->hv_clock.flags = pvclock_flags; + memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, sizeof(vcpu->hv_clock)); -- cgit v0.10.2 From 7069ed67635b8d574541af426d752cd7fbd465a6 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Nov 2012 23:28:48 -0200 Subject: x86: kvmclock: allocate pvclock shared memory area We want to expose the pvclock shared memory areas, which the hypervisor periodically updates, to userspace. For a linear mapping from userspace, it is necessary that entire page sized regions are used for array of pvclock structures. There is no such guarantee with per cpu areas, therefore move to memblock_alloc based allocation. 
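Condensed, the allocation moves from a per-cpu variable to one page-aligned array obtained from memblock and indexed by CPU (matching the hunks below; error handling trimmed):

	unsigned long mem;

	mem = memblock_alloc(sizeof(struct pvclock_aligned_vcpu_time_info) *
			     NR_CPUS, PAGE_SIZE);
	if (!mem)
		return;
	hv_clock = __va(mem);	/* flat, page-aligned array, one slot per CPU */

	/* readers index by CPU (with preemption disabled) */
	src = &hv_clock[smp_processor_id()].clock;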
Acked-by: Glauber Costa Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index f1b42b3..c7d7567 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -39,7 +40,11 @@ static int parse_no_kvmclock(char *arg) early_param("no-kvmclock", parse_no_kvmclock); /* The hypervisor will put information about time periodically here */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock); +struct pvclock_aligned_vcpu_time_info { + struct pvclock_vcpu_time_info clock; +} __attribute__((__aligned__(SMP_CACHE_BYTES))); + +static struct pvclock_aligned_vcpu_time_info *hv_clock; static struct pvclock_wall_clock wall_clock; /* @@ -52,15 +57,20 @@ static unsigned long kvm_get_wallclock(void) struct pvclock_vcpu_time_info *vcpu_time; struct timespec ts; int low, high; + int cpu; low = (int)__pa_symbol(&wall_clock); high = ((u64)__pa_symbol(&wall_clock) >> 32); native_write_msr(msr_kvm_wall_clock, low, high); - vcpu_time = &get_cpu_var(hv_clock); + preempt_disable(); + cpu = smp_processor_id(); + + vcpu_time = &hv_clock[cpu].clock; pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); - put_cpu_var(hv_clock); + + preempt_enable(); return ts.tv_sec; } @@ -74,9 +84,11 @@ static cycle_t kvm_clock_read(void) { struct pvclock_vcpu_time_info *src; cycle_t ret; + int cpu; preempt_disable_notrace(); - src = &__get_cpu_var(hv_clock); + cpu = smp_processor_id(); + src = &hv_clock[cpu].clock; ret = pvclock_clocksource_read(src); preempt_enable_notrace(); return ret; @@ -99,8 +111,15 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs) static unsigned long kvm_get_tsc_khz(void) { struct pvclock_vcpu_time_info *src; - src = &per_cpu(hv_clock, 0); - return pvclock_tsc_khz(src); + int cpu; + unsigned long tsc_khz; + + preempt_disable(); + cpu = smp_processor_id(); + src = &hv_clock[cpu].clock; + tsc_khz = pvclock_tsc_khz(src); + preempt_enable(); + return tsc_khz; } static void kvm_get_preset_lpj(void) @@ -119,10 +138,14 @@ bool kvm_check_and_clear_guest_paused(void) { bool ret = false; struct pvclock_vcpu_time_info *src; + int cpu = smp_processor_id(); + + if (!hv_clock) + return ret; - src = &__get_cpu_var(hv_clock); + src = &hv_clock[cpu].clock; if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { - __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED); + src->flags &= ~PVCLOCK_GUEST_STOPPED; ret = true; } @@ -141,9 +164,10 @@ int kvm_register_clock(char *txt) { int cpu = smp_processor_id(); int low, high, ret; + struct pvclock_vcpu_time_info *src = &hv_clock[cpu].clock; - low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; - high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); + low = (int)__pa(src) | 1; + high = ((u64)__pa(src) >> 32); ret = native_write_msr_safe(msr_kvm_system_time, low, high); printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", cpu, high, low, txt); @@ -197,6 +221,8 @@ static void kvm_shutdown(void) void __init kvmclock_init(void) { + unsigned long mem; + if (!kvm_para_available()) return; @@ -209,8 +235,18 @@ void __init kvmclock_init(void) printk(KERN_INFO "kvm-clock: Using msrs %x and %x", msr_kvm_system_time, msr_kvm_wall_clock); - if (kvm_register_clock("boot clock")) + mem = memblock_alloc(sizeof(struct pvclock_aligned_vcpu_time_info) * NR_CPUS, + PAGE_SIZE); + if (!mem) + return; + hv_clock = __va(mem); + + if (kvm_register_clock("boot clock")) { + hv_clock = NULL; + memblock_free(mem, + sizeof(struct 
pvclock_aligned_vcpu_time_info)*NR_CPUS); return; + } pv_time_ops.sched_clock = kvm_clock_read; x86_platform.calibrate_tsc = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; -- cgit v0.10.2 From b01578de45614c085be319229a52774b61ffe6fb Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Nov 2012 23:28:49 -0200 Subject: x86: pvclock: make sure rdtsc doesnt speculate out of region Originally from Jeremy Fitzhardinge. pvclock_get_time_values, which contains the memory barriers will be removed by next patch. Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 42eb330..12e47e2 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -97,10 +97,10 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) do { version = pvclock_get_time_values(&shadow, src); - barrier(); + rdtsc_barrier(); offset = pvclock_get_nsec_offset(&shadow); ret = shadow.system_timestamp + offset; - barrier(); + rdtsc_barrier(); } while (version != src->version); if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && -- cgit v0.10.2 From 42b5637d6989f75a7a0e8ea8633583564ff443ff Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Nov 2012 23:28:50 -0200 Subject: x86: pvclock: remove pvclock_shadow_time Originally from Jeremy Fitzhardinge. We can copy the information directly from "struct pvclock_vcpu_time_info", remove pvclock_shadow_time. Reviewed-by: Glauber Costa Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 12e47e2..d59e606 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -19,21 +19,6 @@ #include #include -/* - * These are perodically updated - * xen: magic shared_info page - * kvm: gpa registered via msr - * and then copied here. - */ -struct pvclock_shadow_time { - u64 tsc_timestamp; /* TSC at last update of time vals. */ - u64 system_timestamp; /* Time, in nanosecs, since boot. */ - u32 tsc_to_nsec_mul; - int tsc_shift; - u32 version; - u8 flags; -}; - static u8 valid_flags __read_mostly = 0; void pvclock_set_flags(u8 flags) @@ -41,32 +26,11 @@ void pvclock_set_flags(u8 flags) valid_flags = flags; } -static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) -{ - u64 delta = native_read_tsc() - shadow->tsc_timestamp; - return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul, - shadow->tsc_shift); -} - -/* - * Reads a consistent set of time-base values from hypervisor, - * into a shadow data area. 
- */ -static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, - struct pvclock_vcpu_time_info *src) +static u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src) { - do { - dst->version = src->version; - rmb(); /* fetch version before data */ - dst->tsc_timestamp = src->tsc_timestamp; - dst->system_timestamp = src->system_time; - dst->tsc_to_nsec_mul = src->tsc_to_system_mul; - dst->tsc_shift = src->tsc_shift; - dst->flags = src->flags; - rmb(); /* test version after fetching data */ - } while ((src->version & 1) || (dst->version != src->version)); - - return dst->version; + u64 delta = native_read_tsc() - src->tsc_timestamp; + return pvclock_scale_delta(delta, src->tsc_to_system_mul, + src->tsc_shift); } unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) @@ -90,21 +54,22 @@ void pvclock_resume(void) cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) { - struct pvclock_shadow_time shadow; unsigned version; cycle_t ret, offset; u64 last; + u8 flags; do { - version = pvclock_get_time_values(&shadow, src); + version = src->version; rdtsc_barrier(); - offset = pvclock_get_nsec_offset(&shadow); - ret = shadow.system_timestamp + offset; + offset = pvclock_get_nsec_offset(src); + ret = src->system_time + offset; + flags = src->flags; rdtsc_barrier(); - } while (version != src->version); + } while ((src->version & 1) || version != src->version); if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && - (shadow.flags & PVCLOCK_TSC_STABLE_BIT)) + (flags & PVCLOCK_TSC_STABLE_BIT)) return ret; /* -- cgit v0.10.2 From dce2db0a35197e2f4421245079f19ab753e97348 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Nov 2012 23:28:51 -0200 Subject: x86: pvclock: create helper for pvclock data retrieval Originally from Jeremy Fitzhardinge. So code can be reused. 
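The new __pvclock_read_cycles() helper returns the version it sampled along with the cycle count and flags, so callers keep only the retry loop. After this patch pvclock_clocksource_read() reduces to:

	unsigned version;
	cycle_t ret;
	u8 flags;

	do {
		version = __pvclock_read_cycles(src, &ret, &flags);
	} while ((src->version & 1) || version != src->version);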
Acked-by: Glauber Costa Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index c59cc97..47e25d4 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -56,4 +56,32 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) return product; } +static __always_inline +u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src) +{ + u64 delta = __native_read_tsc() - src->tsc_timestamp; + return pvclock_scale_delta(delta, src->tsc_to_system_mul, + src->tsc_shift); +} + +static __always_inline +unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, + cycle_t *cycles, u8 *flags) +{ + unsigned version; + cycle_t ret, offset; + u8 ret_flags; + + version = src->version; + rdtsc_barrier(); + offset = pvclock_get_nsec_offset(src); + ret = src->system_time + offset; + ret_flags = src->flags; + rdtsc_barrier(); + + *cycles = ret; + *flags = ret_flags; + return version; +} + #endif /* _ASM_X86_PVCLOCK_H */ diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index d59e606..a7d9091 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -26,13 +26,6 @@ void pvclock_set_flags(u8 flags) valid_flags = flags; } -static u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src) -{ - u64 delta = native_read_tsc() - src->tsc_timestamp; - return pvclock_scale_delta(delta, src->tsc_to_system_mul, - src->tsc_shift); -} - unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) { u64 pv_tsc_khz = 1000000ULL << 32; @@ -55,17 +48,12 @@ void pvclock_resume(void) cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) { unsigned version; - cycle_t ret, offset; + cycle_t ret; u64 last; u8 flags; do { - version = src->version; - rdtsc_barrier(); - offset = pvclock_get_nsec_offset(src); - ret = src->system_time + offset; - flags = src->flags; - rdtsc_barrier(); + version = __pvclock_read_cycles(src, &ret, &flags); } while ((src->version & 1) || version != src->version); if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && -- cgit v0.10.2 From 2697902be89d7f38e9736dfe946cd0e5d4f47b80 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Nov 2012 23:28:52 -0200 Subject: x86: pvclock: introduce helper to read flags Acked-by: Glauber Costa Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 47e25d4..ea27a8d 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -6,6 +6,7 @@ /* some helper functions for xen and kvm pv clock sources */ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); +u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src); void pvclock_set_flags(u8 flags); unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); void pvclock_read_wallclock(struct pvclock_wall_clock *wall, diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index a7d9091..c8fb043 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -45,6 +45,19 @@ void pvclock_resume(void) atomic64_set(&last_value, 0); } +u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src) +{ + unsigned version; + cycle_t ret; + u8 flags; + + do { + version = __pvclock_read_cycles(src, &ret, &flags); + } while ((src->version & 1) || version != src->version); + + return flags & valid_flags; +} + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) { unsigned version; -- cgit v0.10.2 From 
189e11731aa858597095fbe1e6d243bad26bd96b Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Nov 2012 23:28:53 -0200 Subject: x86: pvclock: add note about rdtsc barriers As noted by Gleb, not advertising SSE2 support implies no RDTSC barriers. Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index ea27a8d..63f9167 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -74,6 +74,12 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, u8 ret_flags; version = src->version; + /* Note: emulated platforms which do not advertise SSE2 support + * result in kvmclock not using the necessary RDTSC barriers. + * Without barriers, it is possible that RDTSC instruction reads from + * the time stamp counter outside rdtsc_barrier protected section + * below, resulting in violation of monotonicity. + */ rdtsc_barrier(); offset = pvclock_get_nsec_offset(src); ret = src->system_time + offset; -- cgit v0.10.2 From 582b336ec2c0f0076f5650a029fcc9abd4a906f7 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Nov 2012 23:28:54 -0200 Subject: sched: add notifier for cross-cpu migrations Originally from Jeremy Fitzhardinge. Acked-by: Ingo Molnar Signed-off-by: Marcelo Tosatti diff --git a/include/linux/sched.h b/include/linux/sched.h index 0dd42a0..6eb2ed8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -107,6 +107,14 @@ extern unsigned long this_cpu_load(void); extern void calc_global_load(unsigned long ticks); extern void update_cpu_load_nohz(void); +/* Notifier for when a task gets migrated to a new CPU */ +struct task_migration_notifier { + struct task_struct *task; + int from_cpu; + int to_cpu; +}; +extern void register_task_migration_notifier(struct notifier_block *n); + extern unsigned long get_parent_ip(unsigned long addr); struct seq_file; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d8927f..c86b8b6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -922,6 +922,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) rq->skip_clock_update = 1; } +static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); + +void register_task_migration_notifier(struct notifier_block *n) +{ + atomic_notifier_chain_register(&task_migration_notifier, n); +} + #ifdef CONFIG_SMP void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { @@ -952,8 +959,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); if (task_cpu(p) != new_cpu) { + struct task_migration_notifier tmn; + p->se.nr_migrations++; perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); + + tmn.task = p; + tmn.from_cpu = task_cpu(p); + tmn.to_cpu = new_cpu; + + atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); } __set_task_cpu(p, new_cpu); -- cgit v0.10.2 From 71056ae22d43f58d7e0f793af18ace2eaf5b74eb Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Nov 2012 23:28:55 -0200 Subject: x86: pvclock: generic pvclock vsyscall initialization Originally from Jeremy Fitzhardinge. Introduce generic, non hypervisor specific, pvclock initialization routines. Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index 0bdbbb3..16a57f4 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -8,6 +8,7 @@ #define VCLOCK_NONE 0 /* No vDSO clock available. */ #define VCLOCK_TSC 1 /* vDSO should use vread_tsc. 
*/ #define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ +#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */ struct arch_clocksource_data { int vclock_mode; diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 4da3c0c..a09c285 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -19,6 +19,7 @@ #include #include #include +#include #ifdef CONFIG_X86_32 #include #include @@ -81,6 +82,10 @@ enum fixed_addresses { VVAR_PAGE, VSYSCALL_HPET, #endif +#ifdef CONFIG_PARAVIRT_CLOCK + PVCLOCK_FIXMAP_BEGIN, + PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1, +#endif FIX_DBGP_BASE, FIX_EARLYCON_MEM_BASE, #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 63f9167..109a9dd 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -91,4 +91,16 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, return version; } +struct pvclock_vsyscall_time_info { + struct pvclock_vcpu_time_info pvti; + u32 migrate_count; +} __attribute__((__aligned__(SMP_CACHE_BYTES))); + +#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) +#define PVCLOCK_VSYSCALL_NR_PAGES (((NR_CPUS-1)/(PAGE_SIZE/PVTI_SIZE))+1) + +int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, + int size); +struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu); + #endif /* _ASM_X86_PVCLOCK_H */ diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index c8fb043..85c3959 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -17,6 +17,11 @@ #include #include +#include +#include +#include +#include +#include #include static u8 valid_flags __read_mostly = 0; @@ -122,3 +127,71 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } + +static struct pvclock_vsyscall_time_info *pvclock_vdso_info; + +static struct pvclock_vsyscall_time_info * +pvclock_get_vsyscall_user_time_info(int cpu) +{ + if (!pvclock_vdso_info) { + BUG(); + return NULL; + } + + return &pvclock_vdso_info[cpu]; +} + +struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) +{ + return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; +} + +#ifdef CONFIG_X86_64 +static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, + void *v) +{ + struct task_migration_notifier *mn = v; + struct pvclock_vsyscall_time_info *pvti; + + pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); + + /* this is NULL when pvclock vsyscall is not initialized */ + if (unlikely(pvti == NULL)) + return NOTIFY_DONE; + + pvti->migrate_count++; + + return NOTIFY_DONE; +} + +static struct notifier_block pvclock_migrate = { + .notifier_call = pvclock_task_migrate, +}; + +/* + * Initialize the generic pvclock vsyscall state. 
This will allocate + * a/some page(s) for the per-vcpu pvclock information, set up a + * fixmap mapping for the page(s) + */ + +int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, + int size) +{ + int idx; + + WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); + + pvclock_vdso_info = i; + + for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { + __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, + __pa_symbol(i) + (idx*PAGE_SIZE), + PAGE_KERNEL_VVAR); + } + + + register_task_migration_notifier(&pvclock_migrate); + + return 0; +} +#endif -- cgit v0.10.2 From 3dc4f7cfb7441e5e0fed3a02fc81cdaabd28300a Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Nov 2012 23:28:56 -0200 Subject: x86: kvm guest: pvclock vsyscall support Hook into generic pvclock vsyscall code, with the aim to allow userspace to have visibility into pvclock data. Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/include/asm/kvm_guest.h b/arch/x86/include/asm/kvm_guest.h new file mode 100644 index 0000000..a92b176 --- /dev/null +++ b/arch/x86/include/asm/kvm_guest.h @@ -0,0 +1,6 @@ +#ifndef _ASM_X86_KVM_GUEST_H +#define _ASM_X86_KVM_GUEST_H + +int kvm_setup_vsyscall_timeinfo(void); + +#endif /* _ASM_X86_KVM_GUEST_H */ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 4180a87..a91c6b4 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -42,6 +42,7 @@ #include #include #include +#include static int kvmapf = 1; @@ -62,6 +63,15 @@ static int parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); +static int kvmclock_vsyscall = 1; +static int parse_no_kvmclock_vsyscall(char *arg) +{ + kvmclock_vsyscall = 0; + return 0; +} + +early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); + static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); static int has_steal_clock = 0; @@ -471,6 +481,9 @@ void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); + if (kvmclock_vsyscall) + kvm_setup_vsyscall_timeinfo(); + #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; register_cpu_notifier(&kvm_cpu_notifier); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index c7d7567..220a360 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -40,11 +40,7 @@ static int parse_no_kvmclock(char *arg) early_param("no-kvmclock", parse_no_kvmclock); /* The hypervisor will put information about time periodically here */ -struct pvclock_aligned_vcpu_time_info { - struct pvclock_vcpu_time_info clock; -} __attribute__((__aligned__(SMP_CACHE_BYTES))); - -static struct pvclock_aligned_vcpu_time_info *hv_clock; +static struct pvclock_vsyscall_time_info *hv_clock; static struct pvclock_wall_clock wall_clock; /* @@ -67,7 +63,7 @@ static unsigned long kvm_get_wallclock(void) preempt_disable(); cpu = smp_processor_id(); - vcpu_time = &hv_clock[cpu].clock; + vcpu_time = &hv_clock[cpu].pvti; pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); preempt_enable(); @@ -88,7 +84,7 @@ static cycle_t kvm_clock_read(void) preempt_disable_notrace(); cpu = smp_processor_id(); - src = &hv_clock[cpu].clock; + src = &hv_clock[cpu].pvti; ret = pvclock_clocksource_read(src); preempt_enable_notrace(); return ret; @@ -116,7 +112,7 @@ static unsigned long kvm_get_tsc_khz(void) preempt_disable(); cpu = smp_processor_id(); - src = &hv_clock[cpu].clock; + src = 
&hv_clock[cpu].pvti; tsc_khz = pvclock_tsc_khz(src); preempt_enable(); return tsc_khz; @@ -143,7 +139,7 @@ bool kvm_check_and_clear_guest_paused(void) if (!hv_clock) return ret; - src = &hv_clock[cpu].clock; + src = &hv_clock[cpu].pvti; if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { src->flags &= ~PVCLOCK_GUEST_STOPPED; ret = true; @@ -164,7 +160,7 @@ int kvm_register_clock(char *txt) { int cpu = smp_processor_id(); int low, high, ret; - struct pvclock_vcpu_time_info *src = &hv_clock[cpu].clock; + struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti; low = (int)__pa(src) | 1; high = ((u64)__pa(src) >> 32); @@ -235,7 +231,7 @@ void __init kvmclock_init(void) printk(KERN_INFO "kvm-clock: Using msrs %x and %x", msr_kvm_system_time, msr_kvm_wall_clock); - mem = memblock_alloc(sizeof(struct pvclock_aligned_vcpu_time_info) * NR_CPUS, + mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS, PAGE_SIZE); if (!mem) return; @@ -244,7 +240,7 @@ void __init kvmclock_init(void) if (kvm_register_clock("boot clock")) { hv_clock = NULL; memblock_free(mem, - sizeof(struct pvclock_aligned_vcpu_time_info)*NR_CPUS); + sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); return; } pv_time_ops.sched_clock = kvm_clock_read; @@ -269,3 +265,37 @@ void __init kvmclock_init(void) if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); } + +int __init kvm_setup_vsyscall_timeinfo(void) +{ +#ifdef CONFIG_X86_64 + int cpu; + int ret; + u8 flags; + struct pvclock_vcpu_time_info *vcpu_time; + unsigned int size; + + size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS; + + preempt_disable(); + cpu = smp_processor_id(); + + vcpu_time = &hv_clock[cpu].pvti; + flags = pvclock_read_flags(vcpu_time); + + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) { + preempt_enable(); + return 1; + } + + if ((ret = pvclock_init_vsyscall(hv_clock, size))) { + preempt_enable(); + return ret; + } + + preempt_enable(); + + kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; +#endif + return 0; +} -- cgit v0.10.2 From 51c19b4f5927f5a646e93d69f73c7e89ea14e737 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Nov 2012 23:28:57 -0200 Subject: x86: vdso: pvclock gettime support Improve performance of time system calls when using Linux pvclock, by reading time info from fixmap visible copy of pvclock data. Originally from Jeremy Fitzhardinge. 
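From a guest process the change is transparent: with a stable kvmclock, clock_gettime() is satisfied inside the vDSO by vread_pvclock() with no syscall. A rough way to exercise the path (illustrative test program, not part of the patch):

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* served from the vDSO when kvmclock reports PVCLOCK_TSC_STABLE_BIT */
	if (clock_gettime(CLOCK_MONOTONIC, &ts))
		return 1;
	printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
	return 0;
}

Under a libc that routes clock_gettime() through the vDSO, strace should show no clock_gettime syscalls on the fast path.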
Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index eaea1d3..80f8095 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -33,6 +33,26 @@ extern void map_vsyscall(void); */ extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); +#ifdef CONFIG_X86_64 + +#define VGETCPU_CPU_MASK 0xfff + +static inline unsigned int __getcpu(void) +{ + unsigned int p; + + if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { + /* Load per CPU data from RDTSCP */ + native_read_tscp(&p); + } else { + /* Load per CPU data from GDT */ + asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + } + + return p; +} +#endif /* CONFIG_X86_64 */ + #endif /* __KERNEL__ */ #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 4df6c37..205ad32 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -22,6 +22,7 @@ #include #include #include +#include #define gtod (&VVAR(vsyscall_gtod_data)) @@ -62,6 +63,76 @@ static notrace cycle_t vread_hpet(void) return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); } +#ifdef CONFIG_PARAVIRT_CLOCK + +static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu) +{ + const struct pvclock_vsyscall_time_info *pvti_base; + int idx = cpu / (PAGE_SIZE/PVTI_SIZE); + int offset = cpu % (PAGE_SIZE/PVTI_SIZE); + + BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END); + + pvti_base = (struct pvclock_vsyscall_time_info *) + __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx); + + return &pvti_base[offset]; +} + +static notrace cycle_t vread_pvclock(int *mode) +{ + const struct pvclock_vsyscall_time_info *pvti; + cycle_t ret; + u64 last; + u32 version; + u32 migrate_count; + u8 flags; + unsigned cpu, cpu1; + + + /* + * When looping to get a consistent (time-info, tsc) pair, we + * also need to deal with the possibility we can switch vcpus, + * so make sure we always re-fetch time-info for the current vcpu. + */ + do { + cpu = __getcpu() & VGETCPU_CPU_MASK; + /* TODO: We can put vcpu id into higher bits of pvti.version. + * This will save a couple of cycles by getting rid of + * __getcpu() calls (Gleb). + */ + + pvti = get_pvti(cpu); + + migrate_count = pvti->migrate_count; + + version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); + + /* + * Test we're still on the cpu as well as the version. + * We could have been migrated just after the first + * vgetcpu but before fetching the version, so we + * wouldn't notice a version change. 
+ */ + cpu1 = __getcpu() & VGETCPU_CPU_MASK; + } while (unlikely(cpu != cpu1 || + (pvti->pvti.version & 1) || + pvti->pvti.version != version || + pvti->migrate_count != migrate_count)); + + if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) + *mode = VCLOCK_NONE; + + /* refer to tsc.c read_tsc() comment for rationale */ + last = VVAR(vsyscall_gtod_data).clock.cycle_last; + + if (likely(ret >= last)) + return ret; + + return last; +} +#endif + notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; @@ -80,7 +151,7 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) } -notrace static inline u64 vgetsns(void) +notrace static inline u64 vgetsns(int *mode) { long v; cycles_t cycles; @@ -88,6 +159,10 @@ notrace static inline u64 vgetsns(void) cycles = vread_tsc(); else if (gtod->clock.vclock_mode == VCLOCK_HPET) cycles = vread_hpet(); +#ifdef CONFIG_PARAVIRT_CLOCK + else if (gtod->clock.vclock_mode == VCLOCK_PVCLOCK) + cycles = vread_pvclock(mode); +#endif else return 0; v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask; @@ -107,7 +182,7 @@ notrace static int __always_inline do_realtime(struct timespec *ts) mode = gtod->clock.vclock_mode; ts->tv_sec = gtod->wall_time_sec; ns = gtod->wall_time_snsec; - ns += vgetsns(); + ns += vgetsns(&mode); ns >>= gtod->clock.shift; } while (unlikely(read_seqcount_retry(>od->seq, seq))); @@ -127,7 +202,7 @@ notrace static int do_monotonic(struct timespec *ts) mode = gtod->clock.vclock_mode; ts->tv_sec = gtod->monotonic_time_sec; ns = gtod->monotonic_time_snsec; - ns += vgetsns(); + ns += vgetsns(&mode); ns >>= gtod->clock.shift; } while (unlikely(read_seqcount_retry(>od->seq, seq))); timespec_add_ns(ts, ns); diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c index 5463ad5..2f94b03 100644 --- a/arch/x86/vdso/vgetcpu.c +++ b/arch/x86/vdso/vgetcpu.c @@ -17,15 +17,10 @@ __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) { unsigned int p; - if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { - /* Load per CPU data from RDTSCP */ - native_read_tscp(&p); - } else { - /* Load per CPU data from GDT */ - asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); - } + p = __getcpu(); + if (cpu) - *cpu = p & 0xfff; + *cpu = p & VGETCPU_CPU_MASK; if (node) *node = p >> 12; return 0; -- cgit v0.10.2 From 886b470cb14733a0286e365c77f1844c240c33a4 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Nov 2012 23:28:58 -0200 Subject: KVM: x86: pass host_tsc to read_l1_tsc Allow the caller to pass host tsc value to kvm_x86_ops->read_l1_tsc(). 
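Schematically, the callback gains a host_tsc parameter so the caller decides when the TSC is sampled (mirroring the hunks below):

	/* before: the callback sampled the TSC itself */
	u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu);

	/* after: the caller passes in the sample it wants scaled/offset */
	u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc);

	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, native_read_tsc());

This is what lets a later patch feed in a TSC value captured together with the host clock reading.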
Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b2e11f4..d60535a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -700,7 +700,7 @@ struct kvm_x86_ops { void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); - u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu); + u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc); void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 43e9fad..9392f52 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1011,7 +1011,7 @@ static void start_apic_timer(struct kvm_lapic *apic) local_irq_save(flags); now = apic->lapic_timer.timer.base->get_time(); - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; do_div(ns, this_tsc_khz); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e93908f..94f5ceb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3005,11 +3005,11 @@ static int cr8_write_interception(struct vcpu_svm *svm) return 0; } -u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu) +u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); return vmcb->control.tsc_offset + - svm_scale_tsc(vcpu, native_read_tsc()); + svm_scale_tsc(vcpu, host_tsc); } static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6599e45..896efd4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1839,11 +1839,10 @@ static u64 guest_read_tsc(void) * Like guest_read_tsc, but always returns L1's notion of the timestamp * counter, even if a nested guest (L2) is currently running. */ -u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu) +u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - u64 host_tsc, tsc_offset; + u64 tsc_offset; - rdtscll(host_tsc); tsc_offset = is_guest_mode(vcpu) ? to_vmx(vcpu)->nested.vmcs01_tsc_offset : vmcs_read64(TSC_OFFSET); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1dfe9d3..1155059 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1150,7 +1150,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); + tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, native_read_tsc()); kernel_ns = get_kernel_ns(); this_tsc_khz = __get_cpu_var(cpu_tsc_khz); if (unlikely(this_tsc_khz == 0)) { @@ -5338,7 +5338,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); - vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); + vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, + native_read_tsc()); vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); -- cgit v0.10.2 From e0b306fef90556233797d2e1747bd6a3ae35ea93 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Nov 2012 23:28:59 -0200 Subject: time: export time information for KVM pvclock As suggested by John, export time data similarly to how its done by vsyscall support. This allows KVM to retrieve necessary information to implement vsyscall support in KVM guests. 
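A consumer hooks in with a notifier block and is handed the struct timekeeper on every timekeeping update; a minimal sketch (kvm_gtod_notify() and copy_fields_of_interest() are placeholder names — the real consumer is the pvclock_gtod_notify() added by a following patch):

static int kvm_gtod_notify(struct notifier_block *nb, unsigned long unused,
			   void *priv)
{
	struct timekeeper *tk = priv;

	/* runs in timekeeping update context, under the timekeeper's
	 * write seqlock in this patch, so keep it short */
	copy_fields_of_interest(tk);
	return 0;
}

static struct notifier_block kvm_gtod_nb = {
	.notifier_call = kvm_gtod_notify,
};

	/* at init time: */
	pvclock_gtod_register_notifier(&kvm_gtod_nb);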
Acked-by: John Stultz Signed-off-by: Marcelo Tosatti diff --git a/include/linux/pvclock_gtod.h b/include/linux/pvclock_gtod.h new file mode 100644 index 0000000..0ca7582 --- /dev/null +++ b/include/linux/pvclock_gtod.h @@ -0,0 +1,9 @@ +#ifndef _PVCLOCK_GTOD_H +#define _PVCLOCK_GTOD_H + +#include + +extern int pvclock_gtod_register_notifier(struct notifier_block *nb); +extern int pvclock_gtod_unregister_notifier(struct notifier_block *nb); + +#endif /* _PVCLOCK_GTOD_H */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e424970..69f5342 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -21,6 +21,7 @@ #include #include #include +#include static struct timekeeper timekeeper; @@ -180,6 +181,54 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) return nsec + arch_gettimeoffset(); } +static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); + +static void update_pvclock_gtod(struct timekeeper *tk) +{ + raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk); +} + +/** + * pvclock_gtod_register_notifier - register a pvclock timedata update listener + * + * Must hold write on timekeeper.lock + */ +int pvclock_gtod_register_notifier(struct notifier_block *nb) +{ + struct timekeeper *tk = &timekeeper; + unsigned long flags; + int ret; + + write_seqlock_irqsave(&tk->lock, flags); + ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); + /* update timekeeping data */ + update_pvclock_gtod(tk); + write_sequnlock_irqrestore(&tk->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); + +/** + * pvclock_gtod_unregister_notifier - unregister a pvclock + * timedata update listener + * + * Must hold write on timekeeper.lock + */ +int pvclock_gtod_unregister_notifier(struct notifier_block *nb) +{ + struct timekeeper *tk = &timekeeper; + unsigned long flags; + int ret; + + write_seqlock_irqsave(&tk->lock, flags); + ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); + write_sequnlock_irqrestore(&tk->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); + /* must hold write on timekeeper.lock */ static void timekeeping_update(struct timekeeper *tk, bool clearntp) { @@ -188,6 +237,7 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp) ntp_clear(); } update_vsyscall(tk); + update_pvclock_gtod(tk); } /** -- cgit v0.10.2 From 16e8d74d2da9920f874b10a3d979fb25c01f518f Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Nov 2012 23:29:00 -0200 Subject: KVM: x86: notifier for clocksource changes Register a notifier for clocksource change event. In case the host switches to clock other than TSC, disable master clock usage. 
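The notifier body is small: refresh the copied timekeeping data and, if the host clocksource is no longer TSC while some guest still relies on the master clock, kick a workqueue to disable it (condensed from the hunk below):

	update_pvclock_gtod(tk);

	if (gtod->clock.vclock_mode != VCLOCK_TSC &&
	    atomic_read(&kvm_guest_has_master_clock) != 0)
		queue_work(system_long_wq, &pvclock_gtod_work);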
Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1155059..c077b81 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -46,6 +46,8 @@ #include #include #include +#include +#include #include #define CREATE_TRACE_POINTS @@ -901,6 +903,55 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) return kvm_set_msr(vcpu, index, *data); } +#ifdef CONFIG_X86_64 +struct pvclock_gtod_data { + seqcount_t seq; + + struct { /* extract of a clocksource struct */ + int vclock_mode; + cycle_t cycle_last; + cycle_t mask; + u32 mult; + u32 shift; + } clock; + + /* open coded 'struct timespec' */ + u64 monotonic_time_snsec; + time_t monotonic_time_sec; +}; + +static struct pvclock_gtod_data pvclock_gtod_data; + +static void update_pvclock_gtod(struct timekeeper *tk) +{ + struct pvclock_gtod_data *vdata = &pvclock_gtod_data; + + write_seqcount_begin(&vdata->seq); + + /* copy pvclock gtod data */ + vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode; + vdata->clock.cycle_last = tk->clock->cycle_last; + vdata->clock.mask = tk->clock->mask; + vdata->clock.mult = tk->mult; + vdata->clock.shift = tk->shift; + + vdata->monotonic_time_sec = tk->xtime_sec + + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_snsec = tk->xtime_nsec + + (tk->wall_to_monotonic.tv_nsec + << tk->shift); + while (vdata->monotonic_time_snsec >= + (((u64)NSEC_PER_SEC) << tk->shift)) { + vdata->monotonic_time_snsec -= + ((u64)NSEC_PER_SEC) << tk->shift; + vdata->monotonic_time_sec++; + } + + write_seqcount_end(&vdata->seq); +} +#endif + + static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { int version; @@ -997,6 +1048,8 @@ static inline u64 get_kernel_ns(void) return timespec_to_ns(&ts); } +static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); + static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); unsigned long max_tsc_khz; @@ -1229,7 +1282,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->last_kernel_ns = kernel_ns; vcpu->last_guest_tsc = tsc_timestamp; - /* * The interface expects us to write an even number signaling that the * update is finished. Since the guest won't see the intermediate @@ -4857,6 +4909,39 @@ static void kvm_set_mmio_spte_mask(void) kvm_mmu_set_mmio_spte_mask(mask); } +#ifdef CONFIG_X86_64 +static void pvclock_gtod_update_fn(struct work_struct *work) +{ +} + +static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); + +/* + * Notification about pvclock gtod data update. 
+ */ +static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, + void *priv) +{ + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + struct timekeeper *tk = priv; + + update_pvclock_gtod(tk); + + /* disable master clock if host does not trust, or does not + * use, TSC clocksource + */ + if (gtod->clock.vclock_mode != VCLOCK_TSC && + atomic_read(&kvm_guest_has_master_clock) != 0) + queue_work(system_long_wq, &pvclock_gtod_work); + + return 0; +} + +static struct notifier_block pvclock_gtod_notifier = { + .notifier_call = pvclock_gtod_notify, +}; +#endif + int kvm_arch_init(void *opaque) { int r; @@ -4898,6 +4983,10 @@ int kvm_arch_init(void *opaque) host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_lapic_init(); +#ifdef CONFIG_X86_64 + pvclock_gtod_register_notifier(&pvclock_gtod_notifier); +#endif + return 0; out: @@ -4912,6 +5001,9 @@ void kvm_arch_exit(void) cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); +#ifdef CONFIG_X86_64 + pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); +#endif kvm_x86_ops = NULL; kvm_mmu_module_exit(); } -- cgit v0.10.2 From d828199e84447795c6669ff0e6c6d55eb9beeff6 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Nov 2012 23:29:01 -0200 Subject: KVM: x86: implement PVCLOCK_TSC_STABLE_BIT pvclock flag KVM added a global variable to guarantee monotonicity in the guest. One of the reasons for that is that the time between 1. ktime_get_ts(×pec); 2. rdtscll(tsc); Is variable. That is, given a host with stable TSC, suppose that two VCPUs read the same time via ktime_get_ts() above. The time required to execute 2. is not the same on those two instances executing in different VCPUS (cache misses, interrupts...). If the TSC value that is used by the host to interpolate when calculating the monotonic time is the same value used to calculate the tsc_timestamp value stored in the pvclock data structure, and a single tuple is visible to all vcpus simultaneously, this problem disappears. See comment on top of pvclock_update_vm_gtod_copy for details. Monotonicity is then guaranteed by synchronicity of the host TSCs and guest TSCs. Set TSC stable pvclock flag in that case, allowing the guest to read clock from userspace. Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d60535a..32f0e4a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include @@ -560,6 +562,11 @@ struct kvm_arch { u64 cur_tsc_offset; u8 cur_tsc_generation; + spinlock_t pvclock_gtod_sync_lock; + bool use_master_clock; + u64 master_kernel_ns; + cycle_t master_cycle_now; + struct kvm_xen_hvm_config xen_hvm_config; /* fields used by HYPER-V emulation */ diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index bca63f0..1d65268 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -4,6 +4,7 @@ #include #include #include +#include #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm @@ -754,6 +755,35 @@ TRACE_EVENT( __entry->write ? "Write" : "Read", __entry->gpa_match ? 
"GPA" : "GVA") ); + +#ifdef CONFIG_X86_64 + +#define host_clocks \ + {VCLOCK_NONE, "none"}, \ + {VCLOCK_TSC, "tsc"}, \ + {VCLOCK_HPET, "hpet"} \ + +TRACE_EVENT(kvm_update_master_clock, + TP_PROTO(bool use_master_clock, unsigned int host_clock), + TP_ARGS(use_master_clock, host_clock), + + TP_STRUCT__entry( + __field( bool, use_master_clock ) + __field( unsigned int, host_clock ) + ), + + TP_fast_assign( + __entry->use_master_clock = use_master_clock; + __entry->host_clock = host_clock; + ), + + TP_printk("masterclock %d hostclock %s", + __entry->use_master_clock, + __print_symbolic(__entry->host_clock, host_clocks)) +); + +#endif /* CONFIG_X86_64 */ + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c077b81..a7b97a4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1048,7 +1048,9 @@ static inline u64 get_kernel_ns(void) return timespec_to_ns(&ts); } +#ifdef CONFIG_X86_64 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); +#endif static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); unsigned long max_tsc_khz; @@ -1190,21 +1192,170 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) EXPORT_SYMBOL_GPL(kvm_write_tsc); +#ifdef CONFIG_X86_64 + +static cycle_t read_tsc(void) +{ + cycle_t ret; + u64 last; + + /* + * Empirically, a fence (of type that depends on the CPU) + * before rdtsc is enough to ensure that rdtsc is ordered + * with respect to loads. The various CPU manuals are unclear + * as to whether rdtsc can be reordered with later loads, + * but no one has ever seen it happen. + */ + rdtsc_barrier(); + ret = (cycle_t)vget_cycles(); + + last = pvclock_gtod_data.clock.cycle_last; + + if (likely(ret >= last)) + return ret; + + /* + * GCC likes to generate cmov here, but this branch is extremely + * predictable (it's just a funciton of time and the likely is + * very likely) and there's a data dependence, so force GCC + * to generate a branch instead. I don't barrier() because + * we don't actually need a barrier, and if this function + * ever gets inlined it will generate worse code. + */ + asm volatile (""); + return last; +} + +static inline u64 vgettsc(cycle_t *cycle_now) +{ + long v; + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + + *cycle_now = read_tsc(); + + v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask; + return v * gtod->clock.mult; +} + +static int do_monotonic(struct timespec *ts, cycle_t *cycle_now) +{ + unsigned long seq; + u64 ns; + int mode; + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + + ts->tv_nsec = 0; + do { + seq = read_seqcount_begin(>od->seq); + mode = gtod->clock.vclock_mode; + ts->tv_sec = gtod->monotonic_time_sec; + ns = gtod->monotonic_time_snsec; + ns += vgettsc(cycle_now); + ns >>= gtod->clock.shift; + } while (unlikely(read_seqcount_retry(>od->seq, seq))); + timespec_add_ns(ts, ns); + + return mode; +} + +/* returns true if host is using tsc clocksource */ +static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now) +{ + struct timespec ts; + + /* checked again under seqlock below */ + if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) + return false; + + if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC) + return false; + + monotonic_to_bootbased(&ts); + *kernel_ns = timespec_to_ns(&ts); + + return true; +} +#endif + +/* + * + * Assuming a stable TSC across physical CPUS, the following condition + * is possible. Each numbered line represents an event visible to both + * CPUs at the next numbered event. 
+ * + * "timespecX" represents host monotonic time. "tscX" represents + * RDTSC value. + * + * VCPU0 on CPU0 | VCPU1 on CPU1 + * + * 1. read timespec0,tsc0 + * 2. | timespec1 = timespec0 + N + * | tsc1 = tsc0 + M + * 3. transition to guest | transition to guest + * 4. ret0 = timespec0 + (rdtsc - tsc0) | + * 5. | ret1 = timespec1 + (rdtsc - tsc1) + * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M)) + * + * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity: + * + * - ret0 < ret1 + * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M)) + * ... + * - 0 < N - M => M < N + * + * That is, when timespec0 != timespec1, M < N. Unfortunately that is not + * always the case (the difference between two distinct xtime instances + * might be smaller then the difference between corresponding TSC reads, + * when updating guest vcpus pvclock areas). + * + * To avoid that problem, do not allow visibility of distinct + * system_timestamp/tsc_timestamp values simultaneously: use a master + * copy of host monotonic time values. Update that master copy + * in lockstep. + * + * Rely on synchronization of host TSCs for monotonicity. + * + */ + +static void pvclock_update_vm_gtod_copy(struct kvm *kvm) +{ +#ifdef CONFIG_X86_64 + struct kvm_arch *ka = &kvm->arch; + int vclock_mode; + + /* + * If the host uses TSC clock, then passthrough TSC as stable + * to the guest. + */ + ka->use_master_clock = kvm_get_time_and_clockread( + &ka->master_kernel_ns, + &ka->master_cycle_now); + + if (ka->use_master_clock) + atomic_set(&kvm_guest_has_master_clock, 1); + + vclock_mode = pvclock_gtod_data.clock.vclock_mode; + trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode); +#endif +} + static int kvm_guest_time_update(struct kvm_vcpu *v) { - unsigned long flags; + unsigned long flags, this_tsc_khz; struct kvm_vcpu_arch *vcpu = &v->arch; + struct kvm_arch *ka = &v->kvm->arch; void *shared_kaddr; - unsigned long this_tsc_khz; s64 kernel_ns, max_kernel_ns; - u64 tsc_timestamp; + u64 tsc_timestamp, host_tsc; struct pvclock_vcpu_time_info *guest_hv_clock; u8 pvclock_flags; + bool use_master_clock; + + kernel_ns = 0; + host_tsc = 0; /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, native_read_tsc()); - kernel_ns = get_kernel_ns(); this_tsc_khz = __get_cpu_var(cpu_tsc_khz); if (unlikely(this_tsc_khz == 0)) { local_irq_restore(flags); @@ -1213,6 +1364,24 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) } /* + * If the host uses TSC clock, then passthrough TSC as stable + * to the guest. + */ + spin_lock(&ka->pvclock_gtod_sync_lock); + use_master_clock = ka->use_master_clock; + if (use_master_clock) { + host_tsc = ka->master_cycle_now; + kernel_ns = ka->master_kernel_ns; + } + spin_unlock(&ka->pvclock_gtod_sync_lock); + if (!use_master_clock) { + host_tsc = native_read_tsc(); + kernel_ns = get_kernel_ns(); + } + + tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc); + + /* * We may have to catch up the TSC to match elapsed wall clock * time for two reasons, even if kvmclock is used. * 1) CPU could have been running below the maximum TSC rate @@ -1273,9 +1442,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hw_tsc_khz = this_tsc_khz; } - if (max_kernel_ns > kernel_ns) - kernel_ns = max_kernel_ns; - + /* with a master tuple, + * pvclock clock reads always increase at the (scaled) rate + * of guest TSC - no need to deal with sampling errors. 
+ */ + if (!use_master_clock) { + if (max_kernel_ns > kernel_ns) + kernel_ns = max_kernel_ns; + } /* With all the info we got, fill in the values */ vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; @@ -1301,6 +1475,10 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->pvclock_set_guest_stopped_request = false; } + /* If the host uses TSC clocksource, then it is stable */ + if (use_master_clock) + pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; + vcpu->hv_clock.flags = pvclock_flags; memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, @@ -4912,6 +5090,17 @@ static void kvm_set_mmio_spte_mask(void) #ifdef CONFIG_X86_64 static void pvclock_gtod_update_fn(struct work_struct *work) { + struct kvm *kvm; + + struct kvm_vcpu *vcpu; + int i; + + raw_spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_for_each_vcpu(i, vcpu, kvm) + set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests); + atomic_set(&kvm_guest_has_master_clock, 0); + raw_spin_unlock(&kvm_lock); } static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); @@ -5303,6 +5492,29 @@ static void process_nmi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } +static void kvm_gen_update_masterclock(struct kvm *kvm) +{ +#ifdef CONFIG_X86_64 + int i; + struct kvm_vcpu *vcpu; + struct kvm_arch *ka = &kvm->arch; + + spin_lock(&ka->pvclock_gtod_sync_lock); + kvm_make_mclock_inprogress_request(kvm); + /* no guest entries from this point */ + pvclock_update_vm_gtod_copy(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + + /* guest entries allowed */ + kvm_for_each_vcpu(i, vcpu, kvm) + clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); + + spin_unlock(&ka->pvclock_gtod_sync_lock); +#endif +} + static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; @@ -5315,6 +5527,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); + if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) + kvm_gen_update_masterclock(vcpu->kvm); if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { r = kvm_guest_time_update(vcpu); if (unlikely(r)) @@ -6219,6 +6433,8 @@ int kvm_arch_hardware_enable(void *garbage) kvm_for_each_vcpu(i, vcpu, kvm) { vcpu->arch.tsc_offset_adjustment += delta_cyc; vcpu->arch.last_host_tsc = local_tsc; + set_bit(KVM_REQ_MASTERCLOCK_UPDATE, + &vcpu->requests); } /* @@ -6356,6 +6572,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); + spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); + + pvclock_update_vm_gtod_copy(kvm); return 0; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 99a4762..c94c998 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -131,6 +131,8 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_PMU 16 #define KVM_REQ_PMI 17 #define KVM_REQ_WATCHDOG 18 +#define KVM_REQ_MASTERCLOCK_UPDATE 19 +#define KVM_REQ_MCLOCK_INPROGRESS 20 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 @@ -540,6 +542,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); void kvm_flush_remote_tlbs(struct kvm *kvm); void kvm_reload_remote_mmus(struct kvm *kvm); +void kvm_make_mclock_inprogress_request(struct kvm *kvm); long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); diff --git 
a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e3f5b14..be3e7bb 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -212,6 +212,11 @@ void kvm_reload_remote_mmus(struct kvm *kvm) make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); } +void kvm_make_mclock_inprogress_request(struct kvm *kvm) +{ + make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); +} + int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) { struct page *page; -- cgit v0.10.2 From 42897d866b120547777ae1fd316680ec53356d9c Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Nov 2012 23:29:02 -0200 Subject: KVM: x86: add kvm_arch_vcpu_postcreate callback, move TSC initialization TSC initialization will soon make use of online_vcpus. Signed-off-by: Marcelo Tosatti diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index c71acd7..83b5453 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1330,6 +1330,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } +int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ + return 0; +} + int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -EINVAL; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index deb0d59..f9ab12a 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -439,6 +439,11 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) return vcpu; } +int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ + return 0; +} + void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { /* Make sure we're not using the vcpu anymore */ diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 38883f0..731ddee 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -355,6 +355,11 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); } +int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ + return 0; +} + int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH | diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 94f5ceb..161a5fa 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1251,7 +1251,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; init_vmcb(svm); - kvm_write_tsc(&svm->vcpu, 0); err = fx_init(&svm->vcpu); if (err) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 896efd4..bb18923 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3896,8 +3896,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); set_cr4_guest_host_mask(vmx); - kvm_write_tsc(&vmx->vcpu, 0); - return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a7b97a4..f3c069e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6320,6 +6320,19 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return r; } +int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ + int r; + + r = vcpu_load(vcpu); + if (r) + return r; + kvm_write_tsc(vcpu, 0); + vcpu_put(vcpu); + + return r; +} + void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { int r; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c94c998..8e5c7b6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -596,6 +596,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); 
struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); +int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); int kvm_arch_hardware_enable(void *garbage); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index be3e7bb..e6cfd43 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1853,6 +1853,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) atomic_inc(&kvm->online_vcpus); mutex_unlock(&kvm->lock); + kvm_arch_vcpu_postcreate(vcpu); return r; unlock_vcpu_destroy: -- cgit v0.10.2 From b48aa97e38206a84bf8485e7c553412274708ce5 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Nov 2012 23:29:03 -0200 Subject: KVM: x86: require matched TSC offsets for master clock With master clock, a pvclock clock read calculates: ret = system_timestamp + [ (rdtsc + tsc_offset) - tsc_timestamp ] Where 'rdtsc' is the host TSC. system_timestamp and tsc_timestamp are unique, one tuple per VM: the "master clock". Given a host with synchronized TSCs, its obvious that guest TSC must be matched for the above to guarantee monotonicity. Allow master clock usage only if guest TSCs are synchronized. Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 32f0e4a..9fb6d8d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -561,6 +561,7 @@ struct kvm_arch { u64 cur_tsc_write; u64 cur_tsc_offset; u8 cur_tsc_generation; + int nr_vcpus_matched_tsc; spinlock_t pvclock_gtod_sync_lock; bool use_master_clock; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 1d65268..fe5e00e 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -764,21 +764,54 @@ TRACE_EVENT( {VCLOCK_HPET, "hpet"} \ TRACE_EVENT(kvm_update_master_clock, - TP_PROTO(bool use_master_clock, unsigned int host_clock), - TP_ARGS(use_master_clock, host_clock), + TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched), + TP_ARGS(use_master_clock, host_clock, offset_matched), TP_STRUCT__entry( __field( bool, use_master_clock ) __field( unsigned int, host_clock ) + __field( bool, offset_matched ) ), TP_fast_assign( __entry->use_master_clock = use_master_clock; __entry->host_clock = host_clock; + __entry->offset_matched = offset_matched; ), - TP_printk("masterclock %d hostclock %s", + TP_printk("masterclock %d hostclock %s offsetmatched %u", __entry->use_master_clock, + __print_symbolic(__entry->host_clock, host_clocks), + __entry->offset_matched) +); + +TRACE_EVENT(kvm_track_tsc, + TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched, + unsigned int online_vcpus, bool use_master_clock, + unsigned int host_clock), + TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock, + host_clock), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( unsigned int, nr_vcpus_matched_tsc ) + __field( unsigned int, online_vcpus ) + __field( bool, use_master_clock ) + __field( unsigned int, host_clock ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->nr_vcpus_matched_tsc = nr_matched; + __entry->online_vcpus = online_vcpus; + __entry->use_master_clock = use_master_clock; + __entry->host_clock = host_clock; + ), + + TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u" + " hostclock %s", + __entry->vcpu_id, __entry->use_master_clock, + __entry->nr_vcpus_matched_tsc, __entry->online_vcpus, __print_symbolic(__entry->host_clock, host_clocks)) ); diff 
--git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f3c069e..422ef5e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1103,12 +1103,40 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) return tsc; } +void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + bool vcpus_matched; + bool do_request = false; + struct kvm_arch *ka = &vcpu->kvm->arch; + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + + vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == + atomic_read(&vcpu->kvm->online_vcpus)); + + if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC) + if (!ka->use_master_clock) + do_request = 1; + + if (!vcpus_matched && ka->use_master_clock) + do_request = 1; + + if (do_request) + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + + trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, + atomic_read(&vcpu->kvm->online_vcpus), + ka->use_master_clock, gtod->clock.vclock_mode); +#endif +} + void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) { struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; s64 usdiff; + bool matched; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); @@ -1151,6 +1179,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } + matched = true; } else { /* * We split periods of matched TSC writes into generations. @@ -1165,6 +1194,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) kvm->arch.cur_tsc_nsec = ns; kvm->arch.cur_tsc_write = data; kvm->arch.cur_tsc_offset = offset; + matched = false; pr_debug("kvm: new tsc generation %u, clock %llu\n", kvm->arch.cur_tsc_generation, data); } @@ -1188,6 +1218,15 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) kvm_x86_ops->write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + + spin_lock(&kvm->arch.pvclock_gtod_sync_lock); + if (matched) + kvm->arch.nr_vcpus_matched_tsc++; + else + kvm->arch.nr_vcpus_matched_tsc = 0; + + kvm_track_tsc_matching(vcpu); + spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); } EXPORT_SYMBOL_GPL(kvm_write_tsc); @@ -1279,8 +1318,9 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now) /* * - * Assuming a stable TSC across physical CPUS, the following condition - * is possible. Each numbered line represents an event visible to both + * Assuming a stable TSC across physical CPUS, and a stable TSC + * across virtual CPUs, the following condition is possible. + * Each numbered line represents an event visible to both * CPUs at the next numbered event. * * "timespecX" represents host monotonic time. "tscX" represents @@ -1313,7 +1353,7 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now) * copy of host monotonic time values. Update that master copy * in lockstep. * - * Rely on synchronization of host TSCs for monotonicity. + * Rely on synchronization of host TSCs and guest TSCs for monotonicity. * */ @@ -1322,20 +1362,27 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) #ifdef CONFIG_X86_64 struct kvm_arch *ka = &kvm->arch; int vclock_mode; + bool host_tsc_clocksource, vcpus_matched; + + vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == + atomic_read(&kvm->online_vcpus)); /* * If the host uses TSC clock, then passthrough TSC as stable * to the guest. 
*/ - ka->use_master_clock = kvm_get_time_and_clockread( + host_tsc_clocksource = kvm_get_time_and_clockread( &ka->master_kernel_ns, &ka->master_cycle_now); + ka->use_master_clock = host_tsc_clocksource & vcpus_matched; + if (ka->use_master_clock) atomic_set(&kvm_guest_has_master_clock, 1); vclock_mode = pvclock_gtod_data.clock.vclock_mode; - trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode); + trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode, + vcpus_matched); #endif } -- cgit v0.10.2 From d98d07ca7e0347d712d54a865af323c4aee04bc2 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Nov 2012 23:29:04 -0200 Subject: KVM: x86: update pvclock area conditionally, on cpu migration As requested by Glauber, do not update kvmclock area on vcpu->pcpu migration, in case the host has stable TSC. This is to reduce cacheline bouncing. Acked-by: Glauber Costa Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 422ef5e..fd76621 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2627,7 +2627,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops->write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + /* + * On a host with synchronized TSC, there is no need to update + * kvmclock on vcpu->cpu migration + */ + if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); if (vcpu->cpu != cpu) kvm_migrate_timers(vcpu); vcpu->cpu = cpu; -- cgit v0.10.2 From 859f8450d8a334a7f7cb994e4676cf918deff832 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Nov 2012 15:19:08 +0200 Subject: KVM: use is_idle_task() instead of idle_cpu() to decide when to halt in async_pf As Frederic pointed idle_cpu() may return false even if async fault happened in the idle task if wake up is pending. In this case the code will try to put idle task to sleep. Fix this by using is_idle_task() to check for idle task. 
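To make the contrast concrete, here is a rough sketch of the check being replaced. It is not the patch itself, only an illustration that assumes kernel context and the usual helpers (idle_cpu(), is_idle_task(), preempt_count(), smp_processor_id()); the wrapper function is invented for this example.

#include <linux/sched.h>
#include <linux/preempt.h>
#include <linux/smp.h>

/* Sketch: the decision the patch changes, assuming we are inside the
 * async page fault wait path. */
static bool apf_should_busy_halt(void)
{
	/* Old check, racy: a pending wakeup makes idle_cpu() return
	 * false even though 'current' is still the idle task, so the
	 * caller would wrongly try to put the idle task to sleep.
	 *
	 *	return idle_cpu(smp_processor_id()) || preempt_count() > 1;
	 */

	/* New check: keyed to the task that actually took the fault. */
	return is_idle_task(current) || preempt_count() > 1;
}

The design point is that is_idle_task() asks about the identity of the faulting task, which cannot change under us, while idle_cpu() asks about the runqueue state, which flips as soon as a wakeup is pending.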
Reported-by: Frederic Weisbecker Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a91c6b4..08b973f 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -120,11 +120,6 @@ void kvm_async_pf_task_wait(u32 token) struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; struct kvm_task_sleep_node n, *e; DEFINE_WAIT(wait); - int cpu, idle; - - cpu = get_cpu(); - idle = idle_cpu(cpu); - put_cpu(); spin_lock(&b->lock); e = _find_apf_task(b, token); @@ -138,7 +133,7 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); - n.halted = idle || preempt_count() > 1; + n.halted = is_idle_task(current) || preempt_count() > 1; init_waitqueue_head(&n.wq); hlist_add_head(&n.link, &b->list); spin_unlock(&b->lock); -- cgit v0.10.2 From e6c7d32172f10b68fa9a3be05aa1231352a52171 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 28 Nov 2012 20:53:15 +0800 Subject: KVM: VMX: fix invalid cpu passed to smp_call_function_single In loaded_vmcs_clear, loaded_vmcs->cpu is the fist parameter passed to smp_call_function_single, if the target cpu is downing (doing cpu hot remove), loaded_vmcs->cpu can become -1 then -1 is passed to smp_call_function_single It can be triggered when vcpu is being destroyed, loaded_vmcs_clear is called in the preemptionable context Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bb18923..4406374 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1007,9 +1007,11 @@ static void __loaded_vmcs_clear(void *arg) static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) { - if (loaded_vmcs->cpu != -1) - smp_call_function_single( - loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1); + int cpu = loaded_vmcs->cpu; + + if (cpu != -1) + smp_call_function_single(cpu, + __loaded_vmcs_clear, loaded_vmcs, 1); } static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) -- cgit v0.10.2 From 5a560f8b5ed361c7be783d5a671ea26ca4d6fa01 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 28 Nov 2012 20:54:14 +0800 Subject: KVM: VMX: fix memory order between loading vmcs and clearing vmcs vmcs->cpu indicates whether it exists on the target cpu, -1 means the vmcs does not exist on any vcpu If vcpu load vmcs with vmcs.cpu = -1, it can be directly added to cpu's percpu list. The list can be corrupted if the cpu prefetch the vmcs's list before reading vmcs->cpu. Meanwhile, we should remove vmcs from the list before making vmcs->vcpu == -1 be visible Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4406374..3606154 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1002,6 +1002,15 @@ static void __loaded_vmcs_clear(void *arg) if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) per_cpu(current_vmcs, cpu) = NULL; list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); + + /* + * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link + * is before setting loaded_vmcs->vcpu to -1 which is done in + * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist + * then adds the vmcs into percpu list before it is deleted. + */ + smp_wmb(); + loaded_vmcs_init(loaded_vmcs); } @@ -1537,6 +1546,14 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); local_irq_disable(); + + /* + * Read loaded_vmcs->cpu should be before fetching + * loaded_vmcs->loaded_vmcss_on_cpu_link. 
+ * See the comments in __loaded_vmcs_clear(). + */ + smp_rmb(); + list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, &per_cpu(loaded_vmcss_on_cpu, cpu)); local_irq_enable(); -- cgit v0.10.2 From 5419369ed6bd4cf711fdda5e52a5999b940413f5 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 29 Nov 2012 14:07:59 -0700 Subject: KVM: Fix user memslot overlap check Prior to memory slot sorting this loop compared all of the user memory slots for overlap with new entries. With memory slot sorting, we're just checking some number of entries in the array that may or may not be user slots. Instead, walk all the slots with kvm_for_each_memslot, which has the added benefit of terminating early when we hit the first empty slot, and skip comparison to private slots. Cc: stable@vger.kernel.org Signed-off-by: Alex Williamson Signed-off-by: Marcelo Tosatti diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e6cfd43..1cd693a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -714,8 +714,7 @@ int __kvm_set_memory_region(struct kvm *kvm, int r; gfn_t base_gfn; unsigned long npages; - unsigned long i; - struct kvm_memory_slot *memslot; + struct kvm_memory_slot *memslot, *slot; struct kvm_memory_slot old, new; struct kvm_memslots *slots, *old_memslots; @@ -766,13 +765,11 @@ int __kvm_set_memory_region(struct kvm *kvm, /* Check for overlaps */ r = -EEXIST; - for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *s = &kvm->memslots->memslots[i]; - - if (s == memslot || !s->npages) + kvm_for_each_memslot(slot, kvm->memslots) { + if (slot->id >= KVM_MEMORY_SLOTS || slot == memslot) continue; - if (!((base_gfn + npages <= s->base_gfn) || - (base_gfn >= s->base_gfn + s->npages))) + if (!((base_gfn + npages <= slot->base_gfn) || + (base_gfn >= slot->base_gfn + slot->npages))) goto out_free; } -- cgit v0.10.2 From 8fe8ab46be06fcd9abfe6fe9928fd95b54ab079a Mon Sep 17 00:00:00 2001 From: Will Auld Date: Thu, 29 Nov 2012 12:42:12 -0800 Subject: KVM: x86: Add code to track call origin for msr assignment In order to track who initiated the call (host or guest) to modify an msr value I have changed function call parameters along the call path. The specific change is to add a struct pointer parameter that points to (index, data, caller) information rather than having this information passed as individual parameters. The initial use for this capability is for updating the IA32_TSC_ADJUST msr while setting the tsc value. It is anticipated that this capability is useful for other tasks. 
Signed-off-by: Will Auld Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9fb6d8d..56c5dca 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -620,6 +620,12 @@ struct kvm_vcpu_stat { struct x86_instruction_info; +struct msr_data { + bool host_initiated; + u32 index; + u64 data; +}; + struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ @@ -642,7 +648,7 @@ struct kvm_x86_ops { void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu); int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); - int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); + int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); void (*get_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -793,7 +799,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu, void kvm_enable_efer_bits(u64); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); +int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); struct x86_emulate_ctxt; @@ -820,7 +826,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 161a5fa..fc22e58 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3127,13 +3127,15 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) return 0; } -static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) +static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct vcpu_svm *svm = to_svm(vcpu); + u32 ecx = msr->index; + u64 data = msr->data; switch (ecx) { case MSR_IA32_TSC: - kvm_write_tsc(vcpu, data); + kvm_write_tsc(vcpu, msr); break; case MSR_STAR: svm->vmcb->save.star = data; @@ -3188,20 +3190,24 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; default: - return kvm_set_msr_common(vcpu, ecx, data); + return kvm_set_msr_common(vcpu, msr); } return 0; } static int wrmsr_interception(struct vcpu_svm *svm) { + struct msr_data msr; u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); + msr.data = data; + msr.index = ecx; + msr.host_initiated = false; svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (svm_set_msr(&svm->vcpu, ecx, data)) { + if (svm_set_msr(&svm->vcpu, &msr)) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); } else { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3606154..45ffa32 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2220,15 +2220,17 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. 
*/ -static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct shared_msr_entry *msr; int ret = 0; + u32 msr_index = msr_info->index; + u64 data = msr_info->data; switch (msr_index) { case MSR_EFER: - ret = kvm_set_msr_common(vcpu, msr_index, data); + ret = kvm_set_msr_common(vcpu, msr_info); break; #ifdef CONFIG_X86_64 case MSR_FS_BASE: @@ -2254,7 +2256,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_TSC: - kvm_write_tsc(vcpu, data); + kvm_write_tsc(vcpu, msr_info); break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { @@ -2262,7 +2264,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vcpu->arch.pat = data; break; } - ret = kvm_set_msr_common(vcpu, msr_index, data); + ret = kvm_set_msr_common(vcpu, msr_info); break; case MSR_TSC_AUX: if (!vmx->rdtscp_enabled) @@ -2285,7 +2287,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) } break; } - ret = kvm_set_msr_common(vcpu, msr_index, data); + ret = kvm_set_msr_common(vcpu, msr_info); } return ret; @@ -4648,11 +4650,15 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) static int handle_wrmsr(struct kvm_vcpu *vcpu) { + struct msr_data msr; u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); - if (vmx_set_msr(vcpu, ecx, data) != 0) { + msr.data = data; + msr.index = ecx; + msr.host_initiated = false; + if (vmx_set_msr(vcpu, &msr) != 0) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fd76621..95f6613 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -890,9 +890,9 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. 
*/ -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { - return kvm_x86_ops->set_msr(vcpu, msr_index, data); + return kvm_x86_ops->set_msr(vcpu, msr); } /* @@ -900,7 +900,12 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) */ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - return kvm_set_msr(vcpu, index, *data); + struct msr_data msr; + + msr.data = *data; + msr.index = index; + msr.host_initiated = true; + return kvm_set_msr(vcpu, &msr); } #ifdef CONFIG_X86_64 @@ -1130,13 +1135,14 @@ void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) #endif } -void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) +void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; s64 usdiff; bool matched; + u64 data = msr->data; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); @@ -1857,9 +1863,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu) &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); } -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) +int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { bool pr = false; + u32 msr = msr_info->index; + u64 data = msr_info->data; switch (msr) { case MSR_EFER: @@ -4531,7 +4539,12 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data) { - return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); + struct msr_data msr; + + msr.data = data; + msr.index = msr_index; + msr.host_initiated = false; + return kvm_set_msr(emul_to_vcpu(ctxt), &msr); } static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, @@ -6375,11 +6388,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { int r; + struct msr_data msr; r = vcpu_load(vcpu); if (r) return r; - kvm_write_tsc(vcpu, 0); + msr.data = 0x0; + msr.index = MSR_IA32_TSC; + msr.host_initiated = true; + kvm_write_tsc(vcpu, &msr); vcpu_put(vcpu); return r; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 2b5219c..e224f7a 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -112,7 +112,7 @@ void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); -void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); +void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, -- cgit v0.10.2 From ba904635d498fea43fc3610983f9dc430ac324e4 Mon Sep 17 00:00:00 2001 From: Will Auld Date: Thu, 29 Nov 2012 12:42:50 -0800 Subject: KVM: x86: Emulate IA32_TSC_ADJUST MSR CPUID.7.0.EBX[1]=1 indicates IA32_TSC_ADJUST MSR 0x3b is supported Basic design is to emulate the MSR by allowing reads and writes to a guest vcpu specific location to store the value of the emulated MSR while adding the value to the vmcs tsc_offset. In this way the IA32_TSC_ADJUST value will be included in all reads to the TSC MSR whether through rdmsr or rdtsc. This is of course as long as the "use TSC counter offsetting" VM-execution control is enabled as well as the IA32_TSC_ADJUST control. 
However, because hardware will only return the TSC + IA32_TSC_ADJUST + vmsc tsc_offset for a guest process when it does and rdtsc (with the correct settings) the value of our virtualized IA32_TSC_ADJUST must be stored in one of these three locations. The argument against storing it in the actual MSR is performance. This is likely to be seldom used while the save/restore is required on every transition. IA32_TSC_ADJUST was created as a way to solve some issues with writing TSC itself so that is not an option either. The remaining option, defined above as our solution has the problem of returning incorrect vmcs tsc_offset values (unless we intercept and fix, not done here) as mentioned above. However, more problematic is that storing the data in vmcs tsc_offset will have a different semantic effect on the system than does using the actual MSR. This is illustrated in the following example: The hypervisor set the IA32_TSC_ADJUST, then the guest sets it and a guest process performs a rdtsc. In this case the guest process will get TSC + IA32_TSC_ADJUST_hyperviser + vmsc tsc_offset including IA32_TSC_ADJUST_guest. While the total system semantics changed the semantics as seen by the guest do not and hence this will not cause a problem. Signed-off-by: Will Auld Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 8c297aa..602c476 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -202,6 +202,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ #define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_TSC_ADJUST (9*32+ 1) /* TSC adjustment MSR 0x3b */ #define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */ #define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */ #define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 56c5dca..dc87b65 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -444,6 +444,7 @@ struct kvm_vcpu_arch { s8 virtual_tsc_shift; u32 virtual_tsc_mult; u32 virtual_tsc_khz; + s64 ia32_tsc_adjust_msr; atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ unsigned nmi_pending; /* NMI queued after currently running handler */ @@ -711,6 +712,7 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale); + u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 7f0edce..c2dea36 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -236,6 +236,7 @@ #define MSR_IA32_EBL_CR_POWERON 0x0000002a #define MSR_EBC_FREQUENCY_ID 0x0000002c #define MSR_IA32_FEATURE_CONTROL 0x0000003a +#define MSR_IA32_TSC_ADJUST 0x0000003b #define FEATURE_CONTROL_LOCKED (1<<0) #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ec79e77..52f6166 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -320,6 +320,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, if (index == 0) { entry->ebx &= kvm_supported_word9_x86_features; cpuid_mask(&entry->ebx, 9); + // TSC_ADJUST is emulated + entry->ebx |= 
F(TSC_ADJUST); } else entry->ebx = 0; entry->eax = 0; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index a10e460..3a8b504 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -28,6 +28,14 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) return best && (best->ecx & bit(X86_FEATURE_XSAVE)); } +static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_TSC_ADJUST)); +} + static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index fc22e58..dcb79527 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1009,6 +1009,13 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) svm->tsc_ratio = ratio; } +static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + return svm->vmcb->control.tsc_offset; +} + static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4304,6 +4311,7 @@ static struct kvm_x86_ops svm_x86_ops = { .has_wbinvd_exit = svm_has_wbinvd_exit, .set_tsc_khz = svm_set_tsc_khz, + .read_tsc_offset = svm_read_tsc_offset, .write_tsc_offset = svm_write_tsc_offset, .adjust_tsc_offset = svm_adjust_tsc_offset, .compute_tsc_offset = svm_compute_tsc_offset, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 45ffa32..2fd2046 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1884,6 +1884,11 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) WARN(1, "user requested TSC rate below hardware speed\n"); } +static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu) +{ + return vmcs_read64(TSC_OFFSET); +} + /* * writes 'offset' into guest's timestamp counter offset register */ @@ -2266,6 +2271,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } ret = kvm_set_msr_common(vcpu, msr_info); break; + case MSR_IA32_TSC_ADJUST: + ret = kvm_set_msr_common(vcpu, msr_info); + break; case MSR_TSC_AUX: if (!vmx->rdtscp_enabled) return 1; @@ -7345,6 +7353,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, .set_tsc_khz = vmx_set_tsc_khz, + .read_tsc_offset = vmx_read_tsc_offset, .write_tsc_offset = vmx_write_tsc_offset, .adjust_tsc_offset = vmx_adjust_tsc_offset, .compute_tsc_offset = vmx_compute_tsc_offset, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 95f6613..b0b8abe 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -831,6 +831,7 @@ static u32 msrs_to_save[] = { static unsigned num_msrs_to_save; static const u32 emulated_msrs[] = { + MSR_IA32_TSC_ADJUST, MSR_IA32_TSCDEADLINE, MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, @@ -1135,6 +1136,12 @@ void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) #endif } +static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) +{ + u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu); + vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; +} + void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct kvm *kvm = vcpu->kvm; @@ -1222,6 +1229,8 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; + if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated) + update_ia32_tsc_adjust_msr(vcpu, offset); 
kvm_x86_ops->write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); @@ -1918,6 +1927,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSCDEADLINE: kvm_set_lapic_tscdeadline_msr(vcpu, data); break; + case MSR_IA32_TSC_ADJUST: + if (guest_cpuid_has_tsc_adjust(vcpu)) { + if (!msr_info->host_initiated) { + u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; + kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true); + } + vcpu->arch.ia32_tsc_adjust_msr = data; + } + break; case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; @@ -2277,6 +2295,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_TSCDEADLINE: data = kvm_get_lapic_tscdeadline_msr(vcpu); break; + case MSR_IA32_TSC_ADJUST: + data = (u64)vcpu->arch.ia32_tsc_adjust_msr; + break; case MSR_IA32_MISC_ENABLE: data = vcpu->arch.ia32_misc_enable_msr; break; @@ -6607,6 +6628,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) goto fail_free_mce_banks; + vcpu->arch.ia32_tsc_adjust_msr = 0x0; kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); -- cgit v0.10.2 From 45e3cc7d9fe69844cd12d51c511e1e98d156bbe1 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 2 Dec 2012 11:04:14 +0100 Subject: KVM: x86: Fix uninitialized return code This is a regression caused by 18595411a7. Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b0b8abe..3bdaf29 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3006,6 +3006,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_LAPIC: { + r = -EINVAL; if (!vcpu->arch.apic) goto out; u.lapic = memdup_user(argp, sizeof(*u.lapic)); -- cgit v0.10.2 From 01f218803757c9ec1152ac2fd39d03c27c452634 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 17 Oct 2012 18:06:02 +0200 Subject: kvm: add kvm_set_irq_inatomic Add an API to inject IRQ from atomic context. Return EWOULDBLOCK if impossible (e.g. for multicast). Only MSI is supported ATM. Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Gleb Natapov diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8e5c7b6..36c3704 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -693,6 +693,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, unsigned long *deliver_bitmask); #endif int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level); +int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, int irq_source_id, int level); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 2eb58af..656fa45 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -102,6 +102,23 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, return r; } +static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq) +{ + trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); + + irq->dest_id = (e->msi.address_lo & + MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; + irq->vector = (e->msi.data & + MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; + irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; + irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; + irq->delivery_mode = e->msi.data & 0x700; + irq->level = 1; + irq->shorthand = 0; + /* TODO Deal with RH bit of MSI message address */ +} + int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level) { @@ -110,22 +127,26 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, if (!level) return -1; - trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); + kvm_set_msi_irq(e, &irq); - irq.dest_id = (e->msi.address_lo & - MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; - irq.vector = (e->msi.data & - MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; - irq.dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; - irq.trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; - irq.delivery_mode = e->msi.data & 0x700; - irq.level = 1; - irq.shorthand = 0; - - /* TODO Deal with RH bit of MSI message address */ return kvm_irq_delivery_to_apic(kvm, NULL, &irq); } + +static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm) +{ + struct kvm_lapic_irq irq; + int r; + + kvm_set_msi_irq(e, &irq); + + if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r)) + return r; + else + return -EWOULDBLOCK; +} + int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) { struct kvm_kernel_irq_routing_entry route; @@ -178,6 +199,44 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level) return ret; } +/* + * Deliver an IRQ in an atomic context if we can, or return a failure, + * user can retry in a process context. + * Return value: + * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. + * Other values - No need to retry. + */ +int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) +{ + struct kvm_kernel_irq_routing_entry *e; + int ret = -EINVAL; + struct kvm_irq_routing_table *irq_rt; + struct hlist_node *n; + + trace_kvm_set_irq(irq, level, irq_source_id); + + /* + * Injection into either PIC or IOAPIC might need to scan all CPUs, + * which would need to be retried from thread context; when same GSI + * is connected to both PIC and IOAPIC, we'd have to report a + * partial failure here. 
+ * Since there's no easy way to do this, we only support injecting MSI + * which is limited to 1:1 GSI mapping. + */ + rcu_read_lock(); + irq_rt = rcu_dereference(kvm->irq_routing); + if (irq < irq_rt->nr_rt_entries) + hlist_for_each_entry(e, n, &irq_rt->map[irq], link) { + if (likely(e->type == KVM_IRQ_ROUTING_MSI)) + ret = kvm_set_msi_inatomic(e, kvm); + else + ret = -EWOULDBLOCK; + break; + } + rcu_read_unlock(); + return ret; +} + void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) { struct kvm_irq_ack_notifier *kian; -- cgit v0.10.2 From 78c634402a1825f1f5bef13077f0985f3b8a3212 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 17 Oct 2012 18:06:06 +0200 Subject: kvm: deliver msi interrupts from irq handler We can deliver certain interrupts, notably MSI, from atomic context. Use kvm_set_irq_inatomic, to implement an irq handler for msi. This reduces the pressure on scheduler in case where host and guest irq share a host cpu. Signed-off-by: Michael S. Tsirkin Signed-off-by: Gleb Natapov diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index 23a41a9..3642239 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -105,6 +105,15 @@ static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) } #ifdef __KVM_HAVE_MSI +static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + int ret = kvm_set_irq_inatomic(assigned_dev->kvm, + assigned_dev->irq_source_id, + assigned_dev->guest_irq, 1); + return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; +} + static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; @@ -117,6 +126,23 @@ static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) #endif #ifdef __KVM_HAVE_MSIX +static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + int index = find_index_from_host_irq(assigned_dev, irq); + u32 vector; + int ret = 0; + + if (index >= 0) { + vector = assigned_dev->guest_msix_entries[index].vector; + ret = kvm_set_irq_inatomic(assigned_dev->kvm, + assigned_dev->irq_source_id, + vector, 1); + } + + return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; +} + static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; @@ -334,11 +360,6 @@ static int assigned_device_enable_host_intx(struct kvm *kvm, } #ifdef __KVM_HAVE_MSI -static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) -{ - return IRQ_WAKE_THREAD; -} - static int assigned_device_enable_host_msi(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev) { @@ -363,11 +384,6 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, #endif #ifdef __KVM_HAVE_MSIX -static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) -{ - return IRQ_WAKE_THREAD; -} - static int assigned_device_enable_host_msix(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev) { -- cgit v0.10.2 From 0307b7b8c275e65552f6022a18ad91902ae25d42 Mon Sep 17 00:00:00 2001 From: Zhang Xiantao Date: Wed, 5 Dec 2012 01:55:14 +0800 Subject: kvm: remove unnecessary bit checking for ept violation Bit 6 in EPT vmexit's exit qualification is not defined in SDM, so remove it. 
Signed-off-by: Zhang Xiantao Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2fd2046..d2248b3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4863,11 +4863,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - if (exit_qualification & (1 << 6)) { - printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); - return -EINVAL; - } - gla_validity = (exit_qualification >> 7) & 0x3; if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); -- cgit v0.10.2 From 2b3c5cbc0d814437fe4d70cc11ed60550b95b29f Mon Sep 17 00:00:00 2001 From: Zhang Xiantao Date: Wed, 5 Dec 2012 01:55:15 +0800 Subject: kvm: don't use bit24 for detecting address-specific invalidation capability Bit24 in VMX_EPT_VPID_CAP_MASI is not used for address-specific invalidation capability reporting, so remove it from KVM to avoid conflicts in future. Signed-off-by: Zhang Xiantao Signed-off-by: Gleb Natapov diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 36ec21c..c2d56b3 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -445,8 +445,7 @@ enum vmcs_field { #define VMX_EPTP_WB_BIT (1ull << 14) #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) #define VMX_EPT_1GB_PAGE_BIT (1ull << 17) -#define VMX_EPT_AD_BIT (1ull << 21) -#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) +#define VMX_EPT_AD_BIT (1ull << 21) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d2248b3..32485d8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -802,11 +802,6 @@ static inline bool cpu_has_vmx_ept_ad_bits(void) return vmx_capability.ept & VMX_EPT_AD_BIT; } -static inline bool cpu_has_vmx_invept_individual_addr(void) -{ - return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; -} - static inline bool cpu_has_vmx_invept_context(void) { return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; @@ -1062,17 +1057,6 @@ static inline void ept_sync_context(u64 eptp) } } -static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) -{ - if (enable_ept) { - if (cpu_has_vmx_invept_individual_addr()) - __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, - eptp, gpa); - else - ept_sync_context(eptp); - } -} - static __always_inline unsigned long vmcs_readl(unsigned long field) { unsigned long value; -- cgit v0.10.2 From 66f7b72e117180d0007e7a65b8dc5bd1c8126e3b Mon Sep 17 00:00:00 2001 From: Julian Stecklina Date: Wed, 5 Dec 2012 15:26:19 +0100 Subject: KVM: x86: Make register state after reset conform to specification VMX behaves now as SVM wrt to FPU initialization. Code has been moved to generic code path. General-purpose registers are now cleared on reset and INIT. SVM code properly initializes EDX. 
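Condensed into one place, the register state this change aims for after RESET/INIT looks roughly as follows. This is only a sketch assembled from the x86.c and svm.c hunks in the diff below, reusing the real kvm_cpuid() and kvm_register_write() helpers; the wrapper function is not part of the patch.

static void reset_gprs_to_architected_state(struct kvm_vcpu *vcpu)
{
	u32 eax = 1, dummy;

	/* All general-purpose registers read as zero after INIT/RESET... */
	memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
	vcpu->arch.regs_avail = ~0;
	vcpu->arch.regs_dirty = ~0;

	/* ...except EDX, which carries the processor signature
	 * (family/model/stepping), i.e. CPUID.01H:EAX. */
	kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
	kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
}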
Signed-off-by: Julian Stecklina Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 52f6166..a20ecb5 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -661,6 +661,7 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) } else *eax = *ebx = *ecx = *edx = 0; } +EXPORT_SYMBOL_GPL(kvm_cpuid); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index dcb79527..d29d3cd 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -20,6 +20,7 @@ #include "mmu.h" #include "kvm_cache_regs.h" #include "x86.h" +#include "cpuid.h" #include #include @@ -1193,6 +1194,8 @@ static void init_vmcb(struct vcpu_svm *svm) static int svm_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + u32 dummy; + u32 eax = 1; init_vmcb(svm); @@ -1201,8 +1204,9 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; } - vcpu->arch.regs_avail = ~0; - vcpu->arch.regs_dirty = ~0; + + kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); + kvm_register_write(vcpu, VCPU_REGS_RDX, eax); return 0; } @@ -1259,10 +1263,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->asid_generation = 0; init_vmcb(svm); - err = fx_init(&svm->vcpu); - if (err) - goto free_page4; - svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; @@ -1271,8 +1271,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) return &svm->vcpu; -free_page4: - __free_page(hsave_page); free_page3: __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); free_page2: diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 32485d8..94833e2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3918,8 +3918,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) u64 msr; int ret; - vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); - vmx->rmode.vm86_active = 0; vmx->soft_vnmi_blocked = 0; @@ -3931,10 +3929,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) msr |= MSR_IA32_APICBASE_BSP; kvm_set_apic_base(&vmx->vcpu, msr); - ret = fx_init(&vmx->vcpu); - if (ret != 0) - goto out; - vmx_segment_cache_clear(vmx); seg_setup(VCPU_SREG_CS); @@ -3975,7 +3969,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) kvm_rip_write(vcpu, 0xfff0); else kvm_rip_write(vcpu, 0); - kvm_register_write(vcpu, VCPU_REGS_RSP, 0); vmcs_writel(GUEST_GDTR_BASE, 0); vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); @@ -4025,7 +4018,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) /* HACK: Don't enable emulation on guest boot/reset */ vmx->emulation_required = 0; -out: return ret; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3bdaf29..57c76e8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6461,6 +6461,10 @@ static int kvm_vcpu_reset(struct kvm_vcpu *vcpu) kvm_pmu_reset(vcpu); + memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); + vcpu->arch.regs_avail = ~0; + vcpu->arch.regs_dirty = ~0; + return kvm_x86_ops->vcpu_reset(vcpu); } @@ -6629,11 +6633,17 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) goto fail_free_mce_banks; + r = fx_init(vcpu); + if (r) + goto fail_free_wbinvd_dirty_mask; + vcpu->arch.ia32_tsc_adjust_msr = 0x0; kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); 
return 0; +fail_free_wbinvd_dirty_mask: + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fail_free_mce_banks: kfree(vcpu->arch.mce_banks); fail_free_lapic: -- cgit v0.10.2 From 914daba865cb5c38cd5fdee024ca38029315b38f Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 9 Oct 2012 00:22:59 +0200 Subject: KVM: Distangle eventfd code from irqchip The current eventfd code assumes that when we have eventfd, we also have irqfd for in-kernel interrupt delivery. This is not necessarily true. On PPC we don't have an in-kernel irqchip yet, but we can still support easily support eventfd. Signed-off-by: Alexander Graf diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8e5c7b6..c823e47 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -900,10 +900,20 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {} #ifdef CONFIG_HAVE_KVM_EVENTFD void kvm_eventfd_init(struct kvm *kvm); +int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); + +#ifdef CONFIG_HAVE_KVM_IRQCHIP int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args); void kvm_irqfd_release(struct kvm *kvm); void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *); -int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); +#else +static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) +{ + return -EINVAL; +} + +static inline void kvm_irqfd_release(struct kvm *kvm) {} +#endif #else diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 9718e98..d7424c8 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -35,6 +35,7 @@ #include "iodev.h" +#ifdef __KVM_HAVE_IOAPIC /* * -------------------------------------------------------------------- * irqfd: Allows an fd to be used to inject an interrupt to the guest @@ -425,17 +426,21 @@ fail: kfree(irqfd); return ret; } +#endif void kvm_eventfd_init(struct kvm *kvm) { +#ifdef __KVM_HAVE_IOAPIC spin_lock_init(&kvm->irqfds.lock); INIT_LIST_HEAD(&kvm->irqfds.items); INIT_LIST_HEAD(&kvm->irqfds.resampler_list); mutex_init(&kvm->irqfds.resampler_lock); +#endif INIT_LIST_HEAD(&kvm->ioeventfds); } +#ifdef __KVM_HAVE_IOAPIC /* * shutdown any irqfd's that match fd+gsi */ @@ -555,6 +560,7 @@ static void __exit irqfd_module_exit(void) module_init(irqfd_module_init); module_exit(irqfd_module_exit); +#endif /* * -------------------------------------------------------------------- -- cgit v0.10.2 From 0e673fb679027600cad45bd61a4cc9ebd2ed2bb1 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 9 Oct 2012 00:06:20 +0200 Subject: KVM: PPC: Support eventfd In order to support the generic eventfd infrastructure on PPC, we need to call into the generic KVM in-kernel device mmio code. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 71f0cd9..4730c95 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -20,6 +20,7 @@ config KVM bool select PREEMPT_NOTIFIERS select ANON_INODES + select HAVE_KVM_EVENTFD config KVM_BOOK3S_HANDLER bool diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index c2a0863..cd89658 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -6,7 +6,8 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm -common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o) +common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o \ + eventfd.o) CFLAGS_44x_tlb.o := -I. CFLAGS_e500_tlb.o := -I. 
@@ -76,6 +77,7 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ kvm-book3s_64-module-objs := \ ../../../virt/kvm/kvm_main.o \ + ../../../virt/kvm/eventfd.o \ powerpc.o \ emulate.o \ book3s.o \ diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index f9ab12a..d583ea1 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -314,6 +314,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PPC_IRQ_LEVEL: case KVM_CAP_ENABLE_CAP: case KVM_CAP_ONE_REG: + case KVM_CAP_IOEVENTFD: r = 1; break; #ifndef CONFIG_KVM_BOOK3S_64_HV @@ -618,6 +619,13 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->mmio_is_write = 0; vcpu->arch.mmio_sign_extend = 0; + if (!kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, + bytes, &run->mmio.data)) { + kvmppc_complete_mmio_load(vcpu, run); + vcpu->mmio_needed = 0; + return EMULATE_DONE; + } + return EMULATE_DO_MMIO; } @@ -627,8 +635,8 @@ int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu, { int r; - r = kvmppc_handle_load(run, vcpu, rt, bytes, is_bigendian); vcpu->arch.mmio_sign_extend = 1; + r = kvmppc_handle_load(run, vcpu, rt, bytes, is_bigendian); return r; } @@ -666,6 +674,13 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, } } + if (!kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, + bytes, &run->mmio.data)) { + kvmppc_complete_mmio_load(vcpu, run); + vcpu->mmio_needed = 0; + return EMULATE_DONE; + } + return EMULATE_DO_MMIO; } -- cgit v0.10.2 From 7ed661bf852cefa1ab57ad709a675bfb029d47ab Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 13 Nov 2012 18:31:32 +0000 Subject: KVM: PPC: Book3S HV: Restructure HPT entry creation code This restructures the code that creates HPT (hashed page table) entries so that it can be called in situations where we don't have a struct vcpu pointer, only a struct kvm pointer. It also fixes a bug where kvmppc_map_vrma() would corrupt the guest R4 value. Most of the work of kvmppc_virtmode_h_enter is now done by a new function, kvmppc_virtmode_do_h_enter, which itself calls another new function, kvmppc_do_h_enter, which contains most of the old kvmppc_h_enter. The new kvmppc_do_h_enter takes explicit arguments for the place to return the HPTE index, the Linux page tables to use, and whether it is being called in real mode, thus removing the need for it to have the vcpu as an argument. Currently kvmppc_map_vrma creates the VRMA (virtual real mode area) HPTEs by calling kvmppc_virtmode_h_enter, which is designed primarily to handle H_ENTER hcalls from the guest that need to pin a page of memory. Since H_ENTER returns the index of the created HPTE in R4, kvmppc_virtmode_h_enter updates the guest R4, corrupting the guest R4 in the case when it gets called from kvmppc_map_vrma on the first VCPU_RUN ioctl. With this, kvmppc_map_vrma instead calls kvmppc_virtmode_do_h_enter with the address of a dummy word as the place to store the HPTE index, thus avoiding corrupting the guest R4. 
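To make the calling-convention change easier to follow, here is a simplified, self-contained C sketch of the pattern described above (the names mirror the patch but the bodies are stand-ins, not the real implementation): the core insert routine reports the new HPTE index through a caller-supplied pointer, so the hcall path can aim it at guest R4 while the VRMA setup path aims it at a throwaway local.

#include <stdio.h>

struct vcpu { unsigned long gpr[32]; };

/* core insert routine: reports the chosen HPTE slot via idx_ret */
static long do_h_enter(long pte_index, unsigned long *idx_ret)
{
	/* a real implementation would search the HPTE group here */
	*idx_ret = (unsigned long)pte_index;
	return 0;				/* H_SUCCESS */
}

/* guest hcall path: the result really does belong in guest R4 */
static long h_enter_from_guest(struct vcpu *vcpu, long pte_index)
{
	return do_h_enter(pte_index, &vcpu->gpr[4]);
}

/* VRMA setup path: only the return status matters, R4 must not change */
static long map_vrma_entry(long pte_index)
{
	unsigned long idx_ret;			/* dummy destination */

	return do_h_enter(pte_index, &idx_ret);
}

int main(void)
{
	struct vcpu v = { .gpr = { 0 } };

	h_enter_from_guest(&v, 0x123);
	map_vrma_entry(0x456);
	printf("guest r4 = 0x%lx\n", v.gpr[4]);	/* prints 0x123 */
	return 0;
}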
Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 36fcf41..fea768f 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -157,8 +157,9 @@ extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr, extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr); extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel); -extern long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, - long pte_index, unsigned long pteh, unsigned long ptel); +extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, + long pte_index, unsigned long pteh, unsigned long ptel, + pgd_t *pgdir, bool realmode, unsigned long *idx_ret); extern long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 2a89a36..6ee6516 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -41,6 +41,10 @@ /* Power architecture requires HPT is at least 256kB */ #define PPC_MIN_HPT_ORDER 18 +static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, + long pte_index, unsigned long pteh, + unsigned long ptel, unsigned long *pte_idx_ret); + long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) { unsigned long hpt; @@ -185,6 +189,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, unsigned long addr, hash; unsigned long psize; unsigned long hp0, hp1; + unsigned long idx_ret; long ret; struct kvm *kvm = vcpu->kvm; @@ -216,7 +221,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, hash = (hash << 3) + 7; hp_v = hp0 | ((addr >> 16) & ~0x7fUL); hp_r = hp1 | addr; - ret = kvmppc_virtmode_h_enter(vcpu, H_EXACT, hash, hp_v, hp_r); + ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r, + &idx_ret); if (ret != H_SUCCESS) { pr_err("KVM: map_vrma at %lx failed, ret=%ld\n", addr, ret); @@ -354,15 +360,10 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, return err; } -/* - * We come here on a H_ENTER call from the guest when we are not - * using mmu notifiers and we don't have the requested page pinned - * already. 
- */ -long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, - long pte_index, unsigned long pteh, unsigned long ptel) +long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, + long pte_index, unsigned long pteh, + unsigned long ptel, unsigned long *pte_idx_ret) { - struct kvm *kvm = vcpu->kvm; unsigned long psize, gpa, gfn; struct kvm_memory_slot *memslot; long ret; @@ -390,8 +391,8 @@ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, do_insert: /* Protect linux PTE lookup from page table destruction */ rcu_read_lock_sched(); /* this disables preemption too */ - vcpu->arch.pgdir = current->mm->pgd; - ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel); + ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel, + current->mm->pgd, false, pte_idx_ret); rcu_read_unlock_sched(); if (ret == H_TOO_HARD) { /* this can't happen */ @@ -402,6 +403,19 @@ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, } +/* + * We come here on a H_ENTER call from the guest when we are not + * using mmu notifiers and we don't have the requested page pinned + * already. + */ +long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, + long pte_index, unsigned long pteh, + unsigned long ptel) +{ + return kvmppc_virtmode_do_h_enter(vcpu->kvm, flags, pte_index, + pteh, ptel, &vcpu->arch.gpr[4]); +} + static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu, gva_t eaddr) { diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 5e06e31..362dffe 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -103,14 +103,14 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, unlock_rmap(rmap); } -static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva, +static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva, int writing, unsigned long *pte_sizep) { pte_t *ptep; unsigned long ps = *pte_sizep; unsigned int shift; - ptep = find_linux_pte_or_hugepte(vcpu->arch.pgdir, hva, &shift); + ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift); if (!ptep) return __pte(0); if (shift) @@ -130,10 +130,10 @@ static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v) hpte[0] = hpte_v; } -long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, - long pte_index, unsigned long pteh, unsigned long ptel) +long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, + long pte_index, unsigned long pteh, unsigned long ptel, + pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret) { - struct kvm *kvm = vcpu->kvm; unsigned long i, pa, gpa, gfn, psize; unsigned long slot_fn, hva; unsigned long *hpte; @@ -147,7 +147,6 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, unsigned int writing; unsigned long mmu_seq; unsigned long rcbits; - bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING; psize = hpte_page_size(pteh, ptel); if (!psize) @@ -201,7 +200,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, /* Look up the Linux PTE for the backing page */ pte_size = psize; - pte = lookup_linux_pte(vcpu, hva, writing, &pte_size); + pte = lookup_linux_pte(pgdir, hva, writing, &pte_size); if (pte_present(pte)) { if (writing && !pte_write(pte)) /* make the actual HPTE be read-only */ @@ -210,6 +209,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, pa = pte_pfn(pte) << PAGE_SHIFT; } } + if (pte_size < psize) return H_PARAMETER; if (pa && pte_size > 
psize) @@ -297,7 +297,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, lock_rmap(rmap); /* Check for pending invalidations under the rmap chain lock */ if (kvm->arch.using_mmu_notifiers && - mmu_notifier_retry(vcpu->kvm, mmu_seq)) { + mmu_notifier_retry(kvm, mmu_seq)) { /* inval in progress, write a non-present HPTE */ pteh |= HPTE_V_ABSENT; pteh &= ~HPTE_V_VALID; @@ -318,10 +318,17 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, hpte[0] = pteh; asm volatile("ptesync" : : : "memory"); - vcpu->arch.gpr[4] = pte_index; + *pte_idx_ret = pte_index; return H_SUCCESS; } -EXPORT_SYMBOL_GPL(kvmppc_h_enter); +EXPORT_SYMBOL_GPL(kvmppc_do_h_enter); + +long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, + long pte_index, unsigned long pteh, unsigned long ptel) +{ + return kvmppc_do_h_enter(vcpu->kvm, flags, pte_index, pteh, ptel, + vcpu->arch.pgdir, true, &vcpu->arch.gpr[4]); +} #define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token)) -- cgit v0.10.2 From 4879f241720cda3e6c18a1713bf9b2ed2de14ee4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 19 Nov 2012 23:01:34 +0000 Subject: KVM: PPC: Book3S HV: Fix bug causing loss of page dirty state This fixes a bug where adding a new guest HPT entry via the H_ENTER hcall would lose the "changed" bit in the reverse map information for the guest physical page being mapped. The result was that the KVM_GET_DIRTY_LOG could return a zero bit for the page even though the page had been modified by the guest. This fixes it by only modifying the index and present bits in the reverse map entry, thus preserving the reference and change bits. We were also unnecessarily setting the reference bit, and this fixes that too. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 362dffe..ff2da5c 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -59,10 +59,10 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, head->back = pte_index; } else { rev->forw = rev->back = pte_index; - i = pte_index; + *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | + pte_index | KVMPPC_RMAP_PRESENT; } - smp_wmb(); - *rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */ + unlock_rmap(rmap); } EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); -- cgit v0.10.2 From 44e5f6be62741bd44968f40f3afa1cff1df983f2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 19 Nov 2012 22:52:49 +0000 Subject: KVM: PPC: Book3S HV: Add a mechanism for recording modified HPTEs This uses a bit in our record of the guest view of the HPTE to record when the HPTE gets modified. We use a reserved bit for this, and ensure that this bit is always cleared in HPTE values returned to the guest. The recording of modified HPTEs is only done if other code indicates its interest by setting kvm->arch.hpte_mod_interest to a non-zero value. The reason for this is that when later commits add facilities for userspace to read the HPT, the first pass of reading the HPT will be quicker if there are no (or very few) HPTEs marked as modified, rather than having most HPTEs marked as modified. 
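The tracking scheme can be illustrated with a small stand-alone C sketch (simplified; the real code operates on the revmap array and uses the kernel's atomic_read() rather than C11 atomics): a reserved software bit in the guest view of the HPTE records modification, but only while at least one reader has registered interest, and the bit is masked out of anything returned to the guest.

#include <stdatomic.h>
#include <stdio.h>

#define HPTE_GR_MODIFIED	(1ull << 62)
#define HPTE_GR_RESERVED	HPTE_GR_MODIFIED

struct revmap_entry { unsigned long long guest_rpte; };

static atomic_int hpte_mod_interest;	/* readers bump this */

static void note_hpte_modification(struct revmap_entry *rev)
{
	if (atomic_load(&hpte_mod_interest))
		rev->guest_rpte |= HPTE_GR_MODIFIED;
}

/* the guest must never see the software-reserved bit */
static unsigned long long guest_view(const struct revmap_entry *rev)
{
	return rev->guest_rpte & ~HPTE_GR_RESERVED;
}

int main(void)
{
	struct revmap_entry rev = { .guest_rpte = 0x1234 };

	note_hpte_modification(&rev);		/* no reader: stays clean */
	atomic_fetch_add(&hpte_mod_interest, 1);/* a reader shows interest */
	note_hpte_modification(&rev);		/* now the bit sticks */

	printf("internal 0x%llx, guest view 0x%llx\n",
	       rev.guest_rpte, guest_view(&rev));
	return 0;
}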
Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 1472a5b..b322e5b 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -50,6 +50,15 @@ extern int kvm_hpt_order; /* order of preallocated HPTs */ #define HPTE_V_HVLOCK 0x40UL #define HPTE_V_ABSENT 0x20UL +/* + * We use this bit in the guest_rpte field of the revmap entry + * to indicate a modified HPTE. + */ +#define HPTE_GR_MODIFIED (1ul << 62) + +/* These bits are reserved in the guest view of the HPTE */ +#define HPTE_GR_RESERVED HPTE_GR_MODIFIED + static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits) { unsigned long tmp, old; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 3093896..58c7264 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -248,6 +248,7 @@ struct kvm_arch { atomic_t vcpus_running; unsigned long hpt_npte; unsigned long hpt_mask; + atomic_t hpte_mod_interest; spinlock_t slot_phys_lock; unsigned short last_vcpu[NR_CPUS]; struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index ff2da5c..ed563a5 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -66,6 +66,17 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, } EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); +/* + * Note modification of an HPTE; set the HPTE modified bit + * if anyone is interested. + */ +static inline void note_hpte_modification(struct kvm *kvm, + struct revmap_entry *rev) +{ + if (atomic_read(&kvm->arch.hpte_mod_interest)) + rev->guest_rpte |= HPTE_GR_MODIFIED; +} + /* Remove this HPTE from the chain for a real page */ static void remove_revmap_chain(struct kvm *kvm, long pte_index, struct revmap_entry *rev, @@ -138,7 +149,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, unsigned long slot_fn, hva; unsigned long *hpte; struct revmap_entry *rev; - unsigned long g_ptel = ptel; + unsigned long g_ptel; struct kvm_memory_slot *memslot; unsigned long *physp, pte_size; unsigned long is_io; @@ -153,6 +164,8 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, return H_PARAMETER; writing = hpte_is_writable(ptel); pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); + ptel &= ~HPTE_GR_RESERVED; + g_ptel = ptel; /* used later to detect if we might have been invalidated */ mmu_seq = kvm->mmu_notifier_seq; @@ -287,8 +300,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, rev = &kvm->arch.revmap[pte_index]; if (realmode) rev = real_vmalloc_addr(rev); - if (rev) + if (rev) { rev->guest_rpte = g_ptel; + note_hpte_modification(kvm, rev); + } /* Link HPTE into reverse-map chain */ if (pteh & HPTE_V_VALID) { @@ -392,7 +407,8 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, /* Read PTE low word after tlbie to get final R/C values */ remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]); } - r = rev->guest_rpte; + r = rev->guest_rpte & ~HPTE_GR_RESERVED; + note_hpte_modification(kvm, rev); unlock_hpte(hpte, 0); vcpu->arch.gpr[4] = v; @@ -466,6 +482,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) args[j] = ((0x80 | flags) << 56) + pte_index; rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + note_hpte_modification(kvm, rev); if (!(hp[0] & HPTE_V_VALID)) { /* insert R and C bits from PTE */ @@ 
-555,6 +572,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, if (rev) { r = (rev->guest_rpte & ~mask) | bits; rev->guest_rpte = r; + note_hpte_modification(kvm, rev); } r = (hpte[1] & ~mask) | bits; @@ -606,8 +624,10 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, v &= ~HPTE_V_ABSENT; v |= HPTE_V_VALID; } - if (v & HPTE_V_VALID) + if (v & HPTE_V_VALID) { r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C)); + r &= ~HPTE_GR_RESERVED; + } vcpu->arch.gpr[4 + i * 2] = v; vcpu->arch.gpr[5 + i * 2] = r; } -- cgit v0.10.2 From 6b445ad4f839b06e68dd8e178e1168482ca20310 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 19 Nov 2012 22:55:44 +0000 Subject: KVM: PPC: Book3S HV: Make a HPTE removal function available This makes a HPTE removal function, kvmppc_do_h_remove(), available outside book3s_hv_rm_mmu.c. This will be used by the HPT writing code. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index fea768f..46763d10 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -160,6 +160,9 @@ extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel, pgd_t *pgdir, bool realmode, unsigned long *idx_ret); +extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, + unsigned long pte_index, unsigned long avpn, + unsigned long *hpret); extern long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map); diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index ed563a5..fc3da32 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -365,11 +365,10 @@ static inline int try_lock_tlbie(unsigned int *lock) return old == 0; } -long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, - unsigned long pte_index, unsigned long avpn, - unsigned long va) +long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, + unsigned long pte_index, unsigned long avpn, + unsigned long *hpret) { - struct kvm *kvm = vcpu->kvm; unsigned long *hpte; unsigned long v, r, rb; struct revmap_entry *rev; @@ -411,10 +410,18 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, note_hpte_modification(kvm, rev); unlock_hpte(hpte, 0); - vcpu->arch.gpr[4] = v; - vcpu->arch.gpr[5] = r; + hpret[0] = v; + hpret[1] = r; return H_SUCCESS; } +EXPORT_SYMBOL_GPL(kvmppc_do_h_remove); + +long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long pte_index, unsigned long avpn) +{ + return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn, + &vcpu->arch.gpr[4]); +} long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) { -- cgit v0.10.2 From a2932923ccf63c419c77aaa18ac09be98f2c94d8 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 19 Nov 2012 22:57:20 +0000 Subject: KVM: PPC: Book3S HV: Provide a method for userspace to read and write the HPT A new ioctl, KVM_PPC_GET_HTAB_FD, returns a file descriptor. Reads on this fd return the contents of the HPT (hashed page table), writes create and/or remove entries in the HPT. There is a new capability, KVM_CAP_PPC_HTAB_FD, to indicate the presence of the ioctl. The ioctl takes an argument structure with the index of the first HPT entry to read out and a set of flags. 
The flags indicate whether the user is intending to read or write the HPT, and whether to return all entries or only the "bolted" entries (those with the bolted bit, 0x10, set in the first doubleword). This is intended for use in implementing qemu's savevm/loadvm and for live migration. Therefore, on reads, the first pass returns information about all HPTEs (or all bolted HPTEs). When the first pass reaches the end of the HPT, it returns from the read. Subsequent reads only return information about HPTEs that have changed since they were last read. A read that finds no changed HPTEs in the HPT following where the last read finished will return 0 bytes. The format of the data provides a simple run-length compression of the invalid entries. Each block of data starts with a header that indicates the index (position in the HPT, which is just an array), the number of valid entries starting at that index (may be zero), and the number of invalid entries following those valid entries. The valid entries, 16 bytes each, follow the header. The invalid entries are not explicitly represented. Signed-off-by: Paul Mackerras [agraf: fix documentation] Signed-off-by: Alexander Graf diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 6671fdc..a5607c5 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2071,6 +2071,60 @@ KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm Note that the vcpu ioctl is asynchronous to vcpu execution. +4.78 KVM_PPC_GET_HTAB_FD + +Capability: KVM_CAP_PPC_HTAB_FD +Architectures: powerpc +Type: vm ioctl +Parameters: Pointer to struct kvm_get_htab_fd (in) +Returns: file descriptor number (>= 0) on success, -1 on error + +This returns a file descriptor that can be used either to read out the +entries in the guest's hashed page table (HPT), or to write entries to +initialize the HPT. The returned fd can only be written to if the +KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and +can only be read if that bit is clear. The argument struct looks like +this: + +/* For KVM_PPC_GET_HTAB_FD */ +struct kvm_get_htab_fd { + __u64 flags; + __u64 start_index; + __u64 reserved[2]; +}; + +/* Values for kvm_get_htab_fd.flags */ +#define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1) +#define KVM_GET_HTAB_WRITE ((__u64)0x2) + +The `start_index' field gives the index in the HPT of the entry at +which to start reading. It is ignored when writing. + +Reads on the fd will initially supply information about all +"interesting" HPT entries. Interesting entries are those with the +bolted bit set, if the KVM_GET_HTAB_BOLTED_ONLY bit is set, otherwise +all entries. When the end of the HPT is reached, the read() will +return. If read() is called again on the fd, it will start again from +the beginning of the HPT, but will only return HPT entries that have +changed since they were last read. + +Data read or written is structured as a header (8 bytes) followed by a +series of valid HPT entries (16 bytes) each. The header indicates how +many valid HPT entries there are and how many invalid entries follow +the valid entries. The invalid entries are not represented explicitly +in the stream. 
The header format is: + +struct kvm_get_htab_header { + __u32 index; + __u16 n_valid; + __u16 n_invalid; +}; + +Writes to the fd create HPT entries starting at the index given in the +header; first `n_valid' valid entries with contents from the data +written, then `n_invalid' invalid entries, invalidating any previously +valid entries found. + 5. The kvm_run structure ------------------------ diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index b322e5b..38bec1d 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -246,4 +246,26 @@ static inline bool slot_is_aligned(struct kvm_memory_slot *memslot, return !(memslot->base_gfn & mask) && !(memslot->npages & mask); } +/* + * This works for 4k, 64k and 16M pages on POWER7, + * and 4k and 16M pages on PPC970. + */ +static inline unsigned long slb_pgsize_encoding(unsigned long psize) +{ + unsigned long senc = 0; + + if (psize > 0x1000) { + senc = SLB_VSID_L; + if (psize == 0x10000) + senc |= SLB_VSID_LP_01; + } + return senc; +} + +static inline int is_vrma_hpte(unsigned long hpte_v) +{ + return (hpte_v & ~0xffffffUL) == + (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16))); +} + #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 609cca3..1ca31e9 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -164,6 +164,8 @@ extern void kvmppc_bookehv_exit(void); extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu); +extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *); + /* * Cuts out inst bits with ordering according to spec. * That means the leftmost bit is zero. All given bits are included. diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index b89ae4d..514883d 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -331,6 +331,31 @@ struct kvm_book3e_206_tlb_params { __u32 reserved[8]; }; +/* For KVM_PPC_GET_HTAB_FD */ +struct kvm_get_htab_fd { + __u64 flags; + __u64 start_index; + __u64 reserved[2]; +}; + +/* Values for kvm_get_htab_fd.flags */ +#define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1) +#define KVM_GET_HTAB_WRITE ((__u64)0x2) + +/* + * Data read on the file descriptor is formatted as a series of + * records, each consisting of a header followed by a series of + * `n_valid' HPTEs (16 bytes each), which are all valid. Following + * those valid HPTEs there are `n_invalid' invalid HPTEs, which + * are not represented explicitly in the stream. The same format + * is used for writing. + */ +struct kvm_get_htab_header { + __u32 index; + __u16 n_valid; + __u16 n_invalid; +}; + #define KVM_REG_PPC_HIOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1) #define KVM_REG_PPC_IAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x2) #define KVM_REG_PPC_IAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 6ee6516..0aa4073 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include @@ -1145,6 +1147,348 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va) put_page(page); } +/* + * Functions for reading and writing the hash table via reads and + * writes on a file descriptor. 
+ * + * Reads return the guest view of the hash table, which has to be + * pieced together from the real hash table and the guest_rpte + * values in the revmap array. + * + * On writes, each HPTE written is considered in turn, and if it + * is valid, it is written to the HPT as if an H_ENTER with the + * exact flag set was done. When the invalid count is non-zero + * in the header written to the stream, the kernel will make + * sure that that many HPTEs are invalid, and invalidate them + * if not. + */ + +struct kvm_htab_ctx { + unsigned long index; + unsigned long flags; + struct kvm *kvm; + int first_pass; +}; + +#define HPTE_SIZE (2 * sizeof(unsigned long)) + +static long record_hpte(unsigned long flags, unsigned long *hptp, + unsigned long *hpte, struct revmap_entry *revp, + int want_valid, int first_pass) +{ + unsigned long v, r; + int ok = 1; + int valid, dirty; + + /* Unmodified entries are uninteresting except on the first pass */ + dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); + if (!first_pass && !dirty) + return 0; + + valid = 0; + if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) { + valid = 1; + if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && + !(hptp[0] & HPTE_V_BOLTED)) + valid = 0; + } + if (valid != want_valid) + return 0; + + v = r = 0; + if (valid || dirty) { + /* lock the HPTE so it's stable and read it */ + preempt_disable(); + while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) + cpu_relax(); + v = hptp[0]; + if (v & HPTE_V_ABSENT) { + v &= ~HPTE_V_ABSENT; + v |= HPTE_V_VALID; + } + /* re-evaluate valid and dirty from synchronized HPTE value */ + valid = !!(v & HPTE_V_VALID); + if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED)) + valid = 0; + r = revp->guest_rpte | (hptp[1] & (HPTE_R_R | HPTE_R_C)); + dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); + /* only clear modified if this is the right sort of entry */ + if (valid == want_valid && dirty) { + r &= ~HPTE_GR_MODIFIED; + revp->guest_rpte = r; + } + asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); + hptp[0] &= ~HPTE_V_HVLOCK; + preempt_enable(); + if (!(valid == want_valid && (first_pass || dirty))) + ok = 0; + } + hpte[0] = v; + hpte[1] = r; + return ok; +} + +static ssize_t kvm_htab_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct kvm_htab_ctx *ctx = file->private_data; + struct kvm *kvm = ctx->kvm; + struct kvm_get_htab_header hdr; + unsigned long *hptp; + struct revmap_entry *revp; + unsigned long i, nb, nw; + unsigned long __user *lbuf; + struct kvm_get_htab_header __user *hptr; + unsigned long flags; + int first_pass; + unsigned long hpte[2]; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + first_pass = ctx->first_pass; + flags = ctx->flags; + + i = ctx->index; + hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + revp = kvm->arch.revmap + i; + lbuf = (unsigned long __user *)buf; + + nb = 0; + while (nb + sizeof(hdr) + HPTE_SIZE < count) { + /* Initialize header */ + hptr = (struct kvm_get_htab_header __user *)buf; + hdr.index = i; + hdr.n_valid = 0; + hdr.n_invalid = 0; + nw = nb; + nb += sizeof(hdr); + lbuf = (unsigned long __user *)(buf + sizeof(hdr)); + + /* Skip uninteresting entries, i.e. 
clean on not-first pass */ + if (!first_pass) { + while (i < kvm->arch.hpt_npte && + !(revp->guest_rpte & HPTE_GR_MODIFIED)) { + ++i; + hptp += 2; + ++revp; + } + } + + /* Grab a series of valid entries */ + while (i < kvm->arch.hpt_npte && + hdr.n_valid < 0xffff && + nb + HPTE_SIZE < count && + record_hpte(flags, hptp, hpte, revp, 1, first_pass)) { + /* valid entry, write it out */ + ++hdr.n_valid; + if (__put_user(hpte[0], lbuf) || + __put_user(hpte[1], lbuf + 1)) + return -EFAULT; + nb += HPTE_SIZE; + lbuf += 2; + ++i; + hptp += 2; + ++revp; + } + /* Now skip invalid entries while we can */ + while (i < kvm->arch.hpt_npte && + hdr.n_invalid < 0xffff && + record_hpte(flags, hptp, hpte, revp, 0, first_pass)) { + /* found an invalid entry */ + ++hdr.n_invalid; + ++i; + hptp += 2; + ++revp; + } + + if (hdr.n_valid || hdr.n_invalid) { + /* write back the header */ + if (__copy_to_user(hptr, &hdr, sizeof(hdr))) + return -EFAULT; + nw = nb; + buf = (char __user *)lbuf; + } else { + nb = nw; + } + + /* Check if we've wrapped around the hash table */ + if (i >= kvm->arch.hpt_npte) { + i = 0; + ctx->first_pass = 0; + break; + } + } + + ctx->index = i; + + return nb; +} + +static ssize_t kvm_htab_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct kvm_htab_ctx *ctx = file->private_data; + struct kvm *kvm = ctx->kvm; + struct kvm_get_htab_header hdr; + unsigned long i, j; + unsigned long v, r; + unsigned long __user *lbuf; + unsigned long *hptp; + unsigned long tmp[2]; + ssize_t nb; + long int err, ret; + int rma_setup; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + /* lock out vcpus from running while we're doing this */ + mutex_lock(&kvm->lock); + rma_setup = kvm->arch.rma_setup_done; + if (rma_setup) { + kvm->arch.rma_setup_done = 0; /* temporarily */ + /* order rma_setup_done vs. 
vcpus_running */ + smp_mb(); + if (atomic_read(&kvm->arch.vcpus_running)) { + kvm->arch.rma_setup_done = 1; + mutex_unlock(&kvm->lock); + return -EBUSY; + } + } + + err = 0; + for (nb = 0; nb + sizeof(hdr) <= count; ) { + err = -EFAULT; + if (__copy_from_user(&hdr, buf, sizeof(hdr))) + break; + + err = 0; + if (nb + hdr.n_valid * HPTE_SIZE > count) + break; + + nb += sizeof(hdr); + buf += sizeof(hdr); + + err = -EINVAL; + i = hdr.index; + if (i >= kvm->arch.hpt_npte || + i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte) + break; + + hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + lbuf = (unsigned long __user *)buf; + for (j = 0; j < hdr.n_valid; ++j) { + err = -EFAULT; + if (__get_user(v, lbuf) || __get_user(r, lbuf + 1)) + goto out; + err = -EINVAL; + if (!(v & HPTE_V_VALID)) + goto out; + lbuf += 2; + nb += HPTE_SIZE; + + if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) + kvmppc_do_h_remove(kvm, 0, i, 0, tmp); + err = -EIO; + ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r, + tmp); + if (ret != H_SUCCESS) { + pr_err("kvm_htab_write ret %ld i=%ld v=%lx " + "r=%lx\n", ret, i, v, r); + goto out; + } + if (!rma_setup && is_vrma_hpte(v)) { + unsigned long psize = hpte_page_size(v, r); + unsigned long senc = slb_pgsize_encoding(psize); + unsigned long lpcr; + + kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | + (VRMA_VSID << SLB_VSID_SHIFT_1T); + lpcr = kvm->arch.lpcr & ~LPCR_VRMASD; + lpcr |= senc << (LPCR_VRMASD_SH - 4); + kvm->arch.lpcr = lpcr; + rma_setup = 1; + } + ++i; + hptp += 2; + } + + for (j = 0; j < hdr.n_invalid; ++j) { + if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) + kvmppc_do_h_remove(kvm, 0, i, 0, tmp); + ++i; + hptp += 2; + } + err = 0; + } + + out: + /* Order HPTE updates vs. rma_setup_done */ + smp_wmb(); + kvm->arch.rma_setup_done = rma_setup; + mutex_unlock(&kvm->lock); + + if (err) + return err; + return nb; +} + +static int kvm_htab_release(struct inode *inode, struct file *filp) +{ + struct kvm_htab_ctx *ctx = filp->private_data; + + filp->private_data = NULL; + if (!(ctx->flags & KVM_GET_HTAB_WRITE)) + atomic_dec(&ctx->kvm->arch.hpte_mod_interest); + kvm_put_kvm(ctx->kvm); + kfree(ctx); + return 0; +} + +static struct file_operations kvm_htab_fops = { + .read = kvm_htab_read, + .write = kvm_htab_write, + .llseek = default_llseek, + .release = kvm_htab_release, +}; + +int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf) +{ + int ret; + struct kvm_htab_ctx *ctx; + int rwflag; + + /* reject flags we don't recognize */ + if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE)) + return -EINVAL; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + kvm_get_kvm(kvm); + ctx->kvm = kvm; + ctx->index = ghf->start_index; + ctx->flags = ghf->flags; + ctx->first_pass = 1; + + rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY; + ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag); + if (ret < 0) { + kvm_put_kvm(kvm); + return ret; + } + + if (rwflag == O_RDONLY) { + mutex_lock(&kvm->slots_lock); + atomic_inc(&kvm->arch.hpte_mod_interest); + /* make sure kvmppc_do_h_enter etc. 
see the increment */ + synchronize_srcu_expedited(&kvm->srcu); + mutex_unlock(&kvm->slots_lock); + } + + return ret; +} + void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) { struct kvmppc_mmu *mmu = &vcpu->arch.mmu; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 843eb75..a4f59db 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1563,18 +1563,6 @@ out: return r; } -static unsigned long slb_pgsize_encoding(unsigned long psize) -{ - unsigned long senc = 0; - - if (psize > 0x1000) { - senc = SLB_VSID_L; - if (psize == 0x10000) - senc |= SLB_VSID_LP_01; - } - return senc; -} - static void unpin_slot(struct kvm_memory_slot *memslot) { unsigned long *physp; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index d583ea1..70739a0 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -354,6 +354,12 @@ int kvm_dev_ioctl_check_extension(long ext) r = 1; #else r = 0; + break; +#endif +#ifdef CONFIG_KVM_BOOK3S_64_HV + case KVM_CAP_PPC_HTAB_FD: + r = 1; + break; #endif break; case KVM_CAP_NR_VCPUS: @@ -954,6 +960,17 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } + + case KVM_PPC_GET_HTAB_FD: { + struct kvm *kvm = filp->private_data; + struct kvm_get_htab_fd ghf; + + r = -EFAULT; + if (copy_from_user(&ghf, argp, sizeof(ghf))) + break; + r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf); + break; + } #endif /* CONFIG_KVM_BOOK3S_64_HV */ #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 494a84c..e6e5d4b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -634,6 +634,7 @@ struct kvm_ppc_smmu_info { #endif #define KVM_CAP_IRQFD_RESAMPLE 82 #define KVM_CAP_PPC_BOOKE_WATCHDOG 83 +#define KVM_CAP_PPC_HTAB_FD 84 #ifdef KVM_CAP_IRQ_ROUTING @@ -859,6 +860,8 @@ struct kvm_s390_ucas_mapping { #define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce) /* Available with KVM_CAP_RMA */ #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma) +/* Available with KVM_CAP_PPC_HTAB_FD */ +#define KVM_PPC_GET_HTAB_FD _IOW(KVMIO, 0xaa, struct kvm_get_htab_fd) /* * ioctls for vcpu fds -- cgit v0.10.2 From a64fd707481631b9682f9baeefac489bc55bbf73 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 21 Nov 2012 23:27:19 +0000 Subject: KVM: PPC: Book3S HV: Reset reverse-map chains when resetting the HPT With HV-style KVM, we maintain reverse-mapping lists that enable us to find all the HPT (hashed page table) entries that reference each guest physical page, with the heads of the lists in the memslot->arch.rmap arrays. When we reset the HPT (i.e. when we reboot the VM), we clear out all the HPT entries but we were not clearing out the reverse mapping lists. The result is that as we create new HPT entries, the lists get corrupted, which can easily lead to loops, resulting in the host kernel hanging when it tries to traverse those lists. This fixes the problem by zeroing out all the reverse mapping lists when we zero out the HPT. This incidentally means that we are also zeroing our record of the referenced and changed bits (not the bits in the Linux PTEs, used by the Linux MM subsystem, but the bits used by the KVM_GET_DIRTY_LOG ioctl, and those used by kvm_age_hva() and kvm_test_age_hva()). 
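To make the corruption scenario concrete, here is a toy, self-contained C model (an illustration only; the names and layout are simplified stand-ins for the kernel's revmap and rmap structures): reverse-map chains are circular lists threaded through an array by index, with a per-page head word carrying a "present" bit, so resetting the HPT has to wipe the head words as well as the entries or a walker can chase stale links.

#include <stdio.h>
#include <string.h>

#define RMAP_PRESENT	(1ull << 63)
#define RMAP_INDEX	0xffffull

struct rev { unsigned short forw, back; };	/* chain links by HPTE index */

static struct rev revmap[16];
static unsigned long long rmap[4];		/* one head word per guest page */

/* walk a chain, but give up if it never closes (corruption guard) */
static int chain_length(unsigned long long head)
{
	unsigned short start, i;
	int n = 0, limit = 16;

	if (!(head & RMAP_PRESENT))
		return 0;
	start = i = (unsigned short)(head & RMAP_INDEX);
	do {
		++n;
		i = revmap[i].forw;
	} while (i != start && --limit);
	return limit ? n : -1;			/* -1: the list never closed */
}

int main(void)
{
	/* guest page 0 mapped by HPTEs 3 and 7, linked circularly */
	revmap[3] = (struct rev){ .forw = 7, .back = 7 };
	revmap[7] = (struct rev){ .forw = 3, .back = 3 };
	rmap[0] = RMAP_PRESENT | 3;
	printf("before reset: %d entries\n", chain_length(rmap[0]));

	/* HPT reset: wipe the entries *and* the list heads, as the patch does */
	memset(revmap, 0, sizeof(revmap));
	memset(rmap, 0, sizeof(rmap));
	printf("after reset:  %d entries\n", chain_length(rmap[0]));
	return 0;
}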
Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 0aa4073..1029e22 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -46,6 +46,7 @@ static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel, unsigned long *pte_idx_ret); +static void kvmppc_rmap_reset(struct kvm *kvm); long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) { @@ -144,6 +145,10 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) /* Set the entire HPT to 0, i.e. invalid HPTEs */ memset((void *)kvm->arch.hpt_virt, 0, 1ul << order); /* + * Reset all the reverse-mapping chains for all memslots + */ + kvmppc_rmap_reset(kvm); + /* * Set the whole last_vcpu array to an invalid vcpu number. * This ensures that each vcpu will flush its TLB on next entry. */ @@ -772,6 +777,25 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, goto out_put; } +static void kvmppc_rmap_reset(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int srcu_idx; + + srcu_idx = srcu_read_lock(&kvm->srcu); + slots = kvm->memslots; + kvm_for_each_memslot(memslot, slots) { + /* + * This assumes it is acceptable to lose reference and + * change bits across a reset. + */ + memset(memslot->arch.rmap, 0, + memslot->npages * sizeof(*memslot->arch.rmap)); + } + srcu_read_unlock(&kvm->srcu, srcu_idx); +} + static int kvm_handle_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, -- cgit v0.10.2 From 05dd85f7933ffbe6d71415e631c95ca615ae1e81 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 21 Nov 2012 23:29:12 +0000 Subject: KVM: PPC: Book3S HV: Report correct HPT entry index when reading HPT This fixes a bug in the code which allows userspace to read out the contents of the guest's hashed page table (HPT). On the second and subsequent passes through the HPT, when we are reporting only those entries that have changed, we were incorrectly initializing the index field of the header with the index of the first entry we skipped rather than the first changed entry. This fixes it. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 1029e22..ac6b5ac 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -1282,7 +1282,6 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, while (nb + sizeof(hdr) + HPTE_SIZE < count) { /* Initialize header */ hptr = (struct kvm_get_htab_header __user *)buf; - hdr.index = i; hdr.n_valid = 0; hdr.n_invalid = 0; nw = nb; @@ -1298,6 +1297,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, ++revp; } } + hdr.index = i; /* Grab a series of valid entries */ while (i < kvm->arch.hpt_npte && -- cgit v0.10.2 From 1cc8ed0b13ae6e076a1dd1f18da508b48c7aa05a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 21 Nov 2012 23:28:41 +0000 Subject: KVM: PPC: Book3S HV: Don't give the guest RW access to RO pages Currently, if the guest does an H_PROTECT hcall requesting that the permissions on a HPT entry be changed to allow writing, we make the requested change even if the page is marked read-only in the host Linux page tables. This is a problem since it would for instance allow a guest to modify a page that KSM has decided can be shared between multiple guests. 
To fix this, if the new permissions for the page allow writing, we need to look up the memslot for the page, work out the host virtual address, and look up the Linux page tables to get the PTE for the page. If that PTE is read-only, we reduce the HPTE permissions to read-only. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index fc3da32..7a57ea4 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -600,6 +600,28 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, asm volatile("tlbiel %0" : : "r" (rb)); asm volatile("ptesync" : : : "memory"); } + /* + * If the host has this page as readonly but the guest + * wants to make it read/write, reduce the permissions. + * Checking the host permissions involves finding the + * memslot and then the Linux PTE for the page. + */ + if (hpte_is_writable(r) && kvm->arch.using_mmu_notifiers) { + unsigned long psize, gfn, hva; + struct kvm_memory_slot *memslot; + pgd_t *pgdir = vcpu->arch.pgdir; + pte_t pte; + + psize = hpte_page_size(v, r); + gfn = ((r & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT; + memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); + if (memslot) { + hva = __gfn_to_hva_memslot(memslot, gfn); + pte = lookup_linux_pte(pgdir, hva, 1, &psize); + if (pte_present(pte) && !pte_write(pte)) + r = hpte_make_readonly(r); + } + } } hpte[1] = r; eieio(); -- cgit v0.10.2 From b0a94d4e23201c7559bb8f8657cfb629561288f2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sun, 4 Nov 2012 18:15:43 +0000 Subject: KVM: PPC: Book3S PR: Emulate PURR, SPURR and DSCR registers This adds basic emulation of the PURR and SPURR registers. We assume we are emulating a single-threaded core, so these advance at the same rate as the timebase. A Linux kernel running on a POWER7 expects to be able to access these registers and is not prepared to handle a program interrupt on accessing them. This also adds a very minimal emulation of the DSCR (data stream control register). Writes are ignored and reads return zero. 
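The offset technique used for PURR and SPURR can be shown with a minimal user-space C sketch (an illustration only; clock_gettime() stands in for get_tb() and the names are not the kernel's): a write records the difference between the requested value and a free-running counter, and a read adds the current counter back, so the emulated register keeps advancing at the counter's rate between accesses.

#define _POSIX_C_SOURCE 199309L
#include <stdio.h>
#include <time.h>

static long long purr_offset;

/* stand-in for the timebase register */
static unsigned long long get_tb(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* guest mtspr: remember the requested value as an offset from "now" */
static void emulate_mtspr_purr(unsigned long long val)
{
	purr_offset = (long long)(val - get_tb());
}

/* guest mfspr: the register keeps ticking at the counter's rate */
static unsigned long long emulate_mfspr_purr(void)
{
	return get_tb() + purr_offset;
}

int main(void)
{
	emulate_mtspr_purr(1000);
	printf("PURR just after write: %llu\n", emulate_mfspr_purr());
	printf("PURR a little later:   %llu\n", emulate_mfspr_purr());
	return 0;
}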
Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 46763d10..5a56e1c 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -81,6 +81,8 @@ struct kvmppc_vcpu_book3s { u64 sdr1; u64 hior; u64 msr_mask; + u64 purr_offset; + u64 spurr_offset; #ifdef CONFIG_PPC_BOOK3S_32 u32 vsid_pool[VSID_POOL_SIZE]; u32 vsid_next; diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index b9a989d..d31a716 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -22,6 +22,7 @@ #include #include #include +#include #define OP_19_XOP_RFID 18 #define OP_19_XOP_RFI 50 @@ -395,6 +396,12 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) (mfmsr() & MSR_HV)) vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; break; + case SPRN_PURR: + to_book3s(vcpu)->purr_offset = spr_val - get_tb(); + break; + case SPRN_SPURR: + to_book3s(vcpu)->spurr_offset = spr_val - get_tb(); + break; case SPRN_GQR0: case SPRN_GQR1: case SPRN_GQR2: @@ -412,6 +419,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) case SPRN_CTRLF: case SPRN_CTRLT: case SPRN_L2CR: + case SPRN_DSCR: case SPRN_MMCR0_GEKKO: case SPRN_MMCR1_GEKKO: case SPRN_PMC1_GEKKO: @@ -483,9 +491,15 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) *spr_val = to_book3s(vcpu)->hid[5]; break; case SPRN_CFAR: - case SPRN_PURR: + case SPRN_DSCR: *spr_val = 0; break; + case SPRN_PURR: + *spr_val = get_tb() + to_book3s(vcpu)->purr_offset; + break; + case SPRN_SPURR: + *spr_val = get_tb() + to_book3s(vcpu)->purr_offset; + break; case SPRN_GQR0: case SPRN_GQR1: case SPRN_GQR2: -- cgit v0.10.2 From 28c483b62fcd2589dadfc1250970f85aa0ab3df6 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sun, 4 Nov 2012 18:16:46 +0000 Subject: KVM: PPC: Book3S PR: Fix VSX handling This fixes various issues in how we were handling the VSX registers that exist on POWER7 machines. First, we were running off the end of the current->thread.fpr[] array. Ultimately this was because the vcpu->arch.vsr[] array is sized to be able to store both the FP registers and the extra VSX registers (i.e. 64 entries), but PR KVM only uses it for the extra VSX registers (i.e. 32 entries). Secondly, calling load_up_vsx() from C code is a really bad idea, because it jumps to fast_exception_return at the end, rather than returning with a blr instruction. This was causing it to jump off to a random location with random register contents, since it was using the largely uninitialized stack frame created by kvmppc_load_up_vsx. In fact, it isn't necessary to call either __giveup_vsx or load_up_vsx, since giveup_fpu and load_up_fpu handle the extra VSX registers as well as the standard FP registers on machines with VSX. Also, since VSX instructions can access the VMX registers and the FP registers as well as the extra VSX registers, we have to load up the FP and VMX registers before we can turn on the MSR_VSX bit for the guest. Conversely, if we save away any of the VSX or FP registers, we have to turn off MSR_VSX for the guest. To handle all this, it is more convenient for a single call to kvmppc_giveup_ext() to handle all the state saving that needs to be done, so we make it take a set of MSR bits rather than just one, and the switch statement becomes a series of if statements. 
Similarly kvmppc_handle_ext needs to be able to load up more than one set of registers. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index d24c141..97d3727 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -518,6 +518,7 @@ #define SRR1_WS_DEEPER 0x00020000 /* Some resources not maintained */ #define SRR1_WS_DEEP 0x00010000 /* All resources maintained */ #define SRR1_PROGFPE 0x00100000 /* Floating Point Enabled */ +#define SRR1_PROGILL 0x00080000 /* Illegal instruction */ #define SRR1_PROGPRIV 0x00040000 /* Privileged instruction */ #define SRR1_PROGTRAP 0x00020000 /* Trap */ #define SRR1_PROGADDR 0x00010000 /* SRR0 contains subsequent addr */ diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c index a150817..7057a02 100644 --- a/arch/powerpc/kvm/book3s_exports.c +++ b/arch/powerpc/kvm/book3s_exports.c @@ -28,8 +28,5 @@ EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu); #ifdef CONFIG_ALTIVEC EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec); #endif -#ifdef CONFIG_VSX -EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx); -#endif #endif diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index b853696..5c496ec 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -81,9 +81,7 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) svcpu_put(svcpu); #endif - kvmppc_giveup_ext(vcpu, MSR_FP); - kvmppc_giveup_ext(vcpu, MSR_VEC); - kvmppc_giveup_ext(vcpu, MSR_VSX); + kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX); vcpu->cpu = -1; } @@ -433,10 +431,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, static inline int get_fpr_index(int i) { -#ifdef CONFIG_VSX - i *= 2; -#endif - return i; + return i * TS_FPRWIDTH; } /* Give up external provider (FPU, Altivec, VSX) */ @@ -450,41 +445,49 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr) u64 *thread_fpr = (u64*)t->fpr; int i; - if (!(vcpu->arch.guest_owned_ext & msr)) + /* + * VSX instructions can access FP and vector registers, so if + * we are giving up VSX, make sure we give up FP and VMX as well. + */ + if (msr & MSR_VSX) + msr |= MSR_FP | MSR_VEC; + + msr &= vcpu->arch.guest_owned_ext; + if (!msr) return; #ifdef DEBUG_EXT printk(KERN_INFO "Giving up ext 0x%lx\n", msr); #endif - switch (msr) { - case MSR_FP: + if (msr & MSR_FP) { + /* + * Note that on CPUs with VSX, giveup_fpu stores + * both the traditional FP registers and the added VSX + * registers into thread.fpr[]. 
+ */ giveup_fpu(current); for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) vcpu_fpr[i] = thread_fpr[get_fpr_index(i)]; vcpu->arch.fpscr = t->fpscr.val; - break; - case MSR_VEC: + +#ifdef CONFIG_VSX + if (cpu_has_feature(CPU_FTR_VSX)) + for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr) / 2; i++) + vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1]; +#endif + } + #ifdef CONFIG_ALTIVEC + if (msr & MSR_VEC) { giveup_altivec(current); memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr)); vcpu->arch.vscr = t->vscr; -#endif - break; - case MSR_VSX: -#ifdef CONFIG_VSX - __giveup_vsx(current); - for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) - vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1]; -#endif - break; - default: - BUG(); } +#endif - vcpu->arch.guest_owned_ext &= ~msr; - current->thread.regs->msr &= ~msr; + vcpu->arch.guest_owned_ext &= ~(msr | MSR_VSX); kvmppc_recalc_shadow_msr(vcpu); } @@ -544,47 +547,56 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, return RESUME_GUEST; } - /* We already own the ext */ - if (vcpu->arch.guest_owned_ext & msr) { - return RESUME_GUEST; + if (msr == MSR_VSX) { + /* No VSX? Give an illegal instruction interrupt */ +#ifdef CONFIG_VSX + if (!cpu_has_feature(CPU_FTR_VSX)) +#endif + { + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); + return RESUME_GUEST; + } + + /* + * We have to load up all the FP and VMX registers before + * we can let the guest use VSX instructions. + */ + msr = MSR_FP | MSR_VEC | MSR_VSX; } + /* See if we already own all the ext(s) needed */ + msr &= ~vcpu->arch.guest_owned_ext; + if (!msr) + return RESUME_GUEST; + #ifdef DEBUG_EXT printk(KERN_INFO "Loading up ext 0x%lx\n", msr); #endif current->thread.regs->msr |= msr; - switch (msr) { - case MSR_FP: + if (msr & MSR_FP) { for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) thread_fpr[get_fpr_index(i)] = vcpu_fpr[i]; - +#ifdef CONFIG_VSX + for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr) / 2; i++) + thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i]; +#endif t->fpscr.val = vcpu->arch.fpscr; t->fpexc_mode = 0; kvmppc_load_up_fpu(); - break; - case MSR_VEC: + } + + if (msr & MSR_VEC) { #ifdef CONFIG_ALTIVEC memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr)); t->vscr = vcpu->arch.vscr; t->vrsave = -1; kvmppc_load_up_altivec(); #endif - break; - case MSR_VSX: -#ifdef CONFIG_VSX - for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) - thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i]; - kvmppc_load_up_vsx(); -#endif - break; - default: - BUG(); } vcpu->arch.guest_owned_ext |= msr; - kvmppc_recalc_shadow_msr(vcpu); return RESUME_GUEST; @@ -1134,7 +1146,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) /* Save VSX state in stack */ used_vsr = current->thread.used_vsr; if (used_vsr && (current->thread.regs->msr & MSR_VSX)) - __giveup_vsx(current); + __giveup_vsx(current); #endif /* Remember the MSR with disabled extensions */ @@ -1151,14 +1163,12 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) /* No need for kvm_guest_exit. It's done in handle_exit. We also get here with interrupts enabled. 
*/ - current->thread.regs->msr = ext_msr; - /* Make sure we save the guest FPU/Altivec/VSX state */ - kvmppc_giveup_ext(vcpu, MSR_FP); - kvmppc_giveup_ext(vcpu, MSR_VEC); - kvmppc_giveup_ext(vcpu, MSR_VSX); + kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX); + + current->thread.regs->msr = ext_msr; - /* Restore FPU state from stack */ + /* Restore FPU/VSX state from stack */ memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr)); current->thread.fpscr.val = fpscr; current->thread.fpexc_mode = fpexc_mode; diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index b2f8258..8f7633e 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -234,8 +234,5 @@ define_load_up(fpu) #ifdef CONFIG_ALTIVEC define_load_up(altivec) #endif -#ifdef CONFIG_VSX -define_load_up(vsx) -#endif #include "book3s_segment.S" -- cgit v0.10.2 From 3a2e7b0d761ae3faecdb43482d178b5fe2e3b8a5 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sun, 4 Nov 2012 18:17:28 +0000 Subject: KVM: PPC: Book3S PR: MSR_DE doesn't exist on Book 3S The mask of MSR bits that get transferred from the guest MSR to the shadow MSR included MSR_DE. In fact that bit only exists on Book 3E processors, and it is assigned the same bit used for MSR_BE on Book 3S processors. Since we already had MSR_BE in the mask, this just removes MSR_DE. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 5c496ec..28d38ad 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -145,7 +145,7 @@ static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) ulong smsr = vcpu->arch.shared->msr; /* Guest MSR values */ - smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE; + smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE; /* Process MSR values */ smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE; /* External providers the guest reserved */ -- cgit v0.10.2 From 6a7f972dfe8de97c00f3196d0b87fb68bd8cf35e Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 15 Oct 2012 19:01:05 +0000 Subject: MAINTAINERS: Add git tree link for PPC KVM Signed-off-by: Michael Ellerman Signed-off-by: Alexander Graf diff --git a/MAINTAINERS b/MAINTAINERS index 4376c52..7034467 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4253,6 +4253,7 @@ KERNEL VIRTUAL MACHINE (KVM) FOR POWERPC M: Alexander Graf L: kvm-ppc@vger.kernel.org W: http://kvm.qumranet.com +T: git git://github.com/agraf/linux-2.6.git S: Supported F: arch/powerpc/include/asm/kvm* F: arch/powerpc/kvm/ -- cgit v0.10.2 From 1b400ba0cd24a5994d792c7cfa0ee24cac266d3c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 21 Nov 2012 23:28:08 +0000 Subject: KVM: PPC: Book3S HV: Improve handling of local vs. global TLB invalidations When we change or remove a HPT (hashed page table) entry, we can do either a global TLB invalidation (tlbie) that works across the whole machine, or a local invalidation (tlbiel) that only affects this core. Currently we do local invalidations if the VM has only one vcpu or if the guest requests it with the H_LOCAL flag, though the guest Linux kernel currently doesn't ever use H_LOCAL. 
Then, to cope with the possibility that vcpus moving around to different physical cores might expose stale TLB entries, there is some code in kvmppc_hv_entry to flush the whole TLB of entries for this VM if either this vcpu is now running on a different physical core from where it last ran, or if this physical core last ran a different vcpu. There are a number of problems on POWER7 with this as it stands: - The TLB invalidation is done per thread, whereas it only needs to be done per core, since the TLB is shared between the threads. - With the possibility of the host paging out guest pages, the use of H_LOCAL by an SMP guest is dangerous since the guest could possibly retain and use a stale TLB entry pointing to a page that had been removed from the guest. - The TLB invalidations that we do when a vcpu moves from one physical core to another are unnecessary in the case of an SMP guest that isn't using H_LOCAL. - The optimization of using local invalidations rather than global should apply to guests with one virtual core, not just one vcpu. (None of this applies on PPC970, since there we always have to invalidate the whole TLB when entering and leaving the guest, and we can't support paging out guest memory.) To fix these problems and simplify the code, we now maintain a simple cpumask of which cpus need to flush the TLB on entry to the guest. (This is indexed by cpu, though we only ever use the bits for thread 0 of each core.) Whenever we do a local TLB invalidation, we set the bits for every cpu except the bit for thread 0 of the core that we're currently running on. Whenever we enter a guest, we test and clear the bit for our core, and flush the TLB if it was set. On initial startup of the VM, and when resetting the HPT, we set all the bits in the need_tlb_flush cpumask, since any core could potentially have stale TLB entries from the previous VM to use the same LPID, or the previous contents of the HPT. Then, we maintain a count of the number of online virtual cores, and use that when deciding whether to use a local invalidation rather than the number of online vcpus. The code to make that decision is extracted out into a new function, global_invalidates(). For multi-core guests on POWER7 (i.e. when we are using mmu notifiers), we now never do local invalidations regardless of the H_LOCAL flag. 
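
For readers following the real-mode assembly added to book3s_hv_rmhandlers.S further down, the guest-entry check is roughly equivalent to the following C sketch. This is an illustration only, not part of the patch: the helper name and open-coded constants are made up here, and the in-tree code stays in hand-written assembly, using an ldarx/stdcx. sequence for the test-and-clear.

    static void flush_tlb_if_needed(struct kvm *kvm, int pcpu)
    {
            unsigned long rb;
            int i;

            /* only the bit for thread 0 of each core is ever set/tested */
            if (!cpumask_test_and_clear_cpu(pcpu, &kvm->arch.need_tlb_flush))
                    return;

            rb = 0x800;                     /* IS field = 0b10: this LPID only */
            asm volatile("ptesync" : : : "memory");
            for (i = 0; i < 128; i++) {     /* 128 TLB sets on POWER7 */
                    asm volatile("tlbiel %0" : : "r" (rb));
                    rb += 0x1000;           /* advance to the next set */
            }
            asm volatile("ptesync" : : : "memory");
    }
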
Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 58c7264..62fbd38 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -246,11 +246,12 @@ struct kvm_arch { int using_mmu_notifiers; u32 hpt_order; atomic_t vcpus_running; + u32 online_vcores; unsigned long hpt_npte; unsigned long hpt_mask; atomic_t hpte_mod_interest; spinlock_t slot_phys_lock; - unsigned short last_vcpu[NR_CPUS]; + cpumask_t need_tlb_flush; struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; struct kvmppc_linear_info *hpt_li; #endif /* CONFIG_KVM_BOOK3S_64_HV */ @@ -275,6 +276,7 @@ struct kvmppc_vcore { int nap_count; int napping_threads; u16 pcpu; + u16 last_cpu; u8 vcore_state; u8 in_guest; struct list_head runnable_threads; @@ -523,7 +525,6 @@ struct kvm_vcpu_arch { u64 dec_jiffies; u64 dec_expires; unsigned long pending_exceptions; - u16 last_cpu; u8 ceded; u8 prodded; u32 last_inst; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 7523539..4e23ba2 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -441,8 +441,7 @@ int main(void) DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr)); DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1)); DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock)); - DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter)); - DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu)); + DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits)); DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr)); DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor)); DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v)); @@ -470,7 +469,6 @@ int main(void) DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb)); DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max)); DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr)); - DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu)); DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr)); DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar)); DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index ac6b5ac..8cc18ab 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -148,11 +148,8 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) * Reset all the reverse-mapping chains for all memslots */ kvmppc_rmap_reset(kvm); - /* - * Set the whole last_vcpu array to an invalid vcpu number. - * This ensures that each vcpu will flush its TLB on next entry. - */ - memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu)); + /* Ensure that each vcpu will flush its TLB on next entry. 
*/ + cpumask_setall(&kvm->arch.need_tlb_flush); *htab_orderp = order; err = 0; } else { diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index a4f59db..ddbec60 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -853,7 +853,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) goto free_vcpu; vcpu->arch.shared = &vcpu->arch.shregs; - vcpu->arch.last_cpu = -1; vcpu->arch.mmcr[0] = MMCR0_FC; vcpu->arch.ctrl = CTRL_RUNLATCH; /* default to host PVR, since we can't spoof it */ @@ -880,6 +879,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) vcore->preempt_tb = TB_NIL; } kvm->arch.vcores[core] = vcore; + kvm->arch.online_vcores++; } mutex_unlock(&kvm->lock); @@ -1802,6 +1802,13 @@ int kvmppc_core_init_vm(struct kvm *kvm) return -ENOMEM; kvm->arch.lpid = lpid; + /* + * Since we don't flush the TLB when tearing down a VM, + * and this lpid might have previously been used, + * make sure we flush on each core before running the new VM. + */ + cpumask_setall(&kvm->arch.need_tlb_flush); + INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); kvm->arch.rma = NULL; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 7a57ea4..19c93ba 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -35,6 +35,37 @@ static void *real_vmalloc_addr(void *x) return __va(addr); } +/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */ +static int global_invalidates(struct kvm *kvm, unsigned long flags) +{ + int global; + + /* + * If there is only one vcore, and it's currently running, + * we can use tlbiel as long as we mark all other physical + * cores as potentially having stale TLB entries for this lpid. + * If we're not using MMU notifiers, we never take pages away + * from the guest, so we can use tlbiel if requested. + * Otherwise, don't use tlbiel. + */ + if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcore) + global = 0; + else if (kvm->arch.using_mmu_notifiers) + global = 1; + else + global = !(flags & H_LOCAL); + + if (!global) { + /* any other core might now have stale TLB entries... */ + smp_wmb(); + cpumask_setall(&kvm->arch.need_tlb_flush); + cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu, + &kvm->arch.need_tlb_flush); + } + + return global; +} + /* * Add this HPTE into the chain for the real page. * Must be called with the chain locked; it unlocks the chain. 
@@ -390,7 +421,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, if (v & HPTE_V_VALID) { hpte[0] &= ~HPTE_V_VALID; rb = compute_tlbie_rb(v, hpte[1], pte_index); - if (!(flags & H_LOCAL) && atomic_read(&kvm->online_vcpus) > 1) { + if (global_invalidates(kvm, flags)) { while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) cpu_relax(); asm volatile("ptesync" : : : "memory"); @@ -565,8 +596,6 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, return H_NOT_FOUND; } - if (atomic_read(&kvm->online_vcpus) == 1) - flags |= H_LOCAL; v = hpte[0]; bits = (flags << 55) & HPTE_R_PP0; bits |= (flags << 48) & HPTE_R_KEY_HI; @@ -587,7 +616,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, if (v & HPTE_V_VALID) { rb = compute_tlbie_rb(v, r, pte_index); hpte[0] = v & ~HPTE_V_VALID; - if (!(flags & H_LOCAL)) { + if (global_invalidates(kvm, flags)) { while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) cpu_relax(); asm volatile("ptesync" : : : "memory"); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 690d112..b48bd53 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -313,7 +313,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) mtspr SPRN_SDR1,r6 /* switch to partition page table */ mtspr SPRN_LPID,r7 isync + + /* See if we need to flush the TLB */ + lhz r6,PACAPACAINDEX(r13) /* test_bit(cpu, need_tlb_flush) */ + clrldi r7,r6,64-6 /* extract bit number (6 bits) */ + srdi r6,r6,6 /* doubleword number */ + sldi r6,r6,3 /* address offset */ + add r6,r6,r9 + addi r6,r6,KVM_NEED_FLUSH /* dword in kvm->arch.need_tlb_flush */ li r0,1 + sld r0,r0,r7 + ld r7,0(r6) + and. r7,r7,r0 + beq 22f +23: ldarx r7,0,r6 /* if set, clear the bit */ + andc r7,r7,r0 + stdcx. r7,0,r6 + bne 23b + li r6,128 /* and flush the TLB */ + mtctr r6 + li r7,0x800 /* IS field = 0b10 */ + ptesync +28: tlbiel r7 + addi r7,r7,0x1000 + bdnz 28b + ptesync + +22: li r0,1 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ b 10f @@ -336,36 +362,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) mr r9,r4 blt hdec_soon - /* - * Invalidate the TLB if we could possibly have stale TLB - * entries for this partition on this core due to the use - * of tlbiel. - * XXX maybe only need this on primary thread? - */ - ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ - lwz r5,VCPU_VCPUID(r4) - lhz r6,PACAPACAINDEX(r13) - rldimi r6,r5,0,62 /* XXX map as if threads 1:1 p:v */ - lhz r8,VCPU_LAST_CPU(r4) - sldi r7,r6,1 /* see if this is the same vcpu */ - add r7,r7,r9 /* as last ran on this pcpu */ - lhz r0,KVM_LAST_VCPU(r7) - cmpw r6,r8 /* on the same cpu core as last time? */ - bne 3f - cmpw r0,r5 /* same vcpu as this core last ran? */ - beq 1f -3: sth r6,VCPU_LAST_CPU(r4) /* if not, invalidate partition TLB */ - sth r5,KVM_LAST_VCPU(r7) - li r6,128 - mtctr r6 - li r7,0x800 /* IS field = 0b10 */ - ptesync -2: tlbiel r7 - addi r7,r7,0x1000 - bdnz 2b - ptesync -1: - /* Save purr/spurr */ mfspr r5,SPRN_PURR mfspr r6,SPRN_SPURR -- cgit v0.10.2 From b4072df4076c4f33ac9f518052c318c979bca533 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 23 Nov 2012 22:37:50 +0000 Subject: KVM: PPC: Book3S HV: Handle guest-caused machine checks on POWER7 without panicking Currently, if a machine check interrupt happens while we are in the guest, we exit the guest and call the host's machine check handler, which tends to cause the host to panic. 
Some machine checks can be triggered by the guest; for example, if the guest creates two entries in the SLB that map the same effective address, and then accesses that effective address, the CPU will take a machine check interrupt. To handle this better, when a machine check happens inside the guest, we call a new function, kvmppc_realmode_machine_check(), while still in real mode before exiting the guest. On POWER7, it handles the cases that the guest can trigger, either by flushing and reloading the SLB, or by flushing the TLB, and then it delivers the machine check interrupt directly to the guest without going back to the host. On POWER7, the OPAL firmware patches the machine check interrupt vector so that it gets control first, and it leaves behind its analysis of the situation in a structure pointed to by the opal_mc_evt field of the paca. The kvmppc_realmode_machine_check() function looks at this, and if OPAL reports that there was no error, or that it has handled the error, we also go straight back to the guest with a machine check. We have to deliver a machine check to the guest since the machine check interrupt might have trashed valid values in SRR0/1. If the machine check is one we can't handle in real mode, and one that OPAL hasn't already handled, or on PPC970, we exit the guest and call the host's machine check handler. We do this by jumping to the machine_check_fwnmi label, rather than absolute address 0x200, because we don't want to re-execute OPAL's handler on POWER7. On PPC970, the two are equivalent because address 0x200 just contains a branch. Then, if the host machine check handler decides that the system can continue executing, kvmppc_handle_exit() delivers a machine check interrupt to the guest -- once again to let the guest know that SRR0/1 have been modified. Signed-off-by: Paul Mackerras [agraf: fix checkpatch warnings] Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index 9673f73..2fdb47a 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ b/arch/powerpc/include/asm/mmu-hash64.h @@ -121,6 +121,16 @@ extern char initial_stab[]; #define PP_RXRX 3 /* Supervisor read, User read */ #define PP_RXXX (HPTE_R_PP0 | 2) /* Supervisor read, user none */ +/* Fields for tlbiel instruction in architecture 2.06 */ +#define TLBIEL_INVAL_SEL_MASK 0xc00 /* invalidation selector */ +#define TLBIEL_INVAL_PAGE 0x000 /* invalidate a single page */ +#define TLBIEL_INVAL_SET_LPID 0x800 /* invalidate a set for current LPID */ +#define TLBIEL_INVAL_SET 0xc00 /* invalidate a set for all LPIDs */ +#define TLBIEL_INVAL_SET_MASK 0xfff000 /* set number to inval. 
*/ +#define TLBIEL_INVAL_SET_SHIFT 12 + +#define POWER7_TLB_SETS 128 /* # sets in POWER7 TLB */ + #ifndef __ASSEMBLY__ struct hash_pte { diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index cd89658..1e473d4 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -73,6 +73,7 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ book3s_hv_rmhandlers.o \ book3s_hv_rm_mmu.o \ book3s_64_vio_hv.o \ + book3s_hv_ras.o \ book3s_hv_builtin.o kvm-book3s_64-module-objs := \ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index ddbec60..71d0c90 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -545,6 +545,17 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_PERFMON: r = RESUME_GUEST; break; + case BOOK3S_INTERRUPT_MACHINE_CHECK: + /* + * Deliver a machine check interrupt to the guest. + * We have to do this, even if the host has handled the + * machine check, because machine checks use SRR0/1 and + * the interrupt might have trashed guest state in them. + */ + kvmppc_book3s_queue_irqprio(vcpu, + BOOK3S_INTERRUPT_MACHINE_CHECK); + r = RESUME_GUEST; + break; case BOOK3S_INTERRUPT_PROGRAM: { ulong flags; diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c new file mode 100644 index 0000000..35f3cf0 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -0,0 +1,144 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * Copyright 2012 Paul Mackerras, IBM Corp. + */ + +#include +#include +#include +#include +#include +#include + +/* SRR1 bits for machine check on POWER7 */ +#define SRR1_MC_LDSTERR (1ul << (63-42)) +#define SRR1_MC_IFETCH_SH (63-45) +#define SRR1_MC_IFETCH_MASK 0x7 +#define SRR1_MC_IFETCH_SLBPAR 2 /* SLB parity error */ +#define SRR1_MC_IFETCH_SLBMULTI 3 /* SLB multi-hit */ +#define SRR1_MC_IFETCH_SLBPARMULTI 4 /* SLB parity + multi-hit */ +#define SRR1_MC_IFETCH_TLBMULTI 5 /* I-TLB multi-hit */ + +/* DSISR bits for machine check on POWER7 */ +#define DSISR_MC_DERAT_MULTI 0x800 /* D-ERAT multi-hit */ +#define DSISR_MC_TLB_MULTI 0x400 /* D-TLB multi-hit */ +#define DSISR_MC_SLB_PARITY 0x100 /* SLB parity error */ +#define DSISR_MC_SLB_MULTI 0x080 /* SLB multi-hit */ +#define DSISR_MC_SLB_PARMULTI 0x040 /* SLB parity + multi-hit */ + +/* POWER7 SLB flush and reload */ +static void reload_slb(struct kvm_vcpu *vcpu) +{ + struct slb_shadow *slb; + unsigned long i, n; + + /* First clear out SLB */ + asm volatile("slbmte %0,%0; slbia" : : "r" (0)); + + /* Do they have an SLB shadow buffer registered? 
*/ + slb = vcpu->arch.slb_shadow.pinned_addr; + if (!slb) + return; + + /* Sanity check */ + n = min_t(u32, slb->persistent, SLB_MIN_SIZE); + if ((void *) &slb->save_area[n] > vcpu->arch.slb_shadow.pinned_end) + return; + + /* Load up the SLB from that */ + for (i = 0; i < n; ++i) { + unsigned long rb = slb->save_area[i].esid; + unsigned long rs = slb->save_area[i].vsid; + + rb = (rb & ~0xFFFul) | i; /* insert entry number */ + asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb)); + } +} + +/* POWER7 TLB flush */ +static void flush_tlb_power7(struct kvm_vcpu *vcpu) +{ + unsigned long i, rb; + + rb = TLBIEL_INVAL_SET_LPID; + for (i = 0; i < POWER7_TLB_SETS; ++i) { + asm volatile("tlbiel %0" : : "r" (rb)); + rb += 1 << TLBIEL_INVAL_SET_SHIFT; + } +} + +/* + * On POWER7, see if we can handle a machine check that occurred inside + * the guest in real mode, without switching to the host partition. + * + * Returns: 0 => exit guest, 1 => deliver machine check to guest + */ +static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) +{ + unsigned long srr1 = vcpu->arch.shregs.msr; + struct opal_machine_check_event *opal_evt; + long handled = 1; + + if (srr1 & SRR1_MC_LDSTERR) { + /* error on load/store */ + unsigned long dsisr = vcpu->arch.shregs.dsisr; + + if (dsisr & (DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI | + DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI)) { + /* flush and reload SLB; flushes D-ERAT too */ + reload_slb(vcpu); + dsisr &= ~(DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI | + DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI); + } + if (dsisr & DSISR_MC_TLB_MULTI) { + flush_tlb_power7(vcpu); + dsisr &= ~DSISR_MC_TLB_MULTI; + } + /* Any other errors we don't understand? */ + if (dsisr & 0xffffffffUL) + handled = 0; + } + + switch ((srr1 >> SRR1_MC_IFETCH_SH) & SRR1_MC_IFETCH_MASK) { + case 0: + break; + case SRR1_MC_IFETCH_SLBPAR: + case SRR1_MC_IFETCH_SLBMULTI: + case SRR1_MC_IFETCH_SLBPARMULTI: + reload_slb(vcpu); + break; + case SRR1_MC_IFETCH_TLBMULTI: + flush_tlb_power7(vcpu); + break; + default: + handled = 0; + } + + /* + * See if OPAL has already handled the condition. + * We assume that if the condition is recovered then OPAL + * will have generated an error log event that we will pick + * up and log later. 
+ */ + opal_evt = local_paca->opal_mc_evt; + if (opal_evt->version == OpalMCE_V1 && + (opal_evt->severity == OpalMCE_SEV_NO_ERROR || + opal_evt->disposition == OpalMCE_DISPOSITION_RECOVERED)) + handled = 1; + + if (handled) + opal_evt->in_use = 0; + + return handled; +} + +long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu) +{ + if (cpu_has_feature(CPU_FTR_ARCH_206)) + return kvmppc_realmode_mc_power7(vcpu); + + return 0; +} diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index b48bd53..10b6c35 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -27,6 +27,7 @@ #include #include #include +#include /***************************************************************************** * * @@ -678,8 +679,7 @@ BEGIN_FTR_SECTION 1: END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) -nohpte_cont: -hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ +guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ /* Save DEC */ mfspr r5,SPRN_DEC mftb r6 @@ -700,6 +700,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) std r6, VCPU_FAULT_DAR(r9) stw r7, VCPU_FAULT_DSISR(r9) + /* See if it is a machine check */ + cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK + beq machine_check_realmode +mc_cont: + /* Save guest CTRL register, set runlatch to 1 */ 6: mfspr r6,SPRN_CTRLF stw r6,VCPU_CTRL(r9) @@ -1112,38 +1117,41 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) /* * For external and machine check interrupts, we need * to call the Linux handler to process the interrupt. - * We do that by jumping to the interrupt vector address - * which we have in r12. The [h]rfid at the end of the + * We do that by jumping to absolute address 0x500 for + * external interrupts, or the machine_check_fwnmi label + * for machine checks (since firmware might have patched + * the vector area at 0x200). The [h]rfid at the end of the * handler will return to the book3s_hv_interrupts.S code. * For other interrupts we do the rfid to get back - * to the book3s_interrupts.S code here. + * to the book3s_hv_interrupts.S code here. */ ld r8, HSTATE_VMHANDLER(r13) ld r7, HSTATE_HOST_MSR(r13) + cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL +BEGIN_FTR_SECTION beq 11f - cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) /* RFI into the highmem handler, or branch to interrupt handler */ -12: mfmsr r6 - mtctr r12 + mfmsr r6 li r0, MSR_RI andc r6, r6, r0 mtmsrd r6, 1 /* Clear RI in MSR */ mtsrr0 r8 mtsrr1 r7 - beqctr + beqa 0x500 /* external interrupt (PPC970) */ + beq cr1, 13f /* machine check */ RFI -11: -BEGIN_FTR_SECTION - b 12b -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) - mtspr SPRN_HSRR0, r8 + /* On POWER7, we have external interrupts set to use HSRR0/1 */ +11: mtspr SPRN_HSRR0, r8 mtspr SPRN_HSRR1, r7 ba 0x500 +13: b machine_check_fwnmi + /* * Check whether an HDSI is an HPTE not found fault or something else. * If it is an HPTE not found fault that is due to the guest accessing @@ -1176,7 +1184,7 @@ kvmppc_hdsi: cmpdi r3, 0 /* retry the instruction */ beq 6f cmpdi r3, -1 /* handle in kernel mode */ - beq nohpte_cont + beq guest_exit_cont cmpdi r3, -2 /* MMIO emulation; need instr word */ beq 2f @@ -1190,6 +1198,7 @@ kvmppc_hdsi: li r10, BOOK3S_INTERRUPT_DATA_STORAGE li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ rotldi r11, r11, 63 +fast_interrupt_c_return: 6: ld r7, VCPU_CTR(r9) lwz r8, VCPU_XER(r9) mtctr r7 @@ -1222,7 +1231,7 @@ kvmppc_hdsi: /* Unset guest mode. 
*/ li r0, KVM_GUEST_MODE_NONE stb r0, HSTATE_IN_GUEST(r13) - b nohpte_cont + b guest_exit_cont /* * Similarly for an HISI, reflect it to the guest as an ISI unless @@ -1248,9 +1257,9 @@ kvmppc_hisi: ld r11, VCPU_MSR(r9) li r12, BOOK3S_INTERRUPT_H_INST_STORAGE cmpdi r3, 0 /* retry the instruction */ - beq 6f + beq fast_interrupt_c_return cmpdi r3, -1 /* handle in kernel mode */ - beq nohpte_cont + beq guest_exit_cont /* Synthesize an ISI for the guest */ mr r11, r3 @@ -1259,12 +1268,7 @@ kvmppc_hisi: li r10, BOOK3S_INTERRUPT_INST_STORAGE li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ rotldi r11, r11, 63 -6: ld r7, VCPU_CTR(r9) - lwz r8, VCPU_XER(r9) - mtctr r7 - mtxer r8 - mr r4, r9 - b fast_guest_return + b fast_interrupt_c_return 3: ld r6, VCPU_KVM(r9) /* not relocated, use VRMA */ ld r5, KVM_VRMA_SLB_V(r6) @@ -1280,14 +1284,14 @@ kvmppc_hisi: hcall_try_real_mode: ld r3,VCPU_GPR(R3)(r9) andi. r0,r11,MSR_PR - bne hcall_real_cont + bne guest_exit_cont clrrdi r3,r3,2 cmpldi r3,hcall_real_table_end - hcall_real_table - bge hcall_real_cont + bge guest_exit_cont LOAD_REG_ADDR(r4, hcall_real_table) lwzx r3,r3,r4 cmpwi r3,0 - beq hcall_real_cont + beq guest_exit_cont add r3,r3,r4 mtctr r3 mr r3,r9 /* get vcpu pointer */ @@ -1308,7 +1312,7 @@ hcall_real_fallback: li r12,BOOK3S_INTERRUPT_SYSCALL ld r9, HSTATE_KVM_VCPU(r13) - b hcall_real_cont + b guest_exit_cont .globl hcall_real_table hcall_real_table: @@ -1567,6 +1571,21 @@ kvm_cede_exit: li r3,H_TOO_HARD blr + /* Try to handle a machine check in real mode */ +machine_check_realmode: + mr r3, r9 /* get vcpu pointer */ + bl .kvmppc_realmode_machine_check + nop + cmpdi r3, 0 /* continue exiting from guest? */ + ld r9, HSTATE_KVM_VCPU(r13) + li r12, BOOK3S_INTERRUPT_MACHINE_CHECK + beq mc_cont + /* If not, deliver a machine check. SRR0/1 are already set */ + li r10, BOOK3S_INTERRUPT_MACHINE_CHECK + li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ + rotldi r11, r11, 63 + b fast_interrupt_c_return + secondary_too_late: ld r5,HSTATE_KVM_VCORE(r13) HMT_LOW -- cgit v0.10.2 From 910040b82de872af453bf3ecc59de8f0abd22697 Mon Sep 17 00:00:00 2001 From: Mihai Caraman Date: Thu, 11 Oct 2012 06:13:18 +0000 Subject: KVM: PPC: e500: Silence bogus GCC warning in tlb code 64-bit GCC 4.5.1 warns about an uninitialized variable which was guarded by a flag. Initialize the variable to make it happy. Signed-off-by: Mihai Caraman [agraf: reword comment] Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 6305ee6..5532bfb 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -415,7 +415,8 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, struct tlbe_ref *ref) { struct kvm_memory_slot *slot; - unsigned long pfn, hva; + unsigned long pfn = 0; /* silence GCC warning */ + unsigned long hva; int pfnmap = 0; int tsize = BOOK3E_PAGESZ_4K; -- cgit v0.10.2 From b50df19cccdd169d5345b5169699446b80ee051a Mon Sep 17 00:00:00 2001 From: Mihai Caraman Date: Thu, 11 Oct 2012 06:13:19 +0000 Subject: KVM: PPC: booke: Fix get_tb() compile error on 64-bit Include header file for get_tb() declaration. 
Signed-off-by: Mihai Caraman Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 3d1f35d..7c9c389 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "timing.h" #include "booke.h" -- cgit v0.10.2 From ff594746845877c0a6402be23897df659188eacb Mon Sep 17 00:00:00 2001 From: Mihai Caraman Date: Thu, 11 Oct 2012 06:13:20 +0000 Subject: KVM: PPC: bookehv: Remove GET_VCPU macro from exception handler GET_VCPU define will not be implemented for 64-bit for performance reasons so get rid of it also on 32-bit. Signed-off-by: Mihai Caraman Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S index 099fe82..fa6d552 100644 --- a/arch/powerpc/kvm/bookehv_interrupts.S +++ b/arch/powerpc/kvm/bookehv_interrupts.S @@ -32,9 +32,6 @@ #include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */ -#define GET_VCPU(vcpu, thread) \ - PPC_LL vcpu, THREAD_KVM_VCPU(thread) - #define LONGBYTES (BITS_PER_LONG / 8) #define VCPU_GUEST_SPRG(n) (VCPU_GUEST_SPRGS + (n * LONGBYTES)) @@ -206,7 +203,7 @@ */ .macro kvm_handler intno srr0, srr1, flags _GLOBAL(kvmppc_handler_\intno\()_\srr1) - GET_VCPU(r11, r10) + PPC_LL r11, THREAD_KVM_VCPU(r10) PPC_STL r3, VCPU_GPR(R3)(r11) mfspr r3, SPRN_SPRG_RSCRATCH0 PPC_STL r4, VCPU_GPR(R4)(r11) @@ -233,7 +230,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1) .macro kvm_lvl_handler intno scratch srr0, srr1, flags _GLOBAL(kvmppc_handler_\intno\()_\srr1) mfspr r10, SPRN_SPRG_THREAD - GET_VCPU(r11, r10) + PPC_LL r11, THREAD_KVM_VCPU(r10) PPC_STL r3, VCPU_GPR(R3)(r11) mfspr r3, \scratch PPC_STL r4, VCPU_GPR(R4)(r11) -- cgit v0.10.2 From e51f8f32d6b82f4a34dbb5781769c79b813e5694 Mon Sep 17 00:00:00 2001 From: Mihai Caraman Date: Thu, 11 Oct 2012 06:13:21 +0000 Subject: KVM: PPC: bookehv64: Add support for interrupt handling Add interrupt handling support for 64-bit bookehv hosts. Unify 32 and 64 bit implementations using a common stack layout and a common execution flow starting from kvm_handler_common macro. Update documentation for 64-bit input register values. This patch only address the bolted TLB miss exception handlers version. Signed-off-by: Mihai Caraman Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_booke_hv_asm.h b/arch/powerpc/include/asm/kvm_booke_hv_asm.h index a37a12a..3a79f53 100644 --- a/arch/powerpc/include/asm/kvm_booke_hv_asm.h +++ b/arch/powerpc/include/asm/kvm_booke_hv_asm.h @@ -17,6 +17,7 @@ * there are no exceptions for which we fall through directly to * the normal host handler. 
* + * 32-bit host * Expected inputs (normal exceptions): * SCRATCH0 = saved r10 * r10 = thread struct @@ -33,6 +34,30 @@ * *(r8 + GPR9) = saved r9 * *(r8 + GPR10) = saved r10 (r10 not yet clobbered) * *(r8 + GPR11) = saved r11 + * + * 64-bit host + * Expected inputs (GEN/GDBELL/DBG/MC exception types): + * r10 = saved CR + * r13 = PACA_POINTER + * *(r13 + PACA_EX##type + EX_R10) = saved r10 + * *(r13 + PACA_EX##type + EX_R11) = saved r11 + * SPRN_SPRG_##type##_SCRATCH = saved r13 + * + * Expected inputs (CRIT exception type): + * r10 = saved CR + * r13 = PACA_POINTER + * *(r13 + PACA_EX##type + EX_R10) = saved r10 + * *(r13 + PACA_EX##type + EX_R11) = saved r11 + * *(r13 + PACA_EX##type + EX_R13) = saved r13 + * + * Expected inputs (TLB exception type): + * r10 = saved CR + * r13 = PACA_POINTER + * *(r13 + PACA_EX##type + EX_TLB_R10) = saved r10 + * *(r13 + PACA_EX##type + EX_TLB_R11) = saved r11 + * SPRN_SPRG_GEN_SCRATCH = saved r13 + * + * Only the bolted version of TLB miss exception handlers is supported now. */ .macro DO_KVM intno srr1 #ifdef CONFIG_KVM_BOOKE_HV diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S index fa6d552..e8ed7d6 100644 --- a/arch/powerpc/kvm/bookehv_interrupts.S +++ b/arch/powerpc/kvm/bookehv_interrupts.S @@ -16,6 +16,7 @@ * * Author: Varun Sethi * Author: Scott Wood + * Author: Mihai Caraman * * This file is derived from arch/powerpc/kvm/booke_interrupts.S */ @@ -30,28 +31,33 @@ #include #include +#ifdef CONFIG_64BIT +#include +#else #include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */ +#endif #define LONGBYTES (BITS_PER_LONG / 8) #define VCPU_GUEST_SPRG(n) (VCPU_GUEST_SPRGS + (n * LONGBYTES)) /* The host stack layout: */ -#define HOST_R1 (0 * LONGBYTES) /* Implied by stwu. */ -#define HOST_CALLEE_LR (1 * LONGBYTES) -#define HOST_RUN (2 * LONGBYTES) /* struct kvm_run */ +#define HOST_R1 0 /* Implied by stwu. */ +#define HOST_CALLEE_LR PPC_LR_STKOFF +#define HOST_RUN (HOST_CALLEE_LR + LONGBYTES) /* * r2 is special: it holds 'current', and it made nonvolatile in the * kernel with the -ffixed-r2 gcc option. */ -#define HOST_R2 (3 * LONGBYTES) -#define HOST_CR (4 * LONGBYTES) -#define HOST_NV_GPRS (5 * LONGBYTES) +#define HOST_R2 (HOST_RUN + LONGBYTES) +#define HOST_CR (HOST_R2 + LONGBYTES) +#define HOST_NV_GPRS (HOST_CR + LONGBYTES) #define __HOST_NV_GPR(n) (HOST_NV_GPRS + ((n - 14) * LONGBYTES)) #define HOST_NV_GPR(n) __HOST_NV_GPR(__REG_##n) #define HOST_MIN_STACK_SIZE (HOST_NV_GPR(R31) + LONGBYTES) #define HOST_STACK_SIZE ((HOST_MIN_STACK_SIZE + 15) & ~15) /* Align. */ -#define HOST_STACK_LR (HOST_STACK_SIZE + LONGBYTES) /* In caller stack frame. */ +/* LR in caller stack frame. 
*/ +#define HOST_STACK_LR (HOST_STACK_SIZE + PPC_LR_STKOFF) #define NEED_EMU 0x00000001 /* emulation -- save nv regs */ #define NEED_DEAR 0x00000002 /* save faulting DEAR */ @@ -198,6 +204,122 @@ b kvmppc_resume_host .endm +#ifdef CONFIG_64BIT +/* Exception types */ +#define EX_GEN 1 +#define EX_GDBELL 2 +#define EX_DBG 3 +#define EX_MC 4 +#define EX_CRIT 5 +#define EX_TLB 6 + +/* + * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h + */ +.macro kvm_handler intno type scratch, paca_ex, ex_r10, ex_r11, srr0, srr1, flags + _GLOBAL(kvmppc_handler_\intno\()_\srr1) + mr r11, r4 + /* + * Get vcpu from Paca: paca->__current.thread->kvm_vcpu + */ + PPC_LL r4, PACACURRENT(r13) + PPC_LL r4, (THREAD + THREAD_KVM_VCPU)(r4) + stw r10, VCPU_CR(r4) + PPC_STL r11, VCPU_GPR(R4)(r4) + PPC_STL r5, VCPU_GPR(R5)(r4) + .if \type == EX_CRIT + PPC_LL r5, (\paca_ex + EX_R13)(r13) + .else + mfspr r5, \scratch + .endif + PPC_STL r6, VCPU_GPR(R6)(r4) + PPC_STL r8, VCPU_GPR(R8)(r4) + PPC_STL r9, VCPU_GPR(R9)(r4) + PPC_STL r5, VCPU_GPR(R13)(r4) + PPC_LL r6, (\paca_ex + \ex_r10)(r13) + PPC_LL r8, (\paca_ex + \ex_r11)(r13) + PPC_STL r3, VCPU_GPR(R3)(r4) + PPC_STL r7, VCPU_GPR(R7)(r4) + PPC_STL r12, VCPU_GPR(R12)(r4) + PPC_STL r6, VCPU_GPR(R10)(r4) + PPC_STL r8, VCPU_GPR(R11)(r4) + mfctr r5 + PPC_STL r5, VCPU_CTR(r4) + mfspr r5, \srr0 + mfspr r6, \srr1 + kvm_handler_common \intno, \srr0, \flags +.endm + +#define EX_PARAMS(type) \ + EX_##type, \ + SPRN_SPRG_##type##_SCRATCH, \ + PACA_EX##type, \ + EX_R10, \ + EX_R11 + +#define EX_PARAMS_TLB \ + EX_TLB, \ + SPRN_SPRG_GEN_SCRATCH, \ + PACA_EXTLB, \ + EX_TLB_R10, \ + EX_TLB_R11 + +kvm_handler BOOKE_INTERRUPT_CRITICAL, EX_PARAMS(CRIT), \ + SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_MACHINE_CHECK, EX_PARAMS(MC), \ + SPRN_MCSRR0, SPRN_MCSRR1, 0 +kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1,(NEED_EMU | NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_INST_STORAGE, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, NEED_ESR +kvm_handler BOOKE_INTERRUPT_EXTERNAL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_ALIGNMENT, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1,(NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_PROGRAM, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1,NEED_ESR +kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DECREMENTER, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_FIT, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_WATCHDOG, EX_PARAMS(CRIT),\ + SPRN_CSRR0, SPRN_CSRR1, 0 +/* + * Only bolted TLB miss exception handlers are supported for now + */ +kvm_handler BOOKE_INTERRUPT_DTLB_MISS, EX_PARAMS_TLB, \ + SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_ITLB_MISS, EX_PARAMS_TLB, \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_UNAVAIL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_FP_DATA, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_FP_ROUND, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DOORBELL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, EX_PARAMS(CRIT), \ + SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler 
BOOKE_INTERRUPT_HV_PRIV, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, NEED_EMU +kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, EX_PARAMS(GDBELL), \ + SPRN_GSRR0, SPRN_GSRR1, 0 +kvm_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, EX_PARAMS(CRIT), \ + SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(DBG), \ + SPRN_DSRR0, SPRN_DSRR1, 0 +kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(CRIT), \ + SPRN_CSRR0, SPRN_CSRR1, 0 +#else /* * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h */ @@ -292,7 +414,7 @@ kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \ SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \ SPRN_SPRG_RSCRATCH_DBG, SPRN_DSRR0, SPRN_DSRR1, 0 - +#endif /* Registers: * SPRG_SCRATCH0: guest r10 -- cgit v0.10.2 From 7cdd7a95c66a6309ae6156471033fb5375cbcfca Mon Sep 17 00:00:00 2001 From: Mihai Caraman Date: Thu, 11 Oct 2012 06:13:22 +0000 Subject: KVM: PPC: e500: Add emulation helper for getting instruction ea Add emulation helper for getting instruction ea and refactor tlb instruction emulation to use it. Signed-off-by: Mihai Caraman [agraf: keep rt variable around] Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 1ca31e9..d55a2b2 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -295,4 +295,15 @@ static inline void kvmppc_lazy_ee_enable(void) #endif } +static inline ulong kvmppc_get_ea_indexed(struct kvm_vcpu *vcpu, int ra, int rb) +{ + ulong ea; + + ea = kvmppc_get_gpr(vcpu, rb); + if (ra) + ea += kvmppc_get_gpr(vcpu, ra); + + return ea; +} + #endif /* __POWERPC_KVM_PPC_H__ */ diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index d162286..32e98a7 100644 --- a/arch/powerpc/kvm/e500.h +++ b/arch/powerpc/kvm/e500.h @@ -129,9 +129,9 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value); int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu); int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu); -int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb); -int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int rt, int ra, int rb); -int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb); +int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea); +int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea); +int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea); int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500); void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500); diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index e04b0ef..e78f353 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -89,6 +89,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, int ra = get_ra(inst); int rb = get_rb(inst); int rt = get_rt(inst); + gva_t ea; switch (get_op(inst)) { case 31: @@ -113,15 +114,20 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case XOP_TLBSX: - emulated = kvmppc_e500_emul_tlbsx(vcpu,rb); + ea = kvmppc_get_ea_indexed(vcpu, ra, rb); + emulated = kvmppc_e500_emul_tlbsx(vcpu, ea); break; - case XOP_TLBILX: - emulated = kvmppc_e500_emul_tlbilx(vcpu, rt, ra, rb); + case XOP_TLBILX: { + int type = rt & 0x3; + ea = kvmppc_get_ea_indexed(vcpu, ra, rb); + emulated = kvmppc_e500_emul_tlbilx(vcpu, type, ea); break; + } case XOP_TLBIVAX: - 
emulated = kvmppc_e500_emul_tlbivax(vcpu, ra, rb); + ea = kvmppc_get_ea_indexed(vcpu, ra, rb); + emulated = kvmppc_e500_emul_tlbivax(vcpu, ea); break; default: diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 5532bfb..7a14721 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -689,14 +689,11 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value) return EMULATE_DONE; } -int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) +int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); unsigned int ia; int esel, tlbsel; - gva_t ea; - - ea = ((ra) ? kvmppc_get_gpr(vcpu, ra) : 0) + kvmppc_get_gpr(vcpu, rb); ia = (ea >> 2) & 0x1; @@ -723,7 +720,7 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) } static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, - int pid, int rt) + int pid, int type) { struct kvm_book3e_206_tlb_entry *tlbe; int tid, esel; @@ -732,7 +729,7 @@ static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; esel++) { tlbe = get_entry(vcpu_e500, tlbsel, esel); tid = get_tlb_tid(tlbe); - if (rt == 0 || tid == pid) { + if (type == 0 || tid == pid) { inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); } @@ -740,14 +737,9 @@ static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, } static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid, - int ra, int rb) + gva_t ea) { int tlbsel, esel; - gva_t ea; - - ea = kvmppc_get_gpr(&vcpu_e500->vcpu, rb); - if (ra) - ea += kvmppc_get_gpr(&vcpu_e500->vcpu, ra); for (tlbsel = 0; tlbsel < 2; tlbsel++) { esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, -1); @@ -759,16 +751,16 @@ static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid, } } -int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int rt, int ra, int rb) +int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int pid = get_cur_spid(vcpu); - if (rt == 0 || rt == 1) { - tlbilx_all(vcpu_e500, 0, pid, rt); - tlbilx_all(vcpu_e500, 1, pid, rt); - } else if (rt == 3) { - tlbilx_one(vcpu_e500, pid, ra, rb); + if (type == 0 || type == 1) { + tlbilx_all(vcpu_e500, 0, pid, type); + tlbilx_all(vcpu_e500, 1, pid, type); + } else if (type == 3) { + tlbilx_one(vcpu_e500, pid, ea); } return EMULATE_DONE; @@ -793,16 +785,13 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu) return EMULATE_DONE; } -int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) +int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int as = !!get_cur_sas(vcpu); unsigned int pid = get_cur_spid(vcpu); int esel, tlbsel; struct kvm_book3e_206_tlb_entry *gtlbe = NULL; - gva_t ea; - - ea = kvmppc_get_gpr(vcpu, rb); for (tlbsel = 0; tlbsel < 2; tlbsel++) { esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); -- cgit v0.10.2 From 8823a8fd0d730612f12a87102503622c01eb2468 Mon Sep 17 00:00:00 2001 From: Mihai Caraman Date: Thu, 11 Oct 2012 06:13:23 +0000 Subject: KVM: PPC: Mask ea's high 32-bits in 32/64 instr emulation Mask high 32 bits of effective address in emulation layer for guests running in 32-bit mode. 
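
To see why the truncation matters, here is a worked example with made-up register values (illustration only, not from the patch): on a 64-bit host the GPRs stay 64 bits wide even while the guest runs in 32-bit mode, and the guest expects indexed address arithmetic to wrap modulo 2^32.

    /* illustration: indexed EA for a 32-bit guest on a 64-bit host */
    ulong ra_val = 0xfffff000ul;
    ulong rb_val = 0x2000ul;
    ulong ea = ra_val + rb_val;     /* 0x100001000 with 64-bit registers */

    ea = (uint32_t)ea;              /* 0x00001000, what the guest expects */
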
Signed-off-by: Mihai Caraman [agraf: fix indent] Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index d55a2b2..572aa75 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -298,11 +298,21 @@ static inline void kvmppc_lazy_ee_enable(void) static inline ulong kvmppc_get_ea_indexed(struct kvm_vcpu *vcpu, int ra, int rb) { ulong ea; + ulong msr_64bit = 0; ea = kvmppc_get_gpr(vcpu, rb); if (ra) ea += kvmppc_get_gpr(vcpu, ra); +#if defined(CONFIG_PPC_BOOK3E_64) + msr_64bit = MSR_CM; +#elif defined(CONFIG_PPC_BOOK3S_64) + msr_64bit = MSR_SF; +#endif + + if (!(vcpu->arch.shared->msr & msr_64bit)) + ea = (uint32_t)ea; + return ea; } -- cgit v0.10.2 From 9e2fa646936160eca525bcb80c2cce05faa9b208 Mon Sep 17 00:00:00 2001 From: Mihai Caraman Date: Thu, 11 Oct 2012 06:13:24 +0000 Subject: KVM: PPC: e500: Mask MAS2 EPN high 32-bits in 32/64 tlbwe emulation Mask high 32 bits of MAS2's effective page number in tlbwe emulation for guests running in 32-bit mode. Signed-off-by: Mihai Caraman Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 7a14721..cf3f180 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -871,6 +871,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) gtlbe->mas1 = vcpu->arch.shared->mas1; gtlbe->mas2 = vcpu->arch.shared->mas2; + if (!(vcpu->arch.shared->msr & MSR_CM)) + gtlbe->mas2 &= 0xffffffffUL; gtlbe->mas7_3 = vcpu->arch.shared->mas7_3; trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1, -- cgit v0.10.2 From e9666ea1b3d11509b76f8ff5b9776d8d30709b19 Mon Sep 17 00:00:00 2001 From: Mihai Caraman Date: Thu, 11 Oct 2012 06:13:25 +0000 Subject: KVM: PPC: booke: Extend MAS2 EPN mask for 64-bit Extend MAS2 EPN mask to retain most significant bits on 64-bit hosts. Use this mask in tlb effective address accessor. Signed-off-by: Mihai Caraman Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h index eeabcdb..99d43e0 100644 --- a/arch/powerpc/include/asm/mmu-book3e.h +++ b/arch/powerpc/include/asm/mmu-book3e.h @@ -59,7 +59,7 @@ #define MAS1_TSIZE_SHIFT 7 #define MAS1_TSIZE(x) (((x) << MAS1_TSIZE_SHIFT) & MAS1_TSIZE_MASK) -#define MAS2_EPN 0xFFFFF000 +#define MAS2_EPN (~0xFFFUL) #define MAS2_X0 0x00000040 #define MAS2_X1 0x00000020 #define MAS2_W 0x00000010 diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index 32e98a7..c70d37e 100644 --- a/arch/powerpc/kvm/e500.h +++ b/arch/powerpc/kvm/e500.h @@ -154,7 +154,7 @@ get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe) static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe) { - return tlbe->mas2 & 0xfffff000; + return tlbe->mas2 & MAS2_EPN; } static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe) -- cgit v0.10.2 From 62b4db0042aa753810e0d4f184481cc107c925ba Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Sat, 1 Dec 2012 14:50:26 +0100 Subject: KVM: PPC: Make EPCR a valid field for booke64 and bookehv In BookE, EPCR is defined and valid when either the HV or the 64bit category are implemented. Reflect this in the field definition. Today the only KVM target on 64bit is HV enabled, so there is no change in actual source code, but this keeps the code closer to the spec and doesn't build up artificial road blocks for a PR KVM on 64bit. 
Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 62fbd38..ca9bf45 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -406,13 +406,18 @@ struct kvm_vcpu_arch { u32 host_mas4; u32 host_mas6; u32 shadow_epcr; - u32 epcr; u32 shadow_msrp; u32 eplc; u32 epsc; u32 oldpir; #endif +#if defined(CONFIG_BOOKE) +#if defined(CONFIG_KVM_BOOKE_HV) || defined(CONFIG_64BIT) + u32 epcr; +#endif +#endif + #ifdef CONFIG_PPC_BOOK3S /* For Gekko paired singles */ u32 qpr[32]; -- cgit v0.10.2 From 95e90b43c9c648bde607101e5a158941eec8e514 Mon Sep 17 00:00:00 2001 From: Mihai Caraman Date: Thu, 11 Oct 2012 06:13:26 +0000 Subject: KVM: PPC: bookehv: Add guest computation mode for irq delivery When delivering guest IRQs, update MSR computation mode according to guest interrupt computation mode found in EPCR. Signed-off-by: Mihai Caraman [agraf: remove HV dependency in the code] Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 7c9c389..9457fb1 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -312,6 +312,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, bool crit; bool keep_irq = false; enum int_class int_class; + ulong new_msr = vcpu->arch.shared->msr; /* Truncate crit indicators in 32 bit mode */ if (!(vcpu->arch.shared->msr & MSR_SF)) { @@ -407,7 +408,13 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, set_guest_esr(vcpu, vcpu->arch.queued_esr); if (update_dear == true) set_guest_dear(vcpu, vcpu->arch.queued_dear); - kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask); + + new_msr &= msr_mask; +#if defined(CONFIG_64BIT) + if (vcpu->arch.epcr & SPRN_EPCR_ICM) + new_msr |= MSR_CM; +#endif + kvmppc_set_msr(vcpu, new_msr); if (!keep_irq) clear_bit(priority, &vcpu->arch.pending_exceptions); -- cgit v0.10.2 From 38f988240c611f9d2595feb1b8ddcb80b0e97dec Mon Sep 17 00:00:00 2001 From: Mihai Caraman Date: Thu, 11 Oct 2012 06:13:27 +0000 Subject: KVM: PPC: bookehv: Add EPCR support in mtspr/mfspr emulation Add EPCR support in booke mtspr/mfspr emulation. EPCR register is defined only for 64-bit and HV categories, we will expose it at this point only to 64-bit virtual processors running on 64-bit HV hosts. Define a reusable setter function for vcpu's EPCR. 
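
As a usage sketch (guest-kernel C, not part of the patch): EPCR is a hypervisor-privileged SPR on these cores, so a guest access like the one below traps with an embedded-hypervisor privilege exception and lands in the new mtspr/mfspr emulation, which in turn calls the kvmppc_set_epcr() setter.

    /* guest kernel (sketch): request 64-bit interrupt delivery */
    mtspr(SPRN_EPCR, mfspr(SPRN_EPCR) | SPRN_EPCR_ICM);
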
Signed-off-by: Mihai Caraman [agraf: move HV dependency in the code] Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 9457fb1..037d045 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1473,6 +1473,18 @@ void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { } +void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr) +{ +#if defined(CONFIG_64BIT) + vcpu->arch.epcr = new_epcr; +#ifdef CONFIG_KVM_BOOKE_HV + vcpu->arch.shadow_epcr &= ~SPRN_EPCR_GICM; + if (vcpu->arch.epcr & SPRN_EPCR_ICM) + vcpu->arch.shadow_epcr |= SPRN_EPCR_GICM; +#endif +#endif +} + void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr) { vcpu->arch.tcr = new_tcr; diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h index ba61974..e9b88e4 100644 --- a/arch/powerpc/kvm/booke.h +++ b/arch/powerpc/kvm/booke.h @@ -69,6 +69,7 @@ extern unsigned long kvmppc_booke_handlers; void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr); void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr); +void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr); void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr); void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits); void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits); diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index 514790f..4685b8c 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -240,7 +240,14 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) case SPRN_MCSR: vcpu->arch.mcsr &= ~spr_val; break; - +#if defined(CONFIG_64BIT) + case SPRN_EPCR: + kvmppc_set_epcr(vcpu, spr_val); +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_EPCR, vcpu->arch.shadow_epcr); +#endif + break; +#endif default: emulated = EMULATE_FAIL; } @@ -335,6 +342,11 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) case SPRN_MCSR: *spr_val = vcpu->arch.mcsr; break; +#if defined(CONFIG_64BIT) + case SPRN_EPCR: + *spr_val = vcpu->arch.epcr; + break; +#endif default: emulated = EMULATE_FAIL; -- cgit v0.10.2 From 352df1deb2e3c40e65ff94c8d7c62d9144446b1c Mon Sep 17 00:00:00 2001 From: Mihai Caraman Date: Thu, 11 Oct 2012 06:13:29 +0000 Subject: KVM: PPC: booke: Get/set guest EPCR register using ONE_REG interface Implement ONE_REG interface for EPCR register adding KVM_REG_PPC_EPCR to the list of ONE_REG PPC supported registers. 
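
From the userspace side, the new register is reached through the generic ONE_REG ioctls. A minimal sketch, assuming a vcpu file descriptor obtained via KVM_CREATE_VCPU; error handling omitted:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static uint32_t get_epcr(int vcpu_fd)
    {
            uint32_t val = 0;
            struct kvm_one_reg reg = {
                    .id   = KVM_REG_PPC_EPCR,
                    .addr = (uintptr_t)&val,
            };

            ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);   /* read current EPCR */
            return val;
    }

    static void set_epcr(int vcpu_fd, uint32_t val)
    {
            struct kvm_one_reg reg = {
                    .id   = KVM_REG_PPC_EPCR,
                    .addr = (uintptr_t)&val,
            };

            ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);   /* update guest EPCR */
    }
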
Signed-off-by: Mihai Caraman [agraf: remove HV dependency, use get/put_user] Signed-off-by: Alexander Graf diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index a5607c5..a4df553 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1773,6 +1773,7 @@ registers, find a list below: PPC | KVM_REG_PPC_VPA_ADDR | 64 PPC | KVM_REG_PPC_VPA_SLB | 128 PPC | KVM_REG_PPC_VPA_DTL | 128 + PPC | KVM_REG_PPC_EPCR | 32 4.69 KVM_GET_ONE_REG diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 514883d..2fba8a6 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -411,4 +411,6 @@ struct kvm_get_htab_header { #define KVM_REG_PPC_VPA_SLB (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x83) #define KVM_REG_PPC_VPA_DTL (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x84) +#define KVM_REG_PPC_EPCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85) + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 037d045..69f1140 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1388,6 +1388,11 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) &vcpu->arch.dbg_reg.dac[dac], sizeof(u64)); break; } +#if defined(CONFIG_64BIT) + case KVM_REG_PPC_EPCR: + r = put_user(vcpu->arch.epcr, (u32 __user *)(long)reg->addr); + break; +#endif default: break; } @@ -1415,6 +1420,15 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) (u64 __user *)(long)reg->addr, sizeof(u64)); break; } +#if defined(CONFIG_64BIT) + case KVM_REG_PPC_EPCR: { + u32 new_epcr; + r = get_user(new_epcr, (u32 __user *)(long)reg->addr); + if (r == 0) + kvmppc_set_epcr(vcpu, new_epcr); + break; + } +#endif default: break; } -- cgit v0.10.2 From c21934632549910188fb5cf40e79033f27a0741b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 4 Dec 2012 07:17:11 +0800 Subject: KVM: MMU: optimize for set_spte There are two cases we need to adjust page size in set_spte: 1): the one is other vcpu creates new sp in the window between mapping_level() and acquiring mmu-lock. 2): the another case is the new sp is created by itself (page-fault path) when guest uses the target gfn as its page table. In current code, set_spte drop the spte and emulate the access for these case, it works not good: - for the case 1, it may destroy the mapping established by other vcpu, and do expensive instruction emulation. - for the case 2, it may emulate the access even if the guest is accessing the page which not used as page table. There is a example, 0~2M is used as huge page in guest, in this huge page, only page 3 used as page table, then guest read/writes on other pages can cause instruction emulation. Both of these cases can be fixed by allowing guest to retry the access, it will refault, then we can establish the mapping by using small page Signed-off-by: Xiao Guangrong Acked-by: Marcelo Tosatti Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b875a9e..01d7c2a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2382,12 +2382,20 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, || (!vcpu->arch.mmu.direct_map && write_fault && !is_write_protection(vcpu) && !user_fault)) { + /* + * There are two cases: + * - the one is other vcpu creates new sp in the window + * between mapping_level() and acquiring mmu-lock. 
+ * - the another case is the new sp is created by itself + * (page-fault path) when guest uses the target gfn as + * its page table. + * Both of these cases can be fixed by allowing guest to + * retry the access, it will refault, then we can establish + * the mapping by using small page. + */ if (level > PT_PAGE_TABLE_LEVEL && - has_wrprotected_page(vcpu->kvm, gfn, level)) { - ret = 1; - drop_spte(vcpu->kvm, sptep); + has_wrprotected_page(vcpu->kvm, gfn, level)) goto done; - } spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; -- cgit v0.10.2 From f23d1f4a116038c68df224deae6718fde87d8f0d Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Thu, 6 Dec 2012 23:40:47 +0800 Subject: x86/kexec: VMCLEAR VMCSs loaded on all cpus if necessary This patch provides a way to VMCLEAR VMCSs related to guests on all cpus before executing the VMXOFF when doing kdump. This is used to ensure the VMCSs in the vmcore updated and non-corrupted. Signed-off-by: Zhang Yanfei Acked-by: Eric W. Biederman Signed-off-by: Gleb Natapov diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 317ff17..28feeba 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -163,6 +163,8 @@ struct kimage_arch { }; #endif +extern void (*crash_vmclear_loaded_vmcss)(void); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_KEXEC_H */ diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 13ad899..2f6b8e8 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -30,6 +31,27 @@ int in_crash_kexec; +/* + * This is used to VMCLEAR all VMCSs loaded on the + * processor. And when loading kvm_intel module, the + * callback function pointer will be assigned. + * + * protected by rcu. + */ +void (*crash_vmclear_loaded_vmcss)(void) = NULL; +EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); + +static inline void cpu_crash_vmclear_loaded_vmcss(void) +{ + void (*do_vmclear_operation)(void) = NULL; + + rcu_read_lock(); + do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss); + if (do_vmclear_operation) + do_vmclear_operation(); + rcu_read_unlock(); +} + #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static void kdump_nmi_callback(int cpu, struct pt_regs *regs) @@ -46,6 +68,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) #endif crash_save_cpu(regs, cpu); + /* + * VMCLEAR VMCSs loaded on all cpus if needed. + */ + cpu_crash_vmclear_loaded_vmcss(); + /* Disable VMX or SVM if needed. * * We need to disable virtualization on all CPUs. @@ -88,6 +115,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs) kdump_nmi_shootdown_cpus(); + /* + * VMCLEAR VMCSs loaded on this cpu if needed. + */ + cpu_crash_vmclear_loaded_vmcss(); + /* Booting kdump kernel with VMX or SVM enabled won't work, * because (among other limitations) we can't disable paging * with the virt flags. -- cgit v0.10.2 From 8f536b7697a0d40ef6b5fd04cf2c04953d5ca06f Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Thu, 6 Dec 2012 23:43:34 +0800 Subject: KVM: VMX: provide the vmclear function and a bitmap to support VMCLEAR in kdump The vmclear function will be assigned to the callback function pointer when loading kvm-intel module. And the bitmap indicates whether we should do VMCLEAR operation in kdump. The bits in the bitmap are set/unset according to different conditions. Signed-off-by: Zhang Yanfei Acked-by: Eric W. 
Biederman Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 94833e2..1a30fd5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "trace.h" @@ -987,6 +988,46 @@ static void vmcs_load(struct vmcs *vmcs) vmcs, phys_addr); } +#ifdef CONFIG_KEXEC +/* + * This bitmap is used to indicate whether the vmclear + * operation is enabled on all cpus. All disabled by + * default. + */ +static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; + +static inline void crash_enable_local_vmclear(int cpu) +{ + cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); +} + +static inline void crash_disable_local_vmclear(int cpu) +{ + cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); +} + +static inline int crash_local_vmclear_enabled(int cpu) +{ + return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); +} + +static void crash_vmclear_local_loaded_vmcss(void) +{ + int cpu = raw_smp_processor_id(); + struct loaded_vmcs *v; + + if (!crash_local_vmclear_enabled(cpu)) + return; + + list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), + loaded_vmcss_on_cpu_link) + vmcs_clear(v->vmcs); +} +#else +static inline void crash_enable_local_vmclear(int cpu) { } +static inline void crash_disable_local_vmclear(int cpu) { } +#endif /* CONFIG_KEXEC */ + static void __loaded_vmcs_clear(void *arg) { struct loaded_vmcs *loaded_vmcs = arg; @@ -996,6 +1037,7 @@ static void __loaded_vmcs_clear(void *arg) return; /* vcpu migration can race with cpu offline */ if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) per_cpu(current_vmcs, cpu) = NULL; + crash_disable_local_vmclear(cpu); list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); /* @@ -1007,6 +1049,7 @@ static void __loaded_vmcs_clear(void *arg) smp_wmb(); loaded_vmcs_init(loaded_vmcs); + crash_enable_local_vmclear(cpu); } static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) @@ -1530,6 +1573,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); local_irq_disable(); + crash_disable_local_vmclear(cpu); /* * Read loaded_vmcs->cpu should be before fetching @@ -1540,6 +1584,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, &per_cpu(loaded_vmcss_on_cpu, cpu)); + crash_enable_local_vmclear(cpu); local_irq_enable(); /* @@ -2353,6 +2398,18 @@ static int hardware_enable(void *garbage) return -EBUSY; INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); + + /* + * Now we can enable the vmclear operation in kdump + * since the loaded_vmcss_on_cpu list on this cpu + * has been initialized. + * + * Though the cpu is not in VMX operation now, there + * is no problem to enable the vmclear operation + * for the loaded_vmcss_on_cpu list is empty! 
+ */ + crash_enable_local_vmclear(cpu); + rdmsrl(MSR_IA32_FEATURE_CONTROL, old); test_bits = FEATURE_CONTROL_LOCKED; @@ -7383,6 +7440,11 @@ static int __init vmx_init(void) if (r) goto out3; +#ifdef CONFIG_KEXEC + rcu_assign_pointer(crash_vmclear_loaded_vmcss, + crash_vmclear_local_loaded_vmcss); +#endif + vmx_disable_intercept_for_msr(MSR_FS_BASE, false); vmx_disable_intercept_for_msr(MSR_GS_BASE, false); vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); @@ -7420,6 +7482,11 @@ static void __exit vmx_exit(void) free_page((unsigned long)vmx_io_bitmap_b); free_page((unsigned long)vmx_io_bitmap_a); +#ifdef CONFIG_KEXEC + rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL); + synchronize_rcu(); +#endif + kvm_exit(); } -- cgit v0.10.2 From 49f8a1a5394d8baee5e56fb71e5cf993c228689a Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 6 Dec 2012 14:44:59 -0700 Subject: kvm: Fix irqfd resampler list walk A typo for the next pointer means we're walking random data here. Signed-off-by: Alex Williamson Signed-off-by: Marcelo Tosatti diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index d7424c8..b6eea5c 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -333,7 +333,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) mutex_lock(&kvm->irqfds.resampler_lock); list_for_each_entry(resampler, - &kvm->irqfds.resampler_list, list) { + &kvm->irqfds.resampler_list, link) { if (resampler->notifier.gsi == irqfd->gsi) { irqfd->resampler = resampler; break; -- cgit v0.10.2 From 0ca0d818cbcbcac75a02833861b6fc42a98b904e Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Tue, 11 Dec 2012 17:11:34 +0800 Subject: x86/kexec: crash_vmclear_local_vmcss needs __rcu This removes the sparse warning: arch/x86/kernel/crash.c:49:32: sparse: incompatible types in comparison expression (different address spaces) Reported-by: kbuild test robot Signed-off-by: Zhang Yanfei Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 28feeba..6080d26 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -163,7 +163,8 @@ struct kimage_arch { }; #endif -extern void (*crash_vmclear_loaded_vmcss)(void); +typedef void crash_vmclear_fn(void); +extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss; #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 2f6b8e8..74467fe 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -38,12 +38,12 @@ int in_crash_kexec; * * protected by rcu. */ -void (*crash_vmclear_loaded_vmcss)(void) = NULL; +crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); static inline void cpu_crash_vmclear_loaded_vmcss(void) { - void (*do_vmclear_operation)(void) = NULL; + crash_vmclear_fn *do_vmclear_operation = NULL; rcu_read_lock(); do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss); -- cgit v0.10.2 From a4d3326c2de46fd7bcc47d1e8786efccfc152f81 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 11 Dec 2012 15:14:10 +0200 Subject: KVM: VMX: fix DPL during entry to protected mode On CPUs without support for unrestricted guests DPL cannot be smaller than RPL for data segments during guest entry, but this state can occur if a data segment selector is changed, while the vcpu is in real mode, to a value with the lowest two bits != 00. Fix that by forcing DPL == RPL on transition to protected mode. This is a regression introduced by c865c43de66dc97.
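A rough sketch of the idea described above, outside the kernel tree: the names below (seg_cache, fixup_pmode_dataseg_sketch) are illustrative stand-ins for KVM's kvm_segment and fix_pmode_dataseg(), and only the DPL/RPL bookkeeping is shown. On x86 the RPL occupies bits 1:0 of a segment selector, so forcing DPL == RPL is a mask-and-assign; the actual one-line change follows in the diff below.

#include <stdint.h>

#define SELECTOR_RPL_MASK 0x3	/* RPL = selector bits 1:0 */

struct seg_cache {		/* illustrative stand-in for struct kvm_segment */
	uint16_t selector;
	uint8_t dpl;
	uint8_t s;		/* 1 = code/data (non-system) segment */
};

static void fixup_pmode_dataseg_sketch(struct seg_cache *seg)
{
	/*
	 * Real mode may have left a selector with a nonzero RPL in the
	 * segment cache; derive DPL from those bits so the DPL-vs-RPL
	 * guest-entry check described above is satisfied on the
	 * transition to protected mode.
	 */
	seg->dpl = seg->selector & SELECTOR_RPL_MASK;
	seg->s = 1;
}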
Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1a30fd5..feab3d9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2766,6 +2766,7 @@ static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) { tmp.base = vmcs_readl(sf->base); tmp.selector = vmcs_read16(sf->selector); + tmp.dpl = tmp.selector & SELECTOR_RPL_MASK; tmp.s = 1; } vmx_set_segment(vcpu, &tmp, seg); -- cgit v0.10.2 From 0b26b588d97031fe8e16a35186220a6c14c89849 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 11 Dec 2012 15:14:11 +0200 Subject: VMX: remove unneeded enable_unrestricted_guest check If enable_unrestricted_guest is true, vmx->rmode.vm86_active will always be false. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index feab3d9..d14bb12 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3316,7 +3316,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, * unrestricted guest like Westmere to older host that don't have * unrestricted guest like Nehelem. */ - if (!enable_unrestricted_guest && vmx->rmode.vm86_active) { + if (vmx->rmode.vm86_active) { switch (seg) { case VCPU_SREG_CS: vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); -- cgit v0.10.2 From 58b7825bc324da55415034a9f6ca5d716b8fd898 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 11 Dec 2012 15:14:12 +0200 Subject: KVM: emulator: fix real mode segment checks in address linearization In real mode the CS register is writable, so do not #GP on write. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 39171cb..979869f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -677,8 +677,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, addr.seg); if (!usable) goto bad; - /* code segment or read-only data segment */ - if (((desc.type & 8) || !(desc.type & 2)) && write) + /* code segment in protected mode or read-only data segment */ + if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8)) + || !(desc.type & 2)) && write) goto bad; /* unreadable code segment */ if (!fetch && (desc.type & 8) && !(desc.type & 2)) -- cgit v0.10.2
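To make the new __linearize() condition above easier to read, here is a stand-alone sketch of the write-permission check it implements. It is illustrative only (write_should_fault and emul_mode are invented names, not kernel API), but it mirrors the condition added in the diff: a write through a code segment faults only outside real mode, while a descriptor with type bit 1 clear still faults in either mode.

#include <stdbool.h>
#include <stdint.h>

enum emul_mode { EMUL_MODE_REAL, EMUL_MODE_PROT };	/* stand-in for X86EMUL_MODE_* */

/*
 * Descriptor type bits used here: bit 3 = code segment,
 * bit 1 = writable (data segment) / readable (code segment).
 */
static bool write_should_fault(enum emul_mode mode, uint8_t type)
{
	bool code_segment = type & 8;
	bool rw_bit_set = type & 2;

	/*
	 * Code segments forbid writes only in protected mode; a clear
	 * read/write bit forbids writes in any mode.
	 */
	return ((mode != EMUL_MODE_REAL) && code_segment) || !rw_bit_set;
}

For example, a write through a typical code descriptor (type 0xb, execute/read, accessed) no longer faults in real mode but still faults in protected mode, matching the commit message above.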