From e48ba1cbce12eb4546771d45c09dd94c3404efe8 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 10 Aug 2016 11:13:09 +1000 Subject: KVM: PPC: Book3S: Don't crash if irqfd used with no in-kernel XICS emulation It turns out that if userspace creates a pseries-type VM without in-kernel XICS (interrupt controller) emulation, and then connects an eventfd to the VM as an irqfd, and the eventfd gets signalled, the code will try to deliver an interrupt via the non-existent XICS object and crash the host kernel with a NULL pointer dereference. To fix this, we check for the presence of the XICS object before trying to deliver the interrupt, and return an error if it is absent. Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 05aa113..686147e 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -1252,6 +1252,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, { struct kvmppc_xics *xics = kvm->arch.xics; + if (!xics) + return -ENODEV; return ics_deliver_irq(xics, irq, level); } -- cgit v0.10.2 From 34a75b0f63356097ae9f706d64a793934891002f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 10 Aug 2016 11:27:27 +1000 Subject: KVM: PPC: Implement kvm_arch_intc_initialized() for PPC It doesn't make sense to create irqfds for a VM that doesn't have in-kernel interrupt controller emulation. There is an existing interface for architecture code to tell the irqfd code whether or not any interrupt controller has been initialized, called kvm_arch_intc_initialized(), so let's implement that for powerpc. Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index ec35af3..e36ce0c 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -43,6 +43,8 @@ #include #define KVM_MAX_VCPU_ID (threads_per_subcore * KVM_MAX_VCORES) +#define __KVM_HAVE_ARCH_INTC_INITIALIZED + #ifdef CONFIG_KVM_MMIO #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #endif diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 6ce40dd..ab2b3d5 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1167,6 +1167,19 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return r; } +bool kvm_arch_intc_initialized(struct kvm *kvm) +{ +#ifdef CONFIG_KVM_MPIC + if (kvm->arch.mpic) + return true; +#endif +#ifdef CONFIG_KVM_XICS + if (kvm->arch.xics) + return true; +#endif + return false; +} + int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { -- cgit v0.10.2 From 187ca84b4b02d8b8a4f2117f1c6b209ef1ee90fc Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 3 Aug 2016 12:04:13 +0800 Subject: KVM: lapic: don't recalculate apic map table twice when enabling LAPIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The APIC map table is recalculated when the APIC ID is reset to its initial value on enabling the LAPIC. This patch moves recalculate_apic_map() into the else branch, since there is no need to recalculate the APIC map twice in the current code.
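To make the failure mode in the first patch of this group ("KVM: PPC: Book3S: Don't crash if irqfd used with no in-kernel XICS emulation") concrete, here is a minimal, hypothetical userspace sequence; the GSI value is arbitrary, error handling is omitted, and a real reproducer may also need to install GSI routing first:

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
    int kvm = open("/dev/kvm", O_RDWR);
    int vm  = ioctl(kvm, KVM_CREATE_VM, 0);  /* pseries VM, but no in-kernel
                                                XICS device is ever created */
    int efd = eventfd(0, 0);

    struct kvm_irqfd irqfd = {
        .fd  = efd,
        .gsi = 16,                           /* arbitrary example GSI */
    };
    ioctl(vm, KVM_IRQFD, &irqfd);

    /* Signalling the eventfd routes into kvm_set_irq(); before the fix
     * above this dereferenced the NULL kvm->arch.xics pointer. */
    uint64_t one = 1;
    write(efd, &one, sizeof(one));
    return 0;
}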
Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b62c852..23b99f3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1761,9 +1761,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if (value & MSR_IA32_APICBASE_ENABLE) { kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); static_key_slow_dec_deferred(&apic_hw_disabled); - } else + } else { static_key_slow_inc(&apic_hw_disabled.key); - recalculate_apic_map(vcpu->kvm); + recalculate_apic_map(vcpu->kvm); + } } if ((old_value ^ value) & X2APIC_ENABLE) { -- cgit v0.10.2 From c4f138b4514ba77837d5f6e8ffd2cbc143997984 Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Tue, 2 Aug 2016 16:32:37 -0400 Subject: mmu: don't pass *kvm to spte_write_protect and spte_*_dirty That parameter isn't used in these functions, it's probably a historical artifact. Signed-off-by: Bandan Das Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3d4cc8cc..d9c7e98 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1207,7 +1207,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) * * Return true if tlb need be flushed. */ -static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect) +static bool spte_write_protect(u64 *sptep, bool pt_protect) { u64 spte = *sptep; @@ -1233,12 +1233,12 @@ static bool __rmap_write_protect(struct kvm *kvm, bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_write_protect(kvm, sptep, pt_protect); + flush |= spte_write_protect(sptep, pt_protect); return flush; } -static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep) +static bool spte_clear_dirty(u64 *sptep) { u64 spte = *sptep; @@ -1256,12 +1256,12 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_clear_dirty(kvm, sptep); + flush |= spte_clear_dirty(sptep); return flush; } -static bool spte_set_dirty(struct kvm *kvm, u64 *sptep) +static bool spte_set_dirty(u64 *sptep) { u64 spte = *sptep; @@ -1279,7 +1279,7 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_set_dirty(kvm, sptep); + flush |= spte_set_dirty(sptep); return flush; } -- cgit v0.10.2 From 8e3562f6f8d28804b5499bf3cff520598aa25323 Mon Sep 17 00:00:00 2001 From: Luwei Kang Date: Tue, 2 Aug 2016 16:01:18 +0800 Subject: KVM: x86: Expose more Intel AVX512 feature to guest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expose AVX512DQ, AVX512BW, AVX512VL feature to guest. Its spec can be found at: https://software.intel.com/sites/default/files/managed/b4/3a/319433-024.pdf Signed-off-by: Luwei Kang [Resolved a trivial conflict with removed F(PCOMMIT).] 
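Inside the guest, the newly exposed bits show up in CPUID leaf 7, subleaf 0, EBX; a small sketch for checking them (bit positions 17, 30 and 31 are taken from the Intel documentation linked above, not from this patch):

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;

    __cpuid_count(7, 0, eax, ebx, ecx, edx);
    printf("AVX512DQ: %u\n", (ebx >> 17) & 1);
    printf("AVX512BW: %u\n", (ebx >> 30) & 1);
    printf("AVX512VL: %u\n", (ebx >> 31) & 1);
    return 0;
}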
Signed-off-by: Radim Krčmář diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 3235e0f..afa7bbb 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -366,7 +366,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | - F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB); + F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | + F(AVX512BW) | F(AVX512VL); /* cpuid 0xD.1.eax */ const u32 kvm_cpuid_D_1_eax_x86_features = -- cgit v0.10.2 From 4b3d173d0440d37534906b6d93c02dfb577c68ce Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 18 Aug 2016 16:04:41 +1000 Subject: KVM: PPC: Always select KVM_VFIO, plus Makefile cleanup As discussed recently on the kvm mailing list, David Gibson's intention in commit 178a78750212 ("vfio: Enable VFIO device for powerpc", 2016-02-01) was to have the KVM VFIO device built in on all powerpc platforms. This patch adds the "select KVM_VFIO" statement that makes this happen. Currently, arch/powerpc/kvm/Makefile doesn't include vfio.o for the 64-bit kvm module, because the list of objects doesn't use the $(common-objs-y) list. The reason it doesn't is because we don't necessarily want coalesced_mmio.o or emulate.o (for example if HV KVM is the only target), and common-objs-y includes both. Since this is confusing, this patch adjusts the definitions so that we now use $(common-objs-y) in the list for the 64-bit kvm.ko module, emulate.o is removed from common-objs-y and added in the places that need it, and the inclusion of coalesced_mmio.o now depends on CONFIG_KVM_MMIO. Reviewed-by: Paolo Bonzini Reviewed-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index c2024ac..c0a2478 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -22,6 +22,7 @@ config KVM select ANON_INODES select HAVE_KVM_EVENTFD select SRCU + select KVM_VFIO config KVM_BOOK3S_HANDLER bool diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 1f9e552..bd9b82f 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -7,16 +7,16 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm KVM := ../../../virt/kvm -common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ - $(KVM)/eventfd.o +common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o +common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o CFLAGS_e500_mmu.o := -I. CFLAGS_e500_mmu_host.o := -I. CFLAGS_emulate.o := -I. CFLAGS_emulate_loadstore.o := -I. 
-common-objs-y += powerpc.o emulate.o emulate_loadstore.o +common-objs-y += powerpc.o emulate_loadstore.o obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o @@ -24,6 +24,7 @@ AFLAGS_booke_interrupts.o := -I$(objtree)/$(obj) kvm-e500-objs := \ $(common-objs-y) \ + emulate.o \ booke.o \ booke_emulate.o \ booke_interrupts.o \ @@ -35,6 +36,7 @@ kvm-objs-$(CONFIG_KVM_E500V2) := $(kvm-e500-objs) kvm-e500mc-objs := \ $(common-objs-y) \ + emulate.o \ booke.o \ booke_emulate.o \ bookehv_interrupts.o \ @@ -61,9 +63,6 @@ kvm-pr-y := \ book3s_32_mmu.o ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE -kvm-book3s_64-module-objs := \ - $(KVM)/coalesced_mmio.o - kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ book3s_rmhandlers.o endif @@ -88,11 +87,8 @@ endif kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ book3s_xics.o -kvm-book3s_64-module-objs += \ - $(KVM)/kvm_main.o \ - $(KVM)/eventfd.o \ - powerpc.o \ - emulate_loadstore.o \ +kvm-book3s_64-module-objs := \ + $(common-objs-y) \ book3s.o \ book3s_64_vio.o \ book3s_rtas.o \ @@ -102,6 +98,7 @@ kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs) kvm-book3s_32-objs := \ $(common-objs-y) \ + emulate.o \ fpu.o \ book3s_paired_singles.o \ book3s.o \ -- cgit v0.10.2 From 3928aa3f5775fc4e40117077e97d73d8526039c9 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 23 Aug 2016 13:52:32 -0500 Subject: iommu/amd: Detect and enable guest vAPIC support This patch introduces a new IOMMU driver parameter, amd_iommu_guest_ir, which can be used to specify different interrupt remapping mode for passthrough devices to VM guest: * legacy: Legacy interrupt remapping (w/ 32-bit IRTE) * vapic : Guest vAPIC interrupt remapping (w/ GA mode 128-bit IRTE) Note that in vapic mode, it can also supports legacy interrupt remapping for non-passthrough devices with the 128-bit IRTE. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 46c030a..748ef7b 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -460,6 +460,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. driver will print ACPI tables for AMD IOMMU during IOMMU initialization. + amd_iommu_intr= [HW,X86-64] + Specifies one of the following AMD IOMMU interrupt + remapping modes: + legacy - Use legacy interrupt remapping mode. + vapic - Use virtual APIC mode, which allows IOMMU + to inject interrupts directly into guest. + This mode requires kvm-amd.avic=1. + (Default when IOMMU HW support is present.) 
+ amijoy.map= [HW,JOY] Amiga joystick support Map of devices attached to JOY0DAT and JOY1DAT Format: , diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 59741ea..c3afd86 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -145,6 +145,8 @@ struct ivmd_header { bool amd_iommu_dump; bool amd_iommu_irq_remap __read_mostly; +int amd_iommu_guest_ir; + static bool amd_iommu_detected; static bool __initdata amd_iommu_disabled; static int amd_iommu_target_ivhd_type; @@ -1258,6 +1260,8 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) iommu->mmio_phys_end = MMIO_REG_END_OFFSET; else iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET; + if (((h->efr_attr & (0x1 << IOMMU_FEAT_GASUP_SHIFT)) == 0)) + amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY; break; case 0x11: case 0x40: @@ -1265,6 +1269,8 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) iommu->mmio_phys_end = MMIO_REG_END_OFFSET; else iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET; + if (((h->efr_reg & (0x1 << IOMMU_EFR_GASUP_SHIFT)) == 0)) + amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY; break; default: return -EINVAL; @@ -1488,6 +1494,14 @@ static int iommu_init_pci(struct amd_iommu *iommu) if (iommu_feature(iommu, FEATURE_PPR) && alloc_ppr_log(iommu)) return -ENOMEM; + /* Note: We have already checked GASup from IVRS table. + * Now, we need to make sure that GAMSup is set. + */ + if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) && + !iommu_feature(iommu, FEATURE_GAM_VAPIC)) + amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY_GA; + + if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE)) amd_iommu_np_cache = true; @@ -1545,16 +1559,24 @@ static void print_iommu_info(void) dev_name(&iommu->dev->dev), iommu->cap_ptr); if (iommu->cap & (1 << IOMMU_CAP_EFR)) { - pr_info("AMD-Vi: Extended features: "); + pr_info("AMD-Vi: Extended features (%#llx):\n", + iommu->features); for (i = 0; i < ARRAY_SIZE(feat_str); ++i) { if (iommu_feature(iommu, (1ULL << i))) pr_cont(" %s", feat_str[i]); } + + if (iommu->features & FEATURE_GAM_VAPIC) + pr_cont(" GA_vAPIC"); + pr_cont("\n"); } } - if (irq_remapping_enabled) + if (irq_remapping_enabled) { pr_info("AMD-Vi: Interrupt remapping enabled\n"); + if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) + pr_info("AMD-Vi: virtual APIC enabled\n"); + } } static int __init amd_iommu_init_pci(void) @@ -1862,6 +1884,22 @@ static void iommu_apply_resume_quirks(struct amd_iommu *iommu) iommu->stored_addr_lo | 1); } +static void iommu_enable_ga(struct amd_iommu *iommu) +{ +#ifdef CONFIG_IRQ_REMAP + switch (amd_iommu_guest_ir) { + case AMD_IOMMU_GUEST_IR_VAPIC: + iommu_feature_enable(iommu, CONTROL_GAM_EN); + /* Fall through */ + case AMD_IOMMU_GUEST_IR_LEGACY_GA: + iommu_feature_enable(iommu, CONTROL_GA_EN); + break; + default: + break; + } +#endif +} + /* * This function finally enables all IOMMUs found in the system after * they have been initialized @@ -1877,6 +1915,7 @@ static void early_enable_iommus(void) iommu_enable_command_buffer(iommu); iommu_enable_event_buffer(iommu); iommu_set_exclusion_range(iommu); + iommu_enable_ga(iommu); iommu_enable(iommu); iommu_flush_all_caches(iommu); } @@ -2059,7 +2098,7 @@ static int __init early_amd_iommu_init(void) struct acpi_table_header *ivrs_base; acpi_size ivrs_size; acpi_status status; - int i, ret = 0; + int i, remap_cache_sz, ret = 0; if (!amd_iommu_detected) return -ENODEV; @@ -2157,10 +2196,14 @@ static int __init early_amd_iommu_init(void) * remapping tables. 
*/ ret = -ENOMEM; + if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) + remap_cache_sz = MAX_IRQS_PER_TABLE * sizeof(u32); + else + remap_cache_sz = MAX_IRQS_PER_TABLE * (sizeof(u64) * 2); amd_iommu_irq_cache = kmem_cache_create("irq_remap_cache", - MAX_IRQS_PER_TABLE * sizeof(u32), - IRQ_TABLE_ALIGNMENT, - 0, NULL); + remap_cache_sz, + IRQ_TABLE_ALIGNMENT, + 0, NULL); if (!amd_iommu_irq_cache) goto out; @@ -2413,6 +2456,21 @@ static int __init parse_amd_iommu_dump(char *str) return 1; } +static int __init parse_amd_iommu_intr(char *str) +{ + for (; *str; ++str) { + if (strncmp(str, "legacy", 6) == 0) { + amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY; + break; + } + if (strncmp(str, "vapic", 5) == 0) { + amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_VAPIC; + break; + } + } + return 1; +} + static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { @@ -2521,6 +2579,7 @@ static int __init parse_ivrs_acpihid(char *str) __setup("amd_iommu_dump", parse_amd_iommu_dump); __setup("amd_iommu=", parse_amd_iommu_options); +__setup("amd_iommu_intr=", parse_amd_iommu_intr); __setup("ivrs_ioapic", parse_ivrs_ioapic); __setup("ivrs_hpet", parse_ivrs_hpet); __setup("ivrs_acpihid", parse_ivrs_acpihid); diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h index 0bd9eb3..faa3b48 100644 --- a/drivers/iommu/amd_iommu_proto.h +++ b/drivers/iommu/amd_iommu_proto.h @@ -38,6 +38,7 @@ extern int amd_iommu_enable(void); extern void amd_iommu_disable(void); extern int amd_iommu_reenable(int); extern int amd_iommu_enable_faulting(void); +extern int amd_iommu_guest_ir; /* IOMMUv2 specific functions */ struct iommu_domain; diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index caf5e38..3d1fc37 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -92,6 +92,7 @@ #define FEATURE_GA (1ULL<<7) #define FEATURE_HE (1ULL<<8) #define FEATURE_PC (1ULL<<9) +#define FEATURE_GAM_VAPIC (1ULL<<21) #define FEATURE_PASID_SHIFT 32 #define FEATURE_PASID_MASK (0x1fULL << FEATURE_PASID_SHIFT) @@ -146,6 +147,8 @@ #define CONTROL_PPFINT_EN 0x0eULL #define CONTROL_PPR_EN 0x0fULL #define CONTROL_GT_EN 0x10ULL +#define CONTROL_GA_EN 0x11ULL +#define CONTROL_GAM_EN 0x19ULL #define CTRL_INV_TO_MASK (7 << CONTROL_INV_TIMEOUT) #define CTRL_INV_TO_NONE 0 @@ -329,6 +332,12 @@ #define IOMMU_CAP_NPCACHE 26 #define IOMMU_CAP_EFR 27 +/* IOMMU Feature Reporting Field (for IVHD type 10h */ +#define IOMMU_FEAT_GASUP_SHIFT 6 + +/* IOMMU Extended Feature Register (EFR) */ +#define IOMMU_EFR_GASUP_SHIFT 7 + #define MAX_DOMAIN_ID 65536 /* Protection domain flags */ @@ -681,4 +690,19 @@ static inline int get_hpet_devid(int id) return -EINVAL; } +enum amd_iommu_intr_mode_type { + AMD_IOMMU_GUEST_IR_LEGACY, + + /* This mode is not visible to users. It is used when + * we cannot fully enable vAPIC and fallback to only support + * legacy interrupt remapping via 128-bit IRTE. + */ + AMD_IOMMU_GUEST_IR_LEGACY_GA, + AMD_IOMMU_GUEST_IR_VAPIC, +}; + +#define AMD_IOMMU_GUEST_IR_GA(x) (x == AMD_IOMMU_GUEST_IR_VAPIC || \ + x == AMD_IOMMU_GUEST_IR_LEGACY_GA) + +#define AMD_IOMMU_GUEST_IR_VAPIC(x) (x == AMD_IOMMU_GUEST_IR_VAPIC) #endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ -- cgit v0.10.2 From a38180bd366f9912a08f52dd6f9a843bf0107d5f Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 23 Aug 2016 13:52:33 -0500 Subject: iommu/amd: Move and introduce new IRTE-related unions and structures Move existing unions and structs for accessing/managing IRTE to a proper header file. 
This is mainly to simplify variable declarations in subsequent patches. Besides, this patch also introduces new struct irte_ga for the new 128-bit IRTE format. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 96de97a..c262a82 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3511,34 +3511,6 @@ EXPORT_SYMBOL(amd_iommu_device_info); * *****************************************************************************/ -union irte { - u32 val; - struct { - u32 valid : 1, - no_fault : 1, - int_type : 3, - rq_eoi : 1, - dm : 1, - rsvd_1 : 1, - destination : 8, - vector : 8, - rsvd_2 : 8; - } fields; -}; - -struct irq_2_irte { - u16 devid; /* Device ID for IRTE table */ - u16 index; /* Index into IRTE table*/ -}; - -struct amd_ir_data { - struct irq_2_irte irq_2_irte; - union irte irte_entry; - union { - struct msi_msg msi_entry; - }; -}; - static struct irq_chip amd_ir_chip; #define DTE_IRQ_PHYS_ADDR_MASK (((1ULL << 45)-1) << 6) diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 3d1fc37..8b61289 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -705,4 +706,79 @@ enum amd_iommu_intr_mode_type { x == AMD_IOMMU_GUEST_IR_LEGACY_GA) #define AMD_IOMMU_GUEST_IR_VAPIC(x) (x == AMD_IOMMU_GUEST_IR_VAPIC) + +union irte { + u32 val; + struct { + u32 valid : 1, + no_fault : 1, + int_type : 3, + rq_eoi : 1, + dm : 1, + rsvd_1 : 1, + destination : 8, + vector : 8, + rsvd_2 : 8; + } fields; +}; + +union irte_ga_lo { + u64 val; + + /* For int remapping */ + struct { + u64 valid : 1, + no_fault : 1, + /* ------ */ + int_type : 3, + rq_eoi : 1, + dm : 1, + /* ------ */ + guest_mode : 1, + destination : 8, + rsvd : 48; + } fields_remap; + + /* For guest vAPIC */ + struct { + u64 valid : 1, + no_fault : 1, + /* ------ */ + ga_log_intr : 1, + rsvd1 : 3, + is_run : 1, + /* ------ */ + guest_mode : 1, + destination : 8, + rsvd2 : 16, + ga_tag : 32; + } fields_vapic; +}; + +union irte_ga_hi { + u64 val; + struct { + u64 vector : 8, + rsvd_1 : 4, + ga_root_ptr : 40, + rsvd_2 : 12; + } fields; +}; + +struct irte_ga { + union irte_ga_lo lo; + union irte_ga_hi hi; +}; + +struct irq_2_irte { + u16 devid; /* Device ID for IRTE table */ + u16 index; /* Index into IRTE table*/ +}; + +struct amd_ir_data { + struct irq_2_irte irq_2_irte; + union irte irte_entry; + struct msi_msg msi_entry; +}; + #endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ -- cgit v0.10.2 From 880ac60e2538337f84d9f76ab7b3c13ee7787804 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 23 Aug 2016 13:52:34 -0500 Subject: iommu/amd: Introduce interrupt remapping ops structure Currently, IOMMU support two interrupt remapping table entry formats, 32-bit (legacy) and 128-bit (GA). The spec also implies that it might support additional modes/formats in the future. So, this patch introduces the new struct amd_irte_ops, which allows the same code to work with different irte formats by providing hooks for various operations on an interrupt remapping table entry. 
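The pattern being introduced is a per-format ops table: the function pointers are selected once per IOMMU, and every call site stays format-agnostic. Below is a self-contained sketch of the idea, not the driver's code; the vector placement merely mimics the unions from the previous patch (bits 16-23 of the 32-bit entry, low byte of the high word of the 128-bit entry):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct fmt_ops {
    size_t entry_size;
    void (*prepare)(void *entry, uint8_t vector);
};

static void prepare_legacy(void *e, uint8_t v) { *(uint32_t *)e |= (uint32_t)v << 16; }
static void prepare_ga(void *e, uint8_t v)     { ((uint64_t *)e)[1] |= v; }

static const struct fmt_ops legacy_ops = { sizeof(uint32_t),     prepare_legacy };
static const struct fmt_ops ga_ops     = { 2 * sizeof(uint64_t), prepare_ga };

int main(void)
{
    /* Chosen once, e.g. from hardware capabilities, much like iommu->irte_ops. */
    const struct fmt_ops *ops = &ga_ops;
    uint64_t entry[2] = { 0, 0 };

    ops->prepare(entry, 0x31);
    printf("entry is %zu bytes, high word %#llx\n",
           ops->entry_size, (unsigned long long)entry[1]);
    return 0;
}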
Suggested-by: Joerg Roedel Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index c262a82..baf2c2c 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3644,11 +3644,12 @@ out: return index; } -static int modify_irte(u16 devid, int index, union irte irte) +static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte) { struct irq_remap_table *table; struct amd_iommu *iommu; unsigned long flags; + struct irte_ga *entry; iommu = amd_iommu_rlookup_table[devid]; if (iommu == NULL) @@ -3659,7 +3660,38 @@ static int modify_irte(u16 devid, int index, union irte irte) return -ENOMEM; spin_lock_irqsave(&table->lock, flags); - table->table[index] = irte.val; + + entry = (struct irte_ga *)table->table; + entry = &entry[index]; + entry->lo.fields_remap.valid = 0; + entry->hi.val = irte->hi.val; + entry->lo.val = irte->lo.val; + entry->lo.fields_remap.valid = 1; + + spin_unlock_irqrestore(&table->lock, flags); + + iommu_flush_irt(iommu, devid); + iommu_completion_wait(iommu); + + return 0; +} + +static int modify_irte(u16 devid, int index, union irte *irte) +{ + struct irq_remap_table *table; + struct amd_iommu *iommu; + unsigned long flags; + + iommu = amd_iommu_rlookup_table[devid]; + if (iommu == NULL) + return -EINVAL; + + table = get_irq_table(devid, false); + if (!table) + return -ENOMEM; + + spin_lock_irqsave(&table->lock, flags); + table->table[index] = irte->val; spin_unlock_irqrestore(&table->lock, flags); iommu_flush_irt(iommu, devid); @@ -3690,6 +3722,134 @@ static void free_irte(u16 devid, int index) iommu_completion_wait(iommu); } +static void irte_prepare(void *entry, + u32 delivery_mode, u32 dest_mode, + u8 vector, u32 dest_apicid) +{ + union irte *irte = (union irte *) entry; + + irte->val = 0; + irte->fields.vector = vector; + irte->fields.int_type = delivery_mode; + irte->fields.destination = dest_apicid; + irte->fields.dm = dest_mode; + irte->fields.valid = 1; +} + +static void irte_ga_prepare(void *entry, + u32 delivery_mode, u32 dest_mode, + u8 vector, u32 dest_apicid) +{ + struct irte_ga *irte = (struct irte_ga *) entry; + + irte->lo.val = 0; + irte->hi.val = 0; + irte->lo.fields_remap.guest_mode = 0; + irte->lo.fields_remap.int_type = delivery_mode; + irte->lo.fields_remap.dm = dest_mode; + irte->hi.fields.vector = vector; + irte->lo.fields_remap.destination = dest_apicid; + irte->lo.fields_remap.valid = 1; +} + +static void irte_activate(void *entry, u16 devid, u16 index) +{ + union irte *irte = (union irte *) entry; + + irte->fields.valid = 1; + modify_irte(devid, index, irte); +} + +static void irte_ga_activate(void *entry, u16 devid, u16 index) +{ + struct irte_ga *irte = (struct irte_ga *) entry; + + irte->lo.fields_remap.valid = 1; + modify_irte_ga(devid, index, irte); +} + +static void irte_deactivate(void *entry, u16 devid, u16 index) +{ + union irte *irte = (union irte *) entry; + + irte->fields.valid = 0; + modify_irte(devid, index, irte); +} + +static void irte_ga_deactivate(void *entry, u16 devid, u16 index) +{ + struct irte_ga *irte = (struct irte_ga *) entry; + + irte->lo.fields_remap.valid = 0; + modify_irte_ga(devid, index, irte); +} + +static void irte_set_affinity(void *entry, u16 devid, u16 index, + u8 vector, u32 dest_apicid) +{ + union irte *irte = (union irte *) entry; + + irte->fields.vector = vector; + irte->fields.destination = dest_apicid; + modify_irte(devid, index, irte); +} + +static void irte_ga_set_affinity(void *entry, u16 devid, u16 
index, + u8 vector, u32 dest_apicid) +{ + struct irte_ga *irte = (struct irte_ga *) entry; + + irte->hi.fields.vector = vector; + irte->lo.fields_remap.destination = dest_apicid; + irte->lo.fields_remap.guest_mode = 0; + modify_irte_ga(devid, index, irte); +} + +static void irte_set_allocated(struct irq_remap_table *table, int index) +{ + table->table[index] = IRTE_ALLOCATED; +} + +static void irte_ga_set_allocated(struct irq_remap_table *table, int index) +{ + struct irte_ga *ptr = (struct irte_ga *)table->table; + struct irte_ga *irte = &ptr[index]; + + memset(&irte->lo.val, 0, sizeof(u64)); + memset(&irte->hi.val, 0, sizeof(u64)); + irte->hi.fields.vector = 0xff; +} + +static bool irte_is_allocated(struct irq_remap_table *table, int index) +{ + union irte *ptr = (union irte *)table->table; + union irte *irte = &ptr[index]; + + return irte->val != 0; +} + +static bool irte_ga_is_allocated(struct irq_remap_table *table, int index) +{ + struct irte_ga *ptr = (struct irte_ga *)table->table; + struct irte_ga *irte = &ptr[index]; + + return irte->hi.fields.vector != 0; +} + +static void irte_clear_allocated(struct irq_remap_table *table, int index) +{ + table->table[index] = 0; +} + +static void irte_ga_clear_allocated(struct irq_remap_table *table, int index) +{ + struct irte_ga *ptr = (struct irte_ga *)table->table; + struct irte_ga *irte = &ptr[index]; + + memset(&irte->lo.val, 0, sizeof(u64)); + memset(&irte->hi.val, 0, sizeof(u64)); +} + static int get_devid(struct irq_alloc_info *info) { int devid = -1; @@ -3817,6 +3977,26 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, } } +struct amd_irte_ops irte_32_ops = { + .prepare = irte_prepare, + .activate = irte_activate, + .deactivate = irte_deactivate, + .set_affinity = irte_set_affinity, + .set_allocated = irte_set_allocated, + .is_allocated = irte_is_allocated, + .clear_allocated = irte_clear_allocated, +}; + +struct amd_irte_ops irte_128_ops = { + .prepare = irte_ga_prepare, + .activate = irte_ga_activate, + .deactivate = irte_ga_deactivate, + .set_affinity = irte_ga_set_affinity, + .set_allocated = irte_ga_set_allocated, + .is_allocated = irte_ga_is_allocated, + .clear_allocated = irte_ga_clear_allocated, +}; + static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { @@ -3922,7 +4102,7 @@ static void irq_remapping_activate(struct irq_domain *domain, struct amd_ir_data *data = irq_data->chip_data; struct irq_2_irte *irte_info = &data->irq_2_irte; - modify_irte(irte_info->devid, irte_info->index, data->irte_entry); + modify_irte(irte_info->devid, irte_info->index, &data->irte_entry); } static void irq_remapping_deactivate(struct irq_domain *domain, @@ -3933,7 +4113,7 @@ static void irq_remapping_deactivate(struct irq_domain *domain, union irte entry; entry.val = 0; - modify_irte(irte_info->devid, irte_info->index, data->irte_entry); + modify_irte(irte_info->devid, irte_info->index, &data->irte_entry); } static struct irq_domain_ops amd_ir_domain_ops = { @@ -3962,7 +4142,7 @@ static int amd_ir_set_affinity(struct irq_data *data, */ ir_data->irte_entry.fields.vector = cfg->vector; ir_data->irte_entry.fields.destination = cfg->dest_apicid; - modify_irte(irte_info->devid, irte_info->index, ir_data->irte_entry); + modify_irte(irte_info->devid, irte_info->index, &ir_data->irte_entry); /* * After this point, all the interrupts will start arriving diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 8b61289..6b93ebc 100644 --- 
a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -410,6 +410,7 @@ struct amd_iommu_fault { struct iommu_domain; struct irq_domain; +struct amd_irte_ops; /* * This structure contains generic data for IOMMU protection domains @@ -533,6 +534,8 @@ struct amd_iommu { #ifdef CONFIG_IRQ_REMAP struct irq_domain *ir_domain; struct irq_domain *msi_domain; + + struct amd_irte_ops *irte_ops; #endif }; @@ -779,6 +782,23 @@ struct amd_ir_data { struct irq_2_irte irq_2_irte; union irte irte_entry; struct msi_msg msi_entry; + void *entry; /* Pointer to union irte or struct irte_ga */ +}; + +struct amd_irte_ops { + void (*prepare)(void *, u32, u32, u8, u32); + void (*activate)(void *, u16, u16); + void (*deactivate)(void *, u16, u16); + void (*set_affinity)(void *, u16, u16, u8, u32); + void *(*get)(struct irq_remap_table *, int); + void (*set_allocated)(struct irq_remap_table *, int); + bool (*is_allocated)(struct irq_remap_table *, int); + void (*clear_allocated)(struct irq_remap_table *, int); }; +#ifdef CONFIG_IRQ_REMAP +extern struct amd_irte_ops irte_32_ops; +extern struct amd_irte_ops irte_128_ops; +#endif + #endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ -- cgit v0.10.2 From 77bdab46f04ffd93140c574f4fbd48ab521fdbe0 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 23 Aug 2016 13:52:35 -0500 Subject: iommu/amd: Add support for multiple IRTE formats This patch enables support for the new 128-bit IOMMU IRTE format, which can be used for both legacy and vapic interrupt remapping modes. It replaces the existing operations on IRTE, which can only support the older 32-bit IRTE format, with calls to the new struct amd_irt_ops. It also provides helper functions for setting up, accessing, and updating interrupt remapping table entries in different mode. 
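One practical consequence visible in the hunks below is the larger per-device table; a hedged back-of-the-envelope check (MAX_IRQS_PER_TABLE is assumed to be 256 here, which was the driver's value when this series was posted):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define MAX_IRQS_PER_TABLE 256  /* assumption, see note above */

int main(void)
{
    size_t legacy = MAX_IRQS_PER_TABLE * sizeof(uint32_t);        /* 32-bit IRTEs  */
    size_t ga     = MAX_IRQS_PER_TABLE * (sizeof(uint64_t) * 2);  /* 128-bit IRTEs */

    printf("legacy table: %zu bytes, GA table: %zu bytes\n", legacy, ga);
    return 0;
}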
Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index baf2c2c..99c0f4c 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3532,8 +3532,6 @@ static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table) amd_iommu_dev_table[devid].data[2] = dte; } -#define IRTE_ALLOCATED (~1U) - static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic) { struct irq_remap_table *table = NULL; @@ -3579,13 +3577,18 @@ static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic) goto out; } - memset(table->table, 0, MAX_IRQS_PER_TABLE * sizeof(u32)); + if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) + memset(table->table, 0, + MAX_IRQS_PER_TABLE * sizeof(u32)); + else + memset(table->table, 0, + (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2))); if (ioapic) { int i; for (i = 0; i < 32; ++i) - table->table[i] = IRTE_ALLOCATED; + iommu->irte_ops->set_allocated(table, i); } irq_lookup_table[devid] = table; @@ -3611,6 +3614,10 @@ static int alloc_irq_index(u16 devid, int count) struct irq_remap_table *table; unsigned long flags; int index, c; + struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; + + if (!iommu) + return -ENODEV; table = get_irq_table(devid, false); if (!table) @@ -3622,14 +3629,14 @@ static int alloc_irq_index(u16 devid, int count) for (c = 0, index = table->min_index; index < MAX_IRQS_PER_TABLE; ++index) { - if (table->table[index] == 0) + if (!iommu->irte_ops->is_allocated(table, index)) c += 1; else c = 0; if (c == count) { for (; c != 0; --c) - table->table[index - c + 1] = IRTE_ALLOCATED; + iommu->irte_ops->set_allocated(table, index - c + 1); index -= count - 1; goto out; @@ -3715,7 +3722,7 @@ static void free_irte(u16 devid, int index) return; spin_lock_irqsave(&table->lock, flags); - table->table[index] = 0; + iommu->irte_ops->clear_allocated(table, index); spin_unlock_irqrestore(&table->lock, flags); iommu_flush_irt(iommu, devid); @@ -3805,6 +3812,7 @@ static void irte_ga_set_affinity(void *entry, u16 devid, u16 index, modify_irte_ga(devid, index, irte); } +#define IRTE_ALLOCATED (~1U) static void irte_set_allocated(struct irq_remap_table *table, int index) { table->table[index] = IRTE_ALLOCATED; @@ -3934,19 +3942,17 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, { struct irq_2_irte *irte_info = &data->irq_2_irte; struct msi_msg *msg = &data->msi_entry; - union irte *irte = &data->irte_entry; struct IO_APIC_route_entry *entry; + struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; + + if (!iommu) + return; data->irq_2_irte.devid = devid; data->irq_2_irte.index = index + sub_handle; - - /* Setup IRTE for IOMMU */ - irte->val = 0; - irte->fields.vector = irq_cfg->vector; - irte->fields.int_type = apic->irq_delivery_mode; - irte->fields.destination = irq_cfg->dest_apicid; - irte->fields.dm = apic->irq_dest_mode; - irte->fields.valid = 1; + iommu->irte_ops->prepare(data->entry, apic->irq_delivery_mode, + apic->irq_dest_mode, irq_cfg->vector, + irq_cfg->dest_apicid); switch (info->type) { case X86_IRQ_ALLOC_TYPE_IOAPIC: @@ -4002,7 +4008,7 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, { struct irq_alloc_info *info = arg; struct irq_data *irq_data; - struct amd_ir_data *data; + struct amd_ir_data *data = NULL; struct irq_cfg *cfg; int i, ret, devid; int index = -1; @@ -4054,6 +4060,16 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, if (!data) goto out_free_data; + if 
(!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) + data->entry = kzalloc(sizeof(union irte), GFP_KERNEL); + else + data->entry = kzalloc(sizeof(struct irte_ga), + GFP_KERNEL); + if (!data->entry) { + kfree(data); + goto out_free_data; + } + irq_data->hwirq = (devid << 16) + i; irq_data->chip_data = data; irq_data->chip = &amd_ir_chip; @@ -4090,6 +4106,7 @@ static void irq_remapping_free(struct irq_domain *domain, unsigned int virq, data = irq_data->chip_data; irte_info = &data->irq_2_irte; free_irte(irte_info->devid, irte_info->index); + kfree(data->entry); kfree(data); } } @@ -4101,8 +4118,11 @@ static void irq_remapping_activate(struct irq_domain *domain, { struct amd_ir_data *data = irq_data->chip_data; struct irq_2_irte *irte_info = &data->irq_2_irte; + struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid]; - modify_irte(irte_info->devid, irte_info->index, &data->irte_entry); + if (iommu) + iommu->irte_ops->activate(data->entry, irte_info->devid, + irte_info->index); } static void irq_remapping_deactivate(struct irq_domain *domain, @@ -4110,10 +4130,11 @@ static void irq_remapping_deactivate(struct irq_domain *domain, { struct amd_ir_data *data = irq_data->chip_data; struct irq_2_irte *irte_info = &data->irq_2_irte; - union irte entry; + struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid]; - entry.val = 0; - modify_irte(irte_info->devid, irte_info->index, &data->irte_entry); + if (iommu) + iommu->irte_ops->deactivate(data->entry, irte_info->devid, + irte_info->index); } static struct irq_domain_ops amd_ir_domain_ops = { @@ -4130,8 +4151,12 @@ static int amd_ir_set_affinity(struct irq_data *data, struct irq_2_irte *irte_info = &ir_data->irq_2_irte; struct irq_cfg *cfg = irqd_cfg(data); struct irq_data *parent = data->parent_data; + struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid]; int ret; + if (!iommu) + return -ENODEV; + ret = parent->chip->irq_set_affinity(parent, mask, force); if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) return ret; @@ -4140,9 +4165,8 @@ static int amd_ir_set_affinity(struct irq_data *data, * Atomically updates the IRTE with the new destination, vector * and flushes the interrupt entry cache. 
*/ - ir_data->irte_entry.fields.vector = cfg->vector; - ir_data->irte_entry.fields.destination = cfg->dest_apicid; - modify_irte(irte_info->devid, irte_info->index, &ir_data->irte_entry); + iommu->irte_ops->set_affinity(ir_data->entry, irte_info->devid, + irte_info->index, cfg->vector, cfg->dest_apicid); /* * After this point, all the interrupts will start arriving diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index c3afd86..c17febb 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -1893,8 +1893,10 @@ static void iommu_enable_ga(struct amd_iommu *iommu) /* Fall through */ case AMD_IOMMU_GUEST_IR_LEGACY_GA: iommu_feature_enable(iommu, CONTROL_GA_EN); + iommu->irte_ops = &irte_128_ops; break; default: + iommu->irte_ops = &irte_32_ops; break; } #endif diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 6b93ebc..77a8fb6 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -780,7 +780,6 @@ struct irq_2_irte { struct amd_ir_data { struct irq_2_irte irq_2_irte; - union irte irte_entry; struct msi_msg msi_entry; void *entry; /* Pointer to union irte or struct irte_ga */ }; -- cgit v0.10.2 From 8bda0cfbdc1a6278a1bbdba795139da682f296ff Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 23 Aug 2016 13:52:36 -0500 Subject: iommu/amd: Detect and initialize guest vAPIC log This patch adds support to detect and initialize IOMMU Guest vAPIC log (GALOG). By default, it also enable GALog interrupt to notify IOMMU driver when GA Log entry is created. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index c17febb..156ab4b 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -84,6 +84,7 @@ #define ACPI_DEVFLAG_LINT1 0x80 #define ACPI_DEVFLAG_ATSDIS 0x10000000 +#define LOOP_TIMEOUT 100000 /* * ACPI table definitions * @@ -388,6 +389,10 @@ static void iommu_disable(struct amd_iommu *iommu) iommu_feature_disable(iommu, CONTROL_EVT_INT_EN); iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN); + /* Disable IOMMU GA_LOG */ + iommu_feature_disable(iommu, CONTROL_GALOG_EN); + iommu_feature_disable(iommu, CONTROL_GAINT_EN); + /* Disable IOMMU hardware itself */ iommu_feature_disable(iommu, CONTROL_IOMMU_EN); } @@ -673,6 +678,99 @@ static void __init free_ppr_log(struct amd_iommu *iommu) free_pages((unsigned long)iommu->ppr_log, get_order(PPR_LOG_SIZE)); } +static void free_ga_log(struct amd_iommu *iommu) +{ +#ifdef CONFIG_IRQ_REMAP + if (iommu->ga_log) + free_pages((unsigned long)iommu->ga_log, + get_order(GA_LOG_SIZE)); + if (iommu->ga_log_tail) + free_pages((unsigned long)iommu->ga_log_tail, + get_order(8)); +#endif +} + +static int iommu_ga_log_enable(struct amd_iommu *iommu) +{ +#ifdef CONFIG_IRQ_REMAP + u32 status, i; + + if (!iommu->ga_log) + return -EINVAL; + + status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); + + /* Check if already running */ + if (status & (MMIO_STATUS_GALOG_RUN_MASK)) + return 0; + + iommu_feature_enable(iommu, CONTROL_GAINT_EN); + iommu_feature_enable(iommu, CONTROL_GALOG_EN); + + for (i = 0; i < LOOP_TIMEOUT; ++i) { + status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); + if (status & (MMIO_STATUS_GALOG_RUN_MASK)) + break; + } + + if (i >= LOOP_TIMEOUT) + return -EINVAL; +#endif /* CONFIG_IRQ_REMAP */ + return 0; +} + +#ifdef CONFIG_IRQ_REMAP +static int iommu_init_ga_log(struct amd_iommu *iommu) +{ + u64 entry; + + if 
(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) + return 0; + + iommu->ga_log = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(GA_LOG_SIZE)); + if (!iommu->ga_log) + goto err_out; + + iommu->ga_log_tail = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(8)); + if (!iommu->ga_log_tail) + goto err_out; + + entry = (u64)virt_to_phys(iommu->ga_log) | GA_LOG_SIZE_512; + memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_BASE_OFFSET, + &entry, sizeof(entry)); + entry = ((u64)virt_to_phys(iommu->ga_log) & 0xFFFFFFFFFFFFFULL) & ~7ULL; + memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_TAIL_OFFSET, + &entry, sizeof(entry)); + writel(0x00, iommu->mmio_base + MMIO_GA_HEAD_OFFSET); + writel(0x00, iommu->mmio_base + MMIO_GA_TAIL_OFFSET); + + return 0; +err_out: + free_ga_log(iommu); + return -EINVAL; +} +#endif /* CONFIG_IRQ_REMAP */ + +static int iommu_init_ga(struct amd_iommu *iommu) +{ + int ret = 0; + +#ifdef CONFIG_IRQ_REMAP + /* Note: We have already checked GASup from IVRS table. + * Now, we need to make sure that GAMSup is set. + */ + if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) && + !iommu_feature(iommu, FEATURE_GAM_VAPIC)) + amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY_GA; + + ret = iommu_init_ga_log(iommu); +#endif /* CONFIG_IRQ_REMAP */ + + return ret; +} + static void iommu_enable_gt(struct amd_iommu *iommu) { if (!iommu_feature(iommu, FEATURE_GT)) @@ -1146,6 +1244,7 @@ static void __init free_iommu_one(struct amd_iommu *iommu) free_command_buffer(iommu); free_event_buffer(iommu); free_ppr_log(iommu); + free_ga_log(iommu); iommu_unmap_mmio_space(iommu); } @@ -1438,6 +1537,7 @@ static int iommu_init_pci(struct amd_iommu *iommu) { int cap_ptr = iommu->cap_ptr; u32 range, misc, low, high; + int ret; iommu->dev = pci_get_bus_and_slot(PCI_BUS_NUM(iommu->devid), iommu->devid & 0xff); @@ -1494,13 +1594,9 @@ static int iommu_init_pci(struct amd_iommu *iommu) if (iommu_feature(iommu, FEATURE_PPR) && alloc_ppr_log(iommu)) return -ENOMEM; - /* Note: We have already checked GASup from IVRS table. - * Now, we need to make sure that GAMSup is set. 
- */ - if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) && - !iommu_feature(iommu, FEATURE_GAM_VAPIC)) - amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY_GA; - + ret = iommu_init_ga(iommu); + if (ret) + return ret; if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE)) amd_iommu_np_cache = true; @@ -1667,6 +1763,8 @@ enable_faults: if (iommu->ppr_log != NULL) iommu_feature_enable(iommu, CONTROL_PPFINT_EN); + iommu_ga_log_enable(iommu); + return 0; } diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 77a8fb6..8a7a93d 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -70,6 +70,8 @@ #define MMIO_EXCL_LIMIT_OFFSET 0x0028 #define MMIO_EXT_FEATURES 0x0030 #define MMIO_PPR_LOG_OFFSET 0x0038 +#define MMIO_GA_LOG_BASE_OFFSET 0x00e0 +#define MMIO_GA_LOG_TAIL_OFFSET 0x00e8 #define MMIO_CMD_HEAD_OFFSET 0x2000 #define MMIO_CMD_TAIL_OFFSET 0x2008 #define MMIO_EVT_HEAD_OFFSET 0x2010 @@ -77,6 +79,8 @@ #define MMIO_STATUS_OFFSET 0x2020 #define MMIO_PPR_HEAD_OFFSET 0x2030 #define MMIO_PPR_TAIL_OFFSET 0x2038 +#define MMIO_GA_HEAD_OFFSET 0x2040 +#define MMIO_GA_TAIL_OFFSET 0x2048 #define MMIO_CNTR_CONF_OFFSET 0x4000 #define MMIO_CNTR_REG_OFFSET 0x40000 #define MMIO_REG_END_OFFSET 0x80000 @@ -112,6 +116,9 @@ #define MMIO_STATUS_EVT_INT_MASK (1 << 1) #define MMIO_STATUS_COM_WAIT_INT_MASK (1 << 2) #define MMIO_STATUS_PPR_INT_MASK (1 << 6) +#define MMIO_STATUS_GALOG_RUN_MASK (1 << 8) +#define MMIO_STATUS_GALOG_OVERFLOW_MASK (1 << 9) +#define MMIO_STATUS_GALOG_INT_MASK (1 << 10) /* event logging constants */ #define EVENT_ENTRY_SIZE 0x10 @@ -150,6 +157,8 @@ #define CONTROL_GT_EN 0x10ULL #define CONTROL_GA_EN 0x11ULL #define CONTROL_GAM_EN 0x19ULL +#define CONTROL_GALOG_EN 0x1CULL +#define CONTROL_GAINT_EN 0x1DULL #define CTRL_INV_TO_MASK (7 << CONTROL_INV_TIMEOUT) #define CTRL_INV_TO_NONE 0 @@ -228,6 +237,19 @@ #define PPR_REQ_FAULT 0x01 +/* Constants for GA Log handling */ +#define GA_LOG_ENTRIES 512 +#define GA_LOG_SIZE_SHIFT 56 +#define GA_LOG_SIZE_512 (0x8ULL << GA_LOG_SIZE_SHIFT) +#define GA_ENTRY_SIZE 8 +#define GA_LOG_SIZE (GA_ENTRY_SIZE * GA_LOG_ENTRIES) + +#define GA_TAG(x) (u32)(x & 0xffffffffULL) +#define GA_DEVID(x) (u16)(((x) >> 32) & 0xffffULL) +#define GA_REQ_TYPE(x) (((x) >> 60) & 0xfULL) + +#define GA_GUEST_NR 0x1 + #define PAGE_MODE_NONE 0x00 #define PAGE_MODE_1_LEVEL 0x01 #define PAGE_MODE_2_LEVEL 0x02 @@ -501,6 +523,12 @@ struct amd_iommu { /* Base of the PPR log, if present */ u8 *ppr_log; + /* Base of the GA log, if present */ + u8 *ga_log; + + /* Tail of the GA log, if present */ + u8 *ga_log_tail; + /* true if interrupts for this IOMMU are already enabled */ bool int_enabled; -- cgit v0.10.2 From bd6fcefc66f6d038406e38edf96a95d9842f819d Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 23 Aug 2016 13:52:37 -0500 Subject: iommu/amd: Adding GALOG interrupt handler This patch adds AMD IOMMU guest virtual APIC log (GALOG) handler. When IOMMU hardware receives an interrupt targeting a blocking vcpu, it creates an entry in the GALOG, and generates an interrupt to notify the AMD IOMMU driver. At this point, the driver processes the log entry, and notify the SVM driver via the registered iommu_ga_log_notifier function. 
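A sketch of how a consumer (the SVM side, in later patches) is expected to hook the GA log; only amd_iommu_register_ga_log_notifier() and its int (*)(u32) callback signature come from this patch, while the callback name and body are hypothetical:

#include <linux/types.h>
#include <linux/init.h>
#include <linux/printk.h>
#include <linux/amd-iommu.h>

static int example_ga_log_notifier(u32 ga_tag)
{
    /* Look up whatever the ga_tag encodes (e.g. a blocked vCPU) and wake
     * it so the pending posted interrupt gets handled. */
    pr_debug("GA log event, ga_tag=%#x\n", ga_tag);
    return 0;
}

static int __init example_consumer_init(void)
{
    return amd_iommu_register_ga_log_notifier(example_ga_log_notifier);
}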
Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 99c0f4c..eec37b2 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -707,14 +707,74 @@ static void iommu_poll_ppr_log(struct amd_iommu *iommu) } } +#ifdef CONFIG_IRQ_REMAP +static int (*iommu_ga_log_notifier)(u32); + +int amd_iommu_register_ga_log_notifier(int (*notifier)(u32)) +{ + iommu_ga_log_notifier = notifier; + + return 0; +} +EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier); + +static void iommu_poll_ga_log(struct amd_iommu *iommu) +{ + u32 head, tail, cnt = 0; + + if (iommu->ga_log == NULL) + return; + + head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET); + tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET); + + while (head != tail) { + volatile u64 *raw; + u64 log_entry; + + raw = (u64 *)(iommu->ga_log + head); + cnt++; + + /* Avoid memcpy function-call overhead */ + log_entry = *raw; + + /* Update head pointer of hardware ring-buffer */ + head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE; + writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET); + + /* Handle GA entry */ + switch (GA_REQ_TYPE(log_entry)) { + case GA_GUEST_NR: + if (!iommu_ga_log_notifier) + break; + + pr_debug("AMD-Vi: %s: devid=%#x, ga_tag=%#x\n", + __func__, GA_DEVID(log_entry), + GA_TAG(log_entry)); + + if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0) + pr_err("AMD-Vi: GA log notifier failed.\n"); + break; + default: + break; + } + } +} +#endif /* CONFIG_IRQ_REMAP */ + +#define AMD_IOMMU_INT_MASK \ + (MMIO_STATUS_EVT_INT_MASK | \ + MMIO_STATUS_PPR_INT_MASK | \ + MMIO_STATUS_GALOG_INT_MASK) + irqreturn_t amd_iommu_int_thread(int irq, void *data) { struct amd_iommu *iommu = (struct amd_iommu *) data; u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); - while (status & (MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK)) { - /* Enable EVT and PPR interrupts again */ - writel((MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK), + while (status & AMD_IOMMU_INT_MASK) { + /* Enable EVT and PPR and GA interrupts again */ + writel(AMD_IOMMU_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET); if (status & MMIO_STATUS_EVT_INT_MASK) { @@ -727,6 +787,13 @@ irqreturn_t amd_iommu_int_thread(int irq, void *data) iommu_poll_ppr_log(iommu); } +#ifdef CONFIG_IRQ_REMAP + if (status & MMIO_STATUS_GALOG_INT_MASK) { + pr_devel("AMD-Vi: Processing IOMMU GA Log\n"); + iommu_poll_ga_log(iommu); + } +#endif + /* * Hardware bug: ERBT1312 * When re-enabling interrupt (by writing 1 diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 2b08e79..465d096 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -168,11 +168,25 @@ typedef void (*amd_iommu_invalidate_ctx)(struct pci_dev *pdev, int pasid); extern int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev, amd_iommu_invalidate_ctx cb); - -#else +#else /* CONFIG_AMD_IOMMU */ static inline int amd_iommu_detect(void) { return -ENODEV; } -#endif +#endif /* CONFIG_AMD_IOMMU */ + +#if defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) + +/* IOMMU AVIC Function */ +extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32)); + +#else /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */ + +static inline int +amd_iommu_register_ga_log_notifier(int (*notifier)(u32)) +{ + return 0; +} + +#endif /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */ #endif /* _ASM_X86_AMD_IOMMU_H */ -- cgit v0.10.2 From 8dbea3fd7becd4af8ca882c3132be4b1a857e301 Mon Sep 17 00:00:00 
2001 From: Suravee Suthikulpanit Date: Tue, 23 Aug 2016 13:52:38 -0500 Subject: iommu/amd: Introduce amd_iommu_update_ga() Introduces a new IOMMU API, amd_iommu_update_ga(), which allows KVM (SVM) to update existing posted interrupt IOMMU IRTE when load/unload vcpu. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index eec37b2..089e1ed 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -4269,4 +4269,43 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu) return 0; } + +int amd_iommu_update_ga(int cpu, bool is_run, void *data) +{ + unsigned long flags; + struct amd_iommu *iommu; + struct irq_remap_table *irt; + struct amd_ir_data *ir_data = (struct amd_ir_data *)data; + int devid = ir_data->irq_2_irte.devid; + struct irte_ga *entry = (struct irte_ga *) ir_data->entry; + struct irte_ga *ref = (struct irte_ga *) ir_data->ref; + + if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || + !ref || !entry || !entry->lo.fields_vapic.guest_mode) + return 0; + + iommu = amd_iommu_rlookup_table[devid]; + if (!iommu) + return -ENODEV; + + irt = get_irq_table(devid, false); + if (!irt) + return -ENODEV; + + spin_lock_irqsave(&irt->lock, flags); + + if (ref->lo.fields_vapic.guest_mode) { + if (cpu >= 0) + ref->lo.fields_vapic.destination = cpu; + ref->lo.fields_vapic.is_run = is_run; + barrier(); + } + + spin_unlock_irqrestore(&irt->lock, flags); + + iommu_flush_irt(iommu, devid); + iommu_completion_wait(iommu); + return 0; +} +EXPORT_SYMBOL(amd_iommu_update_ga); #endif diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 8a7a93d..60018a8 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -810,6 +810,7 @@ struct amd_ir_data { struct irq_2_irte irq_2_irte; struct msi_msg msi_entry; void *entry; /* Pointer to union irte or struct irte_ga */ + void *ref; /* Pointer to the actual irte */ }; struct amd_irte_ops { diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 465d096..d8d48ac 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -179,6 +179,9 @@ static inline int amd_iommu_detect(void) { return -ENODEV; } /* IOMMU AVIC Function */ extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32)); +extern int +amd_iommu_update_ga(int cpu, bool is_run, void *data); + #else /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */ static inline int @@ -187,6 +190,12 @@ amd_iommu_register_ga_log_notifier(int (*notifier)(u32)) return 0; } +static inline int +amd_iommu_update_ga(int cpu, bool is_run, void *data) +{ + return 0; +} + #endif /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */ #endif /* _ASM_X86_AMD_IOMMU_H */ -- cgit v0.10.2 From b9fc6b56f478b487dc8fc400da73d89ac9137201 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 23 Aug 2016 13:52:39 -0500 Subject: iommu/amd: Implements irq_set_vcpu_affinity() hook to setup vapic mode for pass-through devices This patch implements irq_set_vcpu_affinity() function to set up interrupt remapping table entry with vapic mode for pass-through devices. In case requirements for vapic mode are not met, it falls back to set up the IRTE in legacy mode. 
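A sketch of the caller side: the hypervisor fills a struct amd_iommu_pi_data (added to include/linux/amd-iommu.h later in this series) and hands it to the generic irq_set_vcpu_affinity(), which reaches amd_ir_set_vcpu_affinity() for AMD-remapped interrupts. The wrapper, its values, and the assumption that struct vcpu_data comes from asm/irq_remapping.h are illustrative only:

#include <linux/interrupt.h>
#include <linux/amd-iommu.h>
#include <asm/irq_remapping.h>   /* struct vcpu_data (assumed location) */

static int example_post_to_guest(unsigned int host_irq, u64 vapic_base,
                                 u32 ga_tag, struct vcpu_data *vcpu_info)
{
    struct amd_iommu_pi_data pi = {
        .ga_tag        = ga_tag,
        .base          = vapic_base,   /* e.g. vAPIC backing page address */
        .is_guest_mode = true,
        .vcpu_data     = vcpu_info,
    };

    return irq_set_vcpu_affinity(host_irq, &pi);
}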
Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 089e1ed..69fc57e 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3718,7 +3718,8 @@ out: return index; } -static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte) +static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte, + struct amd_ir_data *data) { struct irq_remap_table *table; struct amd_iommu *iommu; @@ -3741,6 +3742,8 @@ static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte) entry->hi.val = irte->hi.val; entry->lo.val = irte->lo.val; entry->lo.fields_remap.valid = 1; + if (data) + data->ref = entry; spin_unlock_irqrestore(&table->lock, flags); @@ -3839,7 +3842,7 @@ static void irte_ga_activate(void *entry, u16 devid, u16 index) struct irte_ga *irte = (struct irte_ga *) entry; irte->lo.fields_remap.valid = 1; - modify_irte_ga(devid, index, irte); + modify_irte_ga(devid, index, irte, NULL); } static void irte_deactivate(void *entry, u16 devid, u16 index) @@ -3855,7 +3858,7 @@ static void irte_ga_deactivate(void *entry, u16 devid, u16 index) struct irte_ga *irte = (struct irte_ga *) entry; irte->lo.fields_remap.valid = 0; - modify_irte_ga(devid, index, irte); + modify_irte_ga(devid, index, irte, NULL); } static void irte_set_affinity(void *entry, u16 devid, u16 index, @@ -3876,7 +3879,7 @@ static void irte_ga_set_affinity(void *entry, u16 devid, u16 index, irte->hi.fields.vector = vector; irte->lo.fields_remap.destination = dest_apicid; irte->lo.fields_remap.guest_mode = 0; - modify_irte_ga(devid, index, irte); + modify_irte_ga(devid, index, irte, NULL); } #define IRTE_ALLOCATED (~1U) @@ -4211,6 +4214,62 @@ static struct irq_domain_ops amd_ir_domain_ops = { .deactivate = irq_remapping_deactivate, }; +static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info) +{ + struct amd_iommu *iommu; + struct amd_iommu_pi_data *pi_data = vcpu_info; + struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data; + struct amd_ir_data *ir_data = data->chip_data; + struct irte_ga *irte = (struct irte_ga *) ir_data->entry; + struct irq_2_irte *irte_info = &ir_data->irq_2_irte; + + pi_data->ir_data = ir_data; + + /* Note: + * SVM tries to set up for VAPIC mode, but we are in + * legacy mode. So, we force legacy mode instead. + */ + if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) { + pr_debug("AMD-Vi: %s: Fall back to using intr legacy remap\n", + __func__); + pi_data->is_guest_mode = false; + } + + iommu = amd_iommu_rlookup_table[irte_info->devid]; + if (iommu == NULL) + return -EINVAL; + + pi_data->prev_ga_tag = ir_data->cached_ga_tag; + if (pi_data->is_guest_mode) { + /* Setting */ + irte->hi.fields.ga_root_ptr = (pi_data->base >> 12); + irte->hi.fields.vector = vcpu_pi_info->vector; + irte->lo.fields_vapic.guest_mode = 1; + irte->lo.fields_vapic.ga_tag = pi_data->ga_tag; + + ir_data->cached_ga_tag = pi_data->ga_tag; + } else { + /* Un-Setting */ + struct irq_cfg *cfg = irqd_cfg(data); + + irte->hi.val = 0; + irte->lo.val = 0; + irte->hi.fields.vector = cfg->vector; + irte->lo.fields_remap.guest_mode = 0; + irte->lo.fields_remap.destination = cfg->dest_apicid; + irte->lo.fields_remap.int_type = apic->irq_delivery_mode; + irte->lo.fields_remap.dm = apic->irq_dest_mode; + + /* + * This communicates the ga_tag back to the caller + * so that it can do all the necessary clean up. 
+ */ + ir_data->cached_ga_tag = 0; + } + + return modify_irte_ga(irte_info->devid, irte_info->index, irte, ir_data); +} + static int amd_ir_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -4255,6 +4314,7 @@ static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg) static struct irq_chip amd_ir_chip = { .irq_ack = ir_ack_apic_edge, .irq_set_affinity = amd_ir_set_affinity, + .irq_set_vcpu_affinity = amd_ir_set_vcpu_affinity, .irq_compose_msi_msg = ir_compose_msi_msg, }; diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 60018a8..79e91e6 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -807,6 +807,7 @@ struct irq_2_irte { }; struct amd_ir_data { + u32 cached_ga_tag; struct irq_2_irte irq_2_irte; struct msi_msg msi_entry; void *entry; /* Pointer to union irte or struct irte_ga */ diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index d8d48ac..09751d3 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -22,6 +22,20 @@ #include +/* + * This is mainly used to communicate information back-and-forth + * between SVM and IOMMU for setting up and tearing down posted + * interrupt + */ +struct amd_iommu_pi_data { + u32 ga_tag; + u32 prev_ga_tag; + u64 base; + bool is_guest_mode; + struct vcpu_data *vcpu_data; + void *ir_data; +}; + #ifdef CONFIG_AMD_IOMMU struct task_struct; -- cgit v0.10.2 From d98de49a53e48f51332e97568127e722415e1232 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 23 Aug 2016 13:52:40 -0500 Subject: iommu/amd: Enable vAPIC interrupt remapping mode by default Introduce struct iommu_dev_data.use_vapic flag, which IOMMU driver uses to determine if it should enable vAPIC support, by setting the ga_mode bit in the device's interrupt remapping table entry. Currently, it is enabled for all pass-through device if vAPIC mode is enabled. 
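With vAPIC mode now the default for pass-through devices, the runtime half is amd_iommu_update_ga() from the earlier patch; a sketch of the expected vCPU load/put calls, where ir_data is the pointer returned through amd_iommu_pi_data.ir_data and the wrappers are hypothetical:

#include <linux/amd-iommu.h>

static void example_vcpu_load(void *ir_data, int phys_apic_id)
{
    /* Mark the target vCPU as running on this physical APIC ID so the
     * IOMMU posts interrupts directly instead of logging a GA event. */
    amd_iommu_update_ga(phys_apic_id, true, ir_data);
}

static void example_vcpu_put(void *ir_data)
{
    /* A negative cpu leaves the destination untouched; only is_run is
     * cleared, per the helper introduced above. */
    amd_iommu_update_ga(-1, false, ir_data);
}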
Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 69fc57e..a7aa0e7 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -137,6 +137,7 @@ struct iommu_dev_data { bool pri_tlp; /* PASID TLB required for PPR completions */ u32 errata; /* Bitmap for errata to apply */ + bool use_vapic; /* Enable device to use vapic mode */ }; /* @@ -3015,6 +3016,12 @@ static void amd_iommu_detach_device(struct iommu_domain *dom, if (!iommu) return; +#ifdef CONFIG_IRQ_REMAP + if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) && + (dom->type == IOMMU_DOMAIN_UNMANAGED)) + dev_data->use_vapic = 0; +#endif + iommu_completion_wait(iommu); } @@ -3040,6 +3047,15 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, ret = attach_device(dev, domain); +#ifdef CONFIG_IRQ_REMAP + if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) { + if (dom->type == IOMMU_DOMAIN_UNMANAGED) + dev_data->use_vapic = 1; + else + dev_data->use_vapic = 0; + } +#endif + iommu_completion_wait(iommu); return ret; @@ -3801,7 +3817,7 @@ static void free_irte(u16 devid, int index) static void irte_prepare(void *entry, u32 delivery_mode, u32 dest_mode, - u8 vector, u32 dest_apicid) + u8 vector, u32 dest_apicid, int devid) { union irte *irte = (union irte *) entry; @@ -3815,13 +3831,14 @@ static void irte_prepare(void *entry, static void irte_ga_prepare(void *entry, u32 delivery_mode, u32 dest_mode, - u8 vector, u32 dest_apicid) + u8 vector, u32 dest_apicid, int devid) { struct irte_ga *irte = (struct irte_ga *) entry; + struct iommu_dev_data *dev_data = search_dev_data(devid); irte->lo.val = 0; irte->hi.val = 0; - irte->lo.fields_remap.guest_mode = 0; + irte->lo.fields_remap.guest_mode = dev_data ? dev_data->use_vapic : 0; irte->lo.fields_remap.int_type = delivery_mode; irte->lo.fields_remap.dm = dest_mode; irte->hi.fields.vector = vector; @@ -3875,11 +3892,14 @@ static void irte_ga_set_affinity(void *entry, u16 devid, u16 index, u8 vector, u32 dest_apicid) { struct irte_ga *irte = (struct irte_ga *) entry; + struct iommu_dev_data *dev_data = search_dev_data(devid); - irte->hi.fields.vector = vector; - irte->lo.fields_remap.destination = dest_apicid; - irte->lo.fields_remap.guest_mode = 0; - modify_irte_ga(devid, index, irte, NULL); + if (!dev_data || !dev_data->use_vapic) { + irte->hi.fields.vector = vector; + irte->lo.fields_remap.destination = dest_apicid; + irte->lo.fields_remap.guest_mode = 0; + modify_irte_ga(devid, index, irte, NULL); + } } #define IRTE_ALLOCATED (~1U) @@ -4022,7 +4042,7 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, data->irq_2_irte.index = index + sub_handle; iommu->irte_ops->prepare(data->entry, apic->irq_delivery_mode, apic->irq_dest_mode, irq_cfg->vector, - irq_cfg->dest_apicid); + irq_cfg->dest_apicid, devid); switch (info->type) { case X86_IRQ_ALLOC_TYPE_IOAPIC: @@ -4222,6 +4242,14 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info) struct amd_ir_data *ir_data = data->chip_data; struct irte_ga *irte = (struct irte_ga *) ir_data->entry; struct irq_2_irte *irte_info = &ir_data->irq_2_irte; + struct iommu_dev_data *dev_data = search_dev_data(irte_info->devid); + + /* Note: + * This device has never been set up for guest mode. 
+ * we should not modify the IRTE + */ + if (!dev_data || !dev_data->use_vapic) + return 0; pi_data->ir_data = ir_data; diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 156ab4b..cd17136 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -146,7 +146,7 @@ struct ivmd_header { bool amd_iommu_dump; bool amd_iommu_irq_remap __read_mostly; -int amd_iommu_guest_ir; +int amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_VAPIC; static bool amd_iommu_detected; static bool __initdata amd_iommu_disabled; @@ -2019,6 +2019,11 @@ static void early_enable_iommus(void) iommu_enable(iommu); iommu_flush_all_caches(iommu); } + +#ifdef CONFIG_IRQ_REMAP + if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) + amd_iommu_irq_ops.capability |= (1 << IRQ_POSTING_CAP); +#endif } static void enable_iommus_v2(void) @@ -2044,6 +2049,11 @@ static void disable_iommus(void) for_each_iommu(iommu) iommu_disable(iommu); + +#ifdef CONFIG_IRQ_REMAP + if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) + amd_iommu_irq_ops.capability &= ~(1 << IRQ_POSTING_CAP); +#endif } /* diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 79e91e6..fa766ee 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -815,7 +815,7 @@ struct amd_ir_data { }; struct amd_irte_ops { - void (*prepare)(void *, u32, u32, u8, u32); + void (*prepare)(void *, u32, u32, u8, u32, int); void (*activate)(void *, u16, u16); void (*deactivate)(void *, u16, u16); void (*set_affinity)(void *, u16, u16, u8, u32); -- cgit v0.10.2 From 72e0ae58a0fbded49d6fa80dcca5aaea9eb5a73f Mon Sep 17 00:00:00 2001 From: Liang Li Date: Thu, 18 Aug 2016 15:49:19 +0800 Subject: vmx: refine validity check for guest linear address The validity check for the guest line address is inefficient, check the invalid value instead of enumerating the valid ones. Signed-off-by: Liang Li Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5cede40..8d28804 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6109,7 +6109,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); gla_validity = (exit_qualification >> 7) & 0x3; - if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { + if (gla_validity == 0x2) { printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), -- cgit v0.10.2 From f15a75eedc18e1fece8e01f1a6f7d135e61b0750 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 30 Aug 2016 16:14:01 +0800 Subject: KVM: nVMX: make emulated nested preemption timer pinned MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 61abdbe0bc ("kvm: x86: make lapic hrtimer pinned") pins the emulated lapic timer. This patch does the same for the emulated nested preemption timer to avoid vmexit an unrelated vCPU and the latency of kicking IPI to another vCPU. 
Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Yunhong Jiang Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8d28804..ccb0356 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7013,7 +7013,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) vmx->nested.vmcs02_num = 0; hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); + HRTIMER_MODE_REL_PINNED); vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; vmx->nested.vmxon = true; -- cgit v0.10.2 From 3706feacd007c89f49bdb20c5c1dd17c8badfc43 Mon Sep 17 00:00:00 2001 From: Bhaktipriya Shridhar Date: Tue, 30 Aug 2016 23:29:51 +0530 Subject: KVM: Remove deprecated create_singlethread_workqueue The workqueue "irqfd_cleanup_wq" queues a single work item &irqfd->shutdown and hence doesn't require ordering. It is a host-wide workqueue for issuing deferred shutdown requests aggregated from all vm* instances. It is not being used on a memory reclaim path. Hence, it has been converted to use system_wq. The work item has been flushed in kvm_irqfd_release(). The workqueue "wqueue" queues a single work item &timer->expired and hence doesn't require ordering. Also, it is not being used on a memory reclaim path. Hence, it has been converted to use system_wq. System workqueues have been able to handle high level of concurrency for a long time now and hence it's not required to have a singlethreaded workqueue just to gain concurrency. Unlike a dedicated per-cpu workqueue created with create_singlethread_workqueue(), system_wq allows multiple work items to overlap executions even on the same CPU; however, a per-cpu workqueue doesn't have any CPU locality or global ordering guarantee unless the target CPU is explicitly specified and thus the increase of local concurrency shouldn't make any difference. Signed-off-by: Bhaktipriya Shridhar Signed-off-by: Paolo Bonzini diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 77e6ccf..4309b60 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -31,7 +31,6 @@ #include "trace.h" static struct timecounter *timecounter; -static struct workqueue_struct *wqueue; static unsigned int host_vtimer_irq; static u32 host_vtimer_irq_flags; @@ -141,7 +140,7 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt) return HRTIMER_RESTART; } - queue_work(wqueue, &timer->expired); + schedule_work(&timer->expired); return HRTIMER_NORESTART; } @@ -449,12 +448,6 @@ int kvm_timer_hyp_init(void) goto out; } - wqueue = create_singlethread_workqueue("kvm_arch_timer"); - if (!wqueue) { - err = -ENOMEM; - goto out_free; - } - kvm_info("virtual timer IRQ%d\n", host_vtimer_irq); cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING, @@ -518,7 +511,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) * VCPUs have the enabled variable set, before entering the guest, if * the arch timers are enabled. 
*/ - if (timecounter && wqueue) + if (timecounter) timer->enabled = 1; return 0; diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index e469b60..f397e9b 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -42,7 +42,6 @@ #ifdef CONFIG_HAVE_KVM_IRQFD -static struct workqueue_struct *irqfd_cleanup_wq; static void irqfd_inject(struct work_struct *work) @@ -168,7 +167,7 @@ irqfd_deactivate(struct kvm_kernel_irqfd *irqfd) list_del_init(&irqfd->list); - queue_work(irqfd_cleanup_wq, &irqfd->shutdown); + schedule_work(&irqfd->shutdown); } int __attribute__((weak)) kvm_arch_set_irq_inatomic( @@ -555,7 +554,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) * so that we guarantee there will not be any more interrupts on this * gsi once this deassign function returns. */ - flush_workqueue(irqfd_cleanup_wq); + flush_work(&irqfd->shutdown); return 0; } @@ -592,7 +591,7 @@ kvm_irqfd_release(struct kvm *kvm) * Block until we know all outstanding shutdown jobs have completed * since we do not take a kvm* reference. */ - flush_workqueue(irqfd_cleanup_wq); + flush_work(&irqfd->shutdown); } @@ -622,23 +621,8 @@ void kvm_irq_routing_update(struct kvm *kvm) spin_unlock_irq(&kvm->irqfds.lock); } -/* - * create a host-wide workqueue for issuing deferred shutdown requests - * aggregated from all vm* instances. We need our own isolated single-thread - * queue to prevent deadlock against flushing the normal work-queue. - */ -int kvm_irqfd_init(void) -{ - irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup"); - if (!irqfd_cleanup_wq) - return -ENOMEM; - - return 0; -} - void kvm_irqfd_exit(void) { - destroy_workqueue(irqfd_cleanup_wq); } #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1950782..b3fa12c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3807,12 +3807,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, * kvm_arch_init makes sure there's at most one caller * for architectures that support multiple implementations, * like intel and amd on x86. - * kvm_arch_init must be called before kvm_irqfd_init to avoid creating - * conflicts in case kvm is already setup for another implementation. */ - r = kvm_irqfd_init(); - if (r) - goto out_irqfd; if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { r = -ENOMEM; @@ -3894,7 +3889,6 @@ out_free_0a: free_cpumask_var(cpus_hardware_enabled); out_free_0: kvm_irqfd_exit(); -out_irqfd: kvm_arch_exit(); out_fail: return r; -- cgit v0.10.2 From 119a9c01a5922600a78106d9bbdd6aaafc851a27 Mon Sep 17 00:00:00 2001 From: Jan Dakinevich Date: Sun, 4 Sep 2016 21:22:47 +0300 Subject: KVM: nVMX: pass valid guest linear-address to the L1 If EPT support is exposed to L1 hypervisor, guest linear-address field of VMCS should contain GVA of L2, the access to which caused EPT violation. 
Signed-off-by: Jan Dakinevich Reviewed-by: Wanpeng Li Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ccb0356..4c1a814 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10500,6 +10500,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); } + if (nested_cpu_has_ept(vmcs12)) + vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); + if (nested_cpu_has_vid(vmcs12)) vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); -- cgit v0.10.2 From bbe41b950813ec643a14d2a6005475f66625292e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 19 Aug 2016 17:51:20 +0200 Subject: KVM: x86: ratelimit and decrease severity for guest-triggered printk These are mostly related to nested VMX. They needn't have a loglevel as high as KERN_WARN, and mustn't be allowed to pollute the host logs. Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4c1a814..2029c00 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6726,7 +6726,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) { /* TODO: not to reset guest simply here. */ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - pr_warn("kvm: nested vmx abort, indicator %d\n", indicator); + pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); } static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) @@ -9598,7 +9598,7 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, maxphyaddr = cpuid_maxphyaddr(vcpu); if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { - pr_warn_ratelimited( + pr_debug_ratelimited( "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)", addr_field, maxphyaddr, count, addr); return -EINVAL; @@ -9671,13 +9671,13 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) for (i = 0; i < count; i++) { if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), &e, sizeof(e))) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s cannot read MSR entry (%u, 0x%08llx)\n", __func__, i, gpa + i * sizeof(e)); goto fail; } if (nested_vmx_load_msr_check(vcpu, &e)) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s check failed (%u, 0x%x, 0x%x)\n", __func__, i, e.index, e.reserved); goto fail; @@ -9685,7 +9685,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) msr.index = e.index; msr.data = e.value; if (kvm_set_msr(vcpu, &msr)) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, e.value); goto fail; @@ -9706,13 +9706,13 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), &e, 2 * sizeof(u32))) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s cannot read MSR entry (%u, 0x%08llx)\n", __func__, i, gpa + i * sizeof(e)); return -EINVAL; } if (nested_vmx_store_msr_check(vcpu, &e)) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s check failed (%u, 0x%x, 0x%x)\n", __func__, i, e.index, e.reserved); return -EINVAL; @@ -9720,7 +9720,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) msr_info.host_initiated = false; msr_info.index = e.index; if (kvm_get_msr(vcpu, &msr_info)) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s cannot read MSR (%u, 0x%x)\n", __func__, i, e.index); return -EINVAL; @@ -9729,7 +9729,7 @@ static int 
nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) gpa + i * sizeof(e) + offsetof(struct vmx_msr_entry, value), &msr_info.data, sizeof(msr_info.data))) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, msr_info.data); return -EINVAL; -- cgit v0.10.2 From 1a6982353db90939fc12bbd11c29fbe3c19a2fd0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 19 Aug 2016 17:51:29 +0200 Subject: KVM: x86: remove stale comments handle_external_intr does not enable interrupts anymore, vcpu_enter_guest does it after calling guest_exit_irqoff. Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 19f9f9e..57ffe78 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6700,7 +6700,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_put_guest_xcr0(vcpu); - /* Interrupt is enabled by handle_external_intr() */ kvm_x86_ops->handle_external_intr(vcpu); ++vcpu->stat.exits; -- cgit v0.10.2 From 16cb025565fcc9c687d07bf239e57964957b50e8 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 5 Sep 2016 15:57:00 +0200 Subject: KVM: VMX: not use vmcs_config in setup_vmcs_config setup_vmcs_config takes a pointer to the vmcs_config global. The indirection is somewhat pointless, but just keep things consistent for now. Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2029c00..2d584d9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3457,7 +3457,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) return -EIO; vmcs_conf->size = vmx_msr_high & 0x1fff; - vmcs_conf->order = get_order(vmcs_config.size); + vmcs_conf->order = get_order(vmcs_conf->size); vmcs_conf->revision_id = vmx_msr_low; vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; -- cgit v0.10.2 From 9ac7e3e815060efdc86b6d12448200e3c3597e01 Mon Sep 17 00:00:00 2001 From: Jan Dakinevich Date: Sun, 4 Sep 2016 21:23:15 +0300 Subject: KVM: nVMX: expose INS/OUTS information support Expose the feature to L1 hypervisor if host CPU supports it, since certain hypervisors requires it for own purposes. According to Intel SDM A.1, if CPU supports the feature, VMX_INSTRUCTION_INFO field of VMCS will contain detailed information about INS/OUTS instructions handling. This field is already copied to VMCS12 for L1 hypervisor (see prepare_vmcs12 routine) independently feature presence. 
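A stand-alone sketch of the capability check follows; the position of VMX_BASIC_INOUT (bit 54 of IA32_VMX_BASIC) is an assumption taken from SDM appendix A.1, and the low/high MSR split mirrors the basic_cap handling added in the patch below:

#include <stdint.h>
#include <stdio.h>

#define VMX_BASIC_INOUT (1ULL << 54)	/* assumed: SDM A.1, bit 54 */

/* Keep only the high dword minus the VMCS-size bits, as basic_cap does,
 * then shift back up and test the INS/OUTS information bit. */
static int basic_inout_supported(uint64_t ia32_vmx_basic)
{
	uint32_t msr_high = ia32_vmx_basic >> 32;
	uint32_t basic_cap = msr_high & ~0x1fff;

	return !!(((uint64_t)basic_cap << 32) & VMX_BASIC_INOUT);
}

int main(void)
{
	uint64_t msr = 1ULL << 54;	/* hypothetical MSR value with the bit set */

	printf("INS/OUTS info reported: %d\n", basic_inout_supported(msr));
	return 0;
}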
Signed-off-by: Jan Dakinevich Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2d584d9..f9939d0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -939,6 +939,7 @@ static DEFINE_SPINLOCK(vmx_vpid_lock); static struct vmcs_config { int size; int order; + u32 basic_cap; u32 revision_id; u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; @@ -1215,6 +1216,11 @@ static inline bool cpu_has_vmx_ple(void) SECONDARY_EXEC_PAUSE_LOOP_EXITING; } +static inline bool cpu_has_vmx_basic_inout(void) +{ + return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); +} + static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) { return flexpriority_enabled && lapic_in_kernel(vcpu); @@ -2877,6 +2883,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); + if (cpu_has_vmx_basic_inout()) + *pdata |= VMX_BASIC_INOUT; break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: @@ -3458,6 +3466,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmcs_conf->size = vmx_msr_high & 0x1fff; vmcs_conf->order = get_order(vmcs_conf->size); + vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; vmcs_conf->revision_id = vmx_msr_low; vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; -- cgit v0.10.2 From e64fb7e272885c1ea3cd2f68f267ae12fa04c8b1 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Tue, 2 Aug 2016 14:03:19 +1000 Subject: KVM: PPC: Book3S HV: Move struct kvmppc_vcore from kvm_host.h to kvm_book3s.h The next commit will introduce a member to the kvmppc_vcore struct which references MAX_SMT_THREADS which is defined in kvm_book3s_asm.h, however this file isn't included in kvm_host.h directly. Thus compiling for certain platforms such as pmac32_defconfig and ppc64e_defconfig with KVM fails due to MAX_SMT_THREADS not being defined. Move the struct kvmppc_vcore definition to kvm_book3s.h which explicitly includes kvm_book3s_asm.h. Signed-off-by: Suraj Jitindar Singh Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 8f39796..a50c5fe 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -69,6 +69,41 @@ struct hpte_cache { int pagesize; }; +/* + * Struct for a virtual core. + * Note: entry_exit_map combines a bitmap of threads that have entered + * in the bottom 8 bits and a bitmap of threads that have exited in the + * next 8 bits. This is so that we can atomically set the entry bit + * iff the exit map is 0 without taking a lock. 
+ */ +struct kvmppc_vcore { + int n_runnable; + int num_threads; + int entry_exit_map; + int napping_threads; + int first_vcpuid; + u16 pcpu; + u16 last_cpu; + u8 vcore_state; + u8 in_guest; + struct kvmppc_vcore *master_vcore; + struct list_head runnable_threads; + struct list_head preempt_list; + spinlock_t lock; + struct swait_queue_head wq; + spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */ + u64 stolen_tb; + u64 preempt_tb; + struct kvm_vcpu *runner; + struct kvm *kvm; + u64 tb_offset; /* guest timebase - host timebase */ + ulong lpcr; + u32 arch_compat; + ulong pcr; + ulong dpdes; /* doorbell state (POWER8) */ + ulong conferring_threads; +}; + struct kvmppc_vcpu_book3s { struct kvmppc_sid_map sid_map[SID_MAP_NUM]; struct { diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index e36ce0c..7ff9919 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -277,41 +277,6 @@ struct kvm_arch { #endif }; -/* - * Struct for a virtual core. - * Note: entry_exit_map combines a bitmap of threads that have entered - * in the bottom 8 bits and a bitmap of threads that have exited in the - * next 8 bits. This is so that we can atomically set the entry bit - * iff the exit map is 0 without taking a lock. - */ -struct kvmppc_vcore { - int n_runnable; - int num_threads; - int entry_exit_map; - int napping_threads; - int first_vcpuid; - u16 pcpu; - u16 last_cpu; - u8 vcore_state; - u8 in_guest; - struct kvmppc_vcore *master_vcore; - struct list_head runnable_threads; - struct list_head preempt_list; - spinlock_t lock; - struct swait_queue_head wq; - spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */ - u64 stolen_tb; - u64 preempt_tb; - struct kvm_vcpu *runner; - struct kvm *kvm; - u64 tb_offset; /* guest timebase - host timebase */ - ulong lpcr; - u32 arch_compat; - ulong pcr; - ulong dpdes; /* doorbell state (POWER8) */ - ulong conferring_threads; -}; - #define VCORE_ENTRY_MAP(vc) ((vc)->entry_exit_map & 0xff) #define VCORE_EXIT_MAP(vc) ((vc)->entry_exit_map >> 8) #define VCORE_IS_EXITING(vc) (VCORE_EXIT_MAP(vc) != 0) -- cgit v0.10.2 From 7b5f8272c792d49da73d98e9ca32f4cbb6d53808 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Tue, 2 Aug 2016 14:03:20 +1000 Subject: KVM: PPC: Book3S HV: Change vcore element runnable_threads from linked-list to array The struct kvmppc_vcore is a structure used to store various information about a virtual core for a kvm guest. The runnable_threads element of the struct provides a list of all of the currently runnable vcpus on the core (those in the KVMPPC_VCPU_RUNNABLE state). The previous implementation of this list was a linked_list. The next patch requires that the list be able to be iterated over without holding the vcore lock. Reimplement the runnable_threads list in the kvmppc_vcore struct as an array. Implement function to iterate over valid entries in the array and update access sites accordingly. 
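The lockless traversal the array makes possible can be modelled in stand-alone C roughly as below; the types are stand-ins and the MAX_SMT_THREADS value is assumed, but the publish-with-atomic-store / skip-NULL-slots pattern is the same one for_each_runnable_thread() in the patch relies on:

#include <stdatomic.h>
#include <stdio.h>

#define MAX_SMT_THREADS 8	/* assumed value for the model */

struct vcpu { int id; };

static _Atomic(struct vcpu *) runnable[MAX_SMT_THREADS];

/* Return the next non-empty slot after *ip, or NULL when done. */
static struct vcpu *next_runnable(int *ip)
{
	int i = *ip;

	while (++i < MAX_SMT_THREADS) {
		struct vcpu *v = atomic_load(&runnable[i]);
		if (v) {
			*ip = i;
			return v;
		}
	}
	return NULL;
}

#define for_each_runnable(i, v) \
	for (i = -1; ((v) = next_runnable(&i)); )

int main(void)
{
	struct vcpu a = { 0 }, c = { 2 };
	struct vcpu *v;
	int i;

	atomic_store(&runnable[0], &a);		/* "make runnable" */
	atomic_store(&runnable[2], &c);
	for_each_runnable(i, v)			/* reader needs no lock */
		printf("runnable vcpu %d in slot %d\n", v->id, i);
	atomic_store(&runnable[0], NULL);	/* "remove runnable" */
	return 0;
}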
Signed-off-by: Suraj Jitindar Singh Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index a50c5fe..151f817 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -87,7 +87,7 @@ struct kvmppc_vcore { u8 vcore_state; u8 in_guest; struct kvmppc_vcore *master_vcore; - struct list_head runnable_threads; + struct kvm_vcpu *runnable_threads[MAX_SMT_THREADS]; struct list_head preempt_list; spinlock_t lock; struct swait_queue_head wq; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 7ff9919..851575a 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -635,7 +635,6 @@ struct kvm_vcpu_arch { long pgfault_index; unsigned long pgfault_hpte[2]; - struct list_head run_list; struct task_struct *run_task; struct kvm_run *kvm_run; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 2fd5580..ebbab1b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -58,6 +58,7 @@ #include #include #include +#include #include "book3s.h" @@ -97,6 +98,26 @@ MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); +static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc, + int *ip) +{ + int i = *ip; + struct kvm_vcpu *vcpu; + + while (++i < MAX_SMT_THREADS) { + vcpu = READ_ONCE(vc->runnable_threads[i]); + if (vcpu) { + *ip = i; + return vcpu; + } + } + return NULL; +} + +/* Used to traverse the list of runnable threads for a given vcore */ +#define for_each_runnable_thread(i, vcpu, vc) \ + for (i = -1; (vcpu = next_runnable_thread(vc, &i)); ) + static bool kvmppc_ipi_thread(int cpu) { /* On POWER8 for IPIs to threads in the same core, use msgsnd */ @@ -1493,7 +1514,6 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) if (vcore == NULL) return NULL; - INIT_LIST_HEAD(&vcore->runnable_threads); spin_lock_init(&vcore->lock); spin_lock_init(&vcore->stoltb_lock); init_swait_queue_head(&vcore->wq); @@ -1802,7 +1822,7 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; spin_unlock_irq(&vcpu->arch.tbacct_lock); --vc->n_runnable; - list_del(&vcpu->arch.run_list); + WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL); } static int kvmppc_grab_hwthread(int cpu) @@ -2209,10 +2229,10 @@ static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip, static void prepare_threads(struct kvmppc_vcore *vc) { - struct kvm_vcpu *vcpu, *vnext; + int i; + struct kvm_vcpu *vcpu; - list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, - arch.run_list) { + for_each_runnable_thread(i, vcpu, vc) { if (signal_pending(vcpu->arch.run_task)) vcpu->arch.ret = -EINTR; else if (vcpu->arch.vpa.update_pending || @@ -2259,15 +2279,14 @@ static void collect_piggybacks(struct core_info *cip, int target_threads) static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) { - int still_running = 0; + int still_running = 0, i; u64 now; long ret; - struct kvm_vcpu *vcpu, *vnext; + struct kvm_vcpu *vcpu; spin_lock(&vc->lock); now = get_tb(); - list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, - arch.run_list) { + for_each_runnable_thread(i, vcpu, vc) { /* cancel pending dec exception if dec is positive */ if (now < vcpu->arch.dec_expires && 
kvmppc_core_pending_dec(vcpu)) @@ -2307,8 +2326,8 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) } if (vc->n_runnable > 0 && vc->runner == NULL) { /* make sure there's a candidate runner awake */ - vcpu = list_first_entry(&vc->runnable_threads, - struct kvm_vcpu, arch.run_list); + i = -1; + vcpu = next_runnable_thread(vc, &i); wake_up(&vcpu->arch.cpu_run); } } @@ -2361,7 +2380,7 @@ static inline void kvmppc_set_host_core(int cpu) */ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) { - struct kvm_vcpu *vcpu, *vnext; + struct kvm_vcpu *vcpu; int i; int srcu_idx; struct core_info core_info; @@ -2397,8 +2416,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) */ if ((threads_per_core > 1) && ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { - list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, - arch.run_list) { + for_each_runnable_thread(i, vcpu, vc) { vcpu->arch.ret = -EBUSY; kvmppc_remove_runnable(vc, vcpu); wake_up(&vcpu->arch.cpu_run); @@ -2477,8 +2495,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) active |= 1 << thr; list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) { pvc->pcpu = pcpu + thr; - list_for_each_entry(vcpu, &pvc->runnable_threads, - arch.run_list) { + for_each_runnable_thread(i, vcpu, pvc) { kvmppc_start_thread(vcpu, pvc); kvmppc_create_dtl_entry(vcpu, pvc); trace_kvm_guest_enter(vcpu); @@ -2611,7 +2628,7 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc, static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) { struct kvm_vcpu *vcpu; - int do_sleep = 1; + int do_sleep = 1, i; DECLARE_SWAITQUEUE(wait); prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); @@ -2620,7 +2637,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) * Check one last time for pending exceptions and ceded state after * we put ourselves on the wait queue */ - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { + for_each_runnable_thread(i, vcpu, vc) { if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) { do_sleep = 0; break; @@ -2644,9 +2661,9 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { - int n_ceded; + int n_ceded, i; struct kvmppc_vcore *vc; - struct kvm_vcpu *v, *vn; + struct kvm_vcpu *v; trace_kvmppc_run_vcpu_enter(vcpu); @@ -2666,7 +2683,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb()); vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; vcpu->arch.busy_preempt = TB_NIL; - list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads); + WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], vcpu); ++vc->n_runnable; /* @@ -2706,8 +2723,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE); continue; } - list_for_each_entry_safe(v, vn, &vc->runnable_threads, - arch.run_list) { + for_each_runnable_thread(i, v, vc) { kvmppc_core_prepare_to_enter(v); if (signal_pending(v->arch.run_task)) { kvmppc_remove_runnable(vc, v); @@ -2720,7 +2736,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) break; n_ceded = 0; - list_for_each_entry(v, &vc->runnable_threads, arch.run_list) { + for_each_runnable_thread(i, v, vc) { if (!v->arch.pending_exceptions) n_ceded += v->arch.ceded; else @@ -2759,8 +2775,8 @@ static int kvmppc_run_vcpu(struct 
kvm_run *kvm_run, struct kvm_vcpu *vcpu) if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) { /* Wake up some vcpu to run the core */ - v = list_first_entry(&vc->runnable_threads, - struct kvm_vcpu, arch.run_list); + i = -1; + v = next_runnable_thread(vc, &i); wake_up(&v->arch.cpu_run); } -- cgit v0.10.2 From 0cda69dd7cd64fdd54bdf584b5d6ba53767ba422 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Tue, 2 Aug 2016 14:03:21 +1000 Subject: KVM: PPC: Book3S HV: Implement halt polling This patch introduces new halt polling functionality into the kvm_hv kernel module. When a vcore is idle it will poll for some period of time before scheduling itself out. When all of the runnable vcpus on a vcore have ceded (and thus the vcore is idle) we schedule ourselves out to allow something else to run. In the event that we need to wake up very quickly (for example an interrupt arrives), we are required to wait until we get scheduled again. Implement halt polling so that when a vcore is idle, and before scheduling ourselves, we poll for vcpus in the runnable_threads list which have pending exceptions or which leave the ceded state. If we poll successfully then we can get back into the guest very quickly without ever scheduling ourselves, otherwise we schedule ourselves out as before. There exists generic halt_polling code in virt/kvm_main.c, however on powerpc the polling conditions are different to the generic case. It would be nice if we could just implement an arch specific kvm_check_block() function, but there is still other arch specific things which need to be done for kvm_hv (for example manipulating vcore states) which means that a separate implementation is the best option. Testing of this patch with a TCP round robin test between two guests with virtio network interfaces has found a decrease in round trip time of ~15us on average. A performance gain is only seen when going out of and back into the guest often and quickly, otherwise there is no net benefit from the polling. The polling interval is adjusted such that when we are often scheduled out for long periods of time it is reduced, and when we often poll successfully it is increased. The rate at which the polling interval increases or decreases, and the maximum polling interval, can be set through module parameters. Based on the implementation in the generic kvm module by Wanpeng Li and Paolo Bonzini, and on direction from Paul Mackerras. 
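The adaptive part of the algorithm can be sketched stand-alone as below; the constants mirror the module parameter defaults introduced by the patch (grow factor 2, shrink factor 0 meaning reset, 10us base), and the maximum value is only a stand-in for KVM_HALT_POLL_NS_DEFAULT:

#include <stdio.h>

static unsigned int halt_poll_max_ns = 500000;	/* stand-in for the default */
static unsigned int halt_poll_ns_grow = 2;
static unsigned int halt_poll_ns_shrink;	/* 0: reset to no polling */

static void grow(unsigned int *poll_ns)
{
	if (*poll_ns == 0 && halt_poll_ns_grow)
		*poll_ns = 10000;		/* 10us base */
	else
		*poll_ns *= halt_poll_ns_grow;
	if (*poll_ns > halt_poll_max_ns)
		*poll_ns = halt_poll_max_ns;
}

static void shrink(unsigned int *poll_ns)
{
	if (halt_poll_ns_shrink == 0)
		*poll_ns = 0;
	else
		*poll_ns /= halt_poll_ns_shrink;
}

/* block_ns: how long the vcore was actually blocked (poll plus sleep) */
static void adjust(unsigned int *poll_ns, unsigned long long block_ns)
{
	if (!halt_poll_max_ns) {
		*poll_ns = 0;
	} else if (block_ns <= *poll_ns) {
		/* the poll window was already large enough, leave it alone */
	} else if (*poll_ns && block_ns > halt_poll_max_ns) {
		shrink(poll_ns);	/* slept far longer than any poll window */
	} else if (*poll_ns < halt_poll_max_ns && block_ns < halt_poll_max_ns) {
		grow(poll_ns);		/* poll window was only a little too small */
	}
}

int main(void)
{
	unsigned int poll_ns = 0;

	adjust(&poll_ns, 20000);	/* short block: start polling at 10us */
	adjust(&poll_ns, 20000);	/* still short: double to 20us        */
	adjust(&poll_ns, 900000);	/* long sleep: reset to no polling    */
	printf("final poll interval: %u ns\n", poll_ns);
	return 0;
}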
Signed-off-by: Suraj Jitindar Singh Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 151f817..c261f52 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -102,6 +102,7 @@ struct kvmppc_vcore { ulong pcr; ulong dpdes; /* doorbell state (POWER8) */ ulong conferring_threads; + unsigned int halt_poll_ns; }; struct kvmppc_vcpu_book3s { diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 851575a..6ece4a8 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -296,6 +296,7 @@ struct kvm_arch { #define VCORE_SLEEPING 3 #define VCORE_RUNNING 4 #define VCORE_EXITING 5 +#define VCORE_POLLING 6 /* * Struct used to manage memory for a virtual processor area diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index ebbab1b..3c85c3b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -95,6 +95,23 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); #endif +/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */ +static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT; +module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns"); + +/* Factor by which the vcore halt poll interval is grown, default is to double + */ +static unsigned int halt_poll_ns_grow = 2; +module_param(halt_poll_ns_grow, int, S_IRUGO); +MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by"); + +/* Factor by which the vcore halt poll interval is shrunk, default is to reset + */ +static unsigned int halt_poll_ns_shrink; +module_param(halt_poll_ns_shrink, int, S_IRUGO); +MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by"); + static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); @@ -2621,32 +2638,82 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc, finish_wait(&vcpu->arch.cpu_run, &wait); } +static void grow_halt_poll_ns(struct kvmppc_vcore *vc) +{ + /* 10us base */ + if (vc->halt_poll_ns == 0 && halt_poll_ns_grow) + vc->halt_poll_ns = 10000; + else + vc->halt_poll_ns *= halt_poll_ns_grow; + + if (vc->halt_poll_ns > halt_poll_max_ns) + vc->halt_poll_ns = halt_poll_max_ns; +} + +static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) +{ + if (halt_poll_ns_shrink == 0) + vc->halt_poll_ns = 0; + else + vc->halt_poll_ns /= halt_poll_ns_shrink; +} + +/* Check to see if any of the runnable vcpus on the vcore have pending + * exceptions or are no longer ceded + */ +static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc) +{ + struct kvm_vcpu *vcpu; + int i; + + for_each_runnable_thread(i, vcpu, vc) { + if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) + return 1; + } + + return 0; +} + /* * All the vcpus in this vcore are idle, so wait for a decrementer * or external interrupt to one of the vcpus. vc->lock is held. 
*/ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) { - struct kvm_vcpu *vcpu; - int do_sleep = 1, i; + int do_sleep = 1; + ktime_t cur, start; + u64 block_ns; DECLARE_SWAITQUEUE(wait); - prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); + /* Poll for pending exceptions and ceded state */ + cur = start = ktime_get(); + if (vc->halt_poll_ns) { + ktime_t stop = ktime_add_ns(start, vc->halt_poll_ns); - /* - * Check one last time for pending exceptions and ceded state after - * we put ourselves on the wait queue - */ - for_each_runnable_thread(i, vcpu, vc) { - if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) { - do_sleep = 0; - break; - } + vc->vcore_state = VCORE_POLLING; + spin_unlock(&vc->lock); + + do { + if (kvmppc_vcore_check_block(vc)) { + do_sleep = 0; + break; + } + cur = ktime_get(); + } while (single_task_running() && ktime_before(cur, stop)); + + spin_lock(&vc->lock); + vc->vcore_state = VCORE_INACTIVE; + + if (!do_sleep) + goto out; } - if (!do_sleep) { + prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); + + if (kvmppc_vcore_check_block(vc)) { finish_swait(&vc->wq, &wait); - return; + do_sleep = 0; + goto out; } vc->vcore_state = VCORE_SLEEPING; @@ -2657,6 +2724,27 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) spin_lock(&vc->lock); vc->vcore_state = VCORE_INACTIVE; trace_kvmppc_vcore_blocked(vc, 1); + + cur = ktime_get(); + +out: + block_ns = ktime_to_ns(cur) - ktime_to_ns(start); + + /* Adjust poll time */ + if (halt_poll_max_ns) { + if (block_ns <= vc->halt_poll_ns) + ; + /* We slept and blocked for longer than the max halt time */ + else if (vc->halt_poll_ns && block_ns > halt_poll_max_ns) + shrink_halt_poll_ns(vc); + /* We slept and our poll time is too small */ + else if (vc->halt_poll_ns < halt_poll_max_ns && + block_ns < halt_poll_max_ns) + grow_halt_poll_ns(vc); + } else + vc->halt_poll_ns = 0; + + trace_kvmppc_vcore_wakeup(do_sleep, block_ns); } static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h index 33d9daf..fb21990 100644 --- a/arch/powerpc/kvm/trace_hv.h +++ b/arch/powerpc/kvm/trace_hv.h @@ -432,6 +432,28 @@ TRACE_EVENT(kvmppc_vcore_blocked, __entry->runner_vcpu, __entry->n_runnable, __entry->tgid) ); +TRACE_EVENT(kvmppc_vcore_wakeup, + TP_PROTO(int do_sleep, __u64 ns), + + TP_ARGS(do_sleep, ns), + + TP_STRUCT__entry( + __field(__u64, ns) + __field(int, waited) + __field(pid_t, tgid) + ), + + TP_fast_assign( + __entry->ns = ns; + __entry->waited = do_sleep; + __entry->tgid = current->tgid; + ), + + TP_printk("%s time %lld ns, tgid=%d", + __entry->waited ? "wait" : "poll", + __entry->ns, __entry->tgid) +); + TRACE_EVENT(kvmppc_run_vcpu_enter, TP_PROTO(struct kvm_vcpu *vcpu), -- cgit v0.10.2 From 8a7e75d47b68193339f8727cf4503271d0a0b1d0 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Tue, 2 Aug 2016 14:03:22 +1000 Subject: KVM: Add provisioning for ulong vm stats and u64 vcpu stats vms and vcpus have statistics associated with them which can be viewed within the debugfs. Currently it is assumed within the vcpu_stat_get() and vm_stat_get() functions that all of these statistics are represented as u32s, however the next patch adds some u64 vcpu statistics. Change all vcpu statistics to u64 and modify vcpu_stat_get() accordingly. Since vcpu statistics are per vcpu, they will only be updated by a single vcpu at a time so this shouldn't present a problem on 32-bit machines which can't atomically increment 64-bit numbers. 
However vm statistics could potentially be updated by multiple vcpus from that vm at a time. To avoid the overhead of atomics make all vm statistics ulong such that they are 64-bit on 64-bit systems where they can be atomically incremented and are 32-bit on 32-bit systems which may not be able to atomically increment 64-bit numbers. Modify vm_stat_get() to expect ulongs. Signed-off-by: Suraj Jitindar Singh Reviewed-by: David Matlack Acked-by: Christian Borntraeger Signed-off-by: Paul Mackerras diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index de338d9..6ad21f04 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -183,15 +183,15 @@ struct kvm_vcpu_arch { }; struct kvm_vm_stat { - u32 remote_tlb_flush; + ulong remote_tlb_flush; }; struct kvm_vcpu_stat { - u32 halt_successful_poll; - u32 halt_attempted_poll; - u32 halt_poll_invalid; - u32 halt_wakeup; - u32 hvc_exit_stat; + u64 halt_successful_poll; + u64 halt_attempted_poll; + u64 halt_poll_invalid; + u64 halt_wakeup; + u64 hvc_exit_stat; u64 wfe_exit_stat; u64 wfi_exit_stat; u64 mmio_exit_user; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3eda975..bd94e67 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -290,15 +290,15 @@ struct kvm_vcpu_arch { #endif struct kvm_vm_stat { - u32 remote_tlb_flush; + ulong remote_tlb_flush; }; struct kvm_vcpu_stat { - u32 halt_successful_poll; - u32 halt_attempted_poll; - u32 halt_poll_invalid; - u32 halt_wakeup; - u32 hvc_exit_stat; + u64 halt_successful_poll; + u64 halt_attempted_poll; + u64 halt_poll_invalid; + u64 halt_wakeup; + u64 hvc_exit_stat; u64 wfe_exit_stat; u64 wfi_exit_stat; u64 mmio_exit_user; diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index b54bcad..5f488dc 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -110,32 +110,32 @@ extern atomic_t kvm_mips_instance; struct kvm_vm_stat { - u32 remote_tlb_flush; + ulong remote_tlb_flush; }; struct kvm_vcpu_stat { - u32 wait_exits; - u32 cache_exits; - u32 signal_exits; - u32 int_exits; - u32 cop_unusable_exits; - u32 tlbmod_exits; - u32 tlbmiss_ld_exits; - u32 tlbmiss_st_exits; - u32 addrerr_st_exits; - u32 addrerr_ld_exits; - u32 syscall_exits; - u32 resvd_inst_exits; - u32 break_inst_exits; - u32 trap_inst_exits; - u32 msa_fpe_exits; - u32 fpe_exits; - u32 msa_disabled_exits; - u32 flush_dcache_exits; - u32 halt_successful_poll; - u32 halt_attempted_poll; - u32 halt_poll_invalid; - u32 halt_wakeup; + u64 wait_exits; + u64 cache_exits; + u64 signal_exits; + u64 int_exits; + u64 cop_unusable_exits; + u64 tlbmod_exits; + u64 tlbmiss_ld_exits; + u64 tlbmiss_st_exits; + u64 addrerr_st_exits; + u64 addrerr_ld_exits; + u64 syscall_exits; + u64 resvd_inst_exits; + u64 break_inst_exits; + u64 trap_inst_exits; + u64 msa_fpe_exits; + u64 fpe_exits; + u64 msa_disabled_exits; + u64 flush_dcache_exits; + u64 halt_successful_poll; + u64 halt_attempted_poll; + u64 halt_poll_invalid; + u64 halt_wakeup; }; struct kvm_arch_memory_slot { diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 6ece4a8..5ec1dcf 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -97,41 +97,41 @@ struct kvmppc_vcpu_book3s; struct kvmppc_book3s_shadow_vcpu; struct kvm_vm_stat { - u32 remote_tlb_flush; + ulong remote_tlb_flush; }; struct kvm_vcpu_stat { - u32 sum_exits; - u32 mmio_exits; - 
u32 signal_exits; - u32 light_exits; + u64 sum_exits; + u64 mmio_exits; + u64 signal_exits; + u64 light_exits; /* Account for special types of light exits: */ - u32 itlb_real_miss_exits; - u32 itlb_virt_miss_exits; - u32 dtlb_real_miss_exits; - u32 dtlb_virt_miss_exits; - u32 syscall_exits; - u32 isi_exits; - u32 dsi_exits; - u32 emulated_inst_exits; - u32 dec_exits; - u32 ext_intr_exits; - u32 halt_successful_poll; - u32 halt_attempted_poll; - u32 halt_poll_invalid; - u32 halt_wakeup; - u32 dbell_exits; - u32 gdbell_exits; - u32 ld; - u32 st; + u64 itlb_real_miss_exits; + u64 itlb_virt_miss_exits; + u64 dtlb_real_miss_exits; + u64 dtlb_virt_miss_exits; + u64 syscall_exits; + u64 isi_exits; + u64 dsi_exits; + u64 emulated_inst_exits; + u64 dec_exits; + u64 ext_intr_exits; + u64 halt_successful_poll; + u64 halt_attempted_poll; + u64 halt_poll_invalid; + u64 halt_wakeup; + u64 dbell_exits; + u64 gdbell_exits; + u64 ld; + u64 st; #ifdef CONFIG_PPC_BOOK3S - u32 pf_storage; - u32 pf_instruc; - u32 sp_storage; - u32 sp_instruc; - u32 queue_intr; - u32 ld_slow; - u32 st_slow; + u64 pf_storage; + u64 pf_instruc; + u64 sp_storage; + u64 sp_instruc; + u64 queue_intr; + u64 ld_slow; + u64 st_slow; #endif }; diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 8e5daf7..2bbe675 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -245,72 +245,72 @@ struct sie_page { } __packed; struct kvm_vcpu_stat { - u32 exit_userspace; - u32 exit_null; - u32 exit_external_request; - u32 exit_external_interrupt; - u32 exit_stop_request; - u32 exit_validity; - u32 exit_instruction; - u32 exit_pei; - u32 halt_successful_poll; - u32 halt_attempted_poll; - u32 halt_poll_invalid; - u32 halt_wakeup; - u32 instruction_lctl; - u32 instruction_lctlg; - u32 instruction_stctl; - u32 instruction_stctg; - u32 exit_program_interruption; - u32 exit_instr_and_program; - u32 exit_operation_exception; - u32 deliver_external_call; - u32 deliver_emergency_signal; - u32 deliver_service_signal; - u32 deliver_virtio_interrupt; - u32 deliver_stop_signal; - u32 deliver_prefix_signal; - u32 deliver_restart_signal; - u32 deliver_program_int; - u32 deliver_io_int; - u32 exit_wait_state; - u32 instruction_pfmf; - u32 instruction_stidp; - u32 instruction_spx; - u32 instruction_stpx; - u32 instruction_stap; - u32 instruction_storage_key; - u32 instruction_ipte_interlock; - u32 instruction_stsch; - u32 instruction_chsc; - u32 instruction_stsi; - u32 instruction_stfl; - u32 instruction_tprot; - u32 instruction_sie; - u32 instruction_essa; - u32 instruction_sthyi; - u32 instruction_sigp_sense; - u32 instruction_sigp_sense_running; - u32 instruction_sigp_external_call; - u32 instruction_sigp_emergency; - u32 instruction_sigp_cond_emergency; - u32 instruction_sigp_start; - u32 instruction_sigp_stop; - u32 instruction_sigp_stop_store_status; - u32 instruction_sigp_store_status; - u32 instruction_sigp_store_adtl_status; - u32 instruction_sigp_arch; - u32 instruction_sigp_prefix; - u32 instruction_sigp_restart; - u32 instruction_sigp_init_cpu_reset; - u32 instruction_sigp_cpu_reset; - u32 instruction_sigp_unknown; - u32 diagnose_10; - u32 diagnose_44; - u32 diagnose_9c; - u32 diagnose_258; - u32 diagnose_308; - u32 diagnose_500; + u64 exit_userspace; + u64 exit_null; + u64 exit_external_request; + u64 exit_external_interrupt; + u64 exit_stop_request; + u64 exit_validity; + u64 exit_instruction; + u64 exit_pei; + u64 halt_successful_poll; + u64 halt_attempted_poll; + u64 
halt_poll_invalid; + u64 halt_wakeup; + u64 instruction_lctl; + u64 instruction_lctlg; + u64 instruction_stctl; + u64 instruction_stctg; + u64 exit_program_interruption; + u64 exit_instr_and_program; + u64 exit_operation_exception; + u64 deliver_external_call; + u64 deliver_emergency_signal; + u64 deliver_service_signal; + u64 deliver_virtio_interrupt; + u64 deliver_stop_signal; + u64 deliver_prefix_signal; + u64 deliver_restart_signal; + u64 deliver_program_int; + u64 deliver_io_int; + u64 exit_wait_state; + u64 instruction_pfmf; + u64 instruction_stidp; + u64 instruction_spx; + u64 instruction_stpx; + u64 instruction_stap; + u64 instruction_storage_key; + u64 instruction_ipte_interlock; + u64 instruction_stsch; + u64 instruction_chsc; + u64 instruction_stsi; + u64 instruction_stfl; + u64 instruction_tprot; + u64 instruction_sie; + u64 instruction_essa; + u64 instruction_sthyi; + u64 instruction_sigp_sense; + u64 instruction_sigp_sense_running; + u64 instruction_sigp_external_call; + u64 instruction_sigp_emergency; + u64 instruction_sigp_cond_emergency; + u64 instruction_sigp_start; + u64 instruction_sigp_stop; + u64 instruction_sigp_stop_store_status; + u64 instruction_sigp_store_status; + u64 instruction_sigp_store_adtl_status; + u64 instruction_sigp_arch; + u64 instruction_sigp_prefix; + u64 instruction_sigp_restart; + u64 instruction_sigp_init_cpu_reset; + u64 instruction_sigp_cpu_reset; + u64 instruction_sigp_unknown; + u64 diagnose_10; + u64 diagnose_44; + u64 diagnose_9c; + u64 diagnose_258; + u64 diagnose_308; + u64 diagnose_500; }; #define PGM_OPERATION 0x01 @@ -577,7 +577,7 @@ struct kvm_vcpu_arch { }; struct kvm_vm_stat { - u32 remote_tlb_flush; + ulong remote_tlb_flush; }; struct kvm_arch_memory_slot { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 33ae3a4..67c8f52 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -790,45 +790,45 @@ struct kvm_arch { }; struct kvm_vm_stat { - u32 mmu_shadow_zapped; - u32 mmu_pte_write; - u32 mmu_pte_updated; - u32 mmu_pde_zapped; - u32 mmu_flooded; - u32 mmu_recycled; - u32 mmu_cache_miss; - u32 mmu_unsync; - u32 remote_tlb_flush; - u32 lpages; + ulong mmu_shadow_zapped; + ulong mmu_pte_write; + ulong mmu_pte_updated; + ulong mmu_pde_zapped; + ulong mmu_flooded; + ulong mmu_recycled; + ulong mmu_cache_miss; + ulong mmu_unsync; + ulong remote_tlb_flush; + ulong lpages; }; struct kvm_vcpu_stat { - u32 pf_fixed; - u32 pf_guest; - u32 tlb_flush; - u32 invlpg; - - u32 exits; - u32 io_exits; - u32 mmio_exits; - u32 signal_exits; - u32 irq_window_exits; - u32 nmi_window_exits; - u32 halt_exits; - u32 halt_successful_poll; - u32 halt_attempted_poll; - u32 halt_poll_invalid; - u32 halt_wakeup; - u32 request_irq_exits; - u32 irq_exits; - u32 host_state_reload; - u32 efer_reload; - u32 fpu_reload; - u32 insn_emulation; - u32 insn_emulation_fail; - u32 hypercalls; - u32 irq_injections; - u32 nmi_injections; + u64 pf_fixed; + u64 pf_guest; + u64 tlb_flush; + u64 invlpg; + + u64 exits; + u64 io_exits; + u64 mmio_exits; + u64 signal_exits; + u64 irq_window_exits; + u64 nmi_window_exits; + u64 halt_exits; + u64 halt_successful_poll; + u64 halt_attempted_poll; + u64 halt_poll_invalid; + u64 halt_wakeup; + u64 request_irq_exits; + u64 irq_exits; + u64 host_state_reload; + u64 efer_reload; + u64 fpu_reload; + u64 insn_emulation; + u64 insn_emulation_fail; + u64 hypercalls; + u64 irq_injections; + u64 nmi_injections; }; struct x86_instruction_info; diff --git a/virt/kvm/kvm_main.c 
b/virt/kvm/kvm_main.c index 1950782..4171ef3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3619,7 +3619,7 @@ static int vm_stat_get_per_vm(void *data, u64 *val) { struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; - *val = *(u32 *)((void *)stat_data->kvm + stat_data->offset); + *val = *(ulong *)((void *)stat_data->kvm + stat_data->offset); return 0; } @@ -3649,7 +3649,7 @@ static int vcpu_stat_get_per_vm(void *data, u64 *val) *val = 0; kvm_for_each_vcpu(i, vcpu, stat_data->kvm) - *val += *(u32 *)((void *)vcpu + stat_data->offset); + *val += *(u64 *)((void *)vcpu + stat_data->offset); return 0; } -- cgit v0.10.2 From 2a27f514a47d39c50aaa5c07831ab35178955d47 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Tue, 2 Aug 2016 14:03:23 +1000 Subject: KVM: PPC: Implement existing and add new halt polling vcpu stats vcpu stats are used to collect information about a vcpu which can be viewed in the debugfs. For example halt_attempted_poll and halt_successful_poll are used to keep track of the number of times the vcpu attempts to and successfully polls. These stats are currently not used on powerpc. Implement incrementation of the halt_attempted_poll and halt_successful_poll vcpu stats for powerpc. Since these stats are summed over all the vcpus for all running guests it doesn't matter which vcpu they are attributed to, thus we choose the current runner vcpu of the vcore. Also add new vcpu stats: halt_poll_success_ns, halt_poll_fail_ns and halt_wait_ns to be used to accumulate the total time spend polling successfully, polling unsuccessfully and waiting respectively, and halt_successful_wait to accumulate the number of times the vcpu waits. Given that halt_poll_success_ns, halt_poll_fail_ns and halt_wait_ns are expressed in nanoseconds it is necessary to represent these as 64-bit quantities, otherwise they would overflow after only about 4 seconds. Given that the total time spend either polling or waiting will be known and the number of times that each was done, it will be possible to determine the average poll and wait times. This will give the ability to tune the kvm module parameters based on the calculated average wait and poll times. 
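As an illustration of that derivation, the averages fall out of the new counters by simple division; the sample numbers below are made up, and on a real host the inputs would come from the per-VM debugfs statistics files:

#include <stdint.h>
#include <stdio.h>

static uint64_t avg_ns(uint64_t total_ns, uint64_t events)
{
	return events ? total_ns / events : 0;
}

int main(void)
{
	/* hypothetical samples of the new vcpu stats */
	uint64_t halt_poll_success_ns = 1200000, halt_successful_poll = 300;
	uint64_t halt_wait_ns = 9000000, halt_successful_wait = 45;

	printf("avg successful poll: %llu ns\n",
	       (unsigned long long)avg_ns(halt_poll_success_ns,
					  halt_successful_poll));
	printf("avg wait:            %llu ns\n",
	       (unsigned long long)avg_ns(halt_wait_ns, halt_successful_wait));
	return 0;
}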
Signed-off-by: Suraj Jitindar Singh Reviewed-by: David Matlack Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 5ec1dcf..373003f 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -116,8 +116,12 @@ struct kvm_vcpu_stat { u64 emulated_inst_exits; u64 dec_exits; u64 ext_intr_exits; + u64 halt_poll_success_ns; + u64 halt_poll_fail_ns; + u64 halt_wait_ns; u64 halt_successful_poll; u64 halt_attempted_poll; + u64 halt_successful_wait; u64 halt_poll_invalid; u64 halt_wakeup; u64 dbell_exits; diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 47018fc..71eb8f3 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -52,8 +52,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "dec", VCPU_STAT(dec_exits) }, { "ext_intr", VCPU_STAT(ext_intr_exits) }, { "queue_intr", VCPU_STAT(queue_intr) }, + { "halt_poll_success_ns", VCPU_STAT(halt_poll_success_ns) }, + { "halt_poll_fail_ns", VCPU_STAT(halt_poll_fail_ns) }, + { "halt_wait_ns", VCPU_STAT(halt_wait_ns) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll), }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), }, + { "halt_successful_wait", VCPU_STAT(halt_successful_wait) }, { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "pf_storage", VCPU_STAT(pf_storage) }, diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 3c85c3b..30ff8ab 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2680,15 +2680,16 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc) */ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) { + ktime_t cur, start_poll, start_wait; int do_sleep = 1; - ktime_t cur, start; u64 block_ns; DECLARE_SWAITQUEUE(wait); /* Poll for pending exceptions and ceded state */ - cur = start = ktime_get(); + cur = start_poll = ktime_get(); if (vc->halt_poll_ns) { - ktime_t stop = ktime_add_ns(start, vc->halt_poll_ns); + ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns); + ++vc->runner->stat.halt_attempted_poll; vc->vcore_state = VCORE_POLLING; spin_unlock(&vc->lock); @@ -2704,8 +2705,10 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) spin_lock(&vc->lock); vc->vcore_state = VCORE_INACTIVE; - if (!do_sleep) + if (!do_sleep) { + ++vc->runner->stat.halt_successful_poll; goto out; + } } prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); @@ -2713,9 +2716,14 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) if (kvmppc_vcore_check_block(vc)) { finish_swait(&vc->wq, &wait); do_sleep = 0; + /* If we polled, count this as a successful poll */ + if (vc->halt_poll_ns) + ++vc->runner->stat.halt_successful_poll; goto out; } + start_wait = ktime_get(); + vc->vcore_state = VCORE_SLEEPING; trace_kvmppc_vcore_blocked(vc, 0); spin_unlock(&vc->lock); @@ -2724,11 +2732,29 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) spin_lock(&vc->lock); vc->vcore_state = VCORE_INACTIVE; trace_kvmppc_vcore_blocked(vc, 1); + ++vc->runner->stat.halt_successful_wait; cur = ktime_get(); out: - block_ns = ktime_to_ns(cur) - ktime_to_ns(start); + block_ns = ktime_to_ns(cur) - ktime_to_ns(start_poll); + + /* Attribute wait time */ + if (do_sleep) { + vc->runner->stat.halt_wait_ns += + ktime_to_ns(cur) - ktime_to_ns(start_wait); + /* Attribute failed poll time */ + if (vc->halt_poll_ns) + vc->runner->stat.halt_poll_fail_ns += + ktime_to_ns(start_wait) - + 
ktime_to_ns(start_poll); + } else { + /* Attribute successful poll time */ + if (vc->halt_poll_ns) + vc->runner->stat.halt_poll_success_ns += + ktime_to_ns(cur) - + ktime_to_ns(start_poll); + } /* Adjust poll time */ if (halt_poll_max_ns) { -- cgit v0.10.2 From d6404ded303600a253e1770fa6670236adf12ce9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Oct 2015 16:47:36 +0200 Subject: KVM: s390: factor out actual delivery of machine checks Let's factor this out to prepare for bigger changes. Reorder to calls to match the logical order given in the PoP. Reviewed-by: Cornelia Huck Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 24524c0..6e9442a 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -400,12 +400,40 @@ static int __must_check __deliver_pfault_init(struct kvm_vcpu *vcpu) return rc ? -EFAULT : 0; } +static int __write_machine_check(struct kvm_vcpu *vcpu, + struct kvm_s390_mchk_info *mchk) +{ + unsigned long ext_sa_addr; + int rc; + + /* Extended save area */ + rc = read_guest_lc(vcpu, __LC_VX_SAVE_AREA_ADDR, &ext_sa_addr, + sizeof(unsigned long)); + rc |= kvm_s390_vcpu_store_adtl_status(vcpu, ext_sa_addr); + + /* General interruption information */ + rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= put_guest_lc(vcpu, mchk->mcic, (u64 __user *) __LC_MCCK_CODE); + + /* Register-save areas */ + rc |= kvm_s390_vcpu_store_status(vcpu, KVM_S390_STORE_STATUS_PREFIXED); + + /* Extended interruption information */ + rc |= put_guest_lc(vcpu, mchk->failing_storage_address, + (u64 __user *) __LC_MCCK_FAIL_STOR_ADDR); + rc |= write_guest_lc(vcpu, __LC_PSW_SAVE_AREA, &mchk->fixed_logout, + sizeof(mchk->fixed_logout)); + return rc ? -EFAULT : 0; +} + static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu) { struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; struct kvm_s390_mchk_info mchk = {}; - unsigned long adtl_status_addr; int deliver = 0; int rc = 0; @@ -446,29 +474,9 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu) trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_MCHK, mchk.cr14, mchk.mcic); - - rc = kvm_s390_vcpu_store_status(vcpu, - KVM_S390_STORE_STATUS_PREFIXED); - rc |= read_guest_lc(vcpu, __LC_VX_SAVE_AREA_ADDR, - &adtl_status_addr, - sizeof(unsigned long)); - rc |= kvm_s390_vcpu_store_adtl_status(vcpu, - adtl_status_addr); - rc |= put_guest_lc(vcpu, mchk.mcic, - (u64 __user *) __LC_MCCK_CODE); - rc |= put_guest_lc(vcpu, mchk.failing_storage_address, - (u64 __user *) __LC_MCCK_FAIL_STOR_ADDR); - rc |= write_guest_lc(vcpu, __LC_PSW_SAVE_AREA, - &mchk.fixed_logout, - sizeof(mchk.fixed_logout)); - rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW, - &vcpu->arch.sie_block->gpsw, - sizeof(psw_t)); - rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW, - &vcpu->arch.sie_block->gpsw, - sizeof(psw_t)); + rc = __write_machine_check(vcpu, &mchk); } - return rc ? 
-EFAULT : 0; + return rc; } static int __must_check __deliver_restart(struct kvm_vcpu *vcpu) -- cgit v0.10.2 From 0319dae6770fe3727c2d2cd03ca2c5d1b67f31b9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 3 Aug 2016 11:18:57 +0200 Subject: KVM: s390: split store status and machine check handling Store status writes the prefix which is not to be done by a machine check. Also, the psw is stored and later on overwritten by the failing-storage address, which looks strange at first sight. Store status and machine check handling look similar, but they are actually two different things. Reviewed-by: Cornelia Huck Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 6e9442a..84d6dc6 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "kvm-s390.h" #include "gaccess.h" #include "trace-s390.h" @@ -404,14 +405,20 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, struct kvm_s390_mchk_info *mchk) { unsigned long ext_sa_addr; + freg_t fprs[NUM_FPRS]; int rc; + /* take care of lazy register loading via vcpu load/put */ + save_fpu_regs(); + save_access_regs(vcpu->run->s.regs.acrs); + /* Extended save area */ rc = read_guest_lc(vcpu, __LC_VX_SAVE_AREA_ADDR, &ext_sa_addr, sizeof(unsigned long)); rc |= kvm_s390_vcpu_store_adtl_status(vcpu, ext_sa_addr); /* General interruption information */ + rc |= put_guest_lc(vcpu, 1, (u8 __user *) __LC_AR_MODE_ID); rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW, @@ -419,7 +426,27 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, rc |= put_guest_lc(vcpu, mchk->mcic, (u64 __user *) __LC_MCCK_CODE); /* Register-save areas */ - rc |= kvm_s390_vcpu_store_status(vcpu, KVM_S390_STORE_STATUS_PREFIXED); + if (MACHINE_HAS_VX) { + convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs); + rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA, fprs, 128); + } else { + rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA, + vcpu->run->s.regs.fprs, 128); + } + rc |= write_guest_lc(vcpu, __LC_GPREGS_SAVE_AREA, + vcpu->run->s.regs.gprs, 128); + rc |= put_guest_lc(vcpu, current->thread.fpu.fpc, + (u32 __user *) __LC_FP_CREG_SAVE_AREA); + rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->todpr, + (u32 __user *) __LC_TOD_PROGREG_SAVE_AREA); + rc |= put_guest_lc(vcpu, kvm_s390_get_cpu_timer(vcpu), + (u64 __user *) __LC_CPU_TIMER_SAVE_AREA); + rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->ckc >> 8, + (u64 __user *) __LC_CLOCK_COMP_SAVE_AREA); + rc |= write_guest_lc(vcpu, __LC_AREGS_SAVE_AREA, + &vcpu->run->s.regs.acrs, 64); + rc |= write_guest_lc(vcpu, __LC_CREGS_SAVE_AREA, + &vcpu->arch.sie_block->gcr, 128); /* Extended interruption information */ rc |= put_guest_lc(vcpu, mchk->failing_storage_address, -- cgit v0.10.2 From ff5dc1492a11a6c90955ab34063be1cddc54ec00 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Oct 2015 16:57:56 +0200 Subject: KVM: s390: fix delivery of vector regs during machine checks Vector registers are only to be stored if the facility is available and if the guest has set up the machine check extended save area. If anything goes wrong while writing the vector registers, the vector registers are to be marked as invalid. Please note that we are allowed to write the registers although they are marked as invalid. 
Machine checks and "store status" SIGP orders are two different concepts, let's correctly separate these. As the SIGP part is completely handled in user space, we can drop it. This patch is based on a patch from Cornelia Huck. Reviewed-by: Cornelia Huck Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 84d6dc6..c0ef625 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "kvm-s390.h" #include "gaccess.h" #include "trace-s390.h" @@ -406,8 +407,10 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, { unsigned long ext_sa_addr; freg_t fprs[NUM_FPRS]; + union mci mci; int rc; + mci.val = mchk->mcic; /* take care of lazy register loading via vcpu load/put */ save_fpu_regs(); save_access_regs(vcpu->run->s.regs.acrs); @@ -415,7 +418,15 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, /* Extended save area */ rc = read_guest_lc(vcpu, __LC_VX_SAVE_AREA_ADDR, &ext_sa_addr, sizeof(unsigned long)); - rc |= kvm_s390_vcpu_store_adtl_status(vcpu, ext_sa_addr); + /* Only bits 0-53 are used for address formation */ + ext_sa_addr &= ~0x3ffUL; + if (!rc && mci.vr && ext_sa_addr && test_kvm_facility(vcpu->kvm, 129)) { + if (write_guest_abs(vcpu, ext_sa_addr, vcpu->run->s.regs.vrs, + 512)) + mci.vr = 0; + } else { + mci.vr = 0; + } /* General interruption information */ rc |= put_guest_lc(vcpu, 1, (u8 __user *) __LC_AR_MODE_ID); @@ -423,7 +434,7 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= put_guest_lc(vcpu, mchk->mcic, (u64 __user *) __LC_MCCK_CODE); + rc |= put_guest_lc(vcpu, mci.val, (u64 __user *) __LC_MCCK_CODE); /* Register-save areas */ if (MACHINE_HAS_VX) { diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index f142215..dead20a 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2837,38 +2837,6 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) return kvm_s390_store_status_unloaded(vcpu, addr); } -/* - * store additional status at address - */ -int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu, - unsigned long gpa) -{ - /* Only bits 0-53 are used for address formation */ - if (!(gpa & ~0x3ff)) - return 0; - - return write_guest_abs(vcpu, gpa & ~0x3ff, - (void *)&vcpu->run->s.regs.vrs, 512); -} - -int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr) -{ - if (!test_kvm_facility(vcpu->kvm, 129)) - return 0; - - /* - * The guest VXRS are in the host VXRs due to the lazy - * copying in vcpu load/put. We can simply call save_fpu_regs() - * to save the current register state because we are in the - * middle of a load/put cycle. - * - * Let's update our copies before we save it into the save area. 
- */ - save_fpu_regs(); - - return kvm_s390_store_adtl_status_unloaded(vcpu, addr); -} - static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu) { kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index b843286..c3d4f16 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -273,10 +273,7 @@ int handle_sthyi(struct kvm_vcpu *vcpu); void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod); long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr); -int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu, - unsigned long addr); int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr); -int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr); void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu); -- cgit v0.10.2 From 8953fb08abf8bf9fd2ee5db148ce52a0f36c284f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 3 Aug 2016 12:25:08 +0200 Subject: KVM: s390: write external damage code on machine checks Let's also write the external damage code already provided by struct kvm_s390_mchk_info. Reviewed-by: Cornelia Huck Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 1f95cc1..f3df9e0 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -125,6 +125,7 @@ int main(void) OFFSET(__LC_STFL_FAC_LIST, lowcore, stfl_fac_list); OFFSET(__LC_STFLE_FAC_LIST, lowcore, stfle_fac_list); OFFSET(__LC_MCCK_CODE, lowcore, mcck_interruption_code); + OFFSET(__LC_EXT_DAMAGE_CODE, lowcore, external_damage_code); OFFSET(__LC_MCCK_FAIL_STOR_ADDR, lowcore, failing_storage_address); OFFSET(__LC_LAST_BREAK, lowcore, breaking_event_addr); OFFSET(__LC_RST_OLD_PSW, lowcore, restart_old_psw); diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index c0ef625..353e0b7 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -460,6 +460,8 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, &vcpu->arch.sie_block->gcr, 128); /* Extended interruption information */ + rc |= put_guest_lc(vcpu, mchk->ext_damage_code, + (u32 __user *) __LC_EXT_DAMAGE_CODE); rc |= put_guest_lc(vcpu, mchk->failing_storage_address, (u64 __user *) __LC_MCCK_FAIL_STOR_ADDR); rc |= write_guest_lc(vcpu, __LC_PSW_SAVE_AREA, &mchk->fixed_logout, -- cgit v0.10.2 From b1ffffbd0f3bf362e578ee577437a8df5e06e495 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 27 May 2016 15:24:33 +0200 Subject: KVM: s390: guestdbg: separate defines for per code Let's avoid working with the PER_EVENT* defines, used for control register manipulation, when checking the u8 PER code. Introduce separate defines based on the existing defines. 
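As a rough, self-contained illustration of the shift-by-24 derivation (standalone C; the PER_EVENT_* values below are stand-ins mirroring the s390 control-register masks, not copied from the kernel headers):

#include <stdio.h>

/* Stand-in values mirroring the PER_EVENT_* control-register masks; the
 * patch derives u8 PER_CODE_* masks from them by shifting right by 24 so
 * that the SIE block's one-byte "perc" field can be tested directly. */
#define PER_EVENT_BRANCH	0x80000000UL
#define PER_EVENT_IFETCH	0x40000000UL

#define PER_CODE_BRANCH		(PER_EVENT_BRANCH >> 24)	/* 0x80 */
#define PER_CODE_IFETCH		(PER_EVENT_IFETCH >> 24)	/* 0x40 */

#define per_bp_event(code)	((code) & (PER_CODE_IFETCH | PER_CODE_BRANCH))

int main(void)
{
	unsigned char perc = 0x40;	/* example PER code byte */

	if (per_bp_event(perc))
		printf("breakpoint-style PER event (code %#x)\n", perc);
	return 0;
}
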
Reviewed-by: Eric Farman Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index 31a0533..d1f8241 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -382,14 +382,20 @@ void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu) vcpu->guest_debug &= ~KVM_GUESTDBG_EXIT_PENDING; } +#define PER_CODE_MASK (PER_EVENT_MASK >> 24) +#define PER_CODE_BRANCH (PER_EVENT_BRANCH >> 24) +#define PER_CODE_IFETCH (PER_EVENT_IFETCH >> 24) +#define PER_CODE_STORE (PER_EVENT_STORE >> 24) +#define PER_CODE_STORE_REAL (PER_EVENT_STORE_REAL >> 24) + #define per_bp_event(code) \ - (code & (PER_EVENT_IFETCH | PER_EVENT_BRANCH)) + (code & (PER_CODE_IFETCH | PER_CODE_BRANCH)) #define per_write_wp_event(code) \ - (code & (PER_EVENT_STORE | PER_EVENT_STORE_REAL)) + (code & (PER_CODE_STORE | PER_CODE_STORE_REAL)) static int debug_exit_required(struct kvm_vcpu *vcpu) { - u32 perc = (vcpu->arch.sie_block->perc << 24); + u8 perc = vcpu->arch.sie_block->perc; struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch; struct kvm_hw_wp_info_arch *wp_info = NULL; struct kvm_hw_bp_info_arch *bp_info = NULL; @@ -444,7 +450,7 @@ int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu) const u8 ilen = kvm_s390_get_ilen(vcpu); struct kvm_s390_pgm_info pgm_info = { .code = PGM_PER, - .per_code = PER_EVENT_IFETCH >> 24, + .per_code = PER_CODE_IFETCH, .per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen), }; @@ -458,33 +464,33 @@ int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu) static void filter_guest_per_event(struct kvm_vcpu *vcpu) { - u32 perc = vcpu->arch.sie_block->perc << 24; + const u8 perc = vcpu->arch.sie_block->perc; u64 peraddr = vcpu->arch.sie_block->peraddr; u64 addr = vcpu->arch.sie_block->gpsw.addr; u64 cr9 = vcpu->arch.sie_block->gcr[9]; u64 cr10 = vcpu->arch.sie_block->gcr[10]; u64 cr11 = vcpu->arch.sie_block->gcr[11]; /* filter all events, demanded by the guest */ - u32 guest_perc = perc & cr9 & PER_EVENT_MASK; + u8 guest_perc = perc & (cr9 >> 24) & PER_CODE_MASK; if (!guest_per_enabled(vcpu)) guest_perc = 0; /* filter "successful-branching" events */ - if (guest_perc & PER_EVENT_BRANCH && + if (guest_perc & PER_CODE_BRANCH && cr9 & PER_CONTROL_BRANCH_ADDRESS && !in_addr_range(addr, cr10, cr11)) - guest_perc &= ~PER_EVENT_BRANCH; + guest_perc &= ~PER_CODE_BRANCH; /* filter "instruction-fetching" events */ - if (guest_perc & PER_EVENT_IFETCH && + if (guest_perc & PER_CODE_IFETCH && !in_addr_range(peraddr, cr10, cr11)) - guest_perc &= ~PER_EVENT_IFETCH; + guest_perc &= ~PER_CODE_IFETCH; /* All other PER events will be given to the guest */ /* TODO: Check altered address/address space */ - vcpu->arch.sie_block->perc = guest_perc >> 24; + vcpu->arch.sie_block->perc = guest_perc; if (!guest_perc) vcpu->arch.sie_block->iprcc &= ~PGM_PER; -- cgit v0.10.2 From c14b88d76624e02b9895914b85b9d8b2c984563c Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 29 Jul 2016 11:36:04 +0200 Subject: KVM: s390: gaccess: simplify translation exception handling The payload data for protection exceptions is a superset of the payload of other translation exceptions. Let's set the additional flags and use a fall through to minimize code duplication. 
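A minimal standalone sketch of the fall-through structure (illustrative names and field layout, not the kernel's), assuming the protection case only needs to set its extra bits before sharing the common payload code:

#include <stdio.h>

enum exc_code { EXC_PROTECTION, EXC_PAGE_TRANSLATION };

struct tec_bits {
	int b61;		/* extra bit only set for protection exceptions */
	unsigned long addr;	/* payload shared by all translation exceptions */
};

static void fill_tec(struct tec_bits *tec, enum exc_code code, unsigned long gva)
{
	switch (code) {
	case EXC_PROTECTION:
		tec->b61 = 1;
		/* FALL THROUGH - the remaining payload is the common part */
	case EXC_PAGE_TRANSLATION:
		tec->addr = gva >> 12;
		break;
	}
}

int main(void)
{
	struct tec_bits tec = { 0 };

	fill_tec(&tec, EXC_PROTECTION, 0x12345000UL);
	printf("b61=%d addr=%#lx\n", tec.b61, tec.addr);
	return 0;
}
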
Signed-off-by: Janosch Frank Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 5420020..4aa8a7e 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -495,6 +495,18 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code; switch (code) { + case PGM_PROTECTION: + switch (prot) { + case PROT_TYPE_ALC: + tec->b60 = 1; + /* FALL THROUGH */ + case PROT_TYPE_DAT: + tec->b61 = 1; + break; + default: /* LA and KEYC set b61 to 0, other params undefined */ + return code; + } + /* FALL THROUGH */ case PGM_ASCE_TYPE: case PGM_PAGE_TRANSLATION: case PGM_REGION_FIRST_TRANS: @@ -504,8 +516,7 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, /* * op_access_id only applies to MOVE_PAGE -> set bit 61 * exc_access_id has to be set to 0 for some instructions. Both - * cases have to be handled by the caller. We can always store - * exc_access_id, as it is undefined for non-ar cases. + * cases have to be handled by the caller. */ tec->addr = gva >> PAGE_SHIFT; tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH; @@ -516,25 +527,13 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, case PGM_ASTE_VALIDITY: case PGM_ASTE_SEQUENCE: case PGM_EXTENDED_AUTHORITY: + /* + * We can always store exc_access_id, as it is + * undefined for non-ar cases. It is undefined for + * most DAT protection exceptions. + */ pgm->exc_access_id = ar; break; - case PGM_PROTECTION: - switch (prot) { - case PROT_TYPE_ALC: - tec->b60 = 1; - /* FALL THROUGH */ - case PROT_TYPE_DAT: - tec->b61 = 1; - tec->addr = gva >> PAGE_SHIFT; - tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH; - tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as; - /* exc_access_id is undefined for most cases */ - pgm->exc_access_id = ar; - break; - default: /* LA and KEYC set b61 to 0, other params undefined */ - break; - } - break; } return code; } -- cgit v0.10.2 From 714848026531043ae06ca7bcd4a852c8bb8348c7 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 13 May 2016 16:41:41 +0200 Subject: KVM: arm/arm64: Add VGICv3 save/restore API documentation Factor out the GICv3 and ITS-specific documentation into a separate documentation file. Add description for how to access distributor, redistributor, and CPU interface registers for GICv3 in this new file, and add a group for accessing level triggered IRQ information for GICv3 as well. Reviewed-by: Marc Zyngier Reviewed-by: Peter Maydell Signed-off-by: Christoffer Dall diff --git a/Documentation/virtual/kvm/devices/arm-vgic-its.txt b/Documentation/virtual/kvm/devices/arm-vgic-its.txt new file mode 100644 index 0000000..6081a5b --- /dev/null +++ b/Documentation/virtual/kvm/devices/arm-vgic-its.txt @@ -0,0 +1,38 @@ +ARM Virtual Interrupt Translation Service (ITS) +=============================================== + +Device types supported: + KVM_DEV_TYPE_ARM_VGIC_ITS ARM Interrupt Translation Service Controller + +The ITS allows MSI(-X) interrupts to be injected into guests. This extension is +optional. Creating a virtual ITS controller also requires a host GICv3 (see +arm-vgic-v3.txt), but does not depend on having physical ITS controllers. + +There can be multiple ITS controllers per guest, each of them has to have +a separate, non-overlapping MMIO region. 
+ + +Groups: + KVM_DEV_ARM_VGIC_GRP_ADDR + Attributes: + KVM_VGIC_ITS_ADDR_TYPE (rw, 64-bit) + Base address in the guest physical address space of the GICv3 ITS + control register frame. + This address needs to be 64K aligned and the region covers 128K. + Errors: + -E2BIG: Address outside of addressable IPA range + -EINVAL: Incorrectly aligned address + -EEXIST: Address already configured + -EFAULT: Invalid user pointer for attr->addr. + -ENODEV: Incorrect attribute or the ITS is not supported. + + + KVM_DEV_ARM_VGIC_GRP_CTRL + Attributes: + KVM_DEV_ARM_VGIC_CTRL_INIT + request the initialization of the ITS, no additional parameter in + kvm_device_attr.addr. + Errors: + -ENXIO: ITS not properly configured as required prior to setting + this attribute + -ENOMEM: Memory shortage when allocating ITS internal data diff --git a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt new file mode 100644 index 0000000..9348b3c --- /dev/null +++ b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt @@ -0,0 +1,206 @@ +ARM Virtual Generic Interrupt Controller v3 and later (VGICv3) +============================================================== + + +Device types supported: + KVM_DEV_TYPE_ARM_VGIC_V3 ARM Generic Interrupt Controller v3.0 + +Only one VGIC instance may be instantiated through this API. The created VGIC +will act as the VM interrupt controller, requiring emulated user-space devices +to inject interrupts to the VGIC instead of directly to CPUs. It is not +possible to create both a GICv3 and GICv2 on the same VM. + +Creating a guest GICv3 device requires a host GICv3 as well. + + +Groups: + KVM_DEV_ARM_VGIC_GRP_ADDR + Attributes: + KVM_VGIC_V3_ADDR_TYPE_DIST (rw, 64-bit) + Base address in the guest physical address space of the GICv3 distributor + register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. + This address needs to be 64K aligned and the region covers 64 KByte. + + KVM_VGIC_V3_ADDR_TYPE_REDIST (rw, 64-bit) + Base address in the guest physical address space of the GICv3 + redistributor register mappings. There are two 64K pages for each + VCPU and all of the redistributor pages are contiguous. + Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. + This address needs to be 64K aligned. + Errors: + -E2BIG: Address outside of addressable IPA range + -EINVAL: Incorrectly aligned address + -EEXIST: Address already configured + -ENXIO: The group or attribute is unknown/unsupported for this device + or hardware support is missing. + -EFAULT: Invalid user pointer for attr->addr. + + + + KVM_DEV_ARM_VGIC_GRP_DIST_REGS + KVM_DEV_ARM_VGIC_GRP_REDIST_REGS + Attributes: + The attr field of kvm_device_attr encodes two values: + bits: | 63 .... 32 | 31 .... 0 | + values: | mpidr | offset | + + All distributor regs are (rw, 32-bit) and kvm_device_attr.addr points to a + __u32 value. 64-bit registers must be accessed by separately accessing the + lower and higher word. + + Writes to read-only registers are ignored by the kernel. + + KVM_DEV_ARM_VGIC_GRP_DIST_REGS accesses the main distributor registers. + KVM_DEV_ARM_VGIC_GRP_REDIST_REGS accesses the redistributor of the CPU + specified by the mpidr. + + The offset is relative to the "[Re]Distributor base address" as defined + in the GICv3/4 specs. Getting or setting such a register has the same + effect as reading or writing the register on real hardware, except for the + following registers: GICD_STATUSR, GICR_STATUSR, GICD_ISPENDR, + GICR_ISPENDR0, GICD_ICPENDR, and GICR_ICPENDR0. 
These registers behave + differently when accessed via this interface compared to their + architecturally defined behavior to allow software a full view of the + VGIC's internal state. + + The mpidr field is used to specify which + redistributor is accessed. The mpidr is ignored for the distributor. + + The mpidr encoding is based on the affinity information in the + architecture defined MPIDR, and the field is encoded as follows: + | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 | + | Aff3 | Aff2 | Aff1 | Aff0 | + + Note that distributor fields are not banked, but return the same value + regardless of the mpidr used to access the register. + + The GICD_STATUSR and GICR_STATUSR registers are architecturally defined such + that a write of a clear bit has no effect, whereas a write with a set bit + clears that value. To allow userspace to freely set the values of these two + registers, setting the attributes with the register offsets for these two + registers simply sets the non-reserved bits to the value written. + + + Accesses (reads and writes) to the GICD_ISPENDR register region and + GICR_ISPENDR0 registers get/set the value of the latched pending state for + the interrupts. + + This is identical to the value returned by a guest read from ISPENDR for an + edge triggered interrupt, but may differ for level triggered interrupts. + For edge triggered interrupts, once an interrupt becomes pending (whether + because of an edge detected on the input line or because of a guest write + to ISPENDR) this state is "latched", and only cleared when either the + interrupt is activated or when the guest writes to ICPENDR. A level + triggered interrupt may be pending either because the level input is held + high by a device, or because of a guest write to the ISPENDR register. Only + ISPENDR writes are latched; if the device lowers the line level then the + interrupt is no longer pending unless the guest also wrote to ISPENDR, and + conversely writes to ICPENDR or activations of the interrupt do not clear + the pending status if the line level is still being held high. (These + rules are documented in the GICv3 specification descriptions of the ICPENDR + and ISPENDR registers.) For a level triggered interrupt the value accessed + here is that of the latch which is set by ISPENDR and cleared by ICPENDR or + interrupt activation, whereas the value returned by a guest read from + ISPENDR is the logical OR of the latch value and the input line level. + + Raw access to the latch state is provided to userspace so that it can save + and restore the entire GIC internal state (which is defined by the + combination of the current input line level and the latch state, and cannot + be deduced from purely the line level and the value of the ISPENDR + registers). + + Accesses to GICD_ICPENDR register region and GICR_ICPENDR0 registers have + RAZ/WI semantics, meaning that reads always return 0 and writes are always + ignored. + + Errors: + -ENXIO: Getting or setting this register is not yet supported + -EBUSY: One or more VCPUs are running + + + KVM_DEV_ARM_VGIC_CPU_SYSREGS + Attributes: + The attr field of kvm_device_attr encodes two values: + bits: | 63 .... 32 | 31 .... 16 | 15 .... 0 | + values: | mpidr | RES | instr | + + The mpidr field encodes the CPU ID based on the affinity information in the + architecture defined MPIDR, and the field is encoded as follows: + | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 
32 | + | Aff3 | Aff2 | Aff1 | Aff0 | + + The instr field encodes the system register to access based on the fields + defined in the A64 instruction set encoding for system register access + (RES means the bits are reserved for future use and should be zero): + + | 15 ... 14 | 13 ... 11 | 10 ... 7 | 6 ... 3 | 2 ... 0 | + | Op 0 | Op1 | CRn | CRm | Op2 | + + All system regs accessed through this API are (rw, 64-bit) and + kvm_device_attr.addr points to a __u64 value. + + KVM_DEV_ARM_VGIC_CPU_SYSREGS accesses the CPU interface registers for the + CPU specified by the mpidr field. + + Errors: + -ENXIO: Getting or setting this register is not yet supported + -EBUSY: VCPU is running + -EINVAL: Invalid mpidr supplied + + + KVM_DEV_ARM_VGIC_GRP_NR_IRQS + Attributes: + A value describing the number of interrupts (SGI, PPI and SPI) for + this GIC instance, ranging from 64 to 1024, in increments of 32. + + kvm_device_attr.addr points to a __u32 value. + + Errors: + -EINVAL: Value set is out of the expected range + -EBUSY: Value has already be set. + + + KVM_DEV_ARM_VGIC_GRP_CTRL + Attributes: + KVM_DEV_ARM_VGIC_CTRL_INIT + request the initialization of the VGIC, no additional parameter in + kvm_device_attr.addr. + Errors: + -ENXIO: VGIC not properly configured as required prior to calling + this attribute + -ENODEV: no online VCPU + -ENOMEM: memory shortage when allocating vgic internal data + + + KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO + Attributes: + The attr field of kvm_device_attr encodes the following values: + bits: | 63 .... 32 | 31 .... 10 | 9 .... 0 | + values: | mpidr | info | vINTID | + + The vINTID specifies which set of IRQs is reported on. + + The info field specifies which information userspace wants to get or set + using this interface. Currently we support the following info values: + + VGIC_LEVEL_INFO_LINE_LEVEL: + Get/Set the input level of the IRQ line for a set of 32 contiguously + numbered interrupts. + vINTID must be a multiple of 32. + + kvm_device_attr.addr points to a __u32 value which will contain a + bitmap where a set bit means the interrupt level is asserted. + + Bit[n] indicates the status for interrupt vINTID + n. + + SGIs and any interrupt with a higher ID than the number of interrupts + supported, will be RAZ/WI. LPIs are always edge-triggered and are + therefore not supported by this interface. + + PPIs are reported per VCPU as specified in the mpidr field, and SPIs are + reported with the same value regardless of the mpidr specified. + + The mpidr field encodes the CPU ID based on the affinity information in the + architecture defined MPIDR, and the field is encoded as follows: + | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 | + | Aff3 | Aff2 | Aff1 | Aff0 | diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt index 89182f8..76e61c8 100644 --- a/Documentation/virtual/kvm/devices/arm-vgic.txt +++ b/Documentation/virtual/kvm/devices/arm-vgic.txt @@ -1,24 +1,19 @@ -ARM Virtual Generic Interrupt Controller (VGIC) -=============================================== +ARM Virtual Generic Interrupt Controller v2 (VGIC) +================================================== Device types supported: KVM_DEV_TYPE_ARM_VGIC_V2 ARM Generic Interrupt Controller v2.0 - KVM_DEV_TYPE_ARM_VGIC_V3 ARM Generic Interrupt Controller v3.0 - KVM_DEV_TYPE_ARM_VGIC_ITS ARM Interrupt Translation Service Controller -Only one VGIC instance of the V2/V3 types above may be instantiated through -either this API or the legacy KVM_CREATE_IRQCHIP api. 
The created VGIC will -act as the VM interrupt controller, requiring emulated user-space devices to -inject interrupts to the VGIC instead of directly to CPUs. +Only one VGIC instance may be instantiated through either this API or the +legacy KVM_CREATE_IRQCHIP API. The created VGIC will act as the VM interrupt +controller, requiring emulated user-space devices to inject interrupts to the +VGIC instead of directly to CPUs. -Creating a guest GICv3 device requires a host GICv3 as well. -GICv3 implementations with hardware compatibility support allow a guest GICv2 -as well. +GICv3 implementations with hardware compatibility support allow creating a +guest GICv2 through this interface. For information on creating a guest GICv3 +device and guest ITS devices, see arm-vgic-v3.txt. It is not possible to +create both a GICv3 and GICv2 device on the same VM. -Creating a virtual ITS controller requires a host GICv3 (but does not depend -on having physical ITS controllers). -There can be multiple ITS controllers per guest, each of them has to have -a separate, non-overlapping MMIO region. Groups: KVM_DEV_ARM_VGIC_GRP_ADDR @@ -32,26 +27,13 @@ Groups: Base address in the guest physical address space of the GIC virtual cpu interface register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V2. This address needs to be 4K aligned and the region covers 4 KByte. - - KVM_VGIC_V3_ADDR_TYPE_DIST (rw, 64-bit) - Base address in the guest physical address space of the GICv3 distributor - register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. - This address needs to be 64K aligned and the region covers 64 KByte. - - KVM_VGIC_V3_ADDR_TYPE_REDIST (rw, 64-bit) - Base address in the guest physical address space of the GICv3 - redistributor register mappings. There are two 64K pages for each - VCPU and all of the redistributor pages are contiguous. - Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. - This address needs to be 64K aligned. - - KVM_VGIC_V3_ADDR_TYPE_ITS (rw, 64-bit) - Base address in the guest physical address space of the GICv3 ITS - control register frame. The ITS allows MSI(-X) interrupts to be - injected into guests. This extension is optional. If the kernel - does not support the ITS, the call returns -ENODEV. - Only valid for KVM_DEV_TYPE_ARM_VGIC_ITS. - This address needs to be 64K aligned and the region covers 128K. + Errors: + -E2BIG: Address outside of addressable IPA range + -EINVAL: Incorrectly aligned address + -EEXIST: Address already configured + -ENXIO: The group or attribute is unknown/unsupported for this device + or hardware support is missing. + -EFAULT: Invalid user pointer for attr->addr. KVM_DEV_ARM_VGIC_GRP_DIST_REGS Attributes: -- cgit v0.10.2 From ba7b9169b52038ce017782479c6a24cd37f34ae5 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 16 Aug 2016 16:48:20 +0200 Subject: KVM: arm/arm64: Factor out vgic_attr_regs_access functionality As we are about to deal with multiple data types and situations where the vgic should not be initialized when doing userspace accesses on the register attributes, factor out the functionality of vgic_attr_regs_access into smaller bits which can be reused by a new function later. 
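The try-lock-everything-or-roll-back idea being factored out can be sketched in plain userspace C (pthreads here purely for illustration; the kernel uses vcpu->mutex and mutex_trylock()):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_VCPUS 4

static pthread_mutex_t vcpu_lock[NR_VCPUS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* Unlock vcpus from index last_locked down to 0. */
static void unlock_vcpus(int last_locked)
{
	for (; last_locked >= 0; last_locked--)
		pthread_mutex_unlock(&vcpu_lock[last_locked]);
}

/* Returns true if all vcpus were locked, false otherwise. */
static bool lock_all_vcpus(void)
{
	int c;

	for (c = 0; c < NR_VCPUS; c++) {
		if (pthread_mutex_trylock(&vcpu_lock[c])) {
			unlock_vcpus(c - 1);	/* roll back partial progress */
			return false;
		}
	}
	return true;
}

int main(void)
{
	if (lock_all_vcpus()) {
		printf("all vcpus locked, safe to touch shared state\n");
		unlock_vcpus(NR_VCPUS - 1);
	}
	return 0;
}
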
Signed-off-by: Christoffer Dall Reviewed-by: Eric Auger diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c index 1813f93..19fa331 100644 --- a/virt/kvm/arm/vgic/vgic-kvm-device.c +++ b/virt/kvm/arm/vgic/vgic-kvm-device.c @@ -233,6 +233,67 @@ int kvm_register_vgic_device(unsigned long type) return ret; } +struct vgic_reg_attr { + struct kvm_vcpu *vcpu; + gpa_t addr; +}; + +static int parse_vgic_v2_attr(struct kvm_device *dev, + struct kvm_device_attr *attr, + struct vgic_reg_attr *reg_attr) +{ + int cpuid; + + cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >> + KVM_DEV_ARM_VGIC_CPUID_SHIFT; + + if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) + return -EINVAL; + + reg_attr->vcpu = kvm_get_vcpu(dev->kvm, cpuid); + reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; + + return 0; +} + +/* unlocks vcpus from @vcpu_lock_idx and smaller */ +static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx) +{ + struct kvm_vcpu *tmp_vcpu; + + for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { + tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); + mutex_unlock(&tmp_vcpu->mutex); + } +} + +static void unlock_all_vcpus(struct kvm *kvm) +{ + unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1); +} + +/* Returns true if all vcpus were locked, false otherwise */ +static bool lock_all_vcpus(struct kvm *kvm) +{ + struct kvm_vcpu *tmp_vcpu; + int c; + + /* + * Any time a vcpu is run, vcpu_load is called which tries to grab the + * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure + * that no other VCPUs are run and fiddle with the vgic state while we + * access it. + */ + kvm_for_each_vcpu(c, tmp_vcpu, kvm) { + if (!mutex_trylock(&tmp_vcpu->mutex)) { + unlock_vcpus(kvm, c - 1); + return false; + } + } + + return true; +} + /** vgic_attr_regs_access: allows user space to read/write VGIC registers * * @dev: kvm device handle @@ -245,15 +306,17 @@ static int vgic_attr_regs_access(struct kvm_device *dev, struct kvm_device_attr *attr, u32 *reg, bool is_write) { + struct vgic_reg_attr reg_attr; gpa_t addr; - int cpuid, ret, c; - struct kvm_vcpu *vcpu, *tmp_vcpu; - int vcpu_lock_idx = -1; + struct kvm_vcpu *vcpu; + int ret; - cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >> - KVM_DEV_ARM_VGIC_CPUID_SHIFT; - vcpu = kvm_get_vcpu(dev->kvm, cpuid); - addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; + ret = parse_vgic_v2_attr(dev, attr, ®_attr); + if (ret) + return ret; + + vcpu = reg_attr.vcpu; + addr = reg_attr.addr; mutex_lock(&dev->kvm->lock); @@ -261,24 +324,11 @@ static int vgic_attr_regs_access(struct kvm_device *dev, if (ret) goto out; - if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) { - ret = -EINVAL; + if (!lock_all_vcpus(dev->kvm)) { + ret = -EBUSY; goto out; } - /* - * Any time a vcpu is run, vcpu_load is called which tries to grab the - * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure - * that no other VCPUs are run and fiddle with the vgic state while we - * access it. 
- */ - ret = -EBUSY; - kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) { - if (!mutex_trylock(&tmp_vcpu->mutex)) - goto out; - vcpu_lock_idx = c; - } - switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, reg); @@ -291,12 +341,8 @@ static int vgic_attr_regs_access(struct kvm_device *dev, break; } + unlock_all_vcpus(dev->kvm); out: - for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { - tmp_vcpu = kvm_get_vcpu(dev->kvm, vcpu_lock_idx); - mutex_unlock(&tmp_vcpu->mutex); - } - mutex_unlock(&dev->kvm->lock); return ret; } -- cgit v0.10.2 From 1fe0009833892f950d6955b8c77c5b00e7700802 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 16 Aug 2016 16:52:14 +0200 Subject: KVM: arm/arm64: Rename vgic_attr_regs_access to vgic_attr_regs_access_v2 Just a rename so we can implement a v3-specific function later. We take the chance to get rid of the V2/V3 ops comments as well. No functional change. Signed-off-by: Christoffer Dall Reviewed-by: Eric Auger diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c index 19fa331..163b057 100644 --- a/virt/kvm/arm/vgic/vgic-kvm-device.c +++ b/virt/kvm/arm/vgic/vgic-kvm-device.c @@ -294,17 +294,17 @@ static bool lock_all_vcpus(struct kvm *kvm) return true; } -/** vgic_attr_regs_access: allows user space to read/write VGIC registers - * - * @dev: kvm device handle - * @attr: kvm device attribute - * @reg: address the value is read or written - * @is_write: write flag +/** + * vgic_attr_regs_access_v2 - allows user space to access VGIC v2 state * + * @dev: kvm device handle + * @attr: kvm device attribute + * @reg: address the value is read or written + * @is_write: true if userspace is writing a register */ -static int vgic_attr_regs_access(struct kvm_device *dev, - struct kvm_device_attr *attr, - u32 *reg, bool is_write) +static int vgic_attr_regs_access_v2(struct kvm_device *dev, + struct kvm_device_attr *attr, + u32 *reg, bool is_write) { struct vgic_reg_attr reg_attr; gpa_t addr; @@ -347,8 +347,6 @@ out: return ret; } -/* V2 ops */ - static int vgic_v2_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { @@ -367,7 +365,7 @@ static int vgic_v2_set_attr(struct kvm_device *dev, if (get_user(reg, uaddr)) return -EFAULT; - return vgic_attr_regs_access(dev, attr, ®, true); + return vgic_attr_regs_access_v2(dev, attr, ®, true); } } @@ -389,7 +387,7 @@ static int vgic_v2_get_attr(struct kvm_device *dev, u32 __user *uaddr = (u32 __user *)(long)attr->addr; u32 reg = 0; - ret = vgic_attr_regs_access(dev, attr, ®, false); + ret = vgic_attr_regs_access_v2(dev, attr, ®, false); if (ret) return ret; return put_user(reg, uaddr); @@ -433,8 +431,6 @@ struct kvm_device_ops kvm_arm_vgic_v2_ops = { .has_attr = vgic_v2_has_attr, }; -/* V3 ops */ - #ifdef CONFIG_KVM_ARM_VGIC_V3 static int vgic_v3_set_attr(struct kvm_device *dev, -- cgit v0.10.2 From 68381b2b00f728a6c910e777bd62986c951323a3 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Tue, 30 Aug 2016 21:08:32 -0500 Subject: arm64: KVM: Optimize __guest_enter/exit() to save a few instructions We are doing an unnecessary stack push/pop operation when restoring the guest registers x0-x18 in __guest_enter(). This patch saves the two instructions by using x18 as a base register. No need to store the vcpu context pointer in stack because it is redundant, the same information is available in tpidr_el2. The function __guest_exit() calling convention is slightly modified, caller only pushes the regs x0-x1 to stack instead of regs x0-x3. 
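As a rough C analogue of the new register-restore scheme (illustrative only; the real code is arm64 assembly), the trick is to keep using the context pointer as the base for every load and only clobber its register with the guest's own value at the very end, instead of round-tripping values through the stack:

#include <stdio.h>

struct guest_ctx {
	unsigned long xreg[31];		/* x0..x30 */
};

static void restore_guest_gprs(unsigned long *regs, const struct guest_ctx *ctx)
{
	int i;

	for (i = 0; i < 18; i++)	/* x0-x17: plain loads via the base */
		regs[i] = ctx->xreg[i];
	for (i = 19; i < 31; i++)	/* x19-x29, lr */
		regs[i] = ctx->xreg[i];
	regs[18] = ctx->xreg[18];	/* base no longer needed: reuse slot 18 last */
}

int main(void)
{
	struct guest_ctx ctx = { .xreg = { [18] = 0xdead } };
	unsigned long regs[31] = { 0 };

	restore_guest_gprs(regs, &ctx);
	printf("x18 = %#lx\n", regs[18]);
	return 0;
}
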
Signed-off-by: Shanker Donthineni Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index ce9e5e5..3967c231 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -55,79 +55,78 @@ */ ENTRY(__guest_enter) // x0: vcpu - // x1: host/guest context - // x2-x18: clobbered by macros + // x1: host context + // x2-x17: clobbered by macros + // x18: guest context // Store the host regs save_callee_saved_regs x1 - // Preserve vcpu & host_ctxt for use at exit time - stp x0, x1, [sp, #-16]! + // Store the host_ctxt for use at exit time + str x1, [sp, #-16]! - add x1, x0, #VCPU_CONTEXT + add x18, x0, #VCPU_CONTEXT - // Prepare x0-x1 for later restore by pushing them onto the stack - ldp x2, x3, [x1, #CPU_XREG_OFFSET(0)] - stp x2, x3, [sp, #-16]! + // Restore guest regs x0-x17 + ldp x0, x1, [x18, #CPU_XREG_OFFSET(0)] + ldp x2, x3, [x18, #CPU_XREG_OFFSET(2)] + ldp x4, x5, [x18, #CPU_XREG_OFFSET(4)] + ldp x6, x7, [x18, #CPU_XREG_OFFSET(6)] + ldp x8, x9, [x18, #CPU_XREG_OFFSET(8)] + ldp x10, x11, [x18, #CPU_XREG_OFFSET(10)] + ldp x12, x13, [x18, #CPU_XREG_OFFSET(12)] + ldp x14, x15, [x18, #CPU_XREG_OFFSET(14)] + ldp x16, x17, [x18, #CPU_XREG_OFFSET(16)] - // x2-x18 - ldp x2, x3, [x1, #CPU_XREG_OFFSET(2)] - ldp x4, x5, [x1, #CPU_XREG_OFFSET(4)] - ldp x6, x7, [x1, #CPU_XREG_OFFSET(6)] - ldp x8, x9, [x1, #CPU_XREG_OFFSET(8)] - ldp x10, x11, [x1, #CPU_XREG_OFFSET(10)] - ldp x12, x13, [x1, #CPU_XREG_OFFSET(12)] - ldp x14, x15, [x1, #CPU_XREG_OFFSET(14)] - ldp x16, x17, [x1, #CPU_XREG_OFFSET(16)] - ldr x18, [x1, #CPU_XREG_OFFSET(18)] - - // x19-x29, lr - restore_callee_saved_regs x1 - - // Last bits of the 64bit state - ldp x0, x1, [sp], #16 + // Restore guest regs x19-x29, lr + restore_callee_saved_regs x18 + + // Restore guest reg x18 + ldr x18, [x18, #CPU_XREG_OFFSET(18)] // Do not touch any register after this! 
eret ENDPROC(__guest_enter) ENTRY(__guest_exit) - // x0: vcpu - // x1: return code - // x2-x3: free - // x4-x29,lr: vcpu regs - // vcpu x0-x3 on the stack - - add x2, x0, #VCPU_CONTEXT - - stp x4, x5, [x2, #CPU_XREG_OFFSET(4)] - stp x6, x7, [x2, #CPU_XREG_OFFSET(6)] - stp x8, x9, [x2, #CPU_XREG_OFFSET(8)] - stp x10, x11, [x2, #CPU_XREG_OFFSET(10)] - stp x12, x13, [x2, #CPU_XREG_OFFSET(12)] - stp x14, x15, [x2, #CPU_XREG_OFFSET(14)] - stp x16, x17, [x2, #CPU_XREG_OFFSET(16)] - str x18, [x2, #CPU_XREG_OFFSET(18)] - - ldp x6, x7, [sp], #16 // x2, x3 - ldp x4, x5, [sp], #16 // x0, x1 - - stp x4, x5, [x2, #CPU_XREG_OFFSET(0)] - stp x6, x7, [x2, #CPU_XREG_OFFSET(2)] + // x0: return code + // x1: vcpu + // x2-x29,lr: vcpu regs + // vcpu x0-x1 on the stack + + add x1, x1, #VCPU_CONTEXT + + // Store the guest regs x2 and x3 + stp x2, x3, [x1, #CPU_XREG_OFFSET(2)] + + // Retrieve the guest regs x0-x1 from the stack + ldp x2, x3, [sp], #16 // x0, x1 + + // Store the guest regs x0-x1 and x4-x18 + stp x2, x3, [x1, #CPU_XREG_OFFSET(0)] + stp x4, x5, [x1, #CPU_XREG_OFFSET(4)] + stp x6, x7, [x1, #CPU_XREG_OFFSET(6)] + stp x8, x9, [x1, #CPU_XREG_OFFSET(8)] + stp x10, x11, [x1, #CPU_XREG_OFFSET(10)] + stp x12, x13, [x1, #CPU_XREG_OFFSET(12)] + stp x14, x15, [x1, #CPU_XREG_OFFSET(14)] + stp x16, x17, [x1, #CPU_XREG_OFFSET(16)] + str x18, [x1, #CPU_XREG_OFFSET(18)] + + // Store the guest regs x19-x29, lr + save_callee_saved_regs x1 - save_callee_saved_regs x2 + // Restore the host_ctxt from the stack + ldr x2, [sp], #16 - // Restore vcpu & host_ctxt from the stack - // (preserving return code in x1) - ldp x0, x2, [sp], #16 // Now restore the host regs restore_callee_saved_regs x2 - mov x0, x1 ret ENDPROC(__guest_exit) ENTRY(__fpsimd_guest_restore) + stp x2, x3, [sp, #-16]! stp x4, lr, [sp, #-16]! alternative_if_not ARM64_HAS_VIRT_HOST_EXTN diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index f6d9694..d6cae54 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -27,16 +27,6 @@ .text .pushsection .hyp.text, "ax" -.macro save_x0_to_x3 - stp x0, x1, [sp, #-16]! - stp x2, x3, [sp, #-16]! -.endm - -.macro restore_x0_to_x3 - ldp x2, x3, [sp], #16 - ldp x0, x1, [sp], #16 -.endm - .macro do_el2_call /* * Shuffle the parameters before calling the function @@ -79,23 +69,23 @@ ENTRY(__kvm_hyp_teardown) ENDPROC(__kvm_hyp_teardown) el1_sync: // Guest trapped into EL2 - save_x0_to_x3 + stp x0, x1, [sp, #-16]! alternative_if_not ARM64_HAS_VIRT_HOST_EXTN mrs x1, esr_el2 alternative_else mrs x1, esr_el1 alternative_endif - lsr x2, x1, #ESR_ELx_EC_SHIFT + lsr x0, x1, #ESR_ELx_EC_SHIFT - cmp x2, #ESR_ELx_EC_HVC64 + cmp x0, #ESR_ELx_EC_HVC64 b.ne el1_trap - mrs x3, vttbr_el2 // If vttbr is valid, the 64bit guest - cbnz x3, el1_trap // called HVC + mrs x1, vttbr_el2 // If vttbr is valid, the 64bit guest + cbnz x1, el1_trap // called HVC /* Here, we're pretty sure the host called HVC. */ - restore_x0_to_x3 + ldp x0, x1, [sp], #16 cmp x0, #HVC_GET_VECTORS b.ne 1f @@ -113,22 +103,21 @@ alternative_endif el1_trap: /* - * x1: ESR - * x2: ESR_EC + * x0: ESR_EC */ /* Guest accessed VFP/SIMD registers, save host, restore Guest */ - cmp x2, #ESR_ELx_EC_FP_ASIMD + cmp x0, #ESR_ELx_EC_FP_ASIMD b.eq __fpsimd_guest_restore - mrs x0, tpidr_el2 - mov x1, #ARM_EXCEPTION_TRAP + mrs x1, tpidr_el2 + mov x0, #ARM_EXCEPTION_TRAP b __guest_exit el1_irq: - save_x0_to_x3 - mrs x0, tpidr_el2 - mov x1, #ARM_EXCEPTION_IRQ + stp x0, x1, [sp, #-16]! 
+ mrs x1, tpidr_el2 + mov x0, #ARM_EXCEPTION_IRQ b __guest_exit ENTRY(__hyp_do_panic) -- cgit v0.10.2 From 55d7cad6a98bbd6f8a5a98b9943b6819d7dc1f22 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 30 Aug 2016 18:54:19 +0100 Subject: KVM: arm: vgic: Drop build compatibility hack for older kernel versions As kvm_set_routing_entry() was changing prototype between 4.7 and 4.8, an ugly hack was put in place in order to survive both building in -next and the merge window. Now that everything has been merged, let's dump the compatibility hack for good. Signed-off-by: Marc Zyngier Reviewed-by: Eric Auger Signed-off-by: Christoffer Dall diff --git a/virt/kvm/arm/vgic/vgic-irqfd.c b/virt/kvm/arm/vgic/vgic-irqfd.c index b31a51a..d918dcf 100644 --- a/virt/kvm/arm/vgic/vgic-irqfd.c +++ b/virt/kvm/arm/vgic/vgic-irqfd.c @@ -46,15 +46,9 @@ static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e, * @ue: user api routing entry handle * return 0 on success, -EINVAL on errors. */ -#ifdef KVM_CAP_X2APIC_API int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) -#else -/* Remove this version and the ifdefery once merged into 4.8 */ -int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, - const struct kvm_irq_routing_entry *ue) -#endif { int r = -EINVAL; -- cgit v0.10.2 From dcadda146f4fd25a732382747f306465d337cda6 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 30 Aug 2016 17:05:55 +0100 Subject: arm/kvm: excise redundant cache maintenance When modifying Stage-2 page tables, we perform cache maintenance to account for non-coherent page table walks. However, this is unnecessary, as page table walks are guaranteed to be coherent in the presence of the virtualization extensions. Per ARM DDI 0406C.c, section B1.7 ("The Virtualization Extensions"), the virtualization extensions mandate the multiprocessing extensions. Per ARM DDI 0406C.c, section B3.10.1 ("General TLB maintenance requirements"), as described in the sub-section titled "TLB maintenance operations and the memory order model", this maintenance is not required in the presence of the multiprocessing extensions. Hence, we need not perform this cache maintenance when modifying Stage-2 entries. This patch removes the logic for performing the redundant maintenance. To ensure visibility and ordering of updates, a dsb(ishst) that was otherwise implicit in the maintenance is folded into kvm_set_pmd() and kvm_set_pte(). Signed-off-by: Mark Rutland Cc: Christoffer Dall Cc: Marc Zyngier Cc: kvmarm@lists.cs.columbia.edu Signed-off-by: Christoffer Dall diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 3bb803d..74a44727 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -63,37 +63,13 @@ void kvm_clear_hyp_idmap(void); static inline void kvm_set_pmd(pmd_t *pmd, pmd_t new_pmd) { *pmd = new_pmd; - flush_pmd_entry(pmd); + dsb(ishst); } static inline void kvm_set_pte(pte_t *pte, pte_t new_pte) { *pte = new_pte; - /* - * flush_pmd_entry just takes a void pointer and cleans the necessary - * cache entries, so we can reuse the function for ptes. 
- */ - flush_pmd_entry(pte); -} - -static inline void kvm_clean_pgd(pgd_t *pgd) -{ - clean_dcache_area(pgd, PTRS_PER_S2_PGD * sizeof(pgd_t)); -} - -static inline void kvm_clean_pmd(pmd_t *pmd) -{ - clean_dcache_area(pmd, PTRS_PER_PMD * sizeof(pmd_t)); -} - -static inline void kvm_clean_pmd_entry(pmd_t *pmd) -{ - clean_pmd_entry(pmd); -} - -static inline void kvm_clean_pte(pte_t *pte) -{ - clean_pte_table(pte); + dsb(ishst); } static inline pte_t kvm_s2pte_mkwrite(pte_t pte) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 29d0b23..344755d 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -744,7 +744,6 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm) if (!pgd) return -ENOMEM; - kvm_clean_pgd(pgd); kvm->arch.pgd = pgd; return 0; } @@ -936,7 +935,6 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, if (!cache) return 0; /* ignore calls from kvm_set_spte_hva */ pte = mmu_memory_cache_alloc(cache); - kvm_clean_pte(pte); pmd_populate_kernel(NULL, pmd, pte); get_page(virt_to_page(pmd)); } -- cgit v0.10.2 From 777c155772f9a234d72b9319325d8b6abd0c0e36 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 30 Aug 2016 17:05:56 +0100 Subject: arm64/kvm: remove unused stub functions Now that 32-bit KVM no longer performs cache maintenance for page table updates, we no longer need empty stubs for arm64. Remove them. Signed-off-by: Mark Rutland Cc: Christoffer Dall Cc: Marc Zyngier Cc: kvmarm@lists.cs.columbia.edu Signed-off-by: Christoffer Dall diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index b6bb834..8f99ab6 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -166,12 +166,6 @@ void kvm_clear_hyp_idmap(void); #define kvm_set_pte(ptep, pte) set_pte(ptep, pte) #define kvm_set_pmd(pmdp, pmd) set_pmd(pmdp, pmd) -static inline void kvm_clean_pgd(pgd_t *pgd) {} -static inline void kvm_clean_pmd(pmd_t *pmd) {} -static inline void kvm_clean_pmd_entry(pmd_t *pmd) {} -static inline void kvm_clean_pte(pte_t *pte) {} -static inline void kvm_clean_pte_entry(pte_t *pte) {} - static inline pte_t kvm_s2pte_mkwrite(pte_t pte) { pte_val(pte) |= PTE_S2_RDWR; -- cgit v0.10.2 From cf0ba18a441410aeaf3ef97eead3a3fa5381f46f Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 1 Sep 2016 13:16:03 +0200 Subject: KVM: arm/arm64: Get rid of exported aliases to static functions When rewriting the assembly code to C code, it was useful to have exported aliases or static functions so that we could keep the existing common C code unmodified and at the same time rewrite arm64 from assembly to C code, and later do the arm part. Now when both are done, we really don't need this level of indirection anymore, and it's time to save a few lines and brain cells. 
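For readers unfamiliar with the indirection being deleted, a standalone sketch of the alias pattern versus a directly named function (hypothetical names; the kernel's __alias() macro expands to the GCC alias attribute):

/* Before: a static implementation plus an exported name created via the
 * GCC "alias" attribute. */
static int do_run(void)
{
	return 0;
}
extern int public_run(void) __attribute__((alias("do_run")));

/* After: the function simply carries its public name directly. */
int public_run_direct(void)
{
	return 0;
}

int main(void)
{
	return public_run() + public_run_direct();
}
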
Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c index b13caa9..37b3365 100644 --- a/arch/arm/kvm/hyp/switch.c +++ b/arch/arm/kvm/hyp/switch.c @@ -134,7 +134,7 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu) return true; } -static int __hyp_text __guest_run(struct kvm_vcpu *vcpu) +int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *host_ctxt; struct kvm_cpu_context *guest_ctxt; @@ -191,8 +191,6 @@ again: return exit_code; } -__alias(__guest_run) int __kvm_vcpu_run(struct kvm_vcpu *vcpu); - static const char * const __hyp_panic_string[] = { [ARM_EXCEPTION_RESET] = "\nHYP panic: RST PC:%08x CPSR:%08x", [ARM_EXCEPTION_UNDEFINED] = "\nHYP panic: UNDEF PC:%08x CPSR:%08x", diff --git a/arch/arm/kvm/hyp/tlb.c b/arch/arm/kvm/hyp/tlb.c index a263600..7296528 100644 --- a/arch/arm/kvm/hyp/tlb.c +++ b/arch/arm/kvm/hyp/tlb.c @@ -34,7 +34,7 @@ * As v7 does not support flushing per IPA, just nuke the whole TLB * instead, ignoring the ipa value. */ -static void __hyp_text __tlb_flush_vmid(struct kvm *kvm) +void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm) { dsb(ishst); @@ -50,21 +50,14 @@ static void __hyp_text __tlb_flush_vmid(struct kvm *kvm) write_sysreg(0, VTTBR); } -__alias(__tlb_flush_vmid) void __kvm_tlb_flush_vmid(struct kvm *kvm); - -static void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) +void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) { - __tlb_flush_vmid(kvm); + __kvm_tlb_flush_vmid(kvm); } -__alias(__tlb_flush_vmid_ipa) void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, - phys_addr_t ipa); - -static void __hyp_text __tlb_flush_vm_context(void) +void __hyp_text __kvm_flush_vm_context(void) { write_sysreg(0, TLBIALLNSNHIS); write_sysreg(0, ICIALLUIS); dsb(ish); } - -__alias(__tlb_flush_vm_context) void __kvm_flush_vm_context(void); diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c index 33342a7..4ba5c90 100644 --- a/arch/arm64/kvm/hyp/debug-sr.c +++ b/arch/arm64/kvm/hyp/debug-sr.c @@ -131,9 +131,7 @@ void __hyp_text __debug_cond_restore_host_state(struct kvm_vcpu *vcpu) vcpu->arch.debug_flags &= ~KVM_ARM64_DEBUG_DIRTY; } -static u32 __hyp_text __debug_read_mdcr_el2(void) +u32 __hyp_text __kvm_get_mdcr_el2(void) { return read_sysreg(mdcr_el2); } - -__alias(__debug_read_mdcr_el2) u32 __kvm_get_mdcr_el2(void); diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 5a84b45..35d2e09 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -232,7 +232,7 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu) return true; } -static int __hyp_text __guest_run(struct kvm_vcpu *vcpu) +int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *host_ctxt; struct kvm_cpu_context *guest_ctxt; @@ -293,8 +293,6 @@ again: return exit_code; } -__alias(__guest_run) int __kvm_vcpu_run(struct kvm_vcpu *vcpu); - static const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n"; static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par) diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c index be8177c..9cc0ea7 100644 --- a/arch/arm64/kvm/hyp/tlb.c +++ b/arch/arm64/kvm/hyp/tlb.c @@ -17,7 +17,7 @@ #include -static void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) +void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, 
phys_addr_t ipa) { dsb(ishst); @@ -48,10 +48,7 @@ static void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) write_sysreg(0, vttbr_el2); } -__alias(__tlb_flush_vmid_ipa) void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, - phys_addr_t ipa); - -static void __hyp_text __tlb_flush_vmid(struct kvm *kvm) +void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm) { dsb(ishst); @@ -67,14 +64,10 @@ static void __hyp_text __tlb_flush_vmid(struct kvm *kvm) write_sysreg(0, vttbr_el2); } -__alias(__tlb_flush_vmid) void __kvm_tlb_flush_vmid(struct kvm *kvm); - -static void __hyp_text __tlb_flush_vm_context(void) +void __hyp_text __kvm_flush_vm_context(void) { dsb(ishst); asm volatile("tlbi alle1is \n" "ic ialluis ": : ); dsb(ish); } - -__alias(__tlb_flush_vm_context) void __kvm_flush_vm_context(void); diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 5f8f80b..ee1ea63 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -335,9 +335,7 @@ void __hyp_text __vgic_v3_init_lrs(void) __gic_v3_set_lr(0, i); } -static u64 __hyp_text __vgic_v3_read_ich_vtr_el2(void) +u64 __hyp_text __vgic_v3_get_ich_vtr_el2(void) { return read_gicreg(ICH_VTR_EL2); } - -__alias(__vgic_v3_read_ich_vtr_el2) u64 __vgic_v3_get_ich_vtr_el2(void); -- cgit v0.10.2 From cb96408da4e11698674abd04aeac941c1bed2038 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Thu, 1 Sep 2016 15:29:03 +0100 Subject: arm64: KVM: VHE: reset PSTATE.PAN on entry to EL2 SCTLR_EL2.SPAN bit controls what happens with the PSTATE.PAN bit on an exception. However, this bit has no effect on the PSTATE.PAN when HCR_EL2.E2H or HCR_EL2.TGE is unset. Thus when VHE is used and exception taken from a guest PSTATE.PAN bit left unchanged and we continue with a value guest has set. To address that always reset PSTATE.PAN on entry from EL1. Fixes: 1f364c8c48a0 ("arm64: VHE: Add support for running Linux in EL2 mode") Signed-off-by: Vladimir Murzin Reviewed-by: James Morse Acked-by: Marc Zyngier Cc: # v4.6+ Signed-off-by: Christoffer Dall diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 3967c231..b5926ee 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -96,6 +96,8 @@ ENTRY(__guest_exit) add x1, x1, #VCPU_CONTEXT + ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) + // Store the guest regs x2 and x3 stp x2, x3, [x1, #CPU_XREG_OFFSET(2)] -- cgit v0.10.2 From 3e51d43516b99a5ede461381b4d031998f8dbdf3 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 09:28:41 +0100 Subject: arm64: KVM: Move kvm_vcpu_get_condition out of emulate.c In order to make emulate.c more generic, move the arch-specific manupulation bits out of emulate.c. 
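A self-contained sketch of what the moved helper computes (the CV/COND field positions below are written from the ARMv8 ESR_ELx layout but should be treated as illustrative rather than authoritative):

#include <stdio.h>

#define ESR_CV_BIT	(1u << 24)	/* condition-valid bit */
#define ESR_COND_SHIFT	20
#define ESR_COND_MASK	(0xfu << ESR_COND_SHIFT)

/* Mirrors the new static inline: return the 4-bit condition code if the
 * syndrome says it is valid, otherwise -1 (Thumb IT state must be used). */
static int get_condition(unsigned int esr)
{
	if (esr & ESR_CV_BIT)
		return (esr & ESR_COND_MASK) >> ESR_COND_SHIFT;
	return -1;
}

int main(void)
{
	unsigned int esr = ESR_CV_BIT | (0xeu << ESR_COND_SHIFT);

	printf("cond = %d\n", get_condition(esr));	/* 14 == AL (always) */
	return 0;
}
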
Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 4cdeae3..b336434d 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -147,6 +147,16 @@ static inline u32 kvm_vcpu_get_hsr(const struct kvm_vcpu *vcpu) return vcpu->arch.fault.esr_el2; } +static inline int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu) +{ + u32 esr = kvm_vcpu_get_hsr(vcpu); + + if (esr & ESR_ELx_CV) + return (esr & ESR_ELx_COND_MASK) >> ESR_ELx_COND_SHIFT; + + return -1; +} + static inline unsigned long kvm_vcpu_get_hfar(const struct kvm_vcpu *vcpu) { return vcpu->arch.fault.far_el2; diff --git a/arch/arm64/kvm/emulate.c b/arch/arm64/kvm/emulate.c index f87d8fb..40098ad 100644 --- a/arch/arm64/kvm/emulate.c +++ b/arch/arm64/kvm/emulate.c @@ -22,7 +22,6 @@ */ #include -#include #include /* @@ -52,16 +51,6 @@ static const unsigned short cc_map[16] = { 0 /* NV */ }; -static int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu) -{ - u32 esr = kvm_vcpu_get_hsr(vcpu); - - if (esr & ESR_ELx_CV) - return (esr & ESR_ELx_COND_MASK) >> ESR_ELx_COND_SHIFT; - - return -1; -} - /* * Check if a trapped instruction should have been executed or not. */ -- cgit v0.10.2 From 427d7cacf97220844aa39146e11365655bbff8bd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 09:28:42 +0100 Subject: arm64: KVM: Move the AArch32 conditional execution to common code It would make some sense to share the conditional execution code between 32 and 64bit. In order to achieve this, let's move that code to virt/kvm/arm/aarch32.c. While we're at it, drop a superfluous BUG_ON() that wasn't that useful. Following patches will migrate the 32bit port to that code base. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c index af93e3f..eda9ddd 100644 --- a/arch/arm/kvm/emulate.c +++ b/arch/arm/kvm/emulate.c @@ -221,9 +221,7 @@ static void kvm_adjust_itstate(struct kvm_vcpu *vcpu) unsigned long cpsr = *vcpu_cpsr(vcpu); bool is_arm = !(cpsr & PSR_T_BIT); - BUG_ON(is_arm && (cpsr & PSR_IT_MASK)); - - if (!(cpsr & PSR_IT_MASK)) + if (is_arm || !(cpsr & PSR_IT_MASK)) return; cond = (cpsr & 0xe000) >> 13; diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 695eb3c..d50a82a 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -16,9 +16,10 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/e kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/arm.o $(ARM)/mmu.o $(ARM)/mmio.o kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o -kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o +kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o +kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/aarch32.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-init.o diff --git a/arch/arm64/kvm/emulate.c b/arch/arm64/kvm/emulate.c deleted file mode 100644 index 40098ad..0000000 --- a/arch/arm64/kvm/emulate.c +++ /dev/null @@ -1,148 +0,0 @@ -/* - * (not much of an) Emulation layer for 32bit guests. 
- * - * Copyright (C) 2012,2013 - ARM Ltd - * Author: Marc Zyngier - * - * based on arch/arm/kvm/emulate.c - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#include -#include - -/* - * stolen from arch/arm/kernel/opcodes.c - * - * condition code lookup table - * index into the table is test code: EQ, NE, ... LT, GT, AL, NV - * - * bit position in short is condition code: NZCV - */ -static const unsigned short cc_map[16] = { - 0xF0F0, /* EQ == Z set */ - 0x0F0F, /* NE */ - 0xCCCC, /* CS == C set */ - 0x3333, /* CC */ - 0xFF00, /* MI == N set */ - 0x00FF, /* PL */ - 0xAAAA, /* VS == V set */ - 0x5555, /* VC */ - 0x0C0C, /* HI == C set && Z clear */ - 0xF3F3, /* LS == C clear || Z set */ - 0xAA55, /* GE == (N==V) */ - 0x55AA, /* LT == (N!=V) */ - 0x0A05, /* GT == (!Z && (N==V)) */ - 0xF5FA, /* LE == (Z || (N!=V)) */ - 0xFFFF, /* AL always */ - 0 /* NV */ -}; - -/* - * Check if a trapped instruction should have been executed or not. - */ -bool kvm_condition_valid32(const struct kvm_vcpu *vcpu) -{ - unsigned long cpsr; - u32 cpsr_cond; - int cond; - - /* Top two bits non-zero? Unconditional. */ - if (kvm_vcpu_get_hsr(vcpu) >> 30) - return true; - - /* Is condition field valid? */ - cond = kvm_vcpu_get_condition(vcpu); - if (cond == 0xE) - return true; - - cpsr = *vcpu_cpsr(vcpu); - - if (cond < 0) { - /* This can happen in Thumb mode: examine IT state. */ - unsigned long it; - - it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3); - - /* it == 0 => unconditional. */ - if (it == 0) - return true; - - /* The cond for this insn works out as the top 4 bits. */ - cond = (it >> 4); - } - - cpsr_cond = cpsr >> 28; - - if (!((cc_map[cond] >> cpsr_cond) & 1)) - return false; - - return true; -} - -/** - * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block - * @vcpu: The VCPU pointer - * - * When exceptions occur while instructions are executed in Thumb IF-THEN - * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have - * to do this little bit of work manually. 
The fields map like this: - * - * IT[7:0] -> CPSR[26:25],CPSR[15:10] - */ -static void kvm_adjust_itstate(struct kvm_vcpu *vcpu) -{ - unsigned long itbits, cond; - unsigned long cpsr = *vcpu_cpsr(vcpu); - bool is_arm = !(cpsr & COMPAT_PSR_T_BIT); - - BUG_ON(is_arm && (cpsr & COMPAT_PSR_IT_MASK)); - - if (!(cpsr & COMPAT_PSR_IT_MASK)) - return; - - cond = (cpsr & 0xe000) >> 13; - itbits = (cpsr & 0x1c00) >> (10 - 2); - itbits |= (cpsr & (0x3 << 25)) >> 25; - - /* Perform ITAdvance (see page A2-52 in ARM DDI 0406C) */ - if ((itbits & 0x7) == 0) - itbits = cond = 0; - else - itbits = (itbits << 1) & 0x1f; - - cpsr &= ~COMPAT_PSR_IT_MASK; - cpsr |= cond << 13; - cpsr |= (itbits & 0x1c) << (10 - 2); - cpsr |= (itbits & 0x3) << 25; - *vcpu_cpsr(vcpu) = cpsr; -} - -/** - * kvm_skip_instr - skip a trapped instruction and proceed to the next - * @vcpu: The vcpu pointer - */ -void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr) -{ - bool is_thumb; - - is_thumb = !!(*vcpu_cpsr(vcpu) & COMPAT_PSR_T_BIT); - if (is_thumb && !is_wide_instr) - *vcpu_pc(vcpu) += 2; - else - *vcpu_pc(vcpu) += 4; - kvm_adjust_itstate(vcpu); -} diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c new file mode 100644 index 0000000..cb02e56 --- /dev/null +++ b/virt/kvm/arm/aarch32.c @@ -0,0 +1,146 @@ +/* + * (not much of an) Emulation layer for 32bit guests. + * + * Copyright (C) 2012,2013 - ARM Ltd + * Author: Marc Zyngier + * + * based on arch/arm/kvm/emulate.c + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include + +/* + * stolen from arch/arm/kernel/opcodes.c + * + * condition code lookup table + * index into the table is test code: EQ, NE, ... LT, GT, AL, NV + * + * bit position in short is condition code: NZCV + */ +static const unsigned short cc_map[16] = { + 0xF0F0, /* EQ == Z set */ + 0x0F0F, /* NE */ + 0xCCCC, /* CS == C set */ + 0x3333, /* CC */ + 0xFF00, /* MI == N set */ + 0x00FF, /* PL */ + 0xAAAA, /* VS == V set */ + 0x5555, /* VC */ + 0x0C0C, /* HI == C set && Z clear */ + 0xF3F3, /* LS == C clear || Z set */ + 0xAA55, /* GE == (N==V) */ + 0x55AA, /* LT == (N!=V) */ + 0x0A05, /* GT == (!Z && (N==V)) */ + 0xF5FA, /* LE == (Z || (N!=V)) */ + 0xFFFF, /* AL always */ + 0 /* NV */ +}; + +/* + * Check if a trapped instruction should have been executed or not. + */ +bool kvm_condition_valid32(const struct kvm_vcpu *vcpu) +{ + unsigned long cpsr; + u32 cpsr_cond; + int cond; + + /* Top two bits non-zero? Unconditional. */ + if (kvm_vcpu_get_hsr(vcpu) >> 30) + return true; + + /* Is condition field valid? */ + cond = kvm_vcpu_get_condition(vcpu); + if (cond == 0xE) + return true; + + cpsr = *vcpu_cpsr(vcpu); + + if (cond < 0) { + /* This can happen in Thumb mode: examine IT state. */ + unsigned long it; + + it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3); + + /* it == 0 => unconditional. 
*/ + if (it == 0) + return true; + + /* The cond for this insn works out as the top 4 bits. */ + cond = (it >> 4); + } + + cpsr_cond = cpsr >> 28; + + if (!((cc_map[cond] >> cpsr_cond) & 1)) + return false; + + return true; +} + +/** + * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block + * @vcpu: The VCPU pointer + * + * When exceptions occur while instructions are executed in Thumb IF-THEN + * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have + * to do this little bit of work manually. The fields map like this: + * + * IT[7:0] -> CPSR[26:25],CPSR[15:10] + */ +static void kvm_adjust_itstate(struct kvm_vcpu *vcpu) +{ + unsigned long itbits, cond; + unsigned long cpsr = *vcpu_cpsr(vcpu); + bool is_arm = !(cpsr & COMPAT_PSR_T_BIT); + + if (is_arm || !(cpsr & COMPAT_PSR_IT_MASK)) + return; + + cond = (cpsr & 0xe000) >> 13; + itbits = (cpsr & 0x1c00) >> (10 - 2); + itbits |= (cpsr & (0x3 << 25)) >> 25; + + /* Perform ITAdvance (see page A2-52 in ARM DDI 0406C) */ + if ((itbits & 0x7) == 0) + itbits = cond = 0; + else + itbits = (itbits << 1) & 0x1f; + + cpsr &= ~COMPAT_PSR_IT_MASK; + cpsr |= cond << 13; + cpsr |= (itbits & 0x1c) << (10 - 2); + cpsr |= (itbits & 0x3) << 25; + *vcpu_cpsr(vcpu) = cpsr; +} + +/** + * kvm_skip_instr - skip a trapped instruction and proceed to the next + * @vcpu: The vcpu pointer + */ +void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr) +{ + bool is_thumb; + + is_thumb = !!(*vcpu_cpsr(vcpu) & COMPAT_PSR_T_BIT); + if (is_thumb && !is_wide_instr) + *vcpu_pc(vcpu) += 2; + else + *vcpu_pc(vcpu) += 4; + kvm_adjust_itstate(vcpu); +} -- cgit v0.10.2 From 3aedd5c49e63c0d31a53b00ab906d48f53abb68b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 09:28:43 +0100 Subject: arm: KVM: Use common AArch32 conditional execution code Add the bit of glue and const-ification that is required to use the code inherited from the arm64 port, and move over to it. 
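As an aside on the condition-code test performed by the shared kvm_condition_valid32() above: cc_map is indexed by the 4-bit condition field, and the bit selected by the current NZCV flags says whether the trapped instruction would have executed. The stand-alone user-space sketch below only illustrates that lookup; the helper name and test values are made up and are not part of the patch.

/*
 * Minimal sketch of the cc_map lookup used by kvm_condition_valid32().
 * cond is the 4-bit condition field, nzcv the top four CPSR bits
 * (N = 8, Z = 4, C = 2, V = 1).
 */
#include <stdio.h>

static const unsigned short cc_map[16] = {
	0xF0F0, 0x0F0F, 0xCCCC, 0x3333, 0xFF00, 0x00FF, 0xAAAA, 0x5555,
	0x0C0C, 0xF3F3, 0xAA55, 0x55AA, 0x0A05, 0xF5FA, 0xFFFF, 0x0000,
};

static int cond_passes(unsigned int cond, unsigned int nzcv)
{
	return (cc_map[cond & 0xf] >> (nzcv & 0xf)) & 1;
}

int main(void)
{
	printf("EQ, Z=1: %d\n", cond_passes(0x0, 0x4));	/* 1: EQ passes when Z is set */
	printf("EQ, Z=0: %d\n", cond_passes(0x0, 0x0));	/* 0: EQ fails when Z is clear */
	printf("GT, Z=1: %d\n", cond_passes(0xC, 0x4));	/* 0: GT needs Z clear and N == V */
	return 0;
}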
Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index ee5328f..448d63c 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -40,18 +40,28 @@ static inline void vcpu_set_reg(struct kvm_vcpu *vcpu, u8 reg_num, *vcpu_reg(vcpu, reg_num) = val; } -bool kvm_condition_valid(struct kvm_vcpu *vcpu); -void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr); +bool kvm_condition_valid32(const struct kvm_vcpu *vcpu); +void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr); void kvm_inject_undefined(struct kvm_vcpu *vcpu); void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr); void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr); +static inline bool kvm_condition_valid(const struct kvm_vcpu *vcpu) +{ + return kvm_condition_valid32(vcpu); +} + +static inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr) +{ + kvm_skip_instr32(vcpu, is_wide_instr); +} + static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) { vcpu->arch.hcr = HCR_GUEST_MASK; } -static inline unsigned long vcpu_get_hcr(struct kvm_vcpu *vcpu) +static inline unsigned long vcpu_get_hcr(const struct kvm_vcpu *vcpu) { return vcpu->arch.hcr; } @@ -61,7 +71,7 @@ static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr) vcpu->arch.hcr = hcr; } -static inline bool vcpu_mode_is_32bit(struct kvm_vcpu *vcpu) +static inline bool vcpu_mode_is_32bit(const struct kvm_vcpu *vcpu) { return 1; } @@ -71,9 +81,9 @@ static inline unsigned long *vcpu_pc(struct kvm_vcpu *vcpu) return &vcpu->arch.ctxt.gp_regs.usr_regs.ARM_pc; } -static inline unsigned long *vcpu_cpsr(struct kvm_vcpu *vcpu) +static inline unsigned long *vcpu_cpsr(const struct kvm_vcpu *vcpu) { - return &vcpu->arch.ctxt.gp_regs.usr_regs.ARM_cpsr; + return (unsigned long *)&vcpu->arch.ctxt.gp_regs.usr_regs.ARM_cpsr; } static inline void vcpu_set_thumb(struct kvm_vcpu *vcpu) @@ -93,11 +103,21 @@ static inline bool vcpu_mode_priv(struct kvm_vcpu *vcpu) return cpsr_mode > USR_MODE;; } -static inline u32 kvm_vcpu_get_hsr(struct kvm_vcpu *vcpu) +static inline u32 kvm_vcpu_get_hsr(const struct kvm_vcpu *vcpu) { return vcpu->arch.fault.hsr; } +static inline int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu) +{ + u32 hsr = kvm_vcpu_get_hsr(vcpu); + + if (hsr & HSR_CV) + return (hsr & HSR_COND) >> HSR_COND_SHIFT; + + return -1; +} + static inline unsigned long kvm_vcpu_get_hfar(struct kvm_vcpu *vcpu) { return vcpu->arch.fault.hxfar; diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index 10d77a6..339ec88 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -21,6 +21,7 @@ obj-$(CONFIG_KVM_ARM_HOST) += hyp/ obj-y += kvm-arm.o init.o interrupts.o obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o +obj-y += $(KVM)/arm/aarch32.o obj-y += $(KVM)/arm/vgic/vgic.o obj-y += $(KVM)/arm/vgic/vgic-init.o diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c index eda9ddd..ff9acd1 100644 --- a/arch/arm/kvm/emulate.c +++ b/arch/arm/kvm/emulate.c @@ -161,103 +161,6 @@ unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu) } } -/* - * A conditional instruction is allowed to trap, even though it - * wouldn't be executed. So let's re-implement the hardware, in - * software! 
- */ -bool kvm_condition_valid(struct kvm_vcpu *vcpu) -{ - unsigned long cpsr, cond, insn; - - /* - * Exception Code 0 can only happen if we set HCR.TGE to 1, to - * catch undefined instructions, and then we won't get past - * the arm_exit_handlers test anyway. - */ - BUG_ON(!kvm_vcpu_trap_get_class(vcpu)); - - /* Top two bits non-zero? Unconditional. */ - if (kvm_vcpu_get_hsr(vcpu) >> 30) - return true; - - cpsr = *vcpu_cpsr(vcpu); - - /* Is condition field valid? */ - if ((kvm_vcpu_get_hsr(vcpu) & HSR_CV) >> HSR_CV_SHIFT) - cond = (kvm_vcpu_get_hsr(vcpu) & HSR_COND) >> HSR_COND_SHIFT; - else { - /* This can happen in Thumb mode: examine IT state. */ - unsigned long it; - - it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3); - - /* it == 0 => unconditional. */ - if (it == 0) - return true; - - /* The cond for this insn works out as the top 4 bits. */ - cond = (it >> 4); - } - - /* Shift makes it look like an ARM-mode instruction */ - insn = cond << 28; - return arm_check_condition(insn, cpsr) != ARM_OPCODE_CONDTEST_FAIL; -} - -/** - * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block - * @vcpu: The VCPU pointer - * - * When exceptions occur while instructions are executed in Thumb IF-THEN - * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have - * to do this little bit of work manually. The fields map like this: - * - * IT[7:0] -> CPSR[26:25],CPSR[15:10] - */ -static void kvm_adjust_itstate(struct kvm_vcpu *vcpu) -{ - unsigned long itbits, cond; - unsigned long cpsr = *vcpu_cpsr(vcpu); - bool is_arm = !(cpsr & PSR_T_BIT); - - if (is_arm || !(cpsr & PSR_IT_MASK)) - return; - - cond = (cpsr & 0xe000) >> 13; - itbits = (cpsr & 0x1c00) >> (10 - 2); - itbits |= (cpsr & (0x3 << 25)) >> 25; - - /* Perform ITAdvance (see page A-52 in ARM DDI 0406C) */ - if ((itbits & 0x7) == 0) - itbits = cond = 0; - else - itbits = (itbits << 1) & 0x1f; - - cpsr &= ~PSR_IT_MASK; - cpsr |= cond << 13; - cpsr |= (itbits & 0x1c) << (10 - 2); - cpsr |= (itbits & 0x3) << 25; - *vcpu_cpsr(vcpu) = cpsr; -} - -/** - * kvm_skip_instr - skip a trapped instruction and proceed to the next - * @vcpu: The vcpu pointer - */ -void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr) -{ - bool is_thumb; - - is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_T_BIT); - if (is_thumb && !is_wide_instr) - *vcpu_pc(vcpu) += 2; - else - *vcpu_pc(vcpu) += 4; - kvm_adjust_itstate(vcpu); -} - - /****************************************************************************** * Inject exceptions into the guest */ diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c index cb02e56..ba4417b 100644 --- a/virt/kvm/arm/aarch32.c +++ b/virt/kvm/arm/aarch32.c @@ -24,6 +24,11 @@ #include #include +#ifndef CONFIG_ARM64 +#define COMPAT_PSR_T_BIT PSR_T_BIT +#define COMPAT_PSR_IT_MASK PSR_IT_MASK +#endif + /* * stolen from arch/arm/kernel/opcodes.c * -- cgit v0.10.2 From 8cebe750c4d9acdfdce41ee0a83a58be973ebc5e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 09:28:44 +0100 Subject: arm64: KVM: Make kvm_skip_instr32 available to HYP As we plan to do some emulation at HYP, let's make kvm_skip_instr32 as part of the hyp_text section. This doesn't preclude the kernel from using it. 
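For context, "making a function part of the hyp_text section" simply means tagging it with a section attribute so that it ends up in the text that remains mapped while running at EL2. The snippet below is a rough paraphrase of how such a marker is typically defined; the real __hyp_text macro in kvm_hyp.h may differ in detail (for instance, it also disables tracing), and the names here are illustrative only.

/* Illustrative approximation of a ".hyp.text" placement marker */
#define __hyp_text_example __attribute__((__section__(".hyp.text")))

static void __hyp_text_example some_el2_helper(void)
{
	/* emitted into .hyp.text, so it stays reachable while running at EL2 */
}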
Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c index ba4417b..528af4b 100644 --- a/virt/kvm/arm/aarch32.c +++ b/virt/kvm/arm/aarch32.c @@ -23,6 +23,7 @@ #include #include +#include #ifndef CONFIG_ARM64 #define COMPAT_PSR_T_BIT PSR_T_BIT @@ -108,7 +109,7 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu) * * IT[7:0] -> CPSR[26:25],CPSR[15:10] */ -static void kvm_adjust_itstate(struct kvm_vcpu *vcpu) +static void __hyp_text kvm_adjust_itstate(struct kvm_vcpu *vcpu) { unsigned long itbits, cond; unsigned long cpsr = *vcpu_cpsr(vcpu); @@ -138,7 +139,7 @@ static void kvm_adjust_itstate(struct kvm_vcpu *vcpu) * kvm_skip_instr - skip a trapped instruction and proceed to the next * @vcpu: The vcpu pointer */ -void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr) +void __hyp_text kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr) { bool is_thumb; -- cgit v0.10.2 From fb5ee369ccd3986b28adc20d43d73a2b2c141977 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 09:28:45 +0100 Subject: arm64: KVM: vgic-v2: Add the GICV emulation infrastructure In order to efficiently perform the GICV access on behalf of the guest, we need to be able to avoid going back all the way to the host kernel. For this, we introduce a new hook in the world switch code, conveniently placed just after populating the fault info. At that point, we only have saved/restored the GP registers, and we can quickly perform all the required checks (data abort, translation fault, valid faulting syndrome, not an external abort, not a PTW). Coming back from the emulation code, we need to skip the emulated instruction. This involves an additional bit of save/restore in order to be able to access the guest's PC (and possibly CPSR if this is a 32bit guest). At this stage, no emulation code is provided. 
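To make the "skip the emulated instruction" step concrete: for a 32bit guest the PC advances by 2 bytes for a narrow (16-bit) Thumb encoding and by 4 bytes otherwise, which is what kvm_skip_instr32() above does before adjusting ITSTATE. The stand-alone sketch below illustrates only the PC-advance part; the names and addresses are made up.

#include <stdbool.h>
#include <stdio.h>

/* How far to step the guest PC past the trapped instruction */
static unsigned long skip_len(bool thumb, bool is_wide_instr)
{
	return (thumb && !is_wide_instr) ? 2 : 4;
}

int main(void)
{
	unsigned long pc = 0x1000;

	pc += skip_len(true, false);	/* narrow Thumb encoding */
	printf("0x%lx\n", pc);		/* 0x1002 */
	pc += skip_len(false, false);	/* ARM encoding */
	printf("0x%lx\n", pc);		/* 0x1006 */
	return 0;
}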
Signed-off-by: Marc Zyngier Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index cff5105..88ec3ac 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -123,6 +123,7 @@ typeof(orig) * __hyp_text fname(void) \ void __vgic_v2_save_state(struct kvm_vcpu *vcpu); void __vgic_v2_restore_state(struct kvm_vcpu *vcpu); +bool __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu); void __vgic_v3_save_state(struct kvm_vcpu *vcpu); void __vgic_v3_restore_state(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 35d2e09..b3a66c5 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -17,6 +17,7 @@ #include #include +#include #include static bool __hyp_text __fpsimd_enabled_nvhe(void) @@ -232,6 +233,21 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu) return true; } +static void __hyp_text __skip_instr(struct kvm_vcpu *vcpu) +{ + *vcpu_pc(vcpu) = read_sysreg_el2(elr); + + if (vcpu_mode_is_32bit(vcpu)) { + vcpu->arch.ctxt.gp_regs.regs.pstate = read_sysreg_el2(spsr); + kvm_skip_instr32(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + write_sysreg_el2(vcpu->arch.ctxt.gp_regs.regs.pstate, spsr); + } else { + *vcpu_pc(vcpu) += 4; + } + + write_sysreg_el2(*vcpu_pc(vcpu), elr); +} + int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *host_ctxt; @@ -270,6 +286,22 @@ again: if (exit_code == ARM_EXCEPTION_TRAP && !__populate_fault_info(vcpu)) goto again; + if (static_branch_unlikely(&vgic_v2_cpuif_trap) && + exit_code == ARM_EXCEPTION_TRAP) { + bool valid; + + valid = kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_DABT_LOW && + kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT && + kvm_vcpu_dabt_isvalid(vcpu) && + !kvm_vcpu_dabt_isextabt(vcpu) && + !kvm_vcpu_dabt_iss1tw(vcpu); + + if (valid && __vgic_v2_perform_cpuif_access(vcpu)) { + __skip_instr(vcpu); + goto again; + } + } + fp_enabled = __fpsimd_enabled(); __sysreg_save_guest_state(guest_ctxt); diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 19b698e..8eb1501 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -265,6 +266,8 @@ struct vgic_cpu { bool lpis_enabled; }; +extern struct static_key_false vgic_v2_cpuif_trap; + int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write); void kvm_vgic_early_init(struct kvm *kvm); int kvm_vgic_create(struct kvm *kvm, u32 type); diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c index 7cffd93..3e2a62e 100644 --- a/virt/kvm/arm/hyp/vgic-v2-sr.c +++ b/virt/kvm/arm/hyp/vgic-v2-sr.c @@ -167,3 +167,10 @@ void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu) writel_relaxed(cpu_if->vgic_vmcr, base + GICH_VMCR); vcpu->arch.vgic_cpu.live_lrs = live_lrs; } + +#ifdef CONFIG_ARM64 +bool __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) +{ + return false; +} +#endif diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index 0bf6709..b8da901 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -294,6 +294,8 @@ out: return ret; } +DEFINE_STATIC_KEY_FALSE(vgic_v2_cpuif_trap); + /** * vgic_v2_probe - probe for a GICv2 compatible interrupt controller in DT * @node: pointer to the DT node -- cgit v0.10.2 From bf8feb39642b4c71c644e2d534ce53029bd2219b Mon Sep 17 00:00:00 2001 From: Marc 
Zyngier Date: Tue, 6 Sep 2016 09:28:46 +0100 Subject: arm64: KVM: vgic-v2: Add GICV access from HYP Now that we have the necessary infrastructure to handle MMIO accesses in HYP, perform the GICV access on behalf of the guest. This requires checking that the access is strictly 32bit, properly aligned, and falls within the expected range. When all condition are satisfied, we perform the access and tell the rest of the HYP code that the instruction has been correctly emulated. Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 8eb1501..bb46c03 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -50,6 +50,9 @@ struct vgic_global { /* Physical address of vgic virtual cpu interface */ phys_addr_t vcpu_base; + /* GICV mapping */ + void __iomem *vcpu_base_va; + /* virtual control interface mapping */ void __iomem *vctrl_base; diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c index 3e2a62e..a052f20 100644 --- a/virt/kvm/arm/hyp/vgic-v2-sr.c +++ b/virt/kvm/arm/hyp/vgic-v2-sr.c @@ -19,6 +19,7 @@ #include #include +#include #include static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, @@ -171,6 +172,44 @@ void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu) #ifdef CONFIG_ARM64 bool __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) { + struct kvm *kvm = kern_hyp_va(vcpu->kvm); + struct vgic_dist *vgic = &kvm->arch.vgic; + phys_addr_t fault_ipa; + void __iomem *addr; + int rd; + + /* Build the full address */ + fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); + fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); + + /* If not for GICV, move on */ + if (fault_ipa < vgic->vgic_cpu_base || + fault_ipa >= (vgic->vgic_cpu_base + KVM_VGIC_V2_CPU_SIZE)) return false; + + /* Reject anything but a 32bit access */ + if (kvm_vcpu_dabt_get_as(vcpu) != sizeof(u32)) + return false; + + /* Not aligned? Don't bother */ + if (fault_ipa & 3) + return false; + + rd = kvm_vcpu_dabt_get_rd(vcpu); + addr = kern_hyp_va((kern_hyp_va(&kvm_vgic_global_state))->vcpu_base_va); + addr += fault_ipa - vgic->vgic_cpu_base; + + if (kvm_vcpu_dabt_iswrite(vcpu)) { + u32 data = vcpu_data_guest_to_host(vcpu, + vcpu_get_reg(vcpu, rd), + sizeof(u32)); + writel_relaxed(data, addr); + } else { + u32 data = readl_relaxed(addr); + vcpu_set_reg(vcpu, rd, vcpu_data_host_to_guest(vcpu, data, + sizeof(u32))); + } + + return true; } #endif -- cgit v0.10.2 From a07d3b07a876d44827e62f3ef55837ebbf61c4e6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 09:28:47 +0100 Subject: arm64: KVM: vgic-v2: Enable GICV access from HYP if access from guest is unsafe So far, we've been disabling KVM on systems where the GICV region couldn't be safely given to a guest. Now that we're able to handle this access safely by emulating it in HYP, we can enable this feature when we detect an unsafe configuration. 
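The "unsafe configuration" being detected is a GICV region whose base address or size is not page aligned (the classic case being the 8KB GICv2 CPU interface on a host using 64KB pages), as the PAGE_ALIGNED() checks in the hunk below show. A trivial, hedged illustration of that predicate follows; the constants and addresses are made up.

#include <stdio.h>

#define EXAMPLE_PAGE_SIZE 0x10000UL	/* pretend the host uses 64KB pages */

static int gicv_needs_trapping(unsigned long base, unsigned long size)
{
	return (base & (EXAMPLE_PAGE_SIZE - 1)) || (size & (EXAMPLE_PAGE_SIZE - 1));
}

int main(void)
{
	/* An 8KB GICV window cannot be mapped with 64KB pages: trap it */
	printf("%d\n", gicv_needs_trapping(0x2c020000, 0x2000));	/* 1 */
	/* A 64KB-aligned, 64KB-sized window can be mapped straight through */
	printf("%d\n", gicv_needs_trapping(0x2c020000, 0x10000));	/* 0 */
	return 0;
}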
Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index b8da901..0a063af 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -278,12 +278,14 @@ int vgic_v2_map_resources(struct kvm *kvm) goto out; } - ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base, - kvm_vgic_global_state.vcpu_base, - KVM_VGIC_V2_CPU_SIZE, true); - if (ret) { - kvm_err("Unable to remap VGIC CPU to VCPU\n"); - goto out; + if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) { + ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base, + kvm_vgic_global_state.vcpu_base, + KVM_VGIC_V2_CPU_SIZE, true); + if (ret) { + kvm_err("Unable to remap VGIC CPU to VCPU\n"); + goto out; + } } dist->ready = true; @@ -312,45 +314,51 @@ int vgic_v2_probe(const struct gic_kvm_info *info) return -ENXIO; } - if (!PAGE_ALIGNED(info->vcpu.start)) { - kvm_err("GICV physical address 0x%llx not page aligned\n", - (unsigned long long)info->vcpu.start); - return -ENXIO; - } + if (!PAGE_ALIGNED(info->vcpu.start) || + !PAGE_ALIGNED(resource_size(&info->vcpu))) { + kvm_info("GICV region size/alignment is unsafe, using trapping (reduced performance)\n"); + kvm_vgic_global_state.vcpu_base_va = ioremap(info->vcpu.start, + resource_size(&info->vcpu)); + if (!kvm_vgic_global_state.vcpu_base_va) { + kvm_err("Cannot ioremap GICV\n"); + return -ENOMEM; + } - if (!PAGE_ALIGNED(resource_size(&info->vcpu))) { - kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n", - (unsigned long long)resource_size(&info->vcpu), - PAGE_SIZE); - return -ENXIO; + ret = create_hyp_io_mappings(kvm_vgic_global_state.vcpu_base_va, + kvm_vgic_global_state.vcpu_base_va + resource_size(&info->vcpu), + info->vcpu.start); + if (ret) { + kvm_err("Cannot map GICV into hyp\n"); + goto out; + } + + static_branch_enable(&vgic_v2_cpuif_trap); } kvm_vgic_global_state.vctrl_base = ioremap(info->vctrl.start, resource_size(&info->vctrl)); if (!kvm_vgic_global_state.vctrl_base) { kvm_err("Cannot ioremap GICH\n"); - return -ENOMEM; + ret = -ENOMEM; + goto out; } vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR); kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1; - ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); - if (ret) { - kvm_err("Cannot register GICv2 KVM device\n"); - iounmap(kvm_vgic_global_state.vctrl_base); - return ret; - } - ret = create_hyp_io_mappings(kvm_vgic_global_state.vctrl_base, kvm_vgic_global_state.vctrl_base + resource_size(&info->vctrl), info->vctrl.start); if (ret) { kvm_err("Cannot map VCTRL into hyp\n"); - kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2); - iounmap(kvm_vgic_global_state.vctrl_base); - return ret; + goto out; + } + + ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); + if (ret) { + kvm_err("Cannot register GICv2 KVM device\n"); + goto out; } kvm_vgic_global_state.can_emulate_gicv2 = true; @@ -361,4 +369,11 @@ int vgic_v2_probe(const struct gic_kvm_info *info) kvm_info("vgic-v2@%llx\n", info->vctrl.start); return 0; +out: + if (kvm_vgic_global_state.vctrl_base) + iounmap(kvm_vgic_global_state.vctrl_base); + if (kvm_vgic_global_state.vcpu_base_va) + iounmap(kvm_vgic_global_state.vcpu_base_va); + + return ret; } -- cgit v0.10.2 From 7b17145ec7de9267e24038c0621c9ed47853ae4f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 14:01:59 +0100 Subject: arm64: KVM: Rename HCR_VA to HCR_VSE HCR_VA is a leftover from ARMv7, On ARMv8, this is HCR_VSE (which stands for Virtual System 
Error), and has better defined semantics. Let's rename the constant. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 4b5c977..2a2752b 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -50,7 +50,7 @@ #define HCR_BSU (3 << 10) #define HCR_BSU_IS (UL(1) << 10) #define HCR_FB (UL(1) << 9) -#define HCR_VA (UL(1) << 8) +#define HCR_VSE (UL(1) << 8) #define HCR_VI (UL(1) << 7) #define HCR_VF (UL(1) << 6) #define HCR_AMO (UL(1) << 5) @@ -80,7 +80,7 @@ #define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \ HCR_TVM | HCR_BSU_IS | HCR_FB | HCR_TAC | \ HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW) -#define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF) +#define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF) #define HCR_INT_OVERRIDE (HCR_FMO | HCR_IMO) #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H) -- cgit v0.10.2 From 44636f976f5bb92631f0dd5fd26547d64c2c6a80 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 14:02:00 +0100 Subject: arm64: KVM: Preserve pending vSError in world switch The HCR_EL2.VSE bit is used to signal an SError to a guest, and has the peculiar feature of getting cleared when the guest has taken the abort (this is the only bit that behaves as such in this register). This means that if we signal such an abort, we must leave it in the guest context until it disappears from HCR_EL2, and at which point it must be cleared from the context. This is achieved by reading back from HCR_EL2 until the guest takes the fault. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index b3a66c5..8246de2 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -110,6 +110,15 @@ static hyp_alternate_select(__deactivate_traps_arch, static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu) { + /* + * If we pended a virtual abort, preserve it until it gets + * cleared. See D1.14.3 (Virtual Interrupts) for details, but + * the crucial bit is "On taking a vSError interrupt, + * HCR_EL2.VSE is cleared to 0." + */ + if (vcpu->arch.hcr_el2 & HCR_VSE) + vcpu->arch.hcr_el2 = read_sysreg(hcr_el2); + __deactivate_traps_arch()(); write_sysreg(0, hstr_el2); write_sysreg(read_sysreg(mdcr_el2) & MDCR_EL2_HPMN_MASK, mdcr_el2); -- cgit v0.10.2 From 10cf33900fb27a5b5a107e9c37fa249e0deb5a96 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 14:02:01 +0100 Subject: arm64: KVM: Add Virtual Abort injection helper Now that we're able to context switch the HCR_EL2.VA bit, let's introduce a helper that injects an Abort into a vcpu. 
Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index b336434d..fd9d5fd 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -38,6 +38,7 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu); void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr); void kvm_inject_undefined(struct kvm_vcpu *vcpu); +void kvm_inject_vabt(struct kvm_vcpu *vcpu); void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr); void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr); diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index 898c0e6..da6a8cf 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -231,3 +231,15 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu) else inject_undef64(vcpu); } + +/** + * kvm_inject_vabt - inject an async abort / SError into the guest + * @vcpu: The VCPU to receive the exception + * + * It is assumed that this code is called from the VCPU thread and that the + * VCPU therefore is not currently executing guest code. + */ +void kvm_inject_vabt(struct kvm_vcpu *vcpu) +{ + vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) | HCR_VSE); +} -- cgit v0.10.2 From 9aecafc86ca2f4f4d7f5cbaf674f5df244221c51 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 14:02:02 +0100 Subject: arm64: KVM: Add exception code to report EL1 asynchronous aborts So far, we don't have a code to indicate that we've taken an asynchronous abort from EL1. Let's add one. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 7561f63..d177e7e 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -21,9 +21,10 @@ #include #define ARM_EXCEPTION_IRQ 0 -#define ARM_EXCEPTION_TRAP 1 +#define ARM_EXCEPTION_EL1_SERROR 1 +#define ARM_EXCEPTION_TRAP 2 /* The hyp-stub will return this for any kvm_call_hyp() call */ -#define ARM_EXCEPTION_HYP_GONE 2 +#define ARM_EXCEPTION_HYP_GONE 3 #define KVM_ARM64_DEBUG_DIRTY_SHIFT 0 #define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT) -- cgit v0.10.2 From 0215a6e6dd4fdbf4f7dd6e9db116aec7818ff388 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 14:02:03 +0100 Subject: arm64: KVM: Add EL1 async abort handler If we've exited the guest because it has triggered an asynchronous abort from EL1, a possible course of action is to let it know it screwed up by giving it a Virtual Abort to chew on. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index fa96fe2..08afc69a 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -173,6 +173,9 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, switch (exception_index) { case ARM_EXCEPTION_IRQ: return 1; + case ARM_EXCEPTION_EL1_SERROR: + kvm_inject_vabt(vcpu); + return 1; case ARM_EXCEPTION_TRAP: /* * See ARM ARM B1.14.1: "Hyp traps on instructions -- cgit v0.10.2 From 1b51e5fac63f35b644302f2d39ec9af44887b598 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 14:02:04 +0100 Subject: arm64: KVM: Route asynchronous aborts As we now have some basic handling to EL1-triggered aborts, we can actually report them to KVM. 
Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index d6cae54..d2f6640 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -120,6 +120,12 @@ el1_irq: mov x0, #ARM_EXCEPTION_IRQ b __guest_exit +el1_error: + stp x0, x1, [sp, #-16]! + mrs x1, tpidr_el2 + mov x0, #ARM_EXCEPTION_EL1_SERROR + b __guest_exit + ENTRY(__hyp_do_panic) mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ PSR_MODE_EL1h) @@ -148,7 +154,6 @@ ENDPROC(\label) invalid_vector el1_sync_invalid invalid_vector el1_irq_invalid invalid_vector el1_fiq_invalid - invalid_vector el1_error_invalid .ltorg @@ -168,10 +173,10 @@ ENTRY(__kvm_hyp_vector) ventry el1_sync // Synchronous 64-bit EL1 ventry el1_irq // IRQ 64-bit EL1 ventry el1_fiq_invalid // FIQ 64-bit EL1 - ventry el1_error_invalid // Error 64-bit EL1 + ventry el1_error // Error 64-bit EL1 ventry el1_sync // Synchronous 32-bit EL1 ventry el1_irq // IRQ 32-bit EL1 ventry el1_fiq_invalid // FIQ 32-bit EL1 - ventry el1_error_invalid // Error 32-bit EL1 + ventry el1_error // Error 32-bit EL1 ENDPROC(__kvm_hyp_vector) -- cgit v0.10.2 From 20163403a1f2fe3f00f59df2b45ea5cb74b85d97 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 14:02:05 +0100 Subject: arm64: KVM: Allow an exit code to be tagged with an SError Similarily to EL1, an asynchronous abort can be triggered whilst running at EL2. But instead of making that a new error code, we need to communicate it to the rest of KVM together with the exit reason. So let's hijack a single bit that allows the exception code to be tagged with a "pending SError" information. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index d177e7e..18f7465 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -20,6 +20,10 @@ #include +#define ARM_EXIT_WITH_SERROR_BIT 31 +#define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT)) +#define ARM_SERROR_PENDING(x) !!((x) & (1U << ARM_EXIT_WITH_SERROR_BIT)) + #define ARM_EXCEPTION_IRQ 0 #define ARM_EXCEPTION_EL1_SERROR 1 #define ARM_EXCEPTION_TRAP 2 -- cgit v0.10.2 From ddb3d07cfe90ce58c342cf97ce6ce53ba7d10973 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 14:02:06 +0100 Subject: arm64: KVM: Inject a Virtual SError if it was pending If we have caught an SError whilst exiting, we've tagged the exit code with the pending information. In that case, let's re-inject the error into the guest, after having adjusted the PC if required. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 08afc69a..a204adf 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -170,6 +170,26 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, { exit_handle_fn exit_handler; + if (ARM_SERROR_PENDING(exception_index)) { + u8 hsr_ec = ESR_ELx_EC(kvm_vcpu_get_hsr(vcpu)); + + /* + * HVC/SMC already have an adjusted PC, which we need + * to correct in order to return to after having + * injected the SError. + */ + if (hsr_ec == ESR_ELx_EC_HVC32 || hsr_ec == ESR_ELx_EC_HVC64 || + hsr_ec == ESR_ELx_EC_SMC32 || hsr_ec == ESR_ELx_EC_SMC64) { + u32 adj = kvm_vcpu_trap_il_is32bit(vcpu) ? 
4 : 2; + *vcpu_pc(vcpu) -= adj; + } + + kvm_inject_vabt(vcpu); + return 1; + } + + exception_index = ARM_EXCEPTION_CODE(exception_index); + switch (exception_index) { case ARM_EXCEPTION_IRQ: return 1; -- cgit v0.10.2 From 395ea79ebe55d6b01bb8f67bfad0550e6b7cd6d6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 14:02:07 +0100 Subject: arm64: KVM: Handle async aborts delivered while at EL2 If EL1 generates an asynchronous abort and then traps into EL2 before the abort has been delivered, we may end-up with the abort firing at the worse possible place: on the host. In order to avoid this, it is necessary to take the abort at EL2, by clearing the PSTATE.A bit. In order to survive this abort, we do it at a point where we're in a known state with respect to the world switch, and handle the resulting exception, overloading the exit code in the process. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index b5926ee..12ee62d 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -124,7 +124,38 @@ ENTRY(__guest_exit) // Now restore the host regs restore_callee_saved_regs x2 - ret + // If we have a pending asynchronous abort, now is the + // time to find out. From your VAXorcist book, page 666: + // "Threaten me not, oh Evil one! For I speak with + // the power of DEC, and I command thee to show thyself!" + mrs x2, elr_el2 + mrs x3, esr_el2 + mrs x4, spsr_el2 + mov x5, x0 + + dsb sy // Synchronize against in-flight ld/st + msr daifclr, #4 // Unmask aborts + + // This is our single instruction exception window. A pending + // SError is guaranteed to occur at the earliest when we unmask + // it, and at the latest just after the ISB. + .global abort_guest_exit_start +abort_guest_exit_start: + + isb + + .global abort_guest_exit_end +abort_guest_exit_end: + + // If the exception took place, restore the EL1 exception + // context so that we can report some information. + // Merge the exception code with the SError pending bit. + tbz x0, #ARM_EXIT_WITH_SERROR_BIT, 1f + msr elr_el2, x2 + msr esr_el2, x3 + msr spsr_el2, x4 + orr x0, x0, x5 +1: ret ENDPROC(__guest_exit) ENTRY(__fpsimd_guest_restore) diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index d2f6640..4e92399 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -126,6 +126,28 @@ el1_error: mov x0, #ARM_EXCEPTION_EL1_SERROR b __guest_exit +el2_error: + /* + * Only two possibilities: + * 1) Either we come from the exit path, having just unmasked + * PSTATE.A: change the return code to an EL2 fault, and + * carry on, as we're already in a sane state to handle it. + * 2) Or we come from anywhere else, and that's a bug: we panic. + * + * For (1), x0 contains the original return code and x1 doesn't + * contain anything meaningful at that stage. We can reuse them + * as temp registers. + * For (2), who cares? 
+ */ + mrs x0, elr_el2 + adr x1, abort_guest_exit_start + cmp x0, x1 + adr x1, abort_guest_exit_end + ccmp x0, x1, #4, ne + b.ne __hyp_panic + mov x0, #(1 << ARM_EXIT_WITH_SERROR_BIT) + eret + ENTRY(__hyp_do_panic) mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ PSR_MODE_EL1h) @@ -150,7 +172,6 @@ ENDPROC(\label) invalid_vector el2h_sync_invalid invalid_vector el2h_irq_invalid invalid_vector el2h_fiq_invalid - invalid_vector el2h_error_invalid invalid_vector el1_sync_invalid invalid_vector el1_irq_invalid invalid_vector el1_fiq_invalid @@ -168,7 +189,7 @@ ENTRY(__kvm_hyp_vector) ventry el2h_sync_invalid // Synchronous EL2h ventry el2h_irq_invalid // IRQ EL2h ventry el2h_fiq_invalid // FIQ EL2h - ventry el2h_error_invalid // Error EL2h + ventry el2_error // Error EL2h ventry el1_sync // Synchronous 64-bit EL1 ventry el1_irq // IRQ 64-bit EL1 diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 8246de2..8b81cc6 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -292,6 +292,12 @@ again: exit_code = __guest_enter(vcpu, host_ctxt); /* And we're baaack! */ + /* + * We're using the raw exception code in order to only process + * the trap if no SError is pending. We will come back to the + * same PC once the SError has been injected, and replay the + * trapping instruction. + */ if (exit_code == ARM_EXCEPTION_TRAP && !__populate_fault_info(vcpu)) goto again; -- cgit v0.10.2 From 1f7e378d1235a3cb5c9cdd8c62fe13cb6c3cf157 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 14:02:08 +0100 Subject: arm: KVM: Preserve pending Virtual Abort in world switch The HCR.VA bit is used to signal an Abort to a guest, and has the peculiar feature of getting cleared when the guest has taken the abort (this is the only bit that behaves as such in this register). This means that if we signal such an abort, we must leave it in the guest context until it disappears from HCR, and at which point it must be cleared from the context. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c index 37b3365..9da16fd 100644 --- a/arch/arm/kvm/hyp/switch.c +++ b/arch/arm/kvm/hyp/switch.c @@ -54,6 +54,15 @@ static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu) { u32 val; + /* + * If we pended a virtual abort, preserve it until it gets + * cleared. See B1.9.9 (Virtual Abort exception) for details, + * but the crucial bit is the zeroing of HCR.VA in the + * pseudocode. + */ + if (vcpu->arch.hcr & HCR_VA) + vcpu->arch.hcr = read_sysreg(HCR); + write_sysreg(0, HCR); write_sysreg(0, HSTR); val = read_sysreg(HDCR); -- cgit v0.10.2 From bfb78b5c98d0e6b6dbd9af9cd71e853670a34044 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 14:02:09 +0100 Subject: arm: KVM: Add Virtual Abort injection helper Now that we're able to context switch the HCR.VA bit, let's introduce a helper that injects an Abort into a vcpu. 
Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index 448d63c..9a8a45a 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -43,6 +43,7 @@ static inline void vcpu_set_reg(struct kvm_vcpu *vcpu, u8 reg_num, bool kvm_condition_valid32(const struct kvm_vcpu *vcpu); void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr); void kvm_inject_undefined(struct kvm_vcpu *vcpu); +void kvm_inject_vabt(struct kvm_vcpu *vcpu); void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr); void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr); diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c index ff9acd1..0064b86 100644 --- a/arch/arm/kvm/emulate.c +++ b/arch/arm/kvm/emulate.c @@ -303,3 +303,15 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr) { inject_abt(vcpu, true, addr); } + +/** + * kvm_inject_vabt - inject an async abort / SError into the guest + * @vcpu: The VCPU to receive the exception + * + * It is assumed that this code is called from the VCPU thread and that the + * VCPU therefore is not currently executing guest code. + */ +void kvm_inject_vabt(struct kvm_vcpu *vcpu) +{ + vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) | HCR_VA); +} -- cgit v0.10.2 From cc325cfc37b5b9e8814d7172e3a6e47d3b098ff1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 14:02:10 +0100 Subject: arm: KVM: Add HYP async abort handler If we've exited the guest because it has triggered an asynchronous abort, a possible course of action is to let it know it screwed up by giving it a Virtual Abort to chew on. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index 3f1ef0d..863fdf4 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -160,6 +160,9 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, exit_handler = kvm_get_exit_handler(vcpu); return exit_handler(vcpu, run); + case ARM_EXCEPTION_DATA_ABORT: + kvm_inject_vabt(vcpu); + return 1; default: kvm_pr_unimpl("Unsupported exception type: %d", exception_index); -- cgit v0.10.2 From 435bca5fe913e2e5f96881a77ab20e653f5ad894 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 14:02:11 +0100 Subject: arm: KVM: Allow an exit code to be tagged with a Virtual Abort An asynchronous abort can also be triggered whilst running at EL2. But instead of making that a new error code, we need to communicate it to the rest of KVM together with the exit reason. So let's hijack a single bit that allows the exception code to be tagged with a "pending Abort" information. 
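A quick worked example of that tagging scheme, reusing the macros added in the hunk below; the exception-code value used here is illustrative only.

#include <stdio.h>

/* Bit 31 carries "abort pending", the low bits keep the ordinary exit code */
#define ARM_EXIT_WITH_ABORT_BIT	31
#define ARM_EXCEPTION_CODE(x)	((x) & ~(1U << ARM_EXIT_WITH_ABORT_BIT))
#define ARM_ABORT_PENDING(x)	(!!((x) & (1U << ARM_EXIT_WITH_ABORT_BIT)))

int main(void)
{
	unsigned int code = 5;	/* made-up exception code */
	unsigned int exit_code = code | (1U << ARM_EXIT_WITH_ABORT_BIT);

	printf("abort pending:  %u\n", ARM_ABORT_PENDING(exit_code));	/* 1 */
	printf("exception code: %u\n", ARM_EXCEPTION_CODE(exit_code));	/* 5 */
	return 0;
}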
Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 58faff5..05e47fa 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -21,6 +21,10 @@ #include +#define ARM_EXIT_WITH_ABORT_BIT 31 +#define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_ABORT_BIT)) +#define ARM_ABORT_PENDING(x) !!((x) & (1U << ARM_EXIT_WITH_ABORT_BIT)) + #define ARM_EXCEPTION_RESET 0 #define ARM_EXCEPTION_UNDEFINED 1 #define ARM_EXCEPTION_SOFTWARE 2 -- cgit v0.10.2 From c39798f471259dacf56df6bfb2bd071dfe074486 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 14:02:12 +0100 Subject: arm: KVM: Handle async aborts delivered while at HYP Just like for arm64, we can handle asynchronous aborts being delivered at HYP while being caused by the guest. We use the exact same method to catch such an abort, and soldier on. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm/kvm/hyp/entry.S b/arch/arm/kvm/hyp/entry.S index 21c2388..60783f3 100644 --- a/arch/arm/kvm/hyp/entry.S +++ b/arch/arm/kvm/hyp/entry.S @@ -18,6 +18,7 @@ #include #include #include +#include .arch_extension virt @@ -63,6 +64,36 @@ ENTRY(__guest_exit) ldr lr, [r0, #4] mov r0, r1 + mrs r1, SPSR + mrs r2, ELR_hyp + mrc p15, 4, r3, c5, c2, 0 @ HSR + + /* + * Force loads and stores to complete before unmasking aborts + * and forcing the delivery of the exception. This gives us a + * single instruction window, which the handler will try to + * match. + */ + dsb sy + cpsie a + + .global abort_guest_exit_start +abort_guest_exit_start: + + isb + + .global abort_guest_exit_end +abort_guest_exit_end: + + /* + * If we took an abort, r0[31] will be set, and cmp will set + * the N bit in PSTATE. + */ + cmp r0, #0 + msrmi SPSR_cxsf, r1 + msrmi ELR_hyp, r2 + mcrmi p15, 4, r3, c5, c2, 0 @ HSR + bx lr ENDPROC(__guest_exit) diff --git a/arch/arm/kvm/hyp/hyp-entry.S b/arch/arm/kvm/hyp/hyp-entry.S index 7809138..96beb53 100644 --- a/arch/arm/kvm/hyp/hyp-entry.S +++ b/arch/arm/kvm/hyp/hyp-entry.S @@ -81,7 +81,6 @@ __kvm_hyp_vector: invalid_vector hyp_undef ARM_EXCEPTION_UNDEFINED invalid_vector hyp_svc ARM_EXCEPTION_SOFTWARE invalid_vector hyp_pabt ARM_EXCEPTION_PREF_ABORT - invalid_vector hyp_dabt ARM_EXCEPTION_DATA_ABORT invalid_vector hyp_fiq ARM_EXCEPTION_FIQ ENTRY(__hyp_do_panic) @@ -164,6 +163,21 @@ hyp_irq: load_vcpu r0 @ Load VCPU pointer to r0 b __guest_exit +hyp_dabt: + push {r0, r1} + mrs r0, ELR_hyp + ldr r1, =abort_guest_exit_start +THUMB( add r1, r1, #1) + cmp r0, r1 + ldrne r1, =abort_guest_exit_end +THUMB( addne r1, r1, #1) + cmpne r0, r1 + pop {r0, r1} + bne __hyp_panic + + orr r0, r0, #(1 << ARM_EXIT_WITH_ABORT_BIT) + eret + .ltorg .popsection -- cgit v0.10.2 From 5d7bcf7d644a88183fb34fca12ad6ea40a8db7ab Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 14:02:13 +0100 Subject: arm: KVM: Inject a Virtual Abort if it was pending If we have caught an Abort whilst exiting, we've tagged the exit code with the pending information. In that case, let's re-inject the error into the guest, after having adjusted the PC if required. 
Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index 863fdf4..4eacb5c 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -144,6 +144,25 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, { exit_handle_fn exit_handler; + if (ARM_ABORT_PENDING(exception_index)) { + u8 hsr_ec = kvm_vcpu_trap_get_class(vcpu); + + /* + * HVC/SMC already have an adjusted PC, which we need + * to correct in order to return to after having + * injected the abort. + */ + if (hsr_ec == HSR_EC_HVC || hsr_ec == HSR_EC_SMC) { + u32 adj = kvm_vcpu_trap_il_is32bit(vcpu) ? 4 : 2; + *vcpu_pc(vcpu) -= adj; + } + + kvm_inject_vabt(vcpu); + return 1; + } + + exception_index = ARM_EXCEPTION_CODE(exception_index); + switch (exception_index) { case ARM_EXCEPTION_IRQ: return 1; -- cgit v0.10.2 From e656a1f91e701e6a1eddf8c029b45639b60877d5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 14:02:14 +0100 Subject: arm: KVM: Drop unreachable HYP abort handlers Both data and prefetch aborts occuring in HYP lead to a well deserved panic. Let's get rid of these silly handlers. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index 4eacb5c..4e40d19 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -28,14 +28,6 @@ typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *); -static int handle_svc_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - /* SVC called from Hyp mode should never get here */ - kvm_debug("SVC called from Hyp mode shouldn't go here\n"); - BUG(); - return -EINVAL; /* Squash warning */ -} - static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) { int ret; @@ -59,22 +51,6 @@ static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) return 1; } -static int handle_pabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - /* The hypervisor should never cause aborts */ - kvm_err("Prefetch Abort taken from Hyp mode at %#08lx (HSR: %#08x)\n", - kvm_vcpu_get_hfar(vcpu), kvm_vcpu_get_hsr(vcpu)); - return -EFAULT; -} - -static int handle_dabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - /* This is either an error in the ws. code or an external abort */ - kvm_err("Data Abort taken from Hyp mode at %#08lx (HSR: %#08x)\n", - kvm_vcpu_get_hfar(vcpu), kvm_vcpu_get_hsr(vcpu)); - return -EFAULT; -} - /** * kvm_handle_wfx - handle a WFI or WFE instructions trapped in guests * @vcpu: the vcpu pointer @@ -112,13 +88,10 @@ static exit_handle_fn arm_exit_handlers[] = { [HSR_EC_CP14_64] = kvm_handle_cp14_access, [HSR_EC_CP_0_13] = kvm_handle_cp_0_13_access, [HSR_EC_CP10_ID] = kvm_handle_cp10_id, - [HSR_EC_SVC_HYP] = handle_svc_hyp, [HSR_EC_HVC] = handle_hvc, [HSR_EC_SMC] = handle_smc, [HSR_EC_IABT] = kvm_handle_guest_abort, - [HSR_EC_IABT_HYP] = handle_pabt_hyp, [HSR_EC_DABT] = kvm_handle_guest_abort, - [HSR_EC_DABT_HYP] = handle_dabt_hyp, }; static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu) -- cgit v0.10.2 From 4055710baca8cb067b059a635cf851eb86410a83 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 14:02:15 +0100 Subject: arm/arm64: KVM: Inject virtual abort when guest exits on external abort If we spot a data abort bearing the ESR_EL2.EA bit set, we know that this is an external abort, and that should be punished by the injection of an abort. 
Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 344755d..60e0c1a 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1432,6 +1432,11 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) int ret, idx; is_iabt = kvm_vcpu_trap_is_iabt(vcpu); + if (unlikely(!is_iabt && kvm_vcpu_dabt_isextabt(vcpu))) { + kvm_inject_vabt(vcpu); + return 1; + } + fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu), -- cgit v0.10.2 From 21977a4c57cd92c71113e21319302b5a399f9d2d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 14:02:16 +0100 Subject: arm/arm64: KVM: Remove external abort test from MMIO handling As we now handle external aborts pretty early, we can get rid of their handling in the MMIO code (which was a bit odd to begin with...). Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c index 10f80a6..b6e715f 100644 --- a/arch/arm/kvm/mmio.c +++ b/arch/arm/kvm/mmio.c @@ -126,12 +126,6 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len) int access_size; bool sign_extend; - if (kvm_vcpu_dabt_isextabt(vcpu)) { - /* cache operation on I/O addr, tell guest unsupported */ - kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); - return 1; - } - if (kvm_vcpu_dabt_iss1tw(vcpu)) { /* page table accesses IO mem: tell guest to fix its TTBR */ kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); return 1; } -- cgit v0.10.2 From 3272f0d08e4490b792b99cf6034a2bb859bf6c9f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Sep 2016 14:02:17 +0100 Subject: arm64: KVM: Inject a vSError if detecting a bad GICV access at EL2 If, when proxying a GICV access at EL2, we detect that the guest is doing something silly, report an EL1 SError instead of ignoring the access. 
Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 88ec3ac..b18e852 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -123,7 +123,7 @@ typeof(orig) * __hyp_text fname(void) \ void __vgic_v2_save_state(struct kvm_vcpu *vcpu); void __vgic_v2_restore_state(struct kvm_vcpu *vcpu); -bool __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu); +int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu); void __vgic_v3_save_state(struct kvm_vcpu *vcpu); void __vgic_v3_restore_state(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 8b81cc6..731519c 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -311,9 +311,21 @@ again: !kvm_vcpu_dabt_isextabt(vcpu) && !kvm_vcpu_dabt_iss1tw(vcpu); - if (valid && __vgic_v2_perform_cpuif_access(vcpu)) { - __skip_instr(vcpu); - goto again; + if (valid) { + int ret = __vgic_v2_perform_cpuif_access(vcpu); + + if (ret == 1) { + __skip_instr(vcpu); + goto again; + } + + if (ret == -1) { + /* Promote an illegal access to an SError */ + __skip_instr(vcpu); + exit_code = ARM_EXCEPTION_EL1_SERROR; + } + + /* 0 falls through to be handler out of EL2 */ } } diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c index a052f20..c8aeb7b 100644 --- a/virt/kvm/arm/hyp/vgic-v2-sr.c +++ b/virt/kvm/arm/hyp/vgic-v2-sr.c @@ -170,7 +170,18 @@ void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu) } #ifdef CONFIG_ARM64 -bool __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) +/* + * __vgic_v2_perform_cpuif_access -- perform a GICV access on behalf of the + * guest. + * + * @vcpu: the offending vcpu + * + * Returns: + * 1: GICV access successfully performed + * 0: Not a GICV access + * -1: Illegal GICV access + */ +int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) { struct kvm *kvm = kern_hyp_va(vcpu->kvm); struct vgic_dist *vgic = &kvm->arch.vgic; @@ -185,15 +196,15 @@ bool __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) /* If not for GICV, move on */ if (fault_ipa < vgic->vgic_cpu_base || fault_ipa >= (vgic->vgic_cpu_base + KVM_VGIC_V2_CPU_SIZE)) - return false; + return 0; /* Reject anything but a 32bit access */ if (kvm_vcpu_dabt_get_as(vcpu) != sizeof(u32)) - return false; + return -1; /* Not aligned? Don't bother */ if (fault_ipa & 3) - return false; + return -1; rd = kvm_vcpu_dabt_get_rd(vcpu); addr = kern_hyp_va((kern_hyp_va(&kvm_vgic_global_state))->vcpu_base_va); @@ -210,6 +221,6 @@ bool __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) sizeof(u32))); } - return true; + return 1; } #endif -- cgit v0.10.2 From 5d947a1447f98eede0778f6f59a8fe03f0c53caf Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 8 Sep 2016 12:45:59 +0200 Subject: KVM: ARM: cleanup kvm_timer_hyp_init Remove two unnecessary labels now that kvm_timer_hyp_init is not creating its own workqueue anymore. 
Signed-off-by: Paolo Bonzini Signed-off-by: Christoffer Dall diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 4309b60..27a1f63 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -445,7 +445,7 @@ int kvm_timer_hyp_init(void) if (err) { kvm_err("kvm_arch_timer: can't request interrupt %d (%d)\n", host_vtimer_irq, err); - goto out; + return err; } kvm_info("virtual timer IRQ%d\n", host_vtimer_irq); @@ -453,10 +453,6 @@ int kvm_timer_hyp_init(void) cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING, "AP_KVM_ARM_TIMER_STARTING", kvm_timer_starting_cpu, kvm_timer_dying_cpu); - goto out; -out_free: - free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus()); -out: return err; } -- cgit v0.10.2 From 5ea11f2b31b83bd3c05357e6bec20f598e00705a Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 23 Aug 2016 13:52:41 -0500 Subject: svm: Introduces AVIC per-VM ID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces per-VM AVIC ID and helper functions to manage the IDs. Currently, the ID will be used to implement 32-bit AVIC IOMMU GA tag. The ID is 24-bit one-based indexing value, and is managed via helper functions to get the next ID, or to free an ID once a VM is destroyed. There should be no ID conflict for any active VMs. Reviewed-by: Radim Krčmář Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 33ae3a4..d36176b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -781,6 +781,7 @@ struct kvm_arch { bool disabled_lapic_found; /* Struct members for AVIC */ + u32 avic_vm_id; u32 ldr_mode; struct page *avic_logical_id_table_page; struct page *avic_physical_id_table_page; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index af523d8..27e37d8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -96,6 +96,19 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 #define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF +/* AVIC GATAG is encoded using VM and VCPU IDs */ +#define AVIC_VCPU_ID_BITS 8 +#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1) + +#define AVIC_VM_ID_BITS 24 +#define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS) +#define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1) + +#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \ + (y & AVIC_VCPU_ID_MASK)) +#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK) +#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) + static bool erratum_383_found __read_mostly; static const u32 host_save_user_msrs[] = { @@ -242,6 +255,10 @@ static int avic; module_param(avic, int, S_IRUGO); #endif +/* AVIC VM ID bit masks and lock */ +static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR); +static DEFINE_SPINLOCK(avic_vm_id_lock); + static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); @@ -1280,10 +1297,40 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return 0; } +static inline int avic_get_next_vm_id(void) +{ + int id; + + spin_lock(&avic_vm_id_lock); + + /* AVIC VM ID is one-based. 
*/ + id = find_next_zero_bit(avic_vm_id_bitmap, AVIC_VM_ID_NR, 1); + if (id <= AVIC_VM_ID_MASK) + __set_bit(id, avic_vm_id_bitmap); + else + id = -EAGAIN; + + spin_unlock(&avic_vm_id_lock); + return id; +} + +static inline int avic_free_vm_id(int id) +{ + if (id <= 0 || id > AVIC_VM_ID_MASK) + return -EINVAL; + + spin_lock(&avic_vm_id_lock); + __clear_bit(id, avic_vm_id_bitmap); + spin_unlock(&avic_vm_id_lock); + return 0; +} + static void avic_vm_destroy(struct kvm *kvm) { struct kvm_arch *vm_data = &kvm->arch; + avic_free_vm_id(vm_data->avic_vm_id); + if (vm_data->avic_logical_id_table_page) __free_page(vm_data->avic_logical_id_table_page); if (vm_data->avic_physical_id_table_page) @@ -1300,6 +1347,10 @@ static int avic_vm_init(struct kvm *kvm) if (!avic) return 0; + vm_data->avic_vm_id = avic_get_next_vm_id(); + if (vm_data->avic_vm_id < 0) + return vm_data->avic_vm_id; + /* Allocating physical APIC ID table (4KB) */ p_page = alloc_page(GFP_KERNEL); if (!p_page) -- cgit v0.10.2 From 5881f73757cc3dbada878e67c119a801ed0f9a07 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 23 Aug 2016 13:52:42 -0500 Subject: svm: Introduce AMD IOMMU avic_ga_log_notifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces avic_ga_log_notifier, which will be called by IOMMU driver whenever it handles the Guest vAPIC (GA) log entry. Reviewed-by: Radim Krčmář Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d36176b..4c738c2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -785,6 +785,7 @@ struct kvm_arch { u32 ldr_mode; struct page *avic_logical_id_table_page; struct page *avic_physical_id_table_page; + struct hlist_node hnode; bool x2apic_format; bool x2apic_broadcast_quirk_disabled; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 27e37d8..0a9e43b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -34,6 +34,8 @@ #include #include #include +#include +#include #include #include @@ -945,6 +947,55 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } +/* Note: + * This hash table is used to map VM_ID to a struct kvm_arch, + * when handling AMD IOMMU GALOG notification to schedule in + * a particular vCPU. + */ +#define SVM_VM_DATA_HASH_BITS 8 +DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); +static spinlock_t svm_vm_data_hash_lock; + +/* Note: + * This function is called from IOMMU driver to notify + * SVM to schedule in a particular vCPU of a particular VM. + */ +static int avic_ga_log_notifier(u32 ga_tag) +{ + unsigned long flags; + struct kvm_arch *ka = NULL; + struct kvm_vcpu *vcpu = NULL; + u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag); + u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag); + + pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id); + + spin_lock_irqsave(&svm_vm_data_hash_lock, flags); + hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) { + struct kvm *kvm = container_of(ka, struct kvm, arch); + struct kvm_arch *vm_data = &kvm->arch; + + if (vm_data->avic_vm_id != vm_id) + continue; + vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); + break; + } + spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); + + if (!vcpu) + return 0; + + /* Note: + * At this point, the IOMMU should have already set the pending + * bit in the vAPIC backing page. So, we just need to schedule + * in the vcpu. 
+ */ + if (vcpu->mode == OUTSIDE_GUEST_MODE) + kvm_vcpu_wake_up(vcpu); + + return 0; +} + static __init int svm_hardware_setup(void) { int cpu; @@ -1003,10 +1054,15 @@ static __init int svm_hardware_setup(void) if (avic) { if (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC) || - !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) + !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) { avic = false; - else + } else { pr_info("AVIC enabled\n"); + + hash_init(svm_vm_data_hash); + spin_lock_init(&svm_vm_data_hash_lock); + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); + } } return 0; @@ -1327,6 +1383,7 @@ static inline int avic_free_vm_id(int id) static void avic_vm_destroy(struct kvm *kvm) { + unsigned long flags; struct kvm_arch *vm_data = &kvm->arch; avic_free_vm_id(vm_data->avic_vm_id); @@ -1335,10 +1392,15 @@ static void avic_vm_destroy(struct kvm *kvm) __free_page(vm_data->avic_logical_id_table_page); if (vm_data->avic_physical_id_table_page) __free_page(vm_data->avic_physical_id_table_page); + + spin_lock_irqsave(&svm_vm_data_hash_lock, flags); + hash_del(&vm_data->hnode); + spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); } static int avic_vm_init(struct kvm *kvm) { + unsigned long flags; int err = -ENOMEM; struct kvm_arch *vm_data = &kvm->arch; struct page *p_page; @@ -1367,6 +1429,10 @@ static int avic_vm_init(struct kvm *kvm) vm_data->avic_logical_id_table_page = l_page; clear_page(page_address(l_page)); + spin_lock_irqsave(&svm_vm_data_hash_lock, flags); + hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id); + spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); + return 0; free_avic: -- cgit v0.10.2 From 411b44ba80ab0023383fe3f377e903cb0cb7d8bb Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 23 Aug 2016 13:52:43 -0500 Subject: svm: Implements update_pi_irte hook to setup posted interrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch implements update_pi_irte function hook to allow SVM communicate to IOMMU driver regarding how to set up IRTE for handling posted interrupt. In case AVIC is enabled, during vcpu_load/unload, SVM needs to update IOMMU IRTE with appropriate host physical APIC ID. Also, when vcpu_blocking/unblocking, SVM needs to update the is-running bit in the IOMMU IRTE. Both are achieved via calling amd_iommu_update_ga(). However, if GA mode is not enabled for the pass-through device, IOMMU driver will simply just return when calling amd_iommu_update_ga. Signed-off-by: Suravee Suthikulpanit Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 0a9e43b..db77c1c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include "trace.h" @@ -200,6 +201,23 @@ struct vcpu_svm { struct page *avic_backing_page; u64 *avic_physical_id_cache; bool avic_is_running; + + /* + * Per-vcpu list of struct amd_svm_iommu_ir: + * This is used mainly to store interrupt remapping information used + * when update the vcpu affinity. This avoids the need to scan for + * IRTE and try to match ga_tag in the IOMMU driver. + */ + struct list_head ir_list; + spinlock_t ir_list_lock; +}; + +/* + * This is a wrapper of struct amd_iommu_ir_data. 
+ */ +struct amd_svm_iommu_ir { + struct list_head node; /* Used by SVM for per-vcpu ir_list */ + void *data; /* Storing pointer to struct amd_ir_data */ }; #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) @@ -1440,31 +1458,34 @@ free_avic: return err; } -/** - * This function is called during VCPU halt/unhalt. - */ -static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) +static inline int +avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) { - u64 entry; - int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu); + int ret = 0; + unsigned long flags; + struct amd_svm_iommu_ir *ir; struct vcpu_svm *svm = to_svm(vcpu); - if (!kvm_vcpu_apicv_active(vcpu)) - return; - - svm->avic_is_running = is_run; + if (!kvm_arch_has_assigned_device(vcpu->kvm)) + return 0; - /* ID = 0xff (broadcast), ID > 0xff (reserved) */ - if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT)) - return; + /* + * Here, we go through the per-vcpu ir_list to update all existing + * interrupt remapping table entry targeting this vcpu. + */ + spin_lock_irqsave(&svm->ir_list_lock, flags); - entry = READ_ONCE(*(svm->avic_physical_id_cache)); - WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)); + if (list_empty(&svm->ir_list)) + goto out; - entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - if (is_run) - entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - WRITE_ONCE(*(svm->avic_physical_id_cache), entry); + list_for_each_entry(ir, &svm->ir_list, node) { + ret = amd_iommu_update_ga(cpu, r, ir->data); + if (ret) + break; + } +out: + spin_unlock_irqrestore(&svm->ir_list_lock, flags); + return ret; } static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -1491,6 +1512,8 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); + avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, + svm->avic_is_running); } static void avic_vcpu_put(struct kvm_vcpu *vcpu) @@ -1502,10 +1525,27 @@ static void avic_vcpu_put(struct kvm_vcpu *vcpu) return; entry = READ_ONCE(*(svm->avic_physical_id_cache)); + if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) + avic_update_iommu_vcpu_affinity(vcpu, -1, 0); + entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); } +/** + * This function is called during VCPU halt/unhalt. 
+ */ +static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->avic_is_running = is_run; + if (is_run) + avic_vcpu_load(vcpu, vcpu->cpu); + else + avic_vcpu_put(vcpu); +} + static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1567,6 +1607,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) err = avic_init_backing_page(&svm->vcpu); if (err) goto free_page4; + + INIT_LIST_HEAD(&svm->ir_list); + spin_lock_init(&svm->ir_list_lock); } /* We initialize this flag to true to make sure that the is_running @@ -4363,6 +4406,209 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) kvm_vcpu_wake_up(vcpu); } +static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) +{ + unsigned long flags; + struct amd_svm_iommu_ir *cur; + + spin_lock_irqsave(&svm->ir_list_lock, flags); + list_for_each_entry(cur, &svm->ir_list, node) { + if (cur->data != pi->ir_data) + continue; + list_del(&cur->node); + kfree(cur); + break; + } + spin_unlock_irqrestore(&svm->ir_list_lock, flags); +} + +static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) +{ + int ret = 0; + unsigned long flags; + struct amd_svm_iommu_ir *ir; + + /** + * In some cases, the existing irte is updaed and re-set, + * so we need to check here if it's already been * added + * to the ir_list. + */ + if (pi->ir_data && (pi->prev_ga_tag != 0)) { + struct kvm *kvm = svm->vcpu.kvm; + u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); + struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); + struct vcpu_svm *prev_svm; + + if (!prev_vcpu) { + ret = -EINVAL; + goto out; + } + + prev_svm = to_svm(prev_vcpu); + svm_ir_list_del(prev_svm, pi); + } + + /** + * Allocating new amd_iommu_pi_data, which will get + * add to the per-vcpu ir_list. + */ + ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL); + if (!ir) { + ret = -ENOMEM; + goto out; + } + ir->data = pi->ir_data; + + spin_lock_irqsave(&svm->ir_list_lock, flags); + list_add(&ir->node, &svm->ir_list); + spin_unlock_irqrestore(&svm->ir_list_lock, flags); +out: + return ret; +} + +/** + * Note: + * The HW cannot support posting multicast/broadcast + * interrupts to a vCPU. So, we still use legacy interrupt + * remapping for these kind of interrupts. + * + * For lowest-priority interrupts, we only support + * those with single CPU as the destination, e.g. user + * configures the interrupts via /proc/irq or uses + * irqbalance to make the interrupts single-CPU. 
+ */ +static int +get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, + struct vcpu_data *vcpu_info, struct vcpu_svm **svm) +{ + struct kvm_lapic_irq irq; + struct kvm_vcpu *vcpu = NULL; + + kvm_set_msi_irq(kvm, e, &irq); + + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { + pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", + __func__, irq.vector); + return -1; + } + + pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, + irq.vector); + *svm = to_svm(vcpu); + vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page); + vcpu_info->vector = irq.vector; + + return 0; +} + +/* + * svm_update_pi_irte - set IRTE for Posted-Interrupts + * + * @kvm: kvm + * @host_irq: host irq of the interrupt + * @guest_irq: gsi of the interrupt + * @set: set or unset PI + * returns 0 on success, < 0 on failure + */ +static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + struct kvm_kernel_irq_routing_entry *e; + struct kvm_irq_routing_table *irq_rt; + int idx, ret = -EINVAL; + + if (!kvm_arch_has_assigned_device(kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return 0; + + pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", + __func__, host_irq, guest_irq, set); + + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + WARN_ON(guest_irq >= irq_rt->nr_rt_entries); + + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { + struct vcpu_data vcpu_info; + struct vcpu_svm *svm = NULL; + + if (e->type != KVM_IRQ_ROUTING_MSI) + continue; + + /** + * Here, we setup with legacy mode in the following cases: + * 1. When cannot target interrupt to a specific vcpu. + * 2. Unsetting posted interrupt. + * 3. APIC virtialization is disabled for the vcpu. + */ + if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && + kvm_vcpu_apicv_active(&svm->vcpu)) { + struct amd_iommu_pi_data pi; + + /* Try to enable guest_mode in IRTE */ + pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK; + pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id, + svm->vcpu.vcpu_id); + pi.is_guest_mode = true; + pi.vcpu_data = &vcpu_info; + ret = irq_set_vcpu_affinity(host_irq, &pi); + + /** + * Here, we successfully setting up vcpu affinity in + * IOMMU guest mode. Now, we need to store the posted + * interrupt information in a per-vcpu ir_list so that + * we can reference to them directly when we update vcpu + * scheduling information in IOMMU irte. + */ + if (!ret && pi.is_guest_mode) + svm_ir_list_add(svm, &pi); + } else { + /* Use legacy mode in IRTE */ + struct amd_iommu_pi_data pi; + + /** + * Here, pi is used to: + * - Tell IOMMU to use legacy mode for this interrupt. + * - Retrieve ga_tag of prior interrupt remapping data. + */ + pi.is_guest_mode = false; + ret = irq_set_vcpu_affinity(host_irq, &pi); + + /** + * Check if the posted interrupt was previously + * setup with the guest_mode by checking if the ga_tag + * was cached. If so, we need to clean up the per-vcpu + * ir_list. 
+ */ + if (!ret && pi.prev_ga_tag) { + int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_vcpu_by_id(kvm, id); + if (vcpu) + svm_ir_list_del(to_svm(vcpu), &pi); + } + } + + if (!ret && svm) { + trace_kvm_pi_irte_update(svm->vcpu.vcpu_id, + host_irq, e->gsi, + vcpu_info.vector, + vcpu_info.pi_desc_addr, set); + } + + if (ret < 0) { + pr_err("%s: failed to update PI IRTE\n", __func__); + goto out; + } + } + + ret = 0; +out: + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -5195,6 +5441,7 @@ static struct kvm_x86_ops svm_x86_ops = { .pmu_ops = &amd_pmu_ops, .deliver_posted_interrupt = svm_deliver_avic_intr, + .update_pi_irte = svm_update_pi_irte, }; static int __init svm_init(void) -- cgit v0.10.2 From 80cd8763388b52fa9129cbb4b57a3615a55afd40 Mon Sep 17 00:00:00 2001 From: Fan Zhang Date: Mon, 15 Aug 2016 04:53:22 +0200 Subject: KVM: s390: lazy enable RI Only enable runtime instrumentation if the guest issues an RI related instruction or if userspace changes the riccb to a valid state. This makes entry/exit a tiny bit faster. Initial patch by Christian Borntraeger Signed-off-by: Fan Zhang Signed-off-by: Christian Borntraeger diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index dfd0ca2..1cab8a1 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -29,6 +29,7 @@ static const intercept_handler_t instruction_handlers[256] = { [0x01] = kvm_s390_handle_01, [0x82] = kvm_s390_handle_lpsw, [0x83] = kvm_s390_handle_diag, + [0xaa] = kvm_s390_handle_aa, [0xae] = kvm_s390_handle_sigp, [0xb2] = kvm_s390_handle_b2, [0xb6] = kvm_s390_handle_stctl, diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index dead20a..892abf4 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1938,8 +1938,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->eca |= 1; if (sclp.has_sigpif) vcpu->arch.sie_block->eca |= 0x10000000U; - if (test_kvm_facility(vcpu->kvm, 64)) - vcpu->arch.sie_block->ecb3 |= 0x01; if (test_kvm_facility(vcpu->kvm, 129)) { vcpu->arch.sie_block->eca |= 0x00020000; vcpu->arch.sie_block->ecd |= 0x20000000; @@ -2694,6 +2692,19 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) kvm_clear_async_pf_completion_queue(vcpu); } + /* + * If userspace sets the riccb (e.g. after migration) to a valid state, + * we should enable RI here instead of doing the lazy enablement. 
+ */ + if ((kvm_run->kvm_dirty_regs & KVM_SYNC_RICCB) && + test_kvm_facility(vcpu->kvm, 64)) { + struct runtime_instr_cb *riccb = + (struct runtime_instr_cb *) &kvm_run->s.regs.riccb; + + if (riccb->valid) + vcpu->arch.sie_block->ecb3 |= 0x01; + } + kvm_run->kvm_dirty_regs = 0; } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index c3d4f16..9995cab 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -245,6 +245,7 @@ static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu) /* implemented in priv.c */ int is_valid_psw(psw_t *psw); +int kvm_s390_handle_aa(struct kvm_vcpu *vcpu); int kvm_s390_handle_b2(struct kvm_vcpu *vcpu); int kvm_s390_handle_e5(struct kvm_vcpu *vcpu); int kvm_s390_handle_01(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 4616038..e184353 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -32,6 +32,24 @@ #include "kvm-s390.h" #include "trace.h" +static int handle_ri(struct kvm_vcpu *vcpu) +{ + if (test_kvm_facility(vcpu->kvm, 64)) { + vcpu->arch.sie_block->ecb3 |= 0x01; + kvm_s390_retry_instr(vcpu); + return 0; + } else + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); +} + +int kvm_s390_handle_aa(struct kvm_vcpu *vcpu) +{ + if ((vcpu->arch.sie_block->ipa & 0xf) <= 4) + return handle_ri(vcpu); + else + return -EOPNOTSUPP; +} + /* Handle SCK (SET CLOCK) interception */ static int handle_set_clock(struct kvm_vcpu *vcpu) { @@ -1093,6 +1111,9 @@ static int handle_stctg(struct kvm_vcpu *vcpu) static const intercept_handler_t eb_handlers[256] = { [0x2f] = handle_lctlg, [0x25] = handle_stctg, + [0x60] = handle_ri, + [0x61] = handle_ri, + [0x62] = handle_ri, }; int kvm_s390_handle_eb(struct kvm_vcpu *vcpu) -- cgit v0.10.2 From a6940674c384ebf56aa0c44f417032de2b67100c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 8 Aug 2016 22:39:32 +0200 Subject: KVM: s390: allow 255 VCPUs when sca entries aren't used If the SCA entries aren't used by the hardware (no SIGPIF), we can simply not set the entries, stick to the basic sca and allow more than 64 VCPUs. To hinder any other facility from using these entries, let's properly provoke intercepts by not setting the MCN and keeping the entries unset. This effectively allows when running KVM under KVM (vSIE) or under z/VM to provide more than 64 VCPUs to a guest. Let's limit it to 255 for now, to not run into problems if the CPU numbers are limited somewhere else. 
Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 8e5daf7..876173c 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -28,7 +28,7 @@ #define KVM_S390_BSCA_CPU_SLOTS 64 #define KVM_S390_ESCA_CPU_SLOTS 248 -#define KVM_MAX_VCPUS KVM_S390_ESCA_CPU_SLOTS +#define KVM_MAX_VCPUS 255 #define KVM_USER_MEM_SLOTS 32 /* diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 353e0b7..be4db07 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -42,6 +42,7 @@ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id) if (!(atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_ECALL_PEND)) return 0; + BUG_ON(!kvm_s390_use_sca_entries()); read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { struct esca_block *sca = vcpu->kvm->arch.sca; @@ -70,6 +71,7 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) { int expect, rc; + BUG_ON(!kvm_s390_use_sca_entries()); read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { struct esca_block *sca = vcpu->kvm->arch.sca; @@ -111,6 +113,8 @@ static void sca_clear_ext_call(struct kvm_vcpu *vcpu) struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; int rc, expect; + if (!kvm_s390_use_sca_entries()) + return; atomic_andnot(CPUSTAT_ECALL_PEND, li->cpuflags); read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 892abf4..3a628eb 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -376,7 +376,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_NR_VCPUS: case KVM_CAP_MAX_VCPUS: r = KVM_S390_BSCA_CPU_SLOTS; - if (sclp.has_esca && sclp.has_64bscao) + if (!kvm_s390_use_sca_entries()) + r = KVM_MAX_VCPUS; + else if (sclp.has_esca && sclp.has_64bscao) r = KVM_S390_ESCA_CPU_SLOTS; break; case KVM_CAP_NR_MEMSLOTS: @@ -1553,6 +1555,8 @@ static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu) static void sca_del_vcpu(struct kvm_vcpu *vcpu) { + if (!kvm_s390_use_sca_entries()) + return; read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { struct esca_block *sca = vcpu->kvm->arch.sca; @@ -1570,6 +1574,13 @@ static void sca_del_vcpu(struct kvm_vcpu *vcpu) static void sca_add_vcpu(struct kvm_vcpu *vcpu) { + if (!kvm_s390_use_sca_entries()) { + struct bsca_block *sca = vcpu->kvm->arch.sca; + + /* we still need the basic sca for the ipte control */ + vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); + vcpu->arch.sie_block->scaol = (__u32)(__u64)sca; + } read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { struct esca_block *sca = vcpu->kvm->arch.sca; @@ -1650,6 +1661,11 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) { int rc; + if (!kvm_s390_use_sca_entries()) { + if (id < KVM_MAX_VCPUS) + return true; + return false; + } if (id < KVM_S390_BSCA_CPU_SLOTS) return true; if (!sclp.has_esca || !sclp.has_64bscao) diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 9995cab..3a4e97f 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -20,6 +20,7 @@ #include #include #include +#include typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); @@ -387,4 +388,13 @@ static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm) return &sca->ipte_control; } +static inline int kvm_s390_use_sca_entries(void) +{ + 
/* + * Without SIGP interpretation, only SRS interpretation (if available) + * might use the entries. By not setting the entries and keeping them + * invalid, hardware will not access them but intercept. + */ + return sclp.has_sigpif; +} #endif -- cgit v0.10.2 From a1708a2eaded836b7fe64e89a137336c0e6244ea Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Wed, 24 Aug 2016 19:45:23 +0200 Subject: KVM: s390: Improve determination of sizes in kvm_s390_import_bp_data() * A multiplication for the size determination of a memory allocation indicated that an array data structure should be processed. Thus reuse the corresponding function "kmalloc_array". Suggested-by: Paolo Bonzini This issue was detected also by using the Coccinelle software. * Replace the specification of data structures by pointer dereferences to make the corresponding size determination a bit safer according to the Linux coding style convention. * Delete the local variable "size" which became unnecessary with this refactoring. Signed-off-by: Markus Elfring Acked-by: Cornelia Huck Message-Id: Signed-off-by: Christian Borntraeger diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index d1f8241..70b71ac 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -206,7 +206,7 @@ static int __import_wp_info(struct kvm_vcpu *vcpu, int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - int ret = 0, nr_wp = 0, nr_bp = 0, i, size; + int ret = 0, nr_wp = 0, nr_bp = 0, i; struct kvm_hw_breakpoint *bp_data = NULL; struct kvm_hw_wp_info_arch *wp_info = NULL; struct kvm_hw_bp_info_arch *bp_info = NULL; @@ -216,14 +216,17 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, else if (dbg->arch.nr_hw_bp > MAX_BP_COUNT) return -EINVAL; - size = dbg->arch.nr_hw_bp * sizeof(struct kvm_hw_breakpoint); - bp_data = kmalloc(size, GFP_KERNEL); + bp_data = kmalloc_array(dbg->arch.nr_hw_bp, + sizeof(*bp_data), + GFP_KERNEL); if (!bp_data) { ret = -ENOMEM; goto error; } - if (copy_from_user(bp_data, dbg->arch.hw_bp, size)) { + if (copy_from_user(bp_data, + dbg->arch.hw_bp, + sizeof(*bp_data) * dbg->arch.nr_hw_bp)) { ret = -EFAULT; goto error; } @@ -241,17 +244,19 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, } } - size = nr_wp * sizeof(struct kvm_hw_wp_info_arch); - if (size > 0) { - wp_info = kmalloc(size, GFP_KERNEL); + if (nr_wp > 0) { + wp_info = kmalloc_array(nr_wp, + sizeof(*wp_info), + GFP_KERNEL); if (!wp_info) { ret = -ENOMEM; goto error; } } - size = nr_bp * sizeof(struct kvm_hw_bp_info_arch); - if (size > 0) { - bp_info = kmalloc(size, GFP_KERNEL); + if (nr_bp > 0) { + bp_info = kmalloc_array(nr_bp, + sizeof(*bp_info), + GFP_KERNEL); if (!bp_info) { ret = -ENOMEM; goto error; -- cgit v0.10.2 From 0624a8eb82efd58e457093e8fd4514abd3b37cc0 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Wed, 24 Aug 2016 20:10:09 +0200 Subject: KVM: s390: Use memdup_user() rather than duplicating code * Reuse existing functionality from memdup_user() instead of keeping duplicate source code. This issue was detected by using the Coccinelle software. * Return directly if this copy operation failed. 
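For reference, the generic pattern being adopted is (illustrative sketch only;
"uptr" and "len" are placeholder names, not identifiers from this patch):

	void *buf = memdup_user(uptr, len);	/* kmalloc() + copy_from_user() in one call */
	if (IS_ERR(buf))
		return PTR_ERR(buf);		/* -ENOMEM or -EFAULT */
	/* ... use buf ... */
	kfree(buf);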
Reviewed-by: David Hildenbrand Acked-by: Cornelia Huck Signed-off-by: Markus Elfring Message-Id: Signed-off-by: Christian Borntraeger diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index 70b71ac..d7c6a7f 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -216,20 +216,10 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, else if (dbg->arch.nr_hw_bp > MAX_BP_COUNT) return -EINVAL; - bp_data = kmalloc_array(dbg->arch.nr_hw_bp, - sizeof(*bp_data), - GFP_KERNEL); - if (!bp_data) { - ret = -ENOMEM; - goto error; - } - - if (copy_from_user(bp_data, - dbg->arch.hw_bp, - sizeof(*bp_data) * dbg->arch.nr_hw_bp)) { - ret = -EFAULT; - goto error; - } + bp_data = memdup_user(dbg->arch.hw_bp, + sizeof(*bp_data) * dbg->arch.nr_hw_bp); + if (IS_ERR(bp_data)) + return PTR_ERR(bp_data); for (i = 0; i < dbg->arch.nr_hw_bp; i++) { switch (bp_data[i].type) { -- cgit v0.10.2 From f6f7017192ad62669dc8aa4cb33e5f5a0ecd2d81 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 1 Aug 2016 09:07:52 +0100 Subject: KVM: MIPS: Override HVA error values for EVA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIPS Enhanced Virtual Addressing (EVA) allows the user mode and kernel mode address spaces to overlap, breaking the assumption that PAGE_OFFSET is an appropriate KVM HVA error value, since PAGE_OFFSET may be as low as zero. Fix this in the same way that s390 does in commit bf640876e21f ("KVM: s390: Make KVM_HVA_ERR_BAD usable on s390"), by overriding KVM_HVA_ERR_[RO_]BAD and kvm_is_error_hva() in asm/kvm_host.h. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index b54bcad..4d7e0e4 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -107,6 +107,20 @@ #define KVM_INVALID_INST 0xdeadbeef #define KVM_INVALID_ADDR 0xdeadbeef +/* + * EVA has overlapping user & kernel address spaces, so user VAs may be > + * PAGE_OFFSET. For this reason we can't use the default KVM_HVA_ERR_BAD of + * PAGE_OFFSET. + */ + +#define KVM_HVA_ERR_BAD (-1UL) +#define KVM_HVA_ERR_RO_BAD (-2UL) + +static inline bool kvm_is_error_hva(unsigned long addr) +{ + return IS_ERR_VALUE(addr); +} + extern atomic_t kvm_mips_instance; struct kvm_vm_stat { -- cgit v0.10.2 From d5888477d31c64dba9264fbb33cfa9b066f071d0 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 19 Aug 2016 15:09:47 +0100 Subject: KVM: MIPS: Emulate MMIO via TLB miss for EVA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIPS Enhanced Virtual Addressing (EVA) allows the virtual memory segments to be rearranged such that the KSeg0/KSeg1 segments are accessible TLB mapped to user mode, which would trigger a TLB Miss exception (due to lack of TLB mappings) instead of an Address Error exception. Update the TLB Miss handling similar to Address Error handling for guest MMIO emulation. 
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 0915539..3a5484f 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -175,6 +175,24 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } + } else if (KVM_GUEST_KERNEL_MODE(vcpu) + && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) { + /* + * With EVA we may get a TLB exception instead of an address + * error when the guest performs MMIO to KSeg1 addresses. + */ + kvm_debug("Emulate %s MMIO space\n", + store ? "Store to" : "Load from"); + er = kvm_mips_emulate_inst(cause, opc, run, vcpu); + if (er == EMULATE_FAIL) { + kvm_err("Emulate %s MMIO space failed\n", + store ? "Store to" : "Load from"); + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + ret = RESUME_HOST; + } else { + run->exit_reason = KVM_EXIT_MMIO; + ret = RESUME_HOST; + } } else { kvm_err("Illegal TLB %s fault address , cause %#x, PC: %p, BadVaddr: %#lx\n", store ? "ST" : "LD", cause, opc, badvaddr); -- cgit v0.10.2 From 0eeede0c63305a33de31bd90b53b023c1d452c17 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 2 Sep 2016 17:20:43 +1000 Subject: powerpc/mm: Speed up computation of base and actual page size for a HPTE This replaces a 2-D search through an array with a simple 8-bit table lookup for determining the actual and/or base page size for a HPT entry. The encoding in the second doubleword of the HPTE is designed to encode the actual and base page sizes without using any more bits than would be needed for a 4k page number, by using between 1 and 8 low-order bits of the RPN (real page number) field to encode the page sizes. A single "large page" bit in the first doubleword indicates that these low-order bits are to be interpreted like this. We can determine the page sizes by using the low-order 8 bits of the RPN to look up a 256-entry table. For actual page sizes less than 1MB, some of the upper bits of these 8 bits are going to be real address bits, but we can cope with that by replicating the entries for those smaller page sizes. While we're at it, let's move the hpte_page_size() and hpte_base_page_size() functions from a KVM-specific header to a header for 64-bit HPT systems, since this computation doesn't have anything specifically to do with KVM. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 287a656..e407af2 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -245,6 +245,43 @@ static inline int segment_shift(int ssize) } /* + * This array is indexed by the LP field of the HPTE second dword. + * Since this field may contain some RPN bits, some entries are + * replicated so that we get the same value irrespective of RPN. + * The top 4 bits are the page size index (MMU_PAGE_*) for the + * actual page size, the bottom 4 bits are the base page size. 
+ */ +extern u8 hpte_page_sizes[1 << LP_BITS]; + +static inline unsigned long __hpte_page_size(unsigned long h, unsigned long l, + bool is_base_size) +{ + unsigned int i, lp; + + if (!(h & HPTE_V_LARGE)) + return 1ul << 12; + + /* Look at the 8 bit LP value */ + lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1); + i = hpte_page_sizes[lp]; + if (!i) + return 0; + if (!is_base_size) + i >>= 4; + return 1ul << mmu_psize_defs[i & 0xf].shift; +} + +static inline unsigned long hpte_page_size(unsigned long h, unsigned long l) +{ + return __hpte_page_size(h, l, 0); +} + +static inline unsigned long hpte_base_page_size(unsigned long h, unsigned long l) +{ + return __hpte_page_size(h, l, 1); +} + +/* * The current system page and segment sizes */ extern int mmu_kernel_ssize; diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 88d17b4..4ffd5a1 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -20,6 +20,8 @@ #ifndef __ASM_KVM_BOOK3S_64_H__ #define __ASM_KVM_BOOK3S_64_H__ +#include + #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu) { @@ -97,56 +99,20 @@ static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v) hpte[0] = cpu_to_be64(hpte_v); } -static inline int __hpte_actual_psize(unsigned int lp, int psize) -{ - int i, shift; - unsigned int mask; - - /* start from 1 ignoring MMU_PAGE_4K */ - for (i = 1; i < MMU_PAGE_COUNT; i++) { - - /* invalid penc */ - if (mmu_psize_defs[psize].penc[i] == -1) - continue; - /* - * encoding bits per actual page size - * PTE LP actual page size - * rrrr rrrz >=8KB - * rrrr rrzz >=16KB - * rrrr rzzz >=32KB - * rrrr zzzz >=64KB - * ....... - */ - shift = mmu_psize_defs[i].shift - LP_SHIFT; - if (shift > LP_BITS) - shift = LP_BITS; - mask = (1 << shift) - 1; - if ((lp & mask) == mmu_psize_defs[psize].penc[i]) - return i; - } - return -1; -} - static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, unsigned long pte_index) { - int b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K; + int i, b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K; unsigned int penc; unsigned long rb = 0, va_low, sllp; unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1); if (v & HPTE_V_LARGE) { - for (b_psize = 0; b_psize < MMU_PAGE_COUNT; b_psize++) { - - /* valid entries have a shift value */ - if (!mmu_psize_defs[b_psize].shift) - continue; - - a_psize = __hpte_actual_psize(lp, b_psize); - if (a_psize != -1) - break; - } + i = hpte_page_sizes[lp]; + b_psize = i & 0xf; + a_psize = i >> 4; } + /* * Ignore the top 14 bits of va * v have top two bits covering segment size, hence move @@ -215,45 +181,6 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, return rb; } -static inline unsigned long __hpte_page_size(unsigned long h, unsigned long l, - bool is_base_size) -{ - - int size, a_psize; - /* Look at the 8 bit LP value */ - unsigned int lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1); - - /* only handle 4k, 64k and 16M pages for now */ - if (!(h & HPTE_V_LARGE)) - return 1ul << 12; - else { - for (size = 0; size < MMU_PAGE_COUNT; size++) { - /* valid entries have a shift value */ - if (!mmu_psize_defs[size].shift) - continue; - - a_psize = __hpte_actual_psize(lp, size); - if (a_psize != -1) { - if (is_base_size) - return 1ul << mmu_psize_defs[size].shift; - return 1ul << mmu_psize_defs[a_psize].shift; - } - } - - } - return 0; -} - -static inline unsigned long 
hpte_page_size(unsigned long h, unsigned long l) -{ - return __hpte_page_size(h, l, 0); -} - -static inline unsigned long hpte_base_page_size(unsigned long h, unsigned long l) -{ - return __hpte_page_size(h, l, 1); -} - static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize) { return ((ptel & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT; diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index e2fb408..b78e8d3 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -271,6 +271,7 @@ static inline bool early_radix_enabled(void) #define MMU_PAGE_16G 13 #define MMU_PAGE_64G 14 +/* N.B. we need to change the type of hpte_page_sizes if this gets to be > 16 */ #define MMU_PAGE_COUNT 15 #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 0e4e965..83ddc0e 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -493,36 +493,6 @@ static void native_hugepage_invalidate(unsigned long vsid, } #endif -static inline int __hpte_actual_psize(unsigned int lp, int psize) -{ - int i, shift; - unsigned int mask; - - /* start from 1 ignoring MMU_PAGE_4K */ - for (i = 1; i < MMU_PAGE_COUNT; i++) { - - /* invalid penc */ - if (mmu_psize_defs[psize].penc[i] == -1) - continue; - /* - * encoding bits per actual page size - * PTE LP actual page size - * rrrr rrrz >=8KB - * rrrr rrzz >=16KB - * rrrr rzzz >=32KB - * rrrr zzzz >=64KB - * ....... - */ - shift = mmu_psize_defs[i].shift - LP_SHIFT; - if (shift > LP_BITS) - shift = LP_BITS; - mask = (1 << shift) - 1; - if ((lp & mask) == mmu_psize_defs[psize].penc[i]) - return i; - } - return -1; -} - static void hpte_decode(struct hash_pte *hpte, unsigned long slot, int *psize, int *apsize, int *ssize, unsigned long *vpn) { @@ -538,16 +508,8 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, size = MMU_PAGE_4K; a_size = MMU_PAGE_4K; } else { - for (size = 0; size < MMU_PAGE_COUNT; size++) { - - /* valid entries have a shift value */ - if (!mmu_psize_defs[size].shift) - continue; - - a_size = __hpte_actual_psize(lp, size); - if (a_size != -1) - break; - } + size = hpte_page_sizes[lp] & 0xf; + a_size = hpte_page_sizes[lp] >> 4; } /* This works for all page sizes, and for 256M and 1T segments */ if (cpu_has_feature(CPU_FTR_ARCH_300)) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 0821556..ef3ae89 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -93,6 +93,9 @@ static unsigned long _SDR1; struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; EXPORT_SYMBOL_GPL(mmu_psize_defs); +u8 hpte_page_sizes[1 << LP_BITS]; +EXPORT_SYMBOL_GPL(hpte_page_sizes); + struct hash_pte *htab_address; unsigned long htab_size_bytes; unsigned long htab_hash_mask; @@ -564,8 +567,60 @@ static void __init htab_scan_page_sizes(void) #endif /* CONFIG_HUGETLB_PAGE */ } +/* + * Fill in the hpte_page_sizes[] array. + * We go through the mmu_psize_defs[] array looking for all the + * supported base/actual page size combinations. Each combination + * has a unique pagesize encoding (penc) value in the low bits of + * the LP field of the HPTE. For actual page sizes less than 1MB, + * some of the upper LP bits are used for RPN bits, meaning that + * we need to fill in several entries in hpte_page_sizes[]. 
+ * + * In diagrammatic form, with r = RPN bits and z = page size bits: + * PTE LP actual page size + * rrrr rrrz >=8KB + * rrrr rrzz >=16KB + * rrrr rzzz >=32KB + * rrrr zzzz >=64KB + * ... + * + * The zzzz bits are implementation-specific but are chosen so that + * no encoding for a larger page size uses the same value in its + * low-order N bits as the encoding for the 2^(12+N) byte page size + * (if it exists). + */ +static void init_hpte_page_sizes(void) +{ + long int ap, bp; + long int shift, penc; + + for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) { + if (!mmu_psize_defs[bp].shift) + continue; /* not a supported page size */ + for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) { + penc = mmu_psize_defs[bp].penc[ap]; + if (penc == -1) + continue; + shift = mmu_psize_defs[ap].shift - LP_SHIFT; + if (shift <= 0) + continue; /* should never happen */ + /* + * For page sizes less than 1MB, this loop + * replicates the entry for all possible values + * of the rrrr bits. + */ + while (penc < (1 << LP_BITS)) { + hpte_page_sizes[penc] = (ap << 4) | bp; + penc += 1 << shift; + } + } + } +} + static void __init htab_init_page_sizes(void) { + init_hpte_page_sizes(); + if (!debug_pagealloc_enabled()) { /* * Pick a size for the linear mapping. Currently, we only -- cgit v0.10.2 From 07b1fdf5bd135d94eff2b3a6849b90c358963066 Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Fri, 19 Aug 2016 15:35:45 +1000 Subject: powerpc: Add simple cache inhibited MMIO accessors Add simple cache inhibited accessors for memory mapped I/O. Unlike the accessors built from the DEF_MMIO_* macros, these don't include any hardware memory barriers, callers need to manage memory barriers on their own. These can only be called in hypervisor real mode. Signed-off-by: Suresh Warrier [paulus@ozlabs.org - added line to comment] Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 2fd1690..f6fda84 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -241,6 +241,35 @@ static inline void out_be64(volatile u64 __iomem *addr, u64 val) #endif #endif /* __powerpc64__ */ + +/* + * Simple Cache inhibited accessors + * Unlike the DEF_MMIO_* macros, these don't include any h/w memory + * barriers, callers need to manage memory barriers on their own. + * These can only be used in hypervisor real mode. + */ + +static inline u32 _lwzcix(unsigned long addr) +{ + u32 ret; + + __asm__ __volatile__("lwzcix %0,0, %1" + : "=r" (ret) : "r" (addr) : "memory"); + return ret; +} + +static inline void _stbcix(u64 addr, u8 val) +{ + __asm__ __volatile__("stbcix %0,0,%1" + : : "r" (val), "r" (addr) : "memory"); +} + +static inline void _stwcix(u64 addr, u32 val) +{ + __asm__ __volatile__("stwcix %0,0,%1" + : : "r" (val), "r" (addr) : "memory"); +} + /* * Low level IO stream instructions are defined out of line for now */ -- cgit v0.10.2 From 4ee11c1a9f7cc20026bb66ac624533310a605312 Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Fri, 19 Aug 2016 15:35:49 +1000 Subject: powerpc/powernv: Provide facilities for EOI, usable from real mode This adds a new function pnv_opal_pci_msi_eoi() which does the part of end-of-interrupt (EOI) handling of an MSI which involves doing an OPAL call. This function can be called in real mode. This doesn't just export pnv_ioda2_msi_eoi() because that does a call to icp_native_eoi(), which does not work in real mode. 
This also adds a function, is_pnv_opal_msi(), which KVM can call to check whether an interrupt is one for which we should be calling pnv_opal_pci_msi_eoi() when we need to do an EOI. [paulus@ozlabs.org - split out the addition of pnv_opal_pci_msi_eoi() from Suresh's patch "KVM: PPC: Book3S HV: Handle passthrough interrupts in guest"; added is_pnv_opal_msi(); wrote description.] Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/pnv-pci.h b/arch/powerpc/include/asm/pnv-pci.h index 0cbd813..1b46b52 100644 --- a/arch/powerpc/include/asm/pnv-pci.h +++ b/arch/powerpc/include/asm/pnv-pci.h @@ -12,6 +12,7 @@ #include #include +#include #include #include @@ -33,6 +34,8 @@ int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num); void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num); int pnv_cxl_get_irq_count(struct pci_dev *dev); struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev); +int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq); +bool is_pnv_opal_msi(struct irq_chip *chip); #ifdef CONFIG_CXL_BASE int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs, diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index fd9444f..9ce48ae 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2710,15 +2710,21 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, } #ifdef CONFIG_PCI_MSI -static void pnv_ioda2_msi_eoi(struct irq_data *d) +int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq) { - unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); - struct irq_chip *chip = irq_data_get_irq_chip(d); struct pnv_phb *phb = container_of(chip, struct pnv_phb, ioda.irq_chip); + + return opal_pci_msi_eoi(phb->opal_id, hw_irq); +} + +static void pnv_ioda2_msi_eoi(struct irq_data *d) +{ int64_t rc; + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + struct irq_chip *chip = irq_data_get_irq_chip(d); - rc = opal_pci_msi_eoi(phb->opal_id, hw_irq); + rc = pnv_opal_pci_msi_eoi(chip, hw_irq); WARN_ON_ONCE(rc); icp_native_eoi(d); @@ -2748,6 +2754,16 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq) irq_set_chip(virq, &phb->ioda.irq_chip); } +/* + * Returns true iff chip is something that we could call + * pnv_opal_pci_msi_eoi for. + */ +bool is_pnv_opal_msi(struct irq_chip *chip) +{ + return chip->irq_eoi == pnv_ioda2_msi_eoi; +} +EXPORT_SYMBOL_GPL(is_pnv_opal_msi); + static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, unsigned int hwirq, unsigned int virq, unsigned int is_64, struct msi_msg *msg) -- cgit v0.10.2 From 3f2577749948803491874c6895fec559f3473eab Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Aug 2016 15:07:43 +0200 Subject: powerpc: move hmi.c to arch/powerpc/kvm/ hmi.c functions are unused unless sibling_subcore_state is nonzero, and that in turn happens only if KVM is in use. So move the code to arch/powerpc/kvm/, putting it under CONFIG_KVM_BOOK3S_HV_POSSIBLE rather than CONFIG_PPC_BOOK3S_64. The sibling_subcore_state is also included in struct paca_struct only if KVM is supported by the kernel. 
Cc: Daniel Axtens Cc: Michael Ellerman Cc: Mahesh Salgaonkar Cc: Paul Mackerras Cc: linuxppc-dev@lists.ozlabs.org Cc: kvm-ppc@vger.kernel.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/hmi.h b/arch/powerpc/include/asm/hmi.h index 88b4901..85b7a1a 100644 --- a/arch/powerpc/include/asm/hmi.h +++ b/arch/powerpc/include/asm/hmi.h @@ -21,7 +21,7 @@ #ifndef __ASM_PPC64_HMI_H__ #define __ASM_PPC64_HMI_H__ -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE #define CORE_TB_RESYNC_REQ_BIT 63 #define MAX_SUBCORE_PER_CORE 4 diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 148303e7..6a6792b 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -183,11 +183,6 @@ struct paca_struct { */ u16 in_mce; u8 hmi_event_available; /* HMI event is available */ - /* - * Bitmap for sibling subcore status. See kvm/book3s_hv_ras.c for - * more details - */ - struct sibling_subcore_state *sibling_subcore_state; #endif /* Stuff for accurate time accounting */ @@ -202,6 +197,13 @@ struct paca_struct { struct kvmppc_book3s_shadow_vcpu shadow_vcpu; #endif struct kvmppc_host_state kvm_hstate; +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + /* + * Bitmap for sibling subcore status. See kvm/book3s_hv_ras.c for + * more details + */ + struct sibling_subcore_state *sibling_subcore_state; +#endif #endif }; diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index b2027a5..fe4c075 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -41,7 +41,7 @@ obj-$(CONFIG_VDSO32) += vdso32/ obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o -obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o hmi.o +obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o idle_book3e.o obj-$(CONFIG_PPC64) += vdso64/ obj-$(CONFIG_ALTIVEC) += vecemu.o diff --git a/arch/powerpc/kernel/hmi.c b/arch/powerpc/kernel/hmi.c deleted file mode 100644 index e3f738e..0000000 --- a/arch/powerpc/kernel/hmi.c +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Hypervisor Maintenance Interrupt (HMI) handling. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. - * - * Copyright 2015 IBM Corporation - * Author: Mahesh Salgaonkar - */ - -#undef DEBUG - -#include -#include -#include -#include - -void wait_for_subcore_guest_exit(void) -{ - int i; - - /* - * NULL bitmap pointer indicates that KVM module hasn't - * been loaded yet and hence no guests are running. - * If no KVM is in use, no need to co-ordinate among threads - * as all of them will always be in host and no one is going - * to modify TB other than the opal hmi handler. - * Hence, just return from here. 
- */ - if (!local_paca->sibling_subcore_state) - return; - - for (i = 0; i < MAX_SUBCORE_PER_CORE; i++) - while (local_paca->sibling_subcore_state->in_guest[i]) - cpu_relax(); -} - -void wait_for_tb_resync(void) -{ - if (!local_paca->sibling_subcore_state) - return; - - while (test_bit(CORE_TB_RESYNC_REQ_BIT, - &local_paca->sibling_subcore_state->flags)) - cpu_relax(); -} diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 1f9e552..855d4b9 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -78,6 +78,7 @@ kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \ ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ + book3s_hv_hmi.o \ book3s_hv_rmhandlers.o \ book3s_hv_rm_mmu.o \ book3s_hv_ras.o \ diff --git a/arch/powerpc/kvm/book3s_hv_hmi.c b/arch/powerpc/kvm/book3s_hv_hmi.c new file mode 100644 index 0000000..e3f738e --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_hmi.c @@ -0,0 +1,56 @@ +/* + * Hypervisor Maintenance Interrupt (HMI) handling. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. + * + * Copyright 2015 IBM Corporation + * Author: Mahesh Salgaonkar + */ + +#undef DEBUG + +#include +#include +#include +#include + +void wait_for_subcore_guest_exit(void) +{ + int i; + + /* + * NULL bitmap pointer indicates that KVM module hasn't + * been loaded yet and hence no guests are running. + * If no KVM is in use, no need to co-ordinate among threads + * as all of them will always be in host and no one is going + * to modify TB other than the opal hmi handler. + * Hence, just return from here. + */ + if (!local_paca->sibling_subcore_state) + return; + + for (i = 0; i < MAX_SUBCORE_PER_CORE; i++) + while (local_paca->sibling_subcore_state->in_guest[i]) + cpu_relax(); +} + +void wait_for_tb_resync(void) +{ + if (!local_paca->sibling_subcore_state) + return; + + while (test_bit(CORE_TB_RESYNC_REQ_BIT, + &local_paca->sibling_subcore_state->flags)) + cpu_relax(); +} -- cgit v0.10.2 From 37f55d30df2eef89b97d627e5830beb6983c4101 Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Fri, 19 Aug 2016 15:35:46 +1000 Subject: KVM: PPC: Book3S HV: Convert kvmppc_read_intr to a C function Modify kvmppc_read_intr to make it a C function. Because it is called from kvmppc_check_wake_reason, any of the assembler code that calls either kvmppc_read_intr or kvmppc_check_wake_reason now has to assume that the volatile registers might have been modified. This also adds in the optimization of clearing saved_xirr in the case where we completely handle and EOI an IPI. Without this, the next device interrupt will require two trips through the host interrupt handling code. [paulus@ozlabs.org - made kvmppc_check_wake_reason create a stack frame when it is calling kvmppc_read_intr, which means we can set r12 to the trap number (0x500) after the call to kvmppc_read_intr, instead of using r31. 
Also moved the deliver_guest_interrupt label so as to restore XER and CTR, plus other minor tweaks.] Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 5f0380d..b476a6a 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -25,6 +25,7 @@ #include #include #include +#include #define KVM_CMA_CHUNK_ORDER 18 @@ -286,3 +287,86 @@ void kvmhv_commence_exit(int trap) struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv; EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv); + +/* + * Determine what sort of external interrupt is pending (if any). + * Returns: + * 0 if no interrupt is pending + * 1 if an interrupt is pending that needs to be handled by the host + * -1 if there was a guest wakeup IPI (which has now been cleared) + */ + +long kvmppc_read_intr(void) +{ + unsigned long xics_phys; + u32 h_xirr; + __be32 xirr; + u32 xisr; + u8 host_ipi; + + /* see if a host IPI is pending */ + host_ipi = local_paca->kvm_hstate.host_ipi; + if (host_ipi) + return 1; + + /* Now read the interrupt from the ICP */ + xics_phys = local_paca->kvm_hstate.xics_phys; + if (unlikely(!xics_phys)) + return 1; + + /* + * Save XIRR for later. Since we get control in reverse endian + * on LE systems, save it byte reversed and fetch it back in + * host endian. Note that xirr is the value read from the + * XIRR register, while h_xirr is the host endian version. + */ + xirr = _lwzcix(xics_phys + XICS_XIRR); + h_xirr = be32_to_cpu(xirr); + local_paca->kvm_hstate.saved_xirr = h_xirr; + xisr = h_xirr & 0xffffff; + /* + * Ensure that the store/load complete to guarantee all side + * effects of loading from XIRR has completed + */ + smp_mb(); + + /* if nothing pending in the ICP */ + if (!xisr) + return 0; + + /* We found something in the ICP... + * + * If it is an IPI, clear the MFRR and EOI it. + */ + if (xisr == XICS_IPI) { + _stbcix(xics_phys + XICS_MFRR, 0xff); + _stwcix(xics_phys + XICS_XIRR, xirr); + /* + * Need to ensure side effects of above stores + * complete before proceeding. + */ + smp_mb(); + + /* + * We need to re-check host IPI now in case it got set in the + * meantime. If it's clear, we bounce the interrupt to the + * guest + */ + host_ipi = local_paca->kvm_hstate.host_ipi; + if (unlikely(host_ipi != 0)) { + /* We raced with the host, + * we need to resend that IPI, bummer + */ + _stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY); + /* Let side effects complete */ + smp_mb(); + return 1; + } + + /* OK, it's an IPI for us */ + local_paca->kvm_hstate.saved_xirr = 0; + return -1; + } + + return 1; +} diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 9756555..dccfa85 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -221,6 +221,13 @@ kvmppc_primary_no_guest: li r3, 0 /* Don't wake on privileged (OS) doorbell */ b kvm_do_nap +/* + * kvm_novcpu_wakeup + * Entered from kvm_start_guest if kvm_hstate.napping is set + * to NAPPING_NOVCPU + * r2 = kernel TOC + * r13 = paca + */ kvm_novcpu_wakeup: ld r1, HSTATE_HOST_R1(r13) ld r5, HSTATE_KVM_VCORE(r13) @@ -230,6 +237,13 @@ kvm_novcpu_wakeup: /* check the wake reason */ bl kvmppc_check_wake_reason + /* + * Restore volatile registers since we could have called + * a C routine in kvmppc_check_wake_reason. 
+ * r5 = VCORE + */ + ld r5, HSTATE_KVM_VCORE(r13) + /* see if any other thread is already exiting */ lwz r0, VCORE_ENTRY_EXIT(r5) cmpwi r0, 0x100 @@ -322,6 +336,11 @@ kvm_start_guest: /* Check the wake reason in SRR1 to see why we got here */ bl kvmppc_check_wake_reason + /* + * kvmppc_check_wake_reason could invoke a C routine, but we + * have no volatile registers to restore when we return. + */ + cmpdi r3, 0 bge kvm_no_guest @@ -881,6 +900,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) cmpwi r3, 512 /* 1 microsecond */ blt hdec_soon +deliver_guest_interrupt: ld r6, VCPU_CTR(r4) ld r7, VCPU_XER(r4) @@ -895,7 +915,6 @@ kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */ mtspr SPRN_SRR0, r6 mtspr SPRN_SRR1, r7 -deliver_guest_interrupt: /* r11 = vcpu->arch.msr & ~MSR_HV */ rldicl r11, r11, 63 - MSR_HV_LG, 1 rotldi r11, r11, 1 + MSR_HV_LG @@ -1155,10 +1174,36 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) * set, we know the host wants us out so let's do it now */ bl kvmppc_read_intr + + /* + * Restore the active volatile registers after returning from + * a C function. + */ + ld r9, HSTATE_KVM_VCPU(r13) + li r12, BOOK3S_INTERRUPT_EXTERNAL + + /* + * kvmppc_read_intr return codes: + * + * Exit to host (r3 > 0) + * 1 An interrupt is pending that needs to be handled by the host + * Exit guest and return to host by branching to guest_exit_cont + * + * Before returning to guest, we check if any CPU is heading out + * to the host and if so, we head out also. If no CPUs are heading + * check return values <= 0. + * + * Return to guest (r3 <= 0) + * 0 No external interrupt is pending + * -1 A guest wakeup IPI (which has now been cleared) + * In either case, we return to guest to deliver any pending + * guest interrupts. + */ + cmpdi r3, 0 bgt guest_exit_cont - /* Check if any CPU is heading out to the host, if so head out too */ + /* Return code <= 0 */ 4: ld r5, HSTATE_KVM_VCORE(r13) lwz r0, VCORE_ENTRY_EXIT(r5) cmpwi r0, 0x100 @@ -2213,10 +2258,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM) ld r29, VCPU_GPR(R29)(r4) ld r30, VCPU_GPR(R30)(r4) ld r31, VCPU_GPR(R31)(r4) - + /* Check the wake reason in SRR1 to see why we got here */ bl kvmppc_check_wake_reason + /* + * Restore volatile registers since we could have called a + * C routine in kvmppc_check_wake_reason + * r4 = VCPU + * r3 tells us whether we need to return to host or not + * WARNING: it gets checked further down: + * should not modify r3 until this check is done. + */ + ld r4, HSTATE_KVM_VCPU(r13) + /* clear our bit in vcore->napping_threads */ 34: ld r5,HSTATE_KVM_VCORE(r13) lbz r7,HSTATE_PTID(r13) @@ -2230,7 +2285,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM) li r0,0 stb r0,HSTATE_NAPPING(r13) - /* See if the wake reason means we need to exit */ + /* See if the wake reason saved in r3 means we need to exit */ stw r12, VCPU_TRAP(r4) mr r9, r4 cmpdi r3, 0 @@ -2300,7 +2355,9 @@ machine_check_realmode: * * Also sets r12 to the interrupt vector for any interrupt that needs * to be handled now by the host (0x500 for external interrupt), or zero. - * Modifies r0, r6, r7, r8. + * Modifies all volatile registers (since it may call a C function). + * This routine calls kvmppc_read_intr, a C function, if an external + * interrupt is pending. */ kvmppc_check_wake_reason: mfspr r6, SPRN_SRR1 @@ -2310,8 +2367,7 @@ FTR_SECTION_ELSE rlwinm r6, r6, 45-31, 0xe /* P7 wake reason field is 3 bits */ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S) cmpwi r6, 8 /* was it an external interrupt? 
*/ - li r12, BOOK3S_INTERRUPT_EXTERNAL - beq kvmppc_read_intr /* if so, see what it was */ + beq 7f /* if so, see what it was */ li r3, 0 li r12, 0 cmpwi r6, 6 /* was it the decrementer? */ @@ -2350,83 +2406,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) li r3, 1 blr -/* - * Determine what sort of external interrupt is pending (if any). - * Returns: - * 0 if no interrupt is pending - * 1 if an interrupt is pending that needs to be handled by the host - * -1 if there was a guest wakeup IPI (which has now been cleared) - * Modifies r0, r6, r7, r8, returns value in r3. - */ -kvmppc_read_intr: - /* see if a host IPI is pending */ - li r3, 1 - lbz r0, HSTATE_HOST_IPI(r13) - cmpwi r0, 0 - bne 1f - - /* Now read the interrupt from the ICP */ - ld r6, HSTATE_XICS_PHYS(r13) - li r7, XICS_XIRR - cmpdi r6, 0 - beq- 1f - lwzcix r0, r6, r7 - /* - * Save XIRR for later. Since we get in in reverse endian on LE - * systems, save it byte reversed and fetch it back in host endian. - */ - li r3, HSTATE_SAVED_XIRR - STWX_BE r0, r3, r13 -#ifdef __LITTLE_ENDIAN__ - lwz r3, HSTATE_SAVED_XIRR(r13) -#else - mr r3, r0 -#endif - rlwinm. r3, r3, 0, 0xffffff - sync - beq 1f /* if nothing pending in the ICP */ - - /* We found something in the ICP... - * - * If it's not an IPI, stash it in the PACA and return to - * the host, we don't (yet) handle directing real external - * interrupts directly to the guest - */ - cmpwi r3, XICS_IPI /* if there is, is it an IPI? */ - bne 42f - - /* It's an IPI, clear the MFRR and EOI it */ - li r3, 0xff - li r8, XICS_MFRR - stbcix r3, r6, r8 /* clear the IPI */ - stwcix r0, r6, r7 /* EOI it */ - sync - - /* We need to re-check host IPI now in case it got set in the - * meantime. If it's clear, we bounce the interrupt to the - * guest - */ - lbz r0, HSTATE_HOST_IPI(r13) - cmpwi r0, 0 - bne- 43f - - /* OK, it's an IPI for us */ - li r12, 0 - li r3, -1 -1: blr - -42: /* It's not an IPI and it's for the host. We saved a copy of XIRR in - * the PACA earlier, it will be picked up by the host ICP driver - */ - li r3, 1 - b 1b - -43: /* We raced with the host, we need to resend that IPI, bummer */ - li r0, IPI_PRIORITY - stbcix r0, r6, r8 /* set the IPI */ - sync - li r3, 1 - b 1b + /* external interrupt - create a stack frame so we can call C */ +7: mflr r0 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -PPC_MIN_STKFRM(r1) + bl kvmppc_read_intr + nop + li r12, BOOK3S_INTERRUPT_EXTERNAL + ld r0, PPC_MIN_STKFRM+PPC_LR_STKOFF(r1) + addi r1, r1, PPC_MIN_STKFRM + mtlr r0 + blr /* * Save away FP, VMX and VSX registers. -- cgit v0.10.2 From 9576730d0e6e301343c5aead5418ad53fcecfd14 Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Fri, 19 Aug 2016 15:35:47 +1000 Subject: KVM: PPC: select IRQ_BYPASS_MANAGER Select IRQ_BYPASS_MANAGER for PPC when CONFIG_KVM is set. Add the PPC producer functions for add and del producer. [paulus@ozlabs.org - Moved new functions from book3s.c to powerpc.c so booke compiles; added kvm_arch_has_irq_bypass implementation.] 
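For context (not something the patch itself adds): with IRQ_BYPASS_MANAGER selected, a producer (typically VFIO, acting on behalf of the passed-through device interrupt) and a consumer (the KVM irqfd) each register with the bypass manager, which pairs them by their eventfd token and then calls kvm_arch_irq_bypass_add_producer(). The new kvmppc_ops hooks below are how the HV (and potentially PR) implementations plug into that callback.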
Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2544eda..94715e2 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -287,6 +287,10 @@ struct kvmppc_ops { long (*arch_vm_ioctl)(struct file *filp, unsigned int ioctl, unsigned long arg); int (*hcall_implemented)(unsigned long hcall); + int (*irq_bypass_add_producer)(struct irq_bypass_consumer *, + struct irq_bypass_producer *); + void (*irq_bypass_del_producer)(struct irq_bypass_consumer *, + struct irq_bypass_producer *); }; extern struct kvmppc_ops *kvmppc_hv_ops; diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index c0a2478..029be26 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -23,6 +23,8 @@ config KVM select HAVE_KVM_EVENTFD select SRCU select KVM_VFIO + select IRQ_BYPASS_MANAGER + select HAVE_KVM_IRQ_BYPASS config KVM_BOOK3S_HANDLER bool diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index ab2b3d5..0b7d664 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -27,6 +27,8 @@ #include #include #include +#include +#include #include #include #include @@ -739,6 +741,42 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) #endif } +/* + * irq_bypass_add_producer and irq_bypass_del_producer are only + * useful if the architecture supports PCI passthrough. + * irq_bypass_stop and irq_bypass_start are not needed and so + * kvm_ops are not defined for them. + */ +bool kvm_arch_has_irq_bypass(void) +{ + return ((kvmppc_hv_ops && kvmppc_hv_ops->irq_bypass_add_producer) || + (kvmppc_pr_ops && kvmppc_pr_ops->irq_bypass_add_producer)); +} + +int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; + + if (kvm->arch.kvm_ops->irq_bypass_add_producer) + return kvm->arch.kvm_ops->irq_bypass_add_producer(cons, prod); + + return 0; +} + +void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; + + if (kvm->arch.kvm_ops->irq_bypass_del_producer) + kvm->arch.kvm_ops->irq_bypass_del_producer(cons, prod); +} + static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, struct kvm_run *run) { -- cgit v0.10.2 From 8daaafc88b46fb3af952e92d7c2816a8950e1363 Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Fri, 19 Aug 2016 15:35:48 +1000 Subject: KVM: PPC: Book3S HV: Introduce kvmppc_passthru_irqmap This patch introduces an IRQ mapping structure, the kvmppc_passthru_irqmap structure that is to be used to map the real hardware IRQ in the host with the virtual hardware IRQ (gsi) that is injected into a guest by KVM for passthrough adapters. Currently, we assume a separate IRQ mapping structure for each guest. Each kvmppc_passthru_irqmap has a mapping arrays, containing all defined real<->virtual IRQs. [paulus@ozlabs.org - removed irq_chip field from struct kvmppc_passthru_irqmap; changed parameter for kvmppc_get_passthru_irqmap from struct kvm_vcpu * to struct kvm *, removed small cached array.] 
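As an illustration of how the mapping array defined below is intended to be used (the helper name here is made up for the example and is not part of the patch; the field names are the ones added to kvm_host.h), a lookup by guest interrupt number might look like:

        static struct kvmppc_irq_map *find_map_by_gsi(
                        struct kvmppc_passthru_irqmap *pimap, u32 guest_gsi)
        {
                int i;

                for (i = 0; i < pimap->n_mapped; i++)
                        if (pimap->mapped[i].v_hwirq == guest_gsi)
                                return &pimap->mapped[i];
                return NULL;    /* no passthrough mapping for this GSI */
        }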
Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 373003f..89ac1f6 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -203,6 +203,8 @@ struct kvmppc_spapr_tce_table { struct kvmppc_xics; struct kvmppc_icp; +struct kvmppc_passthru_irqmap; + /* * The reverse mapping array has one entry for each HPTE, * which stores the guest's view of the second word of the HPTE @@ -273,6 +275,7 @@ struct kvm_arch { #endif #ifdef CONFIG_KVM_XICS struct kvmppc_xics *xics; + struct kvmppc_passthru_irqmap *pimap; #endif struct kvmppc_ops *kvm_ops; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -369,6 +372,20 @@ struct kvmhv_tb_accumulator { u64 tb_max; /* max time */ }; +#ifdef CONFIG_PPC_BOOK3S_64 +struct kvmppc_irq_map { + u32 r_hwirq; + u32 v_hwirq; + struct irq_desc *desc; +}; + +#define KVMPPC_PIRQ_MAPPED 1024 +struct kvmppc_passthru_irqmap { + int n_mapped; + struct kvmppc_irq_map mapped[KVMPPC_PIRQ_MAPPED]; +}; +#endif + # ifdef CONFIG_PPC_FSL_BOOK3E #define KVMPPC_BOOKE_IAC_NUM 2 #define KVMPPC_BOOKE_DAC_NUM 2 diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 94715e2..4ca2ba3 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -457,8 +457,18 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { return vcpu->arch.irq_type == KVMPPC_IRQ_XICS; } + +static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap( + struct kvm *kvm) +{ + if (kvm) + return kvm->arch.pimap; + return NULL; +} + extern void kvmppc_alloc_host_rm_ops(void); extern void kvmppc_free_host_rm_ops(void); +extern void kvmppc_free_pimap(struct kvm *kvm); extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server); extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args); @@ -470,8 +480,12 @@ extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev, extern void kvmppc_xics_ipi_action(void); extern int h_ipi_redirect; #else +static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap( + struct kvm *kvm) + { return NULL; } static inline void kvmppc_alloc_host_rm_ops(void) {}; static inline void kvmppc_free_host_rm_ops(void) {}; +static inline void kvmppc_free_pimap(struct kvm *kvm) {}; static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { return 0; } static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 30ff8ab..d5c7061 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3412,6 +3412,19 @@ static int kvmppc_core_check_processor_compat_hv(void) return 0; } +#ifdef CONFIG_KVM_XICS + +void kvmppc_free_pimap(struct kvm *kvm) +{ + kfree(kvm->arch.pimap); +} + +struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void) +{ + return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL); +} +#endif + static long kvm_arch_vm_ioctl_hv(struct file *filp, unsigned int ioctl, unsigned long arg) { -- cgit v0.10.2 From c57875f5f9be2cea1f1cc83f815a3aadabedcdd3 Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Fri, 19 Aug 2016 15:35:50 +1000 Subject: KVM: PPC: Book3S HV: Enable IRQ bypass Add the irq_bypass_add_producer and irq_bypass_del_producer functions. These functions get called whenever a GSI is being defined for a guest. 
They create/remove the mapping between host real IRQ numbers and the guest GSI. Add the following helper functions to manage the passthrough IRQ map. kvmppc_set_passthru_irq() Creates a mapping in the passthrough IRQ map that maps a host IRQ to a guest GSI. It allocates the structure (one per guest VM) the first time it is called. kvmppc_clr_passthru_irq() Removes the passthrough IRQ map entry given a guest GSI. The passthrough IRQ map structure is not freed even when the number of mapped entries goes to zero. It is only freed when the VM is destroyed. [paulus@ozlabs.org - modified to use is_pnv_opal_msi() rather than requiring all passed-through interrupts to use the same irq_chip; changed deletion so it zeroes out the r_hwirq field rather than copying the last entry down and decrementing the number of entries.] Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index d5c7061..a7fe178 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -53,10 +53,13 @@ #include #include #include +#include #include #include #include #include +#include +#include #include #include @@ -3377,6 +3380,8 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) kvmppc_free_vcores(kvm); kvmppc_free_hpt(kvm); + + kvmppc_free_pimap(kvm); } /* We don't need to emulate any privileged instructions or dcbz */ @@ -3419,10 +3424,159 @@ void kvmppc_free_pimap(struct kvm *kvm) kfree(kvm->arch.pimap); } -struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void) +static struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void) { return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL); } + +static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) +{ + struct irq_desc *desc; + struct kvmppc_irq_map *irq_map; + struct kvmppc_passthru_irqmap *pimap; + struct irq_chip *chip; + int i; + + desc = irq_to_desc(host_irq); + if (!desc) + return -EIO; + + mutex_lock(&kvm->lock); + + pimap = kvm->arch.pimap; + if (pimap == NULL) { + /* First call, allocate structure to hold IRQ map */ + pimap = kvmppc_alloc_pimap(); + if (pimap == NULL) { + mutex_unlock(&kvm->lock); + return -ENOMEM; + } + kvm->arch.pimap = pimap; + } + + /* + * For now, we only support interrupts for which the EOI operation + * is an OPAL call followed by a write to XIRR, since that's + * what our real-mode EOI code does. + */ + chip = irq_data_get_irq_chip(&desc->irq_data); + if (!chip || !is_pnv_opal_msi(chip)) { + pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n", + host_irq, guest_gsi); + mutex_unlock(&kvm->lock); + return -ENOENT; + } + + /* + * See if we already have an entry for this guest IRQ number. + * If it's mapped to a hardware IRQ number, that's an error, + * otherwise re-use this entry. 
+ */ + for (i = 0; i < pimap->n_mapped; i++) { + if (guest_gsi == pimap->mapped[i].v_hwirq) { + if (pimap->mapped[i].r_hwirq) { + mutex_unlock(&kvm->lock); + return -EINVAL; + } + break; + } + } + + if (i == KVMPPC_PIRQ_MAPPED) { + mutex_unlock(&kvm->lock); + return -EAGAIN; /* table is full */ + } + + irq_map = &pimap->mapped[i]; + + irq_map->v_hwirq = guest_gsi; + irq_map->r_hwirq = desc->irq_data.hwirq; + irq_map->desc = desc; + + if (i == pimap->n_mapped) + pimap->n_mapped++; + + mutex_unlock(&kvm->lock); + + return 0; +} + +static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) +{ + struct irq_desc *desc; + struct kvmppc_passthru_irqmap *pimap; + int i; + + desc = irq_to_desc(host_irq); + if (!desc) + return -EIO; + + mutex_lock(&kvm->lock); + + if (kvm->arch.pimap == NULL) { + mutex_unlock(&kvm->lock); + return 0; + } + pimap = kvm->arch.pimap; + + for (i = 0; i < pimap->n_mapped; i++) { + if (guest_gsi == pimap->mapped[i].v_hwirq) + break; + } + + if (i == pimap->n_mapped) { + mutex_unlock(&kvm->lock); + return -ENODEV; + } + + /* invalidate the entry */ + pimap->mapped[i].r_hwirq = 0; + + /* + * We don't free this structure even when the count goes to + * zero. The structure is freed when we destroy the VM. + */ + + mutex_unlock(&kvm->lock); + return 0; +} + +static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + int ret = 0; + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + irqfd->producer = prod; + + ret = kvmppc_set_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi); + if (ret) + pr_info("kvmppc_set_passthru_irq (irq %d, gsi %d) fails: %d\n", + prod->irq, irqfd->gsi, ret); + + return ret; +} + +static void kvmppc_irq_bypass_del_producer_hv(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + int ret; + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + irqfd->producer = NULL; + + /* + * When producer of consumer is unregistered, we change back to + * default external interrupt handling mode - KVM real mode + * will switch back to host. + */ + ret = kvmppc_clr_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi); + if (ret) + pr_warn("kvmppc_clr_passthru_irq (irq %d, gsi %d) fails: %d\n", + prod->irq, irqfd->gsi, ret); +} #endif static long kvm_arch_vm_ioctl_hv(struct file *filp, @@ -3543,6 +3697,10 @@ static struct kvmppc_ops kvm_ops_hv = { .fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv, .arch_vm_ioctl = kvm_arch_vm_ioctl_hv, .hcall_implemented = kvmppc_hcall_impl_hv, +#ifdef CONFIG_KVM_XICS + .irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv, + .irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv, +#endif }; static int kvm_init_subcore_bitmap(void) -- cgit v0.10.2 From e3c13e56a4717ee334837a20c596e527eb6355e1 Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Fri, 19 Aug 2016 15:35:51 +1000 Subject: KVM: PPC: Book3S HV: Handle passthrough interrupts in guest Currently, KVM switches back to the host to handle any external interrupt (when the interrupt is received while running in the guest). This patch updates real-mode KVM to check if an interrupt is generated by a passthrough adapter that is owned by this guest. If so, the real mode KVM will directly inject the corresponding virtual interrupt to the guest VCPU's ICS and also EOI the interrupt in hardware. In short, the interrupt is handled entirely in real mode in the guest context without switching back to the host. 
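The fast path can be summarised roughly as follows (a simplified sketch; the actual logic is split across kvmppc_check_passthru() and kvmppc_deliver_irq_passthru() in the diff below, which also handle the error and fallback cases by returning to the host):

        irq_map = get_irqmap(kvmppc_get_passthru_irqmap(vcpu->kvm), xisr);
        if (!irq_map)
                return 1;                       /* not ours: exit to the host as before */
        local_paca->kvm_hstate.saved_xirr = 0;  /* handled here, don't hand it to the host ICP driver */
        icp_rm_deliver_irq(xics, icp, irq_map->v_hwirq);        /* inject the virtual interrupt */
        icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr);      /* EOI in hardware */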
In some rare cases, the interrupt cannot be completely handled in real mode, for instance, a VCPU that is sleeping needs to be woken up. In this case, KVM simply switches back to the host with trap reason set to 0x500. This works, but it is clearly not very efficient. A following patch will distinguish this case and handle it correctly in the host. Note that we can use the existing check_too_hard() routine even though we are not in a hypercall to determine if there is unfinished business that needs to be completed in host virtual mode. The patch assumes that the mapping between hardware interrupt IRQ and virtual IRQ to be injected to the guest already exists for the PCI passthrough interrupts that need to be handled in real mode. If the mapping does not exist, KVM falls back to the default existing behavior. The KVM real mode code reads mappings from the mapped array in the passthrough IRQ map without taking any lock. We carefully order the loads and stores of the fields in the kvmppc_irq_map data structure using memory barriers to avoid an inconsistent mapping being seen by the reader. Thus, although it is possible to miss a map entry, it is not possible to read a stale value. [paulus@ozlabs.org - get irq_chip from irq_map rather than pimap, pulled out powernv eoi change into a separate patch, made kvmppc_read_intr get the vcpu from the paca rather than being passed in, rewrote the logic at the end of kvmppc_read_intr to avoid deep indentation, simplified logic in book3s_hv_rmhandlers.S since we were always restoring SRR0/1 anyway, get rid of the cached array (just use the mapped array), removed the kick_all_cpus_sync() call, clear saved_xirr PACA field when we handle the interrupt in real mode, fix compilation with CONFIG_KVM_XICS=n.] Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 4ca2ba3..4299a1f 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -478,6 +478,9 @@ extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval); extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, u32 cpu); extern void kvmppc_xics_ipi_action(void); +extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, u32 xirr, + struct kvmppc_irq_map *irq_map, + struct kvmppc_passthru_irqmap *pimap); extern int h_ipi_redirect; #else static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap( diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index a7fe178..a393c2b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3490,9 +3490,15 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) irq_map = &pimap->mapped[i]; irq_map->v_hwirq = guest_gsi; - irq_map->r_hwirq = desc->irq_data.hwirq; irq_map->desc = desc; + /* + * Order the above two stores before the next to serialize with + * the KVM real mode handler. 
+ */ + smp_wmb(); + irq_map->r_hwirq = desc->irq_data.hwirq; + if (i == pimap->n_mapped) pimap->n_mapped++; diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index b476a6a..7531e29 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -288,12 +288,83 @@ void kvmhv_commence_exit(int trap) struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv; EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv); +#ifdef CONFIG_KVM_XICS +static struct kvmppc_irq_map *get_irqmap(struct kvmppc_passthru_irqmap *pimap, + u32 xisr) +{ + int i; + + /* + * We access the mapped array here without a lock. That + * is safe because we never reduce the number of entries + * in the array and we never change the v_hwirq field of + * an entry once it is set. + * + * We have also carefully ordered the stores in the writer + * and the loads here in the reader, so that if we find a matching + * hwirq here, the associated GSI and irq_desc fields are valid. + */ + for (i = 0; i < pimap->n_mapped; i++) { + if (xisr == pimap->mapped[i].r_hwirq) { + /* + * Order subsequent reads in the caller to serialize + * with the writer. + */ + smp_rmb(); + return &pimap->mapped[i]; + } + } + return NULL; +} + +/* + * If we have an interrupt that's not an IPI, check if we have a + * passthrough adapter and if so, check if this external interrupt + * is for the adapter. + * We will attempt to deliver the IRQ directly to the target VCPU's + * ICP, the virtual ICP (based on affinity - the xive value in ICS). + * + * If the delivery fails or if this is not for a passthrough adapter, + * return to the host to handle this interrupt. We earlier + * saved a copy of the XIRR in the PACA, it will be picked up by + * the host ICP driver. + */ +static int kvmppc_check_passthru(u32 xisr, __be32 xirr) +{ + struct kvmppc_passthru_irqmap *pimap; + struct kvmppc_irq_map *irq_map; + struct kvm_vcpu *vcpu; + + vcpu = local_paca->kvm_hstate.kvm_vcpu; + if (!vcpu) + return 1; + pimap = kvmppc_get_passthru_irqmap(vcpu->kvm); + if (!pimap) + return 1; + irq_map = get_irqmap(pimap, xisr); + if (!irq_map) + return 1; + + /* We're handling this interrupt, generic code doesn't need to */ + local_paca->kvm_hstate.saved_xirr = 0; + + return kvmppc_deliver_irq_passthru(vcpu, xirr, irq_map, pimap); +} + +#else +static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr) +{ + return 1; +} +#endif + /* * Determine what sort of external interrupt is pending (if any). 
* Returns: * 0 if no interrupt is pending * 1 if an interrupt is pending that needs to be handled by the host * -1 if there was a guest wakeup IPI (which has now been cleared) + * -2 if there is PCI passthrough external interrupt that was handled */ long kvmppc_read_intr(void) @@ -368,5 +439,5 @@ long kvmppc_read_intr(void) return -1; } - return 1; + return kvmppc_check_passthru(xisr, xirr); } diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 980d8a6..17f5b85 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "book3s_xics.h" @@ -712,6 +713,49 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) return check_too_hard(xics, icp); } +unsigned long eoi_rc; + +static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr) +{ + unsigned long xics_phys; + int64_t rc; + + rc = pnv_opal_pci_msi_eoi(c, hwirq); + + if (rc) + eoi_rc = rc; + + iosync(); + + /* EOI it */ + xics_phys = local_paca->kvm_hstate.xics_phys; + _stwcix(xics_phys + XICS_XIRR, xirr); +} + +long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, + u32 xirr, + struct kvmppc_irq_map *irq_map, + struct kvmppc_passthru_irqmap *pimap) +{ + struct kvmppc_xics *xics; + struct kvmppc_icp *icp; + u32 irq; + + irq = irq_map->v_hwirq; + xics = vcpu->kvm->arch.xics; + icp = vcpu->arch.icp; + + icp_rm_deliver_irq(xics, icp, irq); + + /* EOI the interrupt */ + icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr); + + if (check_too_hard(xics, icp) == H_TOO_HARD) + return 1; + else + return -2; +} + /* --- Non-real mode XICS-related built-in routines --- */ /** diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index dccfa85..12fb2af 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1198,6 +1198,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) * -1 A guest wakeup IPI (which has now been cleared) * In either case, we return to guest to deliver any pending * guest interrupts. + * + * -2 A PCI passthrough external interrupt was handled + * (interrupt was delivered directly to guest) + * Return to guest to deliver any pending guest interrupts. */ cmpdi r3, 0 @@ -2352,6 +2356,8 @@ machine_check_realmode: * 0 if nothing needs to be done * 1 if something happened that needs to be handled by the host * -1 if there was a guest wakeup (IPI or msgsnd) + * -2 if we handled a PCI passthrough interrupt (returned by + * kvmppc_read_intr only) * * Also sets r12 to the interrupt vector for any interrupt that needs * to be handled now by the host (0x500 for external interrupt), or zero. -- cgit v0.10.2 From f7af5209b87c592aad81da65bd104241aa43d36a Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Fri, 19 Aug 2016 15:35:52 +1000 Subject: KVM: PPC: Book3S HV: Complete passthrough interrupt in host In existing real mode ICP code, when updating the virtual ICP state, if there is a required action that cannot be completely handled in real mode, as for instance, a VCPU needs to be woken up, flags are set in the ICP to indicate the required action. This is checked when returning from hypercalls to decide whether the call needs switch back to the host where the action can be performed in virtual mode. Note that if h_ipi_redirect is enabled, real mode code will first try to message a free host CPU to complete this job instead of returning the host to do it ourselves. 
Currently, the real mode PCI passthrough interrupt handling code checks if any of these flags are set and simply returns to the host. This is not good enough as the trap value (0x500) is treated as an external interrupt by the host code. It is only when the trap value is a hypercall that the host code searches for and acts on unfinished work by calling kvmppc_xics_rm_complete. This patch introduces a special trap BOOK3S_INTERRUPT_HV_RM_HARD which is returned by KVM if there is unfinished business to be completed in host virtual mode after handling a PCI passthrough interrupt. The host checks for this special interrupt condition and calls into the kvmppc_xics_rm_complete, which is made an exported function for this reason. [paulus@ozlabs.org - moved logic to set r12 to BOOK3S_INTERRUPT_HV_RM_HARD in book3s_hv_rmhandlers.S into the end of kvmppc_check_wake_reason.] Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index 5bca220..05cabed 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -105,6 +105,15 @@ #define BOOK3S_INTERRUPT_FAC_UNAVAIL 0xf60 #define BOOK3S_INTERRUPT_H_FAC_UNAVAIL 0xf80 +/* book3s_hv */ + +/* + * Special trap used to indicate to host that this is a + * passthrough interrupt that could not be handled + * completely in the guest. + */ +#define BOOK3S_INTERRUPT_HV_RM_HARD 0x5555 + #define BOOK3S_IRQPRIO_SYSTEM_RESET 0 #define BOOK3S_IRQPRIO_DATA_SEGMENT 1 #define BOOK3S_IRQPRIO_INST_SEGMENT 2 @@ -136,6 +145,7 @@ #define RESUME_FLAG_NV (1<<0) /* Reload guest nonvolatile state? */ #define RESUME_FLAG_HOST (1<<1) /* Resume host? */ #define RESUME_FLAG_ARCH1 (1<<2) +#define RESUME_FLAG_ARCH2 (1<<3) #define RESUME_GUEST 0 #define RESUME_GUEST_NV RESUME_FLAG_NV diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 4299a1f..e0ada31 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -469,6 +469,7 @@ static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap( extern void kvmppc_alloc_host_rm_ops(void); extern void kvmppc_free_host_rm_ops(void); extern void kvmppc_free_pimap(struct kvm *kvm); +extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall); extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server); extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args); @@ -489,6 +490,8 @@ static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap( static inline void kvmppc_alloc_host_rm_ops(void) {}; static inline void kvmppc_free_host_rm_ops(void) {}; static inline void kvmppc_free_pimap(struct kvm *kvm) {}; +static inline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) + { return 0; } static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { return 0; } static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index a393c2b..90beb96 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -74,6 +74,8 @@ /* Used to indicate that a guest page fault needs to be handled */ #define RESUME_PAGE_FAULT (RESUME_GUEST | RESUME_FLAG_ARCH1) +/* Used to indicate that a guest passthrough interrupt needs to be handled */ +#define RESUME_PASSTHROUGH (RESUME_GUEST | RESUME_FLAG_ARCH2) /* Used as a "null" value for timebase values */ 
#define TB_NIL (~(u64)0) @@ -1032,6 +1034,9 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_core_queue_program(vcpu, SRR1_PROGILL); r = RESUME_GUEST; break; + case BOOK3S_INTERRUPT_HV_RM_HARD: + r = RESUME_PASSTHROUGH; + break; default: kvmppc_dump_regs(vcpu); printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n", @@ -2951,7 +2956,8 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) r = kvmppc_book3s_hv_page_fault(run, vcpu, vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); - } + } else if (r == RESUME_PASSTHROUGH) + r = kvmppc_xics_rm_complete(vcpu, 0); } while (is_kvmppc_resume_guest(r)); out: diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 7531e29..0c84d6b 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -363,6 +363,7 @@ static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr) * Returns: * 0 if no interrupt is pending * 1 if an interrupt is pending that needs to be handled by the host + * 2 Passthrough that needs completion in the host * -1 if there was a guest wakeup IPI (which has now been cleared) * -2 if there is PCI passthrough external interrupt that was handled */ diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 17f5b85..3b8d7ac 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -751,7 +751,7 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr); if (check_too_hard(xics, icp) == H_TOO_HARD) - return 1; + return 2; else return -2; } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 12fb2af..7cc924b 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1189,6 +1189,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) * 1 An interrupt is pending that needs to be handled by the host * Exit guest and return to host by branching to guest_exit_cont * + * 2 Passthrough that needs completion in the host + * Exit guest and return to host by branching to guest_exit_cont + * However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD + * to indicate to the host to complete handling the interrupt + * * Before returning to guest, we check if any CPU is heading out * to the host and if so, we head out also. If no CPUs are heading * check return values <= 0. @@ -1204,6 +1209,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) * Return to guest to deliver any pending guest interrupts. */ + cmpdi r3, 1 + ble 1f + + /* Return code = 2 */ + li r12, BOOK3S_INTERRUPT_HV_RM_HARD + stw r12, VCPU_TRAP(r9) + b guest_exit_cont + +1: /* Return code <= 1 */ cmpdi r3, 0 bgt guest_exit_cont @@ -2419,6 +2433,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) bl kvmppc_read_intr nop li r12, BOOK3S_INTERRUPT_EXTERNAL + cmpdi r3, 1 + ble 1f + + /* + * Return code of 2 means PCI passthrough interrupt, but + * we need to return back to host to complete handling the + * interrupt. Trap reason is expected in r12 by guest + * exit code. 
+ */ + li r12, BOOK3S_INTERRUPT_HV_RM_HARD +1: ld r0, PPC_MIN_STKFRM+PPC_LR_STKOFF(r1) addi r1, r1, PPC_MIN_STKFRM mtlr r0 diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 686147e..4edbe7b 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -812,7 +812,7 @@ static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) return H_SUCCESS; } -static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) +int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) { struct kvmppc_xics *xics = vcpu->kvm->arch.xics; struct kvmppc_icp *icp = vcpu->arch.icp; @@ -841,6 +841,7 @@ static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) return H_SUCCESS; } +EXPORT_SYMBOL_GPL(kvmppc_xics_rm_complete); int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req) { -- cgit v0.10.2 From af893c7dc941f2510d4f23e76e312c77c434b2f0 Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Fri, 19 Aug 2016 15:35:53 +1000 Subject: KVM: PPC: Book3S HV: Dump irqmap in debugfs Dump the passthrough irqmap structure associated with a guest as part of /sys/kernel/debug/powerpc/kvm-xics-*. Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 4edbe7b..be5179c 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -893,6 +893,21 @@ EXPORT_SYMBOL_GPL(kvmppc_xics_hcall); /* -- Initialisation code etc. -- */ +static void xics_debugfs_irqmap(struct seq_file *m, + struct kvmppc_passthru_irqmap *pimap) +{ + int i; + + if (!pimap) + return; + seq_printf(m, "========\nPIRQ mappings: %d maps\n===========\n", + pimap->n_mapped); + for (i = 0; i < pimap->n_mapped; i++) { + seq_printf(m, "r_hwirq=%x, v_hwirq=%x\n", + pimap->mapped[i].r_hwirq, pimap->mapped[i].v_hwirq); + } +} + static int xics_debug_show(struct seq_file *m, void *private) { struct kvmppc_xics *xics = m->private; @@ -914,6 +929,8 @@ static int xics_debug_show(struct seq_file *m, void *private) t_check_resend = 0; t_reject = 0; + xics_debugfs_irqmap(m, kvm->arch.pimap); + seq_printf(m, "=========\nICP state\n=========\n"); kvm_for_each_vcpu(i, vcpu, kvm) { -- cgit v0.10.2 From 644abbb254b1ab171f777431b23e6fb5879599d0 Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Fri, 19 Aug 2016 15:35:54 +1000 Subject: KVM: PPC: Book3S HV: Tunable to disable KVM IRQ bypass Add a module parameter kvm_irq_bypass for kvm_hv.ko to disable IRQ bypass for passthrough interrupts. The default value of this tunable is 1 - that is enable the feature. Since the tunable is used by built-in kernel code, we use the module_param_cb macro to achieve this. 
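For example, assuming kvm-hv is built as a module, the optimization can be turned off at load time with "modprobe kvm_hv kvm_irq_bypass=0"; and because the parameter is registered with S_IRUGO | S_IWUSR, root can also read or toggle it at run time through /sys/module/kvm_hv/parameters/kvm_irq_bypass.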
Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index c261f52..cef2b89 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -227,6 +227,7 @@ extern void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu, struct kvm_vcpu *vcpu); extern void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu, struct kvmppc_book3s_shadow_vcpu *svcpu); +extern int kvm_irq_bypass; static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu) { diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index e0ada31..97b9bad 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -461,7 +461,7 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap( struct kvm *kvm) { - if (kvm) + if (kvm && kvm_irq_bypass) return kvm->arch.pimap; return NULL; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 90beb96..03721ed 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -95,6 +95,10 @@ static struct kernel_param_ops module_param_ops = { .get = param_get_int, }; +module_param_cb(kvm_irq_bypass, &module_param_ops, &kvm_irq_bypass, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(kvm_irq_bypass, "Bypass passthrough interrupt optimization"); + module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); @@ -3443,6 +3447,9 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) struct irq_chip *chip; int i; + if (!kvm_irq_bypass) + return 1; + desc = irq_to_desc(host_irq); if (!desc) return -EIO; @@ -3519,6 +3526,9 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) struct kvmppc_passthru_irqmap *pimap; int i; + if (!kvm_irq_bypass) + return 0; + desc = irq_to_desc(host_irq); if (!desc) return -EIO; diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 3b8d7ac..00b9dfd 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -27,6 +27,8 @@ int h_ipi_redirect = 1; EXPORT_SYMBOL(h_ipi_redirect); +int kvm_irq_bypass = 1; +EXPORT_SYMBOL(kvm_irq_bypass); static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, u32 new_irq); -- cgit v0.10.2 From 366274f59c4de018f72ab44bb41ccaf3d657eb52 Mon Sep 17 00:00:00 2001 From: Suresh Warrier Date: Fri, 19 Aug 2016 15:35:55 +1000 Subject: KVM: PPC: Book3S HV: Update irq stats for IRQs handled in real mode When a passthrough IRQ is handled completely within KVM real mode code, it has to also update the IRQ stats since this does not go through the generic IRQ handling code. However, the per CPU kstat_irqs field is an allocated (not static) field and so cannot be directly accessed in real mode safely. The function this_cpu_inc_rm() is introduced to safely increment per CPU fields (currently coded for unsigned integers only) that are allocated and could thus be vmalloced also. 
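The underlying constraint is that real-mode code runs with address translation off, so a vmalloc'ed per-CPU address (which is only reachable through the page tables) cannot simply be dereferenced; the helper below therefore detects a VMALLOC_REGION_ID address and resolves it with vmalloc_to_phys() before doing the increment.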
Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 00b9dfd..554cdfa 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -18,6 +19,7 @@ #include #include #include +#include #include #include @@ -734,6 +736,53 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr) _stwcix(xics_phys + XICS_XIRR, xirr); } +/* + * Increment a per-CPU 32-bit unsigned integer variable. + * Safe to call in real-mode. Handles vmalloc'ed addresses + * + * ToDo: Make this work for any integral type + */ + +static inline void this_cpu_inc_rm(unsigned int __percpu *addr) +{ + unsigned long l; + unsigned int *raddr; + int cpu = smp_processor_id(); + + raddr = per_cpu_ptr(addr, cpu); + l = (unsigned long)raddr; + + if (REGION_ID(l) == VMALLOC_REGION_ID) { + l = vmalloc_to_phys(raddr); + raddr = (unsigned int *)l; + } + ++*raddr; +} + +/* + * We don't try to update the flags in the irq_desc 'istate' field in + * here as would happen in the normal IRQ handling path for several reasons: + * - state flags represent internal IRQ state and are not expected to be + * updated outside the IRQ subsystem + * - more importantly, these are useful for edge triggered interrupts, + * IRQ probing, etc., but we are only handling MSI/MSIx interrupts here + * and these states shouldn't apply to us. + * + * However, we do update irq_stats - we somewhat duplicate the code in + * kstat_incr_irqs_this_cpu() for this since this function is defined + * in irq/internal.h which we don't want to include here. + * The only difference is that desc->kstat_irqs is an allocated per CPU + * variable and could have been vmalloc'ed, so we can't directly + * call __this_cpu_inc() on it. The kstat structure is a static + * per CPU variable and it should be accessible by real-mode KVM. + * + */ +static void kvmppc_rm_handle_irq_desc(struct irq_desc *desc) +{ + this_cpu_inc_rm(desc->kstat_irqs); + __this_cpu_inc(kstat.irqs_sum); +} + long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, u32 xirr, struct kvmppc_irq_map *irq_map, @@ -747,6 +796,7 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, xics = vcpu->kvm->arch.xics; icp = vcpu->arch.icp; + kvmppc_rm_handle_irq_desc(irq_map->desc); icp_rm_deliver_irq(xics, icp, irq); /* EOI the interrupt */ -- cgit v0.10.2 From 5d375199ea963fa2a972eae9c7d83db36ed37082 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 19 Aug 2016 15:35:56 +1000 Subject: KVM: PPC: Book3S HV: Set server for passed-through interrupts When a guest has a PCI pass-through device with an interrupt, it will direct the interrupt to a particular guest VCPU. In fact the physical interrupt might arrive on any CPU, and then get delivered to the target VCPU in the emulated XICS (guest interrupt controller), and eventually delivered to the target VCPU. Now that we have code to handle device interrupts in real mode without exiting to the host kernel, there is an advantage to having the device interrupt arrive on the same sub(core) as the target VCPU is running on. In this situation, the interrupt can be delivered to the target VCPU without any exit to the host kernel (using a hypervisor doorbell interrupt between threads if necessary). 
This patch aims to get passed-through device interrupts arriving on the correct core by setting the interrupt server in the real hardware XICS for the interrupt to the first thread in the (sub)core where its target VCPU is running. We do this in the real-mode H_EOI code because the H_EOI handler already needs to look at the emulated ICS state for the interrupt (whereas the H_XIRR handler doesn't), and we know we are running in the target VCPU context at that point. We set the server CPU in hardware using an OPAL call, regardless of what the IRQ affinity mask for the interrupt says, and without updating the affinity mask. This amounts to saying that when an interrupt is passed through to a guest, as a matter of policy we allow the guest's affinity for the interrupt to override the host's. This is inspired by an earlier patch from Suresh Warrier, although none of this code came from that earlier patch. Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 97b9bad..f6e4964 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -479,6 +479,10 @@ extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval); extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, u32 cpu); extern void kvmppc_xics_ipi_action(void); +extern void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long guest_irq, + unsigned long host_irq); +extern void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long guest_irq, + unsigned long host_irq); extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, u32 xirr, struct kvmppc_irq_map *irq_map, struct kvmppc_passthru_irqmap *pimap); diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index ee05bd2..e958b70 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -67,6 +67,7 @@ int64_t opal_pci_config_write_half_word(uint64_t phb_id, uint64_t bus_dev_func, int64_t opal_pci_config_write_word(uint64_t phb_id, uint64_t bus_dev_func, uint64_t offset, uint32_t data); int64_t opal_set_xive(uint32_t isn, uint16_t server, uint8_t priority); +int64_t opal_rm_set_xive(uint32_t isn, uint16_t server, uint8_t priority); int64_t opal_get_xive(uint32_t isn, __be16 *server, uint8_t *priority); int64_t opal_register_exception_handler(uint64_t opal_exception, uint64_t handler_address, diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 03721ed..9b3bba6 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3515,6 +3515,8 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) if (i == pimap->n_mapped) pimap->n_mapped++; + kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); + mutex_unlock(&kvm->lock); return 0; @@ -3551,6 +3553,8 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) return -ENODEV; } + kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq); + /* invalidate the entry */ pimap->mapped[i].r_hwirq = 0; diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 554cdfa..5f7527e 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "book3s_xics.h" @@ -34,6 +35,7 @@ EXPORT_SYMBOL(kvm_irq_bypass); static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, u32 new_irq); +static int 
xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu); /* -- ICS routines -- */ static void ics_rm_check_resend(struct kvmppc_xics *xics, @@ -713,6 +715,13 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) icp->rm_action |= XICS_RM_NOTIFY_EOI; icp->rm_eoied_irq = irq; } + + if (state->host_irq && state->intr_cpu != -1) { + int pcpu = cpu_first_thread_sibling(raw_smp_processor_id()); + if (state->intr_cpu != pcpu) + xics_opal_rm_set_server(state->host_irq, pcpu); + state->intr_cpu = -1; + } bail: return check_too_hard(xics, icp); } @@ -736,6 +745,13 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr) _stwcix(xics_phys + XICS_XIRR, xirr); } +static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu) +{ + unsigned int mangle_cpu = get_hard_smp_processor_id(server_cpu) << 2; + + return opal_rm_set_xive(hw_irq, mangle_cpu, DEFAULT_PRIORITY); +} + /* * Increment a per-CPU 32-bit unsigned integer variable. * Safe to call in real-mode. Handles vmalloc'ed addresses diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index be5179c..3bdc639 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -99,6 +99,10 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level) return 0; } + /* Record which CPU this arrived on for passed-through interrupts */ + if (state->host_irq) + state->intr_cpu = raw_smp_processor_id(); + /* Attempt delivery */ icp_deliver_irq(xics, NULL, irq); @@ -1438,3 +1442,34 @@ int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin) { return pin; } + +void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long irq, + unsigned long host_irq) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + struct kvmppc_ics *ics; + u16 idx; + + ics = kvmppc_xics_find_ics(xics, irq, &idx); + if (!ics) + return; + + ics->irq_state[idx].host_irq = host_irq; + ics->irq_state[idx].intr_cpu = -1; +} +EXPORT_SYMBOL_GPL(kvmppc_xics_set_mapped); + +void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long irq, + unsigned long host_irq) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + struct kvmppc_ics *ics; + u16 idx; + + ics = kvmppc_xics_find_ics(xics, irq, &idx); + if (!ics) + return; + + ics->irq_state[idx].host_irq = 0; +} +EXPORT_SYMBOL_GPL(kvmppc_xics_clr_mapped); diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h index a46b954..2a50320 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -42,6 +42,8 @@ struct ics_irq_state { u8 lsi; /* level-sensitive interrupt */ u8 asserted; /* Only for LSI */ u8 exists; + int intr_cpu; + u32 host_irq; }; /* Atomic ICP state, updated with a single compare & swap */ diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 3d29d40..44d2d84 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -208,6 +208,7 @@ OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE); OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD); OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD); OPAL_CALL(opal_set_xive, OPAL_SET_XIVE); +OPAL_CALL_REAL(opal_rm_set_xive, OPAL_SET_XIVE); OPAL_CALL(opal_get_xive, OPAL_GET_XIVE); OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER); OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS); -- cgit v0.10.2 From 65e7026a6c90484fbaa076d2c51e61baf7241960 Mon Sep 
17 00:00:00 2001 From: Suresh Warrier Date: Fri, 19 Aug 2016 15:35:57 +1000 Subject: KVM: PPC: Book3S HV: Counters for passthrough IRQ stats Add VCPU stat counters to track affinity for passthrough interrupts. pthru_all: Counts all passthrough interrupts whose IRQ mappings are in the kvmppc_passthru_irq_map structure. pthru_host: Counts all cached passthrough interrupts that were injected from the host through kvm_set_irq (i.e. not handled in real mode). pthru_bad_aff: Counts how many cached passthrough interrupts have bad affinity (receiving CPU is not running VCPU that is the target of the virtual interrupt in the guest). Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 89ac1f6..ed30d2e 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -137,6 +137,9 @@ struct kvm_vcpu_stat { u64 ld_slow; u64 st_slow; #endif + u64 pthru_all; + u64 pthru_host; + u64 pthru_bad_aff; }; enum kvm_exit_types { diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 71eb8f3..ba231a1 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -68,6 +68,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "ld_slow", VCPU_STAT(ld_slow) }, { "st", VCPU_STAT(st) }, { "st_slow", VCPU_STAT(st_slow) }, + { "pthru_all", VCPU_STAT(pthru_all) }, + { "pthru_host", VCPU_STAT(pthru_host) }, + { "pthru_bad_aff", VCPU_STAT(pthru_bad_aff) }, { NULL } }; diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 5f7527e..82ff5de 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -716,11 +716,19 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) icp->rm_eoied_irq = irq; } - if (state->host_irq && state->intr_cpu != -1) { - int pcpu = cpu_first_thread_sibling(raw_smp_processor_id()); - if (state->intr_cpu != pcpu) - xics_opal_rm_set_server(state->host_irq, pcpu); - state->intr_cpu = -1; + if (state->host_irq) { + ++vcpu->stat.pthru_all; + if (state->intr_cpu != -1) { + int pcpu = raw_smp_processor_id(); + + pcpu = cpu_first_thread_sibling(pcpu); + ++vcpu->stat.pthru_host; + if (state->intr_cpu != pcpu) { + ++vcpu->stat.pthru_bad_aff; + xics_opal_rm_set_server(state->host_irq, pcpu); + } + state->intr_cpu = -1; + } } bail: return check_too_hard(xics, icp); -- cgit v0.10.2 From f3c0ce86a8da6a43c3551a2223abd42040316dd8 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sun, 28 Aug 2016 16:30:07 +0200 Subject: KVM: PPC: e500: Use kmalloc_array() in kvm_vcpu_ioctl_config_tlb() * A multiplication for the size determination of a memory allocation indicated that an array data structure should be processed. Thus use the corresponding function "kmalloc_array". This issue was detected by using the Coccinelle software. * Replace the specification of a data type by a pointer dereference to make the corresponding size determination a bit safer according to the Linux coding style convention. 
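The practical difference (shown here for illustration, not taken from the patch hunks) is that kmalloc_array() checks the element-count multiplication for overflow and returns NULL, instead of quietly allocating a buffer that is smaller than intended:

        pages = kmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL);  /* overflow-checked */
        pages = kmalloc(num_pages * sizeof(*pages), GFP_KERNEL);       /* open-coded multiply, no check */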
Signed-off-by: Markus Elfring Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index 29911a0..26f3737 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -779,7 +779,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, num_pages = DIV_ROUND_UP(cfg->array + array_len - 1, PAGE_SIZE) - cfg->array / PAGE_SIZE; - pages = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL); + pages = kmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL); if (!pages) return -ENOMEM; -- cgit v0.10.2 From 46d4e7479252d3fd82a0ea121daef75164a91939 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sun, 28 Aug 2016 17:34:46 +0200 Subject: KVM: PPC: e500: Less function calls in kvm_vcpu_ioctl_config_tlb() after error detection The kfree() function was called in two cases by the kvm_vcpu_ioctl_config_tlb() function during error handling even if the passed data structure element contained a null pointer. * Split a condition check for memory allocation failures. * Adjust jump targets according to the Linux coding style convention. Signed-off-by: Markus Elfring Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index 26f3737..b65a894 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -785,35 +785,39 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, ret = get_user_pages_fast(cfg->array, num_pages, 1, pages); if (ret < 0) - goto err_pages; + goto free_pages; if (ret != num_pages) { num_pages = ret; ret = -EFAULT; - goto err_put_page; + goto put_pages; } virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL); if (!virt) { ret = -ENOMEM; - goto err_put_page; + goto put_pages; } privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0], GFP_KERNEL); + if (!privs[0]) { + ret = -ENOMEM; + goto put_pages; + } + privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1], GFP_KERNEL); - - if (!privs[0] || !privs[1]) { + if (!privs[1]) { ret = -ENOMEM; - goto err_privs; + goto free_privs_first; } g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1], GFP_KERNEL); if (!g2h_bitmap) { ret = -ENOMEM; - goto err_privs; + goto free_privs_second; } free_gtlb(vcpu_e500); @@ -845,16 +849,14 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, kvmppc_recalc_tlb1map_range(vcpu_e500); return 0; - -err_privs: - kfree(privs[0]); + free_privs_second: kfree(privs[1]); - -err_put_page: + free_privs_first: + kfree(privs[0]); + put_pages: for (i = 0; i < num_pages; i++) put_page(pages[i]); - -err_pages: + free_pages: kfree(pages); return ret; } -- cgit v0.10.2 From cfb60813fb8363a1681da2c10947e5f5b4165c47 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sun, 28 Aug 2016 17:37:10 +0200 Subject: KVM: PPC: e500: Delete an unnecessary initialisation in kvm_vcpu_ioctl_config_tlb() The local variable "g2h_bitmap" will be set to an appropriate value a bit later. Thus omit the explicit initialisation at the beginning. 
Signed-off-by: Markus Elfring Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index b65a894..e9c19e9 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -743,7 +743,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, char *virt; struct page **pages; struct tlbe_priv *privs[2] = {}; - u64 *g2h_bitmap = NULL; + u64 *g2h_bitmap; size_t array_len; u32 sets; int num_pages, ret, i; -- cgit v0.10.2 From b0ac477bc4efd6b9a3eccc106e597edf27546c11 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sun, 28 Aug 2016 18:30:38 +0200 Subject: KVM: PPC: e500: Replace kzalloc() calls by kcalloc() in two functions * A multiplication for the size determination of a memory allocation indicated that an array data structure should be processed. Thus use the corresponding function "kcalloc". Suggested-by: Paolo Bonzini This issue was detected also by using the Coccinelle software. * Replace the specification of data structures by pointer dereferences to make the corresponding size determination a bit safer according to the Linux coding style convention. Signed-off-by: Markus Elfring Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index e9c19e9..2be2afc4 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -799,22 +799,21 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, goto put_pages; } - privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0], - GFP_KERNEL); + privs[0] = kcalloc(params.tlb_sizes[0], sizeof(*privs[0]), GFP_KERNEL); if (!privs[0]) { ret = -ENOMEM; goto put_pages; } - privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1], - GFP_KERNEL); + privs[1] = kcalloc(params.tlb_sizes[1], sizeof(*privs[1]), GFP_KERNEL); if (!privs[1]) { ret = -ENOMEM; goto free_privs_first; } - g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1], - GFP_KERNEL); + g2h_bitmap = kcalloc(params.tlb_sizes[1], + sizeof(*g2h_bitmap), + GFP_KERNEL); if (!g2h_bitmap) { ret = -ENOMEM; goto free_privs_second; @@ -929,20 +928,20 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) vcpu_e500->gtlb_offset[0] = 0; vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE; - vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) * - vcpu_e500->gtlb_params[0].entries, + vcpu_e500->gtlb_priv[0] = kcalloc(vcpu_e500->gtlb_params[0].entries, + sizeof(struct tlbe_ref), GFP_KERNEL); if (!vcpu_e500->gtlb_priv[0]) goto err; - vcpu_e500->gtlb_priv[1] = kzalloc(sizeof(struct tlbe_ref) * - vcpu_e500->gtlb_params[1].entries, + vcpu_e500->gtlb_priv[1] = kcalloc(vcpu_e500->gtlb_params[1].entries, + sizeof(struct tlbe_ref), GFP_KERNEL); if (!vcpu_e500->gtlb_priv[1]) goto err; - vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(u64) * - vcpu_e500->gtlb_params[1].entries, + vcpu_e500->g2h_tlb1_map = kcalloc(vcpu_e500->gtlb_params[1].entries, + sizeof(*vcpu_e500->g2h_tlb1_map), GFP_KERNEL); if (!vcpu_e500->g2h_tlb1_map) goto err; -- cgit v0.10.2 From 90235dc19e5ccd85ad17ebbf51041770e87540a0 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sun, 28 Aug 2016 18:40:08 +0200 Subject: KVM: PPC: e500: Use kmalloc_array() in kvmppc_e500_tlb_init() * A multiplication for the size determination of a memory allocation indicated that an array data structure should be processed. Thus use the corresponding function "kmalloc_array". 
* Replace the specification of a data structure by a pointer dereference to make the corresponding size determination a bit safer according to the Linux coding style convention. Signed-off-by: Markus Elfring Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index 2be2afc4..0a2eeb1 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -905,8 +905,6 @@ static int vcpu_mmu_init(struct kvm_vcpu *vcpu, int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) { struct kvm_vcpu *vcpu = &vcpu_e500->vcpu; - int entry_size = sizeof(struct kvm_book3e_206_tlb_entry); - int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE; if (e500_mmu_host_init(vcpu_e500)) goto err; @@ -921,7 +919,10 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) vcpu_e500->gtlb_params[1].ways = KVM_E500_TLB1_SIZE; vcpu_e500->gtlb_params[1].sets = 1; - vcpu_e500->gtlb_arch = kmalloc(entries * entry_size, GFP_KERNEL); + vcpu_e500->gtlb_arch = kmalloc_array(KVM_E500_TLB0_SIZE + + KVM_E500_TLB1_SIZE, + sizeof(*vcpu_e500->gtlb_arch), + GFP_KERNEL); if (!vcpu_e500->gtlb_arch) return -ENOMEM; -- cgit v0.10.2 From aad9e5ba243336d2b133e7fa9aace1a51d6fdae6 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Mon, 12 Sep 2016 22:33:53 +0200 Subject: KVM: PPC: e500: Rename jump labels in kvmppc_e500_tlb_init() Adjust jump labels according to the current Linux coding style convention. Signed-off-by: Markus Elfring Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index 0a2eeb1..ddbf8f0 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -907,7 +907,7 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) struct kvm_vcpu *vcpu = &vcpu_e500->vcpu; if (e500_mmu_host_init(vcpu_e500)) - goto err; + goto free_vcpu; vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE; vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE; @@ -933,26 +933,25 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) sizeof(struct tlbe_ref), GFP_KERNEL); if (!vcpu_e500->gtlb_priv[0]) - goto err; + goto free_vcpu; vcpu_e500->gtlb_priv[1] = kcalloc(vcpu_e500->gtlb_params[1].entries, sizeof(struct tlbe_ref), GFP_KERNEL); if (!vcpu_e500->gtlb_priv[1]) - goto err; + goto free_vcpu; vcpu_e500->g2h_tlb1_map = kcalloc(vcpu_e500->gtlb_params[1].entries, sizeof(*vcpu_e500->g2h_tlb1_map), GFP_KERNEL); if (!vcpu_e500->g2h_tlb1_map) - goto err; + goto free_vcpu; vcpu_mmu_init(vcpu, vcpu_e500->gtlb_params); kvmppc_recalc_tlb1map_range(vcpu_e500); return 0; - -err: + free_vcpu: free_gtlb(vcpu_e500); return -1; } -- cgit v0.10.2 From a545ab6a0085e6df9c7b6e9734b40ba4d2aca8c9 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Wed, 7 Sep 2016 14:47:19 -0400 Subject: kvm: x86: add tsc_offset field to struct kvm_vcpu_arch A future commit will want to easily read a vCPU's TSC offset, so we store it in struct kvm_arch_vcpu_arch for easy access. 
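For context, on both VMX and SVM the guest's TSC reads are (ignoring TSC scaling) offset from the host TSC by exactly this value, so caching it in the vCPU structure lets common x86 code relate host and guest TSC values without a vendor callback. A minimal sketch of that relationship, illustrative only and not part of the patch:

/* What RDTSC returns inside the guest, ignoring TSC scaling. */
static u64 demo_guest_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
	return host_tsc + vcpu->arch.tsc_offset;
}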
Signed-off-by: Luiz Capitulino Signed-off-by: Paolo Bonzini diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index cd82bf7..ddffcfb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -568,6 +568,7 @@ struct kvm_vcpu_arch { struct kvm_steal_time steal; } st; + u64 tsc_offset; u64 last_guest_tsc; u64 last_host_tsc; u64 tsc_offset_adjustment; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 57ffe78..9a2cc69 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1413,6 +1413,12 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) } EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); +static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +{ + kvm_x86_ops->write_tsc_offset(vcpu, offset); + vcpu->arch.tsc_offset = offset; +} + void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct kvm *kvm = vcpu->kvm; @@ -1522,7 +1528,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated) update_ia32_tsc_adjust_msr(vcpu, offset); - kvm_x86_ops->write_tsc_offset(vcpu, offset); + kvm_vcpu_write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); spin_lock(&kvm->arch.pvclock_gtod_sync_lock); @@ -2750,7 +2756,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (check_tsc_unstable()) { u64 offset = kvm_compute_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); - kvm_x86_ops->write_tsc_offset(vcpu, offset); + kvm_vcpu_write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } /* -- cgit v0.10.2 From 3e3f50262eb441d0fd1de4dce06739e9c0fe7c61 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Wed, 7 Sep 2016 14:47:20 -0400 Subject: kvm: x86: drop read_tsc_offset() The TSC offset can now be read directly from struct kvm_arch_vcpu. 
Signed-off-by: Luiz Capitulino Signed-off-by: Paolo Bonzini diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ddffcfb..32a43a2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -954,7 +954,6 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); - u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index db77c1c..8023d53 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1119,13 +1119,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } -static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - return svm->vmcb->control.tsc_offset; -} - static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); @@ -5427,7 +5420,6 @@ static struct kvm_x86_ops svm_x86_ops = { .has_wbinvd_exit = svm_has_wbinvd_exit, - .read_tsc_offset = svm_read_tsc_offset, .write_tsc_offset = svm_write_tsc_offset, .adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest, .read_l1_tsc = svm_read_l1_tsc, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f9939d0..fe2b144 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2609,11 +2609,6 @@ static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) return host_tsc + tsc_offset; } -static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu) -{ - return vmcs_read64(TSC_OFFSET); -} - /* * writes 'offset' into guest's timestamp counter offset register */ @@ -11286,7 +11281,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - .read_tsc_offset = vmx_read_tsc_offset, .write_tsc_offset = vmx_write_tsc_offset, .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest, .read_l1_tsc = vmx_read_l1_tsc, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9a2cc69..b4ae925 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1367,7 +1367,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) { - u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu); + u64 curr_offset = vcpu->arch.tsc_offset; vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; } -- cgit v0.10.2 From 9d5a1dcebfbe0e421412fcf5ab86d30e2e65bab4 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Wed, 7 Sep 2016 14:47:21 -0400 Subject: kvm: kvm_destroy_vm_debugfs(): check debugfs_stat_data pointer This makes it possible to call kvm_destroy_vm_debugfs() from kvm_create_vm_debugfs() in error conditions.
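The pattern being applied is making the teardown routine safe to run on a partially constructed object, so the create path can simply call it on any failure. A minimal sketch of the idea outside of KVM (the demo_* names are made up; the context structure is assumed to start zero-initialised, and debugfs_remove_recursive() itself tolerates a NULL dentry):

#include <linux/debugfs.h>
#include <linux/slab.h>

struct demo_ctx {
	struct dentry *dir;	/* NULL until created */
	void **stats;		/* NULL until allocated */
	int nr_stats;
};

static void demo_destroy(struct demo_ctx *ctx)
{
	int i;

	debugfs_remove_recursive(ctx->dir);	/* NULL-safe */
	if (ctx->stats) {			/* tolerate partial init */
		for (i = 0; i < ctx->nr_stats; i++)
			kfree(ctx->stats[i]);
		kfree(ctx->stats);
	}
}

static int demo_create(struct demo_ctx *ctx)
{
	ctx->stats = kcalloc(ctx->nr_stats, sizeof(*ctx->stats), GFP_KERNEL);
	if (!ctx->stats) {
		demo_destroy(ctx);	/* safe even though nothing was set up */
		return -ENOMEM;
	}
	return 0;
}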
Reviewed-by: Paolo Bonzini Signed-off-by: Luiz Capitulino Signed-off-by: Paolo Bonzini diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a00f8e4..a6e864c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -559,9 +559,11 @@ static void kvm_destroy_vm_debugfs(struct kvm *kvm) debugfs_remove_recursive(kvm->debugfs_dentry); - for (i = 0; i < kvm_debugfs_num_entries; i++) - kfree(kvm->debugfs_stat_data[i]); - kfree(kvm->debugfs_stat_data); + if (kvm->debugfs_stat_data) { + for (i = 0; i < kvm_debugfs_num_entries; i++) + kfree(kvm->debugfs_stat_data[i]); + kfree(kvm->debugfs_stat_data); + } } static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) -- cgit v0.10.2 From 235539b48a2357da28f52d66d04bec04f3dcb9dd Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Wed, 7 Sep 2016 14:47:23 -0400 Subject: kvm: add stubs for arch specific debugfs support Two stubs are added: o kvm_arch_has_vcpu_debugfs(): must return true if the arch supports creating debugfs entries in the vcpu debugfs dir (which will be implemented by the next commit) o kvm_arch_create_vcpu_debugfs(): code that creates debugfs entries in the vcpu debugfs dir For x86, this commit introduces a new file to avoid growing arch/x86/kvm/x86.c even more. Signed-off-by: Luiz Capitulino Signed-off-by: Paolo Bonzini diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 75f130e..c638935 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -144,6 +144,16 @@ out_fail_alloc: return ret; } +bool kvm_arch_has_vcpu_debugfs(void) +{ + return false; +} + +int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +{ + return 0; +} + int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) { return VM_FAULT_SIGBUS; diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index a6ea084..49b25e7 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -140,6 +140,16 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return 0; } +bool kvm_arch_has_vcpu_debugfs(void) +{ + return false; +} + +int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +{ + return 0; +} + void kvm_mips_free_vcpus(struct kvm *kvm) { unsigned int i; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 0b7d664..70963c8 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -438,6 +438,16 @@ err_out: return -EINVAL; } +bool kvm_arch_has_vcpu_debugfs(void) +{ + return false; +} + +int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +{ + return 0; +} + void kvm_arch_destroy_vm(struct kvm *kvm) { unsigned int i; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ac6c056..0a9ce9d 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1489,6 +1489,16 @@ out_err: return rc; } +bool kvm_arch_has_vcpu_debugfs(void) +{ + return false; +} + +int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +{ + return 0; +} + void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { VCPU_EVENT(vcpu, 3, "%s", "free cpu"); diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 464fa47..3bff207 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -13,7 +13,7 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o page_track.o + hyperv.o page_track.o debugfs.o kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c new file mode 100644 index 0000000..bb5e9f6 --- /dev/null 
+++ b/arch/x86/kvm/debugfs.c @@ -0,0 +1,20 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * Copyright 2016 Red Hat, Inc. and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ +#include + +bool kvm_arch_has_vcpu_debugfs(void) +{ + return false; +} + +int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +{ + return 0; +} diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9c28b4d..5486ff9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -749,6 +749,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); +bool kvm_arch_has_vcpu_debugfs(void); +int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu); + int kvm_arch_hardware_enable(void); void kvm_arch_hardware_disable(void); int kvm_arch_hardware_setup(void); -- cgit v0.10.2 From 45b5939e50746b92fd4cb47c02524f79ba8fabe6 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Fri, 16 Sep 2016 10:27:35 -0400 Subject: kvm: create per-vcpu dirs in debugfs This commit adds the ability for archs to export per-vcpu information via a new per-vcpu dir in the VM's debugfs directory. If kvm_arch_has_vcpu_debugfs() returns true, then KVM will create a vcpu dir for each vCPU in the VM's debugfs directory. Then kvm_arch_create_vcpu_debugfs() is responsible for populating each vcpu directory with arch specific entries. The per-vcpu path in debugfs will look like: /sys/kernel/debug/kvm/29162-10/vcpu0 /sys/kernel/debug/kvm/29162-10/vcpu1 This is all arch specific for now because the only user of this interface (x86) wants to export x86-specific per-vcpu information to user-space. Signed-off-by: Luiz Capitulino Signed-off-by: Paolo Bonzini diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5486ff9..01c0b9c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -265,6 +265,7 @@ struct kvm_vcpu { #endif bool preempted; struct kvm_vcpu_arch arch; + struct dentry *debugfs_dentry; }; static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a6e864c..81dfc73 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2371,6 +2371,7 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp) { struct kvm_vcpu *vcpu = filp->private_data; + debugfs_remove_recursive(vcpu->debugfs_dentry); kvm_put_kvm(vcpu->kvm); return 0; } @@ -2393,6 +2394,32 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu) return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); } +static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +{ + char dir_name[ITOA_MAX_LEN * 2]; + int ret; + + if (!kvm_arch_has_vcpu_debugfs()) + return 0; + + if (!debugfs_initialized()) + return 0; + + snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); + vcpu->debugfs_dentry = debugfs_create_dir(dir_name, + vcpu->kvm->debugfs_dentry); + if (!vcpu->debugfs_dentry) + return -ENOMEM; + + ret = kvm_arch_create_vcpu_debugfs(vcpu); + if (ret < 0) { + debugfs_remove_recursive(vcpu->debugfs_dentry); + return ret; + } + + return 0; +} + /* * Creates some virtual cpus. Good luck creating more than one. 
*/ @@ -2425,6 +2452,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) if (r) goto vcpu_destroy; + r = kvm_create_vcpu_debugfs(vcpu); + if (r) + goto vcpu_destroy; + mutex_lock(&kvm->lock); if (kvm_get_vcpu_by_id(kvm, id)) { r = -EEXIST; @@ -2456,6 +2487,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) unlock_vcpu_destroy: mutex_unlock(&kvm->lock); + debugfs_remove_recursive(vcpu->debugfs_dentry); vcpu_destroy: kvm_arch_vcpu_destroy(vcpu); vcpu_decrement: -- cgit v0.10.2 From 4f5758fc67607b915c7513baf0438a35a5fc457c Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Fri, 16 Sep 2016 10:27:36 -0400 Subject: kvm: x86: export TSC information to user-space This commit exports the following information to user-space via the newly created per-vcpu debugfs directory: - TSC offset (as a signed number) - TSC scaling ratio - TSC scaling ratio fractional bits The original intention of this commit was to export only the TSC offset, but the TSC scaling information is exported for completeness. We need to retrieve the TSC offset from user-space in order to support the merging of host and guest traces in trace-cmd. Today, we use the kvm_write_tsc_offset tracepoint, but it has a number of problems (mainly, it requires a running VM to be rebooted and ftrace to be set up, and tracepoints are not supposed to be ABIs). The merging of host and guest traces is explained in more detail in this thread: [Qemu-devel] [RFC] host and guest kernel trace merging https://lists.nongnu.org/archive/html/qemu-devel/2016-03/msg00887.html This commit creates the following files in debugfs: /sys/kernel/debug/kvm/66828-10/vcpu0/tsc-offset /sys/kernel/debug/kvm/66828-10/vcpu0/tsc-scaling-ratio /sys/kernel/debug/kvm/66828-10/vcpu0/tsc-scaling-ratio-frac-bits The last two are only created if TSC scaling is supported.
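Consuming these files from user space is an ordinary file read. For example, a tool that wants the offset for trace merging could do something like the following (hypothetical userspace snippet; the 66828-10 component is the example path from the commit message and differs per VM and per run):

#include <stdio.h>

int main(void)
{
	/* Path format: /sys/kernel/debug/kvm/<pid>-<fd>/vcpu<N>/tsc-offset */
	const char *path = "/sys/kernel/debug/kvm/66828-10/vcpu0/tsc-offset";
	FILE *f = fopen(path, "r");
	long long offset;

	if (!f || fscanf(f, "%lld", &offset) != 1) {
		perror(path);
		return 1;
	}
	printf("tsc-offset = %lld\n", offset);
	fclose(f);
	return 0;
}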
Signed-off-by: Luiz Capitulino Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index bb5e9f6..c19c7ed 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -8,13 +8,62 @@ * */ #include +#include bool kvm_arch_has_vcpu_debugfs(void) { - return false; + return true; } +static int vcpu_get_tsc_offset(void *data, u64 *val) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; + *val = vcpu->arch.tsc_offset; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_offset_fops, vcpu_get_tsc_offset, NULL, "%lld\n"); + +static int vcpu_get_tsc_scaling_ratio(void *data, u64 *val) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; + *val = vcpu->arch.tsc_scaling_ratio; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, "%llu\n"); + +static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val) +{ + *val = kvm_tsc_scaling_ratio_frac_bits; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n"); + int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) { + struct dentry *ret; + + ret = debugfs_create_file("tsc-offset", 0444, + vcpu->debugfs_dentry, + vcpu, &vcpu_tsc_offset_fops); + if (!ret) + return -ENOMEM; + + if (kvm_has_tsc_control) { + ret = debugfs_create_file("tsc-scaling-ratio", 0444, + vcpu->debugfs_dentry, + vcpu, &vcpu_tsc_scaling_fops); + if (!ret) + return -ENOMEM; + ret = debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444, + vcpu->debugfs_dentry, + vcpu, &vcpu_tsc_scaling_frac_fops); + if (!ret) + return -ENOMEM; + + } + return 0; } -- cgit v0.10.2 From 0d6dd2ff8206dc1da3428d5b1611f6304d481dab Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 1 Sep 2016 14:20:09 +0200 Subject: KVM: x86: always fill in vcpu->arch.hv_clock We will use it in the next patches for KVM_GET_CLOCK and as a basis for the contents of the Hyper-V TSC page. Get the values from the Linux timekeeper even if kvmclock is not enabled. Reviewed-by: Roman Kagan Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b4ae925..d1e8307 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1722,6 +1722,60 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) #endif } +static void kvm_setup_pvclock_page(struct kvm_vcpu *v) +{ + struct kvm_vcpu_arch *vcpu = &v->arch; + struct pvclock_vcpu_time_info guest_hv_clock; + + if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, + &guest_hv_clock, sizeof(guest_hv_clock)))) + return; + + /* This VCPU is paused, but it's legal for a guest to read another + * VCPU's kvmclock, so we really have to follow the specification where + * it says that version is odd if data is being modified, and even after + * it is consistent. + * + * Version field updates must be kept separate. This is because + * kvm_write_guest_cached might use a "rep movs" instruction, and + * writes within a string instruction are weakly ordered. So there + * are three writes overall. + * + * As a small optimization, only write the version field in the first + * and third write. The vcpu->pv_time cache is still valid, because the + * version field is the first in the struct. 
+ */ + BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); + + vcpu->hv_clock.version = guest_hv_clock.version + 1; + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); + + smp_wmb(); + + /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ + vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); + + if (vcpu->pvclock_set_guest_stopped_request) { + vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; + vcpu->pvclock_set_guest_stopped_request = false; + } + + trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); + + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); + + smp_wmb(); + + vcpu->hv_clock.version++; + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); +} + static int kvm_guest_time_update(struct kvm_vcpu *v) { unsigned long flags, tgt_tsc_khz; @@ -1729,7 +1783,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; u64 tsc_timestamp, host_tsc; - struct pvclock_vcpu_time_info guest_hv_clock; u8 pvclock_flags; bool use_master_clock; @@ -1783,8 +1836,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) local_irq_restore(flags); - if (!vcpu->pv_time_enabled) - return 0; + /* With all the info we got, fill in the values */ if (kvm_has_tsc_control) tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz); @@ -1796,64 +1848,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hw_tsc_khz = tgt_tsc_khz; } - /* With all the info we got, fill in the values */ vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_guest_tsc = tsc_timestamp; - if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, - &guest_hv_clock, sizeof(guest_hv_clock)))) - return 0; - - /* This VCPU is paused, but it's legal for a guest to read another - * VCPU's kvmclock, so we really have to follow the specification where - * it says that version is odd if data is being modified, and even after - * it is consistent. - * - * Version field updates must be kept separate. This is because - * kvm_write_guest_cached might use a "rep movs" instruction, and - * writes within a string instruction are weakly ordered. So there - * are three writes overall. - * - * As a small optimization, only write the version field in the first - * and third write. The vcpu->pv_time cache is still valid, because the - * version field is the first in the struct. 
- */ - BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); - - vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); - - smp_wmb(); - - /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ - pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); - - if (vcpu->pvclock_set_guest_stopped_request) { - pvclock_flags |= PVCLOCK_GUEST_STOPPED; - vcpu->pvclock_set_guest_stopped_request = false; - } - /* If the host uses TSC clocksource, then it is stable */ + pvclock_flags = 0; if (use_master_clock) pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; vcpu->hv_clock.flags = pvclock_flags; - trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); - - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); - - smp_wmb(); + if (!vcpu->pv_time_enabled) + return 0; - vcpu->hv_clock.version++; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_setup_pvclock_page(v); return 0; } -- cgit v0.10.2 From 67198ac3f37ffb150f1c95fae16b597339eabc9d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 1 Sep 2016 15:44:57 +0200 Subject: KVM: x86: initialize kvmclock_offset Make the guest's kvmclock count up from zero, not from the host boot time. The guest cannot rely on that anyway because it changes on migration, the numbers are easier on the eye and finally it matches the desired semantics of the Hyper-V time reference counter. Reviewed-by: Roman Kagan Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d1e8307..00e569c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7779,6 +7779,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) mutex_init(&kvm->arch.apic_map_lock); spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); + kvm->arch.kvmclock_offset = -get_kernel_ns(); pvclock_update_vm_gtod_copy(kvm); INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); -- cgit v0.10.2 From 108b249c453dd7132599ab6dc7e435a7036c193f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 1 Sep 2016 14:21:03 +0200 Subject: KVM: x86: introduce get_kvmclock_ns Introduce a function that reads the exact nanoseconds value that is provided to the guest in kvmclock. This crystallizes the notion of kvmclock as a thin veneer over a stable TSC, that the guest will (hopefully) convert with NTP. In other words, kvmclock is *not* a paravirtualized host-to-guest NTP. Drop the get_kernel_ns() function, that was used both to get the base value of the master clock and to get the current value of kvmclock. The former use is replaced by ktime_get_boot_ns(), the latter is the purpose of get_kernel_ns(). This also allows KVM to provide a Hyper-V time reference counter that is synchronized with the time that is computed from the TSC page. 
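The "thin veneer" is concretely the pvclock conversion: a kvmclock timestamp is a fixed-point function of the TSC and the per-vCPU parameters KVM publishes. A standalone sketch of that arithmetic, mirroring what __pvclock_read_cycles() computes but without the seqcount retry loop (simplified for illustration):

#include <linux/math64.h>
#include <asm/pvclock.h>

static u64 demo_pvclock_ns(const struct pvclock_vcpu_time_info *src, u64 tsc)
{
	u64 delta = tsc - src->tsc_timestamp;

	/* scale the TSC delta by 2^tsc_shift ... */
	if (src->tsc_shift < 0)
		delta >>= -src->tsc_shift;
	else
		delta <<= src->tsc_shift;

	/* ... multiply by the 0.32 fixed-point factor and add the base */
	return src->system_time +
	       mul_u64_u32_shr(delta, src->tsc_to_system_mul, 32);
}

get_kvmclock_ns() introduced below applies exactly this conversion to the host TSC (via kvm_read_l1_tsc()) when the master clock is stable, and falls back to ktime_get_boot_ns() plus kvmclock_offset otherwise.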
Reviewed-by: Roman Kagan Signed-off-by: Paolo Bonzini diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 94d54d0..02223cb 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -129,7 +129,7 @@ static notrace cycle_t vread_pvclock(int *mode) return 0; } - ret = __pvclock_read_cycles(pvti); + ret = __pvclock_read_cycles(pvti, rdtsc_ordered()); } while (pvclock_read_retry(pvti, version)); /* refer to vread_tsc() comment for rationale */ diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index d019f0c..3ad741b 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -87,9 +87,10 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) } static __always_inline -cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src) +cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, + u64 tsc) { - u64 delta = rdtsc_ordered() - src->tsc_timestamp; + u64 delta = tsc - src->tsc_timestamp; cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul, src->tsc_shift); return src->system_time + offset; diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 3599404..5b2cc88 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -80,7 +80,7 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) do { version = pvclock_read_begin(src); - ret = __pvclock_read_cycles(src); + ret = __pvclock_read_cycles(src, rdtsc_ordered()); flags = src->flags; } while (pvclock_read_retry(src, version)); diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 01bd7b7..ed5b77f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -386,7 +386,7 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic) static u64 get_time_ref_counter(struct kvm *kvm) { - return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); + return div_u64(get_kvmclock_ns(kvm), 100); } static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 00e569c..81e9945 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1431,7 +1431,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_compute_tsc_offset(vcpu, data); - ns = get_kernel_ns(); + ns = ktime_get_boot_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { @@ -1722,6 +1722,34 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) #endif } +static u64 __get_kvmclock_ns(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0); + struct kvm_arch *ka = &kvm->arch; + s64 ns; + + if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) { + u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc()); + ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc); + } else { + ns = ktime_get_boot_ns() + ka->kvmclock_offset; + } + + return ns; +} + +u64 get_kvmclock_ns(struct kvm *kvm) +{ + unsigned long flags; + s64 ns; + + local_irq_save(flags); + ns = __get_kvmclock_ns(kvm); + local_irq_restore(flags); + + return ns; +} + static void kvm_setup_pvclock_page(struct kvm_vcpu *v) { struct kvm_vcpu_arch *vcpu = &v->arch; @@ -1811,7 +1839,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) } if (!use_master_clock) { host_tsc = rdtsc(); - kernel_ns = get_kernel_ns(); + kernel_ns = ktime_get_boot_ns(); } tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); @@ -4054,7 +4082,6 @@ long 
kvm_arch_vm_ioctl(struct file *filp, case KVM_SET_CLOCK: { struct kvm_clock_data user_ns; u64 now_ns; - s64 delta; r = -EFAULT; if (copy_from_user(&user_ns, argp, sizeof(user_ns))) @@ -4066,10 +4093,9 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; local_irq_disable(); - now_ns = get_kernel_ns(); - delta = user_ns.clock - now_ns; + now_ns = __get_kvmclock_ns(kvm); + kvm->arch.kvmclock_offset += user_ns.clock - now_ns; local_irq_enable(); - kvm->arch.kvmclock_offset = delta; kvm_gen_update_masterclock(kvm); break; } @@ -4077,10 +4103,8 @@ long kvm_arch_vm_ioctl(struct file *filp, struct kvm_clock_data user_ns; u64 now_ns; - local_irq_disable(); - now_ns = get_kernel_ns(); - user_ns.clock = kvm->arch.kvmclock_offset + now_ns; - local_irq_enable(); + now_ns = get_kvmclock_ns(kvm); + user_ns.clock = now_ns; user_ns.flags = 0; memset(&user_ns.pad, 0, sizeof(user_ns.pad)); @@ -7544,7 +7568,7 @@ int kvm_arch_hardware_enable(void) * before any KVM threads can be running. Unfortunately, we can't * bring the TSCs fully up to date with real time, as we aren't yet far * enough into CPU bringup that we know how much real time has actually - * elapsed; our helper function, get_kernel_ns() will be using boot + * elapsed; our helper function, ktime_get_boot_ns() will be using boot * variables that haven't been updated yet. * * So we simply find the maximum observed TSC above, then record the @@ -7779,7 +7803,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) mutex_init(&kvm->arch.apic_map_lock); spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); - kvm->arch.kvmclock_offset = -get_kernel_ns(); + kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); pvclock_update_vm_gtod_copy(kvm); INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index a82ca46..e8ff3e4 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -148,11 +148,6 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu, return kvm_register_write(vcpu, reg, val); } -static inline u64 get_kernel_ns(void) -{ - return ktime_get_boot_ns(); -} - static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) { return !(kvm->arch.disabled_quirks & quirk); @@ -164,6 +159,7 @@ void kvm_set_pending_timer(struct kvm_vcpu *vcpu); int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); +u64 get_kvmclock_ns(struct kvm *kvm); int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, -- cgit v0.10.2 From 095cf55df715d14d5dad75326faf5984e7fc0b3a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 8 Feb 2016 12:54:12 +0100 Subject: KVM: x86: Hyper-V tsc page setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lately tsc page was implemented but filled with empty values. This patch setup tsc page scale and offset based on vcpu tsc, tsc_khz and HV_X64_MSR_TIME_REF_COUNT value. The valid tsc page drops HV_X64_MSR_TIME_REF_COUNT msr reads count to zero which potentially improves performance. Signed-off-by: Andrey Smetanin Reviewed-by: Peter Hornyack Reviewed-by: Radim Krčmář CC: Paolo Bonzini CC: Roman Kagan CC: Denis V. Lunev [Computation of TSC page parameters rewritten to use the Linux timekeeper parameters. 
- Paolo] Signed-off-by: Paolo Bonzini diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 32a43a2..4b20f73 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -702,6 +702,8 @@ struct kvm_hv { /* Hyper-v based guest crash (NT kernel bugcheck) parameters */ u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS]; u64 hv_crash_ctl; + + HV_REFERENCE_TSC_PAGE tsc_ref; }; struct kvm_arch { diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index ed5b77f..42b1c83 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -386,7 +386,21 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic) static u64 get_time_ref_counter(struct kvm *kvm) { - return div_u64(get_kvmclock_ns(kvm), 100); + struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_vcpu *vcpu; + u64 tsc; + + /* + * The guest has not set up the TSC page or the clock isn't + * stable, fall back to get_kvmclock_ns. + */ + if (!hv->tsc_ref.tsc_sequence) + return div_u64(get_kvmclock_ns(kvm), 100); + + vcpu = kvm_get_vcpu(kvm, 0); + tsc = kvm_read_l1_tsc(vcpu, rdtsc()); + return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64) + + hv->tsc_ref.tsc_offset; } static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, @@ -756,6 +770,129 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu, return 0; } +/* + * The kvmclock and Hyper-V TSC page use similar formulas, and converting + * between them is possible: + * + * kvmclock formula: + * nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32) + * + system_time + * + * Hyper-V formula: + * nsec/100 = ticks * scale / 2^64 + offset + * + * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula. + * By dividing the kvmclock formula by 100 and equating what's left we get: + * ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100 + * scale / 2^64 = tsc_to_system_mul * 2^(tsc_shift-32) / 100 + * scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100 + * + * Now expand the kvmclock formula and divide by 100: + * nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32) + * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) + * + system_time + * nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100 + * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100 + * + system_time / 100 + * + * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64: + * nsec/100 = ticks * scale / 2^64 + * - tsc_timestamp * scale / 2^64 + * + system_time / 100 + * + * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out: + * offset = system_time / 100 - tsc_timestamp * scale / 2^64 + * + * These two equivalencies are implemented in this function. + */ +static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock, + HV_REFERENCE_TSC_PAGE *tsc_ref) +{ + u64 max_mul; + + if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT)) + return false; + + /* + * check if scale would overflow, if so we use the time ref counter + * tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64 + * tsc_to_system_mul / 100 >= 2^(32-tsc_shift) + * tsc_to_system_mul >= 100 * 2^(32-tsc_shift) + */ + max_mul = 100ull << (32 - hv_clock->tsc_shift); + if (hv_clock->tsc_to_system_mul >= max_mul) + return false; + + /* + * Otherwise compute the scale and offset according to the formulas + * derived above. 
+ */ + tsc_ref->tsc_scale = + mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift), + hv_clock->tsc_to_system_mul, + 100); + + tsc_ref->tsc_offset = hv_clock->system_time; + do_div(tsc_ref->tsc_offset, 100); + tsc_ref->tsc_offset -= + mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64); + return true; +} + +void kvm_hv_setup_tsc_page(struct kvm *kvm, + struct pvclock_vcpu_time_info *hv_clock) +{ + struct kvm_hv *hv = &kvm->arch.hyperv; + u32 tsc_seq; + u64 gfn; + + BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); + BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0); + + if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + return; + + gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; + /* + * Because the TSC parameters only vary when there is a + * change in the master clock, do not bother with caching. + */ + if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn), + &tsc_seq, sizeof(tsc_seq)))) + return; + + /* + * While we're computing and writing the parameters, force the + * guest to use the time reference count MSR. + */ + hv->tsc_ref.tsc_sequence = 0; + if (kvm_write_guest(kvm, gfn_to_gpa(gfn), + &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) + return; + + if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref)) + return; + + /* Ensure sequence is zero before writing the rest of the struct. */ + smp_wmb(); + if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) + return; + + /* + * Now switch to the TSC page mechanism by writing the sequence. + */ + tsc_seq++; + if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0) + tsc_seq = 1; + + /* Write the struct entirely before the non-zero sequence. */ + smp_wmb(); + + hv->tsc_ref.tsc_sequence = tsc_seq; + kvm_write_guest(kvm, gfn_to_gpa(gfn), + &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)); +} + static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { @@ -793,23 +930,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, mark_page_dirty(kvm, gfn); break; } - case HV_X64_MSR_REFERENCE_TSC: { - u64 gfn; - HV_REFERENCE_TSC_PAGE tsc_ref; - - memset(&tsc_ref, 0, sizeof(tsc_ref)); + case HV_X64_MSR_REFERENCE_TSC: hv->hv_tsc_page = data; - if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE)) - break; - gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; - if (kvm_write_guest( - kvm, - gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT, - &tsc_ref, sizeof(tsc_ref))) - return 1; - mark_page_dirty(kvm, gfn); + if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); break; - } case HV_X64_MSR_CRASH_P0 ... 
HV_X64_MSR_CRASH_P4: return kvm_hv_msr_set_crash_data(vcpu, msr - HV_X64_MSR_CRASH_P0, diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 60eccd4..cd11195 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -84,4 +84,7 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu) void kvm_hv_process_stimers(struct kvm_vcpu *vcpu); +void kvm_hv_setup_tsc_page(struct kvm *kvm, + struct pvclock_vcpu_time_info *hv_clock); + #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 81e9945..3ee8a91 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1887,10 +1887,10 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.flags = pvclock_flags; - if (!vcpu->pv_time_enabled) - return 0; - - kvm_setup_pvclock_page(v); + if (vcpu->pv_time_enabled) + kvm_setup_pvclock_page(v); + if (v == kvm_get_vcpu(v->kvm, 0)) + kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); return 0; } -- cgit v0.10.2 From adad0d02a7d3c958121a4eb9d126015a2353db94 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 19 Sep 2016 07:11:59 +0100 Subject: kvm: svm: fix unsigned compare less than zero comparison vm_data->avic_vm_id is a u32, so the check for an error return (less than zero) such as -EAGAIN from avic_get_next_vm_id currently has no effect whatsoever. Fix this by using a temporary int for the comparison and then assigning it to vm_data->avic_vm_id. I used an explicit u32 cast in the assignment to show why vm_data->avic_vm_id cannot be used in the assign/compare steps. Signed-off-by: Colin Ian King Acked-by: Joerg Roedel Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8023d53..9b42214 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1412,7 +1412,7 @@ static void avic_vm_destroy(struct kvm *kvm) static int avic_vm_init(struct kvm *kvm) { unsigned long flags; - int err = -ENOMEM; + int vm_id, err = -ENOMEM; struct kvm_arch *vm_data = &kvm->arch; struct page *p_page; struct page *l_page; @@ -1420,9 +1420,10 @@ static int avic_vm_init(struct kvm *kvm) if (!avic) return 0; - vm_data->avic_vm_id = avic_get_next_vm_id(); - if (vm_data->avic_vm_id < 0) - return vm_data->avic_vm_id; + vm_id = avic_get_next_vm_id(); + if (vm_id < 0) + return vm_id; + vm_data->avic_vm_id = (u32)vm_id; /* Allocating physical APIC ID table (4KB) */ p_page = alloc_page(GFP_KERNEL); -- cgit v0.10.2 From 5a7a8426b2ac004b064e4106911769e0a55e7c4b Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Mon, 12 Sep 2016 15:49:15 +0100 Subject: arm64: KVM: Use static keys for selecting the GIC backend Currently the GIC backend is selected via the alternative framework and this is fine. We are going to introduce vgic-v3 to the 32-bit world and there we don't have a patching framework in hand, so we can either check support for GICv3 every time we need to choose which backend to use or try to optimise it by using static keys. The latter looks quite promising because we can share the logic involved in selecting the GIC backend between architectures if both use static keys. This patch moves arm64 from the alternative framework to static keys for selecting the GIC backend. For that we embed a static key into vgic_global and enable the key during vgic initialisation based on what has already been exposed by the host GIC driver.
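The mechanism is the generic static-branch facility: a static_key_false embedded in an ordinary structure, flipped once at initialisation, and tested on the hot path with static_branch_unlikely(), so on architectures with jump-label support the disabled case costs only a patched-out branch. A condensed sketch of the pattern (illustrative only, not the actual vgic code):

#include <linux/jump_label.h>

struct demo_gic_state {
	struct static_key_false v3_cpuif;
};

static struct demo_gic_state demo_state = {
	.v3_cpuif = STATIC_KEY_FALSE_INIT,
};

static void demo_v2_save(void) { /* ... */ }
static void demo_v3_save(void) { /* ... */ }

/* Slow path, called once when the host driver reports a GICv3 CPU interface. */
static void demo_init(bool have_v3_cpuif)
{
	if (have_v3_cpuif)
		static_branch_enable(&demo_state.v3_cpuif);
}

/* Hot path: a single patched branch selects the backend. */
static void demo_save_state(void)
{
	if (static_branch_unlikely(&demo_state.v3_cpuif))
		demo_v3_save();
	else
		demo_v2_save();
}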
Acked-by: Marc Zyngier Signed-off-by: Vladimir Murzin Signed-off-by: Christoffer Dall diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 731519c..83037cd 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -16,6 +16,8 @@ */ #include +#include + #include #include #include @@ -136,17 +138,13 @@ static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu) write_sysreg(0, vttbr_el2); } -static hyp_alternate_select(__vgic_call_save_state, - __vgic_v2_save_state, __vgic_v3_save_state, - ARM64_HAS_SYSREG_GIC_CPUIF); - -static hyp_alternate_select(__vgic_call_restore_state, - __vgic_v2_restore_state, __vgic_v3_restore_state, - ARM64_HAS_SYSREG_GIC_CPUIF); - static void __hyp_text __vgic_save_state(struct kvm_vcpu *vcpu) { - __vgic_call_save_state()(vcpu); + if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_save_state(vcpu); + else + __vgic_v2_save_state(vcpu); + write_sysreg(read_sysreg(hcr_el2) & ~HCR_INT_OVERRIDE, hcr_el2); } @@ -159,7 +157,10 @@ static void __hyp_text __vgic_restore_state(struct kvm_vcpu *vcpu) val |= vcpu->arch.irq_lines; write_sysreg(val, hcr_el2); - __vgic_call_restore_state()(vcpu); + if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_restore_state(vcpu); + else + __vgic_v2_restore_state(vcpu); } static bool __hyp_text __true_value(void) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index bb46c03..8d22adc 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -24,6 +24,7 @@ #include #include #include +#include #define VGIC_V3_MAX_CPUS 255 #define VGIC_V2_MAX_CPUS 8 @@ -67,6 +68,9 @@ struct vgic_global { /* Only needed for the legacy KVM_CREATE_IRQCHIP */ bool can_emulate_gicv2; + + /* GIC system register CPU interface */ + struct static_key_false gicv3_cpuif; }; extern struct vgic_global kvm_vgic_global_state; diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c index 83777c1..8cebfbc 100644 --- a/virt/kvm/arm/vgic/vgic-init.c +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -405,6 +405,10 @@ int kvm_vgic_hyp_init(void) break; case GIC_V3: ret = vgic_v3_probe(gic_kvm_info); + if (!ret) { + static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif); + kvm_info("GIC system register CPU interface enabled\n"); + } break; default: ret = -ENODEV; diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index e83b7fe..8a529a7 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -29,7 +29,7 @@ #define DEBUG_SPINLOCK_BUG_ON(p) #endif -struct vgic_global __section(.hyp.text) kvm_vgic_global_state; +struct vgic_global __section(.hyp.text) kvm_vgic_global_state = {.gicv3_cpuif = STATIC_KEY_FALSE_INIT,}; /* * Locking order is always: -- cgit v0.10.2 From b5525ce898eb5cbbd0359d745e23a5516377fa64 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Mon, 12 Sep 2016 15:49:16 +0100 Subject: arm64: KVM: Move GIC accessors to arch_gicv3.h Since we are going to share vgic-v3 save/restore code with ARM keep arch specific accessors separately. 
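With the macros available from arch_gicv3.h, any arm64 code that includes that header can name the GICv3 hypervisor interface registers directly, for instance (usage sketch only):

#include <asm/arch_gicv3.h>

static u64 demo_probe_vtr(void)
{
	/* ICH_VTR_EL2 reports the number of list registers and priority bits. */
	return read_gicreg(ICH_VTR_EL2);
}

static void demo_disable_vcpuif(void)
{
	/* Clearing ICH_HCR_EL2 disables the virtual CPU interface, as the save path does. */
	write_gicreg(0, ICH_HCR_EL2);
}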
Signed-off-by: Vladimir Murzin Acked-by: Christoffer Dall Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index 8ec88e5..ae7dbd7 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -79,6 +79,19 @@ #include #include +#define read_gicreg(r) \ + ({ \ + u64 reg; \ + asm volatile("mrs_s %0, " __stringify(r) : "=r" (reg)); \ + reg; \ + }) + +#define write_gicreg(v,r) \ + do { \ + u64 __val = (v); \ + asm volatile("msr_s " __stringify(r) ", %0" : : "r" (__val));\ + } while (0) + /* * Low-level accessors * diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index ee1ea63..3947095 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -24,19 +24,6 @@ #define vtr_to_max_lr_idx(v) ((v) & 0xf) #define vtr_to_nr_pri_bits(v) (((u32)(v) >> 29) + 1) -#define read_gicreg(r) \ - ({ \ - u64 reg; \ - asm volatile("mrs_s %0, " __stringify(r) : "=r" (reg)); \ - reg; \ - }) - -#define write_gicreg(v,r) \ - do { \ - u64 __val = (v); \ - asm volatile("msr_s " __stringify(r) ", %0" : : "r" (__val));\ - } while (0) - static u64 __hyp_text __gic_v3_get_lr(unsigned int lr) { switch (lr & 0xf) { -- cgit v0.10.2 From 19f0ece4395470b24826fc090de2795ecc9cf4a0 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Mon, 12 Sep 2016 15:49:17 +0100 Subject: arm64: KVM: Move vgic-v3 save/restore to virt/kvm/arm/hyp So we can reuse the code under arch/arm Signed-off-by: Vladimir Murzin Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index 0c85feb..aaf42ae 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -5,9 +5,9 @@ KVM=../../../../virt/kvm obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o +obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o -obj-$(CONFIG_KVM_ARM_HOST) += vgic-v3-sr.o obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o obj-$(CONFIG_KVM_ARM_HOST) += entry.o diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c deleted file mode 100644 index 3947095..0000000 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ /dev/null @@ -1,328 +0,0 @@ -/* - * Copyright (C) 2012-2015 - ARM Ltd - * Author: Marc Zyngier - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- */ - -#include -#include -#include - -#include - -#define vtr_to_max_lr_idx(v) ((v) & 0xf) -#define vtr_to_nr_pri_bits(v) (((u32)(v) >> 29) + 1) - -static u64 __hyp_text __gic_v3_get_lr(unsigned int lr) -{ - switch (lr & 0xf) { - case 0: - return read_gicreg(ICH_LR0_EL2); - case 1: - return read_gicreg(ICH_LR1_EL2); - case 2: - return read_gicreg(ICH_LR2_EL2); - case 3: - return read_gicreg(ICH_LR3_EL2); - case 4: - return read_gicreg(ICH_LR4_EL2); - case 5: - return read_gicreg(ICH_LR5_EL2); - case 6: - return read_gicreg(ICH_LR6_EL2); - case 7: - return read_gicreg(ICH_LR7_EL2); - case 8: - return read_gicreg(ICH_LR8_EL2); - case 9: - return read_gicreg(ICH_LR9_EL2); - case 10: - return read_gicreg(ICH_LR10_EL2); - case 11: - return read_gicreg(ICH_LR11_EL2); - case 12: - return read_gicreg(ICH_LR12_EL2); - case 13: - return read_gicreg(ICH_LR13_EL2); - case 14: - return read_gicreg(ICH_LR14_EL2); - case 15: - return read_gicreg(ICH_LR15_EL2); - } - - unreachable(); -} - -static void __hyp_text __gic_v3_set_lr(u64 val, int lr) -{ - switch (lr & 0xf) { - case 0: - write_gicreg(val, ICH_LR0_EL2); - break; - case 1: - write_gicreg(val, ICH_LR1_EL2); - break; - case 2: - write_gicreg(val, ICH_LR2_EL2); - break; - case 3: - write_gicreg(val, ICH_LR3_EL2); - break; - case 4: - write_gicreg(val, ICH_LR4_EL2); - break; - case 5: - write_gicreg(val, ICH_LR5_EL2); - break; - case 6: - write_gicreg(val, ICH_LR6_EL2); - break; - case 7: - write_gicreg(val, ICH_LR7_EL2); - break; - case 8: - write_gicreg(val, ICH_LR8_EL2); - break; - case 9: - write_gicreg(val, ICH_LR9_EL2); - break; - case 10: - write_gicreg(val, ICH_LR10_EL2); - break; - case 11: - write_gicreg(val, ICH_LR11_EL2); - break; - case 12: - write_gicreg(val, ICH_LR12_EL2); - break; - case 13: - write_gicreg(val, ICH_LR13_EL2); - break; - case 14: - write_gicreg(val, ICH_LR14_EL2); - break; - case 15: - write_gicreg(val, ICH_LR15_EL2); - break; - } -} - -static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, int nr_lr) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - int i; - bool expect_mi; - - expect_mi = !!(cpu_if->vgic_hcr & ICH_HCR_UIE); - - for (i = 0; i < nr_lr; i++) { - if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) - continue; - - expect_mi |= (!(cpu_if->vgic_lr[i] & ICH_LR_HW) && - (cpu_if->vgic_lr[i] & ICH_LR_EOI)); - } - - if (expect_mi) { - cpu_if->vgic_misr = read_gicreg(ICH_MISR_EL2); - - if (cpu_if->vgic_misr & ICH_MISR_EOI) - cpu_if->vgic_eisr = read_gicreg(ICH_EISR_EL2); - else - cpu_if->vgic_eisr = 0; - } else { - cpu_if->vgic_misr = 0; - cpu_if->vgic_eisr = 0; - } -} - -void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - u64 val; - - /* - * Make sure stores to the GIC via the memory mapped interface - * are now visible to the system register interface. 
- */ - if (!cpu_if->vgic_sre) - dsb(st); - - cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); - - if (vcpu->arch.vgic_cpu.live_lrs) { - int i; - u32 max_lr_idx, nr_pri_bits; - - cpu_if->vgic_elrsr = read_gicreg(ICH_ELSR_EL2); - - write_gicreg(0, ICH_HCR_EL2); - val = read_gicreg(ICH_VTR_EL2); - max_lr_idx = vtr_to_max_lr_idx(val); - nr_pri_bits = vtr_to_nr_pri_bits(val); - - save_maint_int_state(vcpu, max_lr_idx + 1); - - for (i = 0; i <= max_lr_idx; i++) { - if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) - continue; - - if (cpu_if->vgic_elrsr & (1 << i)) - cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; - else - cpu_if->vgic_lr[i] = __gic_v3_get_lr(i); - - __gic_v3_set_lr(0, i); - } - - switch (nr_pri_bits) { - case 7: - cpu_if->vgic_ap0r[3] = read_gicreg(ICH_AP0R3_EL2); - cpu_if->vgic_ap0r[2] = read_gicreg(ICH_AP0R2_EL2); - case 6: - cpu_if->vgic_ap0r[1] = read_gicreg(ICH_AP0R1_EL2); - default: - cpu_if->vgic_ap0r[0] = read_gicreg(ICH_AP0R0_EL2); - } - - switch (nr_pri_bits) { - case 7: - cpu_if->vgic_ap1r[3] = read_gicreg(ICH_AP1R3_EL2); - cpu_if->vgic_ap1r[2] = read_gicreg(ICH_AP1R2_EL2); - case 6: - cpu_if->vgic_ap1r[1] = read_gicreg(ICH_AP1R1_EL2); - default: - cpu_if->vgic_ap1r[0] = read_gicreg(ICH_AP1R0_EL2); - } - - vcpu->arch.vgic_cpu.live_lrs = 0; - } else { - cpu_if->vgic_misr = 0; - cpu_if->vgic_eisr = 0; - cpu_if->vgic_elrsr = 0xffff; - cpu_if->vgic_ap0r[0] = 0; - cpu_if->vgic_ap0r[1] = 0; - cpu_if->vgic_ap0r[2] = 0; - cpu_if->vgic_ap0r[3] = 0; - cpu_if->vgic_ap1r[0] = 0; - cpu_if->vgic_ap1r[1] = 0; - cpu_if->vgic_ap1r[2] = 0; - cpu_if->vgic_ap1r[3] = 0; - } - - val = read_gicreg(ICC_SRE_EL2); - write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); - - if (!cpu_if->vgic_sre) { - /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */ - isb(); - write_gicreg(1, ICC_SRE_EL1); - } -} - -void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - u64 val; - u32 max_lr_idx, nr_pri_bits; - u16 live_lrs = 0; - int i; - - /* - * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a - * Group0 interrupt (as generated in GICv2 mode) to be - * delivered as a FIQ to the guest, with potentially fatal - * consequences. So we must make sure that ICC_SRE_EL1 has - * been actually programmed with the value we want before - * starting to mess with the rest of the GIC. - */ - if (!cpu_if->vgic_sre) { - write_gicreg(0, ICC_SRE_EL1); - isb(); - } - - val = read_gicreg(ICH_VTR_EL2); - max_lr_idx = vtr_to_max_lr_idx(val); - nr_pri_bits = vtr_to_nr_pri_bits(val); - - for (i = 0; i <= max_lr_idx; i++) { - if (cpu_if->vgic_lr[i] & ICH_LR_STATE) - live_lrs |= (1 << i); - } - - write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); - - if (live_lrs) { - write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); - - switch (nr_pri_bits) { - case 7: - write_gicreg(cpu_if->vgic_ap0r[3], ICH_AP0R3_EL2); - write_gicreg(cpu_if->vgic_ap0r[2], ICH_AP0R2_EL2); - case 6: - write_gicreg(cpu_if->vgic_ap0r[1], ICH_AP0R1_EL2); - default: - write_gicreg(cpu_if->vgic_ap0r[0], ICH_AP0R0_EL2); - } - - switch (nr_pri_bits) { - case 7: - write_gicreg(cpu_if->vgic_ap1r[3], ICH_AP1R3_EL2); - write_gicreg(cpu_if->vgic_ap1r[2], ICH_AP1R2_EL2); - case 6: - write_gicreg(cpu_if->vgic_ap1r[1], ICH_AP1R1_EL2); - default: - write_gicreg(cpu_if->vgic_ap1r[0], ICH_AP1R0_EL2); - } - - for (i = 0; i <= max_lr_idx; i++) { - if (!(live_lrs & (1 << i))) - continue; - - __gic_v3_set_lr(cpu_if->vgic_lr[i], i); - } - } - - /* - * Ensures that the above will have reached the - * (re)distributors. 
This ensure the guest will read the - * correct values from the memory-mapped interface. - */ - if (!cpu_if->vgic_sre) { - isb(); - dsb(sy); - } - vcpu->arch.vgic_cpu.live_lrs = live_lrs; - - /* - * Prevent the guest from touching the GIC system registers if - * SRE isn't enabled for GICv3 emulation. - */ - write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE, - ICC_SRE_EL2); -} - -void __hyp_text __vgic_v3_init_lrs(void) -{ - int max_lr_idx = vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2)); - int i; - - for (i = 0; i <= max_lr_idx; i++) - __gic_v3_set_lr(0, i); -} - -u64 __hyp_text __vgic_v3_get_ich_vtr_el2(void) -{ - return read_gicreg(ICH_VTR_EL2); -} diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c new file mode 100644 index 0000000..3947095 --- /dev/null +++ b/virt/kvm/arm/hyp/vgic-v3-sr.c @@ -0,0 +1,328 @@ +/* + * Copyright (C) 2012-2015 - ARM Ltd + * Author: Marc Zyngier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include + +#include + +#define vtr_to_max_lr_idx(v) ((v) & 0xf) +#define vtr_to_nr_pri_bits(v) (((u32)(v) >> 29) + 1) + +static u64 __hyp_text __gic_v3_get_lr(unsigned int lr) +{ + switch (lr & 0xf) { + case 0: + return read_gicreg(ICH_LR0_EL2); + case 1: + return read_gicreg(ICH_LR1_EL2); + case 2: + return read_gicreg(ICH_LR2_EL2); + case 3: + return read_gicreg(ICH_LR3_EL2); + case 4: + return read_gicreg(ICH_LR4_EL2); + case 5: + return read_gicreg(ICH_LR5_EL2); + case 6: + return read_gicreg(ICH_LR6_EL2); + case 7: + return read_gicreg(ICH_LR7_EL2); + case 8: + return read_gicreg(ICH_LR8_EL2); + case 9: + return read_gicreg(ICH_LR9_EL2); + case 10: + return read_gicreg(ICH_LR10_EL2); + case 11: + return read_gicreg(ICH_LR11_EL2); + case 12: + return read_gicreg(ICH_LR12_EL2); + case 13: + return read_gicreg(ICH_LR13_EL2); + case 14: + return read_gicreg(ICH_LR14_EL2); + case 15: + return read_gicreg(ICH_LR15_EL2); + } + + unreachable(); +} + +static void __hyp_text __gic_v3_set_lr(u64 val, int lr) +{ + switch (lr & 0xf) { + case 0: + write_gicreg(val, ICH_LR0_EL2); + break; + case 1: + write_gicreg(val, ICH_LR1_EL2); + break; + case 2: + write_gicreg(val, ICH_LR2_EL2); + break; + case 3: + write_gicreg(val, ICH_LR3_EL2); + break; + case 4: + write_gicreg(val, ICH_LR4_EL2); + break; + case 5: + write_gicreg(val, ICH_LR5_EL2); + break; + case 6: + write_gicreg(val, ICH_LR6_EL2); + break; + case 7: + write_gicreg(val, ICH_LR7_EL2); + break; + case 8: + write_gicreg(val, ICH_LR8_EL2); + break; + case 9: + write_gicreg(val, ICH_LR9_EL2); + break; + case 10: + write_gicreg(val, ICH_LR10_EL2); + break; + case 11: + write_gicreg(val, ICH_LR11_EL2); + break; + case 12: + write_gicreg(val, ICH_LR12_EL2); + break; + case 13: + write_gicreg(val, ICH_LR13_EL2); + break; + case 14: + write_gicreg(val, ICH_LR14_EL2); + break; + case 15: + write_gicreg(val, ICH_LR15_EL2); + break; + } +} + +static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, int nr_lr) +{ + struct vgic_v3_cpu_if *cpu_if = 
&vcpu->arch.vgic_cpu.vgic_v3; + int i; + bool expect_mi; + + expect_mi = !!(cpu_if->vgic_hcr & ICH_HCR_UIE); + + for (i = 0; i < nr_lr; i++) { + if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) + continue; + + expect_mi |= (!(cpu_if->vgic_lr[i] & ICH_LR_HW) && + (cpu_if->vgic_lr[i] & ICH_LR_EOI)); + } + + if (expect_mi) { + cpu_if->vgic_misr = read_gicreg(ICH_MISR_EL2); + + if (cpu_if->vgic_misr & ICH_MISR_EOI) + cpu_if->vgic_eisr = read_gicreg(ICH_EISR_EL2); + else + cpu_if->vgic_eisr = 0; + } else { + cpu_if->vgic_misr = 0; + cpu_if->vgic_eisr = 0; + } +} + +void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + u64 val; + + /* + * Make sure stores to the GIC via the memory mapped interface + * are now visible to the system register interface. + */ + if (!cpu_if->vgic_sre) + dsb(st); + + cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); + + if (vcpu->arch.vgic_cpu.live_lrs) { + int i; + u32 max_lr_idx, nr_pri_bits; + + cpu_if->vgic_elrsr = read_gicreg(ICH_ELSR_EL2); + + write_gicreg(0, ICH_HCR_EL2); + val = read_gicreg(ICH_VTR_EL2); + max_lr_idx = vtr_to_max_lr_idx(val); + nr_pri_bits = vtr_to_nr_pri_bits(val); + + save_maint_int_state(vcpu, max_lr_idx + 1); + + for (i = 0; i <= max_lr_idx; i++) { + if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) + continue; + + if (cpu_if->vgic_elrsr & (1 << i)) + cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; + else + cpu_if->vgic_lr[i] = __gic_v3_get_lr(i); + + __gic_v3_set_lr(0, i); + } + + switch (nr_pri_bits) { + case 7: + cpu_if->vgic_ap0r[3] = read_gicreg(ICH_AP0R3_EL2); + cpu_if->vgic_ap0r[2] = read_gicreg(ICH_AP0R2_EL2); + case 6: + cpu_if->vgic_ap0r[1] = read_gicreg(ICH_AP0R1_EL2); + default: + cpu_if->vgic_ap0r[0] = read_gicreg(ICH_AP0R0_EL2); + } + + switch (nr_pri_bits) { + case 7: + cpu_if->vgic_ap1r[3] = read_gicreg(ICH_AP1R3_EL2); + cpu_if->vgic_ap1r[2] = read_gicreg(ICH_AP1R2_EL2); + case 6: + cpu_if->vgic_ap1r[1] = read_gicreg(ICH_AP1R1_EL2); + default: + cpu_if->vgic_ap1r[0] = read_gicreg(ICH_AP1R0_EL2); + } + + vcpu->arch.vgic_cpu.live_lrs = 0; + } else { + cpu_if->vgic_misr = 0; + cpu_if->vgic_eisr = 0; + cpu_if->vgic_elrsr = 0xffff; + cpu_if->vgic_ap0r[0] = 0; + cpu_if->vgic_ap0r[1] = 0; + cpu_if->vgic_ap0r[2] = 0; + cpu_if->vgic_ap0r[3] = 0; + cpu_if->vgic_ap1r[0] = 0; + cpu_if->vgic_ap1r[1] = 0; + cpu_if->vgic_ap1r[2] = 0; + cpu_if->vgic_ap1r[3] = 0; + } + + val = read_gicreg(ICC_SRE_EL2); + write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); + + if (!cpu_if->vgic_sre) { + /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */ + isb(); + write_gicreg(1, ICC_SRE_EL1); + } +} + +void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + u64 val; + u32 max_lr_idx, nr_pri_bits; + u16 live_lrs = 0; + int i; + + /* + * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a + * Group0 interrupt (as generated in GICv2 mode) to be + * delivered as a FIQ to the guest, with potentially fatal + * consequences. So we must make sure that ICC_SRE_EL1 has + * been actually programmed with the value we want before + * starting to mess with the rest of the GIC. 
+ */ + if (!cpu_if->vgic_sre) { + write_gicreg(0, ICC_SRE_EL1); + isb(); + } + + val = read_gicreg(ICH_VTR_EL2); + max_lr_idx = vtr_to_max_lr_idx(val); + nr_pri_bits = vtr_to_nr_pri_bits(val); + + for (i = 0; i <= max_lr_idx; i++) { + if (cpu_if->vgic_lr[i] & ICH_LR_STATE) + live_lrs |= (1 << i); + } + + write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); + + if (live_lrs) { + write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); + + switch (nr_pri_bits) { + case 7: + write_gicreg(cpu_if->vgic_ap0r[3], ICH_AP0R3_EL2); + write_gicreg(cpu_if->vgic_ap0r[2], ICH_AP0R2_EL2); + case 6: + write_gicreg(cpu_if->vgic_ap0r[1], ICH_AP0R1_EL2); + default: + write_gicreg(cpu_if->vgic_ap0r[0], ICH_AP0R0_EL2); + } + + switch (nr_pri_bits) { + case 7: + write_gicreg(cpu_if->vgic_ap1r[3], ICH_AP1R3_EL2); + write_gicreg(cpu_if->vgic_ap1r[2], ICH_AP1R2_EL2); + case 6: + write_gicreg(cpu_if->vgic_ap1r[1], ICH_AP1R1_EL2); + default: + write_gicreg(cpu_if->vgic_ap1r[0], ICH_AP1R0_EL2); + } + + for (i = 0; i <= max_lr_idx; i++) { + if (!(live_lrs & (1 << i))) + continue; + + __gic_v3_set_lr(cpu_if->vgic_lr[i], i); + } + } + + /* + * Ensures that the above will have reached the + * (re)distributors. This ensure the guest will read the + * correct values from the memory-mapped interface. + */ + if (!cpu_if->vgic_sre) { + isb(); + dsb(sy); + } + vcpu->arch.vgic_cpu.live_lrs = live_lrs; + + /* + * Prevent the guest from touching the GIC system registers if + * SRE isn't enabled for GICv3 emulation. + */ + write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE, + ICC_SRE_EL2); +} + +void __hyp_text __vgic_v3_init_lrs(void) +{ + int max_lr_idx = vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2)); + int i; + + for (i = 0; i <= max_lr_idx; i++) + __gic_v3_set_lr(0, i); +} + +u64 __hyp_text __vgic_v3_get_ich_vtr_el2(void) +{ + return read_gicreg(ICH_VTR_EL2); +} -- cgit v0.10.2 From 7a1ff708286045a6664fbda716fd3cf8d63afadb Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Mon, 12 Sep 2016 15:49:18 +0100 Subject: KVM: arm64: vgic-its: Introduce config option to guard ITS specific code By now ITS code guarded with KVM_ARM_VGIC_V3 config option which was introduced to hide everything specific to vgic-v3 from 32-bit world. We are going to support vgic-v3 in 32-bit world and KVM_ARM_VGIC_V3 will gone, but we don't have support for ITS there yet and we need to continue keeping ITS away. Introduce the new config option to prevent ITS code being build in 32-bit mode when support for vgic-v3 is done. 
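For illustration only (this is not how the patch below does it): besides wrapping call sites in #ifdef CONFIG_KVM_ARM_VGIC_V3_ITS, a common kernel pattern for such a hidden Kconfig symbol is to give the optional functions static inline stubs in the header, so callers need no guards of their own. A minimal header-fragment sketch, with illustrative stub return values:

struct kvm;

#ifdef CONFIG_KVM_ARM_VGIC_V3_ITS
/* Real implementations are built only when the ITS code is enabled */
bool vgic_has_its(struct kvm *kvm);
int kvm_vgic_register_its_device(void);
#else
/* Stubs keep callers free of #ifdefs when ITS support is compiled out */
static inline bool vgic_has_its(struct kvm *kvm)
{
	return false;
}

static inline int kvm_vgic_register_its_device(void)
{
	return -ENODEV;	/* needs <linux/errno.h> in a real header */
}
#endif

The series itself keeps the guards at the call sites, as the hunks below show.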
Signed-off-by: Vladimir Murzin Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 9c9edc9..7ba9164 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -16,6 +16,9 @@ menuconfig VIRTUALIZATION if VIRTUALIZATION +config KVM_ARM_VGIC_V3_ITS + bool + config KVM_ARM_VGIC_V3 bool @@ -35,6 +38,7 @@ config KVM select HAVE_KVM_EVENTFD select HAVE_KVM_IRQFD select KVM_ARM_VGIC_V3 + select KVM_ARM_VGIC_V3_ITS select KVM_ARM_PMU if HW_PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_IRQCHIP diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c index 163b057..89ef9bc 100644 --- a/virt/kvm/arm/vgic/vgic-kvm-device.c +++ b/virt/kvm/arm/vgic/vgic-kvm-device.c @@ -223,9 +223,12 @@ int kvm_register_vgic_device(unsigned long type) case KVM_DEV_TYPE_ARM_VGIC_V3: ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops, KVM_DEV_TYPE_ARM_VGIC_V3); + +#ifdef CONFIG_KVM_ARM_VGIC_V3_ITS if (ret) break; ret = kvm_vgic_register_its_device(); +#endif break; #endif } diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index 90d8181..acbe691 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -42,6 +42,7 @@ u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len, return reg | ((u64)val << lower); } +#ifdef CONFIG_KVM_ARM_VGIC_V3_ITS bool vgic_has_its(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; @@ -51,6 +52,7 @@ bool vgic_has_its(struct kvm *kvm) return dist->has_its; } +#endif static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 6c4625c..100045f 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -84,11 +84,15 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu); int vgic_v3_probe(const struct gic_kvm_info *info); int vgic_v3_map_resources(struct kvm *kvm); int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address); + +#ifdef CONFIG_KVM_ARM_VGIC_V3_ITS int vgic_register_its_iodevs(struct kvm *kvm); bool vgic_has_its(struct kvm *kvm); int kvm_vgic_register_its_device(void); void vgic_enable_lpis(struct kvm_vcpu *vcpu); int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi); +#endif + #else static inline void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu) { -- cgit v0.10.2 From e533a37f7b5ee5eb8b102cf0823e84cd6a7deb57 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Mon, 12 Sep 2016 15:49:19 +0100 Subject: KVM: arm: vgic: Fix compiler warnings when built for 32-bit Well, this patch is looking ahead of time, but we'll get following compiler warnings as soon as we introduce vgic-v3 to 32-bit world CC arch/arm/kvm/../../../virt/kvm/arm/vgic/vgic-mmio-v3.o arch/arm/kvm/../../../virt/kvm/arm/vgic/vgic-mmio-v3.c: In function 'vgic_mmio_read_v3r_typer': arch/arm/kvm/../../../virt/kvm/arm/vgic/vgic-mmio-v3.c:184:35: warning: left shift count >= width of type [-Wshift-count-overflow] value = (mpidr & GENMASK(23, 0)) << 32; ^ In file included from ./include/linux/kernel.h:10:0, from ./include/asm-generic/bug.h:13, from ./arch/arm/include/asm/bug.h:59, from ./include/linux/bug.h:4, from ./include/linux/io.h:23, from ./arch/arm/include/asm/arch_gicv3.h:23, from ./include/linux/irqchip/arm-gic-v3.h:411, from arch/arm/kvm/../../../virt/kvm/arm/vgic/vgic-mmio-v3.c:14: arch/arm/kvm/../../../virt/kvm/arm/vgic/vgic-mmio-v3.c: In function 'vgic_v3_dispatch_sgi': 
./include/linux/bitops.h:6:24: warning: left shift count >= width of type [-Wshift-count-overflow] #define BIT(nr) (1UL << (nr)) ^ arch/arm/kvm/../../../virt/kvm/arm/vgic/vgic-mmio-v3.c:614:20: note: in expansion of macro 'BIT' broadcast = reg & BIT(ICC_SGI1R_IRQ_ROUTING_MODE_BIT); ^ Let's fix them now. Acked-by: Marc Zyngier Signed-off-by: Vladimir Murzin Signed-off-by: Christoffer Dall diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index acbe691..6385ed5 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -181,7 +181,7 @@ static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu, int target_vcpu_id = vcpu->vcpu_id; u64 value; - value = (mpidr & GENMASK(23, 0)) << 32; + value = (u64)(mpidr & GENMASK(23, 0)) << 32; value |= ((target_vcpu_id & 0xffff) << 8); if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1) value |= GICR_TYPER_LAST; @@ -611,7 +611,7 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg) bool broadcast; sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT; - broadcast = reg & BIT(ICC_SGI1R_IRQ_ROUTING_MODE_BIT); + broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT); target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT; mpidr = SGI_AFFINITY_LEVEL(reg, 3); mpidr |= SGI_AFFINITY_LEVEL(reg, 2); -- cgit v0.10.2 From d7d0a11e44a1c6c1e61db7f305115f33be900704 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Mon, 12 Sep 2016 15:49:20 +0100 Subject: KVM: arm: vgic: Support 64-bit data manipulation on 32-bit host systems We have couple of 64-bit registers defined in GICv3 architecture, so unsigned long accesses to these registers will only access a single 32-bit part of that regitser. On the other hand these registers can't be accessed as 64-bit with a single instruction like ldrd/strd or ldmia/stmia if we run a 32-bit host because KVM does not support access to MMIO space done by these instructions. It means that a 32-bit guest accesses these registers in 32-bit chunks, so the only thing we need to do is to ensure that extract_bytes() always takes 64-bit data. 
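To make the truncation concrete, here is a small stand-alone user-space sketch (not kernel code; GENMASK_ULL is re-implemented locally and the register value is made up). Built with -m32 it shows how the old unsigned long parameter drops the upper half of a 64-bit register before extraction, while the u64 parameter preserves it; the return type stays unsigned long, matching the 32-bit chunks a 32-bit guest actually reads:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;

/* Local stand-in for the kernel's GENMASK_ULL(h, l) */
#define GENMASK_ULL(h, l) \
	((~0ULL << (l)) & (~0ULL >> (63 - (h))))

/* Old prototype: 'unsigned long' is only 32 bits wide on a 32-bit host */
static unsigned long extract_bytes_old(unsigned long data,
				       unsigned int offset, unsigned int num)
{
	return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0);
}

/* New prototype: the register value is always handed over as 64 bits */
static unsigned long extract_bytes_new(u64 data,
				       unsigned int offset, unsigned int num)
{
	return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0);
}

int main(void)
{
	u64 reg = 0x00aabbccdd000000ULL;	/* made-up 64-bit register value */

	/*
	 * With -m32 the old variant truncates 'reg' to 0xdd000000 at the
	 * call, so extracting bytes 2..5 yields 0xdd00 instead of the
	 * expected 0xbbccdd00. The new variant keeps all 64 bits.
	 */
	printf("old: 0x%lx\n", extract_bytes_old(reg, 2, 4));
	printf("new: 0x%lx\n", extract_bytes_new(reg, 2, 4));
	return 0;
}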
Acked-by: Marc Zyngier Signed-off-by: Vladimir Murzin Signed-off-by: Christoffer Dall diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index 6385ed5..0d3c76a 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -23,7 +23,7 @@ #include "vgic-mmio.h" /* extract @num bytes at @offset bytes offset in data */ -unsigned long extract_bytes(unsigned long data, unsigned int offset, +unsigned long extract_bytes(u64 data, unsigned int offset, unsigned int num) { return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0); diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h index 0b3ecf9..80f92ce 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.h +++ b/virt/kvm/arm/vgic/vgic-mmio.h @@ -96,7 +96,7 @@ unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len); void vgic_data_host_to_mmio_bus(void *buf, unsigned int len, unsigned long data); -unsigned long extract_bytes(unsigned long data, unsigned int offset, +unsigned long extract_bytes(u64 data, unsigned int offset, unsigned int num); u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len, -- cgit v0.10.2 From 163352eb6827e5b556b0860edc19327798fcc5ff Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Mon, 12 Sep 2016 15:49:21 +0100 Subject: ARM: Introduce MPIDR_LEVEL_SHIFT macro vgic-v3 driver uses architecture specific MPIDR_LEVEL_SHIFT macro to encode the affinity in a form compatible with ICC_SGI* registers. Unfortunately, that macro is missing on ARM, so let's add it. Cc: Russell King Acked-by: Marc Zyngier Signed-off-by: Vladimir Murzin Signed-off-by: Christoffer Dall diff --git a/arch/arm/include/asm/cputype.h b/arch/arm/include/asm/cputype.h index 1ee94c7..e2d94c1 100644 --- a/arch/arm/include/asm/cputype.h +++ b/arch/arm/include/asm/cputype.h @@ -55,6 +55,7 @@ #define MPIDR_LEVEL_BITS 8 #define MPIDR_LEVEL_MASK ((1 << MPIDR_LEVEL_BITS) - 1) +#define MPIDR_LEVEL_SHIFT(level) (MPIDR_LEVEL_BITS * level) #define MPIDR_AFFINITY_LEVEL(mpidr, level) \ ((mpidr >> (MPIDR_LEVEL_BITS * level)) & MPIDR_LEVEL_MASK) -- cgit v0.10.2 From 4f2546384150e78cad8045e59a9587fabcd9f9fe Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Mon, 12 Sep 2016 15:49:22 +0100 Subject: ARM: Move system register accessors to asm/cp15.h Headers linux/irqchip/arm-gic.v3.h and arch/arm/include/asm/kvm_hyp.h are included in virt/kvm/arm/hyp/vgic-v3-sr.c and both define macros called __ACCESS_CP15 and __ACCESS_CP15_64 which obviously creates a conflict. These macros were introduced independently for GIC and KVM and, in fact, do the same thing. As an option we could add prefixes to KVM and GIC version of macros so they won't clash, but it'd introduce code duplication. Alternatively, we could keep macro in, say, GIC header and include it in KVM one (or vice versa), but such dependency would not look nicer. 
So we follow arm64 way (it handles this via sysreg.h) and move only single set of macros to asm/cp15.h Cc: Russell King Acked-by: Marc Zyngier Signed-off-by: Vladimir Murzin Signed-off-by: Christoffer Dall diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h index e08d151..af25c32 100644 --- a/arch/arm/include/asm/arch_gicv3.h +++ b/arch/arm/include/asm/arch_gicv3.h @@ -22,9 +22,7 @@ #include #include - -#define __ACCESS_CP15(CRn, Op1, CRm, Op2) p15, Op1, %0, CRn, CRm, Op2 -#define __ACCESS_CP15_64(Op1, CRm) p15, Op1, %Q0, %R0, CRm +#include #define ICC_EOIR1 __ACCESS_CP15(c12, 0, c12, 1) #define ICC_DIR __ACCESS_CP15(c12, 0, c11, 1) @@ -102,58 +100,55 @@ static inline void gic_write_eoir(u32 irq) { - asm volatile("mcr " __stringify(ICC_EOIR1) : : "r" (irq)); + write_sysreg(irq, ICC_EOIR1); isb(); } static inline void gic_write_dir(u32 val) { - asm volatile("mcr " __stringify(ICC_DIR) : : "r" (val)); + write_sysreg(val, ICC_DIR); isb(); } static inline u32 gic_read_iar(void) { - u32 irqstat; + u32 irqstat = read_sysreg(ICC_IAR1); - asm volatile("mrc " __stringify(ICC_IAR1) : "=r" (irqstat)); dsb(sy); + return irqstat; } static inline void gic_write_pmr(u32 val) { - asm volatile("mcr " __stringify(ICC_PMR) : : "r" (val)); + write_sysreg(val, ICC_PMR); } static inline void gic_write_ctlr(u32 val) { - asm volatile("mcr " __stringify(ICC_CTLR) : : "r" (val)); + write_sysreg(val, ICC_CTLR); isb(); } static inline void gic_write_grpen1(u32 val) { - asm volatile("mcr " __stringify(ICC_IGRPEN1) : : "r" (val)); + write_sysreg(val, ICC_IGRPEN1); isb(); } static inline void gic_write_sgi1r(u64 val) { - asm volatile("mcrr " __stringify(ICC_SGI1R) : : "r" (val)); + write_sysreg(val, ICC_SGI1R); } static inline u32 gic_read_sre(void) { - u32 val; - - asm volatile("mrc " __stringify(ICC_SRE) : "=r" (val)); - return val; + return read_sysreg(ICC_SRE); } static inline void gic_write_sre(u32 val) { - asm volatile("mcr " __stringify(ICC_SRE) : : "r" (val)); + write_sysreg(val, ICC_SRE); isb(); } diff --git a/arch/arm/include/asm/cp15.h b/arch/arm/include/asm/cp15.h index c3f1152..dbdbce1 100644 --- a/arch/arm/include/asm/cp15.h +++ b/arch/arm/include/asm/cp15.h @@ -49,6 +49,21 @@ #ifdef CONFIG_CPU_CP15 +#define __ACCESS_CP15(CRn, Op1, CRm, Op2) \ + "mrc", "mcr", __stringify(p15, Op1, %0, CRn, CRm, Op2), u32 +#define __ACCESS_CP15_64(Op1, CRm) \ + "mrrc", "mcrr", __stringify(p15, Op1, %Q0, %R0, CRm), u64 + +#define __read_sysreg(r, w, c, t) ({ \ + t __val; \ + asm volatile(r " " c : "=r" (__val)); \ + __val; \ +}) +#define read_sysreg(...) __read_sysreg(__VA_ARGS__) + +#define __write_sysreg(v, r, w, c, t) asm volatile(w " " c : : "r" ((t)(v))) +#define write_sysreg(v, ...) 
__write_sysreg(v, __VA_ARGS__) + extern unsigned long cr_alignment; /* defined in entry-armv.S */ static inline unsigned long get_cr(void) diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h index 6eaff28..e604ad68 100644 --- a/arch/arm/include/asm/kvm_hyp.h +++ b/arch/arm/include/asm/kvm_hyp.h @@ -20,28 +20,15 @@ #include #include +#include #include #include #define __hyp_text __section(.hyp.text) notrace -#define __ACCESS_CP15(CRn, Op1, CRm, Op2) \ - "mrc", "mcr", __stringify(p15, Op1, %0, CRn, CRm, Op2), u32 -#define __ACCESS_CP15_64(Op1, CRm) \ - "mrrc", "mcrr", __stringify(p15, Op1, %Q0, %R0, CRm), u64 #define __ACCESS_VFP(CRn) \ "mrc", "mcr", __stringify(p10, 7, %0, CRn, cr0, 0), u32 -#define __write_sysreg(v, r, w, c, t) asm volatile(w " " c : : "r" ((t)(v))) -#define write_sysreg(v, ...) __write_sysreg(v, __VA_ARGS__) - -#define __read_sysreg(r, w, c, t) ({ \ - t __val; \ - asm volatile(r " " c : "=r" (__val)); \ - __val; \ -}) -#define read_sysreg(...) __read_sysreg(__VA_ARGS__) - #define write_special(v, r) \ asm volatile("msr " __stringify(r) ", %0" : : "r" (v)) #define read_special(r) ({ \ -- cgit v0.10.2 From a078bedf17c2e43819fea54bdfd5793845142e3a Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Mon, 12 Sep 2016 15:49:23 +0100 Subject: ARM: gic-v3: Introduce 32-to-64-bit mappings for GICv3 cpu registers vgic-v3 save/restore routines are written in such way that they map arm64 system register naming nicely, but it does not fit to arm world. To keep virt/kvm/arm/hyp/vgic-v3-sr.c untouched we create a mapping with a function for each register mapping the 32-bit to the 64-bit accessors. Please, note that 64-bit wide ICH_LR is split in two 32-bit halves (ICH_LR and ICH_LRC) accessed independently. Acked-by: Marc Zyngier Signed-off-by: Vladimir Murzin Signed-off-by: Christoffer Dall diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h index af25c32..996848e 100644 --- a/arch/arm/include/asm/arch_gicv3.h +++ b/arch/arm/include/asm/arch_gicv3.h @@ -96,6 +96,70 @@ #define ICH_AP1R2 __AP1Rx(2) #define ICH_AP1R3 __AP1Rx(3) +/* A32-to-A64 mappings used by VGIC save/restore */ + +#define CPUIF_MAP(a32, a64) \ +static inline void write_ ## a64(u32 val) \ +{ \ + write_sysreg(val, a32); \ +} \ +static inline u32 read_ ## a64(void) \ +{ \ + return read_sysreg(a32); \ +} \ + +#define CPUIF_MAP_LO_HI(a32lo, a32hi, a64) \ +static inline void write_ ## a64(u64 val) \ +{ \ + write_sysreg(lower_32_bits(val), a32lo);\ + write_sysreg(upper_32_bits(val), a32hi);\ +} \ +static inline u64 read_ ## a64(void) \ +{ \ + u64 val = read_sysreg(a32lo); \ + \ + val |= (u64)read_sysreg(a32hi) << 32; \ + \ + return val; \ +} + +CPUIF_MAP(ICH_HCR, ICH_HCR_EL2) +CPUIF_MAP(ICH_VTR, ICH_VTR_EL2) +CPUIF_MAP(ICH_MISR, ICH_MISR_EL2) +CPUIF_MAP(ICH_EISR, ICH_EISR_EL2) +CPUIF_MAP(ICH_ELSR, ICH_ELSR_EL2) +CPUIF_MAP(ICH_VMCR, ICH_VMCR_EL2) +CPUIF_MAP(ICH_AP0R3, ICH_AP0R3_EL2) +CPUIF_MAP(ICH_AP0R2, ICH_AP0R2_EL2) +CPUIF_MAP(ICH_AP0R1, ICH_AP0R1_EL2) +CPUIF_MAP(ICH_AP0R0, ICH_AP0R0_EL2) +CPUIF_MAP(ICH_AP1R3, ICH_AP1R3_EL2) +CPUIF_MAP(ICH_AP1R2, ICH_AP1R2_EL2) +CPUIF_MAP(ICH_AP1R1, ICH_AP1R1_EL2) +CPUIF_MAP(ICH_AP1R0, ICH_AP1R0_EL2) +CPUIF_MAP(ICC_HSRE, ICC_SRE_EL2) +CPUIF_MAP(ICC_SRE, ICC_SRE_EL1) + +CPUIF_MAP_LO_HI(ICH_LR15, ICH_LRC15, ICH_LR15_EL2) +CPUIF_MAP_LO_HI(ICH_LR14, ICH_LRC14, ICH_LR14_EL2) +CPUIF_MAP_LO_HI(ICH_LR13, ICH_LRC13, ICH_LR13_EL2) +CPUIF_MAP_LO_HI(ICH_LR12, ICH_LRC12, ICH_LR12_EL2) +CPUIF_MAP_LO_HI(ICH_LR11, ICH_LRC11, ICH_LR11_EL2) 
+CPUIF_MAP_LO_HI(ICH_LR10, ICH_LRC10, ICH_LR10_EL2) +CPUIF_MAP_LO_HI(ICH_LR9, ICH_LRC9, ICH_LR9_EL2) +CPUIF_MAP_LO_HI(ICH_LR8, ICH_LRC8, ICH_LR8_EL2) +CPUIF_MAP_LO_HI(ICH_LR7, ICH_LRC7, ICH_LR7_EL2) +CPUIF_MAP_LO_HI(ICH_LR6, ICH_LRC6, ICH_LR6_EL2) +CPUIF_MAP_LO_HI(ICH_LR5, ICH_LRC5, ICH_LR5_EL2) +CPUIF_MAP_LO_HI(ICH_LR4, ICH_LRC4, ICH_LR4_EL2) +CPUIF_MAP_LO_HI(ICH_LR3, ICH_LRC3, ICH_LR3_EL2) +CPUIF_MAP_LO_HI(ICH_LR2, ICH_LRC2, ICH_LR2_EL2) +CPUIF_MAP_LO_HI(ICH_LR1, ICH_LRC1, ICH_LR1_EL2) +CPUIF_MAP_LO_HI(ICH_LR0, ICH_LRC0, ICH_LR0_EL2) + +#define read_gicreg(r) read_##r() +#define write_gicreg(v, r) write_##r(v) + /* Low-level accessors */ static inline void gic_write_eoir(u32 irq) -- cgit v0.10.2 From acda5430bee4621f218391d0bcfbe4412adb3554 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Mon, 12 Sep 2016 15:49:24 +0100 Subject: ARM: KVM: Support vgic-v3 This patch allows to build and use vgic-v3 in 32-bit mode. Unfortunately, it can not be split in several steps without extra stubs to keep patches independent and bisectable. For instance, virt/kvm/arm/vgic/vgic-v3.c uses function from vgic-v3-sr.c, handling access to GICv3 cpu interface from the guest requires vgic_v3.vgic_sre to be already defined. It is how support has been done: * handle SGI requests from the guest * report configured SRE on access to GICv3 cpu interface from the guest * required vgic-v3 macros are provided via uapi.h * static keys are used to select GIC backend * to make vgic-v3 build KVM_ARM_VGIC_V3 guard is removed along with the static inlines Acked-by: Marc Zyngier Reviewed-by: Christoffer Dall Signed-off-by: Vladimir Murzin Signed-off-by: Christoffer Dall diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 05e47fa..d7ea6bc 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -72,6 +72,9 @@ extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); extern void __init_stage2_translation(void); extern void __kvm_hyp_reset(unsigned long); + +extern u64 __vgic_v3_get_ich_vtr_el2(void); +extern void __vgic_v3_init_lrs(void); #endif #endif /* __ARM_KVM_ASM_H__ */ diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index de338d9..c2c40a7 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -39,7 +39,12 @@ #include + +#ifdef CONFIG_ARM_GIC_V3 +#define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS +#else #define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS +#endif #define KVM_REQ_VCPU_EXIT 8 diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h index e604ad68..343135e 100644 --- a/arch/arm/include/asm/kvm_hyp.h +++ b/arch/arm/include/asm/kvm_hyp.h @@ -106,6 +106,9 @@ void __vgic_v2_restore_state(struct kvm_vcpu *vcpu); void __sysreg_save_state(struct kvm_cpu_context *ctxt); void __sysreg_restore_state(struct kvm_cpu_context *ctxt); +void __vgic_v3_save_state(struct kvm_vcpu *vcpu); +void __vgic_v3_restore_state(struct kvm_vcpu *vcpu); + void asmlinkage __vfp_save_state(struct vfp_hard_struct *vfp); void asmlinkage __vfp_restore_state(struct vfp_hard_struct *vfp); static inline bool __vfp_enabled(void) diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index a2b3eb3..b38c10c 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -84,6 +84,13 @@ struct kvm_regs { #define KVM_VGIC_V2_DIST_SIZE 0x1000 #define KVM_VGIC_V2_CPU_SIZE 0x2000 +/* Supported VGICv3 address types */ +#define KVM_VGIC_V3_ADDR_TYPE_DIST 2 +#define KVM_VGIC_V3_ADDR_TYPE_REDIST 3 + 
+#define KVM_VGIC_V3_DIST_SIZE SZ_64K +#define KVM_VGIC_V3_REDIST_SIZE (2 * SZ_64K) + #define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */ #define KVM_ARM_VCPU_PSCI_0_2 1 /* CPU uses PSCI v0.2 */ diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index 339ec88..f19842e 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -27,8 +27,10 @@ obj-y += $(KVM)/arm/vgic/vgic.o obj-y += $(KVM)/arm/vgic/vgic-init.o obj-y += $(KVM)/arm/vgic/vgic-irqfd.o obj-y += $(KVM)/arm/vgic/vgic-v2.o +obj-y += $(KVM)/arm/vgic/vgic-v3.o obj-y += $(KVM)/arm/vgic/vgic-mmio.o obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o +obj-y += $(KVM)/arm/vgic/vgic-mmio-v3.o obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o obj-y += $(KVM)/irqchip.o obj-y += $(KVM)/arm/arch_timer.o diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index 1bb2b79..3e5e419 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -228,6 +228,35 @@ bool access_vm_reg(struct kvm_vcpu *vcpu, return true; } +static bool access_gic_sgi(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + const struct coproc_reg *r) +{ + u64 reg; + + if (!p->is_write) + return read_from_write_only(vcpu, p); + + reg = (u64)*vcpu_reg(vcpu, p->Rt2) << 32; + reg |= *vcpu_reg(vcpu, p->Rt1) ; + + vgic_v3_dispatch_sgi(vcpu, reg); + + return true; +} + +static bool access_gic_sre(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + const struct coproc_reg *r) +{ + if (p->is_write) + return ignore_write(vcpu, p); + + *vcpu_reg(vcpu, p->Rt1) = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre; + + return true; +} + /* * We could trap ID_DFR0 and tell the guest we don't support performance * monitoring. Unfortunately the patch to make the kernel check ID_DFR0 was @@ -361,10 +390,16 @@ static const struct coproc_reg cp15_regs[] = { { CRn(10), CRm( 3), Op1( 0), Op2( 1), is32, access_vm_reg, reset_unknown, c10_AMAIR1}, + /* ICC_SGI1R */ + { CRm64(12), Op1( 0), is64, access_gic_sgi}, + /* VBAR: swapped by interrupt.S. */ { CRn(12), CRm( 0), Op1( 0), Op2( 0), is32, NULL, reset_val, c12_VBAR, 0x00000000 }, + /* ICC_SRE */ + { CRn(12), CRm(12), Op1( 0), Op2(5), is32, access_gic_sre }, + /* CONTEXTIDR/TPIDRURW/TPIDRURO/TPIDRPRW: swapped by interrupt.S. */ { CRn(13), CRm( 0), Op1( 0), Op2( 1), is32, access_vm_reg, reset_val, c13_CID, 0x00000000 }, diff --git a/arch/arm/kvm/hyp/Makefile b/arch/arm/kvm/hyp/Makefile index 8dfa5f7..3023bb5 100644 --- a/arch/arm/kvm/hyp/Makefile +++ b/arch/arm/kvm/hyp/Makefile @@ -5,6 +5,7 @@ KVM=../../../../virt/kvm obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o +obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o obj-$(CONFIG_KVM_ARM_HOST) += tlb.o diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c index 9da16fd..92678b7 100644 --- a/arch/arm/kvm/hyp/switch.c +++ b/arch/arm/kvm/hyp/switch.c @@ -14,6 +14,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ +#include #include #include @@ -83,14 +84,21 @@ static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu) write_sysreg(read_sysreg(MIDR), VPIDR); } + static void __hyp_text __vgic_save_state(struct kvm_vcpu *vcpu) { - __vgic_v2_save_state(vcpu); + if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_save_state(vcpu); + else + __vgic_v2_save_state(vcpu); } static void __hyp_text __vgic_restore_state(struct kvm_vcpu *vcpu) { - __vgic_v2_restore_state(vcpu); + if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_restore_state(vcpu); + else + __vgic_v2_restore_state(vcpu); } static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 7ba9164..6eaf12c 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -19,9 +19,6 @@ if VIRTUALIZATION config KVM_ARM_VGIC_V3_ITS bool -config KVM_ARM_VGIC_V3 - bool - config KVM bool "Kernel-based Virtual Machine (KVM) support" depends on OF @@ -37,7 +34,6 @@ config KVM select KVM_VFIO select HAVE_KVM_EVENTFD select HAVE_KVM_IRQFD - select KVM_ARM_VGIC_V3 select KVM_ARM_VGIC_V3_ITS select KVM_ARM_PMU if HW_PERF_EVENTS select HAVE_KVM_MSI diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 8d22adc..002f092 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -225,7 +225,6 @@ struct vgic_v2_cpu_if { }; struct vgic_v3_cpu_if { -#ifdef CONFIG_KVM_ARM_VGIC_V3 u32 vgic_hcr; u32 vgic_vmcr; u32 vgic_sre; /* Restored only, change ignored */ @@ -235,7 +234,6 @@ struct vgic_v3_cpu_if { u32 vgic_ap0r[4]; u32 vgic_ap1r[4]; u64 vgic_lr[VGIC_V3_MAX_LRS]; -#endif }; struct vgic_cpu { @@ -304,13 +302,7 @@ bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu); void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu); void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu); -#ifdef CONFIG_KVM_ARM_VGIC_V3 void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg); -#else -static inline void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg) -{ -} -#endif /** * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c index 89ef9bc..ce1f4ed 100644 --- a/virt/kvm/arm/vgic/vgic-kvm-device.c +++ b/virt/kvm/arm/vgic/vgic-kvm-device.c @@ -71,7 +71,6 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) addr_ptr = &vgic->vgic_cpu_base; alignment = SZ_4K; break; -#ifdef CONFIG_KVM_ARM_VGIC_V3 case KVM_VGIC_V3_ADDR_TYPE_DIST: type_needed = KVM_DEV_TYPE_ARM_VGIC_V3; addr_ptr = &vgic->vgic_dist_base; @@ -82,7 +81,6 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) addr_ptr = &vgic->vgic_redist_base; alignment = SZ_64K; break; -#endif default: r = -ENODEV; goto out; @@ -219,7 +217,6 @@ int kvm_register_vgic_device(unsigned long type) ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2); break; -#ifdef CONFIG_KVM_ARM_VGIC_V3 case KVM_DEV_TYPE_ARM_VGIC_V3: ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops, KVM_DEV_TYPE_ARM_VGIC_V3); @@ -230,7 +227,6 @@ int kvm_register_vgic_device(unsigned long type) ret = kvm_vgic_register_its_device(); #endif break; -#endif } return ret; @@ -434,8 +430,6 @@ struct kvm_device_ops kvm_arm_vgic_v2_ops = { .has_attr = vgic_v2_has_attr, }; -#ifdef CONFIG_KVM_ARM_VGIC_V3 - static int vgic_v3_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { @@ -478,5 +472,3 @@ struct kvm_device_ops kvm_arm_vgic_v3_ops = { 
.get_attr = vgic_v3_get_attr, .has_attr = vgic_v3_has_attr, }; - -#endif /* CONFIG_KVM_ARM_VGIC_V3 */ diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index 3bad3c5..e18b30d 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -550,11 +550,9 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, case VGIC_V2: len = vgic_v2_init_dist_iodev(io_device); break; -#ifdef CONFIG_KVM_ARM_VGIC_V3 case VGIC_V3: len = vgic_v3_init_dist_iodev(io_device); break; -#endif default: BUG_ON(1); } diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h index 80f92ce..4c34d39 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.h +++ b/virt/kvm/arm/vgic/vgic-mmio.h @@ -162,12 +162,10 @@ unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev); unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev); -#ifdef CONFIG_KVM_ARM_VGIC_V3 u64 vgic_sanitise_outer_cacheability(u64 reg); u64 vgic_sanitise_inner_cacheability(u64 reg); u64 vgic_sanitise_shareability(u64 reg); u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift, u64 (*sanitise_fn)(u64)); -#endif #endif diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 100045f..9d9e014 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -72,7 +72,6 @@ static inline void vgic_get_irq_kref(struct vgic_irq *irq) kref_get(&irq->refcount); } -#ifdef CONFIG_KVM_ARM_VGIC_V3 void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu); void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); @@ -91,60 +90,7 @@ bool vgic_has_its(struct kvm *kvm); int kvm_vgic_register_its_device(void); void vgic_enable_lpis(struct kvm_vcpu *vcpu); int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi); -#endif - #else -static inline void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu) -{ -} - -static inline void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) -{ -} - -static inline void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, - struct vgic_irq *irq, int lr) -{ -} - -static inline void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr) -{ -} - -static inline void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) -{ -} - -static inline -void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) -{ -} - -static inline -void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) -{ -} - -static inline void vgic_v3_enable(struct kvm_vcpu *vcpu) -{ -} - -static inline int vgic_v3_probe(const struct gic_kvm_info *info) -{ - return -ENODEV; -} - -static inline int vgic_v3_map_resources(struct kvm *kvm) -{ - return -ENODEV; -} - -static inline int vgic_register_redist_iodevs(struct kvm *kvm, - gpa_t dist_base_address) -{ - return -ENODEV; -} - static inline int vgic_register_its_iodevs(struct kvm *kvm) { return -ENODEV; -- cgit v0.10.2 From 6134993789b76d85102d68a286b8b82cb53b52cb Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Thu, 22 Sep 2016 09:18:01 +0100 Subject: arm64: KVM: Remove duplicating init code for setting VMID By now both VHE and non-VHE initialisation sequences query supported VMID size. Lets keep only single instance of this code under init_common_resources(). 
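In effect the resulting flow is (heavily trimmed sketch based on the hunks below; error handling and the remaining function bodies omitted):

static int init_common_resources(void)
{
	/* ... existing common setup ... */

	/* set size of VMID supported by CPU */
	kvm_vmid_bits = kvm_get_vmid_bits();
	kvm_info("%d-bit VMID\n", kvm_vmid_bits);

	return 0;
}

static int init_vhe_mode(void)
{
	/* no longer queries the VMID width itself */
	kvm_info("VHE mode initialized successfully\n");
	return 0;
}

init_hyp_mode() loses its copy of the same query in the same way.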
Acked-by: Marc Zyngier Signed-off-by: Vladimir Murzin Signed-off-by: Christoffer Dall diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 75f130e..85a3f90 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -1178,6 +1178,10 @@ static int init_common_resources(void) return -ENOMEM; } + /* set size of VMID supported by CPU */ + kvm_vmid_bits = kvm_get_vmid_bits(); + kvm_info("%d-bit VMID\n", kvm_vmid_bits); + return 0; } @@ -1243,10 +1247,6 @@ static void teardown_hyp_mode(void) static int init_vhe_mode(void) { - /* set size of VMID supported by CPU */ - kvm_vmid_bits = kvm_get_vmid_bits(); - kvm_info("%d-bit VMID\n", kvm_vmid_bits); - kvm_info("VHE mode initialized successfully\n"); return 0; } @@ -1330,10 +1330,6 @@ static int init_hyp_mode(void) } } - /* set size of VMID supported by CPU */ - kvm_vmid_bits = kvm_get_vmid_bits(); - kvm_info("%d-bit VMID\n", kvm_vmid_bits); - kvm_info("Hyp mode initialized successfully\n"); return 0; -- cgit v0.10.2 From bd6c92221d2081045cbc5f13c8c1de77e3ff569e Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 8 Sep 2016 13:41:01 -0500 Subject: config: move x86 kvm_guest.config to a common location MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_guest.config is useful for KVM guests on other arches, and nothing in it appears to be x86 specific, so just move the whole file. Kbuild will find it in either location. Signed-off-by: Rob Herring Cc: Christoffer Dall Cc: Marc Zyngier Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: kvmarm@lists.cs.columbia.edu Cc: kvm@vger.kernel.org Acked-by: Christoffer Dall Signed-off-by: Radim Krčmář diff --git a/arch/x86/configs/kvm_guest.config b/arch/x86/configs/kvm_guest.config deleted file mode 100644 index 9906505..0000000 --- a/arch/x86/configs/kvm_guest.config +++ /dev/null @@ -1,31 +0,0 @@ -CONFIG_NET=y -CONFIG_NET_CORE=y -CONFIG_NETDEVICES=y -CONFIG_BLOCK=y -CONFIG_BLK_DEV=y -CONFIG_NETWORK_FILESYSTEMS=y -CONFIG_INET=y -CONFIG_TTY=y -CONFIG_SERIAL_8250=y -CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_IP_PNP=y -CONFIG_IP_PNP_DHCP=y -CONFIG_BINFMT_ELF=y -CONFIG_PCI=y -CONFIG_PCI_MSI=y -CONFIG_DEBUG_KERNEL=y -CONFIG_VIRTUALIZATION=y -CONFIG_HYPERVISOR_GUEST=y -CONFIG_PARAVIRT=y -CONFIG_KVM_GUEST=y -CONFIG_VIRTIO=y -CONFIG_VIRTIO_PCI=y -CONFIG_VIRTIO_BLK=y -CONFIG_VIRTIO_CONSOLE=y -CONFIG_VIRTIO_NET=y -CONFIG_9P_FS=y -CONFIG_NET_9P=y -CONFIG_NET_9P_VIRTIO=y -CONFIG_SCSI_LOWLEVEL=y -CONFIG_SCSI_VIRTIO=y -CONFIG_VIRTIO_INPUT=y diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config new file mode 100644 index 0000000..9906505 --- /dev/null +++ b/kernel/configs/kvm_guest.config @@ -0,0 +1,31 @@ +CONFIG_NET=y +CONFIG_NET_CORE=y +CONFIG_NETDEVICES=y +CONFIG_BLOCK=y +CONFIG_BLK_DEV=y +CONFIG_NETWORK_FILESYSTEMS=y +CONFIG_INET=y +CONFIG_TTY=y +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_BINFMT_ELF=y +CONFIG_PCI=y +CONFIG_PCI_MSI=y +CONFIG_DEBUG_KERNEL=y +CONFIG_VIRTUALIZATION=y +CONFIG_HYPERVISOR_GUEST=y +CONFIG_PARAVIRT=y +CONFIG_KVM_GUEST=y +CONFIG_VIRTIO=y +CONFIG_VIRTIO_PCI=y +CONFIG_VIRTIO_BLK=y +CONFIG_VIRTIO_CONSOLE=y +CONFIG_VIRTIO_NET=y +CONFIG_9P_FS=y +CONFIG_NET_9P=y +CONFIG_NET_9P_VIRTIO=y +CONFIG_SCSI_LOWLEVEL=y +CONFIG_SCSI_VIRTIO=y +CONFIG_VIRTIO_INPUT=y -- cgit v0.10.2 From ce836c2974dd9f1ebeb73882cec2a83b8830c749 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 8 Sep 2016 13:41:02 -0500 Subject: kvmconfig: add virtio-gpu to config fragment MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit virtio-gpu is used for VMs, so add it to the kvm config. Signed-off-by: Rob Herring Cc: Christoffer Dall Cc: Marc Zyngier Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: kvmarm@lists.cs.columbia.edu Cc: kvm@vger.kernel.org [expanded "frag" to "fragment" in summary] Signed-off-by: Radim Krčmář diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config index 9906505..8d96437 100644 --- a/kernel/configs/kvm_guest.config +++ b/kernel/configs/kvm_guest.config @@ -29,3 +29,4 @@ CONFIG_NET_9P_VIRTIO=y CONFIG_SCSI_LOWLEVEL=y CONFIG_SCSI_VIRTIO=y CONFIG_VIRTIO_INPUT=y +CONFIG_DRM_VIRTIO_GPU=y -- cgit v0.10.2 From c83b6d1594bfff8cdb95becb338b8fe9e3473c8d Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 6 Sep 2016 17:20:33 +0800 Subject: KVM: nVMX: Fix reload apic access page warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit WARNING: CPU: 1 PID: 4230 at kernel/sched/core.c:7564 __might_sleep+0x7e/0x80 do not call blocking ops when !TASK_RUNNING; state=1 set at [] prepare_to_swait+0x39/0xa0 CPU: 1 PID: 4230 Comm: qemu-system-x86 Not tainted 4.8.0-rc5+ #47 Call Trace: dump_stack+0x99/0xd0 __warn+0xd1/0xf0 warn_slowpath_fmt+0x4f/0x60 ? prepare_to_swait+0x39/0xa0 ? prepare_to_swait+0x39/0xa0 __might_sleep+0x7e/0x80 __gfn_to_pfn_memslot+0x156/0x480 [kvm] gfn_to_pfn+0x2a/0x30 [kvm] gfn_to_page+0xe/0x20 [kvm] kvm_vcpu_reload_apic_access_page+0x32/0xa0 [kvm] nested_vmx_vmexit+0x765/0xca0 [kvm_intel] ? _raw_spin_unlock_irqrestore+0x36/0x80 vmx_check_nested_events+0x49/0x1f0 [kvm_intel] kvm_arch_vcpu_runnable+0x2d/0xe0 [kvm] kvm_vcpu_check_block+0x12/0x60 [kvm] kvm_vcpu_block+0x94/0x4c0 [kvm] kvm_arch_vcpu_ioctl_run+0x619/0x1aa0 [kvm] ? kvm_arch_vcpu_ioctl_run+0xdf1/0x1aa0 [kvm] kvm_vcpu_ioctl+0x2d3/0x7c0 [kvm] =============================== [ INFO: suspicious RCU usage. ] 4.8.0-rc5+ #47 Not tainted ------------------------------- ./include/linux/kvm_host.h:535 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 1 lock held by qemu-system-x86/4230: #0: (&vcpu->mutex){+.+.+.}, at: [] vcpu_load+0x1c/0x60 [kvm] stack backtrace: CPU: 1 PID: 4230 Comm: qemu-system-x86 Not tainted 4.8.0-rc5+ #47 Call Trace: dump_stack+0x99/0xd0 lockdep_rcu_suspicious+0xe7/0x120 gfn_to_memslot+0x12a/0x140 [kvm] gfn_to_pfn+0x12/0x30 [kvm] gfn_to_page+0xe/0x20 [kvm] kvm_vcpu_reload_apic_access_page+0x32/0xa0 [kvm] nested_vmx_vmexit+0x765/0xca0 [kvm_intel] ? _raw_spin_unlock_irqrestore+0x36/0x80 vmx_check_nested_events+0x49/0x1f0 [kvm_intel] kvm_arch_vcpu_runnable+0x2d/0xe0 [kvm] kvm_vcpu_check_block+0x12/0x60 [kvm] kvm_vcpu_block+0x94/0x4c0 [kvm] kvm_arch_vcpu_ioctl_run+0x619/0x1aa0 [kvm] ? kvm_arch_vcpu_ioctl_run+0xdf1/0x1aa0 [kvm] kvm_vcpu_ioctl+0x2d3/0x7c0 [kvm] ? __fget+0xfd/0x210 ? __lock_is_held+0x54/0x70 do_vfs_ioctl+0x96/0x6a0 ? __fget+0x11c/0x210 ? __fget+0x5/0x210 SyS_ioctl+0x79/0x90 do_syscall_64+0x81/0x220 entry_SYSCALL64_slow_path+0x25/0x25 These can be triggered by running kvm-unit-test: ./x86-run x86/vmx.flat The nested preemption timer is based on hrtimer which is started on L2 entry, stopped on L2 exit and evaluated via the new check_nested_events hook. The current logic adds vCPU to a simple waitqueue (TASK_INTERRUPTIBLE) if need to yield pCPU and w/o holding srcu read lock when accesses memslots, both can be in nested preemption timer evaluation path which results in the warning above. 
This patch fix it by leveraging request bit to async reload APIC access page before vmentry in order to avoid to reload directly during the nested preemption timer evaluation, it is safe since the vmcs01 is loaded and current is nested vmexit. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Yunhong Jiang Signed-off-by: Wanpeng Li Signed-off-by: Radim Krčmář diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fe2b144..122047a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10800,7 +10800,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, * We are now running in L2, mmu_notifier will force to reload the * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. */ - kvm_vcpu_reload_apic_access_page(vcpu); + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); /* * Exiting from L2 to L1, we're now back to L1 which thinks it just -- cgit v0.10.2 From f6e90f9e0ee3df5010e7f378e669d6838b9a25bf Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 22 Sep 2016 07:43:25 +0800 Subject: KVM: VMX: Enable MSR-BASED TPR shadow even if APICv is inactive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I observed that kvmvapic(to optimize flexpriority=N or AMD) is used to boost TPR access when testing kvm-unit-test/eventinj.flat tpr case on my haswell desktop (w/ flexpriority, w/o APICv). Commit (8d14695f9542 x86, apicv: add virtual x2apic support) disable virtual x2apic mode completely if w/o APICv, and the author also told me that windows guest can't enter into x2apic mode when he developed the APICv feature several years ago. However, it is not truth currently, Interrupt Remapping and vIOMMU is added to qemu and the developers from Intel test windows 8 can work in x2apic mode w/ Interrupt Remapping enabled recently. This patch enables TPR shadow for virtual x2apic mode to boost windows guest in x2apic mode even if w/o APICv. Can pass the kvm-unit-test. 
Suggested-by: Radim Krčmář Suggested-by: Wincy Van Reviewed-by: Radim Krčmář Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Wincy Van Cc: Yang Zhang Signed-off-by: Wanpeng Li Signed-off-by: Radim Krčmář diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 122047a..5a5ab206 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -927,6 +927,8 @@ static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; static unsigned long *vmx_msr_bitmap_legacy_x2apic; static unsigned long *vmx_msr_bitmap_longmode_x2apic; +static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive; +static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive; static unsigned long *vmx_vmread_bitmap; static unsigned long *vmx_vmwrite_bitmap; @@ -2524,10 +2526,17 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) else if (cpu_has_secondary_exec_ctrls() && (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { - if (is_long_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode_x2apic; - else - msr_bitmap = vmx_msr_bitmap_legacy_x2apic; + if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) { + if (is_long_mode(vcpu)) + msr_bitmap = vmx_msr_bitmap_longmode_x2apic; + else + msr_bitmap = vmx_msr_bitmap_legacy_x2apic; + } else { + if (is_long_mode(vcpu)) + msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive; + else + msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive; + } } else { if (is_long_mode(vcpu)) msr_bitmap = vmx_msr_bitmap_longmode; @@ -4682,28 +4691,49 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) msr, MSR_TYPE_R | MSR_TYPE_W); } -static void vmx_enable_intercept_msr_read_x2apic(u32 msr) +static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) { - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_R); - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_R); + if (apicv_active) { + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, + msr, MSR_TYPE_R); + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, + msr, MSR_TYPE_R); + } else { + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, + msr, MSR_TYPE_R); + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, + msr, MSR_TYPE_R); + } } -static void vmx_disable_intercept_msr_read_x2apic(u32 msr) +static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_R); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_R); + if (apicv_active) { + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, + msr, MSR_TYPE_R); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, + msr, MSR_TYPE_R); + } else { + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, + msr, MSR_TYPE_R); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, + msr, MSR_TYPE_R); + } } -static void vmx_disable_intercept_msr_write_x2apic(u32 msr) +static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active) { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_W); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_W); + if (apicv_active) { + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, + msr, MSR_TYPE_W); + 
__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, + msr, MSR_TYPE_W); + } else { + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, + msr, MSR_TYPE_W); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, + msr, MSR_TYPE_W); + } } static bool vmx_get_enable_apicv(void) @@ -6364,22 +6394,32 @@ static __init int hardware_setup(void) if (!vmx_msr_bitmap_legacy_x2apic) goto out2; + vmx_msr_bitmap_legacy_x2apic_apicv_inactive = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive) + goto out3; + vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_longmode) - goto out3; + goto out4; vmx_msr_bitmap_longmode_x2apic = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_longmode_x2apic) - goto out4; + goto out5; + + vmx_msr_bitmap_longmode_x2apic_apicv_inactive = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive) + goto out6; vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmread_bitmap) - goto out6; + goto out7; vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmwrite_bitmap) - goto out7; + goto out8; memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); @@ -6398,7 +6438,7 @@ static __init int hardware_setup(void) if (setup_vmcs_config(&vmcs_config) < 0) { r = -EIO; - goto out8; + goto out9; } if (boot_cpu_has(X86_FEATURE_NX)) @@ -6465,20 +6505,35 @@ static __init int hardware_setup(void) vmx_msr_bitmap_legacy, PAGE_SIZE); memcpy(vmx_msr_bitmap_longmode_x2apic, vmx_msr_bitmap_longmode, PAGE_SIZE); + memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, + vmx_msr_bitmap_legacy, PAGE_SIZE); + memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, + vmx_msr_bitmap_longmode, PAGE_SIZE); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ + /* + * enable_apicv && kvm_vcpu_apicv_active() + */ for (msr = 0x800; msr <= 0x8ff; msr++) - vmx_disable_intercept_msr_read_x2apic(msr); + vmx_disable_intercept_msr_read_x2apic(msr, true); /* TMCCT */ - vmx_enable_intercept_msr_read_x2apic(0x839); + vmx_enable_intercept_msr_read_x2apic(0x839, true); /* TPR */ - vmx_disable_intercept_msr_write_x2apic(0x808); + vmx_disable_intercept_msr_write_x2apic(0x808, true); /* EOI */ - vmx_disable_intercept_msr_write_x2apic(0x80b); + vmx_disable_intercept_msr_write_x2apic(0x80b, true); /* SELF-IPI */ - vmx_disable_intercept_msr_write_x2apic(0x83f); + vmx_disable_intercept_msr_write_x2apic(0x83f, true); + + /* + * (enable_apicv && !kvm_vcpu_apicv_active()) || + * !enable_apicv + */ + /* TPR */ + vmx_disable_intercept_msr_read_x2apic(0x808, false); + vmx_disable_intercept_msr_write_x2apic(0x808, false); if (enable_ept) { kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, @@ -6525,14 +6580,18 @@ static __init int hardware_setup(void) return alloc_kvm_area(); -out8: +out9: free_page((unsigned long)vmx_vmwrite_bitmap); -out7: +out8: free_page((unsigned long)vmx_vmread_bitmap); +out7: + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); out6: free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); -out4: +out5: free_page((unsigned long)vmx_msr_bitmap_longmode); +out4: + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); out3: free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); out2: @@ -6548,7 +6607,9 @@ out: static __exit void hardware_unsetup(void) { free_page((unsigned 
long)vmx_msr_bitmap_legacy_x2apic); + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); free_page((unsigned long)vmx_msr_bitmap_legacy); free_page((unsigned long)vmx_msr_bitmap_longmode); free_page((unsigned long)vmx_io_bitmap_b); @@ -8439,12 +8500,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) return; } - /* - * There is not point to enable virtualize x2apic without enable - * apicv - */ - if (!cpu_has_vmx_virtualize_x2apic_mode() || - !kvm_vcpu_apicv_active(vcpu)) + if (!cpu_has_vmx_virtualize_x2apic_mode()) return; if (!cpu_need_tpr_shadow(vcpu)) -- cgit v0.10.2 From c5a6d5f7faad8549bb5ff7e3e5792e33933c5b9f Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 22 Sep 2016 17:55:54 +0800 Subject: KVM: nVMX: Fix the NMI IDT-vectoring handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run kvm-unit-tests/eventinj.flat in L1: Sending NMI to self After NMI to self FAIL: NMI This test scenario is to test whether VMM can handle NMI IDT-vectoring info correctly. At the beginning, L2 writes LAPIC to send a self NMI, the EPT page tables on both L1 and L0 are empty so: - The L2 accesses memory can generate EPT violation which can be intercepted by L0. The EPT violation vmexit occurred during delivery of this NMI, and the NMI info is recorded in vmcs02's IDT-vectoring info. - L0 walks L1's EPT12 and L0 sees the mapping is invalid, it injects the EPT violation into L1. The vmcs02's IDT-vectoring info is reflected to vmcs12's IDT-vectoring info since it is a nested vmexit. - L1 receives the EPT violation, then fixes its EPT12. - L1 executes VMRESUME to resume L2 which generates vmexit and causes L1 exits to L0. - L0 emulates VMRESUME which is called from L1, then return to L2. L0 merges the requirement of vmcs12's IDT-vectoring info and injects it to L2 through vmcs02. - The L2 re-executes the fault instruction and cause EPT violation again. - Since the L1's EPT12 is valid, L0 can fix its EPT02 - L0 resume L2 The EPT violation vmexit occurred during delivery of this NMI again, and the NMI info is recorded in vmcs02's IDT-vectoring info. L0 should inject the NMI through vmentry event injection since it is caused by EPT02's EPT violation. However, vmx_inject_nmi() refuses to inject NMI from IDT-vectoring info if vCPU is in guest mode, this patch fix it by permitting to inject NMI from IDT-vectoring if it is the L0's responsibility to inject NMI from IDT-vectoring info to L2. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Jan Kiszka Cc: Bandan Das Signed-off-by: Wanpeng Li Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5a5ab206..2577183 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5313,29 +5313,30 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (is_guest_mode(vcpu)) - return; + if (!is_guest_mode(vcpu)) { + if (!cpu_has_virtual_nmis()) { + /* + * Tracking the NMI-blocked state in software is built upon + * finding the next open IRQ window. This, in turn, depends on + * well-behaving guests: They have to keep IRQs disabled at + * least as long as the NMI handler runs. Otherwise we may + * cause NMI nesting, maybe breaking the guest. But as this is + * highly unlikely, we can live with the residual risk. 
+ */ + vmx->soft_vnmi_blocked = 1; + vmx->vnmi_blocked_time = 0; + } - if (!cpu_has_virtual_nmis()) { - /* - * Tracking the NMI-blocked state in software is built upon - * finding the next open IRQ window. This, in turn, depends on - * well-behaving guests: They have to keep IRQs disabled at - * least as long as the NMI handler runs. Otherwise we may - * cause NMI nesting, maybe breaking the guest. But as this is - * highly unlikely, we can live with the residual risk. - */ - vmx->soft_vnmi_blocked = 1; - vmx->vnmi_blocked_time = 0; + ++vcpu->stat.nmi_injections; + vmx->nmi_known_unmasked = false; } - ++vcpu->stat.nmi_injections; - vmx->nmi_known_unmasked = false; if (vmx->rmode.vm86_active) { if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); } -- cgit v0.10.2 From 3d9cd95f90b2987ef95182a4340a9150e06c4253 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 23 Sep 2016 14:23:43 +0100 Subject: ARM: gic-v3: Work around definition of gic_write_bpr1 A new accessor for gic_write_bpr1 is added to arch_gicv3.h in 4.9, whilst the CP15 accessors are redifined in a separate branch. This leads to a horrible clash, where the new accessor ends up with a crap "asm volatile" definition. Work around this by carrying our own definition of gic_write_bpr1, creating a small conflict which will be obvious to resolve. Signed-off-by: Marc Zyngier diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h index 996848e..1fee657 100644 --- a/arch/arm/include/asm/arch_gicv3.h +++ b/arch/arm/include/asm/arch_gicv3.h @@ -216,6 +216,15 @@ static inline void gic_write_sre(u32 val) isb(); } +static inline void gic_write_bpr1(u32 val) +{ +#if defined(__write_sysreg) && defined(ICC_BPR1) + write_sysreg(val, ICC_BPR1); +#else + asm volatile("mcr " __stringify(ICC_BPR1) : : "r" (val)); +#endif +} + /* * Even in 32bit systems that use LPAE, there is no guarantee that the I/O * interface provides true 64bit atomic accesses, so using strd/ldrd doesn't -- cgit v0.10.2 From 88b02cf97bb7e742db3e31671d54177e3e19fd89 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 15 Sep 2016 13:42:52 +1000 Subject: KVM: PPC: Book3S: Treat VTB as a per-subcore register, not per-thread POWER8 has one virtual timebase (VTB) register per subcore, not one per CPU thread. The HV KVM code currently treats VTB as a per-thread register, which can lead to spurious soft lockup messages from guests which use the VTB as the time source for the soft lockup detector. (CPUs before POWER8 did not have the VTB register.) For HV KVM, this fixes the problem by making only the primary thread in each virtual core save and restore the VTB value. With this, the VTB state becomes part of the kvmppc_vcore structure. This also means that "piggybacking" of multiple virtual cores onto one subcore is not possible on POWER8, because then the virtual cores would share a single VTB register. PR KVM emulates a VTB register, which is per-vcpu because PR KVM has no notion of CPU threads or SMT. For PR KVM we move the VTB state into the kvmppc_vcpu_book3s struct. 
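The state movement boils down to where the field lives (sketch with unrelated members elided; names and types as in the hunks below):

struct kvmppc_vcore {
	/* ... */
	ulong dpdes;	/* doorbell state (POWER8), shared by the subcore */
	ulong vtb;	/* virtual timebase, likewise one per subcore (HV KVM) */
	/* ... */
};

struct kvmppc_vcpu_book3s {
	/* ... */
	u64 vtb;	/* PR KVM's emulated VTB, kept per vcpu */
	/* ... */
};

On the HV side the assembly then saves and restores SPRN_VTB next to the already-shared DPDES register, so only one thread per subcore touches it.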
Cc: stable@vger.kernel.org # v3.14+ Reported-by: Thomas Huth Tested-by: Thomas Huth Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index cef2b89..5cf306a 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -101,6 +101,7 @@ struct kvmppc_vcore { u32 arch_compat; ulong pcr; ulong dpdes; /* doorbell state (POWER8) */ + ulong vtb; /* virtual timebase */ ulong conferring_threads; unsigned int halt_poll_ns; }; @@ -119,6 +120,7 @@ struct kvmppc_vcpu_book3s { u64 sdr1; u64 hior; u64 msr_mask; + u64 vtb; #ifdef CONFIG_PPC_BOOK3S_32 u32 vsid_pool[VSID_POOL_SIZE]; u32 vsid_next; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index ed30d2e..28350a2 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -475,7 +475,6 @@ struct kvm_vcpu_arch { ulong purr; ulong spurr; ulong ic; - ulong vtb; ulong dscr; ulong amr; ulong uamor; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index b89d14c..a51ae9b 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -506,7 +506,6 @@ int main(void) DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr)); DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr)); DEFINE(VCPU_IC, offsetof(struct kvm_vcpu, arch.ic)); - DEFINE(VCPU_VTB, offsetof(struct kvm_vcpu, arch.vtb)); DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr)); DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr)); DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor)); @@ -557,6 +556,7 @@ int main(void) DEFINE(VCORE_LPCR, offsetof(struct kvmppc_vcore, lpcr)); DEFINE(VCORE_PCR, offsetof(struct kvmppc_vcore, pcr)); DEFINE(VCORE_DPDES, offsetof(struct kvmppc_vcore, dpdes)); + DEFINE(VCORE_VTB, offsetof(struct kvmppc_vcore, vtb)); DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige)); DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv)); DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb)); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index ba231a1..b6952dd 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -599,9 +599,6 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_BESCR: *val = get_reg_val(id, vcpu->arch.bescr); break; - case KVM_REG_PPC_VTB: - *val = get_reg_val(id, vcpu->arch.vtb); - break; case KVM_REG_PPC_IC: *val = get_reg_val(id, vcpu->arch.ic); break; @@ -673,9 +670,6 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_BESCR: vcpu->arch.bescr = set_reg_val(id, *val); break; - case KVM_REG_PPC_VTB: - vcpu->arch.vtb = set_reg_val(id, *val); - break; case KVM_REG_PPC_IC: vcpu->arch.ic = set_reg_val(id, *val); break; diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 2afdb9c..7d9e4ed 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -579,7 +579,7 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val *spr_val = vcpu->arch.spurr; break; case SPRN_VTB: - *spr_val = vcpu->arch.vtb; + *spr_val = to_book3s(vcpu)->vtb; break; case SPRN_IC: *spr_val = vcpu->arch.ic; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 9b3bba6..c4f8971 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1199,6 +1199,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_DPDES: 
*val = get_reg_val(id, vcpu->arch.vcore->dpdes); break; + case KVM_REG_PPC_VTB: + *val = get_reg_val(id, vcpu->arch.vcore->vtb); + break; case KVM_REG_PPC_DAWR: *val = get_reg_val(id, vcpu->arch.dawr); break; @@ -1391,6 +1394,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_DPDES: vcpu->arch.vcore->dpdes = set_reg_val(id, *val); break; + case KVM_REG_PPC_VTB: + vcpu->arch.vcore->vtb = set_reg_val(id, *val); + break; case KVM_REG_PPC_DAWR: vcpu->arch.dawr = set_reg_val(id, *val); break; @@ -2213,9 +2219,11 @@ static bool can_piggyback_subcore(struct kvmppc_vcore *pvc, pvc->lpcr != vc->lpcr) return false; - /* P8 guest with > 1 thread per core would see wrong TIR value */ - if (cpu_has_feature(CPU_FTR_ARCH_207S) && - (vc->num_threads > 1 || pvc->num_threads > 1)) + /* + * P8 guests can't do piggybacking, because then the + * VTB would be shared between the vcpus. + */ + if (cpu_has_feature(CPU_FTR_ARCH_207S)) return false; n_thr = cip->subcore_threads[sub] + pvc->num_threads; diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 7cc924b..c3c1d1b 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -644,9 +644,11 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S) 38: BEGIN_FTR_SECTION - /* DPDES is shared between threads */ + /* DPDES and VTB are shared between threads */ ld r8, VCORE_DPDES(r5) + ld r7, VCORE_VTB(r5) mtspr SPRN_DPDES, r8 + mtspr SPRN_VTB, r7 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* Mark the subcore state as inside guest */ @@ -806,10 +808,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) mtspr SPRN_CIABR, r7 mtspr SPRN_TAR, r8 ld r5, VCPU_IC(r4) - ld r6, VCPU_VTB(r4) - mtspr SPRN_IC, r5 - mtspr SPRN_VTB, r6 ld r8, VCPU_EBBHR(r4) + mtspr SPRN_IC, r5 mtspr SPRN_EBBHR, r8 ld r5, VCPU_EBBRR(r4) ld r6, VCPU_BESCR(r4) @@ -1334,10 +1334,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) stw r6, VCPU_PSPB(r9) std r7, VCPU_FSCR(r9) mfspr r5, SPRN_IC - mfspr r6, SPRN_VTB mfspr r7, SPRN_TAR std r5, VCPU_IC(r9) - std r6, VCPU_VTB(r9) std r7, VCPU_TAR(r9) mfspr r8, SPRN_EBBHR std r8, VCPU_EBBHR(r9) @@ -1564,9 +1562,11 @@ kvmhv_switch_to_host: isync BEGIN_FTR_SECTION - /* DPDES is shared between threads */ + /* DPDES and VTB are shared between threads */ mfspr r7, SPRN_DPDES + mfspr r8, SPRN_VTB std r7, VCORE_DPDES(r5) + std r8, VCORE_VTB(r5) /* clear DPDES so we don't get guest doorbells in the host */ li r8, 0 mtspr SPRN_DPDES, r8 diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index e76f79a..69ebd70 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -226,7 +226,7 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu, */ vcpu->arch.purr += get_tb() - vcpu->arch.entry_tb; vcpu->arch.spurr += get_tb() - vcpu->arch.entry_tb; - vcpu->arch.vtb += get_vtb() - vcpu->arch.entry_vtb; + to_book3s(vcpu)->vtb += get_vtb() - vcpu->arch.entry_vtb; if (cpu_has_feature(CPU_FTR_ARCH_207S)) vcpu->arch.ic += mfspr(SPRN_IC) - vcpu->arch.entry_ic; svcpu->in_use = false; @@ -1361,6 +1361,9 @@ static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_HIOR: *val = get_reg_val(id, to_book3s(vcpu)->hior); break; + case KVM_REG_PPC_VTB: + *val = get_reg_val(id, to_book3s(vcpu)->vtb); + break; case KVM_REG_PPC_LPCR: case KVM_REG_PPC_LPCR_64: /* @@ -1397,6 +1400,9 @@ static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, to_book3s(vcpu)->hior = set_reg_val(id, *val); to_book3s(vcpu)->hior_explicit = true; 
break; + case KVM_REG_PPC_VTB: + to_book3s(vcpu)->vtb = set_reg_val(id, *val); + break; case KVM_REG_PPC_LPCR: case KVM_REG_PPC_LPCR_64: kvmppc_set_lpcr_pr(vcpu, set_reg_val(id, *val)); -- cgit v0.10.2 From b009031f74da1c71f306bbe410da78bd9d848669 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 15 Sep 2016 16:27:41 +1000 Subject: KVM: PPC: Book3S HV: Take out virtual core piggybacking code This takes out the code that arranges to run two (or more) virtual cores on a single subcore when possible, that is, when both vcores are from the same VM, the VM is configured with one CPU thread per virtual core, and all the per-subcore registers have the same value in each vcore. Since the VTB (virtual timebase) is a per-subcore register, and will almost always differ between vcores, this code is disabled on POWER8 machines, meaning that it is only usable on POWER7 machines (which don't have VTB). Given the tiny number of POWER7 machines which have firmware that allows them to run HV KVM, the benefit of simplifying the code outweighs the loss of this feature on POWER7 machines. Tested-by: Thomas Huth Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index c4f8971..3686471 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2103,66 +2103,6 @@ static void init_master_vcore(struct kvmppc_vcore *vc) vc->conferring_threads = 0; } -/* - * See if the existing subcores can be split into 3 (or fewer) subcores - * of at most two threads each, so we can fit in another vcore. This - * assumes there are at most two subcores and at most 6 threads in total. - */ -static bool can_split_piggybacked_subcores(struct core_info *cip) -{ - int sub, new_sub; - int large_sub = -1; - int thr; - int n_subcores = cip->n_subcores; - struct kvmppc_vcore *vc, *vcnext; - struct kvmppc_vcore *master_vc = NULL; - - for (sub = 0; sub < cip->n_subcores; ++sub) { - if (cip->subcore_threads[sub] <= 2) - continue; - if (large_sub >= 0) - return false; - large_sub = sub; - vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore, - preempt_list); - if (vc->num_threads > 2) - return false; - n_subcores += (cip->subcore_threads[sub] - 1) >> 1; - } - if (large_sub < 0 || !subcore_config_ok(n_subcores + 1, 2)) - return false; - - /* - * Seems feasible, so go through and move vcores to new subcores. - * Note that when we have two or more vcores in one subcore, - * all those vcores must have only one thread each. 
- */ - new_sub = cip->n_subcores; - thr = 0; - sub = large_sub; - list_for_each_entry_safe(vc, vcnext, &cip->vcs[sub], preempt_list) { - if (thr >= 2) { - list_del(&vc->preempt_list); - list_add_tail(&vc->preempt_list, &cip->vcs[new_sub]); - /* vc->num_threads must be 1 */ - if (++cip->subcore_threads[new_sub] == 1) { - cip->subcore_vm[new_sub] = vc->kvm; - init_master_vcore(vc); - master_vc = vc; - ++cip->n_subcores; - } else { - vc->master_vcore = master_vc; - ++new_sub; - } - } - thr += vc->num_threads; - } - cip->subcore_threads[large_sub] = 2; - cip->max_subcore_threads = 2; - - return true; -} - static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) { int n_threads = vc->num_threads; @@ -2173,23 +2113,9 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) if (n_threads < cip->max_subcore_threads) n_threads = cip->max_subcore_threads; - if (subcore_config_ok(cip->n_subcores + 1, n_threads)) { - cip->max_subcore_threads = n_threads; - } else if (cip->n_subcores <= 2 && cip->total_threads <= 6 && - vc->num_threads <= 2) { - /* - * We may be able to fit another subcore in by - * splitting an existing subcore with 3 or 4 - * threads into two 2-thread subcores, or one - * with 5 or 6 threads into three subcores. - * We can only do this if those subcores have - * piggybacked virtual cores. - */ - if (!can_split_piggybacked_subcores(cip)) - return false; - } else { + if (!subcore_config_ok(cip->n_subcores + 1, n_threads)) return false; - } + cip->max_subcore_threads = n_threads; sub = cip->n_subcores; ++cip->n_subcores; @@ -2203,45 +2129,6 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) return true; } -static bool can_piggyback_subcore(struct kvmppc_vcore *pvc, - struct core_info *cip, int sub) -{ - struct kvmppc_vcore *vc; - int n_thr; - - vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore, - preempt_list); - - /* require same VM and same per-core reg values */ - if (pvc->kvm != vc->kvm || - pvc->tb_offset != vc->tb_offset || - pvc->pcr != vc->pcr || - pvc->lpcr != vc->lpcr) - return false; - - /* - * P8 guests can't do piggybacking, because then the - * VTB would be shared between the vcpus. - */ - if (cpu_has_feature(CPU_FTR_ARCH_207S)) - return false; - - n_thr = cip->subcore_threads[sub] + pvc->num_threads; - if (n_thr > cip->max_subcore_threads) { - if (!subcore_config_ok(cip->n_subcores, n_thr)) - return false; - cip->max_subcore_threads = n_thr; - } - - cip->total_threads += pvc->num_threads; - cip->subcore_threads[sub] = n_thr; - pvc->master_vcore = vc; - list_del(&pvc->preempt_list); - list_add_tail(&pvc->preempt_list, &cip->vcs[sub]); - - return true; -} - /* * Work out whether it is possible to piggyback the execution of * vcore *pvc onto the execution of the other vcores described in *cip. 
@@ -2249,19 +2136,10 @@ static bool can_piggyback_subcore(struct kvmppc_vcore *pvc, static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip, int target_threads) { - int sub; - if (cip->total_threads + pvc->num_threads > target_threads) return false; - for (sub = 0; sub < cip->n_subcores; ++sub) - if (cip->subcore_threads[sub] && - can_piggyback_subcore(pvc, cip, sub)) - return true; - if (can_dynamic_split(pvc, cip)) - return true; - - return false; + return can_dynamic_split(pvc, cip); } static void prepare_threads(struct kvmppc_vcore *vc) -- cgit v0.10.2 From ac0e89bb4744d3882ccd275f2416d9ce22f4e1e7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 14 Jul 2016 13:15:46 +0300 Subject: KVM: PPC: BookE: Fix a sanity check We use logical negate where bitwise negate was intended. It means that we never return -EINVAL here. Fixes: ce11e48b7fdd ('KVM: PPC: E500: Add userspace debug stub support') Signed-off-by: Dan Carpenter Reviewed-by: Alexander Graf Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 02b4672..df3f270 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -2038,7 +2038,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, if (type == KVMPPC_DEBUG_NONE) continue; - if (type & !(KVMPPC_DEBUG_WATCH_READ | + if (type & ~(KVMPPC_DEBUG_WATCH_READ | KVMPPC_DEBUG_WATCH_WRITE | KVMPPC_DEBUG_BREAKPOINT)) return -EINVAL; -- cgit v0.10.2 From 4f053d06dce6cbc82806d9baff24fb3add877205 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 16 Sep 2016 17:25:50 +1000 Subject: KVM: PPC: Book3S: Remove duplicate setting of the B field in tlbie Remove duplicate setting of the "B" field when doing a tlbie(l). In compute_tlbie_rb(), the "B" field is set again just before returning the rb value to be used for tlbie(l). Signed-off-by: Balbir Singh Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 4ffd5a1..8482921 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -125,7 +125,6 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, /* This covers 14..54 bits of va*/ rb = (v & ~0x7fUL) << 16; /* AVA field */ - rb |= (v >> HPTE_V_SSIZE_SHIFT) << 8; /* B field */ /* * AVA in v had cleared lower 23 bits. We need to derive * that from pteg index @@ -177,7 +176,7 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, break; } } - rb |= (v >> 54) & 0x300; /* B field */ + rb |= (v >> HPTE_V_SSIZE_SHIFT) << 8; /* B field */ return rb; } -- cgit v0.10.2 From 2365f6b67c0d786b9d1cb1268575e42807fe47e2 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 21 Sep 2016 13:53:46 +0200 Subject: KVM: PPC: Book3S PR: Support 64kB page size on POWER8E and POWER8NVL On POWER8E and POWER8NVL, KVM-PR does not announce support for 64kB page sizes and 1TB segments yet. Looks like this has just been forgotten so far, since there is no reason why this should be different to the normal POWER8 CPUs.
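As an aside on the BookE debug sanity-check fix above: in C, !x evaluates to 0 or 1, so ANDing a debug type with the logically negated flag mask is always 0 and the -EINVAL path can never be taken. A self-contained illustration (flag values made up for the example; the real KVMPPC_DEBUG_* constants live in the PowerPC uapi headers):

#include <stdio.h>

#define WATCH_READ  0x2	/* made-up stand-ins for KVMPPC_DEBUG_* */
#define WATCH_WRITE 0x4
#define BREAKPOINT  0x8

int main(void)
{
	unsigned int type = 0x100;	/* an unsupported type bit */

	/* Old check: !(mask) is 0, so nothing is ever rejected. */
	printf("logical: %u\n", type & !(WATCH_READ | WATCH_WRITE | BREAKPOINT));
	/* Fixed check: ~(mask) keeps the unsupported bits. */
	printf("bitwise: %u\n", type & ~(WATCH_READ | WATCH_WRITE | BREAKPOINT));
	return 0;
}

The first line prints 0 (the sanity check never fires), the second prints 256 (the bogus bit survives the mask, so the kernel can return -EINVAL).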
Signed-off-by: Thomas Huth Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 69ebd70..826c541 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -448,6 +448,8 @@ void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr) case PVR_POWER7: case PVR_POWER7p: case PVR_POWER8: + case PVR_POWER8E: + case PVR_POWER8NVL: vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE | BOOK3S_HFLAG_NEW_TLBIE; break; -- cgit v0.10.2 From fa73c3b25bd8d0d393dc6109a1dba3c2aef0451e Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 21 Sep 2016 15:06:45 +0200 Subject: KVM: PPC: Book3s PR: Allow access to unprivileged MMCR2 register The MMCR2 register is available twice, one time with number 785 (privileged access), and one time with number 769 (unprivileged, but it can be disabled completely). In former times, the Linux kernel was using the unprivileged register 769 only, but since commit 8dd75ccb571f3c92c ("powerpc: Use privileged SPR number for MMCR2"), it uses the privileged register 785 instead. The KVM-PR code then of course also switched to use the SPR 785, but this is causing older guest kernels to crash, since these kernels still access 769 instead. So to support older kernels with KVM-PR again, we have to support register 769 in KVM-PR, too. Fixes: 8dd75ccb571f3c92c48014b3dabd3d51a115ab41 Cc: stable@vger.kernel.org # v3.10+ Signed-off-by: Thomas Huth Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index f69f40f..978dada 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -737,6 +737,7 @@ #define MMCR0_FCHV 0x00000001UL /* freeze conditions in hypervisor mode */ #define SPRN_MMCR1 798 #define SPRN_MMCR2 785 +#define SPRN_UMMCR2 769 #define SPRN_MMCRA 0x312 #define MMCRA_SDSYNC 0x80000000UL /* SDAR synced with SIAR */ #define MMCRA_SDAR_DCACHE_MISS 0x40000000UL diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 7d9e4ed..8359752 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -498,6 +498,7 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) case SPRN_MMCR0: case SPRN_MMCR1: case SPRN_MMCR2: + case SPRN_UMMCR2: #endif break; unprivileged: @@ -640,6 +641,7 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val case SPRN_MMCR0: case SPRN_MMCR1: case SPRN_MMCR2: + case SPRN_UMMCR2: case SPRN_TIR: #endif *spr_val = 0; -- cgit v0.10.2 From 6fe407f2d18a4f94216263f91cb7d1f08fa5887c Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 26 Sep 2016 18:51:47 -0700 Subject: KVM: arm64: Require in-kernel irqchip for PMU support If userspace creates a PMU for the VCPU, but doesn't create an in-kernel irqchip, then we end up in a nasty path where we try to take an uninitialized spinlock, which can lead to all sorts of breakages. Luckily, QEMU always creates the VGIC before the PMU, so we can establish this as ABI and check for the VGIC in the PMU init stage. This can be relaxed at a later time if we want to support PMU with a userspace irqchip. 
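Because this makes the create-VGIC-before-PMU ordering part of the ABI, userspace has to issue the ioctls in that order. A rough sketch of the sequence (error handling omitted; the device type and attribute names match the mainline uAPI, but treat the exact call sequence as illustrative rather than a complete init path):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* vm_fd/vcpu_fd are assumed to come from KVM_CREATE_VM/KVM_CREATE_VCPU. */
static void create_vgic_then_init_pmu(int vm_fd, int vcpu_fd)
{
	struct kvm_create_device vgic = {
		.type = KVM_DEV_TYPE_ARM_VGIC_V3,
	};
	struct kvm_device_attr pmu_init = {
		.group = KVM_ARM_VCPU_PMU_V3_CTRL,
		.attr  = KVM_ARM_VCPU_PMU_V3_INIT,
	};

	/* 1. Create (and, before running the guest, initialize) the
	 *    in-kernel VGIC. */
	ioctl(vm_fd, KVM_CREATE_DEVICE, &vgic);

	/* 2. Only then request PMUv3 initialization; with this patch the
	 *    kernel refuses with -ENODEV if no VGIC is present. */
	ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &pmu_init);
}

QEMU already creates the VGIC first, which is what makes it safe to rely on this ordering.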
Cc: stable@vger.kernel.org Cc: Shannon Zhao Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/Documentation/virtual/kvm/devices/vcpu.txt b/Documentation/virtual/kvm/devices/vcpu.txt index c041658..02f5068 100644 --- a/Documentation/virtual/kvm/devices/vcpu.txt +++ b/Documentation/virtual/kvm/devices/vcpu.txt @@ -30,4 +30,6 @@ Returns: -ENODEV: PMUv3 not supported attribute -EBUSY: PMUv3 already initialized -Request the initialization of the PMUv3. +Request the initialization of the PMUv3. This must be done after creating the +in-kernel irqchip. Creating a PMU with a userspace irqchip is currently not +supported. diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c index a027569..6e9c40e 100644 --- a/virt/kvm/arm/pmu.c +++ b/virt/kvm/arm/pmu.c @@ -423,6 +423,14 @@ static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) if (!kvm_arm_support_pmu_v3()) return -ENODEV; + /* + * We currently require an in-kernel VGIC to use the PMU emulation, + * because we do not support forwarding PMU overflow interrupts to + * userspace yet. + */ + if (!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm)) + return -ENODEV; + if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features) || !kvm_arm_pmu_irq_initialized(vcpu)) return -ENXIO; -- cgit v0.10.2 From 0099b7701f5296a758d9e6b945ec96f96847cc2f Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 27 Sep 2016 18:53:35 +0200 Subject: KVM: arm/arm64: vgic: Don't flush/sync without a working vgic If the vgic hasn't been created and initialized, we shouldn't attempt to look at its data structures or flush/sync anything to the GIC hardware. This fixes an issue reported by Alexander Graf when using a userspace irqchip. Fixes: 0919e84c0fc1 ("KVM: arm/arm64: vgic-new: Add IRQ sync/flush framework") Cc: stable@vger.kernel.org Reported-by: Alexander Graf Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 8a529a7..2893d5b 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -645,6 +645,9 @@ next: /* Sync back the hardware VGIC state into our emulation after a guest's run. */ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) { + if (unlikely(!vgic_initialized(vcpu->kvm))) + return; + vgic_process_maintenance_interrupt(vcpu); vgic_fold_lr_state(vcpu); vgic_prune_ap_list(vcpu); @@ -653,6 +656,9 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) /* Flush our emulation state into the GIC hardware before entering the guest. */ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) { + if (unlikely(!vgic_initialized(vcpu->kvm))) + return; + spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); vgic_flush_lr_state(vcpu); spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); -- cgit v0.10.2 From 91e4f1b6073dd680d86cdb7e42d7cccca9db39d8 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 15 Sep 2016 17:20:06 +0100 Subject: KVM: MIPS: Drop other CPU ASIDs on guest MMU changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a guest TLB entry is replaced by TLBWI or TLBWR, we only invalidate TLB entries on the local CPU. This doesn't work correctly on an SMP host when the guest is migrated to a different physical CPU, as it could pick up stale TLB mappings from the last time the vCPU ran on that physical CPU. Therefore invalidate both user and kernel host ASIDs on other CPUs, which will cause new ASIDs to be generated when it next runs on those CPUs. 
We're careful only to do this if the TLB entry was already valid, and only for the kernel ASID where the virtual address it mapped is outside of the guest user address range. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Cc: # 3.10.x- diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index e788515..43853ec 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -846,6 +846,47 @@ enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu) return EMULATE_FAIL; } +/** + * kvm_mips_invalidate_guest_tlb() - Indicates a change in guest MMU map. + * @vcpu: VCPU with changed mappings. + * @tlb: TLB entry being removed. + * + * This is called to indicate a single change in guest MMU mappings, so that we + * can arrange TLB flushes on this and other CPUs. + */ +static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu, + struct kvm_mips_tlb *tlb) +{ + int cpu, i; + bool user; + + /* No need to flush for entries which are already invalid */ + if (!((tlb->tlb_lo[0] | tlb->tlb_lo[1]) & ENTRYLO_V)) + return; + /* User address space doesn't need flushing for KSeg2/3 changes */ + user = tlb->tlb_hi < KVM_GUEST_KSEG0; + + preempt_disable(); + + /* + * Probe the shadow host TLB for the entry being overwritten, if one + * matches, invalidate it + */ + kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi); + + /* Invalidate the whole ASID on other CPUs */ + cpu = smp_processor_id(); + for_each_possible_cpu(i) { + if (i == cpu) + continue; + if (user) + vcpu->arch.guest_user_asid[i] = 0; + vcpu->arch.guest_kernel_asid[i] = 0; + } + + preempt_enable(); +} + /* Write Guest TLB Entry @ Index */ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu) { @@ -865,11 +906,8 @@ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu) } tlb = &vcpu->arch.guest_tlb[index]; - /* - * Probe the shadow host TLB for the entry being overwritten, if one - * matches, invalidate it - */ - kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi); + + kvm_mips_invalidate_guest_tlb(vcpu, tlb); tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0); tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0); @@ -898,11 +936,7 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu) tlb = &vcpu->arch.guest_tlb[index]; - /* - * Probe the shadow host TLB for the entry being overwritten, if one - * matches, invalidate it - */ - kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi); + kvm_mips_invalidate_guest_tlb(vcpu, tlb); tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0); tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0); @@ -1026,6 +1060,7 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, enum emulation_result er = EMULATE_DONE; u32 rt, rd, sel; unsigned long curr_pc; + int cpu, i; /* * Update PC and hold onto current PC in case there is @@ -1135,8 +1170,16 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, & KVM_ENTRYHI_ASID, nasid); + preempt_disable(); /* Blow away the shadow host TLBs */ kvm_mips_flush_host_tlb(1); + cpu = smp_processor_id(); + for_each_possible_cpu(i) + if (i != cpu) { + vcpu->arch.guest_user_asid[i] = 0; + vcpu->arch.guest_kernel_asid[i] = 0; + } + preempt_enable(); } kvm_write_c0_guest_entryhi(cop0, vcpu->arch.gprs[rt]); -- cgit v0.10.2 From f3124cc551c8575c30e3736faf2d0bca54ee07cb Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 16 Sep 2016 00:00:08 +0100 Subject: KVM: MIPS: Split kernel/user ASID regeneration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit The host ASIDs for guest kernel and user mode are regenerated together if the ASID for guest kernel mode is out of date. That is fine as the ASID for guest kernel mode is always generated first, however it doesn't allow the ASIDs to be regenerated or invalidated individually instead of linearly flushing the entire host TLB. Therefore separate the regeneration code so that the ASIDs are checked and regenerated separately. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 6cfdcf5..c1f8758 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -250,6 +250,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu); vcpu->arch.guest_kernel_asid[cpu] = vcpu->arch.guest_kernel_mm.context.asid[cpu]; + newasid++; + + kvm_debug("[%d]: cpu_context: %#lx\n", cpu, + cpu_context(cpu, current->mm)); + kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n", + cpu, vcpu->arch.guest_kernel_asid[cpu]); + } + + if ((vcpu->arch.guest_user_asid[cpu] ^ asid_cache(cpu)) & + asid_version_mask(cpu)) { kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu); vcpu->arch.guest_user_asid[cpu] = vcpu->arch.guest_user_mm.context.asid[cpu]; @@ -257,8 +267,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_debug("[%d]: cpu_context: %#lx\n", cpu, cpu_context(cpu, current->mm)); - kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n", - cpu, vcpu->arch.guest_kernel_asid[cpu]); kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu, vcpu->arch.guest_user_asid[cpu]); } -- cgit v0.10.2 From 25b08c7fb0e410bdf7c42ea1faff816eb0451bbc Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 16 Sep 2016 00:06:43 +0100 Subject: KVM: MIPS: Invalidate TLB by regenerating ASIDs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Invalidate host TLB mappings when the guest ASID is changed by regenerating ASIDs, rather than flushing the entire host TLB except entries in the guest KSeg0 range. For the guest kernel mode ASID we regenerate on the spot when the guest ASID is changed, as that will always take place while the guest is in kernel mode. However when the guest invalidates TLB entries the ASID will often be changed temporarily as part of writing EntryHi without the guest returning to user mode in between. We therefore regenerate the user mode ASID lazily before entering the guest in user mode, if and only if the guest ASID has actually changed since the last guest user mode entry.
Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 4d7e0e4..a5685c1 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -328,6 +328,9 @@ struct kvm_vcpu_arch { u32 guest_kernel_asid[NR_CPUS]; struct mm_struct guest_kernel_mm, guest_user_mm; + /* Guest ASID of last user mode execution */ + unsigned int last_user_gasid; + int last_sched_cpu; /* WAIT executed */ diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 43853ec..8dc9e64 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1170,15 +1170,23 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, & KVM_ENTRYHI_ASID, nasid); + /* + * Regenerate/invalidate kernel MMU + * context. + * The user MMU context will be + * regenerated lazily on re-entry to + * guest user if the guest ASID actually + * changes. + */ preempt_disable(); - /* Blow away the shadow host TLBs */ - kvm_mips_flush_host_tlb(1); cpu = smp_processor_id(); + kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, + cpu, vcpu); + vcpu->arch.guest_kernel_asid[cpu] = + vcpu->arch.guest_kernel_mm.context.asid[cpu]; for_each_possible_cpu(i) - if (i != cpu) { - vcpu->arch.guest_user_asid[i] = 0; + if (i != cpu) vcpu->arch.guest_kernel_asid[i] = 0; - } preempt_enable(); } kvm_write_c0_guest_entryhi(cop0, diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index a6ea084..ad1b15b 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -411,6 +411,31 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, return -ENOIOCTLCMD; } +/* Must be called with preemption disabled, just before entering guest */ +static void kvm_mips_check_asids(struct kvm_vcpu *vcpu) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + int cpu = smp_processor_id(); + unsigned int gasid; + + /* + * Lazy host ASID regeneration for guest user mode. + * If the guest ASID has changed since the last guest usermode + * execution, regenerate the host ASID so as to invalidate stale TLB + * entries. + */ + if (!KVM_GUEST_KERNEL_MODE(vcpu)) { + gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID; + if (gasid != vcpu->arch.last_user_gasid) { + kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, + vcpu); + vcpu->arch.guest_user_asid[cpu] = + vcpu->arch.guest_user_mm.context.asid[cpu]; + vcpu->arch.last_user_gasid = gasid; + } + } +} + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { int r = 0; @@ -438,6 +463,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) htw_stop(); trace_kvm_enter(vcpu); + + kvm_mips_check_asids(vcpu); + r = vcpu->arch.vcpu_run(run, vcpu); trace_kvm_out(vcpu); @@ -1551,6 +1579,8 @@ skip_emul: if (ret == RESUME_GUEST) { trace_kvm_reenter(vcpu); + kvm_mips_check_asids(vcpu); + /* * If FPU / MSA are enabled (i.e. the guest's FPU / MSA context * is live), restore FCR31 / MSACSR. 
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index c1f8758..8e1f2bff 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -260,9 +260,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if ((vcpu->arch.guest_user_asid[cpu] ^ asid_cache(cpu)) & asid_version_mask(cpu)) { + u32 gasid = kvm_read_c0_guest_entryhi(vcpu->arch.cop0) & + KVM_ENTRYHI_ASID; + kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu); vcpu->arch.guest_user_asid[cpu] = vcpu->arch.guest_user_mm.context.asid[cpu]; + vcpu->arch.last_user_gasid = gasid; newasid++; kvm_debug("[%d]: cpu_context: %#lx\n", cpu, -- cgit v0.10.2 From bf18db4e7bd99f3a65bcc43225790b16af733321 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 16 Sep 2016 13:14:09 +0100 Subject: KVM: MIPS: Drop dubious EntryHi optimisation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There exists a slightly dubious optimisation in the implementation of the MIPS KVM EntryHi emulation which skips TLB invalidation if the EntryHi points to an address in the guest KSeg0 region, intended to catch guest TLB invalidations where the ASID is almost immediately restored to the previous value. Now that we perform lazy host ASID regeneration for guest user mode when the guest ASID changes we should be able to drop the optimisation without a significant impact (only the extra TLB refills for the small amount of code while the TLB is being invalidated). Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 8dc9e64..4db4c03 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1162,8 +1162,7 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, } else if (rd == MIPS_CP0_TLB_HI && sel == 0) { u32 nasid = vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID; - if ((KSEGX(vcpu->arch.gprs[rt]) != CKSEG0) && - ((kvm_read_c0_guest_entryhi(cop0) & + if (((kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID) != nasid)) { trace_kvm_asid_change(vcpu, kvm_read_c0_guest_entryhi(cop0) -- cgit v0.10.2
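Taken together, the MIPS patches above replace wholesale host TLB flushes with invalidation by ASID regeneration. The key is the version check performed when a vcpu is loaded onto a CPU; a simplified model of that check follows (the mask value is invented for illustration; the kernel uses asid_version_mask() and asid_cache() instead):

/* Simplified model of the per-CPU ASID version check.  The stored
 * ASID carries a generation number in its upper bits; writing 0 into
 * guest_user_asid[cpu]/guest_kernel_asid[cpu] guarantees a mismatch,
 * so the next vcpu load allocates a fresh ASID on that CPU and any
 * stale host TLB entries become unreachable. */
#define ASID_VERSION_MASK 0xffffff00u	/* invented layout */

static int asid_needs_regen(unsigned int stored_asid, unsigned int current_cache)
{
	return (stored_asid ^ current_cache) & ASID_VERSION_MASK;
}

Zeroing the other CPUs' guest_user_asid[i]/guest_kernel_asid[i], as the earlier patches do, makes this check fail on the next load there, so a fresh ASID is allocated instead of flushing the whole host TLB.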