From afc2f792cdcb67f4257f0e68d10ee4a7b7eae57a Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sun, 5 May 2013 20:03:40 +0800 Subject: KVM: add missing misc_deregister() on error in kvm_init() Add the missing misc_deregister() before return from kvm_init() in the debugfs init error handling case. Signed-off-by: Wei Yongjun Signed-off-by: Gleb Natapov diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 302681c..b547a1c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3181,6 +3181,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, out_undebugfs: unregister_syscore_ops(&kvm_syscore_ops); + misc_deregister(&kvm_dev); out_unreg: kvm_async_pf_deinit(); out_free: -- cgit v0.10.2 From e2858b4ab1f8d7ce36ae3cd96ba650664af0db5e Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Thu, 9 May 2013 15:45:12 +0900 Subject: KVM: MMU: Use kvm_mmu_sync_roots() in kvm_mmu_load() No need to open-code this function. Signed-off-by: Takuya Yoshikawa Reviewed-by: Xiao Guangrong Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 004cc87..40d7b2d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3764,9 +3764,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) if (r) goto out; r = mmu_alloc_roots(vcpu); - spin_lock(&vcpu->kvm->mmu_lock); - mmu_sync_roots(vcpu); - spin_unlock(&vcpu->kvm->mmu_lock); + kvm_mmu_sync_roots(vcpu); if (r) goto out; /* set_cr3() should ensure TLB has been flushed */ -- cgit v0.10.2 From f1ed0450a5fac7067590317cbf027f566b6ccbca Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 28 Apr 2013 14:00:41 +0200 Subject: KVM: x86: Remove support for reporting coalesced APIC IRQs Since the arrival of posted interrupt support we can no longer guarantee that coalesced IRQs are always reported to the IRQ source. Moreover, accumulated APIC timer events could cause a busy loop when a VCPU should rather be halted. The consensus is to remove coalesced tracking from the LAPIC. 
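
A minimal sketch, with hypothetical names rather than the kernel code itself, of what "reporting a coalesced IRQ" meant here: classic delivery test-and-sets the vector's bit in the IRR, so __apic_accept_irq() could return 0 when the bit was already pending, whereas posted-interrupt delivery hands the vector straight to kvm_x86_ops->deliver_posted_interrupt() and gets no status back.

/* Illustrative only: a pending-vector bitmap with test-and-set semantics. */
#include <stdbool.h>
#include <limits.h>

#define NVECS 256
#define BITS_PER_WORD (sizeof(unsigned long) * CHAR_BIT)

static unsigned long irr[NVECS / BITS_PER_WORD];

/* Returns true if the vector was newly set, false if it was already
 * pending, i.e. the new interrupt is coalesced with an earlier one. */
static bool accept_fixed_irq(int vector)
{
    unsigned long *word = &irr[vector / BITS_PER_WORD];
    unsigned long mask = 1UL << (vector % BITS_PER_WORD);
    bool was_pending = *word & mask;

    *word |= mask;
    return !was_pending;
}

With that status gone, callers can no longer distinguish "delivered" from "coalesced"; this is why the timer path in the diff below stops decrementing lapic_timer.pending on a successful delivery and simply resets it to zero.
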
Signed-off-by: Jan Kiszka Acked-by: Marcelo Tosatti Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e1adbb4..9d75193 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -405,17 +405,17 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) return highest_irr; } -static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, - int vector, int level, int trig_mode, - unsigned long *dest_map); +static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, + int vector, int level, int trig_mode, + unsigned long *dest_map); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, - unsigned long *dest_map) +void kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, + unsigned long *dest_map) { struct kvm_lapic *apic = vcpu->arch.apic; - return __apic_accept_irq(apic, irq->delivery_mode, irq->vector, - irq->level, irq->trig_mode, dest_map); + __apic_accept_irq(apic, irq->delivery_mode, irq->vector, + irq->level, irq->trig_mode, dest_map); } static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) @@ -608,7 +608,8 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, *r = -1; if (irq->shorthand == APIC_DEST_SELF) { - *r = kvm_apic_set_irq(src->vcpu, irq, dest_map); + kvm_apic_set_irq(src->vcpu, irq, dest_map); + *r = 1; return true; } @@ -653,7 +654,8 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, continue; if (*r < 0) *r = 0; - *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); + kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); + *r += 1; } ret = true; @@ -662,15 +664,11 @@ out: return ret; } -/* - * Add a pending IRQ into lapic. - * Return 1 if successfully added and 0 if discarded. - */ -static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, - int vector, int level, int trig_mode, - unsigned long *dest_map) +/* Set an IRQ pending in the lapic. 
*/ +static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, + int vector, int level, int trig_mode, + unsigned long *dest_map) { - int result = 0; struct kvm_vcpu *vcpu = apic->vcpu; switch (delivery_mode) { @@ -684,13 +682,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (dest_map) __set_bit(vcpu->vcpu_id, dest_map); - if (kvm_x86_ops->deliver_posted_interrupt) { - result = 1; + if (kvm_x86_ops->deliver_posted_interrupt) kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); - } else { - result = !apic_test_and_set_irr(vector, apic); - - if (!result) { + else { + if (apic_test_and_set_irr(vector, apic)) { if (trig_mode) apic_debug("level trig mode repeatedly " "for vector %d", vector); @@ -702,7 +697,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, } out: trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector, !result); + trig_mode, vector, false); break; case APIC_DM_REMRD: @@ -714,14 +709,12 @@ out: break; case APIC_DM_NMI: - result = 1; kvm_inject_nmi(vcpu); kvm_vcpu_kick(vcpu); break; case APIC_DM_INIT: if (!trig_mode || level) { - result = 1; /* assumes that there are only KVM_APIC_INIT/SIPI */ apic->pending_events = (1UL << KVM_APIC_INIT); /* make sure pending_events is visible before sending @@ -738,7 +731,6 @@ out: case APIC_DM_STARTUP: apic_debug("SIPI to vcpu %d vector 0x%02x\n", vcpu->vcpu_id, vector); - result = 1; apic->sipi_vector = vector; /* make sure sipi_vector is visible for the receiver */ smp_wmb(); @@ -760,7 +752,6 @@ out: delivery_mode); break; } - return result; } int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) @@ -1470,7 +1461,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) return 0; } -int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) +void kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) { u32 reg = kvm_apic_get_reg(apic, lvt_type); int vector, mode, trig_mode; @@ -1479,10 +1470,8 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) vector = reg & APIC_VECTOR_MASK; mode = reg & APIC_MODE_MASK; trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; - return __apic_accept_irq(apic, mode, vector, 1, trig_mode, - NULL); + __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL); } - return 0; } void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu) @@ -1608,8 +1597,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) return; if (atomic_read(&apic->lapic_timer.pending) > 0) { - if (kvm_apic_local_deliver(apic, APIC_LVTT)) - atomic_dec(&apic->lapic_timer.pending); + kvm_apic_local_deliver(apic, APIC_LVTT); + atomic_set(&apic->lapic_timer.pending, 0); } } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index c730ac9..61a73a0 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -57,9 +57,9 @@ void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, - unsigned long *dest_map); -int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); +void kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, + unsigned long *dest_map); +void kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, unsigned long 
*dest_map); diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index e2e6b44..ef1817b 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -91,7 +91,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, if (!kvm_is_dm_lowest_prio(irq)) { if (r < 0) r = 0; - r += kvm_apic_set_irq(vcpu, irq, dest_map); + kvm_apic_set_irq(vcpu, irq, dest_map); + r++; } else if (kvm_lapic_enabled(vcpu)) { if (!lowest) lowest = vcpu; @@ -100,8 +101,10 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, } } - if (lowest) - r = kvm_apic_set_irq(lowest, irq, dest_map); + if (lowest) { + kvm_apic_set_irq(lowest, irq, dest_map); + r = 1; + } return r; } -- cgit v0.10.2 From 0061d53daf26ff713ab43ab84ae5c44b5edbefa9 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 9 May 2013 20:21:41 -0300 Subject: KVM: x86: limit difference between kvmclock updates kvmclock updates which are isolated to a given vcpu, such as vcpu->cpu migration, should not allow system_timestamp from the rest of the vcpus to remain static. Otherwise ntp frequency correction applies to one vcpu's system_timestamp but not the others. So in those cases, request a kvmclock update for all vcpus. The worst case for a remote vcpu to update its kvmclock is then bounded by maximum nohz sleep latency. Signed-off-by: Marcelo Tosatti Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 094b5d9..8d28810 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1588,6 +1588,30 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) return 0; } +/* + * kvmclock updates which are isolated to a given vcpu, such as + * vcpu->cpu migration, should not allow system_timestamp from + * the rest of the vcpus to remain static. Otherwise ntp frequency + * correction applies to one vcpu's system_timestamp but not + * the others. + * + * So in those cases, request a kvmclock update for all vcpus. + * The worst case for a remote vcpu to update its kvmclock + * is then bounded by maximum nohz sleep latency. + */ + +static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) +{ + int i; + struct kvm *kvm = v->kvm; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + kvm_vcpu_kick(vcpu); + } +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -1985,7 +2009,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvmclock_reset(vcpu); vcpu->arch.time = data; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... 
*/ if (!(data & 1)) @@ -2702,7 +2726,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * kvmclock on vcpu->cpu migration */ if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); if (vcpu->cpu != cpu) kvm_migrate_timers(vcpu); vcpu->cpu = cpu; @@ -5703,6 +5727,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) __kvm_migrate_timers(vcpu); if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) kvm_gen_update_masterclock(vcpu->kvm); + if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu)) + kvm_gen_kvmclock_update(vcpu); if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { r = kvm_guest_time_update(vcpu); if (unlikely(r)) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f0eea07..d9a3c30 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -124,6 +124,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_MCLOCK_INPROGRESS 19 #define KVM_REQ_EPR_EXIT 20 #define KVM_REQ_SCAN_IOAPIC 21 +#define KVM_REQ_GLOBAL_CLOCK_UPDATE 22 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 -- cgit v0.10.2 From 35af577aacb2fd2a6b407953044aea1cf0546fad Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 16 May 2013 11:55:51 +0300 Subject: KVM: MMU: clenaup locking in mmu_free_roots() Do locking around each case separately instead of having one lock and two unlocks. Move root_hpa assignment out of the lock. Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 40d7b2d..f8ca2f3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2869,22 +2869,25 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - spin_lock(&vcpu->kvm->mmu_lock); + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || vcpu->arch.mmu.direct_map)) { hpa_t root = vcpu->arch.mmu.root_hpa; + spin_lock(&vcpu->kvm->mmu_lock); sp = page_header(root); --sp->root_count; if (!sp->root_count && sp->role.invalid) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } - vcpu->arch.mmu.root_hpa = INVALID_PAGE; spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.root_hpa = INVALID_PAGE; return; } + + spin_lock(&vcpu->kvm->mmu_lock); for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; -- cgit v0.10.2 From 7275acdfe29ba03ad2f6e150386900c4e2d43fb1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 14 May 2013 14:31:01 +0100 Subject: ARM: KVM: move GIC/timer code to a common location As KVM/arm64 is looming on the horizon, it makes sense to move some of the common code to a single location in order to reduce duplication. The code could live anywhere. Actually, most of KVM is already built with a bunch of ugly ../../.. hacks in the various Makefiles, so we're not exactly talking about style here. But maybe it is time to start moving into a less ugly direction. The include files must be in a "public" location, as they are accessed from non-KVM files (arch/arm/kernel/asm-offsets.c). For this purpose, introduce two new locations: - virt/kvm/arm/ : x86 and ia64 already share the ioapic code in virt/kvm, so this could be seen as a (very ugly) precedent. 
- include/kvm/ : there is already an include/xen, and while the intent is slightly different, this seems as good a location as any Eventually, we should probably have independant Makefiles at every levels (just like everywhere else in the kernel), but this is just the first step. Signed-off-by: Marc Zyngier Signed-off-by: Gleb Natapov diff --git a/arch/arm/include/asm/kvm_arch_timer.h b/arch/arm/include/asm/kvm_arch_timer.h deleted file mode 100644 index 68cb9e1..0000000 --- a/arch/arm/include/asm/kvm_arch_timer.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (C) 2012 ARM Ltd. - * Author: Marc Zyngier - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef __ASM_ARM_KVM_ARCH_TIMER_H -#define __ASM_ARM_KVM_ARCH_TIMER_H - -#include -#include -#include - -struct arch_timer_kvm { -#ifdef CONFIG_KVM_ARM_TIMER - /* Is the timer enabled */ - bool enabled; - - /* Virtual offset */ - cycle_t cntvoff; -#endif -}; - -struct arch_timer_cpu { -#ifdef CONFIG_KVM_ARM_TIMER - /* Registers: control register, timer value */ - u32 cntv_ctl; /* Saved/restored */ - cycle_t cntv_cval; /* Saved/restored */ - - /* - * Anything that is not used directly from assembly code goes - * here. 
- */ - - /* Background timer used when the guest is not running */ - struct hrtimer timer; - - /* Work queued with the above timer expires */ - struct work_struct expired; - - /* Background timer active */ - bool armed; - - /* Timer IRQ */ - const struct kvm_irq_level *irq; -#endif -}; - -#ifdef CONFIG_KVM_ARM_TIMER -int kvm_timer_hyp_init(void); -int kvm_timer_init(struct kvm *kvm); -void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); -void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu); -void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu); -void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu); -#else -static inline int kvm_timer_hyp_init(void) -{ - return 0; -}; - -static inline int kvm_timer_init(struct kvm *kvm) -{ - return 0; -} - -static inline void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) {} -static inline void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) {} -static inline void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) {} -static inline void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) {} -#endif - -#endif diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 57cb786..ff5aaf1 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -23,7 +23,7 @@ #include #include #include -#include +#include #define KVM_MAX_VCPUS CONFIG_KVM_ARM_MAX_VCPUS #define KVM_USER_MEM_SLOTS 32 @@ -38,7 +38,7 @@ #define KVM_NR_PAGE_SIZES 1 #define KVM_PAGES_PER_HPAGE(x) (1UL<<31) -#include +#include struct kvm_vcpu; u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode); diff --git a/arch/arm/include/asm/kvm_vgic.h b/arch/arm/include/asm/kvm_vgic.h deleted file mode 100644 index 343744e..0000000 --- a/arch/arm/include/asm/kvm_vgic.h +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Copyright (C) 2012 ARM Ltd. - * Author: Marc Zyngier - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef __ASM_ARM_KVM_VGIC_H -#define __ASM_ARM_KVM_VGIC_H - -#include -#include -#include -#include -#include -#include - -#define VGIC_NR_IRQS 128 -#define VGIC_NR_SGIS 16 -#define VGIC_NR_PPIS 16 -#define VGIC_NR_PRIVATE_IRQS (VGIC_NR_SGIS + VGIC_NR_PPIS) -#define VGIC_NR_SHARED_IRQS (VGIC_NR_IRQS - VGIC_NR_PRIVATE_IRQS) -#define VGIC_MAX_CPUS KVM_MAX_VCPUS -#define VGIC_MAX_LRS (1 << 6) - -/* Sanity checks... 
*/ -#if (VGIC_MAX_CPUS > 8) -#error Invalid number of CPU interfaces -#endif - -#if (VGIC_NR_IRQS & 31) -#error "VGIC_NR_IRQS must be a multiple of 32" -#endif - -#if (VGIC_NR_IRQS > 1024) -#error "VGIC_NR_IRQS must be <= 1024" -#endif - -/* - * The GIC distributor registers describing interrupts have two parts: - * - 32 per-CPU interrupts (SGI + PPI) - * - a bunch of shared interrupts (SPI) - */ -struct vgic_bitmap { - union { - u32 reg[VGIC_NR_PRIVATE_IRQS / 32]; - DECLARE_BITMAP(reg_ul, VGIC_NR_PRIVATE_IRQS); - } percpu[VGIC_MAX_CPUS]; - union { - u32 reg[VGIC_NR_SHARED_IRQS / 32]; - DECLARE_BITMAP(reg_ul, VGIC_NR_SHARED_IRQS); - } shared; -}; - -struct vgic_bytemap { - u32 percpu[VGIC_MAX_CPUS][VGIC_NR_PRIVATE_IRQS / 4]; - u32 shared[VGIC_NR_SHARED_IRQS / 4]; -}; - -struct vgic_dist { -#ifdef CONFIG_KVM_ARM_VGIC - spinlock_t lock; - bool ready; - - /* Virtual control interface mapping */ - void __iomem *vctrl_base; - - /* Distributor and vcpu interface mapping in the guest */ - phys_addr_t vgic_dist_base; - phys_addr_t vgic_cpu_base; - - /* Distributor enabled */ - u32 enabled; - - /* Interrupt enabled (one bit per IRQ) */ - struct vgic_bitmap irq_enabled; - - /* Interrupt 'pin' level */ - struct vgic_bitmap irq_state; - - /* Level-triggered interrupt in progress */ - struct vgic_bitmap irq_active; - - /* Interrupt priority. Not used yet. */ - struct vgic_bytemap irq_priority; - - /* Level/edge triggered */ - struct vgic_bitmap irq_cfg; - - /* Source CPU per SGI and target CPU */ - u8 irq_sgi_sources[VGIC_MAX_CPUS][VGIC_NR_SGIS]; - - /* Target CPU for each IRQ */ - u8 irq_spi_cpu[VGIC_NR_SHARED_IRQS]; - struct vgic_bitmap irq_spi_target[VGIC_MAX_CPUS]; - - /* Bitmap indicating which CPU has something pending */ - unsigned long irq_pending_on_cpu; -#endif -}; - -struct vgic_cpu { -#ifdef CONFIG_KVM_ARM_VGIC - /* per IRQ to LR mapping */ - u8 vgic_irq_lr_map[VGIC_NR_IRQS]; - - /* Pending interrupts on this VCPU */ - DECLARE_BITMAP( pending_percpu, VGIC_NR_PRIVATE_IRQS); - DECLARE_BITMAP( pending_shared, VGIC_NR_SHARED_IRQS); - - /* Bitmap of used/free list registers */ - DECLARE_BITMAP( lr_used, VGIC_MAX_LRS); - - /* Number of list registers on this CPU */ - int nr_lr; - - /* CPU vif control registers for world switch */ - u32 vgic_hcr; - u32 vgic_vmcr; - u32 vgic_misr; /* Saved only */ - u32 vgic_eisr[2]; /* Saved only */ - u32 vgic_elrsr[2]; /* Saved only */ - u32 vgic_apr; - u32 vgic_lr[VGIC_MAX_LRS]; -#endif -}; - -#define LR_EMPTY 0xff - -struct kvm; -struct kvm_vcpu; -struct kvm_run; -struct kvm_exit_mmio; - -#ifdef CONFIG_KVM_ARM_VGIC -int kvm_vgic_set_addr(struct kvm *kvm, unsigned long type, u64 addr); -int kvm_vgic_hyp_init(void); -int kvm_vgic_init(struct kvm *kvm); -int kvm_vgic_create(struct kvm *kvm); -int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu); -void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu); -void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu); -int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num, - bool level); -int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); -bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run, - struct kvm_exit_mmio *mmio); - -#define irqchip_in_kernel(k) (!!((k)->arch.vgic.vctrl_base)) -#define vgic_initialized(k) ((k)->arch.vgic.ready) - -#else -static inline int kvm_vgic_hyp_init(void) -{ - return 0; -} - -static inline int kvm_vgic_set_addr(struct kvm *kvm, unsigned long type, u64 addr) -{ - return 0; -} - -static inline int kvm_vgic_init(struct kvm *kvm) -{ - return 0; -} - -static inline int 
kvm_vgic_create(struct kvm *kvm) -{ - return 0; -} - -static inline int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) -{ - return 0; -} - -static inline void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) {} -static inline void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) {} - -static inline int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, - unsigned int irq_num, bool level) -{ - return 0; -} - -static inline int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) -{ - return 0; -} - -static inline bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run, - struct kvm_exit_mmio *mmio) -{ - return false; -} - -static inline int irqchip_in_kernel(struct kvm *kvm) -{ - return 0; -} - -static inline bool vgic_initialized(struct kvm *kvm) -{ - return true; -} -#endif - -#endif diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index 53c5ed8..9184a49 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -14,10 +14,11 @@ CFLAGS_mmu.o := -I. AFLAGS_init.o := -Wa,-march=armv7-a$(plus_virt) AFLAGS_interrupts.o := -Wa,-march=armv7-a$(plus_virt) -kvm-arm-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o) +KVM := ../../../virt/kvm +kvm-arm-y = $(addprefix $(KVM)/, kvm_main.o coalesced_mmio.o) obj-y += kvm-arm.o init.o interrupts.o obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o obj-y += coproc.o coproc_a15.o mmio.o psci.o perf.o -obj-$(CONFIG_KVM_ARM_VGIC) += vgic.o -obj-$(CONFIG_KVM_ARM_TIMER) += arch_timer.o +obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o +obj-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o diff --git a/arch/arm/kvm/arch_timer.c b/arch/arm/kvm/arch_timer.c deleted file mode 100644 index c55b608..0000000 --- a/arch/arm/kvm/arch_timer.c +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Copyright (C) 2012 ARM Ltd. - * Author: Marc Zyngier - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -static struct timecounter *timecounter; -static struct workqueue_struct *wqueue; -static struct kvm_irq_level timer_irq = { - .level = 1, -}; - -static cycle_t kvm_phys_timer_read(void) -{ - return timecounter->cc->read(timecounter->cc); -} - -static bool timer_is_armed(struct arch_timer_cpu *timer) -{ - return timer->armed; -} - -/* timer_arm: as in "arm the timer", not as in ARM the company */ -static void timer_arm(struct arch_timer_cpu *timer, u64 ns) -{ - timer->armed = true; - hrtimer_start(&timer->timer, ktime_add_ns(ktime_get(), ns), - HRTIMER_MODE_ABS); -} - -static void timer_disarm(struct arch_timer_cpu *timer) -{ - if (timer_is_armed(timer)) { - hrtimer_cancel(&timer->timer); - cancel_work_sync(&timer->expired); - timer->armed = false; - } -} - -static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - timer->cntv_ctl |= ARCH_TIMER_CTRL_IT_MASK; - kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, - vcpu->arch.timer_cpu.irq->irq, - vcpu->arch.timer_cpu.irq->level); -} - -static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) -{ - struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; - - /* - * We disable the timer in the world switch and let it be - * handled by kvm_timer_sync_hwstate(). Getting a timer - * interrupt at this point is a sure sign of some major - * breakage. - */ - pr_warn("Unexpected interrupt %d on vcpu %p\n", irq, vcpu); - return IRQ_HANDLED; -} - -static void kvm_timer_inject_irq_work(struct work_struct *work) -{ - struct kvm_vcpu *vcpu; - - vcpu = container_of(work, struct kvm_vcpu, arch.timer_cpu.expired); - vcpu->arch.timer_cpu.armed = false; - kvm_timer_inject_irq(vcpu); -} - -static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt) -{ - struct arch_timer_cpu *timer; - timer = container_of(hrt, struct arch_timer_cpu, timer); - queue_work(wqueue, &timer->expired); - return HRTIMER_NORESTART; -} - -/** - * kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu - * @vcpu: The vcpu pointer - * - * Disarm any pending soft timers, since the world-switch code will write the - * virtual timer state back to the physical CPU. - */ -void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - /* - * We're about to run this vcpu again, so there is no need to - * keep the background timer running, as we're about to - * populate the CPU timer again. - */ - timer_disarm(timer); -} - -/** - * kvm_timer_sync_hwstate - sync timer state from cpu - * @vcpu: The vcpu pointer - * - * Check if the virtual timer was armed and either schedule a corresponding - * soft timer or inject directly if already expired. - */ -void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - cycle_t cval, now; - u64 ns; - - if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) || - !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE)) - return; - - cval = timer->cntv_cval; - now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; - - BUG_ON(timer_is_armed(timer)); - - if (cval <= now) { - /* - * Timer has already expired while we were not - * looking. Inject the interrupt and carry on. 
- */ - kvm_timer_inject_irq(vcpu); - return; - } - - ns = cyclecounter_cyc2ns(timecounter->cc, cval - now); - timer_arm(timer, ns); -} - -void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); - hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - timer->timer.function = kvm_timer_expire; - timer->irq = &timer_irq; -} - -static void kvm_timer_init_interrupt(void *info) -{ - enable_percpu_irq(timer_irq.irq, 0); -} - - -static int kvm_timer_cpu_notify(struct notifier_block *self, - unsigned long action, void *cpu) -{ - switch (action) { - case CPU_STARTING: - case CPU_STARTING_FROZEN: - kvm_timer_init_interrupt(NULL); - break; - case CPU_DYING: - case CPU_DYING_FROZEN: - disable_percpu_irq(timer_irq.irq); - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block kvm_timer_cpu_nb = { - .notifier_call = kvm_timer_cpu_notify, -}; - -static const struct of_device_id arch_timer_of_match[] = { - { .compatible = "arm,armv7-timer", }, - {}, -}; - -int kvm_timer_hyp_init(void) -{ - struct device_node *np; - unsigned int ppi; - int err; - - timecounter = arch_timer_get_timecounter(); - if (!timecounter) - return -ENODEV; - - np = of_find_matching_node(NULL, arch_timer_of_match); - if (!np) { - kvm_err("kvm_arch_timer: can't find DT node\n"); - return -ENODEV; - } - - ppi = irq_of_parse_and_map(np, 2); - if (!ppi) { - kvm_err("kvm_arch_timer: no virtual timer interrupt\n"); - err = -EINVAL; - goto out; - } - - err = request_percpu_irq(ppi, kvm_arch_timer_handler, - "kvm guest timer", kvm_get_running_vcpus()); - if (err) { - kvm_err("kvm_arch_timer: can't request interrupt %d (%d)\n", - ppi, err); - goto out; - } - - timer_irq.irq = ppi; - - err = register_cpu_notifier(&kvm_timer_cpu_nb); - if (err) { - kvm_err("Cannot register timer CPU notifier\n"); - goto out_free; - } - - wqueue = create_singlethread_workqueue("kvm_arch_timer"); - if (!wqueue) { - err = -ENOMEM; - goto out_free; - } - - kvm_info("%s IRQ%d\n", np->name, ppi); - on_each_cpu(kvm_timer_init_interrupt, NULL, 1); - - goto out; -out_free: - free_percpu_irq(ppi, kvm_get_running_vcpus()); -out: - of_node_put(np); - return err; -} - -void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - timer_disarm(timer); -} - -int kvm_timer_init(struct kvm *kvm) -{ - if (timecounter && wqueue) { - kvm->arch.timer.cntvoff = kvm_phys_timer_read(); - kvm->arch.timer.enabled = 1; - } - - return 0; -} diff --git a/arch/arm/kvm/vgic.c b/arch/arm/kvm/vgic.c deleted file mode 100644 index 17c5ac7..0000000 --- a/arch/arm/kvm/vgic.c +++ /dev/null @@ -1,1499 +0,0 @@ -/* - * Copyright (C) 2012 ARM Ltd. - * Author: Marc Zyngier - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -/* - * How the whole thing works (courtesy of Christoffer Dall): - * - * - At any time, the dist->irq_pending_on_cpu is the oracle that knows if - * something is pending - * - VGIC pending interrupts are stored on the vgic.irq_state vgic - * bitmap (this bitmap is updated by both user land ioctls and guest - * mmio ops, and other in-kernel peripherals such as the - * arch. timers) and indicate the 'wire' state. - * - Every time the bitmap changes, the irq_pending_on_cpu oracle is - * recalculated - * - To calculate the oracle, we need info for each cpu from - * compute_pending_for_cpu, which considers: - * - PPI: dist->irq_state & dist->irq_enable - * - SPI: dist->irq_state & dist->irq_enable & dist->irq_spi_target - * - irq_spi_target is a 'formatted' version of the GICD_ICFGR - * registers, stored on each vcpu. We only keep one bit of - * information per interrupt, making sure that only one vcpu can - * accept the interrupt. - * - The same is true when injecting an interrupt, except that we only - * consider a single interrupt at a time. The irq_spi_cpu array - * contains the target CPU for each SPI. - * - * The handling of level interrupts adds some extra complexity. We - * need to track when the interrupt has been EOIed, so we can sample - * the 'line' again. This is achieved as such: - * - * - When a level interrupt is moved onto a vcpu, the corresponding - * bit in irq_active is set. As long as this bit is set, the line - * will be ignored for further interrupts. The interrupt is injected - * into the vcpu with the GICH_LR_EOI bit set (generate a - * maintenance interrupt on EOI). - * - When the interrupt is EOIed, the maintenance interrupt fires, - * and clears the corresponding bit in irq_active. This allow the - * interrupt line to be sampled again. 
- */ - -#define VGIC_ADDR_UNDEF (-1) -#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF) - -/* Physical address of vgic virtual cpu interface */ -static phys_addr_t vgic_vcpu_base; - -/* Virtual control interface base address */ -static void __iomem *vgic_vctrl_base; - -static struct device_node *vgic_node; - -#define ACCESS_READ_VALUE (1 << 0) -#define ACCESS_READ_RAZ (0 << 0) -#define ACCESS_READ_MASK(x) ((x) & (1 << 0)) -#define ACCESS_WRITE_IGNORED (0 << 1) -#define ACCESS_WRITE_SETBIT (1 << 1) -#define ACCESS_WRITE_CLEARBIT (2 << 1) -#define ACCESS_WRITE_VALUE (3 << 1) -#define ACCESS_WRITE_MASK(x) ((x) & (3 << 1)) - -static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu); -static void vgic_update_state(struct kvm *kvm); -static void vgic_kick_vcpus(struct kvm *kvm); -static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg); -static u32 vgic_nr_lr; - -static unsigned int vgic_maint_irq; - -static u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, - int cpuid, u32 offset) -{ - offset >>= 2; - if (!offset) - return x->percpu[cpuid].reg; - else - return x->shared.reg + offset - 1; -} - -static int vgic_bitmap_get_irq_val(struct vgic_bitmap *x, - int cpuid, int irq) -{ - if (irq < VGIC_NR_PRIVATE_IRQS) - return test_bit(irq, x->percpu[cpuid].reg_ul); - - return test_bit(irq - VGIC_NR_PRIVATE_IRQS, x->shared.reg_ul); -} - -static void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid, - int irq, int val) -{ - unsigned long *reg; - - if (irq < VGIC_NR_PRIVATE_IRQS) { - reg = x->percpu[cpuid].reg_ul; - } else { - reg = x->shared.reg_ul; - irq -= VGIC_NR_PRIVATE_IRQS; - } - - if (val) - set_bit(irq, reg); - else - clear_bit(irq, reg); -} - -static unsigned long *vgic_bitmap_get_cpu_map(struct vgic_bitmap *x, int cpuid) -{ - if (unlikely(cpuid >= VGIC_MAX_CPUS)) - return NULL; - return x->percpu[cpuid].reg_ul; -} - -static unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x) -{ - return x->shared.reg_ul; -} - -static u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset) -{ - offset >>= 2; - BUG_ON(offset > (VGIC_NR_IRQS / 4)); - if (offset < 4) - return x->percpu[cpuid] + offset; - else - return x->shared + offset - 8; -} - -#define VGIC_CFG_LEVEL 0 -#define VGIC_CFG_EDGE 1 - -static bool vgic_irq_is_edge(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - int irq_val; - - irq_val = vgic_bitmap_get_irq_val(&dist->irq_cfg, vcpu->vcpu_id, irq); - return irq_val == VGIC_CFG_EDGE; -} - -static int vgic_irq_is_enabled(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - return vgic_bitmap_get_irq_val(&dist->irq_enabled, vcpu->vcpu_id, irq); -} - -static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - return vgic_bitmap_get_irq_val(&dist->irq_active, vcpu->vcpu_id, irq); -} - -static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 1); -} - -static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 0); -} - -static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - return vgic_bitmap_get_irq_val(&dist->irq_state, vcpu->vcpu_id, irq); -} - -static void vgic_dist_irq_set(struct kvm_vcpu *vcpu, int 
irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_state, vcpu->vcpu_id, irq, 1); -} - -static void vgic_dist_irq_clear(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_state, vcpu->vcpu_id, irq, 0); -} - -static void vgic_cpu_irq_set(struct kvm_vcpu *vcpu, int irq) -{ - if (irq < VGIC_NR_PRIVATE_IRQS) - set_bit(irq, vcpu->arch.vgic_cpu.pending_percpu); - else - set_bit(irq - VGIC_NR_PRIVATE_IRQS, - vcpu->arch.vgic_cpu.pending_shared); -} - -static void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq) -{ - if (irq < VGIC_NR_PRIVATE_IRQS) - clear_bit(irq, vcpu->arch.vgic_cpu.pending_percpu); - else - clear_bit(irq - VGIC_NR_PRIVATE_IRQS, - vcpu->arch.vgic_cpu.pending_shared); -} - -static u32 mmio_data_read(struct kvm_exit_mmio *mmio, u32 mask) -{ - return *((u32 *)mmio->data) & mask; -} - -static void mmio_data_write(struct kvm_exit_mmio *mmio, u32 mask, u32 value) -{ - *((u32 *)mmio->data) = value & mask; -} - -/** - * vgic_reg_access - access vgic register - * @mmio: pointer to the data describing the mmio access - * @reg: pointer to the virtual backing of vgic distributor data - * @offset: least significant 2 bits used for word offset - * @mode: ACCESS_ mode (see defines above) - * - * Helper to make vgic register access easier using one of the access - * modes defined for vgic register access - * (read,raz,write-ignored,setbit,clearbit,write) - */ -static void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg, - phys_addr_t offset, int mode) -{ - int word_offset = (offset & 3) * 8; - u32 mask = (1UL << (mmio->len * 8)) - 1; - u32 regval; - - /* - * Any alignment fault should have been delivered to the guest - * directly (ARM ARM B3.12.7 "Prioritization of aborts"). 
- */ - - if (reg) { - regval = *reg; - } else { - BUG_ON(mode != (ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED)); - regval = 0; - } - - if (mmio->is_write) { - u32 data = mmio_data_read(mmio, mask) << word_offset; - switch (ACCESS_WRITE_MASK(mode)) { - case ACCESS_WRITE_IGNORED: - return; - - case ACCESS_WRITE_SETBIT: - regval |= data; - break; - - case ACCESS_WRITE_CLEARBIT: - regval &= ~data; - break; - - case ACCESS_WRITE_VALUE: - regval = (regval & ~(mask << word_offset)) | data; - break; - } - *reg = regval; - } else { - switch (ACCESS_READ_MASK(mode)) { - case ACCESS_READ_RAZ: - regval = 0; - /* fall through */ - - case ACCESS_READ_VALUE: - mmio_data_write(mmio, mask, regval >> word_offset); - } - } -} - -static bool handle_mmio_misc(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - u32 reg; - u32 word_offset = offset & 3; - - switch (offset & ~3) { - case 0: /* CTLR */ - reg = vcpu->kvm->arch.vgic.enabled; - vgic_reg_access(mmio, ®, word_offset, - ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); - if (mmio->is_write) { - vcpu->kvm->arch.vgic.enabled = reg & 1; - vgic_update_state(vcpu->kvm); - return true; - } - break; - - case 4: /* TYPER */ - reg = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5; - reg |= (VGIC_NR_IRQS >> 5) - 1; - vgic_reg_access(mmio, ®, word_offset, - ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); - break; - - case 8: /* IIDR */ - reg = 0x4B00043B; - vgic_reg_access(mmio, ®, word_offset, - ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); - break; - } - - return false; -} - -static bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; -} - -static bool handle_mmio_set_enable_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_enabled, - vcpu->vcpu_id, offset); - vgic_reg_access(mmio, reg, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT); - if (mmio->is_write) { - vgic_update_state(vcpu->kvm); - return true; - } - - return false; -} - -static bool handle_mmio_clear_enable_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_enabled, - vcpu->vcpu_id, offset); - vgic_reg_access(mmio, reg, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT); - if (mmio->is_write) { - if (offset < 4) /* Force SGI enabled */ - *reg |= 0xffff; - vgic_retire_disabled_irqs(vcpu); - vgic_update_state(vcpu->kvm); - return true; - } - - return false; -} - -static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_state, - vcpu->vcpu_id, offset); - vgic_reg_access(mmio, reg, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT); - if (mmio->is_write) { - vgic_update_state(vcpu->kvm); - return true; - } - - return false; -} - -static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_state, - vcpu->vcpu_id, offset); - vgic_reg_access(mmio, reg, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT); - if (mmio->is_write) { - vgic_update_state(vcpu->kvm); - return true; - } - - return false; -} - -static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 *reg = 
vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority, - vcpu->vcpu_id, offset); - vgic_reg_access(mmio, reg, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); - return false; -} - -#define GICD_ITARGETSR_SIZE 32 -#define GICD_CPUTARGETS_BITS 8 -#define GICD_IRQS_PER_ITARGETSR (GICD_ITARGETSR_SIZE / GICD_CPUTARGETS_BITS) -static u32 vgic_get_target_reg(struct kvm *kvm, int irq) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int i, c; - unsigned long *bmap; - u32 val = 0; - - irq -= VGIC_NR_PRIVATE_IRQS; - - kvm_for_each_vcpu(c, vcpu, kvm) { - bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]); - for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) - if (test_bit(irq + i, bmap)) - val |= 1 << (c + i * 8); - } - - return val; -} - -static void vgic_set_target_reg(struct kvm *kvm, u32 val, int irq) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int i, c; - unsigned long *bmap; - u32 target; - - irq -= VGIC_NR_PRIVATE_IRQS; - - /* - * Pick the LSB in each byte. This ensures we target exactly - * one vcpu per IRQ. If the byte is null, assume we target - * CPU0. - */ - for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) { - int shift = i * GICD_CPUTARGETS_BITS; - target = ffs((val >> shift) & 0xffU); - target = target ? (target - 1) : 0; - dist->irq_spi_cpu[irq + i] = target; - kvm_for_each_vcpu(c, vcpu, kvm) { - bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]); - if (c == target) - set_bit(irq + i, bmap); - else - clear_bit(irq + i, bmap); - } - } -} - -static bool handle_mmio_target_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 reg; - - /* We treat the banked interrupts targets as read-only */ - if (offset < 32) { - u32 roreg = 1 << vcpu->vcpu_id; - roreg |= roreg << 8; - roreg |= roreg << 16; - - vgic_reg_access(mmio, &roreg, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); - return false; - } - - reg = vgic_get_target_reg(vcpu->kvm, offset & ~3U); - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); - if (mmio->is_write) { - vgic_set_target_reg(vcpu->kvm, reg, offset & ~3U); - vgic_update_state(vcpu->kvm); - return true; - } - - return false; -} - -static u32 vgic_cfg_expand(u16 val) -{ - u32 res = 0; - int i; - - /* - * Turn a 16bit value like abcd...mnop into a 32bit word - * a0b0c0d0...m0n0o0p0, which is what the HW cfg register is. - */ - for (i = 0; i < 16; i++) - res |= ((val >> i) & VGIC_CFG_EDGE) << (2 * i + 1); - - return res; -} - -static u16 vgic_cfg_compress(u32 val) -{ - u16 res = 0; - int i; - - /* - * Turn a 32bit word a0b0c0d0...m0n0o0p0 into 16bit value like - * abcd...mnop which is what we really care about. - */ - for (i = 0; i < 16; i++) - res |= ((val >> (i * 2 + 1)) & VGIC_CFG_EDGE) << i; - - return res; -} - -/* - * The distributor uses 2 bits per IRQ for the CFG register, but the - * LSB is always 0. 
As such, we only keep the upper bit, and use the - * two above functions to compress/expand the bits - */ -static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - u32 val; - u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg, - vcpu->vcpu_id, offset >> 1); - if (offset & 2) - val = *reg >> 16; - else - val = *reg & 0xffff; - - val = vgic_cfg_expand(val); - vgic_reg_access(mmio, &val, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); - if (mmio->is_write) { - if (offset < 4) { - *reg = ~0U; /* Force PPIs/SGIs to 1 */ - return false; - } - - val = vgic_cfg_compress(val); - if (offset & 2) { - *reg &= 0xffff; - *reg |= val << 16; - } else { - *reg &= 0xffff << 16; - *reg |= val; - } - } - - return false; -} - -static bool handle_mmio_sgi_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - u32 reg; - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_VALUE); - if (mmio->is_write) { - vgic_dispatch_sgi(vcpu, reg); - vgic_update_state(vcpu->kvm); - return true; - } - - return false; -} - -/* - * I would have liked to use the kvm_bus_io_*() API instead, but it - * cannot cope with banked registers (only the VM pointer is passed - * around, and we need the vcpu). One of these days, someone please - * fix it! - */ -struct mmio_range { - phys_addr_t base; - unsigned long len; - bool (*handle_mmio)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, - phys_addr_t offset); -}; - -static const struct mmio_range vgic_ranges[] = { - { - .base = GIC_DIST_CTRL, - .len = 12, - .handle_mmio = handle_mmio_misc, - }, - { - .base = GIC_DIST_IGROUP, - .len = VGIC_NR_IRQS / 8, - .handle_mmio = handle_mmio_raz_wi, - }, - { - .base = GIC_DIST_ENABLE_SET, - .len = VGIC_NR_IRQS / 8, - .handle_mmio = handle_mmio_set_enable_reg, - }, - { - .base = GIC_DIST_ENABLE_CLEAR, - .len = VGIC_NR_IRQS / 8, - .handle_mmio = handle_mmio_clear_enable_reg, - }, - { - .base = GIC_DIST_PENDING_SET, - .len = VGIC_NR_IRQS / 8, - .handle_mmio = handle_mmio_set_pending_reg, - }, - { - .base = GIC_DIST_PENDING_CLEAR, - .len = VGIC_NR_IRQS / 8, - .handle_mmio = handle_mmio_clear_pending_reg, - }, - { - .base = GIC_DIST_ACTIVE_SET, - .len = VGIC_NR_IRQS / 8, - .handle_mmio = handle_mmio_raz_wi, - }, - { - .base = GIC_DIST_ACTIVE_CLEAR, - .len = VGIC_NR_IRQS / 8, - .handle_mmio = handle_mmio_raz_wi, - }, - { - .base = GIC_DIST_PRI, - .len = VGIC_NR_IRQS, - .handle_mmio = handle_mmio_priority_reg, - }, - { - .base = GIC_DIST_TARGET, - .len = VGIC_NR_IRQS, - .handle_mmio = handle_mmio_target_reg, - }, - { - .base = GIC_DIST_CONFIG, - .len = VGIC_NR_IRQS / 4, - .handle_mmio = handle_mmio_cfg_reg, - }, - { - .base = GIC_DIST_SOFTINT, - .len = 4, - .handle_mmio = handle_mmio_sgi_reg, - }, - {} -}; - -static const -struct mmio_range *find_matching_range(const struct mmio_range *ranges, - struct kvm_exit_mmio *mmio, - phys_addr_t base) -{ - const struct mmio_range *r = ranges; - phys_addr_t addr = mmio->phys_addr - base; - - while (r->len) { - if (addr >= r->base && - (addr + mmio->len) <= (r->base + r->len)) - return r; - r++; - } - - return NULL; -} - -/** - * vgic_handle_mmio - handle an in-kernel MMIO access - * @vcpu: pointer to the vcpu performing the access - * @run: pointer to the kvm_run structure - * @mmio: pointer to the data describing the access - * - * returns true if the MMIO access has been performed in kernel space, - * and false if it needs to be emulated in user space. 
- */ -bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run, - struct kvm_exit_mmio *mmio) -{ - const struct mmio_range *range; - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - unsigned long base = dist->vgic_dist_base; - bool updated_state; - unsigned long offset; - - if (!irqchip_in_kernel(vcpu->kvm) || - mmio->phys_addr < base || - (mmio->phys_addr + mmio->len) > (base + KVM_VGIC_V2_DIST_SIZE)) - return false; - - /* We don't support ldrd / strd or ldm / stm to the emulated vgic */ - if (mmio->len > 4) { - kvm_inject_dabt(vcpu, mmio->phys_addr); - return true; - } - - range = find_matching_range(vgic_ranges, mmio, base); - if (unlikely(!range || !range->handle_mmio)) { - pr_warn("Unhandled access %d %08llx %d\n", - mmio->is_write, mmio->phys_addr, mmio->len); - return false; - } - - spin_lock(&vcpu->kvm->arch.vgic.lock); - offset = mmio->phys_addr - range->base - base; - updated_state = range->handle_mmio(vcpu, mmio, offset); - spin_unlock(&vcpu->kvm->arch.vgic.lock); - kvm_prepare_mmio(run, mmio); - kvm_handle_mmio_return(vcpu, run); - - if (updated_state) - vgic_kick_vcpus(vcpu->kvm); - - return true; -} - -static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg) -{ - struct kvm *kvm = vcpu->kvm; - struct vgic_dist *dist = &kvm->arch.vgic; - int nrcpus = atomic_read(&kvm->online_vcpus); - u8 target_cpus; - int sgi, mode, c, vcpu_id; - - vcpu_id = vcpu->vcpu_id; - - sgi = reg & 0xf; - target_cpus = (reg >> 16) & 0xff; - mode = (reg >> 24) & 3; - - switch (mode) { - case 0: - if (!target_cpus) - return; - - case 1: - target_cpus = ((1 << nrcpus) - 1) & ~(1 << vcpu_id) & 0xff; - break; - - case 2: - target_cpus = 1 << vcpu_id; - break; - } - - kvm_for_each_vcpu(c, vcpu, kvm) { - if (target_cpus & 1) { - /* Flag the SGI as pending */ - vgic_dist_irq_set(vcpu, sgi); - dist->irq_sgi_sources[c][sgi] |= 1 << vcpu_id; - kvm_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c); - } - - target_cpus >>= 1; - } -} - -static int compute_pending_for_cpu(struct kvm_vcpu *vcpu) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - unsigned long *pending, *enabled, *pend_percpu, *pend_shared; - unsigned long pending_private, pending_shared; - int vcpu_id; - - vcpu_id = vcpu->vcpu_id; - pend_percpu = vcpu->arch.vgic_cpu.pending_percpu; - pend_shared = vcpu->arch.vgic_cpu.pending_shared; - - pending = vgic_bitmap_get_cpu_map(&dist->irq_state, vcpu_id); - enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id); - bitmap_and(pend_percpu, pending, enabled, VGIC_NR_PRIVATE_IRQS); - - pending = vgic_bitmap_get_shared_map(&dist->irq_state); - enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled); - bitmap_and(pend_shared, pending, enabled, VGIC_NR_SHARED_IRQS); - bitmap_and(pend_shared, pend_shared, - vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]), - VGIC_NR_SHARED_IRQS); - - pending_private = find_first_bit(pend_percpu, VGIC_NR_PRIVATE_IRQS); - pending_shared = find_first_bit(pend_shared, VGIC_NR_SHARED_IRQS); - return (pending_private < VGIC_NR_PRIVATE_IRQS || - pending_shared < VGIC_NR_SHARED_IRQS); -} - -/* - * Update the interrupt state and determine which CPUs have pending - * interrupts. Must be called with distributor lock held. 
- */ -static void vgic_update_state(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int c; - - if (!dist->enabled) { - set_bit(0, &dist->irq_pending_on_cpu); - return; - } - - kvm_for_each_vcpu(c, vcpu, kvm) { - if (compute_pending_for_cpu(vcpu)) { - pr_debug("CPU%d has pending interrupts\n", c); - set_bit(c, &dist->irq_pending_on_cpu); - } - } -} - -#define LR_CPUID(lr) \ - (((lr) & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT) -#define MK_LR_PEND(src, irq) \ - (GICH_LR_PENDING_BIT | ((src) << GICH_LR_PHYSID_CPUID_SHIFT) | (irq)) - -/* - * An interrupt may have been disabled after being made pending on the - * CPU interface (the classic case is a timer running while we're - * rebooting the guest - the interrupt would kick as soon as the CPU - * interface gets enabled, with deadly consequences). - * - * The solution is to examine already active LRs, and check the - * interrupt is still enabled. If not, just retire it. - */ -static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - int lr; - - for_each_set_bit(lr, vgic_cpu->lr_used, vgic_cpu->nr_lr) { - int irq = vgic_cpu->vgic_lr[lr] & GICH_LR_VIRTUALID; - - if (!vgic_irq_is_enabled(vcpu, irq)) { - vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY; - clear_bit(lr, vgic_cpu->lr_used); - vgic_cpu->vgic_lr[lr] &= ~GICH_LR_STATE; - if (vgic_irq_is_active(vcpu, irq)) - vgic_irq_clear_active(vcpu, irq); - } - } -} - -/* - * Queue an interrupt to a CPU virtual interface. Return true on success, - * or false if it wasn't possible to queue it. - */ -static bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - int lr; - - /* Sanitize the input... */ - BUG_ON(sgi_source_id & ~7); - BUG_ON(sgi_source_id && irq >= VGIC_NR_SGIS); - BUG_ON(irq >= VGIC_NR_IRQS); - - kvm_debug("Queue IRQ%d\n", irq); - - lr = vgic_cpu->vgic_irq_lr_map[irq]; - - /* Do we have an active interrupt for the same CPUID? */ - if (lr != LR_EMPTY && - (LR_CPUID(vgic_cpu->vgic_lr[lr]) == sgi_source_id)) { - kvm_debug("LR%d piggyback for IRQ%d %x\n", - lr, irq, vgic_cpu->vgic_lr[lr]); - BUG_ON(!test_bit(lr, vgic_cpu->lr_used)); - vgic_cpu->vgic_lr[lr] |= GICH_LR_PENDING_BIT; - return true; - } - - /* Try to use another LR for this interrupt */ - lr = find_first_zero_bit((unsigned long *)vgic_cpu->lr_used, - vgic_cpu->nr_lr); - if (lr >= vgic_cpu->nr_lr) - return false; - - kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id); - vgic_cpu->vgic_lr[lr] = MK_LR_PEND(sgi_source_id, irq); - vgic_cpu->vgic_irq_lr_map[irq] = lr; - set_bit(lr, vgic_cpu->lr_used); - - if (!vgic_irq_is_edge(vcpu, irq)) - vgic_cpu->vgic_lr[lr] |= GICH_LR_EOI; - - return true; -} - -static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - unsigned long sources; - int vcpu_id = vcpu->vcpu_id; - int c; - - sources = dist->irq_sgi_sources[vcpu_id][irq]; - - for_each_set_bit(c, &sources, VGIC_MAX_CPUS) { - if (vgic_queue_irq(vcpu, c, irq)) - clear_bit(c, &sources); - } - - dist->irq_sgi_sources[vcpu_id][irq] = sources; - - /* - * If the sources bitmap has been cleared it means that we - * could queue all the SGIs onto link registers (see the - * clear_bit above), and therefore we are done with them in - * our emulated gic and can get rid of them. 
- */ - if (!sources) { - vgic_dist_irq_clear(vcpu, irq); - vgic_cpu_irq_clear(vcpu, irq); - return true; - } - - return false; -} - -static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq) -{ - if (vgic_irq_is_active(vcpu, irq)) - return true; /* level interrupt, already queued */ - - if (vgic_queue_irq(vcpu, 0, irq)) { - if (vgic_irq_is_edge(vcpu, irq)) { - vgic_dist_irq_clear(vcpu, irq); - vgic_cpu_irq_clear(vcpu, irq); - } else { - vgic_irq_set_active(vcpu, irq); - } - - return true; - } - - return false; -} - -/* - * Fill the list registers with pending interrupts before running the - * guest. - */ -static void __kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - int i, vcpu_id; - int overflow = 0; - - vcpu_id = vcpu->vcpu_id; - - /* - * We may not have any pending interrupt, or the interrupts - * may have been serviced from another vcpu. In all cases, - * move along. - */ - if (!kvm_vgic_vcpu_pending_irq(vcpu)) { - pr_debug("CPU%d has no pending interrupt\n", vcpu_id); - goto epilog; - } - - /* SGIs */ - for_each_set_bit(i, vgic_cpu->pending_percpu, VGIC_NR_SGIS) { - if (!vgic_queue_sgi(vcpu, i)) - overflow = 1; - } - - /* PPIs */ - for_each_set_bit_from(i, vgic_cpu->pending_percpu, VGIC_NR_PRIVATE_IRQS) { - if (!vgic_queue_hwirq(vcpu, i)) - overflow = 1; - } - - /* SPIs */ - for_each_set_bit(i, vgic_cpu->pending_shared, VGIC_NR_SHARED_IRQS) { - if (!vgic_queue_hwirq(vcpu, i + VGIC_NR_PRIVATE_IRQS)) - overflow = 1; - } - -epilog: - if (overflow) { - vgic_cpu->vgic_hcr |= GICH_HCR_UIE; - } else { - vgic_cpu->vgic_hcr &= ~GICH_HCR_UIE; - /* - * We're about to run this VCPU, and we've consumed - * everything the distributor had in store for - * us. Claim we don't have anything pending. We'll - * adjust that if needed while exiting. - */ - clear_bit(vcpu_id, &dist->irq_pending_on_cpu); - } -} - -static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - bool level_pending = false; - - kvm_debug("MISR = %08x\n", vgic_cpu->vgic_misr); - - if (vgic_cpu->vgic_misr & GICH_MISR_EOI) { - /* - * Some level interrupts have been EOIed. Clear their - * active bit. - */ - int lr, irq; - - for_each_set_bit(lr, (unsigned long *)vgic_cpu->vgic_eisr, - vgic_cpu->nr_lr) { - irq = vgic_cpu->vgic_lr[lr] & GICH_LR_VIRTUALID; - - vgic_irq_clear_active(vcpu, irq); - vgic_cpu->vgic_lr[lr] &= ~GICH_LR_EOI; - - /* Any additional pending interrupt? */ - if (vgic_dist_irq_is_pending(vcpu, irq)) { - vgic_cpu_irq_set(vcpu, irq); - level_pending = true; - } else { - vgic_cpu_irq_clear(vcpu, irq); - } - - /* - * Despite being EOIed, the LR may not have - * been marked as empty. - */ - set_bit(lr, (unsigned long *)vgic_cpu->vgic_elrsr); - vgic_cpu->vgic_lr[lr] &= ~GICH_LR_ACTIVE_BIT; - } - } - - if (vgic_cpu->vgic_misr & GICH_MISR_U) - vgic_cpu->vgic_hcr &= ~GICH_HCR_UIE; - - return level_pending; -} - -/* - * Sync back the VGIC state after a guest run. The distributor lock is - * needed so we don't get preempted in the middle of the state processing. 
- */ -static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - int lr, pending; - bool level_pending; - - level_pending = vgic_process_maintenance(vcpu); - - /* Clear mappings for empty LRs */ - for_each_set_bit(lr, (unsigned long *)vgic_cpu->vgic_elrsr, - vgic_cpu->nr_lr) { - int irq; - - if (!test_and_clear_bit(lr, vgic_cpu->lr_used)) - continue; - - irq = vgic_cpu->vgic_lr[lr] & GICH_LR_VIRTUALID; - - BUG_ON(irq >= VGIC_NR_IRQS); - vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY; - } - - /* Check if we still have something up our sleeve... */ - pending = find_first_zero_bit((unsigned long *)vgic_cpu->vgic_elrsr, - vgic_cpu->nr_lr); - if (level_pending || pending < vgic_cpu->nr_lr) - set_bit(vcpu->vcpu_id, &dist->irq_pending_on_cpu); -} - -void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - if (!irqchip_in_kernel(vcpu->kvm)) - return; - - spin_lock(&dist->lock); - __kvm_vgic_flush_hwstate(vcpu); - spin_unlock(&dist->lock); -} - -void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - if (!irqchip_in_kernel(vcpu->kvm)) - return; - - spin_lock(&dist->lock); - __kvm_vgic_sync_hwstate(vcpu); - spin_unlock(&dist->lock); -} - -int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - if (!irqchip_in_kernel(vcpu->kvm)) - return 0; - - return test_bit(vcpu->vcpu_id, &dist->irq_pending_on_cpu); -} - -static void vgic_kick_vcpus(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - int c; - - /* - * We've injected an interrupt, time to find out who deserves - * a good kick... - */ - kvm_for_each_vcpu(c, vcpu, kvm) { - if (kvm_vgic_vcpu_pending_irq(vcpu)) - kvm_vcpu_kick(vcpu); - } -} - -static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level) -{ - int is_edge = vgic_irq_is_edge(vcpu, irq); - int state = vgic_dist_irq_is_pending(vcpu, irq); - - /* - * Only inject an interrupt if: - * - edge triggered and we have a rising edge - * - level triggered and we change level - */ - if (is_edge) - return level > state; - else - return level != state; -} - -static bool vgic_update_irq_state(struct kvm *kvm, int cpuid, - unsigned int irq_num, bool level) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int is_edge, is_level; - int enabled; - bool ret = true; - - spin_lock(&dist->lock); - - vcpu = kvm_get_vcpu(kvm, cpuid); - is_edge = vgic_irq_is_edge(vcpu, irq_num); - is_level = !is_edge; - - if (!vgic_validate_injection(vcpu, irq_num, level)) { - ret = false; - goto out; - } - - if (irq_num >= VGIC_NR_PRIVATE_IRQS) { - cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS]; - vcpu = kvm_get_vcpu(kvm, cpuid); - } - - kvm_debug("Inject IRQ%d level %d CPU%d\n", irq_num, level, cpuid); - - if (level) - vgic_dist_irq_set(vcpu, irq_num); - else - vgic_dist_irq_clear(vcpu, irq_num); - - enabled = vgic_irq_is_enabled(vcpu, irq_num); - - if (!enabled) { - ret = false; - goto out; - } - - if (is_level && vgic_irq_is_active(vcpu, irq_num)) { - /* - * Level interrupt in progress, will be picked up - * when EOId. 
- */ - ret = false; - goto out; - } - - if (level) { - vgic_cpu_irq_set(vcpu, irq_num); - set_bit(cpuid, &dist->irq_pending_on_cpu); - } - -out: - spin_unlock(&dist->lock); - - return ret; -} - -/** - * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic - * @kvm: The VM structure pointer - * @cpuid: The CPU for PPIs - * @irq_num: The IRQ number that is assigned to the device - * @level: Edge-triggered: true: to trigger the interrupt - * false: to ignore the call - * Level-sensitive true: activates an interrupt - * false: deactivates an interrupt - * - * The GIC is not concerned with devices being active-LOW or active-HIGH for - * level-sensitive interrupts. You can think of the level parameter as 1 - * being HIGH and 0 being LOW and all devices being active-HIGH. - */ -int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num, - bool level) -{ - if (vgic_update_irq_state(kvm, cpuid, irq_num, level)) - vgic_kick_vcpus(kvm); - - return 0; -} - -static irqreturn_t vgic_maintenance_handler(int irq, void *data) -{ - /* - * We cannot rely on the vgic maintenance interrupt to be - * delivered synchronously. This means we can only use it to - * exit the VM, and we perform the handling of EOIed - * interrupts on the exit path (see vgic_process_maintenance). - */ - return IRQ_HANDLED; -} - -int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - int i; - - if (!irqchip_in_kernel(vcpu->kvm)) - return 0; - - if (vcpu->vcpu_id >= VGIC_MAX_CPUS) - return -EBUSY; - - for (i = 0; i < VGIC_NR_IRQS; i++) { - if (i < VGIC_NR_PPIS) - vgic_bitmap_set_irq_val(&dist->irq_enabled, - vcpu->vcpu_id, i, 1); - if (i < VGIC_NR_PRIVATE_IRQS) - vgic_bitmap_set_irq_val(&dist->irq_cfg, - vcpu->vcpu_id, i, VGIC_CFG_EDGE); - - vgic_cpu->vgic_irq_lr_map[i] = LR_EMPTY; - } - - /* - * By forcing VMCR to zero, the GIC will restore the binary - * points to their reset values. Anything else resets to zero - * anyway. - */ - vgic_cpu->vgic_vmcr = 0; - - vgic_cpu->nr_lr = vgic_nr_lr; - vgic_cpu->vgic_hcr = GICH_HCR_EN; /* Get the show on the road... 
*/ - - return 0; -} - -static void vgic_init_maintenance_interrupt(void *info) -{ - enable_percpu_irq(vgic_maint_irq, 0); -} - -static int vgic_cpu_notify(struct notifier_block *self, - unsigned long action, void *cpu) -{ - switch (action) { - case CPU_STARTING: - case CPU_STARTING_FROZEN: - vgic_init_maintenance_interrupt(NULL); - break; - case CPU_DYING: - case CPU_DYING_FROZEN: - disable_percpu_irq(vgic_maint_irq); - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block vgic_cpu_nb = { - .notifier_call = vgic_cpu_notify, -}; - -int kvm_vgic_hyp_init(void) -{ - int ret; - struct resource vctrl_res; - struct resource vcpu_res; - - vgic_node = of_find_compatible_node(NULL, NULL, "arm,cortex-a15-gic"); - if (!vgic_node) { - kvm_err("error: no compatible vgic node in DT\n"); - return -ENODEV; - } - - vgic_maint_irq = irq_of_parse_and_map(vgic_node, 0); - if (!vgic_maint_irq) { - kvm_err("error getting vgic maintenance irq from DT\n"); - ret = -ENXIO; - goto out; - } - - ret = request_percpu_irq(vgic_maint_irq, vgic_maintenance_handler, - "vgic", kvm_get_running_vcpus()); - if (ret) { - kvm_err("Cannot register interrupt %d\n", vgic_maint_irq); - goto out; - } - - ret = register_cpu_notifier(&vgic_cpu_nb); - if (ret) { - kvm_err("Cannot register vgic CPU notifier\n"); - goto out_free_irq; - } - - ret = of_address_to_resource(vgic_node, 2, &vctrl_res); - if (ret) { - kvm_err("Cannot obtain VCTRL resource\n"); - goto out_free_irq; - } - - vgic_vctrl_base = of_iomap(vgic_node, 2); - if (!vgic_vctrl_base) { - kvm_err("Cannot ioremap VCTRL\n"); - ret = -ENOMEM; - goto out_free_irq; - } - - vgic_nr_lr = readl_relaxed(vgic_vctrl_base + GICH_VTR); - vgic_nr_lr = (vgic_nr_lr & 0x3f) + 1; - - ret = create_hyp_io_mappings(vgic_vctrl_base, - vgic_vctrl_base + resource_size(&vctrl_res), - vctrl_res.start); - if (ret) { - kvm_err("Cannot map VCTRL into hyp\n"); - goto out_unmap; - } - - kvm_info("%s@%llx IRQ%d\n", vgic_node->name, - vctrl_res.start, vgic_maint_irq); - on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1); - - if (of_address_to_resource(vgic_node, 3, &vcpu_res)) { - kvm_err("Cannot obtain VCPU resource\n"); - ret = -ENXIO; - goto out_unmap; - } - vgic_vcpu_base = vcpu_res.start; - - goto out; - -out_unmap: - iounmap(vgic_vctrl_base); -out_free_irq: - free_percpu_irq(vgic_maint_irq, kvm_get_running_vcpus()); -out: - of_node_put(vgic_node); - return ret; -} - -int kvm_vgic_init(struct kvm *kvm) -{ - int ret = 0, i; - - mutex_lock(&kvm->lock); - - if (vgic_initialized(kvm)) - goto out; - - if (IS_VGIC_ADDR_UNDEF(kvm->arch.vgic.vgic_dist_base) || - IS_VGIC_ADDR_UNDEF(kvm->arch.vgic.vgic_cpu_base)) { - kvm_err("Need to set vgic cpu and dist addresses first\n"); - ret = -ENXIO; - goto out; - } - - ret = kvm_phys_addr_ioremap(kvm, kvm->arch.vgic.vgic_cpu_base, - vgic_vcpu_base, KVM_VGIC_V2_CPU_SIZE); - if (ret) { - kvm_err("Unable to remap VGIC CPU to VCPU\n"); - goto out; - } - - for (i = VGIC_NR_PRIVATE_IRQS; i < VGIC_NR_IRQS; i += 4) - vgic_set_target_reg(kvm, 0, i); - - kvm_timer_init(kvm); - kvm->arch.vgic.ready = true; -out: - mutex_unlock(&kvm->lock); - return ret; -} - -int kvm_vgic_create(struct kvm *kvm) -{ - int ret = 0; - - mutex_lock(&kvm->lock); - - if (atomic_read(&kvm->online_vcpus) || kvm->arch.vgic.vctrl_base) { - ret = -EEXIST; - goto out; - } - - spin_lock_init(&kvm->arch.vgic.lock); - kvm->arch.vgic.vctrl_base = vgic_vctrl_base; - kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; - kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; - -out: - mutex_unlock(&kvm->lock); 
- return ret; -} - -static bool vgic_ioaddr_overlap(struct kvm *kvm) -{ - phys_addr_t dist = kvm->arch.vgic.vgic_dist_base; - phys_addr_t cpu = kvm->arch.vgic.vgic_cpu_base; - - if (IS_VGIC_ADDR_UNDEF(dist) || IS_VGIC_ADDR_UNDEF(cpu)) - return 0; - if ((dist <= cpu && dist + KVM_VGIC_V2_DIST_SIZE > cpu) || - (cpu <= dist && cpu + KVM_VGIC_V2_CPU_SIZE > dist)) - return -EBUSY; - return 0; -} - -static int vgic_ioaddr_assign(struct kvm *kvm, phys_addr_t *ioaddr, - phys_addr_t addr, phys_addr_t size) -{ - int ret; - - if (!IS_VGIC_ADDR_UNDEF(*ioaddr)) - return -EEXIST; - if (addr + size < addr) - return -EINVAL; - - ret = vgic_ioaddr_overlap(kvm); - if (ret) - return ret; - *ioaddr = addr; - return ret; -} - -int kvm_vgic_set_addr(struct kvm *kvm, unsigned long type, u64 addr) -{ - int r = 0; - struct vgic_dist *vgic = &kvm->arch.vgic; - - if (addr & ~KVM_PHYS_MASK) - return -E2BIG; - - if (addr & (SZ_4K - 1)) - return -EINVAL; - - mutex_lock(&kvm->lock); - switch (type) { - case KVM_VGIC_V2_ADDR_TYPE_DIST: - r = vgic_ioaddr_assign(kvm, &vgic->vgic_dist_base, - addr, KVM_VGIC_V2_DIST_SIZE); - break; - case KVM_VGIC_V2_ADDR_TYPE_CPU: - r = vgic_ioaddr_assign(kvm, &vgic->vgic_cpu_base, - addr, KVM_VGIC_V2_CPU_SIZE); - break; - default: - r = -ENODEV; - } - - mutex_unlock(&kvm->lock); - return r; -} diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h new file mode 100644 index 0000000..68cb9e1 --- /dev/null +++ b/include/kvm/arm_arch_timer.h @@ -0,0 +1,85 @@ +/* + * Copyright (C) 2012 ARM Ltd. + * Author: Marc Zyngier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __ASM_ARM_KVM_ARCH_TIMER_H +#define __ASM_ARM_KVM_ARCH_TIMER_H + +#include +#include +#include + +struct arch_timer_kvm { +#ifdef CONFIG_KVM_ARM_TIMER + /* Is the timer enabled */ + bool enabled; + + /* Virtual offset */ + cycle_t cntvoff; +#endif +}; + +struct arch_timer_cpu { +#ifdef CONFIG_KVM_ARM_TIMER + /* Registers: control register, timer value */ + u32 cntv_ctl; /* Saved/restored */ + cycle_t cntv_cval; /* Saved/restored */ + + /* + * Anything that is not used directly from assembly code goes + * here. 
+ */ + + /* Background timer used when the guest is not running */ + struct hrtimer timer; + + /* Work queued with the above timer expires */ + struct work_struct expired; + + /* Background timer active */ + bool armed; + + /* Timer IRQ */ + const struct kvm_irq_level *irq; +#endif +}; + +#ifdef CONFIG_KVM_ARM_TIMER +int kvm_timer_hyp_init(void); +int kvm_timer_init(struct kvm *kvm); +void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); +void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu); +void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu); +void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu); +#else +static inline int kvm_timer_hyp_init(void) +{ + return 0; +}; + +static inline int kvm_timer_init(struct kvm *kvm) +{ + return 0; +} + +static inline void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) {} +static inline void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) {} +static inline void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) {} +static inline void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) {} +#endif + +#endif diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h new file mode 100644 index 0000000..343744e --- /dev/null +++ b/include/kvm/arm_vgic.h @@ -0,0 +1,220 @@ +/* + * Copyright (C) 2012 ARM Ltd. + * Author: Marc Zyngier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __ASM_ARM_KVM_VGIC_H +#define __ASM_ARM_KVM_VGIC_H + +#include +#include +#include +#include +#include +#include + +#define VGIC_NR_IRQS 128 +#define VGIC_NR_SGIS 16 +#define VGIC_NR_PPIS 16 +#define VGIC_NR_PRIVATE_IRQS (VGIC_NR_SGIS + VGIC_NR_PPIS) +#define VGIC_NR_SHARED_IRQS (VGIC_NR_IRQS - VGIC_NR_PRIVATE_IRQS) +#define VGIC_MAX_CPUS KVM_MAX_VCPUS +#define VGIC_MAX_LRS (1 << 6) + +/* Sanity checks... 
*/ +#if (VGIC_MAX_CPUS > 8) +#error Invalid number of CPU interfaces +#endif + +#if (VGIC_NR_IRQS & 31) +#error "VGIC_NR_IRQS must be a multiple of 32" +#endif + +#if (VGIC_NR_IRQS > 1024) +#error "VGIC_NR_IRQS must be <= 1024" +#endif + +/* + * The GIC distributor registers describing interrupts have two parts: + * - 32 per-CPU interrupts (SGI + PPI) + * - a bunch of shared interrupts (SPI) + */ +struct vgic_bitmap { + union { + u32 reg[VGIC_NR_PRIVATE_IRQS / 32]; + DECLARE_BITMAP(reg_ul, VGIC_NR_PRIVATE_IRQS); + } percpu[VGIC_MAX_CPUS]; + union { + u32 reg[VGIC_NR_SHARED_IRQS / 32]; + DECLARE_BITMAP(reg_ul, VGIC_NR_SHARED_IRQS); + } shared; +}; + +struct vgic_bytemap { + u32 percpu[VGIC_MAX_CPUS][VGIC_NR_PRIVATE_IRQS / 4]; + u32 shared[VGIC_NR_SHARED_IRQS / 4]; +}; + +struct vgic_dist { +#ifdef CONFIG_KVM_ARM_VGIC + spinlock_t lock; + bool ready; + + /* Virtual control interface mapping */ + void __iomem *vctrl_base; + + /* Distributor and vcpu interface mapping in the guest */ + phys_addr_t vgic_dist_base; + phys_addr_t vgic_cpu_base; + + /* Distributor enabled */ + u32 enabled; + + /* Interrupt enabled (one bit per IRQ) */ + struct vgic_bitmap irq_enabled; + + /* Interrupt 'pin' level */ + struct vgic_bitmap irq_state; + + /* Level-triggered interrupt in progress */ + struct vgic_bitmap irq_active; + + /* Interrupt priority. Not used yet. */ + struct vgic_bytemap irq_priority; + + /* Level/edge triggered */ + struct vgic_bitmap irq_cfg; + + /* Source CPU per SGI and target CPU */ + u8 irq_sgi_sources[VGIC_MAX_CPUS][VGIC_NR_SGIS]; + + /* Target CPU for each IRQ */ + u8 irq_spi_cpu[VGIC_NR_SHARED_IRQS]; + struct vgic_bitmap irq_spi_target[VGIC_MAX_CPUS]; + + /* Bitmap indicating which CPU has something pending */ + unsigned long irq_pending_on_cpu; +#endif +}; + +struct vgic_cpu { +#ifdef CONFIG_KVM_ARM_VGIC + /* per IRQ to LR mapping */ + u8 vgic_irq_lr_map[VGIC_NR_IRQS]; + + /* Pending interrupts on this VCPU */ + DECLARE_BITMAP( pending_percpu, VGIC_NR_PRIVATE_IRQS); + DECLARE_BITMAP( pending_shared, VGIC_NR_SHARED_IRQS); + + /* Bitmap of used/free list registers */ + DECLARE_BITMAP( lr_used, VGIC_MAX_LRS); + + /* Number of list registers on this CPU */ + int nr_lr; + + /* CPU vif control registers for world switch */ + u32 vgic_hcr; + u32 vgic_vmcr; + u32 vgic_misr; /* Saved only */ + u32 vgic_eisr[2]; /* Saved only */ + u32 vgic_elrsr[2]; /* Saved only */ + u32 vgic_apr; + u32 vgic_lr[VGIC_MAX_LRS]; +#endif +}; + +#define LR_EMPTY 0xff + +struct kvm; +struct kvm_vcpu; +struct kvm_run; +struct kvm_exit_mmio; + +#ifdef CONFIG_KVM_ARM_VGIC +int kvm_vgic_set_addr(struct kvm *kvm, unsigned long type, u64 addr); +int kvm_vgic_hyp_init(void); +int kvm_vgic_init(struct kvm *kvm); +int kvm_vgic_create(struct kvm *kvm); +int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu); +void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu); +void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu); +int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num, + bool level); +int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); +bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run, + struct kvm_exit_mmio *mmio); + +#define irqchip_in_kernel(k) (!!((k)->arch.vgic.vctrl_base)) +#define vgic_initialized(k) ((k)->arch.vgic.ready) + +#else +static inline int kvm_vgic_hyp_init(void) +{ + return 0; +} + +static inline int kvm_vgic_set_addr(struct kvm *kvm, unsigned long type, u64 addr) +{ + return 0; +} + +static inline int kvm_vgic_init(struct kvm *kvm) +{ + return 0; +} + +static inline int 
kvm_vgic_create(struct kvm *kvm) +{ + return 0; +} + +static inline int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) +{ + return 0; +} + +static inline void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) {} +static inline void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) {} + +static inline int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, + unsigned int irq_num, bool level) +{ + return 0; +} + +static inline int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) +{ + return 0; +} + +static inline bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run, + struct kvm_exit_mmio *mmio) +{ + return false; +} + +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + return 0; +} + +static inline bool vgic_initialized(struct kvm *kvm) +{ + return true; +} +#endif + +#endif diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c new file mode 100644 index 0000000..2d00b29 --- /dev/null +++ b/virt/kvm/arm/arch_timer.c @@ -0,0 +1,272 @@ +/* + * Copyright (C) 2012 ARM Ltd. + * Author: Marc Zyngier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +static struct timecounter *timecounter; +static struct workqueue_struct *wqueue; +static struct kvm_irq_level timer_irq = { + .level = 1, +}; + +static cycle_t kvm_phys_timer_read(void) +{ + return timecounter->cc->read(timecounter->cc); +} + +static bool timer_is_armed(struct arch_timer_cpu *timer) +{ + return timer->armed; +} + +/* timer_arm: as in "arm the timer", not as in ARM the company */ +static void timer_arm(struct arch_timer_cpu *timer, u64 ns) +{ + timer->armed = true; + hrtimer_start(&timer->timer, ktime_add_ns(ktime_get(), ns), + HRTIMER_MODE_ABS); +} + +static void timer_disarm(struct arch_timer_cpu *timer) +{ + if (timer_is_armed(timer)) { + hrtimer_cancel(&timer->timer); + cancel_work_sync(&timer->expired); + timer->armed = false; + } +} + +static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + timer->cntv_ctl |= ARCH_TIMER_CTRL_IT_MASK; + kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, + vcpu->arch.timer_cpu.irq->irq, + vcpu->arch.timer_cpu.irq->level); +} + +static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) +{ + struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; + + /* + * We disable the timer in the world switch and let it be + * handled by kvm_timer_sync_hwstate(). Getting a timer + * interrupt at this point is a sure sign of some major + * breakage. 
+ */ + pr_warn("Unexpected interrupt %d on vcpu %p\n", irq, vcpu); + return IRQ_HANDLED; +} + +static void kvm_timer_inject_irq_work(struct work_struct *work) +{ + struct kvm_vcpu *vcpu; + + vcpu = container_of(work, struct kvm_vcpu, arch.timer_cpu.expired); + vcpu->arch.timer_cpu.armed = false; + kvm_timer_inject_irq(vcpu); +} + +static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt) +{ + struct arch_timer_cpu *timer; + timer = container_of(hrt, struct arch_timer_cpu, timer); + queue_work(wqueue, &timer->expired); + return HRTIMER_NORESTART; +} + +/** + * kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu + * @vcpu: The vcpu pointer + * + * Disarm any pending soft timers, since the world-switch code will write the + * virtual timer state back to the physical CPU. + */ +void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + /* + * We're about to run this vcpu again, so there is no need to + * keep the background timer running, as we're about to + * populate the CPU timer again. + */ + timer_disarm(timer); +} + +/** + * kvm_timer_sync_hwstate - sync timer state from cpu + * @vcpu: The vcpu pointer + * + * Check if the virtual timer was armed and either schedule a corresponding + * soft timer or inject directly if already expired. + */ +void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + cycle_t cval, now; + u64 ns; + + if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) || + !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE)) + return; + + cval = timer->cntv_cval; + now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; + + BUG_ON(timer_is_armed(timer)); + + if (cval <= now) { + /* + * Timer has already expired while we were not + * looking. Inject the interrupt and carry on. 
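The arithmetic in kvm_timer_sync_hwstate() above is either "already expired, inject now" or "arm a soft timer for the remaining cycles converted to nanoseconds". Below is a rough user-space model of that decision; the 50 MHz frequency and the sample cval/now values are invented for illustration, and the kernel uses the timecounter's cyclecounter_cyc2ns() rather than this fixed-frequency helper.

/* Rough stand-alone model of the sync_hwstate decision: either the compare
 * value has already passed (inject immediately) or a soft timer is armed for
 * the remaining time. Assumes a fixed 50 MHz counter for illustration only. */
#include <stdint.h>
#include <stdio.h>

#define COUNTER_HZ 50000000ULL /* assumed, illustration only */

static uint64_t cycles_to_ns(uint64_t cyc)
{
	return cyc * 1000000000ULL / COUNTER_HZ;
}

int main(void)
{
	uint64_t cval = 1000000;	/* guest's CNTV_CVAL (made up) */
	uint64_t now  = 900000;		/* physical counter minus cntvoff (made up) */

	if (cval <= now)
		printf("expired: inject timer interrupt immediately\n");
	else
		printf("arm soft timer for %llu ns\n",
		       (unsigned long long)cycles_to_ns(cval - now));
	return 0;
}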
+ */ + kvm_timer_inject_irq(vcpu); + return; + } + + ns = cyclecounter_cyc2ns(timecounter->cc, cval - now); + timer_arm(timer, ns); +} + +void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); + hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + timer->timer.function = kvm_timer_expire; + timer->irq = &timer_irq; +} + +static void kvm_timer_init_interrupt(void *info) +{ + enable_percpu_irq(timer_irq.irq, 0); +} + + +static int kvm_timer_cpu_notify(struct notifier_block *self, + unsigned long action, void *cpu) +{ + switch (action) { + case CPU_STARTING: + case CPU_STARTING_FROZEN: + kvm_timer_init_interrupt(NULL); + break; + case CPU_DYING: + case CPU_DYING_FROZEN: + disable_percpu_irq(timer_irq.irq); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block kvm_timer_cpu_nb = { + .notifier_call = kvm_timer_cpu_notify, +}; + +static const struct of_device_id arch_timer_of_match[] = { + { .compatible = "arm,armv7-timer", }, + {}, +}; + +int kvm_timer_hyp_init(void) +{ + struct device_node *np; + unsigned int ppi; + int err; + + timecounter = arch_timer_get_timecounter(); + if (!timecounter) + return -ENODEV; + + np = of_find_matching_node(NULL, arch_timer_of_match); + if (!np) { + kvm_err("kvm_arch_timer: can't find DT node\n"); + return -ENODEV; + } + + ppi = irq_of_parse_and_map(np, 2); + if (!ppi) { + kvm_err("kvm_arch_timer: no virtual timer interrupt\n"); + err = -EINVAL; + goto out; + } + + err = request_percpu_irq(ppi, kvm_arch_timer_handler, + "kvm guest timer", kvm_get_running_vcpus()); + if (err) { + kvm_err("kvm_arch_timer: can't request interrupt %d (%d)\n", + ppi, err); + goto out; + } + + timer_irq.irq = ppi; + + err = register_cpu_notifier(&kvm_timer_cpu_nb); + if (err) { + kvm_err("Cannot register timer CPU notifier\n"); + goto out_free; + } + + wqueue = create_singlethread_workqueue("kvm_arch_timer"); + if (!wqueue) { + err = -ENOMEM; + goto out_free; + } + + kvm_info("%s IRQ%d\n", np->name, ppi); + on_each_cpu(kvm_timer_init_interrupt, NULL, 1); + + goto out; +out_free: + free_percpu_irq(ppi, kvm_get_running_vcpus()); +out: + of_node_put(np); + return err; +} + +void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + timer_disarm(timer); +} + +int kvm_timer_init(struct kvm *kvm) +{ + if (timecounter && wqueue) { + kvm->arch.timer.cntvoff = kvm_phys_timer_read(); + kvm->arch.timer.enabled = 1; + } + + return 0; +} diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c new file mode 100644 index 0000000..17c5ac7 --- /dev/null +++ b/virt/kvm/arm/vgic.c @@ -0,0 +1,1499 @@ +/* + * Copyright (C) 2012 ARM Ltd. + * Author: Marc Zyngier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+
+/*
+ * How the whole thing works (courtesy of Christoffer Dall):
+ *
+ * - At any time, the dist->irq_pending_on_cpu is the oracle that knows if
+ *   something is pending
+ * - VGIC pending interrupts are stored on the vgic.irq_state vgic
+ *   bitmap (this bitmap is updated by both user land ioctls and guest
+ *   mmio ops, and other in-kernel peripherals such as the
+ *   arch. timers) and indicate the 'wire' state.
+ * - Every time the bitmap changes, the irq_pending_on_cpu oracle is
+ *   recalculated
+ * - To calculate the oracle, we need info for each cpu from
+ *   compute_pending_for_cpu, which considers:
+ *   - PPI: dist->irq_state & dist->irq_enable
+ *   - SPI: dist->irq_state & dist->irq_enable & dist->irq_spi_target
+ *   - irq_spi_target is a 'formatted' version of the GICD_ICFGR
+ *     registers, stored on each vcpu. We only keep one bit of
+ *     information per interrupt, making sure that only one vcpu can
+ *     accept the interrupt.
+ * - The same is true when injecting an interrupt, except that we only
+ *   consider a single interrupt at a time. The irq_spi_cpu array
+ *   contains the target CPU for each SPI.
+ *
+ * The handling of level interrupts adds some extra complexity. We
+ * need to track when the interrupt has been EOIed, so we can sample
+ * the 'line' again. This is achieved as follows:
+ *
+ * - When a level interrupt is moved onto a vcpu, the corresponding
+ *   bit in irq_active is set. As long as this bit is set, the line
+ *   will be ignored for further interrupts. The interrupt is injected
+ *   into the vcpu with the GICH_LR_EOI bit set (generate a
+ *   maintenance interrupt on EOI).
+ * - When the interrupt is EOIed, the maintenance interrupt fires,
+ *   and clears the corresponding bit in irq_active. This allows the
+ *   interrupt line to be sampled again.
+ */ + +#define VGIC_ADDR_UNDEF (-1) +#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF) + +/* Physical address of vgic virtual cpu interface */ +static phys_addr_t vgic_vcpu_base; + +/* Virtual control interface base address */ +static void __iomem *vgic_vctrl_base; + +static struct device_node *vgic_node; + +#define ACCESS_READ_VALUE (1 << 0) +#define ACCESS_READ_RAZ (0 << 0) +#define ACCESS_READ_MASK(x) ((x) & (1 << 0)) +#define ACCESS_WRITE_IGNORED (0 << 1) +#define ACCESS_WRITE_SETBIT (1 << 1) +#define ACCESS_WRITE_CLEARBIT (2 << 1) +#define ACCESS_WRITE_VALUE (3 << 1) +#define ACCESS_WRITE_MASK(x) ((x) & (3 << 1)) + +static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu); +static void vgic_update_state(struct kvm *kvm); +static void vgic_kick_vcpus(struct kvm *kvm); +static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg); +static u32 vgic_nr_lr; + +static unsigned int vgic_maint_irq; + +static u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, + int cpuid, u32 offset) +{ + offset >>= 2; + if (!offset) + return x->percpu[cpuid].reg; + else + return x->shared.reg + offset - 1; +} + +static int vgic_bitmap_get_irq_val(struct vgic_bitmap *x, + int cpuid, int irq) +{ + if (irq < VGIC_NR_PRIVATE_IRQS) + return test_bit(irq, x->percpu[cpuid].reg_ul); + + return test_bit(irq - VGIC_NR_PRIVATE_IRQS, x->shared.reg_ul); +} + +static void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid, + int irq, int val) +{ + unsigned long *reg; + + if (irq < VGIC_NR_PRIVATE_IRQS) { + reg = x->percpu[cpuid].reg_ul; + } else { + reg = x->shared.reg_ul; + irq -= VGIC_NR_PRIVATE_IRQS; + } + + if (val) + set_bit(irq, reg); + else + clear_bit(irq, reg); +} + +static unsigned long *vgic_bitmap_get_cpu_map(struct vgic_bitmap *x, int cpuid) +{ + if (unlikely(cpuid >= VGIC_MAX_CPUS)) + return NULL; + return x->percpu[cpuid].reg_ul; +} + +static unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x) +{ + return x->shared.reg_ul; +} + +static u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset) +{ + offset >>= 2; + BUG_ON(offset > (VGIC_NR_IRQS / 4)); + if (offset < 4) + return x->percpu[cpuid] + offset; + else + return x->shared + offset - 8; +} + +#define VGIC_CFG_LEVEL 0 +#define VGIC_CFG_EDGE 1 + +static bool vgic_irq_is_edge(struct kvm_vcpu *vcpu, int irq) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + int irq_val; + + irq_val = vgic_bitmap_get_irq_val(&dist->irq_cfg, vcpu->vcpu_id, irq); + return irq_val == VGIC_CFG_EDGE; +} + +static int vgic_irq_is_enabled(struct kvm_vcpu *vcpu, int irq) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + return vgic_bitmap_get_irq_val(&dist->irq_enabled, vcpu->vcpu_id, irq); +} + +static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + return vgic_bitmap_get_irq_val(&dist->irq_active, vcpu->vcpu_id, irq); +} + +static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 1); +} + +static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 0); +} + +static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + return vgic_bitmap_get_irq_val(&dist->irq_state, vcpu->vcpu_id, irq); +} + +static void vgic_dist_irq_set(struct kvm_vcpu *vcpu, int 
irq) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + vgic_bitmap_set_irq_val(&dist->irq_state, vcpu->vcpu_id, irq, 1); +} + +static void vgic_dist_irq_clear(struct kvm_vcpu *vcpu, int irq) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + vgic_bitmap_set_irq_val(&dist->irq_state, vcpu->vcpu_id, irq, 0); +} + +static void vgic_cpu_irq_set(struct kvm_vcpu *vcpu, int irq) +{ + if (irq < VGIC_NR_PRIVATE_IRQS) + set_bit(irq, vcpu->arch.vgic_cpu.pending_percpu); + else + set_bit(irq - VGIC_NR_PRIVATE_IRQS, + vcpu->arch.vgic_cpu.pending_shared); +} + +static void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq) +{ + if (irq < VGIC_NR_PRIVATE_IRQS) + clear_bit(irq, vcpu->arch.vgic_cpu.pending_percpu); + else + clear_bit(irq - VGIC_NR_PRIVATE_IRQS, + vcpu->arch.vgic_cpu.pending_shared); +} + +static u32 mmio_data_read(struct kvm_exit_mmio *mmio, u32 mask) +{ + return *((u32 *)mmio->data) & mask; +} + +static void mmio_data_write(struct kvm_exit_mmio *mmio, u32 mask, u32 value) +{ + *((u32 *)mmio->data) = value & mask; +} + +/** + * vgic_reg_access - access vgic register + * @mmio: pointer to the data describing the mmio access + * @reg: pointer to the virtual backing of vgic distributor data + * @offset: least significant 2 bits used for word offset + * @mode: ACCESS_ mode (see defines above) + * + * Helper to make vgic register access easier using one of the access + * modes defined for vgic register access + * (read,raz,write-ignored,setbit,clearbit,write) + */ +static void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg, + phys_addr_t offset, int mode) +{ + int word_offset = (offset & 3) * 8; + u32 mask = (1UL << (mmio->len * 8)) - 1; + u32 regval; + + /* + * Any alignment fault should have been delivered to the guest + * directly (ARM ARM B3.12.7 "Prioritization of aborts"). 
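The read/write access modes documented for vgic_reg_access() above can be pictured with a small stand-alone model; apply_write(), the mask/offset values and the sample register below are illustrative stand-ins, not kernel code.

/* Stand-alone model of the SETBIT/CLEARBIT/VALUE write modes described
 * above, operating on a plain u32 instead of the vgic distributor state.
 * The mask/word_offset handling mirrors the byte-addressed registers. */
#include <stdint.h>
#include <stdio.h>

enum { WRITE_SETBIT, WRITE_CLEARBIT, WRITE_VALUE };

static uint32_t apply_write(uint32_t reg, uint32_t data, uint32_t mask,
			    int word_offset, int mode)
{
	data = (data & mask) << word_offset;

	switch (mode) {
	case WRITE_SETBIT:
		return reg | data;
	case WRITE_CLEARBIT:
		return reg & ~data;
	case WRITE_VALUE:
	default:
		return (reg & ~(mask << word_offset)) | data;
	}
}

int main(void)
{
	uint32_t enable = 0x000000f0;

	/* one-byte write of 0x0f at byte 2 of the register, set-bit semantics */
	enable = apply_write(enable, 0x0f, 0xff, 16, WRITE_SETBIT);
	printf("enable = %#x\n", enable); /* prints 0xf00f0 */
	return 0;
}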
+ */ + + if (reg) { + regval = *reg; + } else { + BUG_ON(mode != (ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED)); + regval = 0; + } + + if (mmio->is_write) { + u32 data = mmio_data_read(mmio, mask) << word_offset; + switch (ACCESS_WRITE_MASK(mode)) { + case ACCESS_WRITE_IGNORED: + return; + + case ACCESS_WRITE_SETBIT: + regval |= data; + break; + + case ACCESS_WRITE_CLEARBIT: + regval &= ~data; + break; + + case ACCESS_WRITE_VALUE: + regval = (regval & ~(mask << word_offset)) | data; + break; + } + *reg = regval; + } else { + switch (ACCESS_READ_MASK(mode)) { + case ACCESS_READ_RAZ: + regval = 0; + /* fall through */ + + case ACCESS_READ_VALUE: + mmio_data_write(mmio, mask, regval >> word_offset); + } + } +} + +static bool handle_mmio_misc(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, phys_addr_t offset) +{ + u32 reg; + u32 word_offset = offset & 3; + + switch (offset & ~3) { + case 0: /* CTLR */ + reg = vcpu->kvm->arch.vgic.enabled; + vgic_reg_access(mmio, ®, word_offset, + ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); + if (mmio->is_write) { + vcpu->kvm->arch.vgic.enabled = reg & 1; + vgic_update_state(vcpu->kvm); + return true; + } + break; + + case 4: /* TYPER */ + reg = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5; + reg |= (VGIC_NR_IRQS >> 5) - 1; + vgic_reg_access(mmio, ®, word_offset, + ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); + break; + + case 8: /* IIDR */ + reg = 0x4B00043B; + vgic_reg_access(mmio, ®, word_offset, + ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); + break; + } + + return false; +} + +static bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, phys_addr_t offset) +{ + vgic_reg_access(mmio, NULL, offset, + ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); + return false; +} + +static bool handle_mmio_set_enable_reg(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_enabled, + vcpu->vcpu_id, offset); + vgic_reg_access(mmio, reg, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT); + if (mmio->is_write) { + vgic_update_state(vcpu->kvm); + return true; + } + + return false; +} + +static bool handle_mmio_clear_enable_reg(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_enabled, + vcpu->vcpu_id, offset); + vgic_reg_access(mmio, reg, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT); + if (mmio->is_write) { + if (offset < 4) /* Force SGI enabled */ + *reg |= 0xffff; + vgic_retire_disabled_irqs(vcpu); + vgic_update_state(vcpu->kvm); + return true; + } + + return false; +} + +static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_state, + vcpu->vcpu_id, offset); + vgic_reg_access(mmio, reg, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT); + if (mmio->is_write) { + vgic_update_state(vcpu->kvm); + return true; + } + + return false; +} + +static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_state, + vcpu->vcpu_id, offset); + vgic_reg_access(mmio, reg, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT); + if (mmio->is_write) { + vgic_update_state(vcpu->kvm); + return true; + } + + return false; +} + +static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + u32 *reg = 
vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority, + vcpu->vcpu_id, offset); + vgic_reg_access(mmio, reg, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); + return false; +} + +#define GICD_ITARGETSR_SIZE 32 +#define GICD_CPUTARGETS_BITS 8 +#define GICD_IRQS_PER_ITARGETSR (GICD_ITARGETSR_SIZE / GICD_CPUTARGETS_BITS) +static u32 vgic_get_target_reg(struct kvm *kvm, int irq) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + int i, c; + unsigned long *bmap; + u32 val = 0; + + irq -= VGIC_NR_PRIVATE_IRQS; + + kvm_for_each_vcpu(c, vcpu, kvm) { + bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]); + for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) + if (test_bit(irq + i, bmap)) + val |= 1 << (c + i * 8); + } + + return val; +} + +static void vgic_set_target_reg(struct kvm *kvm, u32 val, int irq) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + int i, c; + unsigned long *bmap; + u32 target; + + irq -= VGIC_NR_PRIVATE_IRQS; + + /* + * Pick the LSB in each byte. This ensures we target exactly + * one vcpu per IRQ. If the byte is null, assume we target + * CPU0. + */ + for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) { + int shift = i * GICD_CPUTARGETS_BITS; + target = ffs((val >> shift) & 0xffU); + target = target ? (target - 1) : 0; + dist->irq_spi_cpu[irq + i] = target; + kvm_for_each_vcpu(c, vcpu, kvm) { + bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]); + if (c == target) + set_bit(irq + i, bmap); + else + clear_bit(irq + i, bmap); + } + } +} + +static bool handle_mmio_target_reg(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + u32 reg; + + /* We treat the banked interrupts targets as read-only */ + if (offset < 32) { + u32 roreg = 1 << vcpu->vcpu_id; + roreg |= roreg << 8; + roreg |= roreg << 16; + + vgic_reg_access(mmio, &roreg, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); + return false; + } + + reg = vgic_get_target_reg(vcpu->kvm, offset & ~3U); + vgic_reg_access(mmio, ®, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); + if (mmio->is_write) { + vgic_set_target_reg(vcpu->kvm, reg, offset & ~3U); + vgic_update_state(vcpu->kvm); + return true; + } + + return false; +} + +static u32 vgic_cfg_expand(u16 val) +{ + u32 res = 0; + int i; + + /* + * Turn a 16bit value like abcd...mnop into a 32bit word + * a0b0c0d0...m0n0o0p0, which is what the HW cfg register is. + */ + for (i = 0; i < 16; i++) + res |= ((val >> i) & VGIC_CFG_EDGE) << (2 * i + 1); + + return res; +} + +static u16 vgic_cfg_compress(u32 val) +{ + u16 res = 0; + int i; + + /* + * Turn a 32bit word a0b0c0d0...m0n0o0p0 into 16bit value like + * abcd...mnop which is what we really care about. + */ + for (i = 0; i < 16; i++) + res |= ((val >> (i * 2 + 1)) & VGIC_CFG_EDGE) << i; + + return res; +} + +/* + * The distributor uses 2 bits per IRQ for the CFG register, but the + * LSB is always 0. 
As such, we only keep the upper bit, and use the + * two above functions to compress/expand the bits + */ +static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, phys_addr_t offset) +{ + u32 val; + u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg, + vcpu->vcpu_id, offset >> 1); + if (offset & 2) + val = *reg >> 16; + else + val = *reg & 0xffff; + + val = vgic_cfg_expand(val); + vgic_reg_access(mmio, &val, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); + if (mmio->is_write) { + if (offset < 4) { + *reg = ~0U; /* Force PPIs/SGIs to 1 */ + return false; + } + + val = vgic_cfg_compress(val); + if (offset & 2) { + *reg &= 0xffff; + *reg |= val << 16; + } else { + *reg &= 0xffff << 16; + *reg |= val; + } + } + + return false; +} + +static bool handle_mmio_sgi_reg(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, phys_addr_t offset) +{ + u32 reg; + vgic_reg_access(mmio, ®, offset, + ACCESS_READ_RAZ | ACCESS_WRITE_VALUE); + if (mmio->is_write) { + vgic_dispatch_sgi(vcpu, reg); + vgic_update_state(vcpu->kvm); + return true; + } + + return false; +} + +/* + * I would have liked to use the kvm_bus_io_*() API instead, but it + * cannot cope with banked registers (only the VM pointer is passed + * around, and we need the vcpu). One of these days, someone please + * fix it! + */ +struct mmio_range { + phys_addr_t base; + unsigned long len; + bool (*handle_mmio)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, + phys_addr_t offset); +}; + +static const struct mmio_range vgic_ranges[] = { + { + .base = GIC_DIST_CTRL, + .len = 12, + .handle_mmio = handle_mmio_misc, + }, + { + .base = GIC_DIST_IGROUP, + .len = VGIC_NR_IRQS / 8, + .handle_mmio = handle_mmio_raz_wi, + }, + { + .base = GIC_DIST_ENABLE_SET, + .len = VGIC_NR_IRQS / 8, + .handle_mmio = handle_mmio_set_enable_reg, + }, + { + .base = GIC_DIST_ENABLE_CLEAR, + .len = VGIC_NR_IRQS / 8, + .handle_mmio = handle_mmio_clear_enable_reg, + }, + { + .base = GIC_DIST_PENDING_SET, + .len = VGIC_NR_IRQS / 8, + .handle_mmio = handle_mmio_set_pending_reg, + }, + { + .base = GIC_DIST_PENDING_CLEAR, + .len = VGIC_NR_IRQS / 8, + .handle_mmio = handle_mmio_clear_pending_reg, + }, + { + .base = GIC_DIST_ACTIVE_SET, + .len = VGIC_NR_IRQS / 8, + .handle_mmio = handle_mmio_raz_wi, + }, + { + .base = GIC_DIST_ACTIVE_CLEAR, + .len = VGIC_NR_IRQS / 8, + .handle_mmio = handle_mmio_raz_wi, + }, + { + .base = GIC_DIST_PRI, + .len = VGIC_NR_IRQS, + .handle_mmio = handle_mmio_priority_reg, + }, + { + .base = GIC_DIST_TARGET, + .len = VGIC_NR_IRQS, + .handle_mmio = handle_mmio_target_reg, + }, + { + .base = GIC_DIST_CONFIG, + .len = VGIC_NR_IRQS / 4, + .handle_mmio = handle_mmio_cfg_reg, + }, + { + .base = GIC_DIST_SOFTINT, + .len = 4, + .handle_mmio = handle_mmio_sgi_reg, + }, + {} +}; + +static const +struct mmio_range *find_matching_range(const struct mmio_range *ranges, + struct kvm_exit_mmio *mmio, + phys_addr_t base) +{ + const struct mmio_range *r = ranges; + phys_addr_t addr = mmio->phys_addr - base; + + while (r->len) { + if (addr >= r->base && + (addr + mmio->len) <= (r->base + r->len)) + return r; + r++; + } + + return NULL; +} + +/** + * vgic_handle_mmio - handle an in-kernel MMIO access + * @vcpu: pointer to the vcpu performing the access + * @run: pointer to the kvm_run structure + * @mmio: pointer to the data describing the access + * + * returns true if the MMIO access has been performed in kernel space, + * and false if it needs to be emulated in user space. 
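The dispatch performed by find_matching_range() above is a plain window match of the access offset against the table of distributor registers. Here is a minimal stand-alone sketch of that lookup, assuming only three windows and 128 IRQs; the offsets are the GICv2 distributor ones quoted in the table, everything else is a stand-in.

/* Minimal model of the offset-based dispatch: the access offset (relative to
 * the distributor base) must fall entirely inside a [base, base+len) window. */
#include <stddef.h>
#include <stdio.h>

struct range {
	unsigned long base;
	unsigned long len;
	const char *name;
};

static const struct range ranges[] = {
	{ 0x000, 12,      "misc (CTLR/TYPER/IIDR)" },
	{ 0x100, 128 / 8, "set-enable" },	/* assuming 128 IRQs */
	{ 0x180, 128 / 8, "clear-enable" },
	{ 0, 0, NULL },
};

static const struct range *find_range(unsigned long offset, size_t len)
{
	const struct range *r;

	for (r = ranges; r->len; r++)
		if (offset >= r->base && offset + len <= r->base + r->len)
			return r;
	return NULL;
}

int main(void)
{
	const struct range *r = find_range(0x104, 4);

	printf("%s\n", r ? r->name : "unhandled"); /* prints "set-enable" */
	return 0;
}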
+ */ +bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run, + struct kvm_exit_mmio *mmio) +{ + const struct mmio_range *range; + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + unsigned long base = dist->vgic_dist_base; + bool updated_state; + unsigned long offset; + + if (!irqchip_in_kernel(vcpu->kvm) || + mmio->phys_addr < base || + (mmio->phys_addr + mmio->len) > (base + KVM_VGIC_V2_DIST_SIZE)) + return false; + + /* We don't support ldrd / strd or ldm / stm to the emulated vgic */ + if (mmio->len > 4) { + kvm_inject_dabt(vcpu, mmio->phys_addr); + return true; + } + + range = find_matching_range(vgic_ranges, mmio, base); + if (unlikely(!range || !range->handle_mmio)) { + pr_warn("Unhandled access %d %08llx %d\n", + mmio->is_write, mmio->phys_addr, mmio->len); + return false; + } + + spin_lock(&vcpu->kvm->arch.vgic.lock); + offset = mmio->phys_addr - range->base - base; + updated_state = range->handle_mmio(vcpu, mmio, offset); + spin_unlock(&vcpu->kvm->arch.vgic.lock); + kvm_prepare_mmio(run, mmio); + kvm_handle_mmio_return(vcpu, run); + + if (updated_state) + vgic_kick_vcpus(vcpu->kvm); + + return true; +} + +static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg) +{ + struct kvm *kvm = vcpu->kvm; + struct vgic_dist *dist = &kvm->arch.vgic; + int nrcpus = atomic_read(&kvm->online_vcpus); + u8 target_cpus; + int sgi, mode, c, vcpu_id; + + vcpu_id = vcpu->vcpu_id; + + sgi = reg & 0xf; + target_cpus = (reg >> 16) & 0xff; + mode = (reg >> 24) & 3; + + switch (mode) { + case 0: + if (!target_cpus) + return; + + case 1: + target_cpus = ((1 << nrcpus) - 1) & ~(1 << vcpu_id) & 0xff; + break; + + case 2: + target_cpus = 1 << vcpu_id; + break; + } + + kvm_for_each_vcpu(c, vcpu, kvm) { + if (target_cpus & 1) { + /* Flag the SGI as pending */ + vgic_dist_irq_set(vcpu, sgi); + dist->irq_sgi_sources[c][sgi] |= 1 << vcpu_id; + kvm_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c); + } + + target_cpus >>= 1; + } +} + +static int compute_pending_for_cpu(struct kvm_vcpu *vcpu) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + unsigned long *pending, *enabled, *pend_percpu, *pend_shared; + unsigned long pending_private, pending_shared; + int vcpu_id; + + vcpu_id = vcpu->vcpu_id; + pend_percpu = vcpu->arch.vgic_cpu.pending_percpu; + pend_shared = vcpu->arch.vgic_cpu.pending_shared; + + pending = vgic_bitmap_get_cpu_map(&dist->irq_state, vcpu_id); + enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id); + bitmap_and(pend_percpu, pending, enabled, VGIC_NR_PRIVATE_IRQS); + + pending = vgic_bitmap_get_shared_map(&dist->irq_state); + enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled); + bitmap_and(pend_shared, pending, enabled, VGIC_NR_SHARED_IRQS); + bitmap_and(pend_shared, pend_shared, + vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]), + VGIC_NR_SHARED_IRQS); + + pending_private = find_first_bit(pend_percpu, VGIC_NR_PRIVATE_IRQS); + pending_shared = find_first_bit(pend_shared, VGIC_NR_SHARED_IRQS); + return (pending_private < VGIC_NR_PRIVATE_IRQS || + pending_shared < VGIC_NR_SHARED_IRQS); +} + +/* + * Update the interrupt state and determine which CPUs have pending + * interrupts. Must be called with distributor lock held. 
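The per-cpu pending computation above (compute_pending_for_cpu) boils down to a pair of bitmap ANDs. A minimal stand-alone sketch of the idea follows, with plain 64-bit words and made-up IRQ numbers standing in for the vgic_bitmap structures.

/* Stand-alone sketch of the "anything pending for this cpu?" oracle: raw line
 * state AND enable bits, plus the per-cpu SPI target bits for shared IRQs.
 * Bits 0-31 stand in for private interrupts, higher bits for shared ones. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool cpu_has_pending(uint64_t state, uint64_t enabled,
			    uint64_t spi_target_for_this_cpu)
{
	uint64_t private_pending = (state & enabled) & 0xffffffffULL;
	uint64_t shared_pending  = (state & enabled &
				    spi_target_for_this_cpu) & ~0xffffffffULL;

	return private_pending || shared_pending;
}

int main(void)
{
	/* IRQ 35 is pending and enabled, but targets only the second cpu */
	uint64_t state = 1ULL << 35, enabled = 1ULL << 35;

	printf("cpu0: %d\n", cpu_has_pending(state, enabled, 0));		/* 0 */
	printf("cpu1: %d\n", cpu_has_pending(state, enabled, 1ULL << 35));	/* 1 */
	return 0;
}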
+ */ +static void vgic_update_state(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + int c; + + if (!dist->enabled) { + set_bit(0, &dist->irq_pending_on_cpu); + return; + } + + kvm_for_each_vcpu(c, vcpu, kvm) { + if (compute_pending_for_cpu(vcpu)) { + pr_debug("CPU%d has pending interrupts\n", c); + set_bit(c, &dist->irq_pending_on_cpu); + } + } +} + +#define LR_CPUID(lr) \ + (((lr) & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT) +#define MK_LR_PEND(src, irq) \ + (GICH_LR_PENDING_BIT | ((src) << GICH_LR_PHYSID_CPUID_SHIFT) | (irq)) + +/* + * An interrupt may have been disabled after being made pending on the + * CPU interface (the classic case is a timer running while we're + * rebooting the guest - the interrupt would kick as soon as the CPU + * interface gets enabled, with deadly consequences). + * + * The solution is to examine already active LRs, and check the + * interrupt is still enabled. If not, just retire it. + */ +static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + int lr; + + for_each_set_bit(lr, vgic_cpu->lr_used, vgic_cpu->nr_lr) { + int irq = vgic_cpu->vgic_lr[lr] & GICH_LR_VIRTUALID; + + if (!vgic_irq_is_enabled(vcpu, irq)) { + vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY; + clear_bit(lr, vgic_cpu->lr_used); + vgic_cpu->vgic_lr[lr] &= ~GICH_LR_STATE; + if (vgic_irq_is_active(vcpu, irq)) + vgic_irq_clear_active(vcpu, irq); + } + } +} + +/* + * Queue an interrupt to a CPU virtual interface. Return true on success, + * or false if it wasn't possible to queue it. + */ +static bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + int lr; + + /* Sanitize the input... */ + BUG_ON(sgi_source_id & ~7); + BUG_ON(sgi_source_id && irq >= VGIC_NR_SGIS); + BUG_ON(irq >= VGIC_NR_IRQS); + + kvm_debug("Queue IRQ%d\n", irq); + + lr = vgic_cpu->vgic_irq_lr_map[irq]; + + /* Do we have an active interrupt for the same CPUID? */ + if (lr != LR_EMPTY && + (LR_CPUID(vgic_cpu->vgic_lr[lr]) == sgi_source_id)) { + kvm_debug("LR%d piggyback for IRQ%d %x\n", + lr, irq, vgic_cpu->vgic_lr[lr]); + BUG_ON(!test_bit(lr, vgic_cpu->lr_used)); + vgic_cpu->vgic_lr[lr] |= GICH_LR_PENDING_BIT; + return true; + } + + /* Try to use another LR for this interrupt */ + lr = find_first_zero_bit((unsigned long *)vgic_cpu->lr_used, + vgic_cpu->nr_lr); + if (lr >= vgic_cpu->nr_lr) + return false; + + kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id); + vgic_cpu->vgic_lr[lr] = MK_LR_PEND(sgi_source_id, irq); + vgic_cpu->vgic_irq_lr_map[irq] = lr; + set_bit(lr, vgic_cpu->lr_used); + + if (!vgic_irq_is_edge(vcpu, irq)) + vgic_cpu->vgic_lr[lr] |= GICH_LR_EOI; + + return true; +} + +static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, int irq) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + unsigned long sources; + int vcpu_id = vcpu->vcpu_id; + int c; + + sources = dist->irq_sgi_sources[vcpu_id][irq]; + + for_each_set_bit(c, &sources, VGIC_MAX_CPUS) { + if (vgic_queue_irq(vcpu, c, irq)) + clear_bit(c, &sources); + } + + dist->irq_sgi_sources[vcpu_id][irq] = sources; + + /* + * If the sources bitmap has been cleared it means that we + * could queue all the SGIs onto link registers (see the + * clear_bit above), and therefore we are done with them in + * our emulated gic and can get rid of them. 
+ */ + if (!sources) { + vgic_dist_irq_clear(vcpu, irq); + vgic_cpu_irq_clear(vcpu, irq); + return true; + } + + return false; +} + +static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq) +{ + if (vgic_irq_is_active(vcpu, irq)) + return true; /* level interrupt, already queued */ + + if (vgic_queue_irq(vcpu, 0, irq)) { + if (vgic_irq_is_edge(vcpu, irq)) { + vgic_dist_irq_clear(vcpu, irq); + vgic_cpu_irq_clear(vcpu, irq); + } else { + vgic_irq_set_active(vcpu, irq); + } + + return true; + } + + return false; +} + +/* + * Fill the list registers with pending interrupts before running the + * guest. + */ +static void __kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + int i, vcpu_id; + int overflow = 0; + + vcpu_id = vcpu->vcpu_id; + + /* + * We may not have any pending interrupt, or the interrupts + * may have been serviced from another vcpu. In all cases, + * move along. + */ + if (!kvm_vgic_vcpu_pending_irq(vcpu)) { + pr_debug("CPU%d has no pending interrupt\n", vcpu_id); + goto epilog; + } + + /* SGIs */ + for_each_set_bit(i, vgic_cpu->pending_percpu, VGIC_NR_SGIS) { + if (!vgic_queue_sgi(vcpu, i)) + overflow = 1; + } + + /* PPIs */ + for_each_set_bit_from(i, vgic_cpu->pending_percpu, VGIC_NR_PRIVATE_IRQS) { + if (!vgic_queue_hwirq(vcpu, i)) + overflow = 1; + } + + /* SPIs */ + for_each_set_bit(i, vgic_cpu->pending_shared, VGIC_NR_SHARED_IRQS) { + if (!vgic_queue_hwirq(vcpu, i + VGIC_NR_PRIVATE_IRQS)) + overflow = 1; + } + +epilog: + if (overflow) { + vgic_cpu->vgic_hcr |= GICH_HCR_UIE; + } else { + vgic_cpu->vgic_hcr &= ~GICH_HCR_UIE; + /* + * We're about to run this VCPU, and we've consumed + * everything the distributor had in store for + * us. Claim we don't have anything pending. We'll + * adjust that if needed while exiting. + */ + clear_bit(vcpu_id, &dist->irq_pending_on_cpu); + } +} + +static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + bool level_pending = false; + + kvm_debug("MISR = %08x\n", vgic_cpu->vgic_misr); + + if (vgic_cpu->vgic_misr & GICH_MISR_EOI) { + /* + * Some level interrupts have been EOIed. Clear their + * active bit. + */ + int lr, irq; + + for_each_set_bit(lr, (unsigned long *)vgic_cpu->vgic_eisr, + vgic_cpu->nr_lr) { + irq = vgic_cpu->vgic_lr[lr] & GICH_LR_VIRTUALID; + + vgic_irq_clear_active(vcpu, irq); + vgic_cpu->vgic_lr[lr] &= ~GICH_LR_EOI; + + /* Any additional pending interrupt? */ + if (vgic_dist_irq_is_pending(vcpu, irq)) { + vgic_cpu_irq_set(vcpu, irq); + level_pending = true; + } else { + vgic_cpu_irq_clear(vcpu, irq); + } + + /* + * Despite being EOIed, the LR may not have + * been marked as empty. + */ + set_bit(lr, (unsigned long *)vgic_cpu->vgic_elrsr); + vgic_cpu->vgic_lr[lr] &= ~GICH_LR_ACTIVE_BIT; + } + } + + if (vgic_cpu->vgic_misr & GICH_MISR_U) + vgic_cpu->vgic_hcr &= ~GICH_HCR_UIE; + + return level_pending; +} + +/* + * Sync back the VGIC state after a guest run. The distributor lock is + * needed so we don't get preempted in the middle of the state processing. 
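The flush and sync paths bracket every guest run: flush moves pending distributor state into the list registers before entry, and sync folds the maintenance results back into the distributor after exit. A stand-alone sketch of that ordering, in which run_guest(), the vcpu type and the printed messages are illustrative stand-ins:

/* Sketch of how the flush/sync pair surrounds each guest run; only the
 * ordering mirrors the code above, every function here is a stub. */
#include <stdio.h>

struct vcpu { int id; };

static void vgic_flush(struct vcpu *v) { printf("fill LRs for vcpu%d\n", v->id); }
static void run_guest(struct vcpu *v)  { printf("enter guest on vcpu%d\n", v->id); }
static void vgic_sync(struct vcpu *v)  { printf("process maintenance, retire LRs for vcpu%d\n", v->id); }

int main(void)
{
	struct vcpu v = { 0 };

	for (int i = 0; i < 2; i++) {
		vgic_flush(&v);	/* kvm_vgic_flush_hwstate(): distributor -> LRs */
		run_guest(&v);	/* world switch */
		vgic_sync(&v);	/* kvm_vgic_sync_hwstate(): LRs -> distributor */
	}
	return 0;
}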
+ */ +static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + int lr, pending; + bool level_pending; + + level_pending = vgic_process_maintenance(vcpu); + + /* Clear mappings for empty LRs */ + for_each_set_bit(lr, (unsigned long *)vgic_cpu->vgic_elrsr, + vgic_cpu->nr_lr) { + int irq; + + if (!test_and_clear_bit(lr, vgic_cpu->lr_used)) + continue; + + irq = vgic_cpu->vgic_lr[lr] & GICH_LR_VIRTUALID; + + BUG_ON(irq >= VGIC_NR_IRQS); + vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY; + } + + /* Check if we still have something up our sleeve... */ + pending = find_first_zero_bit((unsigned long *)vgic_cpu->vgic_elrsr, + vgic_cpu->nr_lr); + if (level_pending || pending < vgic_cpu->nr_lr) + set_bit(vcpu->vcpu_id, &dist->irq_pending_on_cpu); +} + +void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + if (!irqchip_in_kernel(vcpu->kvm)) + return; + + spin_lock(&dist->lock); + __kvm_vgic_flush_hwstate(vcpu); + spin_unlock(&dist->lock); +} + +void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + if (!irqchip_in_kernel(vcpu->kvm)) + return; + + spin_lock(&dist->lock); + __kvm_vgic_sync_hwstate(vcpu); + spin_unlock(&dist->lock); +} + +int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + if (!irqchip_in_kernel(vcpu->kvm)) + return 0; + + return test_bit(vcpu->vcpu_id, &dist->irq_pending_on_cpu); +} + +static void vgic_kick_vcpus(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + int c; + + /* + * We've injected an interrupt, time to find out who deserves + * a good kick... + */ + kvm_for_each_vcpu(c, vcpu, kvm) { + if (kvm_vgic_vcpu_pending_irq(vcpu)) + kvm_vcpu_kick(vcpu); + } +} + +static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level) +{ + int is_edge = vgic_irq_is_edge(vcpu, irq); + int state = vgic_dist_irq_is_pending(vcpu, irq); + + /* + * Only inject an interrupt if: + * - edge triggered and we have a rising edge + * - level triggered and we change level + */ + if (is_edge) + return level > state; + else + return level != state; +} + +static bool vgic_update_irq_state(struct kvm *kvm, int cpuid, + unsigned int irq_num, bool level) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + int is_edge, is_level; + int enabled; + bool ret = true; + + spin_lock(&dist->lock); + + vcpu = kvm_get_vcpu(kvm, cpuid); + is_edge = vgic_irq_is_edge(vcpu, irq_num); + is_level = !is_edge; + + if (!vgic_validate_injection(vcpu, irq_num, level)) { + ret = false; + goto out; + } + + if (irq_num >= VGIC_NR_PRIVATE_IRQS) { + cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS]; + vcpu = kvm_get_vcpu(kvm, cpuid); + } + + kvm_debug("Inject IRQ%d level %d CPU%d\n", irq_num, level, cpuid); + + if (level) + vgic_dist_irq_set(vcpu, irq_num); + else + vgic_dist_irq_clear(vcpu, irq_num); + + enabled = vgic_irq_is_enabled(vcpu, irq_num); + + if (!enabled) { + ret = false; + goto out; + } + + if (is_level && vgic_irq_is_active(vcpu, irq_num)) { + /* + * Level interrupt in progress, will be picked up + * when EOId. 
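The filter applied by vgic_validate_injection() above reduces to a small truth table: edge interrupts latch only on a rising edge, level interrupts only on a change of line state. A stand-alone rendering of that rule, with made-up sample transitions:

/* The injection filter in truth-table form; valid_injection() mirrors the
 * edge/level test above and exists purely for illustration. */
#include <stdbool.h>
#include <stdio.h>

static bool valid_injection(bool is_edge, int old_state, int new_level)
{
	return is_edge ? new_level > old_state : new_level != old_state;
}

int main(void)
{
	printf("edge,  0->1: %d\n", valid_injection(true, 0, 1));	/* 1 */
	printf("edge,  1->1: %d\n", valid_injection(true, 1, 1));	/* 0 */
	printf("level, 1->0: %d\n", valid_injection(false, 1, 0));	/* 1 */
	printf("level, 0->0: %d\n", valid_injection(false, 0, 0));	/* 0 */
	return 0;
}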
+ */ + ret = false; + goto out; + } + + if (level) { + vgic_cpu_irq_set(vcpu, irq_num); + set_bit(cpuid, &dist->irq_pending_on_cpu); + } + +out: + spin_unlock(&dist->lock); + + return ret; +} + +/** + * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic + * @kvm: The VM structure pointer + * @cpuid: The CPU for PPIs + * @irq_num: The IRQ number that is assigned to the device + * @level: Edge-triggered: true: to trigger the interrupt + * false: to ignore the call + * Level-sensitive true: activates an interrupt + * false: deactivates an interrupt + * + * The GIC is not concerned with devices being active-LOW or active-HIGH for + * level-sensitive interrupts. You can think of the level parameter as 1 + * being HIGH and 0 being LOW and all devices being active-HIGH. + */ +int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num, + bool level) +{ + if (vgic_update_irq_state(kvm, cpuid, irq_num, level)) + vgic_kick_vcpus(kvm); + + return 0; +} + +static irqreturn_t vgic_maintenance_handler(int irq, void *data) +{ + /* + * We cannot rely on the vgic maintenance interrupt to be + * delivered synchronously. This means we can only use it to + * exit the VM, and we perform the handling of EOIed + * interrupts on the exit path (see vgic_process_maintenance). + */ + return IRQ_HANDLED; +} + +int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + int i; + + if (!irqchip_in_kernel(vcpu->kvm)) + return 0; + + if (vcpu->vcpu_id >= VGIC_MAX_CPUS) + return -EBUSY; + + for (i = 0; i < VGIC_NR_IRQS; i++) { + if (i < VGIC_NR_PPIS) + vgic_bitmap_set_irq_val(&dist->irq_enabled, + vcpu->vcpu_id, i, 1); + if (i < VGIC_NR_PRIVATE_IRQS) + vgic_bitmap_set_irq_val(&dist->irq_cfg, + vcpu->vcpu_id, i, VGIC_CFG_EDGE); + + vgic_cpu->vgic_irq_lr_map[i] = LR_EMPTY; + } + + /* + * By forcing VMCR to zero, the GIC will restore the binary + * points to their reset values. Anything else resets to zero + * anyway. + */ + vgic_cpu->vgic_vmcr = 0; + + vgic_cpu->nr_lr = vgic_nr_lr; + vgic_cpu->vgic_hcr = GICH_HCR_EN; /* Get the show on the road... 
*/ + + return 0; +} + +static void vgic_init_maintenance_interrupt(void *info) +{ + enable_percpu_irq(vgic_maint_irq, 0); +} + +static int vgic_cpu_notify(struct notifier_block *self, + unsigned long action, void *cpu) +{ + switch (action) { + case CPU_STARTING: + case CPU_STARTING_FROZEN: + vgic_init_maintenance_interrupt(NULL); + break; + case CPU_DYING: + case CPU_DYING_FROZEN: + disable_percpu_irq(vgic_maint_irq); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block vgic_cpu_nb = { + .notifier_call = vgic_cpu_notify, +}; + +int kvm_vgic_hyp_init(void) +{ + int ret; + struct resource vctrl_res; + struct resource vcpu_res; + + vgic_node = of_find_compatible_node(NULL, NULL, "arm,cortex-a15-gic"); + if (!vgic_node) { + kvm_err("error: no compatible vgic node in DT\n"); + return -ENODEV; + } + + vgic_maint_irq = irq_of_parse_and_map(vgic_node, 0); + if (!vgic_maint_irq) { + kvm_err("error getting vgic maintenance irq from DT\n"); + ret = -ENXIO; + goto out; + } + + ret = request_percpu_irq(vgic_maint_irq, vgic_maintenance_handler, + "vgic", kvm_get_running_vcpus()); + if (ret) { + kvm_err("Cannot register interrupt %d\n", vgic_maint_irq); + goto out; + } + + ret = register_cpu_notifier(&vgic_cpu_nb); + if (ret) { + kvm_err("Cannot register vgic CPU notifier\n"); + goto out_free_irq; + } + + ret = of_address_to_resource(vgic_node, 2, &vctrl_res); + if (ret) { + kvm_err("Cannot obtain VCTRL resource\n"); + goto out_free_irq; + } + + vgic_vctrl_base = of_iomap(vgic_node, 2); + if (!vgic_vctrl_base) { + kvm_err("Cannot ioremap VCTRL\n"); + ret = -ENOMEM; + goto out_free_irq; + } + + vgic_nr_lr = readl_relaxed(vgic_vctrl_base + GICH_VTR); + vgic_nr_lr = (vgic_nr_lr & 0x3f) + 1; + + ret = create_hyp_io_mappings(vgic_vctrl_base, + vgic_vctrl_base + resource_size(&vctrl_res), + vctrl_res.start); + if (ret) { + kvm_err("Cannot map VCTRL into hyp\n"); + goto out_unmap; + } + + kvm_info("%s@%llx IRQ%d\n", vgic_node->name, + vctrl_res.start, vgic_maint_irq); + on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1); + + if (of_address_to_resource(vgic_node, 3, &vcpu_res)) { + kvm_err("Cannot obtain VCPU resource\n"); + ret = -ENXIO; + goto out_unmap; + } + vgic_vcpu_base = vcpu_res.start; + + goto out; + +out_unmap: + iounmap(vgic_vctrl_base); +out_free_irq: + free_percpu_irq(vgic_maint_irq, kvm_get_running_vcpus()); +out: + of_node_put(vgic_node); + return ret; +} + +int kvm_vgic_init(struct kvm *kvm) +{ + int ret = 0, i; + + mutex_lock(&kvm->lock); + + if (vgic_initialized(kvm)) + goto out; + + if (IS_VGIC_ADDR_UNDEF(kvm->arch.vgic.vgic_dist_base) || + IS_VGIC_ADDR_UNDEF(kvm->arch.vgic.vgic_cpu_base)) { + kvm_err("Need to set vgic cpu and dist addresses first\n"); + ret = -ENXIO; + goto out; + } + + ret = kvm_phys_addr_ioremap(kvm, kvm->arch.vgic.vgic_cpu_base, + vgic_vcpu_base, KVM_VGIC_V2_CPU_SIZE); + if (ret) { + kvm_err("Unable to remap VGIC CPU to VCPU\n"); + goto out; + } + + for (i = VGIC_NR_PRIVATE_IRQS; i < VGIC_NR_IRQS; i += 4) + vgic_set_target_reg(kvm, 0, i); + + kvm_timer_init(kvm); + kvm->arch.vgic.ready = true; +out: + mutex_unlock(&kvm->lock); + return ret; +} + +int kvm_vgic_create(struct kvm *kvm) +{ + int ret = 0; + + mutex_lock(&kvm->lock); + + if (atomic_read(&kvm->online_vcpus) || kvm->arch.vgic.vctrl_base) { + ret = -EEXIST; + goto out; + } + + spin_lock_init(&kvm->arch.vgic.lock); + kvm->arch.vgic.vctrl_base = vgic_vctrl_base; + kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; + kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; + +out: + mutex_unlock(&kvm->lock); 
+ return ret; +} + +static bool vgic_ioaddr_overlap(struct kvm *kvm) +{ + phys_addr_t dist = kvm->arch.vgic.vgic_dist_base; + phys_addr_t cpu = kvm->arch.vgic.vgic_cpu_base; + + if (IS_VGIC_ADDR_UNDEF(dist) || IS_VGIC_ADDR_UNDEF(cpu)) + return 0; + if ((dist <= cpu && dist + KVM_VGIC_V2_DIST_SIZE > cpu) || + (cpu <= dist && cpu + KVM_VGIC_V2_CPU_SIZE > dist)) + return -EBUSY; + return 0; +} + +static int vgic_ioaddr_assign(struct kvm *kvm, phys_addr_t *ioaddr, + phys_addr_t addr, phys_addr_t size) +{ + int ret; + + if (!IS_VGIC_ADDR_UNDEF(*ioaddr)) + return -EEXIST; + if (addr + size < addr) + return -EINVAL; + + ret = vgic_ioaddr_overlap(kvm); + if (ret) + return ret; + *ioaddr = addr; + return ret; +} + +int kvm_vgic_set_addr(struct kvm *kvm, unsigned long type, u64 addr) +{ + int r = 0; + struct vgic_dist *vgic = &kvm->arch.vgic; + + if (addr & ~KVM_PHYS_MASK) + return -E2BIG; + + if (addr & (SZ_4K - 1)) + return -EINVAL; + + mutex_lock(&kvm->lock); + switch (type) { + case KVM_VGIC_V2_ADDR_TYPE_DIST: + r = vgic_ioaddr_assign(kvm, &vgic->vgic_dist_base, + addr, KVM_VGIC_V2_DIST_SIZE); + break; + case KVM_VGIC_V2_ADDR_TYPE_CPU: + r = vgic_ioaddr_assign(kvm, &vgic->vgic_cpu_base, + addr, KVM_VGIC_V2_CPU_SIZE); + break; + default: + r = -ENODEV; + } + + mutex_unlock(&kvm->lock); + return r; +} -- cgit v0.10.2 From 535cf7b3b13c7faed3dfabafb6598417de1129ca Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 14 May 2013 14:31:02 +0100 Subject: KVM: get rid of $(addprefix ../../../virt/kvm/, ...) in Makefiles As requested by the KVM maintainers, remove the addprefix used to refer to the main KVM code from the arch code, and replace it with a KVM variable that does the same thing. Tested-by: Christian Borntraeger Cc: Paolo Bonzini Cc: Gleb Natapov Cc: Christoffer Dall Acked-by: Xiantao Zhang Cc: Tony Luck Cc: Fenghua Yu Cc: Alexander Graf Cc: Benjamin Herrenschmidt Cc: Christian Borntraeger Cc: Cornelia Huck Signed-off-by: Marc Zyngier Signed-off-by: Gleb Natapov diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index 9184a49..d99bee4 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -15,7 +15,7 @@ AFLAGS_init.o := -Wa,-march=armv7-a$(plus_virt) AFLAGS_interrupts.o := -Wa,-march=armv7-a$(plus_virt) KVM := ../../../virt/kvm -kvm-arm-y = $(addprefix $(KVM)/, kvm_main.o coalesced_mmio.o) +kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o obj-y += kvm-arm.o init.o interrupts.o obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile index 1a40537..18e45ec 100644 --- a/arch/ia64/kvm/Makefile +++ b/arch/ia64/kvm/Makefile @@ -47,12 +47,13 @@ FORCE : $(obj)/$(offsets-file) ccflags-y := -Ivirt/kvm -Iarch/ia64/kvm/ asflags-y := -Ivirt/kvm -Iarch/ia64/kvm/ +KVM := ../../../virt/kvm -common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ - coalesced_mmio.o irq_comm.o) +common-objs = $(KVM)/kvm_main.o $(KVM)/ioapic.o \ + $(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o ifeq ($(CONFIG_KVM_DEVICE_ASSIGNMENT),y) -common-objs += $(addprefix ../../../virt/kvm/, assigned-dev.o iommu.o) +common-objs += $(KVM)/assigned-dev.o $(KVM)/iommu.o endif kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 422de3f..008cd85 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -5,9 +5,10 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm +KVM := ../../../virt/kvm -common-objs-y = $(addprefix 
../../../virt/kvm/, kvm_main.o coalesced_mmio.o \ - eventfd.o) +common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ + $(KVM)/eventfd.o CFLAGS_44x_tlb.o := -I. CFLAGS_e500_mmu.o := -I. @@ -53,7 +54,7 @@ kvm-e500mc-objs := \ kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs) kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \ - ../../../virt/kvm/coalesced_mmio.o \ + $(KVM)/coalesced_mmio.o \ fpu.o \ book3s_paired_singles.o \ book3s_pr.o \ @@ -86,8 +87,8 @@ kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ book3s_xics.o kvm-book3s_64-module-objs := \ - ../../../virt/kvm/kvm_main.o \ - ../../../virt/kvm/eventfd.o \ + $(KVM)/kvm_main.o \ + $(KVM)/eventfd.o \ powerpc.o \ emulate.o \ book3s.o \ @@ -111,7 +112,7 @@ kvm-book3s_32-objs := \ kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs) kvm-objs-$(CONFIG_KVM_MPIC) += mpic.o -kvm-objs-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(addprefix ../../../virt/kvm/, irqchip.o) +kvm-objs-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o kvm-objs := $(kvm-objs-m) $(kvm-objs-y) diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 8fe9d65..40b4c64 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -6,7 +6,8 @@ # it under the terms of the GNU General Public License (version 2 only) # as published by the Free Software Foundation. -common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o eventfd.o) +KVM := ../../../virt/kvm +common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o ccflags-y := -Ivirt/kvm -Iarch/s390/kvm diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index d609e1d..bf4fb04 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -5,12 +5,13 @@ CFLAGS_x86.o := -I. CFLAGS_svm.o := -I. CFLAGS_vmx.o := -I. -kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ - coalesced_mmio.o irq_comm.o eventfd.o \ - irqchip.o) -kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(addprefix ../../../virt/kvm/, \ - assigned-dev.o iommu.o) -kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) +KVM := ../../../virt/kvm + +kvm-y += $(KVM)/kvm_main.o $(KVM)/ioapic.o \ + $(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \ + $(KVM)/eventfd.o $(KVM)/irqchip.o +kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(KVM)/assigned-dev.o $(KVM)/iommu.o +kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o cpuid.o pmu.o -- cgit v0.10.2 From eed3b1e55be07a057ee41a24f04abffeda7b885f Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 17 May 2013 14:41:31 +0200 Subject: s390/pgtable: fix ipte notify bit Don't use the same bit as user referenced. 
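For context, the clash being fixed, spelled out with the 31-bit defines as they stood before this patch (values taken from the hunk below; shown here only as an illustration):

	#define RCP_IN_BIT 0x00008000UL /* IPTE notify bit */
	#define KVM_UR_BIT 0x00008000UL /* KVM user referenced bit - the very same bit */

With both names pointing at the same PGSTE bit, arming IPTE notification was indistinguishable from marking a page user referenced; the patch moves the notify bit to 0x00002000UL (and 0x0000200000000000UL in the 64-bit variant).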
Signed-off-by: Christian Borntraeger Acked-by: Martin Schwidefsky Signed-off-by: Gleb Natapov diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 4105b82..0f0de30 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -306,7 +306,7 @@ extern unsigned long MODULES_END; #define RCP_HC_BIT 0x00200000UL #define RCP_GR_BIT 0x00040000UL #define RCP_GC_BIT 0x00020000UL -#define RCP_IN_BIT 0x00008000UL /* IPTE notify bit */ +#define RCP_IN_BIT 0x00002000UL /* IPTE notify bit */ /* User dirty / referenced bit for KVM's migration feature */ #define KVM_UR_BIT 0x00008000UL @@ -374,7 +374,7 @@ extern unsigned long MODULES_END; #define RCP_HC_BIT 0x0020000000000000UL #define RCP_GR_BIT 0x0004000000000000UL #define RCP_GC_BIT 0x0002000000000000UL -#define RCP_IN_BIT 0x0000800000000000UL /* IPTE notify bit */ +#define RCP_IN_BIT 0x0000200000000000UL /* IPTE notify bit */ /* User dirty / referenced bit for KVM's migration feature */ #define KVM_UR_BIT 0x0000800000000000UL -- cgit v0.10.2 From dfcf7dc64237dbe1acc2147ad3552f793003874b Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 17 May 2013 14:41:32 +0200 Subject: s390/kvm: fix psw rewinding in handle_skey The PSW can wrap if the guest has been running in the 24 bit or 31 bit addressing mode. Use __rewind_psw to find the correct address. Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger Signed-off-by: Gleb Natapov diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 6bbd7b5..ecc58a6 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -105,7 +105,8 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu) static int handle_skey(struct kvm_vcpu *vcpu) { vcpu->stat.instruction_storage_key++; - vcpu->arch.sie_block->gpsw.addr -= 4; + vcpu->arch.sie_block->gpsw.addr = + __rewind_psw(vcpu->arch.sie_block->gpsw, 4); VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation"); return 0; } -- cgit v0.10.2 From 0d0dafc1e48fd254c22f75738def870a7ffd2c3e Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 17 May 2013 14:41:33 +0200 Subject: s390/kvm: rename RCP_xxx defines to PGSTE_xxx The RCP byte is a part of the PGSTE value, the existing RCP_xxx names are inaccurate. As the defines describe bits and pieces of the PGSTE, the names should start with PGSTE_. The KVM_UR_BIT and KVM_UC_BIT are part of the PGSTE as well, give them better names as well. 
Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger Signed-off-by: Gleb Natapov diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 0f0de30..1fc68d9 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -299,18 +299,16 @@ extern unsigned long MODULES_END; #define _SEGMENT_ENTRY_EMPTY (_SEGMENT_ENTRY_INV) /* Page status table bits for virtualization */ -#define RCP_ACC_BITS 0xf0000000UL -#define RCP_FP_BIT 0x08000000UL -#define RCP_PCL_BIT 0x00800000UL -#define RCP_HR_BIT 0x00400000UL -#define RCP_HC_BIT 0x00200000UL -#define RCP_GR_BIT 0x00040000UL -#define RCP_GC_BIT 0x00020000UL -#define RCP_IN_BIT 0x00002000UL /* IPTE notify bit */ - -/* User dirty / referenced bit for KVM's migration feature */ -#define KVM_UR_BIT 0x00008000UL -#define KVM_UC_BIT 0x00004000UL +#define PGSTE_ACC_BITS 0xf0000000UL +#define PGSTE_FP_BIT 0x08000000UL +#define PGSTE_PCL_BIT 0x00800000UL +#define PGSTE_HR_BIT 0x00400000UL +#define PGSTE_HC_BIT 0x00200000UL +#define PGSTE_GR_BIT 0x00040000UL +#define PGSTE_GC_BIT 0x00020000UL +#define PGSTE_UR_BIT 0x00008000UL +#define PGSTE_UC_BIT 0x00004000UL /* user dirty (migration) */ +#define PGSTE_IN_BIT 0x00002000UL /* IPTE notify bit */ #else /* CONFIG_64BIT */ @@ -367,18 +365,16 @@ extern unsigned long MODULES_END; | _SEGMENT_ENTRY_SPLIT | _SEGMENT_ENTRY_CO) /* Page status table bits for virtualization */ -#define RCP_ACC_BITS 0xf000000000000000UL -#define RCP_FP_BIT 0x0800000000000000UL -#define RCP_PCL_BIT 0x0080000000000000UL -#define RCP_HR_BIT 0x0040000000000000UL -#define RCP_HC_BIT 0x0020000000000000UL -#define RCP_GR_BIT 0x0004000000000000UL -#define RCP_GC_BIT 0x0002000000000000UL -#define RCP_IN_BIT 0x0000200000000000UL /* IPTE notify bit */ - -/* User dirty / referenced bit for KVM's migration feature */ -#define KVM_UR_BIT 0x0000800000000000UL -#define KVM_UC_BIT 0x0000400000000000UL +#define PGSTE_ACC_BITS 0xf000000000000000UL +#define PGSTE_FP_BIT 0x0800000000000000UL +#define PGSTE_PCL_BIT 0x0080000000000000UL +#define PGSTE_HR_BIT 0x0040000000000000UL +#define PGSTE_HC_BIT 0x0020000000000000UL +#define PGSTE_GR_BIT 0x0004000000000000UL +#define PGSTE_GC_BIT 0x0002000000000000UL +#define PGSTE_UR_BIT 0x0000800000000000UL +#define PGSTE_UC_BIT 0x0000400000000000UL /* user dirty (migration) */ +#define PGSTE_IN_BIT 0x0000200000000000UL /* IPTE notify bit */ #endif /* CONFIG_64BIT */ @@ -618,8 +614,8 @@ static inline pgste_t pgste_get_lock(pte_t *ptep) asm( " lg %0,%2\n" "0: lgr %1,%0\n" - " nihh %0,0xff7f\n" /* clear RCP_PCL_BIT in old */ - " oihh %1,0x0080\n" /* set RCP_PCL_BIT in new */ + " nihh %0,0xff7f\n" /* clear PCL bit in old */ + " oihh %1,0x0080\n" /* set PCL bit in new */ " csg %0,%1,%2\n" " jl 0b\n" : "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE]) @@ -632,7 +628,7 @@ static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) { #ifdef CONFIG_PGSTE asm( - " nihh %1,0xff7f\n" /* clear RCP_PCL_BIT */ + " nihh %1,0xff7f\n" /* clear PCL bit */ " stg %1,%0\n" : "=Q" (ptep[PTRS_PER_PTE]) : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE]) : "cc"); @@ -657,14 +653,14 @@ static inline pgste_t pgste_update_all(pte_t *ptep, pgste_t pgste) else if (bits) page_reset_referenced(address); /* Transfer page changed & referenced bit to guest bits in pgste */ - pgste_val(pgste) |= bits << 48; /* RCP_GR_BIT & RCP_GC_BIT */ + pgste_val(pgste) |= bits << 48; /* GR bit & GC bit */ /* Get host changed & referenced bits from pgste */ - bits |= (pgste_val(pgste) & (RCP_HR_BIT 
| RCP_HC_BIT)) >> 52; + bits |= (pgste_val(pgste) & (PGSTE_HR_BIT | PGSTE_HC_BIT)) >> 52; /* Transfer page changed & referenced bit to kvm user bits */ - pgste_val(pgste) |= bits << 45; /* KVM_UR_BIT & KVM_UC_BIT */ + pgste_val(pgste) |= bits << 45; /* PGSTE_UR_BIT & PGSTE_UC_BIT */ /* Clear relevant host bits in pgste. */ - pgste_val(pgste) &= ~(RCP_HR_BIT | RCP_HC_BIT); - pgste_val(pgste) &= ~(RCP_ACC_BITS | RCP_FP_BIT); + pgste_val(pgste) &= ~(PGSTE_HR_BIT | PGSTE_HC_BIT); + pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT); /* Copy page access key and fetch protection bit to pgste */ pgste_val(pgste) |= (unsigned long) (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; @@ -685,15 +681,15 @@ static inline pgste_t pgste_update_young(pte_t *ptep, pgste_t pgste) /* Get referenced bit from storage key */ young = page_reset_referenced(pte_val(*ptep) & PAGE_MASK); if (young) - pgste_val(pgste) |= RCP_GR_BIT; + pgste_val(pgste) |= PGSTE_GR_BIT; /* Get host referenced bit from pgste */ - if (pgste_val(pgste) & RCP_HR_BIT) { - pgste_val(pgste) &= ~RCP_HR_BIT; + if (pgste_val(pgste) & PGSTE_HR_BIT) { + pgste_val(pgste) &= ~PGSTE_HR_BIT; young = 1; } /* Transfer referenced bit to kvm user bits and pte */ if (young) { - pgste_val(pgste) |= KVM_UR_BIT; + pgste_val(pgste) |= PGSTE_UR_BIT; pte_val(*ptep) |= _PAGE_SWR; } #endif @@ -712,7 +708,7 @@ static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry) okey = nkey = page_get_storage_key(address); nkey &= ~(_PAGE_ACC_BITS | _PAGE_FP_BIT); /* Set page access key and fetch protection bit from pgste */ - nkey |= (pgste_val(pgste) & (RCP_ACC_BITS | RCP_FP_BIT)) >> 56; + nkey |= (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; if (okey != nkey) page_set_storage_key(address, nkey, 0); #endif @@ -801,8 +797,8 @@ static inline pgste_t pgste_ipte_notify(struct mm_struct *mm, pte_t *ptep, pgste_t pgste) { #ifdef CONFIG_PGSTE - if (pgste_val(pgste) & RCP_IN_BIT) { - pgste_val(pgste) &= ~RCP_IN_BIT; + if (pgste_val(pgste) & PGSTE_IN_BIT) { + pgste_val(pgste) &= ~PGSTE_IN_BIT; gmap_do_ipte_notify(mm, addr, ptep); } #endif @@ -970,8 +966,8 @@ static inline int ptep_test_and_clear_user_dirty(struct mm_struct *mm, if (mm_has_pgste(mm)) { pgste = pgste_get_lock(ptep); pgste = pgste_update_all(ptep, pgste); - dirty = !!(pgste_val(pgste) & KVM_UC_BIT); - pgste_val(pgste) &= ~KVM_UC_BIT; + dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); + pgste_val(pgste) &= ~PGSTE_UC_BIT; pgste_set_unlock(ptep, pgste); return dirty; } @@ -990,8 +986,8 @@ static inline int ptep_test_and_clear_user_young(struct mm_struct *mm, if (mm_has_pgste(mm)) { pgste = pgste_get_lock(ptep); pgste = pgste_update_young(ptep, pgste); - young = !!(pgste_val(pgste) & KVM_UR_BIT); - pgste_val(pgste) &= ~KVM_UR_BIT; + young = !!(pgste_val(pgste) & PGSTE_UR_BIT); + pgste_val(pgste) &= ~PGSTE_UR_BIT; pgste_set_unlock(ptep, pgste); } return young; diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 7805ddc..5ca7568 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -690,7 +690,7 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len) entry = *ptep; if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_RO)) == 0) { pgste = pgste_get_lock(ptep); - pgste_val(pgste) |= RCP_IN_BIT; + pgste_val(pgste) |= PGSTE_IN_BIT; pgste_set_unlock(ptep, pgste); start += PAGE_SIZE; len -= PAGE_SIZE; -- cgit v0.10.2 From 95d38fd0bcf1996082f5f8762e6f1c849755e0c6 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 17 May 2013 14:41:34 +0200 
Subject: s390/kvm: Mark if a cpu is in SIE Lets track in a private bit if the sie control block is active. We want to track this as closely as possible, so we also have to instrument the interrupt and program check handler. Lets use the existing HANDLE_SIE_INTERCEPT macro. Signed-off-by: Christian Borntraeger Acked-by: Martin Schwidefsky Signed-off-by: Gleb Natapov diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 16bd5d1..962b92e 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -68,7 +68,10 @@ struct sca_block { struct kvm_s390_sie_block { atomic_t cpuflags; /* 0x0000 */ __u32 prefix; /* 0x0004 */ - __u8 reserved8[32]; /* 0x0008 */ + __u8 reserved08[4]; /* 0x0008 */ +#define PROG_IN_SIE (1<<0) + __u32 prog0c; /* 0x000c */ + __u8 reserved10[24]; /* 0x0010 */ __u64 cputm; /* 0x0028 */ __u64 ckc; /* 0x0030 */ __u64 epoch; /* 0x0038 */ diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 7a82f9f..6456bbe 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -7,6 +7,7 @@ #define ASM_OFFSETS_C #include +#include #include #include #include @@ -161,6 +162,7 @@ int main(void) DEFINE(__LC_PGM_TDB, offsetof(struct _lowcore, pgm_tdb)); DEFINE(__THREAD_trap_tdb, offsetof(struct task_struct, thread.trap_tdb)); DEFINE(__GMAP_ASCE, offsetof(struct gmap, asce)); + DEFINE(__SIE_PROG0C, offsetof(struct kvm_s390_sie_block, prog0c)); #endif /* CONFIG_32BIT */ return 0; } diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S index 4c17eec..c2e81b4 100644 --- a/arch/s390/kernel/entry64.S +++ b/arch/s390/kernel/entry64.S @@ -84,7 +84,7 @@ _TIF_EXIT_SIE = (_TIF_SIGPENDING | _TIF_NEED_RESCHED | _TIF_MCCK_PENDING) .macro HANDLE_SIE_INTERCEPT scratch,pgmcheck #if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE) tmhh %r8,0x0001 # interrupting from user ? - jnz .+42 + jnz .+52 lgr \scratch,%r9 slg \scratch,BASED(.Lsie_loop) clg \scratch,BASED(.Lsie_length) @@ -92,12 +92,14 @@ _TIF_EXIT_SIE = (_TIF_SIGPENDING | _TIF_NEED_RESCHED | _TIF_MCCK_PENDING) # Some program interrupts are suppressing (e.g. protection). # We must also check the instruction after SIE in that case. # do_protection_exception will rewind to rewind_pad - jh .+22 + jh .+32 .else - jhe .+22 + jhe .+32 .endif lg %r9,BASED(.Lsie_loop) LPP BASED(.Lhost_id) # set host id + lg %r14,__SF_EMPTY(%r15) # get control block pointer + ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE #endif .endm @@ -956,10 +958,12 @@ sie_loop: lctlg %c1,%c1,__GMAP_ASCE(%r14) # load primary asce sie_gmap: lg %r14,__SF_EMPTY(%r15) # get control block pointer + oi __SIE_PROG0C+3(%r14),1 # we are in SIE now LPP __SF_EMPTY(%r15) # set guest id sie 0(%r14) sie_done: LPP __SF_EMPTY+16(%r15) # set host id + ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE lg %r14,__LC_THREAD_INFO # pointer thread_info struct sie_exit: lctlg %c1,%c1,__LC_USER_ASCE # load primary asce -- cgit v0.10.2 From 49b99e1e0dedbd6cc93b2d2776b60fb7151ff3d7 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 17 May 2013 14:41:35 +0200 Subject: s390/kvm: Provide a way to prevent reentering SIE Lets provide functions to prevent KVM from reentering SIE and to kick cpus out of SIE. We cannot use the common kvm_vcpu_kick code, since we need to kick out guests in places that hold architecture specific locks (e.g. pgste lock) which might be necessary on the other cpus - so no waiting possible. 
So lets provide a bit in a private field of the sie control block that acts as a gate keeper, after we claimed we are in SIE. Please note that we do not reuse prog0c, since we want to access that bit without atomic ops. Signed-off-by: Christian Borntraeger Acked-by: Martin Schwidefsky Signed-off-by: Gleb Natapov diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 962b92e..9a809f9 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -71,7 +71,10 @@ struct kvm_s390_sie_block { __u8 reserved08[4]; /* 0x0008 */ #define PROG_IN_SIE (1<<0) __u32 prog0c; /* 0x000c */ - __u8 reserved10[24]; /* 0x0010 */ + __u8 reserved10[16]; /* 0x0010 */ +#define PROG_BLOCK_SIE 0x00000001 + atomic_t prog20; /* 0x0020 */ + __u8 reserved24[4]; /* 0x0024 */ __u64 cputm; /* 0x0028 */ __u64 ckc; /* 0x0030 */ __u64 epoch; /* 0x0038 */ diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 6456bbe..78db633 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -163,6 +163,7 @@ int main(void) DEFINE(__THREAD_trap_tdb, offsetof(struct task_struct, thread.trap_tdb)); DEFINE(__GMAP_ASCE, offsetof(struct gmap, asce)); DEFINE(__SIE_PROG0C, offsetof(struct kvm_s390_sie_block, prog0c)); + DEFINE(__SIE_PROG20, offsetof(struct kvm_s390_sie_block, prog20)); #endif /* CONFIG_32BIT */ return 0; } diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S index c2e81b4..c7daeef 100644 --- a/arch/s390/kernel/entry64.S +++ b/arch/s390/kernel/entry64.S @@ -958,7 +958,9 @@ sie_loop: lctlg %c1,%c1,__GMAP_ASCE(%r14) # load primary asce sie_gmap: lg %r14,__SF_EMPTY(%r15) # get control block pointer - oi __SIE_PROG0C+3(%r14),1 # we are in SIE now + oi __SIE_PROG0C+3(%r14),1 # we are going into SIE now + tm __SIE_PROG20+3(%r14),1 # last exit... + jnz sie_done LPP __SF_EMPTY(%r15) # set guest id sie 0(%r14) sie_done: diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index c1c7c68..ef4ef21 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -454,6 +454,34 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) return 0; } +void s390_vcpu_block(struct kvm_vcpu *vcpu) +{ + atomic_set_mask(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20); +} + +void s390_vcpu_unblock(struct kvm_vcpu *vcpu) +{ + atomic_clear_mask(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20); +} + +/* + * Kick a guest cpu out of SIE and wait until SIE is not running. + * If the CPU is not running (e.g. waiting as idle) the function will + * return immediately. 
*/ +void exit_sie(struct kvm_vcpu *vcpu) +{ + atomic_set_mask(CPUSTAT_STOP_INT, &vcpu->arch.sie_block->cpuflags); + while (vcpu->arch.sie_block->prog0c & PROG_IN_SIE) + cpu_relax(); +} + +/* Kick a guest cpu out of SIE and prevent SIE-reentry */ +void exit_sie_sync(struct kvm_vcpu *vcpu) +{ + s390_vcpu_block(vcpu); + exit_sie(vcpu); +} + int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { /* kvm common code refers to this, but never calls it */ diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index efc14f6..7a8abfd 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -133,6 +133,10 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); /* implemented in kvm-s390.c */ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr); +void s390_vcpu_block(struct kvm_vcpu *vcpu); +void s390_vcpu_unblock(struct kvm_vcpu *vcpu); +void exit_sie(struct kvm_vcpu *vcpu); +void exit_sie_sync(struct kvm_vcpu *vcpu); /* implemented in diag.c */ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); -- cgit v0.10.2 From 2c70fe4416d5f6d092b20ebf7d7654835e09c109 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 17 May 2013 14:41:36 +0200 Subject: s390/kvm: Kick guests out of sie if prefix page host pte is touched The guest prefix pages must be mapped writeable all the time while SIE is running, otherwise the guest might see random behaviour. (pinned at the pte level) Turns out that mlocking is not enough, the page table entry (not the page) might change or become r/o. This patch uses the gmap notifiers to kick guest cpus out of SIE. Signed-off-by: Christian Borntraeger Acked-by: Martin Schwidefsky Signed-off-by: Gleb Natapov diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 1fc68d9..1d0ad7d 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -739,6 +739,7 @@ struct gmap { struct mm_struct *mm; unsigned long *table; unsigned long asce; + void *private; struct list_head crst_list; }; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index b7d1b2e..f0b8be0 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -174,47 +174,12 @@ static int handle_stop(struct kvm_vcpu *vcpu) static int handle_validity(struct kvm_vcpu *vcpu) { - unsigned long vmaddr; int viwhy = vcpu->arch.sie_block->ipb >> 16; - int rc; vcpu->stat.exit_validity++; trace_kvm_s390_intercept_validity(vcpu, viwhy); - if (viwhy == 0x37) { - vmaddr = gmap_fault(vcpu->arch.sie_block->prefix, - vcpu->arch.gmap); - if (IS_ERR_VALUE(vmaddr)) { - rc = -EOPNOTSUPP; - goto out; - } - rc = fault_in_pages_writeable((char __user *) vmaddr, - PAGE_SIZE); - if (rc) { - /* user will receive sigsegv, exit to user */ - rc = -EOPNOTSUPP; - goto out; - } - vmaddr = gmap_fault(vcpu->arch.sie_block->prefix + PAGE_SIZE, - vcpu->arch.gmap); - if (IS_ERR_VALUE(vmaddr)) { - rc = -EOPNOTSUPP; - goto out; - } - rc = fault_in_pages_writeable((char __user *) vmaddr, - PAGE_SIZE); - if (rc) { - /* user will receive sigsegv, exit to user */ - rc = -EOPNOTSUPP; - goto out; - } - } else - rc = -EOPNOTSUPP; - -out: - if (rc) - VCPU_EVENT(vcpu, 2, "unhandled validity intercept code %d", - viwhy); - return rc; + WARN_ONCE(true, "kvm: unhandled validity intercept 0x%x\n", viwhy); + return -EOPNOTSUPP; } static int handle_instruction(struct kvm_vcpu *vcpu) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ef4ef21..08227c1 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -84,6 +84,7 @@ 
struct kvm_stats_debugfs_item debugfs_entries[] = { }; static unsigned long long *facilities; +static struct gmap_notifier gmap_notifier; /* Section: not file related */ int kvm_arch_hardware_enable(void *garbage) @@ -96,13 +97,18 @@ void kvm_arch_hardware_disable(void *garbage) { } +static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address); + int kvm_arch_hardware_setup(void) { + gmap_notifier.notifier_call = kvm_gmap_notifier; + gmap_register_ipte_notifier(&gmap_notifier); return 0; } void kvm_arch_hardware_unsetup(void) { + gmap_unregister_ipte_notifier(&gmap_notifier); } void kvm_arch_check_processor_compat(void *rtn) @@ -239,6 +245,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.gmap = gmap_alloc(current->mm); if (!kvm->arch.gmap) goto out_nogmap; + kvm->arch.gmap->private = kvm; } kvm->arch.css_support = 0; @@ -309,6 +316,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.gmap = gmap_alloc(current->mm); if (!vcpu->arch.gmap) return -ENOMEM; + vcpu->arch.gmap->private = vcpu->kvm; return 0; } @@ -482,6 +490,22 @@ void exit_sie_sync(struct kvm_vcpu *vcpu) exit_sie(vcpu); } +static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address) +{ + int i; + struct kvm *kvm = gmap->private; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + /* match against both prefix pages */ + if (vcpu->arch.sie_block->prefix == (address & ~0x1000UL)) { + VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address); + kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); + exit_sie_sync(vcpu); + } + } +} + int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { /* kvm common code refers to this, but never calls it */ @@ -634,6 +658,27 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return -EINVAL; /* not implemented yet */ } +static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) +{ + /* + * We use MMU_RELOAD just to re-arm the ipte notifier for the + * guest prefix page. gmap_ipte_notify will wait on the ptl lock. + * This ensures that the ipte instruction for this request has + * already finished. We might race against a second unmapper that + * wants to set the blocking bit. Lets just retry the request loop. + */ + while (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) { + int rc; + rc = gmap_ipte_notify(vcpu->arch.gmap, + vcpu->arch.sie_block->prefix, + PAGE_SIZE * 2); + if (rc) + return rc; + s390_vcpu_unblock(vcpu); + } + return 0; +} + static int __vcpu_run(struct kvm_vcpu *vcpu) { int rc; @@ -649,6 +694,10 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) if (!kvm_is_ucontrol(vcpu->kvm)) kvm_s390_deliver_pending_interrupts(vcpu); + rc = kvm_s390_handle_requests(vcpu); + if (rc) + return rc; + vcpu->arch.sie_block->icptcode = 0; preempt_disable(); kvm_guest_enter(); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 7a8abfd..269b523 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -63,6 +63,7 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix) { vcpu->arch.sie_block->prefix = prefix & 0x7fffe000u; vcpu->arch.sie_block->ihcpu = 0xffff; + kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); } static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu) -- cgit v0.10.2 From 7c470539c95630c1f2a10f109e96f249730b75eb Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 17 May 2013 14:41:37 +0200 Subject: s390/kvm: avoid automatic sie reentry Do not automatically restart the sie instruction in entry64.S after an interrupt, return to the caller with a reason code instead. 
That allows to deal with RCU and other conditions in C code. Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger Signed-off-by: Gleb Natapov diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S index c7daeef..51d99ac 100644 --- a/arch/s390/kernel/entry64.S +++ b/arch/s390/kernel/entry64.S @@ -47,7 +47,6 @@ _TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ _TIF_MCCK_PENDING) _TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \ _TIF_SYSCALL_TRACEPOINT) -_TIF_EXIT_SIE = (_TIF_SIGPENDING | _TIF_NEED_RESCHED | _TIF_MCCK_PENDING) #define BASED(name) name-system_call(%r13) @@ -81,25 +80,27 @@ _TIF_EXIT_SIE = (_TIF_SIGPENDING | _TIF_NEED_RESCHED | _TIF_MCCK_PENDING) #endif .endm - .macro HANDLE_SIE_INTERCEPT scratch,pgmcheck + .macro HANDLE_SIE_INTERCEPT scratch,reason #if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE) tmhh %r8,0x0001 # interrupting from user ? - jnz .+52 + jnz .+62 lgr \scratch,%r9 - slg \scratch,BASED(.Lsie_loop) - clg \scratch,BASED(.Lsie_length) - .if \pgmcheck + slg \scratch,BASED(.Lsie_critical) + clg \scratch,BASED(.Lsie_critical_length) + .if \reason==1 # Some program interrupts are suppressing (e.g. protection). # We must also check the instruction after SIE in that case. # do_protection_exception will rewind to rewind_pad - jh .+32 + jh .+42 .else - jhe .+32 + jhe .+42 .endif - lg %r9,BASED(.Lsie_loop) - LPP BASED(.Lhost_id) # set host id - lg %r14,__SF_EMPTY(%r15) # get control block pointer + lg %r14,__SF_EMPTY(%r15) # get control block pointer + LPP __SF_EMPTY+16(%r15) # set host id ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE + lctlg %c1,%c1,__LC_USER_ASCE # load primary asce + larl %r9,sie_exit # skip forward to sie_exit + mvi __SF_EMPTY+31(%r15),\reason # set exit reason #endif .endm @@ -452,7 +453,7 @@ ENTRY(io_int_handler) lg %r12,__LC_THREAD_INFO larl %r13,system_call lmg %r8,%r9,__LC_IO_OLD_PSW - HANDLE_SIE_INTERCEPT %r14,0 + HANDLE_SIE_INTERCEPT %r14,2 SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_STACK,STACK_SHIFT tmhh %r8,0x0001 # interrupting from user? jz io_skip @@ -597,7 +598,7 @@ ENTRY(ext_int_handler) lg %r12,__LC_THREAD_INFO larl %r13,system_call lmg %r8,%r9,__LC_EXT_OLD_PSW - HANDLE_SIE_INTERCEPT %r14,0 + HANDLE_SIE_INTERCEPT %r14,3 SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_STACK,STACK_SHIFT tmhh %r8,0x0001 # interrupting from user ? jz ext_skip @@ -645,7 +646,7 @@ ENTRY(mcck_int_handler) lg %r12,__LC_THREAD_INFO larl %r13,system_call lmg %r8,%r9,__LC_MCK_OLD_PSW - HANDLE_SIE_INTERCEPT %r14,0 + HANDLE_SIE_INTERCEPT %r14,4 tm __LC_MCCK_CODE,0x80 # system damage? jo mcck_panic # yes -> rest of mcck code invalid lghi %r14,__LC_CPU_TIMER_SAVE_AREA @@ -939,19 +940,8 @@ ENTRY(sie64a) stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers stg %r2,__SF_EMPTY(%r15) # save control block pointer stg %r3,__SF_EMPTY+8(%r15) # save guest register save area - xc __SF_EMPTY+16(8,%r15),__SF_EMPTY+16(%r15) # host id == 0 + xc __SF_EMPTY+16(16,%r15),__SF_EMPTY+16(%r15) # host id & reason lmg %r0,%r13,0(%r3) # load guest gprs 0-13 -# some program checks are suppressing. C code (e.g. do_protection_exception) -# will rewind the PSW by the ILC, which is 4 bytes in case of SIE. Other -# instructions in the sie_loop should not cause program interrupts. So -# lets use a nop (47 00 00 00) as a landing pad. 
-# See also HANDLE_SIE_INTERCEPT -rewind_pad: - nop 0 -sie_loop: - lg %r14,__LC_THREAD_INFO # pointer thread_info struct - tm __TI_flags+7(%r14),_TIF_EXIT_SIE - jnz sie_exit lg %r14,__LC_GMAP # get gmap pointer ltgr %r14,%r14 jz sie_gmap @@ -966,33 +956,33 @@ sie_gmap: sie_done: LPP __SF_EMPTY+16(%r15) # set host id ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE - lg %r14,__LC_THREAD_INFO # pointer thread_info struct -sie_exit: lctlg %c1,%c1,__LC_USER_ASCE # load primary asce +# some program checks are suppressing. C code (e.g. do_protection_exception) +# will rewind the PSW by the ILC, which is 4 bytes in case of SIE. Other +# instructions beween sie64a and sie_done should not cause program +# interrupts. So lets use a nop (47 00 00 00) as a landing pad. +# See also HANDLE_SIE_INTERCEPT +rewind_pad: + nop 0 +sie_exit: lg %r14,__SF_EMPTY+8(%r15) # load guest register save area stmg %r0,%r13,0(%r14) # save guest gprs 0-13 lmg %r6,%r14,__SF_GPRS(%r15) # restore kernel registers - lghi %r2,0 + lg %r2,__SF_EMPTY+24(%r15) # return exit reason code br %r14 sie_fault: - lctlg %c1,%c1,__LC_USER_ASCE # load primary asce - lg %r14,__LC_THREAD_INFO # pointer thread_info struct - lg %r14,__SF_EMPTY+8(%r15) # load guest register save area - stmg %r0,%r13,0(%r14) # save guest gprs 0-13 - lmg %r6,%r14,__SF_GPRS(%r15) # restore kernel registers - lghi %r2,-EFAULT - br %r14 + lghi %r14,-EFAULT + stg %r14,__SF_EMPTY+24(%r15) # set exit reason code + j sie_exit .align 8 -.Lsie_loop: - .quad sie_loop -.Lsie_length: - .quad sie_done - sie_loop -.Lhost_id: - .quad 0 +.Lsie_critical: + .quad sie_gmap +.Lsie_critical_length: + .quad sie_done - sie_gmap EX_TABLE(rewind_pad,sie_fault) - EX_TABLE(sie_loop,sie_fault) + EX_TABLE(sie_exit,sie_fault) #endif .section .rodata, "a" diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 08227c1..93444c4 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -707,7 +707,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) trace_kvm_s390_sie_enter(vcpu, atomic_read(&vcpu->arch.sie_block->cpuflags)); rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); - if (rc) { + if (rc > 0) + rc = 0; + if (rc < 0) { if (kvm_is_ucontrol(vcpu->kvm)) { rc = SIE_INTERCEPT_UCONTROL; } else { -- cgit v0.10.2 From f8b5ff2cff232df052955ef975f7219e1faa217f Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 17 May 2013 14:41:38 +0200 Subject: s390: fix gmap_ipte_notifier vs. software dirty pages On heavy paging load some guest cpus started to loop in gmap_ipte_notify. This was visible as stalled cpus inside the guest. The gmap_ipte_notifier tries to map a user page and then made sure that the pte is valid and writable. Turns out that with the software change bit tracking the pte can become read-only (and only software writable) if the page is clean. Since we loop in this code, the page would stay clean and, therefore, be never writable again. Let us just use fixup_user_fault, that guarantees to call handle_mm_fault. 
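For readers unfamiliar with the helper, a minimal illustrative sketch of the usual fixup_user_fault() calling pattern (the wrapper name is made up; fixup_user_fault() expects mmap_sem to be held, which is why the futex code wraps it exactly like this):

	/* Illustrative only - not part of the patch. */
	static int make_page_writable(struct mm_struct *mm, unsigned long addr)
	{
		int rc;

		down_read(&mm->mmap_sem);
		rc = fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE);
		up_read(&mm->mmap_sem);

		return rc ? -EFAULT : 0;
	}

Unlike the get_user_pages() call it replaces, this always goes through handle_mm_fault() for the write, so a clean, software-dirty-tracked page is made genuinely writable instead of coming back read-only on every retry.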
Signed-off-by: Christian Borntraeger Acked-by: Martin Schwidefsky Signed-off-by: Gleb Natapov diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 5ca7568..1e0c438 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -677,8 +677,7 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len) break; } /* Get the page mapped */ - if (get_user_pages(current, gmap->mm, addr, 1, 1, 0, - NULL, NULL) != 1) { + if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) { rc = -EFAULT; break; } -- cgit v0.10.2 From fb32b1eda29f2040148b0e172f9cbbd2f07697e4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sat, 9 Feb 2013 11:31:44 +0200 Subject: KVM: x86 emulator: add support for writing back the source operand Some instructions write back the source operand, not just the destination. Add support for doing this via the decode flags. Gleb: add BUG_ON() to prevent source to be memory operand. Signed-off-by: Avi Kivity Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8db0010..a4c266e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -155,6 +155,7 @@ #define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ #define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ #define NoWrite ((u64)1 << 45) /* No writeback */ +#define SrcWrite ((u64)1 << 46) /* Write back src operand */ #define X2(x...) x, x #define X3(x...) X2(x), x @@ -1723,45 +1724,42 @@ static void write_register_operand(struct operand *op) } } -static int writeback(struct x86_emulate_ctxt *ctxt) +static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) { int rc; - if (ctxt->d & NoWrite) - return X86EMUL_CONTINUE; - - switch (ctxt->dst.type) { + switch (op->type) { case OP_REG: - write_register_operand(&ctxt->dst); + write_register_operand(op); break; case OP_MEM: if (ctxt->lock_prefix) rc = segmented_cmpxchg(ctxt, - ctxt->dst.addr.mem, - &ctxt->dst.orig_val, - &ctxt->dst.val, - ctxt->dst.bytes); + op->addr.mem, + &op->orig_val, + &op->val, + op->bytes); else rc = segmented_write(ctxt, - ctxt->dst.addr.mem, - &ctxt->dst.val, - ctxt->dst.bytes); + op->addr.mem, + &op->val, + op->bytes); if (rc != X86EMUL_CONTINUE) return rc; break; case OP_MEM_STR: rc = segmented_write(ctxt, - ctxt->dst.addr.mem, - ctxt->dst.data, - ctxt->dst.bytes * ctxt->dst.count); + op->addr.mem, + op->data, + op->bytes * op->count); if (rc != X86EMUL_CONTINUE) return rc; break; case OP_XMM: - write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); + write_sse_reg(ctxt, &op->vec_val, op->addr.xmm); break; case OP_MM: - write_mmx_reg(ctxt, &ctxt->dst.mm_val, ctxt->dst.addr.mm); + write_mmx_reg(ctxt, &op->mm_val, op->addr.mm); break; case OP_NONE: /* no writeback */ @@ -4769,9 +4767,17 @@ special_insn: goto done; writeback: - rc = writeback(ctxt); - if (rc != X86EMUL_CONTINUE) - goto done; + if (!(ctxt->d & NoWrite)) { + rc = writeback(ctxt, &ctxt->dst); + if (rc != X86EMUL_CONTINUE) + goto done; + } + if (ctxt->d & SrcWrite) { + BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR); + rc = writeback(ctxt, &ctxt->src); + if (rc != X86EMUL_CONTINUE) + goto done; + } /* * restore dst type in case the decoding will be reused -- cgit v0.10.2 From 820207c8fc508be8f104d4d6b19c8f695fe0d5f3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sat, 9 Feb 2013 11:31:45 +0200 Subject: KVM: x86 emulator: decode extended accumulator explicity Single-operand MUL and DIV access an extended accumulator: AX for byte instructions, and DX:AX, EDX:EAX, or 
RDX:RAX for larger-sized instructions. Add support for fetching the extended accumulator. In order not to change things too much, RDX is loaded into Src2, which is already loaded by fastop(). This avoids increasing register pressure on i386. Gleb: disable src writeback for ByteOp div/mul. Signed-off-by: Avi Kivity Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a4c266e..36cb786 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -61,6 +61,8 @@ #define OpMem8 26ull /* 8-bit zero extended memory operand */ #define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */ #define OpXLat 28ull /* memory at BX/EBX/RBX + zero-extended AL */ +#define OpAccLo 29ull /* Low part of extended acc (AX/AX/EAX/RAX) */ +#define OpAccHi 30ull /* High part of extended acc (-/DX/EDX/RDX) */ #define OpBits 5 /* Width of operand field */ #define OpMask ((1ull << OpBits) - 1) @@ -86,6 +88,7 @@ #define DstMem64 (OpMem64 << DstShift) #define DstImmUByte (OpImmUByte << DstShift) #define DstDX (OpDX << DstShift) +#define DstAccLo (OpAccLo << DstShift) #define DstMask (OpMask << DstShift) /* Source operand type. */ #define SrcShift 6 @@ -108,6 +111,7 @@ #define SrcImm64 (OpImm64 << SrcShift) #define SrcDX (OpDX << SrcShift) #define SrcMem8 (OpMem8 << SrcShift) +#define SrcAccHi (OpAccHi << SrcShift) #define SrcMask (OpMask << SrcShift) #define BitOp (1<<11) #define MemAbs (1<<12) /* Memory operand is absolute displacement */ @@ -157,6 +161,8 @@ #define NoWrite ((u64)1 << 45) /* No writeback */ #define SrcWrite ((u64)1 << 46) /* Write back src operand */ +#define DstXacc (DstAccLo | SrcAccHi | SrcWrite) + #define X2(x...) x, x #define X3(x...) X2(x), x #define X4(x...) X2(x), X2(x) @@ -4166,6 +4172,24 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, fetch_register_operand(op); op->orig_val = op->val; break; + case OpAccLo: + op->type = OP_REG; + op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes; + op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); + fetch_register_operand(op); + op->orig_val = op->val; + break; + case OpAccHi: + if (ctxt->d & ByteOp) { + op->type = OP_NONE; + break; + } + op->type = OP_REG; + op->bytes = ctxt->op_bytes; + op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX); + fetch_register_operand(op); + op->orig_val = op->val; + break; case OpDI: op->type = OP_MEM; op->bytes = (ctxt->d & ByteOp) ? 
1 : ctxt->op_bytes; -- cgit v0.10.2 From ab2c5ce66661e07c315a53bef9b507cf766d7905 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sat, 9 Feb 2013 11:31:46 +0200 Subject: KVM: x86 emulator: switch MUL/DIV to DstXacc Signed-off-by: Avi Kivity Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 36cb786..b9fb89b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -142,6 +142,7 @@ /* Source 2 operand type */ #define Src2Shift (31) #define Src2None (OpNone << Src2Shift) +#define Src2Mem (OpMem << Src2Shift) #define Src2CL (OpCL << Src2Shift) #define Src2ImmByte (OpImmByte << Src2Shift) #define Src2One (OpOne << Src2Shift) @@ -548,8 +549,8 @@ FOP_END; #define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ do { \ unsigned long _tmp; \ - ulong *rax = reg_rmw((ctxt), VCPU_REGS_RAX); \ - ulong *rdx = reg_rmw((ctxt), VCPU_REGS_RDX); \ + ulong *rax = &ctxt->dst.val; \ + ulong *rdx = &ctxt->src.val; \ \ __asm__ __volatile__ ( \ _PRE_EFLAGS("0", "5", "1") \ @@ -564,7 +565,7 @@ FOP_END; _ASM_EXTABLE(1b, 3b) \ : "=m" ((ctxt)->eflags), "=&r" (_tmp), \ "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \ - : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val)); \ + : "i" (EFLAGS_MASK), "m" ((ctxt)->src2.val)); \ } while (0) /* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ @@ -3735,10 +3736,10 @@ static const struct opcode group3[] = { F(DstMem | SrcImm | NoWrite, em_test), F(DstMem | SrcNone | Lock, em_not), F(DstMem | SrcNone | Lock, em_neg), - I(SrcMem, em_mul_ex), - I(SrcMem, em_imul_ex), - I(SrcMem, em_div_ex), - I(SrcMem, em_idiv_ex), + I(DstXacc | Src2Mem, em_mul_ex), + I(DstXacc | Src2Mem, em_imul_ex), + I(DstXacc | Src2Mem, em_div_ex), + I(DstXacc | Src2Mem, em_idiv_ex), }; static const struct opcode group4[] = { -- cgit v0.10.2 From 017da7b604cf1982c35162c29895fba842282e11 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sat, 9 Feb 2013 11:31:47 +0200 Subject: KVM: x86 emulator: Switch fastop src operand to RDX This makes OpAccHi useful. 
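As a concrete illustration of what the OpAccLo/OpAccHi pair introduced two patches earlier carries for a 32-bit MUL, here is a plain C model of the architectural result (not emulator code; EDX:EAX = EAX * src2):

	#include <stdint.h>

	/* dst (OpAccLo) ends up with the low half, src (OpAccHi) with the high half */
	static void mul32_extended_acc(uint32_t *eax, uint32_t *edx, uint32_t src2)
	{
		uint64_t prod = (uint64_t)*eax * src2;

		*eax = (uint32_t)prod;
		*edx = (uint32_t)(prod >> 32);
	}

With the register convention this patch switches to (dst in %rax, src in %rdx, see the hunk below), those two halves travel in exactly those registers, which is why moving the fastop src operand to RDX makes OpAccHi usable.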
Signed-off-by: Avi Kivity Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b9fb89b..b899d24 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -179,8 +179,8 @@ /* * fastop functions have a special calling convention: * - * dst: [rdx]:rax (in/out) - * src: rbx (in/out) + * dst: rax (in/out) + * src: rdx (in/out) * src2: rcx (in) * flags: rflags (in/out) * @@ -485,19 +485,19 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); #define FASTOP2(op) \ FOP_START(op) \ - FOP2E(op##b, al, bl) \ - FOP2E(op##w, ax, bx) \ - FOP2E(op##l, eax, ebx) \ - ON64(FOP2E(op##q, rax, rbx)) \ + FOP2E(op##b, al, dl) \ + FOP2E(op##w, ax, dx) \ + FOP2E(op##l, eax, edx) \ + ON64(FOP2E(op##q, rax, rdx)) \ FOP_END /* 2 operand, word only */ #define FASTOP2W(op) \ FOP_START(op) \ FOPNOP() \ - FOP2E(op##w, ax, bx) \ - FOP2E(op##l, eax, ebx) \ - ON64(FOP2E(op##q, rax, rbx)) \ + FOP2E(op##w, ax, dx) \ + FOP2E(op##l, eax, edx) \ + ON64(FOP2E(op##q, rax, rdx)) \ FOP_END /* 2 operand, src is CL */ @@ -516,9 +516,9 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); #define FASTOP3WCL(op) \ FOP_START(op) \ FOPNOP() \ - FOP3E(op##w, ax, bx, cl) \ - FOP3E(op##l, eax, ebx, cl) \ - ON64(FOP3E(op##q, rax, rbx, cl)) \ + FOP3E(op##w, ax, dx, cl) \ + FOP3E(op##l, eax, edx, cl) \ + ON64(FOP3E(op##q, rax, rdx, cl)) \ FOP_END /* Special case for SETcc - 1 instruction per cc */ @@ -4574,7 +4574,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" - : "+a"(ctxt->dst.val), "+b"(ctxt->src.val), [flags]"+D"(flags) + : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags) : "c"(ctxt->src2.val), [fastop]"S"(fop)); ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); return X86EMUL_CONTINUE; -- cgit v0.10.2 From b9fa409b00a1ed2a372758fd09f3f5df70f8d59a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sat, 9 Feb 2013 11:31:48 +0200 Subject: KVM: x86 emulator: convert single-operand MUL/IMUL to fastop Signed-off-by: Avi Kivity Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b899d24..3a3542a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -480,6 +480,15 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); ON64(FOP1E(op##q, rax)) \ FOP_END +/* 1-operand, using src2 (for MUL/DIV r/m) */ +#define FASTOP1SRC2(op, name) \ + FOP_START(name) \ + FOP1E(op, cl) \ + FOP1E(op, cx) \ + FOP1E(op, ecx) \ + ON64(FOP1E(op, rcx)) \ + FOP_END + #define FOP2E(op, dst, src) \ FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET @@ -996,6 +1005,9 @@ FASTOP2(xor); FASTOP2(cmp); FASTOP2(test); +FASTOP1SRC2(mul, mul_ex); +FASTOP1SRC2(imul, imul_ex); + FASTOP3WCL(shld); FASTOP3WCL(shrd); @@ -2119,22 +2131,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_mul_ex(struct x86_emulate_ctxt *ctxt) -{ - u8 ex = 0; - - emulate_1op_rax_rdx(ctxt, "mul", ex); - return X86EMUL_CONTINUE; -} - -static int em_imul_ex(struct x86_emulate_ctxt *ctxt) -{ - u8 ex = 0; - - emulate_1op_rax_rdx(ctxt, "imul", ex); - return X86EMUL_CONTINUE; -} - static int em_div_ex(struct x86_emulate_ctxt *ctxt) { u8 de = 0; @@ -3736,8 +3732,8 @@ static const struct opcode group3[] = { F(DstMem | SrcImm | NoWrite, em_test), 
F(DstMem | SrcNone | Lock, em_not), F(DstMem | SrcNone | Lock, em_neg), - I(DstXacc | Src2Mem, em_mul_ex), - I(DstXacc | Src2Mem, em_imul_ex), + F(DstXacc | Src2Mem, em_mul_ex), + F(DstXacc | Src2Mem, em_imul_ex), I(DstXacc | Src2Mem, em_div_ex), I(DstXacc | Src2Mem, em_idiv_ex), }; @@ -4572,7 +4568,8 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) { ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; - fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; + if (!(ctxt->d & ByteOp)) + fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags) : "c"(ctxt->src2.val), [fastop]"S"(fop)); -- cgit v0.10.2 From b8c0b6ae498fe5c3f29966bd2a2d9882911b887b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sat, 9 Feb 2013 11:31:49 +0200 Subject: KVM: x86 emulator: convert DIV/IDIV to fastop Since DIV and IDIV can generate exceptions, we need an additional output parameter indicating whether an execption has occured. To avoid increasing register pressure on i386, we use %rsi, which is already allocated for the fastop code pointer. Gleb: added comment about fop usage as exception indication. Signed-off-by: Avi Kivity Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 3a3542a..8404dc3 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -183,6 +183,7 @@ * src: rdx (in/out) * src2: rcx (in) * flags: rflags (in/out) + * ex: rsi (in:fastop pointer, out:zero if exception) * * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for * different operand sizes can be reached by calculation, rather than a jump @@ -470,7 +471,10 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); #define FOPNOP() FOP_ALIGN FOP_RET #define FOP1E(op, dst) \ - FOP_ALIGN #op " %" #dst " \n\t" FOP_RET + FOP_ALIGN "10: " #op " %" #dst " \n\t" FOP_RET + +#define FOP1EEX(op, dst) \ + FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception) #define FASTOP1(op) \ FOP_START(op) \ @@ -489,6 +493,15 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); ON64(FOP1E(op, rcx)) \ FOP_END +/* 1-operand, using src2 (for MUL/DIV r/m), with exceptions */ +#define FASTOP1SRC2EX(op, name) \ + FOP_START(name) \ + FOP1EEX(op, cl) \ + FOP1EEX(op, cx) \ + FOP1EEX(op, ecx) \ + ON64(FOP1EEX(op, rcx)) \ + FOP_END + #define FOP2E(op, dst, src) \ FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET @@ -533,6 +546,9 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); /* Special case for SETcc - 1 instruction per cc */ #define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t" +asm(".global kvm_fastop_exception \n" + "kvm_fastop_exception: xor %esi, %esi; ret"); + FOP_START(setcc) FOP_SETCC(seto) FOP_SETCC(setno) @@ -1007,6 +1023,8 @@ FASTOP2(test); FASTOP1SRC2(mul, mul_ex); FASTOP1SRC2(imul, imul_ex); +FASTOP1SRC2EX(div, div_ex); +FASTOP1SRC2EX(idiv, idiv_ex); FASTOP3WCL(shld); FASTOP3WCL(shrd); @@ -2131,26 +2149,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_div_ex(struct x86_emulate_ctxt *ctxt) -{ - u8 de = 0; - - emulate_1op_rax_rdx(ctxt, "div", de); - if (de) - return emulate_de(ctxt); - return X86EMUL_CONTINUE; -} - -static int em_idiv_ex(struct x86_emulate_ctxt *ctxt) -{ - u8 de = 0; - - emulate_1op_rax_rdx(ctxt, "idiv", de); - if (de) - 
return emulate_de(ctxt); - return X86EMUL_CONTINUE; -} - static int em_grp45(struct x86_emulate_ctxt *ctxt) { int rc = X86EMUL_CONTINUE; @@ -3734,8 +3732,8 @@ static const struct opcode group3[] = { F(DstMem | SrcNone | Lock, em_neg), F(DstXacc | Src2Mem, em_mul_ex), F(DstXacc | Src2Mem, em_imul_ex), - I(DstXacc | Src2Mem, em_div_ex), - I(DstXacc | Src2Mem, em_idiv_ex), + F(DstXacc | Src2Mem, em_div_ex), + F(DstXacc | Src2Mem, em_idiv_ex), }; static const struct opcode group4[] = { @@ -4571,9 +4569,12 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) if (!(ctxt->d & ByteOp)) fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" - : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags) - : "c"(ctxt->src2.val), [fastop]"S"(fop)); + : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), + [fastop]"+S"(fop) + : "c"(ctxt->src2.val)); ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); + if (!fop) /* exception is returned in fop variable */ + return emulate_de(ctxt); return X86EMUL_CONTINUE; } -- cgit v0.10.2 From 203831e8e4bd9ab3b2f9f86c73c74fe912e06ac5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sat, 9 Feb 2013 11:31:50 +0200 Subject: KVM: x86 emulator: drop unused old-style inline emulation Signed-off-by: Avi Kivity Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8404dc3..d71aac0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -285,174 +285,17 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) } /* - * Instruction emulation: - * Most instructions are emulated directly via a fragment of inline assembly - * code. This allows us to save/restore EFLAGS and thus very easily pick up - * any modified flags. - */ - -#if defined(CONFIG_X86_64) -#define _LO32 "k" /* force 32-bit operand */ -#define _STK "%%rsp" /* stack pointer */ -#elif defined(__i386__) -#define _LO32 "" /* force 32-bit operand */ -#define _STK "%%esp" /* stack pointer */ -#endif - -/* * These EFLAGS bits are restored from saved value during emulation, and * any changes are written back to the saved value after emulation. */ #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) -/* Before executing instruction: restore necessary bits in EFLAGS. */ -#define _PRE_EFLAGS(_sav, _msk, _tmp) \ - /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ - "movl %"_sav",%"_LO32 _tmp"; " \ - "push %"_tmp"; " \ - "push %"_tmp"; " \ - "movl %"_msk",%"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "pushf; " \ - "notl %"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \ - "pop %"_tmp"; " \ - "orl %"_LO32 _tmp",("_STK"); " \ - "popf; " \ - "pop %"_sav"; " - -/* After executing instruction: write-back necessary bits in EFLAGS. 
*/ -#define _POST_EFLAGS(_sav, _msk, _tmp) \ - /* _sav |= EFLAGS & _msk; */ \ - "pushf; " \ - "pop %"_tmp"; " \ - "andl %"_msk",%"_LO32 _tmp"; " \ - "orl %"_LO32 _tmp",%"_sav"; " - #ifdef CONFIG_X86_64 #define ON64(x) x #else #define ON64(x) #endif -#define ____emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype) \ - do { \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op _suffix " %"_x"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" ((ctxt)->eflags), \ - "+q" (*(_dsttype*)&(ctxt)->dst.val), \ - "=&r" (_tmp) \ - : _y ((ctxt)->src.val), "i" (EFLAGS_MASK)); \ - } while (0) - - -/* Raw emulation: instruction has two explicit operands. */ -#define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - \ - switch ((ctxt)->dst.bytes) { \ - case 2: \ - ____emulate_2op(ctxt,_op,_wx,_wy,"w",u16); \ - break; \ - case 4: \ - ____emulate_2op(ctxt,_op,_lx,_ly,"l",u32); \ - break; \ - case 8: \ - ON64(____emulate_2op(ctxt,_op,_qx,_qy,"q",u64)); \ - break; \ - } \ - } while (0) - -#define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - switch ((ctxt)->dst.bytes) { \ - case 1: \ - ____emulate_2op(ctxt,_op,_bx,_by,"b",u8); \ - break; \ - default: \ - __emulate_2op_nobyte(ctxt, _op, \ - _wx, _wy, _lx, _ly, _qx, _qy); \ - break; \ - } \ - } while (0) - -/* Source operand is byte-sized and may be restricted to just %cl. */ -#define emulate_2op_SrcB(ctxt, _op) \ - __emulate_2op(ctxt, _op, "b", "c", "b", "c", "b", "c", "b", "c") - -/* Source operand is byte, word, long or quad sized. */ -#define emulate_2op_SrcV(ctxt, _op) \ - __emulate_2op(ctxt, _op, "b", "q", "w", "r", _LO32, "r", "", "r") - -/* Source operand is word, long or quad sized. */ -#define emulate_2op_SrcV_nobyte(ctxt, _op) \ - __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r") - -/* Instruction has three operands and one operand is stored in ECX register */ -#define __emulate_2op_cl(ctxt, _op, _suffix, _type) \ - do { \ - unsigned long _tmp; \ - _type _clv = (ctxt)->src2.val; \ - _type _srcv = (ctxt)->src.val; \ - _type _dstv = (ctxt)->dst.val; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "5", "2") \ - _op _suffix " %4,%1 \n" \ - _POST_EFLAGS("0", "5", "2") \ - : "=m" ((ctxt)->eflags), "+r" (_dstv), "=&r" (_tmp) \ - : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ - ); \ - \ - (ctxt)->src2.val = (unsigned long) _clv; \ - (ctxt)->src2.val = (unsigned long) _srcv; \ - (ctxt)->dst.val = (unsigned long) _dstv; \ - } while (0) - -#define emulate_2op_cl(ctxt, _op) \ - do { \ - switch ((ctxt)->dst.bytes) { \ - case 2: \ - __emulate_2op_cl(ctxt, _op, "w", u16); \ - break; \ - case 4: \ - __emulate_2op_cl(ctxt, _op, "l", u32); \ - break; \ - case 8: \ - ON64(__emulate_2op_cl(ctxt, _op, "q", ulong)); \ - break; \ - } \ - } while (0) - -#define __emulate_1op(ctxt, _op, _suffix) \ - do { \ - unsigned long _tmp; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op _suffix " %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" ((ctxt)->eflags), "+m" ((ctxt)->dst.val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - } while (0) - -/* Instruction has only one explicit operand (no source operand). 
*/ -#define emulate_1op(ctxt, _op) \ - do { \ - switch ((ctxt)->dst.bytes) { \ - case 1: __emulate_1op(ctxt, _op, "b"); break; \ - case 2: __emulate_1op(ctxt, _op, "w"); break; \ - case 4: __emulate_1op(ctxt, _op, "l"); break; \ - case 8: ON64(__emulate_1op(ctxt, _op, "q")); break; \ - } \ - } while (0) - static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); #define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t" @@ -571,47 +414,6 @@ FOP_END; FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET FOP_END; -#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ - do { \ - unsigned long _tmp; \ - ulong *rax = &ctxt->dst.val; \ - ulong *rdx = &ctxt->src.val; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "5", "1") \ - "1: \n\t" \ - _op _suffix " %6; " \ - "2: \n\t" \ - _POST_EFLAGS("0", "5", "1") \ - ".pushsection .fixup,\"ax\" \n\t" \ - "3: movb $1, %4 \n\t" \ - "jmp 2b \n\t" \ - ".popsection \n\t" \ - _ASM_EXTABLE(1b, 3b) \ - : "=m" ((ctxt)->eflags), "=&r" (_tmp), \ - "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \ - : "i" (EFLAGS_MASK), "m" ((ctxt)->src2.val)); \ - } while (0) - -/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ -#define emulate_1op_rax_rdx(ctxt, _op, _ex) \ - do { \ - switch((ctxt)->src.bytes) { \ - case 1: \ - __emulate_1op_rax_rdx(ctxt, _op, "b", _ex); \ - break; \ - case 2: \ - __emulate_1op_rax_rdx(ctxt, _op, "w", _ex); \ - break; \ - case 4: \ - __emulate_1op_rax_rdx(ctxt, _op, "l", _ex); \ - break; \ - case 8: ON64( \ - __emulate_1op_rax_rdx(ctxt, _op, "q", _ex)); \ - break; \ - } \ - } while (0) - static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, enum x86_intercept intercept, enum x86_intercept_stage stage) -- cgit v0.10.2 From e47a5f5fb715b90b40747e9e235de557c6abd56c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sat, 9 Feb 2013 11:31:51 +0200 Subject: KVM: x86 emulator: convert XADD to fastop Signed-off-by: Avi Kivity Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d71aac0..9ebe95c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -853,6 +853,8 @@ FASTOP2W(bts); FASTOP2W(btr); FASTOP2W(btc); +FASTOP2(xadd); + static u8 test_cc(unsigned int condition, unsigned long flags) { u8 rc; @@ -3861,7 +3863,7 @@ static const struct opcode twobyte_table[256] = { F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr), D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xC7 */ - D2bv(DstMem | SrcReg | ModRM | Lock), + F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd), N, D(DstMem | SrcReg | ModRM | Mov), N, N, N, GD(0, &group9), /* 0xC8 - 0xCF */ @@ -4698,12 +4700,6 @@ twobyte_insn: ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val : (s16) ctxt->src.val; break; - case 0xc0 ... 0xc1: /* xadd */ - fastop(ctxt, em_add); - /* Write back the register source. */ - ctxt->src.val = ctxt->dst.orig_val; - write_register_operand(&ctxt->src); - break; case 0xc3: /* movnti */ ctxt->dst.bytes = ctxt->op_bytes; ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val : -- cgit v0.10.2 From 566af9404bf57267ea4fc29ca6b4628a17ee3ea7 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Mon, 27 May 2013 18:42:33 +0200 Subject: KVM: s390: Add "devname:kvm" alias. Providing a "devname:kvm" module alias enables automatic loading of the kvm module when /dev/kvm is opened. 
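For illustration only (not part of the patch; assumes a devtmpfs-created /dev/kvm node and the alias added below): a minimal user-space probe such as the following can rely on the kvm module being loaded on first open, and KVM_GET_API_VERSION is the usual sanity-check ioctl.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	/* Opening the node is what triggers "devname:kvm" autoloading. */
	int fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);

	if (fd < 0) {
		perror("open /dev/kvm");
		return 1;
	}
	/* The stable KVM API version is 12. */
	printf("KVM API version: %d\n", ioctl(fd, KVM_GET_API_VERSION, 0));
	return 0;
}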
Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 93444c4..3b597e5 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1138,3 +1138,12 @@ static void __exit kvm_s390_exit(void) module_init(kvm_s390_init); module_exit(kvm_s390_exit); + +/* + * Enable autoloading of the kvm module. + * Note that we add the module alias here instead of virt/kvm/kvm_main.c + * since x86 takes a different approach. + */ +#include +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); -- cgit v0.10.2 From 6ea34c9b78c10289846db0abeebd6b84d5aca084 Mon Sep 17 00:00:00 2001 From: Amos Kong Date: Sat, 25 May 2013 06:44:15 +0800 Subject: kvm: exclude ioeventfd from counting kvm_io_range limit We can easily reach the 1000 limit by starting a VM with a couple hundred I/O devices (multifunction=on). The hardcoded limit has already been adjusted three times (6 ~ 200 ~ 300 ~ 1000). In userspace, the maximum file descriptor limit already bounds the number of ioeventfds. But kvm_io_bus devices are also used for the PIT, PIC, IOAPIC and coalesced MMIO; those cannot be limited by the maximum file descriptor. Currently only ioeventfds consume an excessive number of kvm_io_bus devices, so just exclude them from the kvm_io_range limit count. Also fix one indentation issue in kvm_host.h. Signed-off-by: Amos Kong Reviewed-by: Stefan Hajnoczi Signed-off-by: Gleb Natapov diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d9a3c30..e3aae6d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -145,7 +145,8 @@ struct kvm_io_range { #define NR_IOBUS_DEVS 1000 struct kvm_io_bus { - int dev_count; + int dev_count; + int ioeventfd_count; struct kvm_io_range range[]; }; diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 64ee720..1550637 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -753,6 +753,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) if (ret < 0) goto unlock_fail; + kvm->buses[bus_idx]->ioeventfd_count++; list_add_tail(&p->list, &kvm->ioeventfds); mutex_unlock(&kvm->slots_lock); @@ -798,6 +799,7 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) continue; kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); + kvm->buses[bus_idx]->ioeventfd_count--; ioeventfd_release(p); ret = 0; break; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b547a1c..1580dd4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2926,7 +2926,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, struct kvm_io_bus *new_bus, *bus; bus = kvm->buses[bus_idx]; - if (bus->dev_count > NR_IOBUS_DEVS - 1) + /* exclude ioeventfd which is limited by maximum fd */ + if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) return -ENOSPC; new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) * -- cgit v0.10.2 From 758ccc89b83cc15d575204091c1a1fec306245cb Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 31 May 2013 08:36:20 +0800 Subject: KVM: x86: drop calling kvm_mmu_zap_all in emulator_fix_hypercall Quote Gleb's mail: | Back then kvm->lock protected memslot access so code like: | | mutex_lock(&vcpu->kvm->lock); | kvm_mmu_zap_all(vcpu->kvm); | mutex_unlock(&vcpu->kvm->lock); | | which is what 7aa81cc0 does was enough to guaranty that no vcpu will | run while code is patched. This is no longer the case and | mutex_lock(&vcpu->kvm->lock); is gone from that code path long time ago, | so now kvm_mmu_zap_all() there is useless and the code is incorrect.
So we drop it; it will be fixed later. Signed-off-by: Xiao Guangrong Reviewed-by: Marcelo Tosatti Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8d28810..6739b1d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5523,13 +5523,6 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) char instruction[3]; unsigned long rip = kvm_rip_read(vcpu); - /* - * Blow out the MMU to ensure that no other VCPU has an active mapping - * to ensure that the updated hypercall appears atomically across all - * VCPUs. - */ - kvm_mmu_zap_all(vcpu->kvm); - kvm_x86_ops->patch_hypercall(vcpu, instruction); return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); -- cgit v0.10.2 From a2ae162265e88bf5490ce54fd5f2d430d6d992b7 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 31 May 2013 08:36:21 +0800 Subject: KVM: MMU: drop unnecessary kvm_reload_remote_mmus It is the responsibility of kvm_mmu_zap_all to keep the MMU and TLBs consistent. The reload is also unnecessary after zapping all MMIO sptes, since no MMIO spte exists on a root shadow page and such sptes cannot be cached in the TLB. Signed-off-by: Xiao Guangrong Reviewed-by: Marcelo Tosatti Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6739b1d..3758ff9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7060,16 +7060,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * If memory slot is created, or moved, we need to clear all * mmio sptes. */ - if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { + if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) kvm_mmu_zap_mmio_sptes(kvm); - kvm_reload_remote_mmus(kvm); - } } void kvm_arch_flush_shadow_all(struct kvm *kvm) { kvm_mmu_zap_all(kvm); - kvm_reload_remote_mmus(kvm); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, -- cgit v0.10.2 From 5304b8d37c2a5ebca48330f5e7868d240eafbed1 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 31 May 2013 08:36:22 +0800 Subject: KVM: MMU: fast invalidate all pages The current kvm_mmu_zap_all is really slow: it holds mmu-lock while walking and zapping all shadow pages one by one, and it also needs to zap every guest page's rmap and every shadow page's parent spte list. Things get particularly bad as the guest uses more memory or vcpus; it does not scale. In this patch, we introduce a faster way to invalidate all shadow pages. KVM maintains a global MMU generation number, stored in kvm->arch.mmu_valid_gen, and every shadow page records the current global generation number in sp->mmu_valid_gen when it is created. When KVM needs to zap all shadow page sptes, it simply increments the global generation number and then reloads the root shadow pages on all vcpus. Each vcpu will build a new shadow page table against the current generation number, which ensures the old pages are not used any more.
Then the obsolete pages (sp->mmu_valid_gen != kvm->arch.mmu_valid_gen) are zapped by using lock-break technique Signed-off-by: Xiao Guangrong Reviewed-by: Marcelo Tosatti Signed-off-by: Gleb Natapov diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3741c65..bff7d46 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -222,6 +222,7 @@ struct kvm_mmu_page { int root_count; /* Currently serving as active root */ unsigned int unsync_children; unsigned long parent_ptes; /* Reverse mapping for parent_pte */ + unsigned long mmu_valid_gen; DECLARE_BITMAP(unsync_child_bitmap, 512); #ifdef CONFIG_X86_32 @@ -529,6 +530,7 @@ struct kvm_arch { unsigned int n_requested_mmu_pages; unsigned int n_max_mmu_pages; unsigned int indirect_shadow_pages; + unsigned long mmu_valid_gen; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* * Hash table of struct kvm_mmu_page. diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f8ca2f3..d71bf8f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1511,6 +1511,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, if (!direct) sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + + /* + * The active_mmu_pages list is the FIFO list, do not move the + * page until it is zapped. kvm_zap_obsolete_pages depends on + * this feature. See the comments in kvm_zap_obsolete_pages(). + */ list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); sp->parent_ptes = 0; mmu_page_add_parent_pte(vcpu, sp, parent_pte); @@ -1838,6 +1844,11 @@ static void clear_sp_write_flooding_count(u64 *spte) __clear_sp_write_flooding_count(sp); } +static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); +} + static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, @@ -1900,6 +1911,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, account_shadowed(vcpu->kvm, gfn); } + sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; init_shadow_page_table(sp); trace_kvm_mmu_get_page(sp, true); return sp; @@ -2070,8 +2082,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, ret = mmu_zap_unsync_children(kvm, sp, invalid_list); kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); + if (!sp->role.invalid && !sp->role.direct) unaccount_shadowed(kvm, sp->gfn); + if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { @@ -4195,6 +4209,82 @@ restart: spin_unlock(&kvm->mmu_lock); } +static void kvm_zap_obsolete_pages(struct kvm *kvm) +{ + struct kvm_mmu_page *sp, *node; + LIST_HEAD(invalid_list); + +restart: + list_for_each_entry_safe_reverse(sp, node, + &kvm->arch.active_mmu_pages, link) { + /* + * No obsolete page exists before new created page since + * active_mmu_pages is the FIFO list. 
+ */ + if (!is_obsolete_sp(kvm, sp)) + break; + + /* + * Do not repeatedly zap a root page to avoid unnecessary + * KVM_REQ_MMU_RELOAD, otherwise we may not be able to + * progress: + * vcpu 0 vcpu 1 + * call vcpu_enter_guest(): + * 1): handle KVM_REQ_MMU_RELOAD + * and require mmu-lock to + * load mmu + * repeat: + * 1): zap root page and + * send KVM_REQ_MMU_RELOAD + * + * 2): if (cond_resched_lock(mmu-lock)) + * + * 2): hold mmu-lock and load mmu + * + * 3): see KVM_REQ_MMU_RELOAD bit + * on vcpu->requests is set + * then return 1 to call + * vcpu_enter_guest() again. + * goto repeat; + * + * Since we are reversely walking the list and the invalid + * list will be moved to the head, skip the invalid page + * can help us to avoid the infinity list walking. + */ + if (sp->role.invalid) + continue; + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + kvm_mmu_commit_zap_page(kvm, &invalid_list); + cond_resched_lock(&kvm->mmu_lock); + goto restart; + } + + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) + goto restart; + } + + kvm_mmu_commit_zap_page(kvm, &invalid_list); +} + +/* + * Fast invalidate all shadow pages and use lock-break technique + * to zap obsolete pages. + * + * It's required when memslot is being deleted or VM is being + * destroyed, in these cases, we should ensure that KVM MMU does + * not use any resource of the being-deleted slot or all slots + * after calling the function. + */ +void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) +{ + spin_lock(&kvm->mmu_lock); + kvm->arch.mmu_valid_gen++; + + kvm_zap_obsolete_pages(kvm); + spin_unlock(&kvm->mmu_lock); +} + void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 2adcbc2..922bfae 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -97,4 +97,5 @@ static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access, return (mmu->permissions[pfec >> 1] >> pte_access) & 1; } +void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); #endif -- cgit v0.10.2 From 6ca18b6950f8dee29361722f28f69847724b276f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 31 May 2013 08:36:23 +0800 Subject: KVM: x86: use the fast way to invalidate all pages Replace kvm_mmu_zap_all by kvm_mmu_invalidate_zap_all_pages Signed-off-by: Xiao Guangrong Reviewed-by: Marcelo Tosatti Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d71bf8f..c8063b9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4194,21 +4194,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) spin_unlock(&kvm->mmu_lock); } -void kvm_mmu_zap_all(struct kvm *kvm) -{ - struct kvm_mmu_page *sp, *node; - LIST_HEAD(invalid_list); - - spin_lock(&kvm->mmu_lock); -restart: - list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) - goto restart; - - kvm_mmu_commit_zap_page(kvm, &invalid_list); - spin_unlock(&kvm->mmu_lock); -} - static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3758ff9..15e10f7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7066,13 +7066,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, void kvm_arch_flush_shadow_all(struct kvm *kvm) { - kvm_mmu_zap_all(kvm); + kvm_mmu_invalidate_zap_all_pages(kvm); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { - 
kvm_arch_flush_shadow_all(kvm); + kvm_mmu_invalidate_zap_all_pages(kvm); } int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) -- cgit v0.10.2 From 2248b023219251908aedda0621251cffc548f258 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 31 May 2013 08:36:24 +0800 Subject: KVM: MMU: show mmu_valid_gen in shadow page related tracepoints Show sp->mmu_valid_gen Signed-off-by: Xiao Guangrong Reviewed-by: Marcelo Tosatti Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index b8f6172..697f466 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -7,16 +7,18 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvmmmu -#define KVM_MMU_PAGE_FIELDS \ - __field(__u64, gfn) \ - __field(__u32, role) \ - __field(__u32, root_count) \ +#define KVM_MMU_PAGE_FIELDS \ + __field(unsigned long, mmu_valid_gen) \ + __field(__u64, gfn) \ + __field(__u32, role) \ + __field(__u32, root_count) \ __field(bool, unsync) -#define KVM_MMU_PAGE_ASSIGN(sp) \ - __entry->gfn = sp->gfn; \ - __entry->role = sp->role.word; \ - __entry->root_count = sp->root_count; \ +#define KVM_MMU_PAGE_ASSIGN(sp) \ + __entry->mmu_valid_gen = sp->mmu_valid_gen; \ + __entry->gfn = sp->gfn; \ + __entry->role = sp->role.word; \ + __entry->root_count = sp->root_count; \ __entry->unsync = sp->unsync; #define KVM_MMU_PAGE_PRINTK() ({ \ @@ -28,8 +30,8 @@ \ role.word = __entry->role; \ \ - trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \ - " %snxe root %u %s%c", \ + trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s" \ + " %snxe root %u %s%c", __entry->mmu_valid_gen, \ __entry->gfn, role.level, \ role.cr4_pae ? " pae" : "", \ role.quadrant, \ -- cgit v0.10.2 From 35006126f024f68727c67001b9cb703c38f69268 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 31 May 2013 08:36:25 +0800 Subject: KVM: MMU: add tracepoint for kvm_mmu_invalidate_all_pages It is good for debug and development Signed-off-by: Xiao Guangrong Reviewed-by: Marcelo Tosatti Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c8063b9..3fd060a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4264,6 +4264,7 @@ restart: void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) { spin_lock(&kvm->mmu_lock); + trace_kvm_mmu_invalidate_zap_all_pages(kvm); kvm->arch.mmu_valid_gen++; kvm_zap_obsolete_pages(kvm); diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 697f466..eb444dd 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -276,6 +276,26 @@ TRACE_EVENT( __spte_satisfied(old_spte), __spte_satisfied(new_spte) ) ); + +TRACE_EVENT( + kvm_mmu_invalidate_zap_all_pages, + TP_PROTO(struct kvm *kvm), + TP_ARGS(kvm), + + TP_STRUCT__entry( + __field(unsigned long, mmu_valid_gen) + __field(unsigned int, mmu_used_pages) + ), + + TP_fast_assign( + __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen; + __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages; + ), + + TP_printk("kvm-mmu-valid-gen %lx used_pages %x", + __entry->mmu_valid_gen, __entry->mmu_used_pages + ) +); #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH -- cgit v0.10.2 From 7f52af7412275c0d23becfc325331ec8b5ff2458 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 31 May 2013 08:36:26 +0800 Subject: KVM: MMU: do not reuse the obsolete page The obsolete page will be zapped soon, do not reuse it to reduce future page fault Signed-off-by: Xiao Guangrong Reviewed-by: Marcelo Tosatti Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3fd060a..0880b9b4 
100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1875,6 +1875,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, role.quadrant = quadrant; } for_each_gfn_sp(vcpu->kvm, sp, gfn) { + if (is_obsolete_sp(vcpu->kvm, sp)) + continue; + if (!need_sync && sp->unsync) need_sync = true; -- cgit v0.10.2 From e7d11c7a894986a13817c1c001e1e7668c5c4eb4 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 31 May 2013 08:36:27 +0800 Subject: KVM: MMU: zap pages in batch Zap at least 10 pages before releasing mmu-lock to reduce the overhead caused by repeatedly re-acquiring the lock. After the patch, kvm_zap_obsolete_pages can always make forward progress, so update the comments accordingly. [ It improves the kernel-build-while-reading-PCI-ROM case by 0.6% ~ 1%. ] Note: I am not sure that "10" is the best speculative value; it is a guess intended to keep a vcpu from spending too long in kvm_zap_obsolete_pages while not starving mmu-lock. Signed-off-by: Xiao Guangrong Reviewed-by: Marcelo Tosatti Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0880b9b4..fe9d6f1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4197,14 +4197,18 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) spin_unlock(&kvm->mmu_lock); } +#define BATCH_ZAP_PAGES 10 static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); + int batch = 0; restart: list_for_each_entry_safe_reverse(sp, node, &kvm->arch.active_mmu_pages, link) { + int ret; + /* * No obsolete page exists before new created page since * active_mmu_pages is the FIFO list. @@ -4213,28 +4217,6 @@ restart: break; /* - * Do not repeatedly zap a root page to avoid unnecessary - * KVM_REQ_MMU_RELOAD, otherwise we may not be able to - * progress: - * vcpu 0 vcpu 1 - * call vcpu_enter_guest(): - * 1): handle KVM_REQ_MMU_RELOAD - * and require mmu-lock to - * load mmu - * repeat: - * 1): zap root page and - * send KVM_REQ_MMU_RELOAD - * - * 2): if (cond_resched_lock(mmu-lock)) - * - * 2): hold mmu-lock and load mmu - * - * 3): see KVM_REQ_MMU_RELOAD bit - * on vcpu->requests is set - * then return 1 to call - * vcpu_enter_guest() again. - * goto repeat; - * * Since we are reversely walking the list and the invalid * list will be moved to the head, skip the invalid page * can help us to avoid the infinity list walking.
@@ -4242,13 +4224,18 @@ restart: if (sp->role.invalid) continue; - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (batch >= BATCH_ZAP_PAGES && + (need_resched() || spin_needbreak(&kvm->mmu_lock))) { + batch = 0; kvm_mmu_commit_zap_page(kvm, &invalid_list); cond_resched_lock(&kvm->mmu_lock); goto restart; } - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) + ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + batch += ret; + + if (ret) goto restart; } -- cgit v0.10.2 From f34d251d66ba263c077ed9d2bbd1874339a4c887 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 31 May 2013 08:36:28 +0800 Subject: KVM: MMU: collapse TLB flushes when zap all pages kvm_zap_obsolete_pages uses lock-break technique to zap pages, it will flush tlb every time when it does lock-break We can reload mmu on all vcpus after updating the generation number so that the obsolete pages are not used on any vcpus, after that we do not need to flush tlb when obsolete pages are zapped It will do kvm_mmu_prepare_zap_page many times and use one kvm_mmu_commit_zap_page to collapse tlb flush, the side-effects is that causes obsolete pages unlinked from active_list but leave on hash-list, so we add the comment around the hash list walker Note: kvm_mmu_commit_zap_page is still needed before free the pages since other vcpus may be doing locklessly shadow page walking Signed-off-by: Xiao Guangrong Reviewed-by: Marcelo Tosatti Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fe9d6f1..674c044 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1654,6 +1654,16 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); +/* + * NOTE: we should pay more attention on the zapped-obsolete page + * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk + * since it has been deleted from active_mmu_pages but still can be found + * at hast list. + * + * for_each_gfn_indirect_valid_sp has skipped that kind of page and + * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), has skipped + * all the obsolete pages. + */ #define for_each_gfn_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ @@ -4224,11 +4234,13 @@ restart: if (sp->role.invalid) continue; + /* + * Need not flush tlb since we only zap the sp with invalid + * generation number. + */ if (batch >= BATCH_ZAP_PAGES && - (need_resched() || spin_needbreak(&kvm->mmu_lock))) { + cond_resched_lock(&kvm->mmu_lock)) { batch = 0; - kvm_mmu_commit_zap_page(kvm, &invalid_list); - cond_resched_lock(&kvm->mmu_lock); goto restart; } @@ -4239,6 +4251,10 @@ restart: goto restart; } + /* + * Should flush tlb before free page tables since lockless-walking + * may use the pages. + */ kvm_mmu_commit_zap_page(kvm, &invalid_list); } @@ -4257,6 +4273,17 @@ void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) trace_kvm_mmu_invalidate_zap_all_pages(kvm); kvm->arch.mmu_valid_gen++; + /* + * Notify all vcpus to reload its shadow page table + * and flush TLB. Then all vcpus will switch to new + * shadow page table with the new mmu_valid_gen. + * + * Note: we should do this under the protection of + * mmu-lock, otherwise, vcpu would purge shadow page + * but miss tlb flush. 
+ */ + kvm_reload_remote_mmus(kvm); + kvm_zap_obsolete_pages(kvm); spin_unlock(&kvm->mmu_lock); } -- cgit v0.10.2 From 365c886860c4ba670d245e762b23987c912c129a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 31 May 2013 08:36:29 +0800 Subject: KVM: MMU: reclaim the zapped-obsolete page first As Marcelo pointed out that | "(retention of large number of pages while zapping) | can be fatal, it can lead to OOM and host crash" We introduce a list, kvm->arch.zapped_obsolete_pages, to link all the pages which are deleted from the mmu cache but not actually freed. When page reclaiming is needed, we always zap this kind of pages first. Signed-off-by: Xiao Guangrong Reviewed-by: Marcelo Tosatti Signed-off-by: Gleb Natapov diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bff7d46..1f98c1b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -536,6 +536,8 @@ struct kvm_arch { * Hash table of struct kvm_mmu_page. */ struct list_head active_mmu_pages; + struct list_head zapped_obsolete_pages; + struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; int iommu_flags; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 674c044..79af88a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4211,7 +4211,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; - LIST_HEAD(invalid_list); int batch = 0; restart: @@ -4244,7 +4243,8 @@ restart: goto restart; } - ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + ret = kvm_mmu_prepare_zap_page(kvm, sp, + &kvm->arch.zapped_obsolete_pages); batch += ret; if (ret) @@ -4255,7 +4255,7 @@ restart: * Should flush tlb before free page tables since lockless-walking * may use the pages. */ - kvm_mmu_commit_zap_page(kvm, &invalid_list); + kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); } /* @@ -4306,6 +4306,11 @@ restart: spin_unlock(&kvm->mmu_lock); } +static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) +{ + return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); +} + static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct kvm *kvm; @@ -4334,15 +4339,23 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) * want to shrink a VM that only started to populate its MMU * anyway. 
*/ - if (!kvm->arch.n_used_mmu_pages) + if (!kvm->arch.n_used_mmu_pages && + !kvm_has_zapped_obsolete_pages(kvm)) continue; idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); + if (kvm_has_zapped_obsolete_pages(kvm)) { + kvm_mmu_commit_zap_page(kvm, + &kvm->arch.zapped_obsolete_pages); + goto unlock; + } + prepare_zap_oldest_mmu_page(kvm, &invalid_list); kvm_mmu_commit_zap_page(kvm, &invalid_list); +unlock: spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 15e10f7..6402951 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6832,6 +6832,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return -EINVAL; INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ -- cgit v0.10.2 From 05988d728dcd962d50374e4e63171324163005b6 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Fri, 31 May 2013 08:36:30 +0800 Subject: KVM: MMU: reduce KVM_REQ_MMU_RELOAD when root page is zapped Quote Gleb's mail: | why don't we check for sp->role.invalid in | kvm_mmu_prepare_zap_page before calling kvm_reload_remote_mmus()? and | Actually we can add check for is_obsolete_sp() there too since | kvm_mmu_invalidate_all_pages() already calls kvm_reload_remote_mmus() | after incrementing mmu_valid_gen. [ Xiao: add some comments and the check of is_obsolete_sp() ] Signed-off-by: Gleb Natapov Signed-off-by: Xiao Guangrong Reviewed-by: Marcelo Tosatti Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 79af88a..6941fa7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2108,7 +2108,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, kvm_mod_used_mmu_pages(kvm, -1); } else { list_move(&sp->link, &kvm->arch.active_mmu_pages); - kvm_reload_remote_mmus(kvm); + + /* + * The obsolete pages can not be used on any vcpus. + * See the comments in kvm_mmu_invalidate_zap_all_pages(). + */ + if (!sp->role.invalid && !is_obsolete_sp(kvm, sp)) + kvm_reload_remote_mmus(kvm); } sp->role.invalid = 1; -- cgit v0.10.2 From 8915aa27d5efbb9185357175b0acf884325565f9 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 11 Jun 2013 23:31:12 -0300 Subject: KVM: x86: handle idiv overflow at kvm_write_tsc Its possible that idivl overflows (due to large delta stored in usdiff, valid scenario). Create an exception handler to catch the overflow exception (division by zero is protected by vcpu->arch.virtual_tsc_khz check), and interpret it accordingly (delta is larger than USEC_PER_SEC). 
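As a plain-C illustration of the failure mode being handled (the names below are hypothetical, not the kernel's): on 32-bit hosts the scaled TSC delta is divided with idivl, which raises a fault whenever the quotient does not fit in 32 bits, and the patch maps that fault to "more than one second".

#include <limits.h>
#include <stdio.h>

#define USEC_PER_SEC 1000000LL

/* Hypothetical helper mirroring the fallback: if the 64-bit by 32-bit
 * division would overflow a 32-bit quotient (the case idivl traps on),
 * treat the difference as larger than one second. */
static long long scaled_tsc_diff_us(long long tsc_diff, unsigned int tsc_khz)
{
	long long us = (tsc_diff * 1000) / tsc_khz;

	if (us > INT_MAX || us < INT_MIN)
		return USEC_PER_SEC;
	return us;
}

int main(void)
{
	/* ~2^43 cycles at 2.5 GHz: the quotient overflows 32 bits,
	 * so the one-second fallback is returned. */
	printf("%lld\n", scaled_tsc_diff_us(1LL << 43, 2500000));
	return 0;
}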
Fixes https://bugzilla.redhat.com/show_bug.cgi?id=969644 Signed-off-by: Marcelo Tosatti Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6402951..737c804 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1194,20 +1194,37 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { + int faulted = 0; + /* n.b - signed multiplication and division required */ usdiff = data - kvm->arch.last_tsc_write; #ifdef CONFIG_X86_64 usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; #else /* do_div() only does unsigned */ - asm("idivl %2; xor %%edx, %%edx" - : "=A"(usdiff) - : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); + asm("1: idivl %[divisor]\n" + "2: xor %%edx, %%edx\n" + " movl $0, %[faulted]\n" + "3:\n" + ".section .fixup,\"ax\"\n" + "4: movl $1, %[faulted]\n" + " jmp 3b\n" + ".previous\n" + + _ASM_EXTABLE(1b, 4b) + + : "=A"(usdiff), [faulted] "=r" (faulted) + : "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz)); + #endif do_div(elapsed, 1000); usdiff -= elapsed; if (usdiff < 0) usdiff = -usdiff; + + /* idivl overflow => difference is larger than USEC_PER_SEC */ + if (faulted) + usdiff = USEC_PER_SEC; } else usdiff = USEC_PER_SEC; /* disable TSC match window below */ -- cgit v0.10.2 From db70ccdfb9953b984f5b95d98c50d8da335bab59 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 12 Jun 2013 13:54:52 +0200 Subject: KVM: s390: Provide function for setting the guest storage key From time to time we need to set the guest storage key. Lets provide a helper function that handles the changes with all the right locking and checking. Signed-off-by: Christian Borntraeger Signed-off-by: Cornelia Huck Signed-off-by: Paolo Bonzini diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 590c321..e1408dd 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -22,6 +22,9 @@ unsigned long *page_table_alloc(struct mm_struct *, unsigned long); void page_table_free(struct mm_struct *, unsigned long *); void page_table_free_rcu(struct mmu_gather *, unsigned long *); +int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned long key, bool nq); + static inline void clear_table(unsigned long *s, unsigned long val, size_t n) { typedef struct { char _[n]; } addrtype; diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 1e0c438..44b1450 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -771,6 +771,54 @@ static inline void page_table_free_pgste(unsigned long *table) __free_page(page); } +int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned long key, bool nq) +{ + spinlock_t *ptl; + pgste_t old, new; + pte_t *ptep; + + down_read(&mm->mmap_sem); + ptep = get_locked_pte(current->mm, addr, &ptl); + if (unlikely(!ptep)) { + up_read(&mm->mmap_sem); + return -EFAULT; + } + + new = old = pgste_get_lock(ptep); + pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | + PGSTE_ACC_BITS | PGSTE_FP_BIT); + pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48; + pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + unsigned long address, bits; + unsigned char skey; + + address = pte_val(*ptep) & PAGE_MASK; + skey = page_get_storage_key(address); + bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); + /* Set storage key ACC and FP */ + page_set_storage_key(address, + (key & 
(_PAGE_ACC_BITS | _PAGE_FP_BIT)), + !nq); + + /* Merge host changed & referenced into pgste */ + pgste_val(new) |= bits << 52; + /* Transfer skey changed & referenced bit to kvm user bits */ + pgste_val(new) |= bits << 45; /* PGSTE_UR_BIT & PGSTE_UC_BIT */ + } + /* changing the guest storage key is considered a change of the page */ + if ((pgste_val(new) ^ pgste_val(old)) & + (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) + pgste_val(new) |= PGSTE_UC_BIT; + + pgste_set_unlock(ptep, new); + pte_unmap_unlock(*ptep, ptl); + up_read(&mm->mmap_sem); + return 0; +} +EXPORT_SYMBOL(set_guest_storage_key); + #else /* CONFIG_PGSTE */ static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, -- cgit v0.10.2 From 69d0d3a3160690cf64ea3bf484ca1f9d7a1bf798 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 12 Jun 2013 13:54:53 +0200 Subject: KVM: s390: guest large pages This patch enables kvm to give large pages to the guest. The heavy lifting is done by the hardware, the host only has to take care of the PFMF instruction, which is also part of EDAT-1. We also support the non-quiescing key setting facility if the host supports it, to behave similar to the interpretation of sske. Signed-off-by: Christian Borntraeger Signed-off-by: Cornelia Huck Signed-off-by: Paolo Bonzini diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 9a809f9..43207dd 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -62,6 +62,7 @@ struct sca_block { #define CPUSTAT_MCDS 0x00000100 #define CPUSTAT_SM 0x00000080 #define CPUSTAT_G 0x00000008 +#define CPUSTAT_GED 0x00000004 #define CPUSTAT_J 0x00000002 #define CPUSTAT_P 0x00000001 @@ -96,7 +97,8 @@ struct kvm_s390_sie_block { __u32 scaoh; /* 0x005c */ __u8 reserved60; /* 0x0060 */ __u8 ecb; /* 0x0061 */ - __u8 reserved62[2]; /* 0x0062 */ + __u8 ecb2; /* 0x0062 */ + __u8 reserved63[1]; /* 0x0063 */ __u32 scaol; /* 0x0064 */ __u8 reserved68[4]; /* 0x0068 */ __u32 todpr; /* 0x006c */ @@ -136,6 +138,7 @@ struct kvm_vcpu_stat { u32 deliver_program_int; u32 deliver_io_int; u32 exit_wait_state; + u32 instruction_pfmf; u32 instruction_stidp; u32 instruction_spx; u32 instruction_stpx; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3b597e5..426e259 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -59,6 +59,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) }, { "deliver_program_interruption", VCPU_STAT(deliver_program_int) }, { "exit_wait_state", VCPU_STAT(exit_wait_state) }, + { "instruction_pfmf", VCPU_STAT(instruction_pfmf) }, { "instruction_stidp", VCPU_STAT(instruction_stidp) }, { "instruction_spx", VCPU_STAT(instruction_spx) }, { "instruction_stpx", VCPU_STAT(instruction_stpx) }, @@ -381,8 +382,10 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH | CPUSTAT_SM | - CPUSTAT_STOPPED); + CPUSTAT_STOPPED | + CPUSTAT_GED); vcpu->arch.sie_block->ecb = 6; + vcpu->arch.sie_block->ecb2 = 8; vcpu->arch.sie_block->eca = 0xC1002001U; vcpu->arch.sie_block->fac = (int) (long) facilities; hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); @@ -1125,7 +1128,7 @@ static int __init kvm_s390_init(void) return -ENOMEM; } memcpy(facilities, S390_lowcore.stfle_fac_list, 16); - facilities[0] &= 0xff00fff3f47c0000ULL; + facilities[0] &= 0xff82fff3f47c0000ULL; facilities[1] &= 0x001c000000000000ULL; return 0; 
} diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 269b523..15795b8 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -86,6 +86,12 @@ static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu, *address2 = (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2; } +static inline void kvm_s390_get_regs_rre(struct kvm_vcpu *vcpu, int *r1, int *r2) +{ + *r1 = (vcpu->arch.sie_block->ipb & 0x00f00000) >> 20; + *r2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16; +} + static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu) { u32 base2 = vcpu->arch.sie_block->ipb >> 28; diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index ecc58a6..bda9c9b 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -1,7 +1,7 @@ /* * handling privileged instructions * - * Copyright IBM Corp. 2008 + * Copyright IBM Corp. 2008, 2013 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License (version 2 only) @@ -20,6 +20,9 @@ #include #include #include +#include +#include +#include #include #include #include "gaccess.h" @@ -212,7 +215,7 @@ static int handle_stfl(struct kvm_vcpu *vcpu) vcpu->stat.instruction_stfl++; /* only pass the facility bits, which we can handle */ - facility_list = S390_lowcore.stfl_fac_list & 0xff00fff3; + facility_list = S390_lowcore.stfl_fac_list & 0xff82fff3; rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list), &facility_list, sizeof(facility_list)); @@ -468,9 +471,88 @@ static int handle_epsw(struct kvm_vcpu *vcpu) return 0; } +#define PFMF_RESERVED 0xfffc0101UL +#define PFMF_SK 0x00020000UL +#define PFMF_CF 0x00010000UL +#define PFMF_UI 0x00008000UL +#define PFMF_FSC 0x00007000UL +#define PFMF_NQ 0x00000800UL +#define PFMF_MR 0x00000400UL +#define PFMF_MC 0x00000200UL +#define PFMF_KEY 0x000000feUL + +static int handle_pfmf(struct kvm_vcpu *vcpu) +{ + int reg1, reg2; + unsigned long start, end; + + vcpu->stat.instruction_pfmf++; + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + + if (!MACHINE_HAS_PFMF) + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OPERATION); + + if (vcpu->run->s.regs.gprs[reg1] & PFMF_RESERVED) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + /* Only provide non-quiescing support if the host supports it */ + if (vcpu->run->s.regs.gprs[reg1] & PFMF_NQ && + S390_lowcore.stfl_fac_list & 0x00020000) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + /* No support for conditional-SSKE */ + if (vcpu->run->s.regs.gprs[reg1] & (PFMF_MR | PFMF_MC)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; + switch (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) { + case 0x00000000: + end = (start + (1UL << 12)) & ~((1UL << 12) - 1); + break; + case 0x00001000: + end = (start + (1UL << 20)) & ~((1UL << 20) - 1); + break; + /* We dont support EDAT2 + case 0x00002000: + end = (start + (1UL << 31)) & ~((1UL << 31) - 1); + break;*/ + default: + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + } + while (start < end) { + unsigned long useraddr; + + useraddr = gmap_translate(start, vcpu->arch.gmap); + if (IS_ERR((void *)useraddr)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) { + if (clear_user((void __user *)useraddr, PAGE_SIZE)) + return 
kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + } + + if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK) { + if (set_guest_storage_key(current->mm, useraddr, + vcpu->run->s.regs.gprs[reg1] & PFMF_KEY, + vcpu->run->s.regs.gprs[reg1] & PFMF_NQ)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + } + + start += PAGE_SIZE; + } + if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) + vcpu->run->s.regs.gprs[reg2] = end; + return 0; +} + static const intercept_handler_t b9_handlers[256] = { [0x8d] = handle_epsw, [0x9c] = handle_io_inst, + [0xaf] = handle_pfmf, }; int kvm_s390_handle_b9(struct kvm_vcpu *vcpu) -- cgit v0.10.2 From b110feaf4d0bbc31802589ea6b956389afdabcee Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Wed, 12 Jun 2013 13:54:54 +0200 Subject: KVM: s390: code cleanup to use common vcpu slab cache cleanup of arch specific code to use common code provided vcpu slab cache instead of kzalloc() provided memory Signed-off-by: Michael Mueller Signed-off-by: Christian Borntraeger Signed-off-by: Cornelia Huck Signed-off-by: Paolo Bonzini diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 426e259..a318365 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -278,7 +278,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) free_page((unsigned long)(vcpu->arch.sie_block)); kvm_vcpu_uninit(vcpu); - kfree(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu); } static void kvm_free_vcpus(struct kvm *kvm) @@ -408,7 +408,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, rc = -ENOMEM; - vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu) goto out; @@ -453,7 +453,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, out_free_sie_block: free_page((unsigned long)(vcpu->arch.sie_block)); out_free_cpu: - kfree(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu); out: return ERR_PTR(rc); } -- cgit v0.10.2 From d0321a24bf10e2299a997c4747b924f79f70a232 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 12 Jun 2013 13:54:55 +0200 Subject: KVM: s390: Use common waitqueue Lets use the common waitqueue for kvm cpus on s390. By itself it is just a cleanup, but it should also improve the accuracy of diag 0x44 which is implemented via kvm_vcpu_on_spin. kvm_vcpu_on_spin has an explicit check for waiting on the waitqueue to optimize the yielding. 
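The connection to kvm_vcpu_on_spin() can be sketched roughly as follows (schematic only, not the real implementation, which also tracks the last boosted vcpu and eligibility): a vcpu sleeping on its waitqueue is skipped as a yield target, and that check only sees s390's halted cpus once they wait on the common vcpu->wq.

/* Schematic sketch of the directed-yield heuristic this cleanup feeds into. */
static void yield_to_some_running_vcpu(struct kvm *kvm)
{
	struct kvm_vcpu *vcpu;
	int i;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (waitqueue_active(&vcpu->wq))
			continue;	/* halted vcpu: nothing to gain */
		if (kvm_vcpu_yield_to(vcpu) > 0)
			break;		/* gave our slice to a runnable vcpu */
	}
}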
Signed-off-by: Christian Borntraeger Signed-off-by: Cornelia Huck Signed-off-by: Paolo Bonzini diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 43207dd..d3ffd7e 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -228,7 +228,7 @@ struct kvm_s390_local_interrupt { atomic_t active; struct kvm_s390_float_interrupt *float_int; int timer_due; /* event indicator for waitqueue below */ - wait_queue_head_t wq; + wait_queue_head_t *wq; atomic_t *cpuflags; unsigned int action_bits; }; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 5c94817..7f35cb3 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -438,7 +438,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) no_timer: spin_lock(&vcpu->arch.local_int.float_int->lock); spin_lock_bh(&vcpu->arch.local_int.lock); - add_wait_queue(&vcpu->arch.local_int.wq, &wait); + add_wait_queue(&vcpu->wq, &wait); while (list_empty(&vcpu->arch.local_int.list) && list_empty(&vcpu->arch.local_int.float_int->list) && (!vcpu->arch.local_int.timer_due) && @@ -452,7 +452,7 @@ no_timer: } __unset_cpu_idle(vcpu); __set_current_state(TASK_RUNNING); - remove_wait_queue(&vcpu->arch.local_int.wq, &wait); + remove_wait_queue(&vcpu->wq, &wait); spin_unlock_bh(&vcpu->arch.local_int.lock); spin_unlock(&vcpu->arch.local_int.float_int->lock); hrtimer_try_to_cancel(&vcpu->arch.ckc_timer); @@ -465,8 +465,8 @@ void kvm_s390_tasklet(unsigned long parm) spin_lock(&vcpu->arch.local_int.lock); vcpu->arch.local_int.timer_due = 1; - if (waitqueue_active(&vcpu->arch.local_int.wq)) - wake_up_interruptible(&vcpu->arch.local_int.wq); + if (waitqueue_active(&vcpu->wq)) + wake_up_interruptible(&vcpu->wq); spin_unlock(&vcpu->arch.local_int.lock); } @@ -613,7 +613,7 @@ int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code) spin_lock_bh(&li->lock); list_add(&inti->list, &li->list); atomic_set(&li->active, 1); - BUG_ON(waitqueue_active(&li->wq)); + BUG_ON(waitqueue_active(li->wq)); spin_unlock_bh(&li->lock); return 0; } @@ -746,8 +746,8 @@ int kvm_s390_inject_vm(struct kvm *kvm, li = fi->local_int[sigcpu]; spin_lock_bh(&li->lock); atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); - if (waitqueue_active(&li->wq)) - wake_up_interruptible(&li->wq); + if (waitqueue_active(li->wq)) + wake_up_interruptible(li->wq); spin_unlock_bh(&li->lock); spin_unlock(&fi->lock); mutex_unlock(&kvm->lock); @@ -832,8 +832,8 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, if (inti->type == KVM_S390_SIGP_STOP) li->action_bits |= ACTION_STOP_ON_STOP; atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); - if (waitqueue_active(&li->wq)) - wake_up_interruptible(&vcpu->arch.local_int.wq); + if (waitqueue_active(&vcpu->wq)) + wake_up_interruptible(&vcpu->wq); spin_unlock_bh(&li->lock); mutex_unlock(&vcpu->kvm->lock); return 0; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index a318365..ba694d2 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -438,7 +438,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu->arch.local_int.float_int = &kvm->arch.float_int; spin_lock(&kvm->arch.float_int.lock); kvm->arch.float_int.local_int[id] = &vcpu->arch.local_int; - init_waitqueue_head(&vcpu->arch.local_int.wq); + vcpu->arch.local_int.wq = &vcpu->wq; vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags; spin_unlock(&kvm->arch.float_int.lock); diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 1c48ab2..033c864 100644 --- a/arch/s390/kvm/sigp.c 
+++ b/arch/s390/kvm/sigp.c @@ -79,8 +79,8 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr) list_add_tail(&inti->list, &li->list); atomic_set(&li->active, 1); atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); - if (waitqueue_active(&li->wq)) - wake_up_interruptible(&li->wq); + if (waitqueue_active(li->wq)) + wake_up_interruptible(li->wq); spin_unlock_bh(&li->lock); rc = SIGP_CC_ORDER_CODE_ACCEPTED; VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr); @@ -117,8 +117,8 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr) list_add_tail(&inti->list, &li->list); atomic_set(&li->active, 1); atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); - if (waitqueue_active(&li->wq)) - wake_up_interruptible(&li->wq); + if (waitqueue_active(li->wq)) + wake_up_interruptible(li->wq); spin_unlock_bh(&li->lock); rc = SIGP_CC_ORDER_CODE_ACCEPTED; VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr); @@ -145,8 +145,8 @@ static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action) atomic_set(&li->active, 1); atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags); li->action_bits |= action; - if (waitqueue_active(&li->wq)) - wake_up_interruptible(&li->wq); + if (waitqueue_active(li->wq)) + wake_up_interruptible(li->wq); out: spin_unlock_bh(&li->lock); @@ -250,8 +250,8 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address, list_add_tail(&inti->list, &li->list); atomic_set(&li->active, 1); - if (waitqueue_active(&li->wq)) - wake_up_interruptible(&li->wq); + if (waitqueue_active(li->wq)) + wake_up_interruptible(li->wq); rc = SIGP_CC_ORDER_CODE_ACCEPTED; VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", cpu_addr, address); -- cgit v0.10.2 From b764bb1c50c279b95a486d338418f7fda74fff71 Mon Sep 17 00:00:00 2001 From: Heinz Graalfs Date: Wed, 12 Jun 2013 13:54:56 +0200 Subject: KVM: s390,perf: Detect if perf samples belong to KVM host or guest This patch is based on an original patch by David Hildenbrand. The perf core implementation calls architecture specific code in order to ask for specific information for a particular sample: perf_instruction_pointer() When perf core code asks for the instruction pointer, architecture specific code must detect if a KVM guest was running when the sample was taken. A sample can be associated with a KVM guest when the PSW supervisor state bit is set and the PSW instruction pointer part contains the address of 'sie_exit'. A KVM guest's instruction pointer information is then retrieved via the gpsw entry pointed to by the sie control block. perf_misc_flags() perf core code calls this function in order to associate the kernel vs. user state information with a particular sample. Architecture specific code must also first detect if a KVM guest was running at the time the sample was taken.
Signed-off-by: Heinz Graalfs Reviewed-by: Hendrik Brueckner Signed-off-by: Christian Borntraeger Signed-off-by: Cornelia Huck Signed-off-by: Paolo Bonzini diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index d3ffd7e..4339069 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -275,4 +275,5 @@ struct kvm_arch{ }; extern int sie64a(struct kvm_s390_sie_block *, u64 *); +extern char sie_exit; #endif diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h index 5f0173a..1141fb3 100644 --- a/arch/s390/include/asm/perf_event.h +++ b/arch/s390/include/asm/perf_event.h @@ -14,3 +14,13 @@ /* Per-CPU flags for PMU states */ #define PMU_F_RESERVED 0x1000 #define PMU_F_ENABLED 0x2000 + +#ifdef CONFIG_64BIT + +/* Perf callbacks */ +struct pt_regs; +extern unsigned long perf_instruction_pointer(struct pt_regs *regs); +extern unsigned long perf_misc_flags(struct pt_regs *regs); +#define perf_misc_flags(regs) perf_misc_flags(regs) + +#endif /* CONFIG_64BIT */ diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S index 51d99ac..b094ced 100644 --- a/arch/s390/kernel/entry64.S +++ b/arch/s390/kernel/entry64.S @@ -964,6 +964,7 @@ sie_done: # See also HANDLE_SIE_INTERCEPT rewind_pad: nop 0 + .globl sie_exit sie_exit: lg %r14,__SF_EMPTY+8(%r15) # load guest register save area stmg %r0,%r13,0(%r14) # save guest gprs 0-13 diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c index f58f37f..a6fc037 100644 --- a/arch/s390/kernel/perf_event.c +++ b/arch/s390/kernel/perf_event.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -39,6 +40,57 @@ int perf_num_counters(void) } EXPORT_SYMBOL(perf_num_counters); +static struct kvm_s390_sie_block *sie_block(struct pt_regs *regs) +{ + struct stack_frame *stack = (struct stack_frame *) regs->gprs[15]; + + if (!stack) + return NULL; + + return (struct kvm_s390_sie_block *) stack->empty1[0]; +} + +static bool is_in_guest(struct pt_regs *regs) +{ + unsigned long ip = instruction_pointer(regs); + + if (user_mode(regs)) + return false; + + return ip == (unsigned long) &sie_exit; +} + +static unsigned long guest_is_user_mode(struct pt_regs *regs) +{ + return sie_block(regs)->gpsw.mask & PSW_MASK_PSTATE; +} + +static unsigned long instruction_pointer_guest(struct pt_regs *regs) +{ + return sie_block(regs)->gpsw.addr & PSW_ADDR_INSN; +} + +unsigned long perf_instruction_pointer(struct pt_regs *regs) +{ + return is_in_guest(regs) ? instruction_pointer_guest(regs) + : instruction_pointer(regs); +} + +static unsigned long perf_misc_guest_flags(struct pt_regs *regs) +{ + return guest_is_user_mode(regs) ? PERF_RECORD_MISC_GUEST_USER + : PERF_RECORD_MISC_GUEST_KERNEL; +} + +unsigned long perf_misc_flags(struct pt_regs *regs) +{ + if (is_in_guest(regs)) + return perf_misc_guest_flags(regs); + + return user_mode(regs) ? 
PERF_RECORD_MISC_USER + : PERF_RECORD_MISC_KERNEL; +} + void perf_event_print_debug(void) { struct cpumf_ctr_info cf_info; diff --git a/arch/s390/kernel/s390_ksyms.c b/arch/s390/kernel/s390_ksyms.c index 9bdbcef..3bac589 100644 --- a/arch/s390/kernel/s390_ksyms.c +++ b/arch/s390/kernel/s390_ksyms.c @@ -7,6 +7,7 @@ EXPORT_SYMBOL(_mcount); #endif #if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE) EXPORT_SYMBOL(sie64a); +EXPORT_SYMBOL(sie_exit); #endif EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memset); -- cgit v0.10.2 From aeb87c3cb7416d4d5931bc939cc083c731479de0 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 12 Jun 2013 13:54:57 +0200 Subject: KVM: s390: Fix epsw instruction decoding The handle_epsw() function calculated the first register in the wrong way, so that it always used r0 by mistake. Now the code uses the common helper function for decoding the registers of rre functions instead to avoid such mistakes. Signed-off-by: Thomas Huth Reviewed-by: Christian Borntraeger Signed-off-by: Cornelia Huck Signed-off-by: Paolo Bonzini diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index bda9c9b..a0c63d7 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -457,8 +457,7 @@ static int handle_epsw(struct kvm_vcpu *vcpu) { int reg1, reg2; - reg1 = (vcpu->arch.sie_block->ipb & 0x00f00000) >> 24; - reg2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16; + kvm_s390_get_regs_rre(vcpu, ®1, ®2); /* This basically extracts the mask half of the psw. */ vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000; -- cgit v0.10.2 From d8968f1f334c18bf65178b757a82864ef3f37613 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 19 Jun 2013 11:42:07 +1000 Subject: kvm api doc: fix section numbers Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paolo Bonzini diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 5f91eda..6365fef 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2261,7 +2261,7 @@ return indicates the attribute is implemented. It does not necessarily indicate that the attribute can be read or written in the device's current state. "addr" is ignored. -4.77 KVM_ARM_VCPU_INIT +4.82 KVM_ARM_VCPU_INIT Capability: basic Architectures: arm @@ -2285,7 +2285,7 @@ Possible features: Depends on KVM_CAP_ARM_PSCI. -4.78 KVM_GET_REG_LIST +4.83 KVM_GET_REG_LIST Capability: basic Architectures: arm @@ -2305,7 +2305,7 @@ This ioctl returns the guest registers that are supported for the KVM_GET_ONE_REG/KVM_SET_ONE_REG calls. -4.80 KVM_ARM_SET_DEVICE_ADDR +4.84 KVM_ARM_SET_DEVICE_ADDR Capability: KVM_CAP_ARM_SET_DEVICE_ADDR Architectures: arm @@ -2342,7 +2342,7 @@ and distributor interface, the ioctl must be called after calling KVM_CREATE_IRQCHIP, but before calling KVM_RUN on any of the VCPUs. Calling this ioctl twice for any of the base addresses will return -EEXIST. 
-4.82 KVM_PPC_RTAS_DEFINE_TOKEN +4.85 KVM_PPC_RTAS_DEFINE_TOKEN Capability: KVM_CAP_PPC_RTAS Architectures: ppc -- cgit v0.10.2 From 6c806a7332e6d36c1f95966b4a8a22f342b68948 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 19 Jun 2013 17:09:19 +0800 Subject: KVM: MMU: update the documentation for reverse mapping of parent_pte Update the document to match the current reverse mapping of parent_pte Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt index 43fcb76..869abcc 100644 --- a/Documentation/virtual/kvm/mmu.txt +++ b/Documentation/virtual/kvm/mmu.txt @@ -191,12 +191,12 @@ Shadow pages contain the following information: A counter keeping track of how many hardware registers (guest cr3 or pdptrs) are now pointing at the page. While this counter is nonzero, the page cannot be destroyed. See role.invalid. - multimapped: - Whether there exist multiple sptes pointing at this page. - parent_pte/parent_ptes: - If multimapped is zero, parent_pte points at the single spte that points at - this page's spt. Otherwise, parent_ptes points at a data structure - with a list of parent_ptes. + parent_ptes: + The reverse mapping for the pte/ptes pointing at this page's spt. If + parent_ptes bit 0 is zero, only one spte points at this pages and + parent_ptes points at this single spte, otherwise, there exists multiple + sptes pointing at this page and (parent_ptes & ~0x1) points at a data + structure with a list of parent_ptes. unsync: If true, then the translations in this page may not match the guest's translation. This is equivalent to the state of the tlb when a pte is -- cgit v0.10.2 From 208dd7567df471c7c47aa25b94569afc115de5f5 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 20 Jun 2013 17:21:59 +0200 Subject: KVM: s390: Renamed PGM_PRIVILEGED_OPERATION Renamed the PGM_PRIVILEGED_OPERATION define to PGM_PRIVILEGED_OP since this define was way longer than the other PGM_* defines and caused the code often to exceed the 80 columns limit when not split to multiple lines. 
Signed-off-by: Thomas Huth Acked-by: Cornelia Huck Signed-off-by: Cornelia Huck Signed-off-by: Paolo Bonzini diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 4339069..3238d40 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -175,7 +175,7 @@ struct kvm_s390_ext_info { }; #define PGM_OPERATION 0x01 -#define PGM_PRIVILEGED_OPERATION 0x02 +#define PGM_PRIVILEGED_OP 0x02 #define PGM_EXECUTE 0x03 #define PGM_PROTECTION 0x04 #define PGM_ADDRESSING 0x05 diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index a0c63d7..a21e014 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -259,8 +259,8 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) u64 addr; if (gpsw->mask & PSW_MASK_PSTATE) - return kvm_s390_inject_program_int(vcpu, - PGM_PRIVILEGED_OPERATION); + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + addr = kvm_s390_get_base_disp_s(vcpu); if (addr & 7) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); @@ -446,7 +446,7 @@ int kvm_s390_handle_b2(struct kvm_vcpu *vcpu) if (handler) { if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, - PGM_PRIVILEGED_OPERATION); + PGM_PRIVILEGED_OP); else return handler(vcpu); } @@ -493,7 +493,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) - return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OPERATION); + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); if (vcpu->run->s.regs.gprs[reg1] & PFMF_RESERVED) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); @@ -564,7 +564,7 @@ int kvm_s390_handle_b9(struct kvm_vcpu *vcpu) if ((handler != handle_epsw) && (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)) return kvm_s390_inject_program_int(vcpu, - PGM_PRIVILEGED_OPERATION); + PGM_PRIVILEGED_OP); else return handler(vcpu); } @@ -581,8 +581,7 @@ int kvm_s390_handle_priv_eb(struct kvm_vcpu *vcpu) /* All eb instructions that end up here are privileged. 
*/ if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) - return kvm_s390_inject_program_int(vcpu, - PGM_PRIVILEGED_OPERATION); + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); handler = eb_handlers[vcpu->arch.sie_block->ipb & 0xff]; if (handler) return handler(vcpu); @@ -642,8 +641,7 @@ static int handle_sckpf(struct kvm_vcpu *vcpu) u32 value; if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) - return kvm_s390_inject_program_int(vcpu, - PGM_PRIVILEGED_OPERATION); + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); if (vcpu->run->s.regs.gprs[0] & 0x00000000ffff0000) return kvm_s390_inject_program_int(vcpu, diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 033c864..bec398c 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -333,8 +333,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) /* sigp in userspace can exit */ if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) - return kvm_s390_inject_program_int(vcpu, - PGM_PRIVILEGED_OPERATION); + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); order_code = kvm_s390_get_base_disp_rs(vcpu); -- cgit v0.10.2 From f9f6bbc6991f2ba21bfaff90f4060f2df766ca20 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 20 Jun 2013 17:22:00 +0200 Subject: KVM: s390: Privileged operation check for TPROT TPROT is a privileged instruction and thus should generate a privileged operation exception when the problem state bit is not cleared in the PSW. Signed-off-by: Thomas Huth Acked-by: Cornelia Huck Signed-off-by: Cornelia Huck Signed-off-by: Paolo Bonzini diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index a21e014..04dc4a1 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -596,6 +596,9 @@ static int handle_tprot(struct kvm_vcpu *vcpu) vcpu->stat.instruction_tprot++; + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + kvm_s390_get_base_disp_sse(vcpu, &address1, &address2); /* we only handle the Linux memory detection case: -- cgit v0.10.2 From 5087dfa6c8b9f7893819f315eb24201ff5c07142 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 20 Jun 2013 17:22:01 +0200 Subject: KVM: s390: Privileged operation checks moved to instruction handlers We need more fine-grained control about the point in time when we check for privileged instructions, since the exceptions that can happen during an instruction have a well-defined priority. For example, for the PFMF instruction, the check for PGM_PRIVILEGED_OP must happen after the check for PGM_OPERATION since the latter has a higher precedence - thus the check for privileged operation must not be done in kvm_s390_handle_b9() already. 
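The ordering can be modelled outside the kernel. The sketch below is a minimal, hypothetical userspace model of the PFMF checks (the struct and function names are invented for illustration; only the PGM_* codes and their relative priority come from this series):

    #include <stdio.h>

    /* Program-interruption codes as defined in kvm_host.h above. */
    #define PGM_OPERATION     0x01
    #define PGM_PRIVILEGED_OP 0x02
    #define PGM_SPECIFICATION 0x06

    /* Hypothetical, simplified guest state -- just enough for the ordering. */
    struct guest_state {
            int facility_installed; /* PFMF facility present on this model */
            int problem_state;      /* PSW problem-state bit set */
            int operands_valid;     /* reserved operand bits all zero */
    };

    /* Returns the program interruption to inject, or 0 to emulate. */
    static int pfmf_exception(const struct guest_state *g)
    {
            if (!g->facility_installed)
                    return PGM_OPERATION;           /* highest priority */
            if (g->problem_state)
                    return PGM_PRIVILEGED_OP;       /* checked only after PGM_OPERATION */
            if (!g->operands_valid)
                    return PGM_SPECIFICATION;       /* lower priority still */
            return 0;
    }

    int main(void)
    {
            /* Problem state *and* missing facility: PGM_OPERATION must win. */
            struct guest_state g = { 0, 1, 1 };

            printf("injected pgm %#x\n", pfmf_exception(&g));
            return 0;
    }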
Signed-off-by: Thomas Huth Acked-by: Cornelia Huck Signed-off-by: Cornelia Huck Signed-off-by: Paolo Bonzini diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 04dc4a1..0b19e22 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -37,6 +37,9 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu) vcpu->stat.instruction_spx++; + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + operand2 = kvm_s390_get_base_disp_s(vcpu); /* must be word boundary */ @@ -68,6 +71,9 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu) vcpu->stat.instruction_stpx++; + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + operand2 = kvm_s390_get_base_disp_s(vcpu); /* must be word boundary */ @@ -92,6 +98,9 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu) vcpu->stat.instruction_stap++; + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + useraddr = kvm_s390_get_base_disp_s(vcpu); if (useraddr & 1) @@ -108,6 +117,10 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu) static int handle_skey(struct kvm_vcpu *vcpu) { vcpu->stat.instruction_storage_key++; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + vcpu->arch.sie_block->gpsw.addr = __rewind_psw(vcpu->arch.sie_block->gpsw, 4); VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation"); @@ -186,6 +199,9 @@ static int handle_io_inst(struct kvm_vcpu *vcpu) { VCPU_EVENT(vcpu, 4, "%s", "I/O instruction"); + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + if (vcpu->kvm->arch.css_support) { /* * Most I/O instructions will be handled by userspace. @@ -214,6 +230,10 @@ static int handle_stfl(struct kvm_vcpu *vcpu) int rc; vcpu->stat.instruction_stfl++; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + /* only pass the facility bits, which we can handle */ facility_list = S390_lowcore.stfl_fac_list & 0xff82fff3; @@ -282,6 +302,9 @@ static int handle_lpswe(struct kvm_vcpu *vcpu) psw_t new_psw; u64 addr; + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + addr = kvm_s390_get_base_disp_s(vcpu); if (addr & 7) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); @@ -300,6 +323,9 @@ static int handle_stidp(struct kvm_vcpu *vcpu) vcpu->stat.instruction_stidp++; + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + operand2 = kvm_s390_get_base_disp_s(vcpu); if (operand2 & 7) @@ -355,6 +381,9 @@ static int handle_stsi(struct kvm_vcpu *vcpu) vcpu->stat.instruction_stsi++; VCPU_EVENT(vcpu, 4, "stsi: fc: %x sel1: %x sel2: %x", fc, sel1, sel2); + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + operand2 = kvm_s390_get_base_disp_s(vcpu); if (operand2 & 0xfff && fc > 0) @@ -436,20 +465,14 @@ int kvm_s390_handle_b2(struct kvm_vcpu *vcpu) intercept_handler_t handler; /* - * a lot of B2 instructions are priviledged. We first check for - * the privileged ones, that we can handle in the kernel. 
If the - * kernel can handle this instruction, we check for the problem - * state bit and (a) handle the instruction or (b) send a code 2 - * program check. - * Anything else goes to userspace.*/ + * A lot of B2 instructions are priviledged. Here we check for + * the privileged ones, that we can handle in the kernel. + * Anything else goes to userspace. + */ handler = b2_handlers[vcpu->arch.sie_block->ipa & 0x00ff]; - if (handler) { - if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) - return kvm_s390_inject_program_int(vcpu, - PGM_PRIVILEGED_OP); - else - return handler(vcpu); - } + if (handler) + return handler(vcpu); + return -EOPNOTSUPP; } @@ -560,14 +583,9 @@ int kvm_s390_handle_b9(struct kvm_vcpu *vcpu) /* This is handled just as for the B2 instructions. */ handler = b9_handlers[vcpu->arch.sie_block->ipa & 0x00ff]; - if (handler) { - if ((handler != handle_epsw) && - (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)) - return kvm_s390_inject_program_int(vcpu, - PGM_PRIVILEGED_OP); - else - return handler(vcpu); - } + if (handler) + return handler(vcpu); + return -EOPNOTSUPP; } @@ -579,9 +597,6 @@ int kvm_s390_handle_priv_eb(struct kvm_vcpu *vcpu) { intercept_handler_t handler; - /* All eb instructions that end up here are privileged. */ - if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) - return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); handler = eb_handlers[vcpu->arch.sie_block->ipb & 0xff]; if (handler) return handler(vcpu); -- cgit v0.10.2 From 93e1750f5ee8417e015bcb0bf2c37bf07b5ff647 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 20 Jun 2013 17:22:02 +0200 Subject: KVM: s390: Check for PSTATE when handling DIAGNOSE DIAGNOSE is a privileged instruction and thus we must make sure that we are in supervisor mode before taking any other actions. Signed-off-by: Thomas Huth Acked-by: Cornelia Huck Signed-off-by: Cornelia Huck Signed-off-by: Paolo Bonzini diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 1c01a99..3074475 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -132,6 +132,9 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu) { int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16; + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + trace_kvm_s390_handle_diag(vcpu, code); switch (code) { case 0x10: -- cgit v0.10.2 From 133608f392ce2e11481317e3d0b02044710a5956 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 20 Jun 2013 17:22:03 +0200 Subject: KVM: s390: Check for access exceptions during TPI When a guest calls the TPI instruction, the second operand address could point to an invalid location. In this case the problem should be signaled to the guest by throwing an access exception. Signed-off-by: Thomas Huth Acked-by: Cornelia Huck Signed-off-by: Cornelia Huck Signed-off-by: Paolo Bonzini diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 0b19e22..4b8fb6c 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -146,9 +146,10 @@ static int handle_tpi(struct kvm_vcpu *vcpu) * Store the two-word I/O interruption code into the * provided area. 
*/ - put_guest(vcpu, inti->io.subchannel_id, (u16 __user *) addr); - put_guest(vcpu, inti->io.subchannel_nr, (u16 __user *) (addr + 2)); - put_guest(vcpu, inti->io.io_int_parm, (u32 __user *) (addr + 4)); + if (put_guest(vcpu, inti->io.subchannel_id, (u16 __user *)addr) + || put_guest(vcpu, inti->io.subchannel_nr, (u16 __user *)(addr + 2)) + || put_guest(vcpu, inti->io.io_int_parm, (u32 __user *)(addr + 4))) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); } else { /* * Store the three-word I/O interruption code into -- cgit v0.10.2 From 953ed88d10444c0e139a2333b6cd96ce01aa94dc Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 20 Jun 2013 17:22:04 +0200 Subject: KVM: s390: Reworked LCTL and LCTLG instructions LCTL and LCTLG are also privileged instructions, thus there is no need for treating them separately from the other instructions in priv.c. So this patch moves these two instructions to priv.c, adds a check for supervisor state and simplifies the "handle_eb" instruction decoding by merging the two eb_handlers jump tables from intercept.c and priv.c into one table only. Signed-off-by: Thomas Huth Acked-by: Cornelia Huck Signed-off-by: Cornelia Huck Signed-off-by: Paolo Bonzini diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index f0b8be0..5ee56e5 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -22,87 +22,6 @@ #include "trace.h" #include "trace-s390.h" -static int handle_lctlg(struct kvm_vcpu *vcpu) -{ - int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; - int reg3 = vcpu->arch.sie_block->ipa & 0x000f; - u64 useraddr; - int reg, rc; - - vcpu->stat.instruction_lctlg++; - - useraddr = kvm_s390_get_base_disp_rsy(vcpu); - - if (useraddr & 7) - return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - - reg = reg1; - - VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x, addr:%llx", reg1, reg3, - useraddr); - trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr); - - do { - rc = get_guest(vcpu, vcpu->arch.sie_block->gcr[reg], - (u64 __user *) useraddr); - if (rc) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - useraddr += 8; - if (reg == reg3) - break; - reg = (reg + 1) % 16; - } while (1); - return 0; -} - -static int handle_lctl(struct kvm_vcpu *vcpu) -{ - int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; - int reg3 = vcpu->arch.sie_block->ipa & 0x000f; - u64 useraddr; - u32 val = 0; - int reg, rc; - - vcpu->stat.instruction_lctl++; - - useraddr = kvm_s390_get_base_disp_rs(vcpu); - - if (useraddr & 3) - return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - - VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x, addr:%llx", reg1, reg3, - useraddr); - trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr); - - reg = reg1; - do { - rc = get_guest(vcpu, val, (u32 __user *) useraddr); - if (rc) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - vcpu->arch.sie_block->gcr[reg] &= 0xffffffff00000000ul; - vcpu->arch.sie_block->gcr[reg] |= val; - useraddr += 4; - if (reg == reg3) - break; - reg = (reg + 1) % 16; - } while (1); - return 0; -} - -static const intercept_handler_t eb_handlers[256] = { - [0x2f] = handle_lctlg, - [0x8a] = kvm_s390_handle_priv_eb, -}; - -static int handle_eb(struct kvm_vcpu *vcpu) -{ - intercept_handler_t handler; - - handler = eb_handlers[vcpu->arch.sie_block->ipb & 0xff]; - if (handler) - return handler(vcpu); - return -EOPNOTSUPP; -} static const intercept_handler_t instruction_handlers[256] = { [0x01] = kvm_s390_handle_01, @@ -110,10 +29,10 @@ static const intercept_handler_t 
instruction_handlers[256] = { [0x83] = kvm_s390_handle_diag, [0xae] = kvm_s390_handle_sigp, [0xb2] = kvm_s390_handle_b2, - [0xb7] = handle_lctl, + [0xb7] = kvm_s390_handle_lctl, [0xb9] = kvm_s390_handle_b9, [0xe5] = kvm_s390_handle_e5, - [0xeb] = handle_eb, + [0xeb] = kvm_s390_handle_eb, }; static int handle_noop(struct kvm_vcpu *vcpu) diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 15795b8..028ca9f 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -132,7 +132,8 @@ int kvm_s390_handle_e5(struct kvm_vcpu *vcpu); int kvm_s390_handle_01(struct kvm_vcpu *vcpu); int kvm_s390_handle_b9(struct kvm_vcpu *vcpu); int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu); -int kvm_s390_handle_priv_eb(struct kvm_vcpu *vcpu); +int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu); +int kvm_s390_handle_eb(struct kvm_vcpu *vcpu); /* implemented in sigp.c */ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 4b8fb6c..c7603f5 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -590,11 +590,87 @@ int kvm_s390_handle_b9(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; } +int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu) +{ + int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; + int reg3 = vcpu->arch.sie_block->ipa & 0x000f; + u64 useraddr; + u32 val = 0; + int reg, rc; + + vcpu->stat.instruction_lctl++; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + useraddr = kvm_s390_get_base_disp_rs(vcpu); + + if (useraddr & 3) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x, addr:%llx", reg1, reg3, + useraddr); + trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr); + + reg = reg1; + do { + rc = get_guest(vcpu, val, (u32 __user *) useraddr); + if (rc) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + vcpu->arch.sie_block->gcr[reg] &= 0xffffffff00000000ul; + vcpu->arch.sie_block->gcr[reg] |= val; + useraddr += 4; + if (reg == reg3) + break; + reg = (reg + 1) % 16; + } while (1); + + return 0; +} + +static int handle_lctlg(struct kvm_vcpu *vcpu) +{ + int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; + int reg3 = vcpu->arch.sie_block->ipa & 0x000f; + u64 useraddr; + int reg, rc; + + vcpu->stat.instruction_lctlg++; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + useraddr = kvm_s390_get_base_disp_rsy(vcpu); + + if (useraddr & 7) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + reg = reg1; + + VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x, addr:%llx", reg1, reg3, + useraddr); + trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr); + + do { + rc = get_guest(vcpu, vcpu->arch.sie_block->gcr[reg], + (u64 __user *) useraddr); + if (rc) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + useraddr += 8; + if (reg == reg3) + break; + reg = (reg + 1) % 16; + } while (1); + + return 0; +} + static const intercept_handler_t eb_handlers[256] = { + [0x2f] = handle_lctlg, [0x8a] = handle_io_inst, }; -int kvm_s390_handle_priv_eb(struct kvm_vcpu *vcpu) +int kvm_s390_handle_eb(struct kvm_vcpu *vcpu) { intercept_handler_t handler; -- cgit v0.10.2 From 87d41fb4da6467622b7a87fd6afe8071abab6dae Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 20 Jun 2013 17:22:05 +0200 Subject: KVM: s390: Fixed priority of execution in STSI Added some missing validity checks for the operands and fixed the 
priority of exceptions for some function codes according to the "Principles of Operation" document. Signed-off-by: Thomas Huth Acked-by: Cornelia Huck Signed-off-by: Cornelia Huck Signed-off-by: Paolo Bonzini diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index c7603f5..0da3e6e 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -385,16 +385,27 @@ static int handle_stsi(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - operand2 = kvm_s390_get_base_disp_s(vcpu); + if (fc > 3) { + vcpu->arch.sie_block->gpsw.mask |= 3ul << 44; /* cc 3 */ + return 0; + } - if (operand2 & 0xfff && fc > 0) + if (vcpu->run->s.regs.gprs[0] & 0x0fffff00 + || vcpu->run->s.regs.gprs[1] & 0xffff0000) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - switch (fc) { - case 0: + if (fc == 0) { vcpu->run->s.regs.gprs[0] = 3 << 28; - vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); + vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); /* cc 0 */ return 0; + } + + operand2 = kvm_s390_get_base_disp_s(vcpu); + + if (operand2 & 0xfff) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + switch (fc) { case 1: /* same handling for 1 and 2 */ case 2: mem = get_zeroed_page(GFP_KERNEL); @@ -411,8 +422,6 @@ static int handle_stsi(struct kvm_vcpu *vcpu) goto out_no_data; handle_stsi_3_2_2(vcpu, (void *) mem); break; - default: - goto out_no_data; } if (copy_to_guest_absolute(vcpu, operand2, (void *) mem, PAGE_SIZE)) { -- cgit v0.10.2 From 885032b91042288f98d3888c2aaf3a108d348d5c Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 7 Jun 2013 16:51:23 +0800 Subject: KVM: MMU: retain more available bits on mmio spte Let mmio spte only use bit62 and bit63 on upper 32 bits, then bit 52 ~ bit 61 can be used for other purposes Signed-off-by: Xiao Guangrong Reviewed-by: Gleb Natapov Reviewed-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 260a919..78ee123 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4176,10 +4176,10 @@ static void ept_set_mmio_spte_mask(void) /* * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). - * Also, magic bits (0xffull << 49) is set to quickly identify mmio + * Also, magic bits (0x3ull << 62) is set to quickly identify mmio * spte. */ - kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull); + kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull); } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 737c804..15cf34d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5280,7 +5280,13 @@ static void kvm_set_mmio_spte_mask(void) * Set the reserved bits and the present bit of an paging-structure * entry to generate page fault with PFER.RSV = 1. */ - mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr; + /* Mask the reserved physical address bits. */ + mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr; + + /* Bit 62 is always reserved for 32bit host. */ + mask |= 0x3ull << 62; + + /* Set the present bit. 
*/ mask |= 1ull; #ifdef CONFIG_X86_64 -- cgit v0.10.2 From 5ae7f87a56fab10b8f9b135a8377c144397293ca Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Tue, 30 Apr 2013 12:02:15 +0530 Subject: ARM: KVM: Allow host virt timer irq to be different from guest timer virt irq The arch_timer irq numbers (or PPI numbers) are implementation dependent, so the host virtual timer irq number can be different from guest virtual timer irq number. This patch ensures that host virtual timer irq number is read from DTB and guest virtual timer irq is determined based on vcpu target type. Signed-off-by: Anup Patel Signed-off-by: Pranavkumar Sawargaonkar Signed-off-by: Christoffer Dall diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c index b80256b..b7840e7 100644 --- a/arch/arm/kvm/reset.c +++ b/arch/arm/kvm/reset.c @@ -27,6 +27,8 @@ #include #include +#include + /****************************************************************************** * Cortex-A15 Reset Values */ @@ -37,6 +39,11 @@ static struct kvm_regs a15_regs_reset = { .usr_regs.ARM_cpsr = SVC_MODE | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT, }; +static const struct kvm_irq_level a15_vtimer_irq = { + .irq = 27, + .level = 1, +}; + /******************************************************************************* * Exported reset function @@ -52,6 +59,7 @@ static struct kvm_regs a15_regs_reset = { int kvm_reset_vcpu(struct kvm_vcpu *vcpu) { struct kvm_regs *cpu_reset; + const struct kvm_irq_level *cpu_vtimer_irq; switch (vcpu->arch.target) { case KVM_ARM_TARGET_CORTEX_A15: @@ -59,6 +67,7 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) return -EINVAL; cpu_reset = &a15_regs_reset; vcpu->arch.midr = read_cpuid_id(); + cpu_vtimer_irq = &a15_vtimer_irq; break; default: return -ENODEV; @@ -70,5 +79,8 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) /* Reset CP15 registers */ kvm_reset_coprocs(vcpu); + /* Reset arch_timer context */ + kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq); + return 0; } diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 68cb9e1..6d9aedd 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -61,6 +61,8 @@ struct arch_timer_cpu { #ifdef CONFIG_KVM_ARM_TIMER int kvm_timer_hyp_init(void); int kvm_timer_init(struct kvm *kvm); +void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, + const struct kvm_irq_level *irq); void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu); @@ -76,6 +78,8 @@ static inline int kvm_timer_init(struct kvm *kvm) return 0; } +static inline void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, + const struct kvm_irq_level *irq) {} static inline void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) {} static inline void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) {} static inline void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) {} diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 2d00b29..af4583e 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -30,9 +30,7 @@ static struct timecounter *timecounter; static struct workqueue_struct *wqueue; -static struct kvm_irq_level timer_irq = { - .level = 1, -}; +static unsigned int host_vtimer_irq; static cycle_t kvm_phys_timer_read(void) { @@ -67,8 +65,8 @@ static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu) timer->cntv_ctl |= ARCH_TIMER_CTRL_IT_MASK; kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, - vcpu->arch.timer_cpu.irq->irq, - vcpu->arch.timer_cpu.irq->level); + timer->irq->irq, + timer->irq->level); 
} static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) @@ -156,6 +154,20 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) timer_arm(timer, ns); } +void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, + const struct kvm_irq_level *irq) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + /* + * The vcpu timer irq number cannot be determined in + * kvm_timer_vcpu_init() because it is called much before + * kvm_vcpu_set_target(). To handle this, we determine + * vcpu timer irq number when the vcpu is reset. + */ + timer->irq = irq; +} + void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; @@ -163,12 +175,11 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); timer->timer.function = kvm_timer_expire; - timer->irq = &timer_irq; } static void kvm_timer_init_interrupt(void *info) { - enable_percpu_irq(timer_irq.irq, 0); + enable_percpu_irq(host_vtimer_irq, 0); } @@ -182,7 +193,7 @@ static int kvm_timer_cpu_notify(struct notifier_block *self, break; case CPU_DYING: case CPU_DYING_FROZEN: - disable_percpu_irq(timer_irq.irq); + disable_percpu_irq(host_vtimer_irq); break; } @@ -229,7 +240,7 @@ int kvm_timer_hyp_init(void) goto out; } - timer_irq.irq = ppi; + host_vtimer_irq = ppi; err = register_cpu_notifier(&kvm_timer_cpu_nb); if (err) { -- cgit v0.10.2 From 24a7f675752e06729589d40a5256970998a21502 Mon Sep 17 00:00:00 2001 From: Dave P Martin Date: Wed, 1 May 2013 17:49:28 +0100 Subject: ARM: KVM: Don't handle PSCI calls via SMC Currently, kvmtool unconditionally declares that HVC should be used to call PSCI, so the function numbers in the DT tell the guest nothing about the function ID namespace or calling convention for SMC. We already assume that the guest will examine and honour the DT, since there is no way it could possibly guess the KVM-specific PSCI function IDs otherwise. So let's not encourage guests to violate what's specified in the DT by using SMC to make the call. [ Modified to apply to top of kvm/arm tree - Christoffer ] Signed-off-by: Dave P Martin Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index 3d74a0b..df4c82d 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -52,9 +52,6 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) { - if (kvm_psci_call(vcpu)) - return 1; - kvm_inject_undefined(vcpu); return 1; } diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c index 7ee5bb7..86a693a 100644 --- a/arch/arm/kvm/psci.c +++ b/arch/arm/kvm/psci.c @@ -75,7 +75,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) * kvm_psci_call - handle PSCI call if r0 value is in range * @vcpu: Pointer to the VCPU struct * - * Handle PSCI calls from guests through traps from HVC or SMC instructions. + * Handle PSCI calls from guests through traps from HVC instructions. 
* The calling convention is similar to SMC calls to the secure world where * the function number is placed in r0 and this function returns true if the * function number specified in r0 is withing the PSCI range, and false -- cgit v0.10.2 From 368074d908b785588778f00b4384376cd636f4a1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 14 May 2013 12:11:35 +0100 Subject: ARM: KVM: remove dead prototype for __kvm_tlb_flush_vmid __kvm_tlb_flush_vmid has been renamed to __kvm_tlb_flush_vmid_ipa, and the old prototype should have been removed when the code was modified. Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 18d5032..6ae3d2a 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -72,8 +72,6 @@ extern char __kvm_hyp_vector[]; extern char __kvm_hyp_code_start[]; extern char __kvm_hyp_code_end[]; -extern void __kvm_tlb_flush_vmid(struct kvm *kvm); - extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); -- cgit v0.10.2 From dac288f7b38a7439502b77dabcdf8a9a5c4ae721 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 14 May 2013 12:11:37 +0100 Subject: ARM: KVM: use phys_addr_t instead of unsigned long long for HYP PGDs HYP PGDs are passed around as phys_addr_t, except just before calling into the hypervisor init code, where they are cast to a rather weird unsigned long long. Just keep them around as phys_addr_t, which is what makes the most sense. Reported-by: Catalin Marinas Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index ff5aaf1..1f3cee2 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -190,8 +190,8 @@ int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, int exception_index); -static inline void __cpu_init_hyp_mode(unsigned long long boot_pgd_ptr, - unsigned long long pgd_ptr, +static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr, + phys_addr_t pgd_ptr, unsigned long hyp_stack_ptr, unsigned long vector_ptr) { diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 37d216d..327a1fb 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -789,8 +789,8 @@ long kvm_arch_vm_ioctl(struct file *filp, static void cpu_init_hyp_mode(void *dummy) { - unsigned long long boot_pgd_ptr; - unsigned long long pgd_ptr; + phys_addr_t boot_pgd_ptr; + phys_addr_t pgd_ptr; unsigned long hyp_stack_ptr; unsigned long stack_page; unsigned long vector_ptr; @@ -798,8 +798,8 @@ static void cpu_init_hyp_mode(void *dummy) /* Switch from the HYP stub to our own HYP init vector */ __hyp_set_vectors(kvm_get_idmap_vector()); - boot_pgd_ptr = (unsigned long long)kvm_mmu_get_boot_httbr(); - pgd_ptr = (unsigned long long)kvm_mmu_get_httbr(); + boot_pgd_ptr = kvm_mmu_get_boot_httbr(); + pgd_ptr = kvm_mmu_get_httbr(); stack_page = __get_cpu_var(kvm_arm_hyp_stack_page); hyp_stack_ptr = stack_page + PAGE_SIZE; vector_ptr = (unsigned long)__kvm_hyp_vector; -- cgit v0.10.2 From 8734f16fb2aa4ff0bb57ad6532661a38bc8ff957 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 14 May 2013 12:11:38 +0100 Subject: ARM: KVM: don't special case PC when doing an MMIO Admitedly, reading a MMIO register to load PC is very weird. 
Writing PC to a MMIO register is probably even worse. But the architecture doesn't forbid any of these, and injecting a Prefetch Abort is the wrong thing to do anyway. Remove this check altogether, and let the adventurous guest wander into LaLaLand if they feel compelled to do so. Reported-by: Catalin Marinas Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index 82b4bab..a464e8d 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -65,11 +65,6 @@ static inline bool vcpu_mode_priv(struct kvm_vcpu *vcpu) return cpsr_mode > USR_MODE;; } -static inline bool kvm_vcpu_reg_is_pc(struct kvm_vcpu *vcpu, int reg) -{ - return reg == 15; -} - static inline u32 kvm_vcpu_get_hsr(struct kvm_vcpu *vcpu) { return vcpu->arch.fault.hsr; diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c index 72a12f2..b8e06b7 100644 --- a/arch/arm/kvm/mmio.c +++ b/arch/arm/kvm/mmio.c @@ -86,12 +86,6 @@ static int decode_hsr(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, sign_extend = kvm_vcpu_dabt_issext(vcpu); rt = kvm_vcpu_dabt_get_rd(vcpu); - if (kvm_vcpu_reg_is_pc(vcpu, rt)) { - /* IO memory trying to read/write pc */ - kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu)); - return 1; - } - mmio->is_write = is_write; mmio->phys_addr = fault_ipa; mmio->len = len; -- cgit v0.10.2 From 4db845c3d8e2f8a219e8ac48834dd4fe085e5d63 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 14 May 2013 12:11:39 +0100 Subject: ARM: KVM: get rid of S2_PGD_SIZE S2_PGD_SIZE defines the number of pages used by a stage-2 PGD and is unused, except for a VM_BUG_ON check that missuses the define. As the check is very unlikely to ever triggered except in circumstances where KVM is the least of our worries, just kill both the define and the VM_BUG_ON check. Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h index 124623e..64e9696 100644 --- a/arch/arm/include/asm/kvm_arm.h +++ b/arch/arm/include/asm/kvm_arm.h @@ -135,7 +135,6 @@ #define KVM_PHYS_MASK (KVM_PHYS_SIZE - 1ULL) #define PTRS_PER_S2_PGD (1ULL << (KVM_PHYS_SHIFT - 30)) #define S2_PGD_ORDER get_order(PTRS_PER_S2_PGD * sizeof(pgd_t)) -#define S2_PGD_SIZE (1 << S2_PGD_ORDER) /* Virtualization Translation Control Register (VTCR) bits */ #define VTCR_SH0 (3 << 12) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 9657065..5385462 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -370,9 +370,6 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm) if (!pgd) return -ENOMEM; - /* stage-2 pgd must be aligned to its size */ - VM_BUG_ON((unsigned long)pgd & (S2_PGD_SIZE - 1)); - memset(pgd, 0, PTRS_PER_S2_PGD * sizeof(pgd_t)); kvm_clean_pgd(pgd); kvm->arch.pgd = pgd; -- cgit v0.10.2 From 6a077e4ab9cbfbf279fb955bae05b03781c97013 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 21 Jun 2013 13:08:46 +0100 Subject: ARM: KVM: perform save/restore of PAR Not saving PAR is an unfortunate oversight. If the guest performs an AT* operation and gets scheduled out before reading the result of the translation from PAR, it could become corrupted by another guest or the host. Saving this register is made slightly more complicated as KVM also uses it on the permission fault handling path, leading to an ugly "stash and restore" sequence. Fortunately, this is already a slow path so we don't really care. 
Also, Linux doesn't do any AT* operation, so Linux guests are not impacted by this bug. [ Slightly tweaked to use an even register as first operand to ldrd and strd operations in interrupts_head.S - Christoffer ] Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 6ae3d2a..a2f43dd 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -37,16 +37,18 @@ #define c5_AIFSR 15 /* Auxilary Instrunction Fault Status R */ #define c6_DFAR 16 /* Data Fault Address Register */ #define c6_IFAR 17 /* Instruction Fault Address Register */ -#define c9_L2CTLR 18 /* Cortex A15 L2 Control Register */ -#define c10_PRRR 19 /* Primary Region Remap Register */ -#define c10_NMRR 20 /* Normal Memory Remap Register */ -#define c12_VBAR 21 /* Vector Base Address Register */ -#define c13_CID 22 /* Context ID Register */ -#define c13_TID_URW 23 /* Thread ID, User R/W */ -#define c13_TID_URO 24 /* Thread ID, User R/O */ -#define c13_TID_PRIV 25 /* Thread ID, Privileged */ -#define c14_CNTKCTL 26 /* Timer Control Register (PL1) */ -#define NR_CP15_REGS 27 /* Number of regs (incl. invalid) */ +#define c7_PAR 18 /* Physical Address Register */ +#define c7_PAR_high 19 /* PAR top 32 bits */ +#define c9_L2CTLR 20 /* Cortex A15 L2 Control Register */ +#define c10_PRRR 21 /* Primary Region Remap Register */ +#define c10_NMRR 22 /* Normal Memory Remap Register */ +#define c12_VBAR 23 /* Vector Base Address Register */ +#define c13_CID 24 /* Context ID Register */ +#define c13_TID_URW 25 /* Thread ID, User R/W */ +#define c13_TID_URO 26 /* Thread ID, User R/O */ +#define c13_TID_PRIV 27 /* Thread ID, Privileged */ +#define c14_CNTKCTL 28 /* Timer Control Register (PL1) */ +#define NR_CP15_REGS 29 /* Number of regs (incl. 
invalid) */ #define ARM_EXCEPTION_RESET 0 #define ARM_EXCEPTION_UNDEFINED 1 diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index 8eea97b..4a51990 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -180,6 +180,10 @@ static const struct coproc_reg cp15_regs[] = { NULL, reset_unknown, c6_DFAR }, { CRn( 6), CRm( 0), Op1( 0), Op2( 2), is32, NULL, reset_unknown, c6_IFAR }, + + /* PAR swapped by interrupt.S */ + { CRn( 7), Op1( 0), is64, NULL, reset_unknown64, c7_PAR }, + /* * DC{C,I,CI}SW operations: */ diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index f7793df..d0a8fa3 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -414,6 +414,10 @@ guest_trap: mrcne p15, 4, r2, c6, c0, 4 @ HPFAR bne 3f + /* Preserve PAR */ + mrrc p15, 0, r0, r1, c7 @ PAR + push {r0, r1} + /* Resolve IPA using the xFAR */ mcr p15, 0, r2, c7, c8, 0 @ ATS1CPR isb @@ -424,13 +428,19 @@ guest_trap: lsl r2, r2, #4 orr r2, r2, r1, lsl #24 + /* Restore PAR */ + pop {r0, r1} + mcrr p15, 0, r0, r1, c7 @ PAR + 3: load_vcpu @ Load VCPU pointer to r0 str r2, [r0, #VCPU_HPFAR] 1: mov r1, #ARM_EXCEPTION_HVC b __kvm_vcpu_return -4: pop {r0, r1, r2} @ Failed translation, return to guest +4: pop {r0, r1} @ Failed translation, return to guest + mcrr p15, 0, r0, r1, c7 @ PAR + pop {r0, r1, r2} eret /* diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S index 3c8f2f0..2b44b95 100644 --- a/arch/arm/kvm/interrupts_head.S +++ b/arch/arm/kvm/interrupts_head.S @@ -302,11 +302,14 @@ vcpu .req r0 @ vcpu pointer always in r0 .endif mrc p15, 0, r2, c14, c1, 0 @ CNTKCTL + mrrc p15, 0, r4, r5, c7 @ PAR .if \store_to_vcpu == 0 - push {r2} + push {r2,r4-r5} .else str r2, [vcpu, #CP15_OFFSET(c14_CNTKCTL)] + add r12, vcpu, #CP15_OFFSET(c7_PAR) + strd r4, r5, [r12] .endif .endm @@ -319,12 +322,15 @@ vcpu .req r0 @ vcpu pointer always in r0 */ .macro write_cp15_state read_from_vcpu .if \read_from_vcpu == 0 - pop {r2} + pop {r2,r4-r5} .else ldr r2, [vcpu, #CP15_OFFSET(c14_CNTKCTL)] + add r12, vcpu, #CP15_OFFSET(c7_PAR) + ldrd r4, r5, [r12] .endif mcr p15, 0, r2, c14, c1, 0 @ CNTKCTL + mcrr p15, 0, r4, r5, c7 @ PAR .if \read_from_vcpu == 0 pop {r2-r12} -- cgit v0.10.2 From 479c5ae2f8a55509b691494cd13691d3dc31d102 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 21 Jun 2013 13:08:47 +0100 Subject: ARM: KVM: add missing dsb before invalidating Stage-2 TLBs When performing a Stage-2 TLB invalidation, it is necessary to make sure the write to the page tables is observable by all CPUs. For this purpose, add a dsb instruction to __kvm_tlb_flush_vmid_ipa before doing the TLB invalidation itself. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index d0a8fa3..20e03d9 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -49,6 +49,7 @@ __kvm_hyp_code_start: ENTRY(__kvm_tlb_flush_vmid_ipa) push {r2, r3} + dsb ishst add r0, r0, #KVM_VTTBR ldrd r2, r3, [r0] mcrr p15, 6, r2, r3, c2 @ Write VTTBR -- cgit v0.10.2 From 22cfbb6d730ca2fda236b507d9fba17bf002736c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 21 Jun 2013 13:08:48 +0100 Subject: ARM: KVM: clear exclusive monitor on all exception returns Make sure we clear the exclusive monitor on all exception returns, which otherwise could lead to lock corruptions. 
Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 20e03d9..16cd4ba 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -292,6 +292,7 @@ THUMB( orr r2, r2, #PSR_T_BIT ) ldr r2, =BSYM(panic) msr ELR_hyp, r2 ldr r0, =\panic_str + clrex @ Clear exclusive monitor eret .endm @@ -441,6 +442,7 @@ guest_trap: 4: pop {r0, r1} @ Failed translation, return to guest mcrr p15, 0, r0, r1, c7 @ PAR + clrex pop {r0, r1, r2} eret @@ -467,6 +469,7 @@ switch_to_guest_vfp: pop {r3-r7} pop {r0-r2} + clrex eret #endif -- cgit v0.10.2 From f2dda9d829818b055510187059cdfa4ece10c82d Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Thu, 6 Jun 2013 18:02:54 -0700 Subject: arm/kvm: Cleanup KVM_ARM_MAX_VCPUS logic Commit d21a1c83c7595e387545632e44cd7797b76e19cc (ARM: KVM: define KVM_ARM_MAX_VCPUS unconditionally) changed the Kconfig logic for KVM_ARM_MAX_VCPUS to work around a build error arising from the use of KVM_ARM_MAX_VCPUS when CONFIG_KVM=n. The resulting Kconfig logic is a bit awkward and leaves a KVM_ARM_MAX_VCPUS always defined in the kernel config file. This change reverts the Kconfig logic back and adds a simple preprocessor conditional in kvm_host.h to handle when CONFIG_KVM_ARM_MAX_VCPUS is undefined. Signed-off-by: Geoff Levand Signed-off-by: Christoffer Dall diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 1f3cee2..7d22517 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -25,7 +25,12 @@ #include #include +#if defined(CONFIG_KVM_ARM_MAX_VCPUS) #define KVM_MAX_VCPUS CONFIG_KVM_ARM_MAX_VCPUS +#else +#define KVM_MAX_VCPUS 0 +#endif + #define KVM_USER_MEM_SLOTS 32 #define KVM_PRIVATE_MEM_SLOTS 4 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 370e1a8..49dd64e 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -41,9 +41,9 @@ config KVM_ARM_HOST Provides host support for ARM processors. config KVM_ARM_MAX_VCPUS - int "Number maximum supported virtual CPUs per VM" if KVM_ARM_HOST - default 4 if KVM_ARM_HOST - default 0 + int "Number maximum supported virtual CPUs per VM" + depends on KVM_ARM_HOST + default 4 help Static number of max supported virtual CPUs per VM. -- cgit v0.10.2 From 0f4ca79ebc98c1a4c6f8cdc0448f540db70f9ad8 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 21 Jun 2013 13:54:36 -0700 Subject: Update MAINTAINERS: KVM/ARM work now funded by Linaro Change my e-mail address to the Linaro one and change status from Maintained to Supported. Signed-off-by: Christoffer Dall diff --git a/MAINTAINERS b/MAINTAINERS index 3d7782b..b66a8a3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4692,10 +4692,10 @@ F: arch/s390/kvm/ F: drivers/s390/kvm/ KERNEL VIRTUAL MACHINE (KVM) FOR ARM -M: Christoffer Dall +M: Christoffer Dall L: kvmarm@lists.cs.columbia.edu W: http://systems.cs.columbia.edu/projects/kvm-arm -S: Maintained +S: Supported F: arch/arm/include/uapi/asm/kvm* F: arch/arm/include/asm/kvm* F: arch/arm/kvm/ -- cgit v0.10.2 From 8bd4ffd6b3a98f00267051dc095076ea2ff06ea8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 21 Jun 2013 22:33:22 +0200 Subject: ARM: kvm: don't include drivers/virtio/Kconfig The virtio configuration has recently moved and is now visible everywhere. 
Including the file again from KVM as we used to need earlier now causes dependency problems: warning: (CAIF_VIRTIO && VIRTIO_PCI && VIRTIO_MMIO && REMOTEPROC && RPMSG) selects VIRTIO which has unmet direct dependencies (VIRTUALIZATION) Cc: Christoffer Dall Signed-off-by: Arnd Bergmann Signed-off-by: Christoffer Dall diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 49dd64e..ebf5015 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -67,6 +67,4 @@ config KVM_ARM_TIMER ---help--- Adds support for the Architected Timers in virtual machines -source drivers/virtio/Kconfig - endif # VIRTUALIZATION -- cgit v0.10.2 From f2fd125d32822ee32779551a70d256a7c27dbe40 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 7 Jun 2013 16:51:24 +0800 Subject: KVM: MMU: store generation-number into mmio spte Store the generation-number into bit3 ~ bit11 and bit52 ~ bit61, totally 19 bits can be used, it should be enough for nearly all most common cases In this patch, the generation-number is always 0, it will be changed in the later patch [Gleb: masking generation bits from spte in get_mmio_spte_gfn() and get_mmio_spte_access()] Signed-off-by: Xiao Guangrong Reviewed-by: Gleb Natapov Reviewed-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6941fa7..5d9a1fb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -197,15 +197,52 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); -static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access) +/* + * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number, + * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation + * number. + */ +#define MMIO_SPTE_GEN_LOW_SHIFT 3 +#define MMIO_SPTE_GEN_HIGH_SHIFT 52 + +#define MMIO_GEN_LOW_SHIFT 9 +#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1) +#define MMIO_MAX_GEN ((1 << 19) - 1) + +static u64 generation_mmio_spte_mask(unsigned int gen) +{ + u64 mask; + + WARN_ON(gen > MMIO_MAX_GEN); + + mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT; + mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; + return mask; +} + +static unsigned int get_mmio_spte_generation(u64 spte) +{ + unsigned int gen; + + spte &= ~shadow_mmio_mask; + + gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK; + gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT; + return gen; +} + +static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn, + unsigned access) { struct kvm_mmu_page *sp = page_header(__pa(sptep)); + u64 mask = generation_mmio_spte_mask(0); access &= ACC_WRITE_MASK | ACC_USER_MASK; - + mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT; sp->mmio_cached = true; - trace_mark_mmio_spte(sptep, gfn, access); - mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT); + + trace_mark_mmio_spte(sptep, gfn, access, 0); + mmu_spte_set(sptep, mask); } static bool is_mmio_spte(u64 spte) @@ -215,18 +252,21 @@ static bool is_mmio_spte(u64 spte) static gfn_t get_mmio_spte_gfn(u64 spte) { - return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT; + u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask; + return (spte & ~mask) >> PAGE_SHIFT; } static unsigned get_mmio_spte_access(u64 spte) { - return (spte & ~shadow_mmio_mask) & ~PAGE_MASK; + u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask; + return (spte & ~mask) & ~PAGE_MASK; } -static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, 
unsigned access) +static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, + pfn_t pfn, unsigned access) { if (unlikely(is_noslot_pfn(pfn))) { - mark_mmio_spte(sptep, gfn, access); + mark_mmio_spte(kvm, sptep, gfn, access); return true; } @@ -2364,7 +2404,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte; int ret = 0; - if (set_mmio_spte(sptep, gfn, pfn, pte_access)) + if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access)) return 0; spte = PT_PRESENT_MASK; @@ -3427,8 +3467,8 @@ static inline void protect_clean_gpte(unsigned *access, unsigned gpte) *access &= mask; } -static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, - int *nr_present) +static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, + unsigned access, int *nr_present) { if (unlikely(is_mmio_spte(*sptep))) { if (gfn != get_mmio_spte_gfn(*sptep)) { @@ -3437,7 +3477,7 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, } (*nr_present)++; - mark_mmio_spte(sptep, gfn, access); + mark_mmio_spte(kvm, sptep, gfn, access); return true; } diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index eb444dd..ad24757 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -199,23 +199,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, TRACE_EVENT( mark_mmio_spte, - TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access), - TP_ARGS(sptep, gfn, access), + TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen), + TP_ARGS(sptep, gfn, access, gen), TP_STRUCT__entry( __field(void *, sptep) __field(gfn_t, gfn) __field(unsigned, access) + __field(unsigned int, gen) ), TP_fast_assign( __entry->sptep = sptep; __entry->gfn = gfn; __entry->access = access; + __entry->gen = gen; ), - TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn, - __entry->access) + TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep, + __entry->gfn, __entry->access, __entry->gen) ); TRACE_EVENT( diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index da20860..fb50fa6 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -792,7 +792,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) pte_access &= gpte_access(vcpu, gpte); protect_clean_gpte(&pte_access, gpte); - if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) + if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access, + &nr_present)) continue; if (gfn != sp->gfns[i]) { -- cgit v0.10.2 From b37fbea6cefc3a8ff7b6cfec9867432d1a10046d Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 7 Jun 2013 16:51:25 +0800 Subject: KVM: MMU: make return value of mmio page fault handler more readable Define some meaningful names instead of raw code Signed-off-by: Xiao Guangrong Reviewed-by: Gleb Natapov Reviewed-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5d9a1fb..476d155 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3224,17 +3224,12 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr) return spte; } -/* - * If it is a real mmio page fault, return 1 and emulat the instruction - * directly, return 0 to let CPU fault again on the address, -1 is - * returned if bug is detected. 
- */ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) { u64 spte; if (quickly_check_mmio_pf(vcpu, addr, direct)) - return 1; + return RET_MMIO_PF_EMULATE; spte = walk_shadow_page_get_mmio_spte(vcpu, addr); @@ -3247,7 +3242,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) trace_handle_mmio_page_fault(addr, gfn, access); vcpu_cache_mmio_info(vcpu, addr, gfn, access); - return 1; + return RET_MMIO_PF_EMULATE; } /* @@ -3255,13 +3250,13 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) * it's a BUG if the gfn is not a mmio page. */ if (direct && !check_direct_spte_mmio_pf(spte)) - return -1; + return RET_MMIO_PF_BUG; /* * If the page table is zapped by other cpus, let CPU fault again on * the address. */ - return 0; + return RET_MMIO_PF_RETRY; } EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); @@ -3271,7 +3266,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, int ret; ret = handle_mmio_page_fault_common(vcpu, addr, direct); - WARN_ON(ret < 0); + WARN_ON(ret == RET_MMIO_PF_BUG); return ret; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 922bfae..ba6a19c 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -52,6 +52,20 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); + +/* + * Return values of handle_mmio_page_fault_common: + * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction + * directly. + * RET_MMIO_PF_RETRY: let CPU fault again on the address. + * RET_MMIO_PF_BUG: bug is detected. + */ +enum { + RET_MMIO_PF_EMULATE = 1, + RET_MMIO_PF_RETRY = 0, + RET_MMIO_PF_BUG = -1 +}; + int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 78ee123..85c8d51 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5366,10 +5366,10 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); ret = handle_mmio_page_fault_common(vcpu, gpa, true); - if (likely(ret == 1)) + if (likely(ret == RET_MMIO_PF_EMULATE)) return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == EMULATE_DONE; - if (unlikely(!ret)) + if (unlikely(ret == RET_MMIO_PF_RETRY)) return 1; /* It is the real ept misconfig */ -- cgit v0.10.2 From f8f559422b6c6a05469dfde614b67789b6142cb5 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 7 Jun 2013 16:51:26 +0800 Subject: KVM: MMU: fast invalidate all mmio sptes This patch tries to introduce a very simple and scale way to invalidate all mmio sptes - it need not walk any shadow pages and hold mmu-lock KVM maintains a global mmio valid generation-number which is stored in kvm->memslots.generation and every mmio spte stores the current global generation-number into his available bits when it is created When KVM need zap all mmio sptes, it just simply increase the global generation-number. When guests do mmio access, KVM intercepts a MMIO #PF then it walks the shadow page table and get the mmio spte. 
If the generation-number on the spte does not equal the global generation-number, it will go to the normal #PF handler to update the mmio spte Since 19 bits are used to store generation-number on mmio spte, we zap all mmio sptes when the number is round Signed-off-by: Xiao Guangrong Reviewed-by: Gleb Natapov Reviewed-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1f98c1b..90d05ed 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -773,7 +773,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); -void kvm_mmu_zap_mmio_sptes(struct kvm *kvm); +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 476d155..3e893cd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -205,9 +205,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); #define MMIO_SPTE_GEN_LOW_SHIFT 3 #define MMIO_SPTE_GEN_HIGH_SHIFT 52 +#define MMIO_GEN_SHIFT 19 #define MMIO_GEN_LOW_SHIFT 9 #define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1) -#define MMIO_MAX_GEN ((1 << 19) - 1) +#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) +#define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1) static u64 generation_mmio_spte_mask(unsigned int gen) { @@ -231,17 +233,23 @@ static unsigned int get_mmio_spte_generation(u64 spte) return gen; } +static unsigned int kvm_current_mmio_generation(struct kvm *kvm) +{ + return kvm_memslots(kvm)->generation & MMIO_GEN_MASK; +} + static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn, unsigned access) { struct kvm_mmu_page *sp = page_header(__pa(sptep)); - u64 mask = generation_mmio_spte_mask(0); + unsigned int gen = kvm_current_mmio_generation(kvm); + u64 mask = generation_mmio_spte_mask(gen); access &= ACC_WRITE_MASK | ACC_USER_MASK; mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT; sp->mmio_cached = true; - trace_mark_mmio_spte(sptep, gfn, access, 0); + trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); } @@ -273,6 +281,12 @@ static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, return false; } +static bool check_mmio_spte(struct kvm *kvm, u64 spte) +{ + return likely(get_mmio_spte_generation(spte) == + kvm_current_mmio_generation(kvm)); +} + static inline u64 rsvd_bits(int s, int e) { return ((1ULL << (e - s + 1)) - 1) << s; @@ -3237,6 +3251,9 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) gfn_t gfn = get_mmio_spte_gfn(spte); unsigned access = get_mmio_spte_access(spte); + if (!check_mmio_spte(vcpu->kvm, spte)) + return RET_MMIO_PF_INVALID; + if (direct) addr = 0; @@ -3278,8 +3295,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); - if (unlikely(error_code & PFERR_RSVD_MASK)) - return handle_mmio_page_fault(vcpu, gva, error_code, true); + if (unlikely(error_code & PFERR_RSVD_MASK)) { + r = handle_mmio_page_fault(vcpu, gva, error_code, true); + + if (likely(r != RET_MMIO_PF_INVALID)) + return r; + } r = mmu_topup_memory_caches(vcpu); if (r) @@ -3355,8 +3376,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, ASSERT(vcpu); ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); - if 
(unlikely(error_code & PFERR_RSVD_MASK)) - return handle_mmio_page_fault(vcpu, gpa, error_code, true); + if (unlikely(error_code & PFERR_RSVD_MASK)) { + r = handle_mmio_page_fault(vcpu, gpa, error_code, true); + + if (likely(r != RET_MMIO_PF_INVALID)) + return r; + } r = mmu_topup_memory_caches(vcpu); if (r) @@ -4329,7 +4354,7 @@ void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) spin_unlock(&kvm->mmu_lock); } -void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) +static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); @@ -4352,6 +4377,19 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); } +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) +{ + /* + * The very rare case: if the generation-number is round, + * zap all shadow pages. + * + * The max value is MMIO_MAX_GEN - 1 since it is not called + * when mark memslot invalid. + */ + if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) + kvm_mmu_zap_mmio_sptes(kvm); +} + static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct kvm *kvm; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index ba6a19c..5b59c57 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -56,12 +56,15 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); /* * Return values of handle_mmio_page_fault_common: * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction - * directly. + * directly. + * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page + * fault path update the mmio spte. * RET_MMIO_PF_RETRY: let CPU fault again on the address. * RET_MMIO_PF_BUG: bug is detected. */ enum { RET_MMIO_PF_EMULATE = 1, + RET_MMIO_PF_INVALID = 2, RET_MMIO_PF_RETRY = 0, RET_MMIO_PF_BUG = -1 }; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index fb50fa6..7769699 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -552,9 +552,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); - if (unlikely(error_code & PFERR_RSVD_MASK)) - return handle_mmio_page_fault(vcpu, addr, error_code, + if (unlikely(error_code & PFERR_RSVD_MASK)) { + r = handle_mmio_page_fault(vcpu, addr, error_code, mmu_is_nested(vcpu)); + if (likely(r != RET_MMIO_PF_INVALID)) + return r; + }; r = mmu_topup_memory_caches(vcpu); if (r) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 85c8d51..f4a5b3f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5369,6 +5369,10 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) if (likely(ret == RET_MMIO_PF_EMULATE)) return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == EMULATE_DONE; + + if (unlikely(ret == RET_MMIO_PF_INVALID)) + return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0); + if (unlikely(ret == RET_MMIO_PF_RETRY)) return 1; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 15cf34d..aac5ffc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7084,8 +7084,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * If memory slot is created, or moved, we need to clear all * mmio sptes. 
*/ - if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) - kvm_mmu_zap_mmio_sptes(kvm); + kvm_mmu_invalidate_mmio_sptes(kvm); } void kvm_arch_flush_shadow_all(struct kvm *kvm) -- cgit v0.10.2 From 089504c0d40a24fe37a108c0eda16a9e7b846f12 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 7 Jun 2013 16:51:27 +0800 Subject: KVM: MMU: add tracepoint for check_mmio_spte It is useful for debug mmio spte invalidation Signed-off-by: Xiao Guangrong Reviewed-by: Gleb Natapov Reviewed-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3e893cd..417f36b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -283,8 +283,13 @@ static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, static bool check_mmio_spte(struct kvm *kvm, u64 spte) { - return likely(get_mmio_spte_generation(spte) == - kvm_current_mmio_generation(kvm)); + unsigned int kvm_gen, spte_gen; + + kvm_gen = kvm_current_mmio_generation(kvm); + spte_gen = get_mmio_spte_generation(spte); + + trace_check_mmio_spte(spte, kvm_gen, spte_gen); + return likely(kvm_gen == spte_gen); } static inline u64 rsvd_bits(int s, int e) diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index ad24757..9d2e0ff 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -298,6 +298,30 @@ TRACE_EVENT( __entry->mmu_valid_gen, __entry->mmu_used_pages ) ); + + +TRACE_EVENT( + check_mmio_spte, + TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen), + TP_ARGS(spte, kvm_gen, spte_gen), + + TP_STRUCT__entry( + __field(unsigned int, kvm_gen) + __field(unsigned int, spte_gen) + __field(u64, spte) + ), + + TP_fast_assign( + __entry->kvm_gen = kvm_gen; + __entry->spte_gen = spte_gen; + __entry->spte = spte; + ), + + TP_printk("spte %llx kvm_gen %x spte-gen %x valid %d", __entry->spte, + __entry->kvm_gen, __entry->spte_gen, + __entry->kvm_gen == __entry->spte_gen + ) +); #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH -- cgit v0.10.2 From 69c9ea93eaea95e3a2c5f1a0cf77b02c58979b9a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 7 Jun 2013 16:51:28 +0800 Subject: KVM: MMU: init kvm generation close to mmio wrap-around value Then it has the chance to trigger mmio generation number wrap-around Signed-off-by: Xiao Guangrong Reviewed-by: Gleb Natapov Reviewed-by: Marcelo Tosatti [Change from MMIO_MAX_GEN - 13 to MMIO_MAX_GEN - 150, because 13 is very close to the number of calls to KVM_SET_USER_MEMORY_REGION before the guest is started and there is any chance to create any spte. - Paolo] Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 417f36b..c212101 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -235,7 +235,12 @@ static unsigned int get_mmio_spte_generation(u64 spte) static unsigned int kvm_current_mmio_generation(struct kvm *kvm) { - return kvm_memslots(kvm)->generation & MMIO_GEN_MASK; + /* + * Init kvm generation close to MMIO_MAX_GEN to easily test the + * code of handling generation number wrap-around. 
+ */ + return (kvm_memslots(kvm)->generation + + MMIO_MAX_GEN - 150) & MMIO_GEN_MASK; } static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn, -- cgit v0.10.2 From a8eca9dcc656a405a28ffba43f3d86a1ff0eb331 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 10 Jun 2013 16:28:55 +0800 Subject: KVM: MMU: drop kvm_mmu_zap_mmio_sptes Drop kvm_mmu_zap_mmio_sptes and use kvm_mmu_invalidate_zap_all_pages instead to handle mmio generation number overflow Signed-off-by: Xiao Guangrong Reviewed-by: Gleb Natapov Reviewed-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 90d05ed..966f265 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -230,7 +230,6 @@ struct kvm_mmu_page { #endif int write_flooding_count; - bool mmio_cached; }; struct kvm_pio_request { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c212101..7113a0f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -246,13 +246,11 @@ static unsigned int kvm_current_mmio_generation(struct kvm *kvm) static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn, unsigned access) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); unsigned int gen = kvm_current_mmio_generation(kvm); u64 mask = generation_mmio_spte_mask(gen); access &= ACC_WRITE_MASK | ACC_USER_MASK; mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT; - sp->mmio_cached = true; trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); @@ -4364,24 +4362,6 @@ void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) spin_unlock(&kvm->mmu_lock); } -static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) -{ - struct kvm_mmu_page *sp, *node; - LIST_HEAD(invalid_list); - - spin_lock(&kvm->mmu_lock); -restart: - list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { - if (!sp->mmio_cached) - continue; - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) - goto restart; - } - - kvm_mmu_commit_zap_page(kvm, &invalid_list); - spin_unlock(&kvm->mmu_lock); -} - static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) { return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); @@ -4397,7 +4377,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) * when mark memslot invalid. */ if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) - kvm_mmu_zap_mmio_sptes(kvm); + kvm_mmu_invalidate_zap_all_pages(kvm); } static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) -- cgit v0.10.2 From accaefe07ddbeb12c0de4cec1d62dba6a0ea1605 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 19 Jun 2013 17:09:20 +0800 Subject: KVM: MMU: document clear_spte_count Document it to Documentation/virtual/kvm/mmu.txt Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt index 869abcc..f514a3f 100644 --- a/Documentation/virtual/kvm/mmu.txt +++ b/Documentation/virtual/kvm/mmu.txt @@ -210,6 +210,11 @@ Shadow pages contain the following information: A bitmap indicating which sptes in spt point (directly or indirectly) at pages that may be unsynchronized. Used to quickly locate all unsychronized pages reachable from a given page. + clear_spte_count: + Only present on 32-bit hosts, where a 64-bit spte cannot be written + atomically. The reader uses this while running out of the MMU lock + to detect in-progress updates and retry them until the writer has + finished the write. 
Reverse map =========== diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 966f265..5d28c11 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -226,6 +226,10 @@ struct kvm_mmu_page { DECLARE_BITMAP(unsync_child_bitmap, 512); #ifdef CONFIG_X86_32 + /* + * Used out of the mmu-lock to avoid reading spte values while an + * update is in progress; see the comments in __get_spte_lockless(). + */ int clear_spte_count; #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7113a0f..f385a4c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -466,9 +466,20 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) /* * The idea using the light way get the spte on x86_32 guest is from * gup_get_pte(arch/x86/mm/gup.c). - * The difference is we can not catch the spte tlb flush if we leave - * guest mode, so we emulate it by increase clear_spte_count when spte - * is cleared. + * + * An spte tlb flush may be pending, because kvm_set_pte_rmapp + * coalesces them and we are running out of the MMU lock. Therefore + * we need to protect against in-progress updates of the spte. + * + * Reading the spte while an update is in progress may get the old value + * for the high part of the spte. The race is fine for a present->non-present + * change (because the high part of the spte is ignored for non-present spte), + * but for a present->present change we must reread the spte. + * + * All such changes are done in two steps (present->non-present and + * non-present->present), hence it is enough to count the number of + * present->non-present updates: if it changed while reading the spte, + * we might have hit the race. This is done using clear_spte_count. */ static u64 __get_spte_lockless(u64 *sptep) { -- cgit v0.10.2 From 0cbf8e437b60b8b12d97589509d3e5a581731d36 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 19 Jun 2013 17:09:21 +0800 Subject: KVM: MMU: document write_flooding_count Document write_flooding_count to Documentation/virtual/kvm/mmu.txt Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt index f514a3f..0db7743 100644 --- a/Documentation/virtual/kvm/mmu.txt +++ b/Documentation/virtual/kvm/mmu.txt @@ -215,6 +215,15 @@ Shadow pages contain the following information: atomically. The reader uses this while running out of the MMU lock to detect in-progress updates and retry them until the writer has finished the write. + write_flooding_count: + A guest may write to a page table many times, causing a lot of + emulations if the page needs to be write-protected (see "Synchronized + and unsynchronized pages" below). Leaf pages can be unsynchronized + so that they do not trigger frequent emulation, but this is not + possible for non-leafs. This field counts the number of emulations + since the last time the page table was actually used; if emulation + is triggered too frequently on this page, KVM will unmap the page + to avoid emulation in the future. Reverse map =========== diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5d28c11..6b636fd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -233,6 +233,7 @@ struct kvm_mmu_page { int clear_spte_count; #endif + /* Number of writes since the last time traversal visited this page. 
*/ int write_flooding_count; }; -- cgit v0.10.2 From 67652ed34390b19ede6f847d23d8c68e2c819b50 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 19 Jun 2013 17:09:22 +0800 Subject: KVM: MMU: document mmio page fault Document it to Documentation/virtual/kvm/mmu.txt Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt index 0db7743..0aa8e0e 100644 --- a/Documentation/virtual/kvm/mmu.txt +++ b/Documentation/virtual/kvm/mmu.txt @@ -272,14 +272,21 @@ This is the most complicated event. The cause of a page fault can be: Handling a page fault is performed as follows: + - if the RSV bit of the error code is set, the page fault is caused by guest + accessing MMIO and cached MMIO information is available. + - walk shadow page table + - cache the information to vcpu->arch.mmio_gva, vcpu->arch.access and + vcpu->arch.mmio_gfn, and call the emulator - if needed, walk the guest page tables to determine the guest translation (gva->gpa or ngpa->gpa) - if permissions are insufficient, reflect the fault back to the guest - determine the host page - - if this is an mmio request, there is no host page; call the emulator - to emulate the instruction instead + - if this is an mmio request, there is no host page; cache the info to + vcpu->arch.mmio_gva, vcpu->arch.access and vcpu->arch.mmio_gfn - walk the shadow page table to find the spte for the translation, instantiating missing intermediate page tables as necessary + - If this is an mmio request, cache the mmio info to the spte and set some + reserved bit on the spte (see callers of kvm_mmu_set_mmio_spte_mask) - try to unsynchronize the page - if successful, we can let the guest continue and modify the gpte - emulate the instruction -- cgit v0.10.2 From 2d49c47f350b939b395cd2d1abaa8c3bb6c54326 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 19 Jun 2013 17:09:23 +0800 Subject: KVM: MMU: document fast page fault Document fast page fault to Documentation/virtual/kvm/mmu.txt Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt index 0aa8e0e..42193f2 100644 --- a/Documentation/virtual/kvm/mmu.txt +++ b/Documentation/virtual/kvm/mmu.txt @@ -277,6 +277,9 @@ Handling a page fault is performed as follows: - walk shadow page table - cache the information to vcpu->arch.mmio_gva, vcpu->arch.access and vcpu->arch.mmio_gfn, and call the emulator + - If both P bit and R/W bit of error code are set, this could possibly + be handled as a "fast page fault" (fixed without taking the MMU lock). See + the description in Documentation/virtual/kvm/locking.txt. - if needed, walk the guest page tables to determine the guest translation (gva->gpa or ngpa->gpa) - if permissions are insufficient, reflect the fault back to the guest -- cgit v0.10.2 From f6f8adeef542a18b1cb26a0b772c9781a10bb477 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 19 Jun 2013 17:09:24 +0800 Subject: KVM: MMU: document fast invalidate all pages Document it to Documentation/virtual/kvm/mmu.txt Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt index 42193f2..89c8a4c 100644 --- a/Documentation/virtual/kvm/mmu.txt +++ b/Documentation/virtual/kvm/mmu.txt @@ -210,6 +210,10 @@ Shadow pages contain the following information: A bitmap indicating which sptes in spt point (directly or indirectly) at pages that may be unsynchronized. 
Used to quickly locate all unsychronized pages reachable from a given page. + mmu_valid_gen: + Generation number of the page. It is compared with kvm->arch.mmu_valid_gen + during hash table lookup, and used to skip invalidated shadow pages (see + "Zapping all pages" below.) clear_spte_count: Only present on 32-bit hosts, where a 64-bit spte cannot be written atomically. The reader uses this while running out of the MMU lock @@ -375,6 +379,27 @@ causes its write_count to be incremented, thus preventing instantiation of a large spte. The frames at the end of an unaligned memory slot have artificially inflated ->write_counts so they can never be instantiated. +Zapping all pages (page generation count) +========================================= + +For the large memory guests, walking and zapping all pages is really slow +(because there are a lot of pages), and also blocks memory accesses of +all VCPUs because it needs to hold the MMU lock. + +To make it be more scalable, kvm maintains a global generation number +which is stored in kvm->arch.mmu_valid_gen. Every shadow page stores +the current global generation-number into sp->mmu_valid_gen when it +is created. Pages with a mismatching generation number are "obsolete". + +When KVM need zap all shadow pages sptes, it just simply increases the global +generation-number then reload root shadow pages on all vcpus. As the VCPUs +create new shadow page tables, the old pages are not used because of the +mismatching generation number. + +KVM then walks through all pages and zaps obsolete pages. While the zap +operation needs to take the MMU lock, the lock can be released periodically +so that the VCPUs can make progress. + Further reading =============== diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6b636fd..280e327 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -222,7 +222,10 @@ struct kvm_mmu_page { int root_count; /* Currently serving as active root */ unsigned int unsync_children; unsigned long parent_ptes; /* Reverse mapping for parent_pte */ + + /* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen. */ unsigned long mmu_valid_gen; + DECLARE_BITMAP(unsync_child_bitmap, 512); #ifdef CONFIG_X86_32 -- cgit v0.10.2 From 5a9b3830d462971bf05329148873f8996d1c88fc Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 19 Jun 2013 17:09:25 +0800 Subject: KVM: MMU: document fast invalidate all mmio sptes Document it to Documentation/virtual/kvm/mmu.txt Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt index 89c8a4c..2908941 100644 --- a/Documentation/virtual/kvm/mmu.txt +++ b/Documentation/virtual/kvm/mmu.txt @@ -279,6 +279,8 @@ Handling a page fault is performed as follows: - if the RSV bit of the error code is set, the page fault is caused by guest accessing MMIO and cached MMIO information is available. - walk shadow page table + - check for valid generation number in the spte (see "Fast invalidation of + MMIO sptes" below) - cache the information to vcpu->arch.mmio_gva, vcpu->arch.access and vcpu->arch.mmio_gfn, and call the emulator - If both P bit and R/W bit of error code are set, this could possibly @@ -400,6 +402,30 @@ KVM then walks through all pages and zaps obsolete pages. While the zap operation needs to take the MMU lock, the lock can be released periodically so that the VCPUs can make progress. 
+Fast invalidation of MMIO sptes +=============================== + +As mentioned in "Reaction to events" above, kvm will cache MMIO +information in leaf sptes. When a new memslot is added or an existing +memslot is changed, this information may become stale and needs to be +invalidated. This also needs to hold the MMU lock while walking all +shadow pages, and is made more scalable with a similar technique. + +MMIO sptes have a few spare bits, which are used to store a +generation number. The global generation number is stored in +kvm_memslots(kvm)->generation, and increased whenever guest memory info +changes. This generation number is distinct from the one described in +the previous section. + +When KVM finds an MMIO spte, it checks the generation number of the spte. +If the generation number of the spte does not equal the global generation +number, it will ignore the cached MMIO information and handle the page +fault through the slow path. + +Since only 19 bits are used to store generation-number on mmio spte, all +pages are zapped when there is an overflow. + + Further reading =============== -- cgit v0.10.2 From 7a2e8aaf0f6873b47bc2347f216ea5b0e4c258ab Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 21 Jun 2013 01:34:31 +0900 Subject: KVM: MMU: Inform users of mmio generation wraparound Without this information, users will just see unexpected performance problems and there is little chance we will get good reports from them: note that mmio generation is increased even when we just start, or stop, dirty logging for some memory slot, in which case users cannot expect all shadow pages to be zapped. printk_ratelimited() is used for this taking into account the problems that we can see the information many times when we start multiple VMs and guests can trigger this by reading ROM in a loop for example. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f385a4c..0d094da 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4387,8 +4387,10 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) * The max value is MMIO_MAX_GEN - 1 since it is not called * when mark memslot invalid. */ - if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) + if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) { + printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); + } } static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) -- cgit v0.10.2 From 489223edf29bc08f84e581c9495a2b42c9d52f08 Mon Sep 17 00:00:00 2001 From: Yoshihiro YUNOMAE Date: Wed, 12 Jun 2013 16:43:44 +0900 Subject: kvm: Add a tracepoint write_tsc_offset Add a tracepoint write_tsc_offset for tracing TSC offset change. We want to merge ftrace's trace data of guest OSs and the host OS using TSC for timestamp in chronological order. We need "TSC offset" values for each guest when merge those because the TSC value on a guest is always the host TSC plus guest's TSC offset. If we get the TSC offset values, we can calculate the host TSC value for each guest events from the TSC offset and the event TSC value. The host TSC values of the guest events are used when we want to merge trace data of guests and the host in chronological order. (Note: the trace_clock of both the host and the guest must be set x86-tsc in this case) This tracepoint also records vcpu_id which can be used to merge trace data for SMP guests. 
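For example, a post-processing tool could move a guest event into the host time domain with a helper along the following lines (a sketch based on the relation stated above, guest TSC = host TSC + TSC offset; the function is hypothetical and not part of this patch):

    /* Hypothetical trace-merge helper: convert a guest x86-tsc timestamp
     * into the host TSC domain using the traced tsc_offset. */
    static inline unsigned long long guest_to_host_tsc(unsigned long long guest_tsc,
                                                       unsigned long long tsc_offset)
    {
            /* guest_tsc = host_tsc + tsc_offset, so host_tsc = guest_tsc - tsc_offset;
             * unsigned wrap-around gives the right result for "negative" offsets. */
            return guest_tsc - tsc_offset;
    }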
A merge tool will read TSC offset for each vcpu, then the tool converts guest TSC values to host TSC values for each vcpu. TSC offset is stored in the VMCS by vmx_write_tsc_offset() or vmx_adjust_tsc_offset(). KVM executes the former function when a guest boots. The latter function is executed when kvm clock is updated. Only host can read TSC offset value from VMCS, so a host needs to output TSC offset value when TSC offset is changed. Since the TSC offset is not often changed, it could be overwritten by other frequent events while tracing. To avoid that, I recommend to use a special instance for getting this event: 1. set a instance before booting a guest # cd /sys/kernel/debug/tracing/instances # mkdir tsc_offset # cd tsc_offset # echo x86-tsc > trace_clock # echo 1 > events/kvm/kvm_write_tsc_offset/enable 2. boot a guest Signed-off-by: Yoshihiro YUNOMAE Cc: Joerg Roedel Cc: Marcelo Tosatti Cc: Gleb Natapov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Acked-by: Marcelo Tosatti Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a14a6ea..c0bc803 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1026,7 +1026,10 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) g_tsc_offset = svm->vmcb->control.tsc_offset - svm->nested.hsave->control.tsc_offset; svm->nested.hsave->control.tsc_offset = offset; - } + } else + trace_kvm_write_tsc_offset(vcpu->vcpu_id, + svm->vmcb->control.tsc_offset, + offset); svm->vmcb->control.tsc_offset = offset + g_tsc_offset; @@ -1044,6 +1047,11 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho svm->vmcb->control.tsc_offset += adjustment; if (is_guest_mode(vcpu)) svm->nested.hsave->control.tsc_offset += adjustment; + else + trace_kvm_write_tsc_offset(vcpu->vcpu_id, + svm->vmcb->control.tsc_offset - adjustment, + svm->vmcb->control.tsc_offset); + mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index fe5e00e..545245d 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -756,6 +756,27 @@ TRACE_EVENT( __entry->gpa_match ? "GPA" : "GVA") ); +TRACE_EVENT(kvm_write_tsc_offset, + TP_PROTO(unsigned int vcpu_id, __u64 previous_tsc_offset, + __u64 next_tsc_offset), + TP_ARGS(vcpu_id, previous_tsc_offset, next_tsc_offset), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( __u64, previous_tsc_offset ) + __field( __u64, next_tsc_offset ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->previous_tsc_offset = previous_tsc_offset; + __entry->next_tsc_offset = next_tsc_offset; + ), + + TP_printk("vcpu=%u prev=%llu next=%llu", __entry->vcpu_id, + __entry->previous_tsc_offset, __entry->next_tsc_offset) +); + #ifdef CONFIG_X86_64 #define host_clocks \ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f4a5b3f..036e863 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2096,6 +2096,8 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ? 
vmcs12->tsc_offset : 0)); } else { + trace_kvm_write_tsc_offset(vcpu->vcpu_id, + vmcs_read64(TSC_OFFSET), offset); vmcs_write64(TSC_OFFSET, offset); } } @@ -2103,11 +2105,14 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) { u64 offset = vmcs_read64(TSC_OFFSET); + vmcs_write64(TSC_OFFSET, offset + adjustment); if (is_guest_mode(vcpu)) { /* Even when running L2, the adjustment needs to apply to L1 */ to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment; - } + } else + trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset, + offset + adjustment); } static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index aac5ffc..7d71c0f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7303,3 +7303,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); -- cgit v0.10.2 From 24f7bb52e952912b6a936ebcdc4e744b03e9e5cf Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 24 Jun 2013 15:19:15 +0300 Subject: KVM: Fix RTC interrupt coalescing tracking This reverts most of the f1ed0450a5fac7067590317cbf027f566b6ccbca. After the commit kvm_apic_set_irq() no longer returns accurate information about interrupt injection status if injection is done into disabled APIC. RTC interrupt coalescing tracking relies on the information to be accurate and cannot recover if it is not. Signed-off-by: Gleb Natapov diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9d75193..9f4bea8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -405,17 +405,17 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) return highest_irr; } -static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, - int vector, int level, int trig_mode, - unsigned long *dest_map); +static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, + int vector, int level, int trig_mode, + unsigned long *dest_map); -void kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, - unsigned long *dest_map) +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, + unsigned long *dest_map) { struct kvm_lapic *apic = vcpu->arch.apic; - __apic_accept_irq(apic, irq->delivery_mode, irq->vector, - irq->level, irq->trig_mode, dest_map); + return __apic_accept_irq(apic, irq->delivery_mode, irq->vector, + irq->level, irq->trig_mode, dest_map); } static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) @@ -608,8 +608,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, *r = -1; if (irq->shorthand == APIC_DEST_SELF) { - kvm_apic_set_irq(src->vcpu, irq, dest_map); - *r = 1; + *r = kvm_apic_set_irq(src->vcpu, irq, dest_map); return true; } @@ -654,8 +653,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, continue; if (*r < 0) *r = 0; - kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); - *r += 1; + *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); } ret = true; @@ -664,11 +662,15 @@ out: return ret; } -/* Set an IRQ pending in the lapic. */ -static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, - int vector, int level, int trig_mode, - unsigned long *dest_map) +/* + * Add a pending IRQ into lapic. + * Return 1 if successfully added and 0 if discarded. 
+ */ +static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, + int vector, int level, int trig_mode, + unsigned long *dest_map) { + int result = 0; struct kvm_vcpu *vcpu = apic->vcpu; switch (delivery_mode) { @@ -682,10 +684,13 @@ static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (dest_map) __set_bit(vcpu->vcpu_id, dest_map); - if (kvm_x86_ops->deliver_posted_interrupt) + if (kvm_x86_ops->deliver_posted_interrupt) { + result = 1; kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); - else { - if (apic_test_and_set_irr(vector, apic)) { + } else { + result = !apic_test_and_set_irr(vector, apic); + + if (!result) { if (trig_mode) apic_debug("level trig mode repeatedly " "for vector %d", vector); @@ -697,7 +702,7 @@ static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, } out: trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector, false); + trig_mode, vector, !result); break; case APIC_DM_REMRD: @@ -709,12 +714,14 @@ out: break; case APIC_DM_NMI: + result = 1; kvm_inject_nmi(vcpu); kvm_vcpu_kick(vcpu); break; case APIC_DM_INIT: if (!trig_mode || level) { + result = 1; /* assumes that there are only KVM_APIC_INIT/SIPI */ apic->pending_events = (1UL << KVM_APIC_INIT); /* make sure pending_events is visible before sending @@ -731,6 +738,7 @@ out: case APIC_DM_STARTUP: apic_debug("SIPI to vcpu %d vector 0x%02x\n", vcpu->vcpu_id, vector); + result = 1; apic->sipi_vector = vector; /* make sure sipi_vector is visible for the receiver */ smp_wmb(); @@ -752,6 +760,7 @@ out: delivery_mode); break; } + return result; } int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) @@ -1461,7 +1470,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) return 0; } -void kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) +int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) { u32 reg = kvm_apic_get_reg(apic, lvt_type); int vector, mode, trig_mode; @@ -1470,8 +1479,10 @@ void kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) vector = reg & APIC_VECTOR_MASK; mode = reg & APIC_MODE_MASK; trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; - __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL); + return __apic_accept_irq(apic, mode, vector, 1, trig_mode, + NULL); } + return 0; } void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 61a73a0..c730ac9 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -57,9 +57,9 @@ void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); -void kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, - unsigned long *dest_map); -void kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, + unsigned long *dest_map); +int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map); diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index ef1817b..e2e6b44 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -91,8 +91,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, if (!kvm_is_dm_lowest_prio(irq)) { if (r < 0) r = 0; - kvm_apic_set_irq(vcpu, 
irq, dest_map); - r++; + r += kvm_apic_set_irq(vcpu, irq, dest_map); } else if (kvm_lapic_enabled(vcpu)) { if (!lowest) lowest = vcpu; @@ -101,10 +100,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, } } - if (lowest) { - kvm_apic_set_irq(lowest, irq, dest_map); - r = 1; - } + if (lowest) + r = kvm_apic_set_irq(lowest, irq, dest_map); return r; } -- cgit v0.10.2 From 5f17ce8b954a2ff93f0460a2ae35d56285697d0f Mon Sep 17 00:00:00 2001 From: Tiejun Chen Date: Mon, 13 May 2013 10:00:45 +0800 Subject: KVM: PPC: Guard doorbell exception with CONFIG_PPC_DOORBELL Availablity of the doorbell_exception function is guarded by CONFIG_PPC_DOORBELL. Use the same define to guard our caller of it. Signed-off-by: Tiejun Chen [agraf: improve patch description] Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 1020119..62d4ece 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -795,7 +795,7 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu, kvmppc_fill_pt_regs(®s); timer_interrupt(®s); break; -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3E_64) +#if defined(CONFIG_PPC_DOORBELL) case BOOKE_INTERRUPT_DOORBELL: kvmppc_fill_pt_regs(®s); doorbell_exception(®s); -- cgit v0.10.2 From 8ed7b7e9d2a4e208d3c94af3ddbc628317b30409 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 22 Jun 2013 17:13:32 +1000 Subject: KVM: PPC: Book3S PR: Fix proto-VSID calculations This makes sure the calculation of the proto-VSIDs used by PR KVM is done with 64-bit arithmetic. Since vcpu3s->context_id[] is int, when we do vcpu3s->context_id[0] << ESID_BITS the shift will be done with 32-bit instructions, possibly leading to significant bits getting lost, as the context id can be up to 524283 and ESID_BITS is 18. To fix this we cast the context id to u64 before shifting. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index 3a9a1ac..2c6e7ee 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -325,9 +325,9 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu) return -1; vcpu3s->context_id[0] = err; - vcpu3s->proto_vsid_max = ((vcpu3s->context_id[0] + 1) + vcpu3s->proto_vsid_max = ((u64)(vcpu3s->context_id[0] + 1) << ESID_BITS) - 1; - vcpu3s->proto_vsid_first = vcpu3s->context_id[0] << ESID_BITS; + vcpu3s->proto_vsid_first = (u64)vcpu3s->context_id[0] << ESID_BITS; vcpu3s->proto_vsid_next = vcpu3s->proto_vsid_first; kvmppc_mmu_hpte_init(vcpu); -- cgit v0.10.2 From bc1bc4e3929d7a2e4078a9e48fe1e0bb853de1e6 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 22 Jun 2013 17:14:11 +1000 Subject: KVM: PPC: Book3S PR: Fix invalidation of SLB entry 0 on guest entry On entering a PR KVM guest, we invalidate the whole SLB before loading up the guest entries. We do this using an slbia instruction, which invalidates all entries except entry 0, followed by an slbie to invalidate entry 0. However, the slbie turns out to be ineffective in some circumstances (specifically when the host linear mapping uses 64k pages) because of errors in computing the parameter to the slbie. The result is that the guest kernel hangs very early in boot because it takes a DSI the first time it tries to access kernel data using a linear mapping address in real mode. Currently we construct bits 36 - 43 (big-endian numbering) of the slbie parameter by taking bits 56 - 63 of the SLB VSID doubleword. 
These bits for the tlbie are C (class, 1 bit), B (segment size, 2 bits) and 5 reserved bits. For the SLB VSID doubleword these are C (class, 1 bit), reserved (1 bit), LP (large page size, 2 bits), and 4 reserved bits. Thus we are not setting the B field correctly, and when LP = 01 as it is for 64k pages, we are setting a reserved bit. Rather than add more instructions to calculate the slbie parameter correctly, this takes a simpler approach, which is to set entry 0 to zeroes explicitly. Normally slbmte should not be used to invalidate an entry, since it doesn't invalidate the ERATs, but it is OK to use it to invalidate an entry if it is immediately followed by slbia, which does invalidate the ERATs. (This has been confirmed with the Power architects.) This approach takes fewer instructions and will work whatever the contents of entry 0. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S index 56b983e..4f0caec 100644 --- a/arch/powerpc/kvm/book3s_64_slb.S +++ b/arch/powerpc/kvm/book3s_64_slb.S @@ -66,10 +66,6 @@ slb_exit_skip_ ## num: ld r12, PACA_SLBSHADOWPTR(r13) - /* Save off the first entry so we can slbie it later */ - ld r10, SHADOW_SLB_ESID(0)(r12) - ld r11, SHADOW_SLB_VSID(0)(r12) - /* Remove bolted entries */ UNBOLT_SLB_ENTRY(0) UNBOLT_SLB_ENTRY(1) @@ -81,15 +77,10 @@ slb_exit_skip_ ## num: /* Flush SLB */ + li r10, 0 + slbmte r10, r10 slbia - /* r0 = esid & ESID_MASK */ - rldicr r10, r10, 0, 35 - /* r0 |= CLASS_BIT(VSID) */ - rldic r12, r11, 56 - 36, 36 - or r10, r10, r12 - slbie r10 - /* Fill SLB with our shadow */ lbz r12, SVCPU_SLB_MAX(r3) -- cgit v0.10.2 From 6ed1485f65f0eb17aa7b649d5abe0b011b92f718 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 22 Jun 2013 17:14:48 +1000 Subject: KVM: PPC: Book3S PR: Don't keep scanning HPTEG after we find a match The loop in kvmppc_mmu_book3s_64_xlate() that looks up a translation in the guest hashed page table (HPT) keeps going if it finds an HPTE that matches but doesn't allow access. This is incorrect; it is different from what the hardware does, and there should never be more than one matching HPTE anyway. This fixes it to stop when any matching HPTE is found. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index b871721..2e93bb5 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -167,7 +167,6 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, int i; u8 key = 0; bool found = false; - bool perm_err = false; int second = 0; ulong mp_ea = vcpu->arch.magic_page_ea; @@ -248,11 +247,6 @@ do_second: break; } - if (!gpte->may_read) { - perm_err = true; - continue; - } - dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx " "-> 0x%lx\n", eaddr, avpn, gpte->vpage, gpte->raddr); @@ -281,6 +275,8 @@ do_second: if (pteg[i+1] != oldr) copy_to_user((void __user *)ptegp, pteg, sizeof(pteg)); + if (!gpte->may_read) + return -EPERM; return 0; } else { dprintk("KVM MMU: No PTE found (ea=0x%lx sdr1=0x%llx " @@ -296,13 +292,7 @@ do_second: } } - no_page_found: - - - if (perm_err) - return -EPERM; - return -ENOENT; no_seg_found: -- cgit v0.10.2 From 0f296829b5a59d5a157699cbb23672ccfdd8df4c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 22 Jun 2013 17:16:32 +1000 Subject: KVM: PPC: Book3S PR: Allow guest to use 1TB segments With this, the guest can use 1TB segments as well as 256MB segments. 
Since we now have the situation where a single emulated guest segment could correspond to multiple shadow segments (as the shadow segments are still 256MB segments), this adds a new kvmppc_mmu_flush_segment() to scan for all shadow segments that need to be removed. This restructures the guest HPT (hashed page table) lookup code to use the correct hashing and matching functions for HPTEs within a 1TB segment. We use the standard hpt_hash() function instead of open-coding the hash calculation, and we use HPTE_V_COMPARE() with an AVPN value that has the B (segment size) field included. The calculation of avpn is done a little earlier since it doesn't change in the loop starting at the do_second label. The computation in kvmppc_mmu_book3s_64_esid_to_vsid() changes so that it returns a 256MB VSID even if the guest SLB entry is a 1TB entry. This is because the users of this function are creating 256MB SLB entries. We set a new VSID_1T flag so that entries created from 1T segments don't collide with entries from 256MB segments. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 349ed85..08891d0 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -107,8 +107,9 @@ struct kvmppc_vcpu_book3s { #define CONTEXT_GUEST 1 #define CONTEXT_GUEST_END 2 -#define VSID_REAL 0x1fffffffffc00000ULL -#define VSID_BAT 0x1fffffffffb00000ULL +#define VSID_REAL 0x0fffffffffc00000ULL +#define VSID_BAT 0x0fffffffffb00000ULL +#define VSID_1T 0x1000000000000000ULL #define VSID_REAL_DR 0x2000000000000000ULL #define VSID_REAL_IR 0x4000000000000000ULL #define VSID_PR 0x8000000000000000ULL @@ -123,6 +124,7 @@ extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu); extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu); extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte); extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr); +extern void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong eaddr, ulong seg_size); extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu); extern int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long addr, diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 2e93bb5..ee435ba 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -26,6 +26,7 @@ #include #include #include +#include /* #define DEBUG_MMU */ @@ -76,6 +77,24 @@ static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( return NULL; } +static int kvmppc_slb_sid_shift(struct kvmppc_slb *slbe) +{ + return slbe->tb ? 
SID_SHIFT_1T : SID_SHIFT; +} + +static u64 kvmppc_slb_offset_mask(struct kvmppc_slb *slbe) +{ + return (1ul << kvmppc_slb_sid_shift(slbe)) - 1; +} + +static u64 kvmppc_slb_calc_vpn(struct kvmppc_slb *slb, gva_t eaddr) +{ + eaddr &= kvmppc_slb_offset_mask(slb); + + return (eaddr >> VPN_SHIFT) | + ((slb->vsid) << (kvmppc_slb_sid_shift(slb) - VPN_SHIFT)); +} + static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, bool data) { @@ -85,11 +104,7 @@ static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, if (!slb) return 0; - if (slb->tb) - return (((u64)eaddr >> 12) & 0xfffffff) | - (((u64)slb->vsid) << 28); - - return (((u64)eaddr >> 12) & 0xffff) | (((u64)slb->vsid) << 16); + return kvmppc_slb_calc_vpn(slb, eaddr); } static int kvmppc_mmu_book3s_64_get_pagesize(struct kvmppc_slb *slbe) @@ -100,7 +115,8 @@ static int kvmppc_mmu_book3s_64_get_pagesize(struct kvmppc_slb *slbe) static u32 kvmppc_mmu_book3s_64_get_page(struct kvmppc_slb *slbe, gva_t eaddr) { int p = kvmppc_mmu_book3s_64_get_pagesize(slbe); - return ((eaddr & 0xfffffff) >> p); + + return ((eaddr & kvmppc_slb_offset_mask(slbe)) >> p); } static hva_t kvmppc_mmu_book3s_64_get_pteg( @@ -109,13 +125,15 @@ static hva_t kvmppc_mmu_book3s_64_get_pteg( bool second) { u64 hash, pteg, htabsize; - u32 page; + u32 ssize; hva_t r; + u64 vpn; - page = kvmppc_mmu_book3s_64_get_page(slbe, eaddr); htabsize = ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1); - hash = slbe->vsid ^ page; + vpn = kvmppc_slb_calc_vpn(slbe, eaddr); + ssize = slbe->tb ? MMU_SEGSIZE_1T : MMU_SEGSIZE_256M; + hash = hpt_hash(vpn, kvmppc_mmu_book3s_64_get_pagesize(slbe), ssize); if (second) hash = ~hash; hash &= ((1ULL << 39ULL) - 1ULL); @@ -146,7 +164,7 @@ static u64 kvmppc_mmu_book3s_64_get_avpn(struct kvmppc_slb *slbe, gva_t eaddr) u64 avpn; avpn = kvmppc_mmu_book3s_64_get_page(slbe, eaddr); - avpn |= slbe->vsid << (28 - p); + avpn |= slbe->vsid << (kvmppc_slb_sid_shift(slbe) - p); if (p < 24) avpn >>= ((80 - p) - 56) - 8; @@ -189,13 +207,15 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, if (!slbe) goto no_seg_found; + avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr); + if (slbe->tb) + avpn |= SLB_VSID_B_1T; + do_second: ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second); if (kvm_is_error_hva(ptegp)) goto no_page_found; - avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr); - if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) { printk(KERN_ERR "KVM can't copy data from 0x%lx!\n", ptegp); goto no_page_found; @@ -218,7 +238,7 @@ do_second: continue; /* AVPN compare */ - if (HPTE_V_AVPN_VAL(avpn) == HPTE_V_AVPN_VAL(v)) { + if (HPTE_V_COMPARE(avpn, v)) { u8 pp = (r & HPTE_R_PP) | key; int eaddr_mask = 0xFFF; @@ -324,7 +344,7 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb) slbe->large = (rs & SLB_VSID_L) ? 1 : 0; slbe->tb = (rs & SLB_VSID_B_1T) ? 1 : 0; slbe->esid = slbe->tb ? esid_1t : esid; - slbe->vsid = rs >> 12; + slbe->vsid = (rs & ~SLB_VSID_B) >> (kvmppc_slb_sid_shift(slbe) - 16); slbe->valid = (rb & SLB_ESID_V) ? 1 : 0; slbe->Ks = (rs & SLB_VSID_KS) ? 1 : 0; slbe->Kp = (rs & SLB_VSID_KP) ? 
1 : 0; @@ -365,6 +385,7 @@ static u64 kvmppc_mmu_book3s_64_slbmfev(struct kvm_vcpu *vcpu, u64 slb_nr) static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea) { struct kvmppc_slb *slbe; + u64 seg_size; dprintk("KVM MMU: slbie(0x%llx)\n", ea); @@ -377,7 +398,8 @@ static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea) slbe->valid = false; - kvmppc_mmu_map_segment(vcpu, ea); + seg_size = 1ull << kvmppc_slb_sid_shift(slbe); + kvmppc_mmu_flush_segment(vcpu, ea & ~(seg_size - 1), seg_size); } static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu) @@ -457,8 +479,14 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea); - if (slb) + if (slb) { gvsid = slb->vsid; + if (slb->tb) { + gvsid <<= SID_SHIFT_1T - SID_SHIFT; + gvsid |= esid & ((1ul << (SID_SHIFT_1T - SID_SHIFT)) - 1); + gvsid |= VSID_1T; + } + } } switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index 2c6e7ee..b350d94 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -301,6 +301,23 @@ out: return r; } +void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong ea, ulong seg_size) +{ + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + ulong seg_mask = -seg_size; + int i; + + for (i = 1; i < svcpu->slb_max; i++) { + if ((svcpu->slb[i].esid & SLB_ESID_V) && + (svcpu->slb[i].esid & seg_mask) == ea) { + /* Invalidate this entry */ + svcpu->slb[i].esid = 0; + } + } + + svcpu_put(svcpu); +} + void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) { struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index bdc40b8..19498a5 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1239,8 +1239,7 @@ out: #ifdef CONFIG_PPC64 int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info) { - /* No flags */ - info->flags = 0; + info->flags = KVM_PPC_1T_SEGMENTS; /* SLB is always 64 entries */ info->slb_size = 64; -- cgit v0.10.2 From 681562cd56f5336cbdf6dab0c4b2f6ef16ea89ed Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 22 Jun 2013 17:15:24 +1000 Subject: KVM: PPC: Book3S PR: Invalidate SLB entries properly At present, if the guest creates a valid SLB (segment lookaside buffer) entry with the slbmte instruction, then invalidates it with the slbie instruction, then reads the entry with the slbmfee/slbmfev instructions, the result of the slbmfee will have the valid bit set, even though the entry is not actually considered valid by the host. This is confusing, if not worse. This fixes it by zeroing out the orige and origv fields of the SLB entry structure when the entry is invalidated. 
Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index ee435ba..739bfba 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -397,6 +397,8 @@ static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea) dprintk("KVM MMU: slbie(0x%llx, 0x%llx)\n", ea, slbe->esid); slbe->valid = false; + slbe->orige = 0; + slbe->origv = 0; seg_size = 1ull << kvmppc_slb_sid_shift(slbe); kvmppc_mmu_flush_segment(vcpu, ea & ~(seg_size - 1), seg_size); @@ -408,8 +410,11 @@ static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu) dprintk("KVM MMU: slbia()\n"); - for (i = 1; i < vcpu->arch.slb_nr; i++) + for (i = 1; i < vcpu->arch.slb_nr; i++) { vcpu->arch.slb[i].valid = false; + vcpu->arch.slb[i].orige = 0; + vcpu->arch.slb[i].origv = 0; + } if (vcpu->arch.shared->msr & MSR_IR) { kvmppc_mmu_flush_segments(vcpu); -- cgit v0.10.2 From a3ff5fbc94a829680d4aa005cd17add1c1a1fb5b Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Thu, 27 Jun 2013 01:07:15 +0200 Subject: KVM: PPC: Ignore PIR writes While technically it's legal to write to PIR and have the identifier changed, we don't implement logic to do so because we simply expose vcpu_id to the guest. So instead, let's ignore writes to PIR. This ensures that we don't inject faults into the guest for something the guest is allowed to do. While at it, we cross our fingers hoping that it also doesn't mind that we broke its PIR read values. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index 631a265..2c52ada 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -169,6 +169,9 @@ static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) vcpu->arch.shared->sprg3 = spr_val; break; + /* PIR can legally be written, but we ignore it */ + case SPRN_PIR: break; + default: emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, spr_val); -- cgit v0.10.2
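As a footnote to "KVM: PPC: Book3S PR: Fix proto-VSID calculations" above, the effect of the missing u64 cast can be reproduced with a few lines of user-space C (illustrative only; 524283 is the maximum context id quoted in that commit message):

    #include <stdio.h>

    #define ESID_BITS 18

    int main(void)
    {
            unsigned int context_id = 524283;   /* max context id per the commit message */
            unsigned long long wrong, right;

            /* Shift done in 32-bit arithmetic and widened afterwards: high bits are lost. */
            wrong = (unsigned long long)(context_id << ESID_BITS);
            /* Cast before shifting, as the fix does: the full 64-bit value survives. */
            right = (unsigned long long)context_id << ESID_BITS;

            printf("32-bit shift: %#llx, 64-bit shift: %#llx\n", wrong, right);
            return 0;
    }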