From 81aa8efe0190cf5bf7eaafb57341cd7d0aea96cd Mon Sep 17 00:00:00 2001 From: Jens Freimann Date: Mon, 7 Oct 2013 16:13:44 +0200 Subject: KVM: s390: add and extend interrupt information data structs With the currently available struct kvm_s390_interrupt it is not possible to inject every kind of interrupt as defined in the z/Architecture. Add additional interruption parameters to the structures and move it to kvm.h Signed-off-by: Jens Freimann Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index eef3dd3..3ffc964 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -168,18 +169,6 @@ struct kvm_vcpu_stat { u32 diagnose_9c; }; -struct kvm_s390_io_info { - __u16 subchannel_id; /* 0x0b8 */ - __u16 subchannel_nr; /* 0x0ba */ - __u32 io_int_parm; /* 0x0bc */ - __u32 io_int_word; /* 0x0c0 */ -}; - -struct kvm_s390_ext_info { - __u32 ext_params; - __u64 ext_params2; -}; - #define PGM_OPERATION 0x01 #define PGM_PRIVILEGED_OP 0x02 #define PGM_EXECUTE 0x03 @@ -188,27 +177,6 @@ struct kvm_s390_ext_info { #define PGM_SPECIFICATION 0x06 #define PGM_DATA 0x07 -struct kvm_s390_pgm_info { - __u16 code; -}; - -struct kvm_s390_prefix_info { - __u32 address; -}; - -struct kvm_s390_extcall_info { - __u16 code; -}; - -struct kvm_s390_emerg_info { - __u16 code; -}; - -struct kvm_s390_mchk_info { - __u64 cr14; - __u64 mcic; -}; - struct kvm_s390_interrupt_info { struct list_head list; u64 type; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 932d7f2..86faf47 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -434,6 +434,69 @@ struct kvm_s390_interrupt { __u64 parm64; }; +struct kvm_s390_io_info { + __u16 subchannel_id; + __u16 subchannel_nr; + __u32 io_int_parm; + __u32 io_int_word; +}; + +struct kvm_s390_ext_info { + __u32 ext_params; + __u32 
pad; + __u64 ext_params2; +}; + +struct kvm_s390_pgm_info { + __u64 trans_exc_code; + __u64 mon_code; + __u64 per_address; + __u32 data_exc_code; + __u16 code; + __u16 mon_class_nr; + __u8 per_code; + __u8 per_atmid; + __u8 exc_access_id; + __u8 per_access_id; + __u8 op_access_id; + __u8 pad[3]; +}; + +struct kvm_s390_prefix_info { + __u32 address; +}; + +struct kvm_s390_extcall_info { + __u16 code; +}; + +struct kvm_s390_emerg_info { + __u16 code; +}; + +struct kvm_s390_mchk_info { + __u64 cr14; + __u64 mcic; + __u64 failing_storage_address; + __u32 ext_damage_code; + __u32 pad; + __u8 fixed_logout[16]; +}; + +struct kvm_s390_irq { + __u64 type; + union { + struct kvm_s390_io_info io; + struct kvm_s390_ext_info ext; + struct kvm_s390_pgm_info pgm; + struct kvm_s390_emerg_info emerg; + struct kvm_s390_extcall_info extcall; + struct kvm_s390_prefix_info prefix; + struct kvm_s390_mchk_info mchk; + char reserved[64]; + } u; +}; + /* for KVM_SET_GUEST_DEBUG */ #define KVM_GUESTDBG_ENABLE 0x00000001 -- cgit v0.10.2 From c05c4186bbe4e99d64e8a36f7ca7f480da5d109f Mon Sep 17 00:00:00 2001 From: Jens Freimann Date: Mon, 7 Oct 2013 16:13:45 +0200 Subject: KVM: s390: add floating irq controller This patch adds a floating irq controller as a kvm_device. It will be necessary for migration of floating interrupts as well as for hardening the reset code by allowing user space to explicitly remove all pending floating interrupts. Signed-off-by: Jens Freimann Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt new file mode 100644 index 0000000..6b55795 --- /dev/null +++ b/Documentation/virtual/kvm/devices/s390_flic.txt @@ -0,0 +1,36 @@ +FLIC (floating interrupt controller) +==================================== + +FLIC handles floating (non per-cpu) interrupts, i.e. I/O, service and some +machine check interruptions. 
All interrupts are stored in a per-vm list of +pending interrupts. FLIC performs operations on this list. + +Only one FLIC instance may be instantiated. + +FLIC provides support to +- add interrupts (KVM_DEV_FLIC_ENQUEUE) +- inspect currently pending interrupts (KVM_FLIC_GET_ALL_IRQS) +- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS) + +Groups: + KVM_DEV_FLIC_ENQUEUE + Passes a buffer and length into the kernel which are then injected into + the list of pending interrupts. + attr->addr contains the pointer to the buffer and attr->attr contains + the length of the buffer. + The format of the data structure kvm_s390_irq as it is copied from userspace + is defined in usr/include/linux/kvm.h. + + KVM_DEV_FLIC_GET_ALL_IRQS + Copies all floating interrupts into a buffer provided by userspace. + When the buffer is too small it returns -ENOMEM, which is the indication + for userspace to try again with a bigger buffer. + All interrupts remain pending, i.e. are not deleted from the list of + currently pending interrupts. + attr->addr contains the userspace address of the buffer into which all + interrupt data will be copied. + attr->attr contains the size of the buffer in bytes. + + KVM_DEV_FLIC_CLEAR_IRQS + Simply deletes all elements from the list of currently pending floating + interrupts. No interrupts are injected into the guest. 
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 3ffc964..59635b5 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -243,6 +243,7 @@ struct kvm_arch{ struct sca_block *sca; debug_info_t *dbf; struct kvm_s390_float_interrupt float_int; + struct kvm_device *flic; struct gmap *gmap; int css_support; }; diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index d25da59..38d5f98 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -16,6 +16,20 @@ #define __KVM_S390 +/* Device control API: s390-specific devices */ +#define KVM_DEV_FLIC_GET_ALL_IRQS 1 +#define KVM_DEV_FLIC_ENQUEUE 2 +#define KVM_DEV_FLIC_CLEAR_IRQS 3 +/* + * We can have up to 4*64k pending subchannels + 8 adapter interrupts, + * as well as up to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts. + * There are also sclp and machine checks. This gives us + * sizeof(kvm_s390_irq)*(4*65536+8+64*64+1+1) = 72 * 266250 = 19170000 + * Lets round up to 8192 pages. 
+ */ + +#define KVM_S390_FLIC_MAX_BUFFER 0x2000000 + /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { /* general purpose regs for s390 */ diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 5f79d2d..a5f18ba 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -659,53 +659,86 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, return inti; } -int kvm_s390_inject_vm(struct kvm *kvm, - struct kvm_s390_interrupt *s390int) +static void __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) { struct kvm_s390_local_interrupt *li; struct kvm_s390_float_interrupt *fi; - struct kvm_s390_interrupt_info *inti, *iter; + struct kvm_s390_interrupt_info *iter; int sigcpu; + mutex_lock(&kvm->lock); + fi = &kvm->arch.float_int; + spin_lock(&fi->lock); + if (!is_ioint(inti->type)) { + list_add_tail(&inti->list, &fi->list); + } else { + u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word); + + /* Keep I/O interrupts sorted in isc order. 
*/ + list_for_each_entry(iter, &fi->list, list) { + if (!is_ioint(iter->type)) + continue; + if (int_word_to_isc_bits(iter->io.io_int_word) + <= isc_bits) + continue; + break; + } + list_add_tail(&inti->list, &iter->list); + } + atomic_set(&fi->active, 1); + sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS); + if (sigcpu == KVM_MAX_VCPUS) { + do { + sigcpu = fi->next_rr_cpu++; + if (sigcpu == KVM_MAX_VCPUS) + sigcpu = fi->next_rr_cpu = 0; + } while (fi->local_int[sigcpu] == NULL); + } + li = fi->local_int[sigcpu]; + spin_lock_bh(&li->lock); + atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); + if (waitqueue_active(li->wq)) + wake_up_interruptible(li->wq); + spin_unlock_bh(&li->lock); + spin_unlock(&fi->lock); + mutex_unlock(&kvm->lock); +} + +int kvm_s390_inject_vm(struct kvm *kvm, + struct kvm_s390_interrupt *s390int) +{ + struct kvm_s390_interrupt_info *inti; + inti = kzalloc(sizeof(*inti), GFP_KERNEL); if (!inti) return -ENOMEM; - switch (s390int->type) { + inti->type = s390int->type; + switch (inti->type) { case KVM_S390_INT_VIRTIO: VM_EVENT(kvm, 5, "inject: virtio parm:%x,parm64:%llx", s390int->parm, s390int->parm64); - inti->type = s390int->type; inti->ext.ext_params = s390int->parm; inti->ext.ext_params2 = s390int->parm64; break; case KVM_S390_INT_SERVICE: VM_EVENT(kvm, 5, "inject: sclp parm:%x", s390int->parm); - inti->type = s390int->type; inti->ext.ext_params = s390int->parm; break; - case KVM_S390_PROGRAM_INT: - case KVM_S390_SIGP_STOP: - case KVM_S390_INT_EXTERNAL_CALL: - case KVM_S390_INT_EMERGENCY: - kfree(inti); - return -EINVAL; case KVM_S390_MCHK: VM_EVENT(kvm, 5, "inject: machine check parm64:%llx", s390int->parm64); - inti->type = s390int->type; inti->mchk.cr14 = s390int->parm; /* upper bits are not used */ inti->mchk.mcic = s390int->parm64; break; case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: - if (s390int->type & IOINT_AI_MASK) + if (inti->type & IOINT_AI_MASK) VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)"); else VM_EVENT(kvm, 5, "inject: I/O 
css %x ss %x schid %04x", s390int->type & IOINT_CSSID_MASK, s390int->type & IOINT_SSID_MASK, s390int->type & IOINT_SCHID_MASK); - inti->type = s390int->type; inti->io.subchannel_id = s390int->parm >> 16; inti->io.subchannel_nr = s390int->parm & 0x0000ffffu; inti->io.io_int_parm = s390int->parm64 >> 32; @@ -718,42 +751,7 @@ int kvm_s390_inject_vm(struct kvm *kvm, trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64, 2); - mutex_lock(&kvm->lock); - fi = &kvm->arch.float_int; - spin_lock(&fi->lock); - if (!is_ioint(inti->type)) - list_add_tail(&inti->list, &fi->list); - else { - u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word); - - /* Keep I/O interrupts sorted in isc order. */ - list_for_each_entry(iter, &fi->list, list) { - if (!is_ioint(iter->type)) - continue; - if (int_word_to_isc_bits(iter->io.io_int_word) - <= isc_bits) - continue; - break; - } - list_add_tail(&inti->list, &iter->list); - } - atomic_set(&fi->active, 1); - sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS); - if (sigcpu == KVM_MAX_VCPUS) { - do { - sigcpu = fi->next_rr_cpu++; - if (sigcpu == KVM_MAX_VCPUS) - sigcpu = fi->next_rr_cpu = 0; - } while (fi->local_int[sigcpu] == NULL); - } - li = fi->local_int[sigcpu]; - spin_lock_bh(&li->lock); - atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); - if (waitqueue_active(li->wq)) - wake_up_interruptible(li->wq); - spin_unlock_bh(&li->lock); - spin_unlock(&fi->lock); - mutex_unlock(&kvm->lock); + __inject_vm(kvm, inti); return 0; } @@ -841,3 +839,207 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, mutex_unlock(&vcpu->kvm->lock); return 0; } + +static void clear_floating_interrupts(struct kvm *kvm) +{ + struct kvm_s390_float_interrupt *fi; + struct kvm_s390_interrupt_info *n, *inti = NULL; + + mutex_lock(&kvm->lock); + fi = &kvm->arch.float_int; + spin_lock(&fi->lock); + list_for_each_entry_safe(inti, n, &fi->list, list) { + list_del(&inti->list); + kfree(inti); + } + atomic_set(&fi->active, 0); + spin_unlock(&fi->lock); + 
mutex_unlock(&kvm->lock); +} + +static inline int copy_irq_to_user(struct kvm_s390_interrupt_info *inti, + u8 *addr) +{ + struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr; + struct kvm_s390_irq irq = {0}; + + irq.type = inti->type; + switch (inti->type) { + case KVM_S390_INT_VIRTIO: + case KVM_S390_INT_SERVICE: + irq.u.ext = inti->ext; + break; + case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: + irq.u.io = inti->io; + break; + case KVM_S390_MCHK: + irq.u.mchk = inti->mchk; + break; + default: + return -EINVAL; + } + + if (copy_to_user(uptr, &irq, sizeof(irq))) + return -EFAULT; + + return 0; +} + +static int get_all_floating_irqs(struct kvm *kvm, __u8 *buf, __u64 len) +{ + struct kvm_s390_interrupt_info *inti; + struct kvm_s390_float_interrupt *fi; + int ret = 0; + int n = 0; + + mutex_lock(&kvm->lock); + fi = &kvm->arch.float_int; + spin_lock(&fi->lock); + + list_for_each_entry(inti, &fi->list, list) { + if (len < sizeof(struct kvm_s390_irq)) { + /* signal userspace to try again */ + ret = -ENOMEM; + break; + } + ret = copy_irq_to_user(inti, buf); + if (ret) + break; + buf += sizeof(struct kvm_s390_irq); + len -= sizeof(struct kvm_s390_irq); + n++; + } + + spin_unlock(&fi->lock); + mutex_unlock(&kvm->lock); + + return ret < 0 ? 
ret : n; +} + +static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + int r; + + switch (attr->group) { + case KVM_DEV_FLIC_GET_ALL_IRQS: + r = get_all_floating_irqs(dev->kvm, (u8 *) attr->addr, + attr->attr); + break; + default: + r = -EINVAL; + } + + return r; +} + +static inline int copy_irq_from_user(struct kvm_s390_interrupt_info *inti, + u64 addr) +{ + struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr; + void *target = NULL; + void __user *source; + u64 size; + + if (get_user(inti->type, (u64 __user *)addr)) + return -EFAULT; + + switch (inti->type) { + case KVM_S390_INT_VIRTIO: + case KVM_S390_INT_SERVICE: + target = (void *) &inti->ext; + source = &uptr->u.ext; + size = sizeof(inti->ext); + break; + case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: + target = (void *) &inti->io; + source = &uptr->u.io; + size = sizeof(inti->io); + break; + case KVM_S390_MCHK: + target = (void *) &inti->mchk; + source = &uptr->u.mchk; + size = sizeof(inti->mchk); + break; + default: + return -EINVAL; + } + + if (copy_from_user(target, source, size)) + return -EFAULT; + + return 0; +} + +static int enqueue_floating_irq(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + struct kvm_s390_interrupt_info *inti = NULL; + int r = 0; + int len = attr->attr; + + if (len % sizeof(struct kvm_s390_irq) != 0) + return -EINVAL; + else if (len > KVM_S390_FLIC_MAX_BUFFER) + return -EINVAL; + + while (len >= sizeof(struct kvm_s390_irq)) { + inti = kzalloc(sizeof(*inti), GFP_KERNEL); + if (!inti) + return -ENOMEM; + + r = copy_irq_from_user(inti, attr->addr); + if (r) { + kfree(inti); + return r; + } + __inject_vm(dev->kvm, inti); + len -= sizeof(struct kvm_s390_irq); + attr->addr += sizeof(struct kvm_s390_irq); + } + + return r; +} + +static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + int r = 0; + + switch (attr->group) { + case KVM_DEV_FLIC_ENQUEUE: + r = enqueue_floating_irq(dev, attr); + break; + 
case KVM_DEV_FLIC_CLEAR_IRQS: + r = 0; + clear_floating_interrupts(dev->kvm); + break; + default: + r = -EINVAL; + } + + return r; +} + +static int flic_create(struct kvm_device *dev, u32 type) +{ + if (!dev) + return -EINVAL; + if (dev->kvm->arch.flic) + return -EINVAL; + dev->kvm->arch.flic = dev; + return 0; +} + +static void flic_destroy(struct kvm_device *dev) +{ + dev->kvm->arch.flic = NULL; + kfree(dev); +} + +/* s390 floating irq controller (flic) */ +struct kvm_device_ops kvm_flic_ops = { + .name = "kvm-flic", + .get_attr = flic_get_attr, + .set_attr = flic_set_attr, + .create = flic_create, + .destroy = flic_destroy, +}; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index e0676f3..782420f 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -157,6 +157,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_ENABLE_CAP: case KVM_CAP_S390_CSS_SUPPORT: case KVM_CAP_IOEVENTFD: + case KVM_CAP_DEVICE_CTRL: r = 1; break; case KVM_CAP_NR_VCPUS: diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b8e9a43..c0102ef 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1064,6 +1064,7 @@ extern struct kvm_device_ops kvm_mpic_ops; extern struct kvm_device_ops kvm_xics_ops; extern struct kvm_device_ops kvm_vfio_ops; extern struct kvm_device_ops kvm_arm_vgic_v2_ops; +extern struct kvm_device_ops kvm_flic_ops; #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 86faf47..19f717b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -918,6 +918,7 @@ struct kvm_device_attr { #define KVM_DEV_VFIO_GROUP_ADD 1 #define KVM_DEV_VFIO_GROUP_DEL 2 #define KVM_DEV_TYPE_ARM_VGIC_V2 5 +#define KVM_DEV_TYPE_FLIC 6 /* * ioctls for VM fds diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 03a0381..a9e999a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2284,6 +2284,11 @@ static int 
kvm_ioctl_create_device(struct kvm *kvm, ops = &kvm_arm_vgic_v2_ops; break; #endif +#ifdef CONFIG_S390 + case KVM_DEV_TYPE_FLIC: + ops = &kvm_flic_ops; + break; +#endif default: return -ENODEV; } -- cgit v0.10.2 From a91b8ebe8671980151e0a19ee9fec6b0e1ae1d58 Mon Sep 17 00:00:00 2001 From: Jens Freimann Date: Thu, 30 Jan 2014 08:40:23 +0100 Subject: KVM: s390: limit floating irqs Userspace can flood the kernel with interrupts as of now, so let's limit the number of pending floating interrupts injected via either the floating interrupt controller or the KVM_S390_INTERRUPT ioctl. We can have up to 4*64k pending subchannels + 8 adapter interrupts, as well as up to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts. There are also sclp and machine checks. This gives us (4*65536+8+64*64+1+1) = 266250 interrupts. Suggested-by: Christian Borntraeger Signed-off-by: Jens Freimann Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 59635b5..c3c5e10 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -214,6 +214,7 @@ struct kvm_s390_float_interrupt { unsigned long idle_mask[(KVM_MAX_VCPUS + sizeof(long) - 1) / sizeof(long)]; struct kvm_s390_local_interrupt *local_int[KVM_MAX_VCPUS]; + unsigned int irq_count; }; diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 38d5f98..058b178 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -27,7 +27,7 @@ * sizeof(kvm_s390_irq)*(4*65536+8+64*64+1+1) = 72 * 266250 = 19170000 * Lets round up to 8192 pages. 
*/ - +#define KVM_S390_MAX_FLOAT_IRQS 266250 #define KVM_S390_FLIC_MAX_BUFFER 0x2000000 /* for KVM_GET_REGS and KVM_SET_REGS */ diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index a5f18ba..9c9192b 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -528,6 +528,7 @@ void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) list_for_each_entry_safe(inti, n, &fi->list, list) { if (__interrupt_is_deliverable(vcpu, inti)) { list_del(&inti->list); + fi->irq_count--; deliver = 1; break; } @@ -583,6 +584,7 @@ void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu) if ((inti->type == KVM_S390_MCHK) && __interrupt_is_deliverable(vcpu, inti)) { list_del(&inti->list); + fi->irq_count--; deliver = 1; break; } @@ -650,8 +652,10 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, inti = iter; break; } - if (inti) + if (inti) { list_del_init(&inti->list); + fi->irq_count--; + } if (list_empty(&fi->list)) atomic_set(&fi->active, 0); spin_unlock(&fi->lock); @@ -659,16 +663,22 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, return inti; } -static void __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) +static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) { struct kvm_s390_local_interrupt *li; struct kvm_s390_float_interrupt *fi; struct kvm_s390_interrupt_info *iter; int sigcpu; + int rc = 0; mutex_lock(&kvm->lock); fi = &kvm->arch.float_int; spin_lock(&fi->lock); + if (fi->irq_count >= KVM_S390_MAX_FLOAT_IRQS) { + rc = -EINVAL; + goto unlock_fi; + } + fi->irq_count++; if (!is_ioint(inti->type)) { list_add_tail(&inti->list, &fi->list); } else { @@ -700,8 +710,10 @@ static void __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) if (waitqueue_active(li->wq)) wake_up_interruptible(li->wq); spin_unlock_bh(&li->lock); +unlock_fi: spin_unlock(&fi->lock); mutex_unlock(&kvm->lock); + return rc; } int kvm_s390_inject_vm(struct kvm 
*kvm, @@ -751,8 +763,7 @@ int kvm_s390_inject_vm(struct kvm *kvm, trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64, 2); - __inject_vm(kvm, inti); - return 0; + return __inject_vm(kvm, inti); } int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, @@ -852,6 +863,7 @@ static void clear_floating_interrupts(struct kvm *kvm) list_del(&inti->list); kfree(inti); } + fi->irq_count = 0; atomic_set(&fi->active, 0); spin_unlock(&fi->lock); mutex_unlock(&kvm->lock); @@ -992,7 +1004,11 @@ static int enqueue_floating_irq(struct kvm_device *dev, kfree(inti); return r; } - __inject_vm(dev->kvm, inti); + r = __inject_vm(dev->kvm, inti); + if (r) { + kfree(inti); + return r; + } len -= sizeof(struct kvm_s390_irq); attr->addr += sizeof(struct kvm_s390_irq); } -- cgit v0.10.2 From 24eb3a824c4f3ccfaa2305dc1d9d9e2a708828c5 Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Mon, 17 Jun 2013 16:25:18 +0200 Subject: KVM: s390: Add FAULT_FLAG_RETRY_NOWAIT for guest fault In the case of a fault, we will retry to exit sie64 but with gmap fault indication for this thread set. This makes it possible to handle async page faults. Based on a patch from Martin Schwidefsky. 
Signed-off-by: Dominik Dingel Signed-off-by: Christian Borntraeger diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 2204400..66101f6 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -767,6 +767,7 @@ static inline void pgste_set_pte(pte_t *ptep, pte_t entry) * @table: pointer to the page directory * @asce: address space control element for gmap page table * @crst_list: list of all crst tables used in the guest address space + * @pfault_enabled: defines if pfaults are applicable for the guest */ struct gmap { struct list_head list; @@ -775,6 +776,7 @@ struct gmap { unsigned long asce; void *private; struct list_head crst_list; + bool pfault_enabled; }; /** diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 0a876bc..dc5fc4f 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -79,6 +79,7 @@ struct thread_struct { unsigned long ksp; /* kernel stack pointer */ mm_segment_t mm_segment; unsigned long gmap_addr; /* address of last gmap fault. 
*/ + unsigned int gmap_pfault; /* signal of a pending guest pfault */ struct per_regs per_user; /* User specified PER registers */ struct per_event per_event; /* Cause of the last PER trap */ unsigned long per_flags; /* Flags to control debug behavior */ diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 782420f..9eec794 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -255,6 +255,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (!kvm->arch.gmap) goto out_nogmap; kvm->arch.gmap->private = kvm; + kvm->arch.gmap->pfault_enabled = 0; } kvm->arch.css_support = 0; @@ -701,6 +702,17 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) return 0; } +static long kvm_arch_fault_in_sync(struct kvm_vcpu *vcpu) +{ + long rc; + hva_t fault = gmap_fault(current->thread.gmap_addr, vcpu->arch.gmap); + struct mm_struct *mm = current->mm; + down_read(&mm->mmap_sem); + rc = get_user_pages(current, mm, fault, 1, 1, 0, NULL, NULL); + up_read(&mm->mmap_sem); + return rc; +} + static int vcpu_pre_run(struct kvm_vcpu *vcpu) { int rc, cpuflags; @@ -730,7 +742,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) { - int rc; + int rc = -1; VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", vcpu->arch.sie_block->icptcode); @@ -744,7 +756,14 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) current->thread.gmap_addr; vcpu->run->s390_ucontrol.pgm_code = 0x10; rc = -EREMOTE; - } else { + + } else if (current->thread.gmap_pfault) { + current->thread.gmap_pfault = 0; + if (kvm_arch_fault_in_sync(vcpu) >= 0) + rc = 0; + } + + if (rc == -1) { VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction"); trace_kvm_s390_sie_fault(vcpu); rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index d95265b2..88cef50 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -50,6 +50,7 @@ #define 
VM_FAULT_BADMAP 0x020000 #define VM_FAULT_BADACCESS 0x040000 #define VM_FAULT_SIGNAL 0x080000 +#define VM_FAULT_PFAULT 0x100000 static unsigned long store_indication __read_mostly; @@ -227,6 +228,7 @@ static noinline void do_fault_error(struct pt_regs *regs, int fault) return; } case VM_FAULT_BADCONTEXT: + case VM_FAULT_PFAULT: do_no_context(regs); break; case VM_FAULT_SIGNAL: @@ -264,6 +266,9 @@ static noinline void do_fault_error(struct pt_regs *regs, int fault) */ static inline int do_exception(struct pt_regs *regs, int access) { +#ifdef CONFIG_PGSTE + struct gmap *gmap; +#endif struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct *vma; @@ -304,9 +309,10 @@ static inline int do_exception(struct pt_regs *regs, int access) down_read(&mm->mmap_sem); #ifdef CONFIG_PGSTE - if ((current->flags & PF_VCPU) && S390_lowcore.gmap) { - address = __gmap_fault(address, - (struct gmap *) S390_lowcore.gmap); + gmap = (struct gmap *) + ((current->flags & PF_VCPU) ? S390_lowcore.gmap : 0); + if (gmap) { + address = __gmap_fault(address, gmap); if (address == -EFAULT) { fault = VM_FAULT_BADMAP; goto out_up; @@ -315,6 +321,8 @@ static inline int do_exception(struct pt_regs *regs, int access) fault = VM_FAULT_OOM; goto out_up; } + if (gmap->pfault_enabled) + flags |= FAULT_FLAG_RETRY_NOWAIT; } #endif @@ -371,9 +379,19 @@ retry: regs, address); } if (fault & VM_FAULT_RETRY) { +#ifdef CONFIG_PGSTE + if (gmap && (flags & FAULT_FLAG_RETRY_NOWAIT)) { + /* FAULT_FLAG_RETRY_NOWAIT has been set, + * mmap_sem has not been released */ + current->thread.gmap_pfault = 1; + fault = VM_FAULT_PFAULT; + goto out_up; + } +#endif /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. 
*/ - flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags &= ~(FAULT_FLAG_ALLOW_RETRY | + FAULT_FLAG_RETRY_NOWAIT); flags |= FAULT_FLAG_TRIED; down_read(&mm->mmap_sem); goto retry; -- cgit v0.10.2 From e0ead41a6dac09f86675ce07a66e4b253a9b7bd5 Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Thu, 6 Jun 2013 15:32:37 +0200 Subject: KVM: async_pf: Provide additional direct page notification By setting a Kconfig option, the architecture can control when guest notifications will be presented by the apf backend. There is the default batch mechanism, working as before, where the vcpu thread should pull in this information. In contrast, there is now the direct mechanism, which pushes the information to the guest. This way s390 can use an already existing architecture interface. The vcpu thread should still call check_completion to clean up leftovers. Signed-off-by: Dominik Dingel Signed-off-by: Christian Borntraeger diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e50425d..aaa60f3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3328,7 +3328,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) arch.direct_map = vcpu->arch.mmu.direct_map; arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); - return kvm_setup_async_pf(vcpu, gva, gfn, &arch); + return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch); } static bool can_do_async_pf(struct kvm_vcpu *vcpu) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c0102ef..f5937b8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -192,7 +192,7 @@ struct kvm_async_pf { void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); -int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, +int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, struct kvm_arch_async_pf *arch); int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif diff --git
a/virt/kvm/Kconfig b/virt/kvm/Kconfig index fbe1a48..13f2d19 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -22,6 +22,10 @@ config KVM_MMIO config KVM_ASYNC_PF bool +# Toggle to switch between direct notification and batch job +config KVM_ASYNC_PF_SYNC + bool + config HAVE_KVM_MSI bool diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 8631d9c..00980ab 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -28,6 +28,21 @@ #include "async_pf.h" #include +static inline void kvm_async_page_present_sync(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) +{ +#ifdef CONFIG_KVM_ASYNC_PF_SYNC + kvm_arch_async_page_present(vcpu, work); +#endif +} +static inline void kvm_async_page_present_async(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) +{ +#ifndef CONFIG_KVM_ASYNC_PF_SYNC + kvm_arch_async_page_present(vcpu, work); +#endif +} + static struct kmem_cache *async_pf_cache; int kvm_async_pf_init(void) @@ -69,6 +84,7 @@ static void async_pf_execute(struct work_struct *work) down_read(&mm->mmap_sem); get_user_pages(current, mm, addr, 1, 1, 0, NULL, NULL); up_read(&mm->mmap_sem); + kvm_async_page_present_sync(vcpu, apf); unuse_mm(mm); spin_lock(&vcpu->async_pf.lock); @@ -138,7 +154,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) } } -int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, +int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, struct kvm_arch_async_pf *arch) { struct kvm_async_pf *work; @@ -159,7 +175,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, work->wakeup_all = false; work->vcpu = vcpu; work->gva = gva; - work->addr = gfn_to_hva(vcpu->kvm, gfn); + work->addr = hva; work->arch = *arch; work->mm = current->mm; atomic_inc(&work->mm->mm_count); -- cgit v0.10.2 From 9f2ceda49c6b8827c795731c204f6c2587886e2c Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Tue, 3 Sep 2013 12:31:16 +0200 Subject: KVM: async_pf: Allow to wait for outstanding work On s390 we 
are not able to cancel work. Instead we will flush the work and wait for completion. Signed-off-by: Dominik Dingel Signed-off-by: Christian Borntraeger diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 00980ab..889aad0 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -113,11 +113,16 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) list_entry(vcpu->async_pf.queue.next, typeof(*work), queue); list_del(&work->queue); + +#ifdef CONFIG_KVM_ASYNC_PF_SYNC + flush_work(&work->work); +#else if (cancel_work_sync(&work->work)) { mmdrop(work->mm); kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */ kmem_cache_free(async_pf_cache, work); } +#endif } spin_lock(&vcpu->async_pf.lock); -- cgit v0.10.2 From 3c038e6be0e299d4d3762d0a9a29f02de6e04991 Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Mon, 7 Oct 2013 17:11:48 +0200 Subject: KVM: async_pf: Async page fault support on s390 This patch enables async page faults for s390 kvm guests. It provides the userspace API to enable this feature and to disable it with a wait (disable_wait). The disable_wait operation ensures that the feature is off by waiting until already pending async page faults are done. It also includes the diagnose code, called by the guest to enable async page faults. The async page faults will use an already existing guest interface for this purpose, as described in "CP Programming Services (SC24-6084)".
Signed-off-by: Dominik Dingel Signed-off-by: Christian Borntraeger diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt index 6b55795..410fa67 100644 --- a/Documentation/virtual/kvm/devices/s390_flic.txt +++ b/Documentation/virtual/kvm/devices/s390_flic.txt @@ -11,6 +11,7 @@ FLIC provides support to - add interrupts (KVM_DEV_FLIC_ENQUEUE) - inspect currently pending interrupts (KVM_FLIC_GET_ALL_IRQS) - purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS) +- enable/disable for the guest transparent async page faults Groups: KVM_DEV_FLIC_ENQUEUE @@ -34,3 +35,12 @@ Groups: KVM_DEV_FLIC_CLEAR_IRQS Simply deletes all elements from the list of currently pending floating interrupts. No interrupts are injected into the guest. + + KVM_DEV_FLIC_APF_ENABLE + Enables async page faults for the guest. So in case of a major page fault + the host is allowed to handle this async and continues the guest. + + KVM_DEV_FLIC_APF_DISABLE_WAIT + Disables async page faults for the guest and waits until already pending + async page faults are done. This is necessary to trigger a completion interrupt + for every init interrupt before migrating the interrupt list. 
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index c3c5e10..2c69ba2 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -231,6 +231,10 @@ struct kvm_vcpu_arch { u64 stidp_data; }; struct gmap *gmap; +#define KVM_S390_PFAULT_TOKEN_INVALID (-1UL) + unsigned long pfault_token; + unsigned long pfault_select; + unsigned long pfault_compare; }; struct kvm_vm_stat { @@ -257,6 +261,24 @@ static inline bool kvm_is_error_hva(unsigned long addr) return IS_ERR_VALUE(addr); } +#define ASYNC_PF_PER_VCPU 64 +struct kvm_vcpu; +struct kvm_async_pf; +struct kvm_arch_async_pf { + unsigned long pfault_token; +}; + +bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu); + +void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work); + +void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work); + +void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work); + extern int sie64a(struct kvm_s390_sie_block *, u64 *); extern char sie_exit; #endif diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 058b178..ccfd0b1 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -20,6 +20,8 @@ #define KVM_DEV_FLIC_GET_ALL_IRQS 1 #define KVM_DEV_FLIC_ENQUEUE 2 #define KVM_DEV_FLIC_CLEAR_IRQS 3 +#define KVM_DEV_FLIC_APF_ENABLE 4 +#define KVM_DEV_FLIC_APF_DISABLE_WAIT 5 /* * We can have up to 4*64k pending subchannels + 8 adapter interrupts, * as well as up to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts. 
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 70b46ea..c8bacbc 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -23,6 +23,8 @@ config KVM select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_EVENTFD + select KVM_ASYNC_PF + select KVM_ASYNC_PF_SYNC ---help--- Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 40b4c64..a47d2c3 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -7,7 +7,7 @@ # as published by the Free Software Foundation. KVM := ../../../virt/kvm -common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o +common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o ccflags-y := -Ivirt/kvm -Iarch/s390/kvm diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 8216c0e..bf9ed34 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -17,6 +17,7 @@ #include "kvm-s390.h" #include "trace.h" #include "trace-s390.h" +#include "gaccess.h" static int diag_release_pages(struct kvm_vcpu *vcpu) { @@ -46,6 +47,87 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) return 0; } +static int __diag_page_ref_service(struct kvm_vcpu *vcpu) +{ + struct prs_parm { + u16 code; + u16 subcode; + u16 parm_len; + u16 parm_version; + u64 token_addr; + u64 select_mask; + u64 compare_mask; + u64 zarch; + }; + struct prs_parm parm; + int rc; + u16 rx = (vcpu->arch.sie_block->ipa & 0xf0) >> 4; + u16 ry = (vcpu->arch.sie_block->ipa & 0x0f); + unsigned long hva_token = KVM_HVA_ERR_BAD; + + if (vcpu->run->s.regs.gprs[rx] & 7) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + if (copy_from_guest(vcpu, &parm, vcpu->run->s.regs.gprs[rx], sizeof(parm))) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + if (parm.parm_version != 2 || parm.parm_len < 5 || parm.code != 0x258) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + 
switch (parm.subcode) { + case 0: /* TOKEN */ + if (vcpu->arch.pfault_token != KVM_S390_PFAULT_TOKEN_INVALID) { + /* + * If the pagefault handshake is already activated, + * the token must not be changed. We have to return + * decimal 8 instead, as mandated in SC24-6084. + */ + vcpu->run->s.regs.gprs[ry] = 8; + return 0; + } + + if ((parm.compare_mask & parm.select_mask) != parm.compare_mask || + parm.token_addr & 7 || parm.zarch != 0x8000000000000000ULL) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + hva_token = gfn_to_hva(vcpu->kvm, gpa_to_gfn(parm.token_addr)); + if (kvm_is_error_hva(hva_token)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + vcpu->arch.pfault_token = parm.token_addr; + vcpu->arch.pfault_select = parm.select_mask; + vcpu->arch.pfault_compare = parm.compare_mask; + vcpu->run->s.regs.gprs[ry] = 0; + rc = 0; + break; + case 1: /* + * CANCEL + * Specification allows to let already pending tokens survive + * the cancel, therefore to reduce code complexity, we assume + * all outstanding tokens are already pending. + */ + if (parm.token_addr || parm.select_mask || + parm.compare_mask || parm.zarch) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + vcpu->run->s.regs.gprs[ry] = 0; + /* + * If the pfault handling was not established or is already + * canceled SC24-6084 requests to return decimal 4. 
+ */ + if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) + vcpu->run->s.regs.gprs[ry] = 4; + else + vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; + + rc = 0; + break; + default: + rc = -EOPNOTSUPP; + break; + } + + return rc; +} + static int __diag_time_slice_end(struct kvm_vcpu *vcpu) { VCPU_EVENT(vcpu, 5, "%s", "diag time slice end"); @@ -150,6 +232,8 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu) return __diag_time_slice_end(vcpu); case 0x9c: return __diag_time_slice_end_directed(vcpu); + case 0x258: + return __diag_page_ref_service(vcpu); case 0x308: return __diag_ipl_functions(vcpu); case 0x500: diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 9c9192b..1848080 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -31,7 +31,7 @@ static int is_ioint(u64 type) return ((type & 0xfffe0000u) != 0xfffe0000u); } -static int psw_extint_disabled(struct kvm_vcpu *vcpu) +int psw_extint_disabled(struct kvm_vcpu *vcpu) { return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT); } @@ -78,11 +78,8 @@ static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu, return 1; return 0; case KVM_S390_INT_SERVICE: - if (psw_extint_disabled(vcpu)) - return 0; - if (vcpu->arch.sie_block->gcr[0] & 0x200ul) - return 1; - return 0; + case KVM_S390_INT_PFAULT_INIT: + case KVM_S390_INT_PFAULT_DONE: case KVM_S390_INT_VIRTIO: if (psw_extint_disabled(vcpu)) return 0; @@ -150,6 +147,8 @@ static void __set_intercept_indicator(struct kvm_vcpu *vcpu, case KVM_S390_INT_EXTERNAL_CALL: case KVM_S390_INT_EMERGENCY: case KVM_S390_INT_SERVICE: + case KVM_S390_INT_PFAULT_INIT: + case KVM_S390_INT_PFAULT_DONE: case KVM_S390_INT_VIRTIO: if (psw_extint_disabled(vcpu)) __set_cpuflag(vcpu, CPUSTAT_EXT_INT); @@ -223,6 +222,30 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, rc |= put_guest(vcpu, inti->ext.ext_params, (u32 __user *)__LC_EXT_PARAMS); break; + case KVM_S390_INT_PFAULT_INIT: + 
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 0, + inti->ext.ext_params2); + rc = put_guest(vcpu, 0x2603, (u16 __user *) __LC_EXT_INT_CODE); + rc |= put_guest(vcpu, 0x0600, (u16 __user *) __LC_EXT_CPU_ADDR); + rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, + __LC_EXT_NEW_PSW, sizeof(psw_t)); + rc |= put_guest(vcpu, inti->ext.ext_params2, + (u64 __user *) __LC_EXT_PARAMS2); + break; + case KVM_S390_INT_PFAULT_DONE: + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 0, + inti->ext.ext_params2); + rc = put_guest(vcpu, 0x2603, (u16 __user *) __LC_EXT_INT_CODE); + rc |= put_guest(vcpu, 0x0680, (u16 __user *) __LC_EXT_CPU_ADDR); + rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, + __LC_EXT_NEW_PSW, sizeof(psw_t)); + rc |= put_guest(vcpu, inti->ext.ext_params2, + (u64 __user *) __LC_EXT_PARAMS2); + break; case KVM_S390_INT_VIRTIO: VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx", inti->ext.ext_params, inti->ext.ext_params2); @@ -357,7 +380,7 @@ static int __try_deliver_ckc_interrupt(struct kvm_vcpu *vcpu) return 1; } -static int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) +int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int; @@ -737,6 +760,10 @@ int kvm_s390_inject_vm(struct kvm *kvm, VM_EVENT(kvm, 5, "inject: sclp parm:%x", s390int->parm); inti->ext.ext_params = s390int->parm; break; + case KVM_S390_INT_PFAULT_DONE: + inti->type = s390int->type; + inti->ext.ext_params2 = s390int->parm64; + break; case KVM_S390_MCHK: VM_EVENT(kvm, 5, "inject: machine check parm64:%llx", s390int->parm64); @@ -823,6 +850,10 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, inti->type = s390int->type; inti->mchk.mcic = 
s390int->parm64; break; + case KVM_S390_INT_PFAULT_INIT: + inti->type = s390int->type; + inti->ext.ext_params2 = s390int->parm64; + break; case KVM_S390_INT_VIRTIO: case KVM_S390_INT_SERVICE: case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: @@ -877,6 +908,8 @@ static inline int copy_irq_to_user(struct kvm_s390_interrupt_info *inti, irq.type = inti->type; switch (inti->type) { + case KVM_S390_INT_PFAULT_INIT: + case KVM_S390_INT_PFAULT_DONE: case KVM_S390_INT_VIRTIO: case KVM_S390_INT_SERVICE: irq.u.ext = inti->ext; @@ -956,6 +989,8 @@ static inline int copy_irq_from_user(struct kvm_s390_interrupt_info *inti, return -EFAULT; switch (inti->type) { + case KVM_S390_INT_PFAULT_INIT: + case KVM_S390_INT_PFAULT_DONE: case KVM_S390_INT_VIRTIO: case KVM_S390_INT_SERVICE: target = (void *) &inti->ext; @@ -1019,6 +1054,8 @@ static int enqueue_floating_irq(struct kvm_device *dev, static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { int r = 0; + unsigned int i; + struct kvm_vcpu *vcpu; switch (attr->group) { case KVM_DEV_FLIC_ENQUEUE: @@ -1028,6 +1065,20 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) r = 0; clear_floating_interrupts(dev->kvm); break; + case KVM_DEV_FLIC_APF_ENABLE: + dev->kvm->arch.gmap->pfault_enabled = 1; + break; + case KVM_DEV_FLIC_APF_DISABLE_WAIT: + dev->kvm->arch.gmap->pfault_enabled = 0; + /* + * Make sure no async faults are in transition when + * clearing the queues. So we don't need to worry + * about late coming workers. 
+ */ + synchronize_srcu(&dev->kvm->srcu); + kvm_for_each_vcpu(i, vcpu, dev->kvm) + kvm_clear_async_pf_completion_queue(vcpu); + break; default: r = -EINVAL; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 9eec794..d8e9f04 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -152,6 +152,7 @@ int kvm_dev_ioctl_check_extension(long ext) #ifdef CONFIG_KVM_S390_UCONTROL case KVM_CAP_S390_UCONTROL: #endif + case KVM_CAP_ASYNC_PF: case KVM_CAP_SYNC_REGS: case KVM_CAP_ONE_REG: case KVM_CAP_ENABLE_CAP: @@ -273,6 +274,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { VCPU_EVENT(vcpu, 3, "%s", "free cpu"); trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id); + kvm_clear_async_pf_completion_queue(vcpu); if (!kvm_is_ucontrol(vcpu->kvm)) { clear_bit(63 - vcpu->vcpu_id, (unsigned long *) &vcpu->kvm->arch.sca->mcn); @@ -322,6 +324,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) /* Section: vcpu related */ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { + vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; + kvm_clear_async_pf_completion_queue(vcpu); if (kvm_is_ucontrol(vcpu->kvm)) { vcpu->arch.gmap = gmap_alloc(current->mm); if (!vcpu->arch.gmap) @@ -382,6 +386,8 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) vcpu->arch.guest_fpregs.fpc = 0; asm volatile("lfpc %0" : : "Q" (vcpu->arch.guest_fpregs.fpc)); vcpu->arch.sie_block->gbea = 1; + vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; + kvm_clear_async_pf_completion_queue(vcpu); atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); } @@ -713,10 +719,89 @@ static long kvm_arch_fault_in_sync(struct kvm_vcpu *vcpu) return rc; } +static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token, + unsigned long token) +{ + struct kvm_s390_interrupt inti; + inti.parm64 = token; + + if (start_token) { + inti.type = KVM_S390_INT_PFAULT_INIT; + WARN_ON_ONCE(kvm_s390_inject_vcpu(vcpu, &inti)); + } else { + inti.type = KVM_S390_INT_PFAULT_DONE; 
+ WARN_ON_ONCE(kvm_s390_inject_vm(vcpu->kvm, &inti)); + } +} + +void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) +{ + trace_kvm_s390_pfault_init(vcpu, work->arch.pfault_token); + __kvm_inject_pfault_token(vcpu, true, work->arch.pfault_token); +} + +void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) +{ + trace_kvm_s390_pfault_done(vcpu, work->arch.pfault_token); + __kvm_inject_pfault_token(vcpu, false, work->arch.pfault_token); +} + +void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) +{ + /* s390 will always inject the page directly */ +} + +bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) +{ + /* + * s390 will always inject the page directly, + * but we still want check_async_completion to cleanup + */ + return true; +} + +static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) +{ + hva_t hva; + struct kvm_arch_async_pf arch; + int rc; + + if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) + return 0; + if ((vcpu->arch.sie_block->gpsw.mask & vcpu->arch.pfault_select) != + vcpu->arch.pfault_compare) + return 0; + if (psw_extint_disabled(vcpu)) + return 0; + if (kvm_cpu_has_interrupt(vcpu)) + return 0; + if (!(vcpu->arch.sie_block->gcr[0] & 0x200ul)) + return 0; + if (!vcpu->arch.gmap->pfault_enabled) + return 0; + + hva = gmap_fault(current->thread.gmap_addr, vcpu->arch.gmap); + if (copy_from_guest(vcpu, &arch.pfault_token, vcpu->arch.pfault_token, 8)) + return 0; + + rc = kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch); + return rc; +} + static int vcpu_pre_run(struct kvm_vcpu *vcpu) { int rc, cpuflags; + /* + * On s390 notifications for arriving pages will be delivered directly + * to the guest but the house keeping for completed pfaults is + * handled outside the worker. 
+ */ + kvm_check_async_pf_completion(vcpu); + memcpy(&vcpu->arch.sie_block->gg14, &vcpu->run->s.regs.gprs[14], 16); if (need_resched()) @@ -758,8 +843,10 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) rc = -EREMOTE; } else if (current->thread.gmap_pfault) { + trace_kvm_s390_major_guest_pfault(vcpu); current->thread.gmap_pfault = 0; - if (kvm_arch_fault_in_sync(vcpu) >= 0) + if (kvm_arch_setup_async_pf(vcpu) || + (kvm_arch_fault_in_sync(vcpu) >= 0)) rc = 0; } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index f9559b0..ed4750a 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -159,4 +159,8 @@ void exit_sie_sync(struct kvm_vcpu *vcpu); /* implemented in diag.c */ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); +/* implemented in interrupt.c */ +int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); +int psw_extint_disabled(struct kvm_vcpu *vcpu); + #endif diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 87c2b3a..fe9442d 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -224,6 +224,8 @@ unlock: static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter) { int rc; + unsigned int i; + struct kvm_vcpu *v; switch (parameter & 0xff) { case 0: @@ -231,6 +233,11 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter) break; case 1: case 2: + kvm_for_each_vcpu(i, v, vcpu->kvm) { + v->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; + kvm_clear_async_pf_completion_queue(v); + } + rc = SIGP_CC_ORDER_CODE_ACCEPTED; break; default: diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h index 3db76b2..e8e7213 100644 --- a/arch/s390/kvm/trace.h +++ b/arch/s390/kvm/trace.h @@ -30,6 +30,52 @@ TP_printk("%02d[%016lx-%016lx]: " p_str, __entry->id, \ __entry->pswmask, __entry->pswaddr, p_args) +TRACE_EVENT(kvm_s390_major_guest_pfault, + TP_PROTO(VCPU_PROTO_COMMON), + TP_ARGS(VCPU_ARGS_COMMON), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON 
+ ), + VCPU_TP_PRINTK("%s", "major fault, maybe applicable for pfault") + ); + +TRACE_EVENT(kvm_s390_pfault_init, + TP_PROTO(VCPU_PROTO_COMMON, long pfault_token), + TP_ARGS(VCPU_ARGS_COMMON, pfault_token), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(long, pfault_token) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->pfault_token = pfault_token; + ), + VCPU_TP_PRINTK("init pfault token %ld", __entry->pfault_token) + ); + +TRACE_EVENT(kvm_s390_pfault_done, + TP_PROTO(VCPU_PROTO_COMMON, long pfault_token), + TP_ARGS(VCPU_ARGS_COMMON, pfault_token), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(long, pfault_token) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->pfault_token = pfault_token; + ), + VCPU_TP_PRINTK("done pfault token %ld", __entry->pfault_token) + ); + /* * Tracepoints for SIE entry and exit. */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 19f717b..7d76401 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -413,6 +413,8 @@ struct kvm_s390_psw { #define KVM_S390_PROGRAM_INT 0xfffe0001u #define KVM_S390_SIGP_SET_PREFIX 0xfffe0002u #define KVM_S390_RESTART 0xfffe0003u +#define KVM_S390_INT_PFAULT_INIT 0xfffe0004u +#define KVM_S390_INT_PFAULT_DONE 0xfffe0005u #define KVM_S390_MCHK 0xfffe1000u #define KVM_S390_INT_VIRTIO 0xffff2603u #define KVM_S390_INT_SERVICE 0xffff2401u -- cgit v0.10.2 From 536336c21697551ceca44bdffb9f53e6cc5f2f20 Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Mon, 30 Sep 2013 10:55:33 +0200 Subject: KVM: async_pf: Exploit one reg interface for pfault To enable pfault after live migration we need to expose pfault_token, pfault_select and pfault_compare, as one reg registers to userspace. So that qemu is able to transfer this between the source and the target. 
Signed-off-by: Dominik Dingel Signed-off-by: Christian Borntraeger diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index ccfd0b1..cb4c1eb8 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -73,4 +73,7 @@ struct kvm_sync_regs { #define KVM_REG_S390_EPOCHDIFF (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x2) #define KVM_REG_S390_CPU_TIMER (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x3) #define KVM_REG_S390_CLOCK_COMP (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x4) +#define KVM_REG_S390_PFTOKEN (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x5) +#define KVM_REG_S390_PFCOMPARE (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x6) +#define KVM_REG_S390_PFSELECT (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x7) #endif diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d8e9f04..a5da2cc 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -561,6 +561,18 @@ static int kvm_arch_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, r = put_user(vcpu->arch.sie_block->ckc, (u64 __user *)reg->addr); break; + case KVM_REG_S390_PFTOKEN: + r = put_user(vcpu->arch.pfault_token, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_PFCOMPARE: + r = put_user(vcpu->arch.pfault_compare, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_PFSELECT: + r = put_user(vcpu->arch.pfault_select, + (u64 __user *)reg->addr); + break; default: break; } @@ -590,6 +602,18 @@ static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, r = get_user(vcpu->arch.sie_block->ckc, (u64 __user *)reg->addr); break; + case KVM_REG_S390_PFTOKEN: + r = get_user(vcpu->arch.pfault_token, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_PFCOMPARE: + r = get_user(vcpu->arch.pfault_compare, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_PFSELECT: + r = get_user(vcpu->arch.pfault_select, + (u64 __user *)reg->addr); + break; default: break; } -- cgit v0.10.2 From 4f34d683e52271197e1ee17b7095e8ba27761ba6 Mon Sep 17 00:00:00 2001 From: Marcelo 
Tosatti Date: Wed, 29 Jan 2014 17:31:38 -0200 Subject: KVM: x86: remove unused last_kernel_ns variable Remove unused last_kernel_ns variable. Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fdf83af..0ffe714 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -444,7 +444,6 @@ struct kvm_vcpu_arch { } st; u64 last_guest_tsc; - u64 last_kernel_ns; u64 last_host_tsc; u64 tsc_offset_adjustment; u64 this_tsc_nsec; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 39c28f09..151e8c3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1581,7 +1581,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* With all the info we got, fill in the values */ vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; - vcpu->last_kernel_ns = kernel_ns; vcpu->last_guest_tsc = tsc_timestamp; /* -- cgit v0.10.2 From 1179ba539541347d5427cde8bcfdaa5ead14f3aa Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Fri, 31 Jan 2014 14:32:46 +0100 Subject: KVM: async_pf: Add missing call for async page present Commit "KVM: async_pf: Provide additional direct page notification" missed the call from kvm_check_async_pf_completion to the newly introduced function.
Reported-by: Paolo Bonzini Signed-off-by: Dominik Dingel Signed-off-by: Paolo Bonzini diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 889aad0..10df100 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -151,7 +151,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->async_pf.lock); kvm_arch_async_page_ready(vcpu, work); - kvm_arch_async_page_present(vcpu, work); + kvm_async_page_present_async(vcpu, work); list_del(&work->queue); vcpu->async_pf.queued--; -- cgit v0.10.2 From 5befdc385ddb2d5ae8995ad89004529a3acf58fc Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 18 Feb 2014 17:22:47 +0900 Subject: KVM: Simplify kvm->tlbs_dirty handling When this was introduced, kvm_flush_remote_tlbs() could be called without holding mmu_lock. It is now acknowledged that the function must be called before releasing mmu_lock, and all callers have already been changed to do so. There is no need to use smp_mb() and cmpxchg() any more. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index cba218a..b1e6c1b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -913,7 +913,8 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't * used by guest then tlbs are not flushed, so guest is allowed to access the * freed pages. - * And we increase kvm->tlbs_dirty to delay tlbs flush in this case. + * We set tlbs_dirty to let the notifier know this change and delay the flush + * until such a case actually happens. 
*/ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -942,7 +943,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return -EINVAL; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - vcpu->kvm->tlbs_dirty++; + vcpu->kvm->tlbs_dirty = true; continue; } @@ -957,7 +958,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); - vcpu->kvm->tlbs_dirty++; + vcpu->kvm->tlbs_dirty = true; continue; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f5937b8..9816b68 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -401,7 +401,9 @@ struct kvm { unsigned long mmu_notifier_seq; long mmu_notifier_count; #endif - long tlbs_dirty; + /* Protected by mmu_lock */ + bool tlbs_dirty; + struct list_head devices; }; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a9e999a..f5668a4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -186,12 +186,9 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) void kvm_flush_remote_tlbs(struct kvm *kvm) { - long dirty_count = kvm->tlbs_dirty; - - smp_mb(); if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.remote_tlb_flush; - cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); + kvm->tlbs_dirty = false; } EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); -- cgit v0.10.2 From f303b4ce8b386558b2b92aeb0c6af96685fcd4b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Fri, 17 Jan 2014 20:52:42 +0100 Subject: KVM: SVM: fix NMI window after iret MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We should open NMI window right after an iret, but SVM exits before it. We wanted to single step using the trap flag and then open it. 
(or we could emulate the iret instead) We don't do it since commit 3842d135ff2 (likely), because the iret exit handler does not request an event, so NMI window remains closed until the next exit. Fix this by making KVM_REQ_EVENT request in the iret handler. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e81df8f..64d9bb9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2842,6 +2842,7 @@ static int iret_interception(struct vcpu_svm *svm) clr_intercept(svm, INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); return 1; } -- cgit v0.10.2 From 0d75de4a65d99ba042b050620d479ab74b1919d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20Luis=20V=C3=A1zquez=20Cao?= Date: Tue, 18 Feb 2014 19:09:11 +0900 Subject: kvm: remove redundant registration of BSP's hv_clock area These days hv_clock allocation is memblock based (i.e. the percpu allocator is not involved), which means that the physical address of each of the per-cpu hv_clock areas is guaranteed to remain unchanged through all its lifetime and we do not need to update its location after CPU bring-up. 
Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 713f1b3..0331cb3 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -417,7 +417,6 @@ void kvm_disable_steal_time(void) #ifdef CONFIG_SMP static void __init kvm_smp_prepare_boot_cpu(void) { - WARN_ON(kvm_register_clock("primary cpu clock")); kvm_guest_cpu_init(); native_smp_prepare_boot_cpu(); kvm_spinlock_init(); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index e604109..d9156ce 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -242,7 +242,7 @@ void __init kvmclock_init(void) hv_clock = __va(mem); memset(hv_clock, 0, size); - if (kvm_register_clock("boot clock")) { + if (kvm_register_clock("primary cpu clock")) { hv_clock = NULL; memblock_free(mem, size); return; -- cgit v0.10.2 From 0c79893b2bad49e0c391a9499f50fcd5b0f80874 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Fri, 21 Feb 2014 17:33:32 +0000 Subject: KVM: x86: expose new instruction RDSEED to guest From 24ffdce9efebf13c6ed4882f714b2b57ef1141eb Mon Sep 17 00:00:00 2001 From: Liu Jinsong Date: Thu, 20 Feb 2014 17:38:26 +0800 Subject: [PATCH] KVM: x86: expose new instruction RDSEED to guest The RDSEED instruction returns a random number, which is supplied by a cryptographically secure, deterministic random bit generator (DRBG).
Signed-off-by: Xudong Hao Signed-off-by: Liu Jinsong Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c697625..abe18b4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -303,7 +303,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | f_invpcid | F(RTM); + F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | F(RDSEED); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); -- cgit v0.10.2 From 49345f13f0830741b94b867cf906c4aad3988306 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Fri, 21 Feb 2014 17:36:12 +0000 Subject: KVM: x86: expose ADX feature to guest From 0750e335eb5860b0b483e217e8a08bd743cbba16 Mon Sep 17 00:00:00 2001 From: Liu Jinsong Date: Thu, 20 Feb 2014 17:39:32 +0800 Subject: [PATCH] KVM: x86: expose ADX feature to guest ADCX and ADOX instructions perform an unsigned addition with Carry flag and Overflow flag respectively. 
Signed-off-by: Xudong Hao Signed-off-by: Liu Jinsong Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index abe18b4..a951ae4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -303,7 +303,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | F(RDSEED); + F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | F(RDSEED) | + F(ADX); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); -- cgit v0.10.2 From 56c103ec040b1944c8866f79aa768265c0dd2986 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Fri, 21 Feb 2014 17:39:02 +0000 Subject: KVM: x86: Fix xsave cpuid exposing bug From 00c920c96127d20d4c3bb790082700ae375c39a0 Mon Sep 17 00:00:00 2001 From: Liu Jinsong Date: Fri, 21 Feb 2014 23:47:18 +0800 Subject: [PATCH] KVM: x86: Fix xsave cpuid exposing bug EBX of cpuid(0xD, 0) is dynamic per XCR0 features enable/disable. Bit 63 of XCR0 is reserved for future expansion. 
Signed-off-by: Liu Jinsong Signed-off-by: Paolo Bonzini diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 5547389..dcd047b 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -13,6 +13,8 @@ #define XSTATE_BNDCSR 0x10 #define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) +/* Bit 63 of XCR0 is reserved for future expansion */ +#define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) #define FXSAVE_SIZE 512 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a951ae4..b241325 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -28,7 +28,7 @@ static u32 xstate_required_size(u64 xstate_bv) int feature_bit = 0; u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; - xstate_bv &= ~XSTATE_FPSSE; + xstate_bv &= XSTATE_EXTEND_MASK; while (xstate_bv) { if (xstate_bv & 0x1) { u32 eax, ebx, ecx, edx; @@ -74,8 +74,8 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.guest_supported_xcr0 = (best->eax | ((u64)best->edx << 32)) & host_xcr0 & KVM_SUPPORTED_XCR0; - vcpu->arch.guest_xstate_size = - xstate_required_size(vcpu->arch.guest_supported_xcr0); + vcpu->arch.guest_xstate_size = best->ebx = + xstate_required_size(vcpu->arch.xcr0); } kvm_pmu_cpuid_update(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 151e8c3..3da8df8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -595,13 +595,13 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { - u64 xcr0; + u64 xcr0 = xcr; + u64 old_xcr0 = vcpu->arch.xcr0; u64 valid_bits; /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ if (index != XCR_XFEATURE_ENABLED_MASK) return 1; - xcr0 = xcr; if (!(xcr0 & XSTATE_FP)) return 1; if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) @@ -618,6 +618,9 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; + + if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK) + kvm_update_cpuid(vcpu); 
return 0; } -- cgit v0.10.2 From da8999d31818fdc8508d527ba3aac2e128005af4 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Mon, 24 Feb 2014 10:55:46 +0000 Subject: KVM: x86: Intel MPX vmx and msr handle From caddc009a6d2019034af8f2346b2fd37a81608d0 Mon Sep 17 00:00:00 2001 From: Liu Jinsong Date: Mon, 24 Feb 2014 18:11:11 +0800 Subject: [PATCH v5 1/3] KVM: x86: Intel MPX vmx and msr handle This patch handle vmx and msr of Intel MPX feature. Signed-off-by: Xudong Hao Signed-off-by: Liu Jinsong Signed-off-by: Paolo Bonzini diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0ffe714..e714f8c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -764,6 +764,7 @@ struct kvm_x86_ops { struct x86_instruction_info *info, enum x86_intercept_stage stage); void (*handle_external_intr)(struct kvm_vcpu *vcpu); + bool (*mpx_supported)(void); }; struct kvm_arch_async_pf { diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 2067264..7004d21 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -85,6 +85,7 @@ #define VM_EXIT_SAVE_IA32_EFER 0x00100000 #define VM_EXIT_LOAD_IA32_EFER 0x00200000 #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 +#define VM_EXIT_CLEAR_BNDCFGS 0x00800000 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff @@ -95,6 +96,7 @@ #define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000 #define VM_ENTRY_LOAD_IA32_PAT 0x00004000 #define VM_ENTRY_LOAD_IA32_EFER 0x00008000 +#define VM_ENTRY_LOAD_BNDCFGS 0x00010000 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff @@ -174,6 +176,8 @@ enum vmcs_field { GUEST_PDPTR2_HIGH = 0x0000280f, GUEST_PDPTR3 = 0x00002810, GUEST_PDPTR3_HIGH = 0x00002811, + GUEST_BNDCFGS = 0x00002812, + GUEST_BNDCFGS_HIGH = 0x00002813, HOST_IA32_PAT = 0x00002c00, HOST_IA32_PAT_HIGH = 0x00002c01, HOST_IA32_EFER = 0x00002c02, diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index 
c19fc60..ed821ed 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -295,6 +295,7 @@ #define MSR_SMI_COUNT 0x00000034 #define MSR_IA32_FEATURE_CONTROL 0x0000003a #define MSR_IA32_TSC_ADJUST 0x0000003b +#define MSR_IA32_BNDCFGS 0x00000d90 #define FEATURE_CONTROL_LOCKED (1<<0) #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a06f101..e4e4b50 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -441,6 +441,7 @@ struct vcpu_vmx { #endif int gs_ldt_reload_needed; int fs_reload_needed; + u64 msr_host_bndcfgs; } host_state; struct { int vm86_active; @@ -1710,6 +1711,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) if (is_long_mode(&vmx->vcpu)) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif + if (boot_cpu_has(X86_FEATURE_MPX)) + rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, vmx->guest_msrs[i].data, @@ -1747,6 +1750,8 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif + if (vmx->host_state.msr_host_bndcfgs) + wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); /* * If the FPU is not active (through the host task or * the guest vcpu), then restore the cr0.TS bit. 
@@ -2837,7 +2842,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | - VM_EXIT_ACK_INTR_ON_EXIT; + VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; @@ -2854,7 +2859,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; min = 0; - opt = VM_ENTRY_LOAD_IA32_PAT; + opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, &_vmentry_control) < 0) return -EIO; @@ -7052,6 +7057,12 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) local_irq_enable(); } +static bool vmx_mpx_supported(void) +{ + return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && + (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -8634,6 +8645,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_intercept = vmx_check_intercept, .handle_external_intr = vmx_handle_external_intr, + .mpx_supported = vmx_mpx_supported, }; static int __init vmx_init(void) @@ -8721,6 +8733,8 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); + vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); + memcpy(vmx_msr_bitmap_legacy_x2apic, vmx_msr_bitmap_legacy, PAGE_SIZE); memcpy(vmx_msr_bitmap_longmode_x2apic, -- cgit v0.10.2 From 0dd376e709975779cf43f368498c5c0eec843b02 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Mon, 24 Feb 2014 10:56:53 +0000 Subject: KVM: x86: add MSR_IA32_BNDCFGS to msrs_to_save From 5d5a80cd172ea6fb51786369bcc23356b1e9e956 Mon Sep 17 00:00:00 2001 From: Liu Jinsong Date: Mon, 24 Feb 2014 18:11:55 
+0800 Subject: [PATCH v5 2/3] KVM: x86: add MSR_IA32_BNDCFGS to msrs_to_save Add MSR_IA32_BNDCFGS to msrs_to_save, and corresponding logic to kvm_get/set_msr(). Signed-off-by: Liu Jinsong Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e4e4b50..83ee24f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2484,6 +2484,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_IA32_SYSENTER_ESP: data = vmcs_readl(GUEST_SYSENTER_ESP); break; + case MSR_IA32_BNDCFGS: + data = vmcs_read64(GUEST_BNDCFGS); + break; case MSR_IA32_FEATURE_CONTROL: if (!nested_vmx_allowed(vcpu)) return 1; @@ -2552,6 +2555,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_SYSENTER_ESP: vmcs_writel(GUEST_SYSENTER_ESP, data); break; + case MSR_IA32_BNDCFGS: + vmcs_write64(GUEST_BNDCFGS, data); + break; case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3da8df8..33fa9e3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -882,7 +882,7 @@ static u32 msrs_to_save[] = { MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, - MSR_IA32_FEATURE_CONTROL + MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS }; static unsigned num_msrs_to_save; -- cgit v0.10.2 From 390bd528ae1c14d0b7f5db8225984f98617b3357 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Mon, 24 Feb 2014 10:58:09 +0000 Subject: KVM: x86: Enable Intel MPX for guest From 44c2abca2c2eadc6f2f752b66de4acc8131880c4 Mon Sep 17 00:00:00 2001 From: Liu Jinsong Date: Mon, 24 Feb 2014 18:12:31 +0800 Subject: [PATCH v5 3/3] KVM: x86: Enable Intel MPX for guest This patch enable Intel MPX feature to guest. 
Signed-off-by: Xudong Hao Signed-off-by: Liu Jinsong Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b241325..ddc8a7e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -256,6 +256,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, #endif unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; + unsigned f_mpx = kvm_x86_ops->mpx_supported ? + (kvm_x86_ops->mpx_supported() ? F(MPX) : 0) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = @@ -303,7 +305,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | F(RDSEED) | + F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX); /* all calls to cpuid_count() should be made on the same cpu */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 33fa9e3..6530019 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -616,6 +616,9 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) if (xcr0 & ~valid_bits) return 1; + if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR))) + return 1; + kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 8da5823..392ecbf 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -122,7 +122,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception); -#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) +#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ + | XSTATE_BNDREGS | XSTATE_BNDCSR) extern u64 host_xcr0; extern unsigned int min_timer_period_us; -- cgit v0.10.2 From 
d3714010c307d26df251c45be9cd12ab6d41f0c4 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 25 Feb 2014 22:44:54 -0300 Subject: KVM: x86: emulator_cmpxchg_emulated should mark_page_dirty emulator_cmpxchg_emulated writes to guest memory, therefore it should update the dirty bitmap accordingly. Signed-off-by: Marcelo Tosatti Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6530019..4cca458 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4399,6 +4399,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (!exchanged) return X86EMUL_CMPXCHG_FAILED; + mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); kvm_mmu_pte_write(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE; -- cgit v0.10.2 From f87618e870d03ac114dd5496b23f6f628af54152 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Wed, 26 Feb 2014 16:14:17 +0100 Subject: KVM: s390: implementation of kvm_arch_vcpu_runnable() A vcpu is defined to be runnable if an interrupt is pending. Signed-off-by: Michael Mueller Reviewed-by: Christian Borntraeger Signed-off-by: Christian Borntraeger Signed-off-by: Paolo Bonzini diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index a5da2cc..18959bb 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -483,9 +483,7 @@ out: int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - /* kvm common code refers to this, but never calls it */ - BUG(); - return 0; + return kvm_cpu_has_interrupt(vcpu); } void s390_vcpu_block(struct kvm_vcpu *vcpu) -- cgit v0.10.2 From 98f4a14676127397c54cab7d6119537ed4d113a2 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Wed, 26 Feb 2014 16:14:18 +0100 Subject: KVM: add kvm_arch_vcpu_runnable() test to kvm_vcpu_on_spin() loop Use the arch specific function kvm_arch_vcpu_runnable() to add a further criterion to identify a suitable vcpu to yield to during undirected yield processing.
Signed-off-by: Michael Mueller Reviewed-by: Christian Borntraeger Signed-off-by: Christian Borntraeger Signed-off-by: Paolo Bonzini diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f5668a4..5fd4cf8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1801,7 +1801,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) continue; if (vcpu == me) continue; - if (waitqueue_active(&vcpu->wq)) + if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) continue; if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) continue; -- cgit v0.10.2 From 9cac38dd5dc41c943d711b96f9755a29c8b854ea Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Wed, 26 Feb 2014 16:14:19 +0100 Subject: KVM/s390: Set preempted flag during vcpu wakeup and interrupt delivery Commit "kvm: Record the preemption status of vcpus using preempt notifiers" caused a performance regression on s390. It turned out that a formerly sleeping cpu that has been woken up is not considered a yield candidate, since it gave up the cpu voluntarily. To retain it as a candidate, its preempted flag is set at wakeup and interrupt delivery time. Significant performance measurement work and code analysis to solve this issue was provided by Mao Chuan Li and his team in Beijing.
Signed-off-by: Michael Mueller Reviewed-by: Christian Borntraeger Signed-off-by: Paolo Bonzini diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 1848080..fff070b 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -505,6 +505,7 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer) struct kvm_vcpu *vcpu; vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer); + vcpu->preempted = true; tasklet_schedule(&vcpu->arch.tasklet); return HRTIMER_NORESTART; @@ -732,6 +733,7 @@ static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); if (waitqueue_active(li->wq)) wake_up_interruptible(li->wq); + kvm_get_vcpu(kvm, sigcpu)->preempted = true; spin_unlock_bh(&li->lock); unlock_fi: spin_unlock(&fi->lock); @@ -877,6 +879,7 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); if (waitqueue_active(&vcpu->wq)) wake_up_interruptible(&vcpu->wq); + vcpu->preempted = true; spin_unlock_bh(&li->lock); mutex_unlock(&vcpu->kvm->lock); return 0; -- cgit v0.10.2 From 684851a15744355f294ee3fee4ca2e9108382b47 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Thu, 27 Feb 2014 15:08:31 +0900 Subject: KVM: x86: Break kvm_for_each_vcpu loop after finding the VP_INDEX No need to scan the entire VCPU array. 
Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4cca458..773eba7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2328,9 +2328,12 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_VP_INDEX: { int r; struct kvm_vcpu *v; - kvm_for_each_vcpu(r, v, vcpu->kvm) - if (v == vcpu) + kvm_for_each_vcpu(r, v, vcpu->kvm) { + if (v == vcpu) { data = r; + break; + } + } break; } case HV_X64_MSR_EOI: -- cgit v0.10.2 From 2d58b733c87689d3d5144e4ac94ea861cc729145 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 14 Jan 2014 19:13:10 +0000 Subject: arm64: KVM: force cache clean on page fault when caches are off In order for the guest with caches off to observe data written contained in a given page, we need to make sure that page is committed to memory, and not just hanging in the cache (as guest accesses are completely bypassing the cache until it decides to enable it). For this purpose, hook into the coherent_icache_guest_page function and flush the region if the guest SCTLR_EL1 register doesn't show the MMU and caches as being enabled. The function also gets renamed to coherent_cache_guest_page.
Signed-off-by: Marc Zyngier Reviewed-by: Catalin Marinas Reviewed-by: Christoffer Dall diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 2d122ad..6d0f3d3 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -116,8 +116,8 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) struct kvm; -static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva, - unsigned long size) +static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva, + unsigned long size) { /* * If we are going to insert an instruction page and the icache is diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 7789857..fc71a8d 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -715,7 +715,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_set_s2pmd_writable(&new_pmd); kvm_set_pfn_dirty(pfn); } - coherent_icache_guest_page(kvm, hva & PMD_MASK, PMD_SIZE); + coherent_cache_guest_page(vcpu, hva & PMD_MASK, PMD_SIZE); ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); } else { pte_t new_pte = pfn_pte(pfn, PAGE_S2); @@ -723,7 +723,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_set_s2pte_writable(&new_pte); kvm_set_pfn_dirty(pfn); } - coherent_icache_guest_page(kvm, hva, PAGE_SIZE); + coherent_cache_guest_page(vcpu, hva, PAGE_SIZE); ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false); } diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 7f1f940..6eaf69b 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -106,7 +106,6 @@ static inline bool kvm_is_write_fault(unsigned long esr) return true; } -static inline void kvm_clean_dcache_area(void *addr, size_t size) {} static inline void kvm_clean_pgd(pgd_t *pgd) {} static inline void kvm_clean_pmd_entry(pmd_t *pmd) {} static inline void kvm_clean_pte(pte_t *pte) {} @@ -124,9 +123,19 @@ static inline 
void kvm_set_s2pmd_writable(pmd_t *pmd) struct kvm; -static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva, - unsigned long size) +#define kvm_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l)) + +static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu) { + return (vcpu_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101; +} + +static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva, + unsigned long size) +{ + if (!vcpu_has_cache_enabled(vcpu)) + kvm_flush_dcache_to_poc((void *)hva, size); + if (!icache_is_aliasing()) { /* PIPT */ flush_icache_range(hva, hva + size); } else if (!icache_is_aivivt()) { /* non ASID-tagged VIVT */ @@ -135,7 +144,6 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva, } } -#define kvm_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l)) #define kvm_virt_to_phys(x) __virt_to_phys((unsigned long)(x)) #endif /* __ASSEMBLY__ */ -- cgit v0.10.2 From 2072d29c46b73e39b3c6c56c6027af77086f45fd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 21 Jan 2014 10:55:17 +0000 Subject: arm64: KVM: allows discrimination of AArch32 sysreg access The current handling of AArch32 trapping is slightly less than perfect, as it is not possible (from a handler point of view) to distinguish it from an AArch64 access, nor to tell a 32bit from a 64bit access either. Fix this by introducing two additional flags: - is_aarch32: true if the access was made in AArch32 mode - is_32bit: true if is_aarch32 == true and a MCR/MRC instruction was used to perform the access (as opposed to MCRR/MRRC). This allows a handler to cover all the possible conditions in which a system register gets trapped. 
Signed-off-by: Marc Zyngier Acked-by: Christoffer Dall diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 02e9d09..bf03e0f 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -437,6 +437,8 @@ int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run) u32 hsr = kvm_vcpu_get_hsr(vcpu); int Rt2 = (hsr >> 10) & 0xf; + params.is_aarch32 = true; + params.is_32bit = false; params.CRm = (hsr >> 1) & 0xf; params.Rt = (hsr >> 5) & 0xf; params.is_write = ((hsr & 1) == 0); @@ -480,6 +482,8 @@ int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run) struct sys_reg_params params; u32 hsr = kvm_vcpu_get_hsr(vcpu); + params.is_aarch32 = true; + params.is_32bit = true; params.CRm = (hsr >> 1) & 0xf; params.Rt = (hsr >> 5) & 0xf; params.is_write = ((hsr & 1) == 0); @@ -549,6 +553,8 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run) struct sys_reg_params params; unsigned long esr = kvm_vcpu_get_hsr(vcpu); + params.is_aarch32 = false; + params.is_32bit = false; params.Op0 = (esr >> 20) & 3; params.Op1 = (esr >> 14) & 0x7; params.CRn = (esr >> 10) & 0xf; diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index d50d372..d411e25 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -30,6 +30,8 @@ struct sys_reg_params { u8 Op2; u8 Rt; bool is_write; + bool is_aarch32; + bool is_32bit; /* Only valid if is_aarch32 is true */ }; struct sys_reg_desc { -- cgit v0.10.2 From 4d44923b17bff283c002ed961373848284aaff1b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 14 Jan 2014 18:00:55 +0000 Subject: arm64: KVM: trap VM system registers until MMU and caches are ON In order to be able to detect the point where the guest enables its MMU and caches, trap all the VM related system registers. Once we see the guest enabling both the MMU and the caches, we can go back to a saner mode of operation, which is to leave these registers in complete control of the guest. 
Signed-off-by: Marc Zyngier Reviewed-by: Catalin Marinas Reviewed-by: Christoffer Dall diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 0eb3986..00fbaa7 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -62,6 +62,7 @@ * RW: 64bit by default, can be overriden for 32bit VMs * TAC: Trap ACTLR * TSC: Trap SMC + * TVM: Trap VM ops (until M+C set in SCTLR_EL1) * TSW: Trap cache operations by set/way * TWE: Trap WFE * TWI: Trap WFI @@ -74,7 +75,7 @@ * SWIO: Turn set/way invalidates into set/way clean+invalidate */ #define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \ - HCR_BSU_IS | HCR_FB | HCR_TAC | \ + HCR_TVM | HCR_BSU_IS | HCR_FB | HCR_TAC | \ HCR_AMO | HCR_IMO | HCR_FMO | \ HCR_SWIO | HCR_TIDCP | HCR_RW) #define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index b25763b..9fcd54b 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -79,7 +79,8 @@ #define c13_TID_URW (TPIDR_EL0 * 2) /* Thread ID, User R/W */ #define c13_TID_URO (TPIDRRO_EL0 * 2)/* Thread ID, User R/O */ #define c13_TID_PRIV (TPIDR_EL1 * 2) /* Thread ID, Privileged */ -#define c10_AMAIR (AMAIR_EL1 * 2) /* Aux Memory Attr Indirection Reg */ +#define c10_AMAIR0 (AMAIR_EL1 * 2) /* Aux Memory Attr Indirection Reg */ +#define c10_AMAIR1 (c10_AMAIR0 + 1)/* Aux Memory Attr Indirection Reg */ #define c14_CNTKCTL (CNTKCTL_EL1 * 2) /* Timer Control Register (PL1) */ #define NR_CP15_REGS (NR_SYS_REGS * 2) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index bf03e0f..2097e5e 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -121,6 +121,46 @@ done: } /* + * Generic accessor for VM registers. Only called as long as HCR_TVM + * is set. 
+ */ +static bool access_vm_reg(struct kvm_vcpu *vcpu, + const struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + unsigned long val; + + BUG_ON(!p->is_write); + + val = *vcpu_reg(vcpu, p->Rt); + if (!p->is_aarch32) { + vcpu_sys_reg(vcpu, r->reg) = val; + } else { + vcpu_cp15(vcpu, r->reg) = val & 0xffffffffUL; + if (!p->is_32bit) + vcpu_cp15(vcpu, r->reg + 1) = val >> 32; + } + return true; +} + +/* + * SCTLR_EL1 accessor. Only called as long as HCR_TVM is set. If the + * guest enables the MMU, we stop trapping the VM sys_regs and leave + * it in complete control of the caches. + */ +static bool access_sctlr(struct kvm_vcpu *vcpu, + const struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + access_vm_reg(vcpu, p, r); + + if (vcpu_has_cache_enabled(vcpu)) /* MMU+Caches enabled? */ + vcpu->arch.hcr_el2 &= ~HCR_TVM; + + return true; +} + +/* * We could trap ID_DFR0 and tell the guest we don't support performance * monitoring. Unfortunately the patch to make the kernel check ID_DFR0 was * NAKed, so it will read the PMCR anyway. 
@@ -185,32 +225,32 @@ static const struct sys_reg_desc sys_reg_descs[] = { NULL, reset_mpidr, MPIDR_EL1 }, /* SCTLR_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b000), - NULL, reset_val, SCTLR_EL1, 0x00C50078 }, + access_sctlr, reset_val, SCTLR_EL1, 0x00C50078 }, /* CPACR_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b010), NULL, reset_val, CPACR_EL1, 0 }, /* TTBR0_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b000), - NULL, reset_unknown, TTBR0_EL1 }, + access_vm_reg, reset_unknown, TTBR0_EL1 }, /* TTBR1_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b001), - NULL, reset_unknown, TTBR1_EL1 }, + access_vm_reg, reset_unknown, TTBR1_EL1 }, /* TCR_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b010), - NULL, reset_val, TCR_EL1, 0 }, + access_vm_reg, reset_val, TCR_EL1, 0 }, /* AFSR0_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0001), Op2(0b000), - NULL, reset_unknown, AFSR0_EL1 }, + access_vm_reg, reset_unknown, AFSR0_EL1 }, /* AFSR1_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0001), Op2(0b001), - NULL, reset_unknown, AFSR1_EL1 }, + access_vm_reg, reset_unknown, AFSR1_EL1 }, /* ESR_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0010), Op2(0b000), - NULL, reset_unknown, ESR_EL1 }, + access_vm_reg, reset_unknown, ESR_EL1 }, /* FAR_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b0110), CRm(0b0000), Op2(0b000), - NULL, reset_unknown, FAR_EL1 }, + access_vm_reg, reset_unknown, FAR_EL1 }, /* PAR_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b0111), CRm(0b0100), Op2(0b000), NULL, reset_unknown, PAR_EL1 }, @@ -224,17 +264,17 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* MAIR_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b1010), CRm(0b0010), Op2(0b000), - NULL, reset_unknown, MAIR_EL1 }, + access_vm_reg, reset_unknown, MAIR_EL1 }, /* AMAIR_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b1010), CRm(0b0011), Op2(0b000), - NULL, reset_amair_el1, AMAIR_EL1 }, + access_vm_reg, reset_amair_el1, AMAIR_EL1 }, 
/* VBAR_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b0000), Op2(0b000), NULL, reset_val, VBAR_EL1, 0 }, /* CONTEXTIDR_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b1101), CRm(0b0000), Op2(0b001), - NULL, reset_val, CONTEXTIDR_EL1, 0 }, + access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 }, /* TPIDR_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b1101), CRm(0b0000), Op2(0b100), NULL, reset_unknown, TPIDR_EL1 }, @@ -305,14 +345,32 @@ static const struct sys_reg_desc sys_reg_descs[] = { NULL, reset_val, FPEXC32_EL2, 0x70 }, }; -/* Trapped cp15 registers */ +/* + * Trapped cp15 registers. TTBR0/TTBR1 get a double encoding, + * depending on the way they are accessed (as a 32bit or a 64bit + * register). + */ static const struct sys_reg_desc cp15_regs[] = { + { Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR0 }, + { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_sctlr, NULL, c1_SCTLR }, + { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, c2_TTBR0 }, + { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, c2_TTBR1 }, + { Op1( 0), CRn( 2), CRm( 0), Op2( 2), access_vm_reg, NULL, c2_TTBCR }, + { Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, c3_DACR }, + { Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, c5_DFSR }, + { Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, c5_IFSR }, + { Op1( 0), CRn( 5), CRm( 1), Op2( 0), access_vm_reg, NULL, c5_ADFSR }, + { Op1( 0), CRn( 5), CRm( 1), Op2( 1), access_vm_reg, NULL, c5_AIFSR }, + { Op1( 0), CRn( 6), CRm( 0), Op2( 0), access_vm_reg, NULL, c6_DFAR }, + { Op1( 0), CRn( 6), CRm( 0), Op2( 2), access_vm_reg, NULL, c6_IFAR }, + /* * DC{C,I,CI}SW operations: */ { Op1( 0), CRn( 7), CRm( 6), Op2( 2), access_dcsw }, { Op1( 0), CRn( 7), CRm(10), Op2( 2), access_dcsw }, { Op1( 0), CRn( 7), CRm(14), Op2( 2), access_dcsw }, + { Op1( 0), CRn( 9), CRm(12), Op2( 0), pm_fake }, { Op1( 0), CRn( 9), CRm(12), Op2( 1), pm_fake }, { Op1( 0), CRn( 9), CRm(12), Op2( 2), pm_fake }, @@ -326,6 +384,14 @@ static const struct 
sys_reg_desc cp15_regs[] = { { Op1( 0), CRn( 9), CRm(14), Op2( 0), pm_fake }, { Op1( 0), CRn( 9), CRm(14), Op2( 1), pm_fake }, { Op1( 0), CRn( 9), CRm(14), Op2( 2), pm_fake }, + + { Op1( 0), CRn(10), CRm( 2), Op2( 0), access_vm_reg, NULL, c10_PRRR }, + { Op1( 0), CRn(10), CRm( 2), Op2( 1), access_vm_reg, NULL, c10_NMRR }, + { Op1( 0), CRn(10), CRm( 3), Op2( 0), access_vm_reg, NULL, c10_AMAIR0 }, + { Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, c10_AMAIR1 }, + { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, c13_CID }, + + { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR1 }, }; /* Target specific emulation tables */ -- cgit v0.10.2 From a3c8bd31af260a17d626514f636849ee1cd1f63e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 18 Feb 2014 14:29:03 +0000 Subject: ARM: KVM: introduce kvm_p*d_addr_end The use of p*d_addr_end with stage-2 translation is slightly dodgy, as the IPA is 40bits, while all the p*d_addr_end helpers are taking an unsigned long (arm64 is fine with that as unsigned long is 64bit). The fix is to introduce 64bit clean versions of the same helpers, and use them in the stage-2 page table code. Signed-off-by: Marc Zyngier Acked-by: Catalin Marinas Reviewed-by: Christoffer Dall diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 6d0f3d3..891afe7 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -114,6 +114,19 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +/* Open coded p*d_addr_end that can deal with 64bit addresses */ +#define kvm_pgd_addr_end(addr, end) \ +({ u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \ + (__boundary - 1 < (end) - 1)? __boundary: (end); \ +}) + +#define kvm_pud_addr_end(addr,end) (end) + +#define kvm_pmd_addr_end(addr, end) \ +({ u64 __boundary = ((addr) + PMD_SIZE) & PMD_MASK; \ + (__boundary - 1 < (end) - 1)?
__boundary: (end); \ +}) + struct kvm; static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva, diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index fc71a8d..c1c08b2 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -145,7 +145,7 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, pgd = pgdp + pgd_index(addr); pud = pud_offset(pgd, addr); if (pud_none(*pud)) { - addr = pud_addr_end(addr, end); + addr = kvm_pud_addr_end(addr, end); continue; } @@ -155,13 +155,13 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, * move on. */ clear_pud_entry(kvm, pud, addr); - addr = pud_addr_end(addr, end); + addr = kvm_pud_addr_end(addr, end); continue; } pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { - addr = pmd_addr_end(addr, end); + addr = kvm_pmd_addr_end(addr, end); continue; } @@ -176,10 +176,10 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, */ if (kvm_pmd_huge(*pmd) || page_empty(pte)) { clear_pmd_entry(kvm, pmd, addr); - next = pmd_addr_end(addr, end); + next = kvm_pmd_addr_end(addr, end); if (page_empty(pmd) && !page_empty(pud)) { clear_pud_entry(kvm, pud, addr); - next = pud_addr_end(addr, end); + next = kvm_pud_addr_end(addr, end); } } diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 6eaf69b..00c0cc8 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -121,6 +121,10 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= PMD_S2_RDWR; } +#define kvm_pgd_addr_end(addr, end) pgd_addr_end(addr, end) +#define kvm_pud_addr_end(addr, end) pud_addr_end(addr, end) +#define kvm_pmd_addr_end(addr, end) pmd_addr_end(addr, end) + struct kvm; #define kvm_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l)) -- cgit v0.10.2 From 9d218a1fcf4c6b759d442ef702842fae92e1ea61 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 15 Jan 2014 12:50:23 +0000 Subject: arm64: KVM: flush VM pages before letting the guest enable caches When 
the guest runs with caches disabled (like in an early boot sequence, for example), all the writes are going directly to RAM, bypassing the caches altogether. Once the MMU and caches are enabled, whatever sits in the cache becomes suddenly visible, which isn't what the guest expects. A way to avoid this potential disaster is to invalidate the cache when the MMU is being turned on. For this, we hook into the SCTLR_EL1 trapping code, and scan the stage-2 page tables, invalidating the pages/sections that have already been mapped in. Signed-off-by: Marc Zyngier Reviewed-by: Catalin Marinas Reviewed-by: Christoffer Dall diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 891afe7..eb85b81 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -155,6 +155,8 @@ static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva, #define kvm_flush_dcache_to_poc(a,l) __cpuc_flush_dcache_area((a), (l)) #define kvm_virt_to_phys(x) virt_to_idmap((unsigned long)(x)) +void stage2_flush_vm(struct kvm *kvm); + #endif /* !__ASSEMBLY__ */ #endif /* __ARM_KVM_MMU_H__ */ diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index c1c08b2..d7e998c 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -187,6 +187,99 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, } } +static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd, + phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + hva_t hva = gfn_to_hva(kvm, addr >> PAGE_SHIFT); + kvm_flush_dcache_to_poc((void*)hva, PAGE_SIZE); + } + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, + phys_addr_t addr, phys_addr_t end) +{ + pmd_t *pmd; + phys_addr_t next; + + pmd = pmd_offset(pud, addr); + do { + next = kvm_pmd_addr_end(addr, end); + if (!pmd_none(*pmd)) { + if (kvm_pmd_huge(*pmd)) { + hva_t hva = gfn_to_hva(kvm, addr >>
PAGE_SHIFT); + kvm_flush_dcache_to_poc((void*)hva, PMD_SIZE); + } else { + stage2_flush_ptes(kvm, pmd, addr, next); + } + } + } while (pmd++, addr = next, addr != end); +} + +static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, + phys_addr_t addr, phys_addr_t end) +{ + pud_t *pud; + phys_addr_t next; + + pud = pud_offset(pgd, addr); + do { + next = kvm_pud_addr_end(addr, end); + if (!pud_none(*pud)) { + if (pud_huge(*pud)) { + hva_t hva = gfn_to_hva(kvm, addr >> PAGE_SHIFT); + kvm_flush_dcache_to_poc((void*)hva, PUD_SIZE); + } else { + stage2_flush_pmds(kvm, pud, addr, next); + } + } + } while (pud++, addr = next, addr != end); +} + +static void stage2_flush_memslot(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; + phys_addr_t end = addr + PAGE_SIZE * memslot->npages; + phys_addr_t next; + pgd_t *pgd; + + pgd = kvm->arch.pgd + pgd_index(addr); + do { + next = kvm_pgd_addr_end(addr, end); + stage2_flush_puds(kvm, pgd, addr, next); + } while (pgd++, addr = next, addr != end); +} + +/** + * stage2_flush_vm - Invalidate cache for pages mapped in stage 2 + * @kvm: The struct kvm pointer + * + * Go through the stage 2 page tables and invalidate any cache lines + * backing memory already mapped to the VM. 
+ */ +void stage2_flush_vm(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int idx; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, slots) + stage2_flush_memslot(kvm, memslot); + + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); +} + /** * free_boot_hyp_pgd - free HYP boot page tables * diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 00c0cc8..7d29847 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -150,5 +150,7 @@ static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva, #define kvm_virt_to_phys(x) __virt_to_phys((unsigned long)(x)) +void stage2_flush_vm(struct kvm *kvm); + #endif /* __ASSEMBLY__ */ #endif /* __ARM64_KVM_MMU_H__ */ diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 2097e5e..0324458 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -154,8 +155,10 @@ static bool access_sctlr(struct kvm_vcpu *vcpu, { access_vm_reg(vcpu, p, r); - if (vcpu_has_cache_enabled(vcpu)) /* MMU+Caches enabled? */ + if (vcpu_has_cache_enabled(vcpu)) { /* MMU+Caches enabled? */ vcpu->arch.hcr_el2 &= ~HCR_TVM; + stage2_flush_vm(vcpu->kvm); + } return true; } -- cgit v0.10.2 From 159793001d7d85af17855630c94f0a176848e16b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 14 Jan 2014 19:13:10 +0000 Subject: ARM: KVM: force cache clean on page fault when caches are off In order for a guest with caches disabled to observe data written contained in a given page, we need to make sure that page is committed to memory, and not just hanging in the cache (as guest accesses are completely bypassing the cache until it decides to enable it). 
For this purpose, hook into the coherent_cache_guest_page function and flush the region if the guest SCTLR register doesn't show the MMU and caches as being enabled. Signed-off-by: Marc Zyngier Reviewed-by: Christoffer Dall Reviewed-by: Catalin Marinas diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index eb85b81..5c7aa3c 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -129,9 +129,19 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) struct kvm; +#define kvm_flush_dcache_to_poc(a,l) __cpuc_flush_dcache_area((a), (l)) + +static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.cp15[c1_SCTLR] & 0b101) == 0b101; +} + static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva, unsigned long size) { + if (!vcpu_has_cache_enabled(vcpu)) + kvm_flush_dcache_to_poc((void *)hva, size); + /* * If we are going to insert an instruction page and the icache is * either VIPT or PIPT, there is a potential problem where the host @@ -152,7 +162,6 @@ static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva, } } -#define kvm_flush_dcache_to_poc(a,l) __cpuc_flush_dcache_area((a), (l)) #define kvm_virt_to_phys(x) virt_to_idmap((unsigned long)(x)) void stage2_flush_vm(struct kvm *kvm); -- cgit v0.10.2 From 46c214dd595381c880794413facadfa07fba5c95 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 21 Jan 2014 18:56:26 +0000 Subject: ARM: KVM: fix handling of trapped 64bit coprocessor accesses Commit 240e99cbd00a (ARM: KVM: Fix 64-bit coprocessor handling) changed the way we match the 64bit coprocessor access from user space, but didn't update the trap handler for the same set of registers. The effect is that a trapped 64bit access is never matched, leading to a fault being injected into the guest. This went unnoticed as we didn't really trap any 64bit register so far. 
Placing the CRm field of the access into the CRn field of the matching structure fixes the problem. Also update the debug feature to emit the expected string in case of failing match. Signed-off-by: Marc Zyngier Reviewed-by: Christoffer Dall Acked-by: Catalin Marinas diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index 78c0885..126c90d 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -443,7 +443,7 @@ int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run) { struct coproc_params params; - params.CRm = (kvm_vcpu_get_hsr(vcpu) >> 1) & 0xf; + params.CRn = (kvm_vcpu_get_hsr(vcpu) >> 1) & 0xf; params.Rt1 = (kvm_vcpu_get_hsr(vcpu) >> 5) & 0xf; params.is_write = ((kvm_vcpu_get_hsr(vcpu) & 1) == 0); params.is_64bit = true; @@ -451,7 +451,7 @@ int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run) params.Op1 = (kvm_vcpu_get_hsr(vcpu) >> 16) & 0xf; params.Op2 = 0; params.Rt2 = (kvm_vcpu_get_hsr(vcpu) >> 10) & 0xf; - params.CRn = 0; + params.CRm = 0; return emulate_cp15(vcpu, &params); } diff --git a/arch/arm/kvm/coproc.h b/arch/arm/kvm/coproc.h index 0461d5c..c5ad7ff 100644 --- a/arch/arm/kvm/coproc.h +++ b/arch/arm/kvm/coproc.h @@ -58,8 +58,8 @@ static inline void print_cp_instr(const struct coproc_params *p) { /* Look, we even formatted it for you to paste into the table! */ if (p->is_64bit) { - kvm_pr_unimpl(" { CRm(%2lu), Op1(%2lu), is64, func_%s },\n", - p->CRm, p->Op1, p->is_write ? "write" : "read"); + kvm_pr_unimpl(" { CRm64(%2lu), Op1(%2lu), is64, func_%s },\n", + p->CRn, p->Op1, p->is_write ?
"write" : "read"); } else { kvm_pr_unimpl(" { CRn(%2lu), CRm(%2lu), Op1(%2lu), Op2(%2lu), is32," " func_%s },\n", -- cgit v0.10.2 From 547f781378a22b65c2ab468f235c23001b5924da Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 21 Jan 2014 18:56:26 +0000 Subject: ARM: KVM: fix ordering of 64bit coprocessor accesses Commit 240e99cbd00a (ARM: KVM: Fix 64-bit coprocessor handling) added an ordering dependency for the 64bit registers. The order described is: CRn, CRm, Op1, Op2, 64bit-first. Unfortunately, the implementation is: CRn, 64bit-first, CRm... Move the 64bit test to be last in order to match the documentation. Signed-off-by: Marc Zyngier Reviewed-by: Christoffer Dall Acked-by: Catalin Marinas diff --git a/arch/arm/kvm/coproc.h b/arch/arm/kvm/coproc.h index c5ad7ff..8dda870 100644 --- a/arch/arm/kvm/coproc.h +++ b/arch/arm/kvm/coproc.h @@ -135,13 +135,13 @@ static inline int cmp_reg(const struct coproc_reg *i1, return -1; if (i1->CRn != i2->CRn) return i1->CRn - i2->CRn; - if (i1->is_64 != i2->is_64) - return i2->is_64 - i1->is_64; if (i1->CRm != i2->CRm) return i1->CRm - i2->CRm; if (i1->Op1 != i2->Op1) return i1->Op1 - i2->Op1; - return i1->Op2 - i2->Op2; + if (i1->Op2 != i2->Op2) + return i1->Op2 - i2->Op2; + return i2->is_64 - i1->is_64; } -- cgit v0.10.2 From ac30a11e8e92a03dbe236b285c5cbae0bf563141 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 22 Jan 2014 09:43:38 +0000 Subject: ARM: KVM: introduce per-vcpu HYP Configuration Register So far, KVM/ARM used a fixed HCR configuration per guest, except for the VI/VF/VA bits to control the interrupt in absence of VGIC. With the upcoming need to dynamically reconfigure trapping, it becomes necessary to allow the HCR to be changed on a per-vcpu basis. The fix here is to mimic what KVM/arm64 already does: a per vcpu HCR field, initialized at setup time. 
Signed-off-by: Marc Zyngier Reviewed-by: Christoffer Dall Acked-by: Catalin Marinas diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h index 1d3153c..a843e74 100644 --- a/arch/arm/include/asm/kvm_arm.h +++ b/arch/arm/include/asm/kvm_arm.h @@ -69,7 +69,6 @@ #define HCR_GUEST_MASK (HCR_TSC | HCR_TSW | HCR_TWI | HCR_VM | HCR_BSU_IS | \ HCR_FB | HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | \ HCR_TWE | HCR_SWIO | HCR_TIDCP) -#define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF) /* System Control Register (SCTLR) bits */ #define SCTLR_TE (1 << 30) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 098f7dd..09af149 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -101,6 +101,12 @@ struct kvm_vcpu_arch { /* The CPU type we expose to the VM */ u32 midr; + /* HYP trapping configuration */ + u32 hcr; + + /* Interrupt related fields */ + u32 irq_lines; /* IRQ and FIQ levels */ + /* Exception Information */ struct kvm_vcpu_fault_info fault; @@ -128,9 +134,6 @@ struct kvm_vcpu_arch { /* IO related fields */ struct kvm_decode mmio_decode; - /* Interrupt related fields */ - u32 irq_lines; /* IRQ and FIQ levels */ - /* Cache some mmu pages needed inside spinlock regions */ struct kvm_mmu_memory_cache mmu_page_cache; diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index ded0417..85598b5 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -174,6 +174,7 @@ int main(void) DEFINE(VCPU_FIQ_REGS, offsetof(struct kvm_vcpu, arch.regs.fiq_regs)); DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.regs.usr_regs.ARM_pc)); DEFINE(VCPU_CPSR, offsetof(struct kvm_vcpu, arch.regs.usr_regs.ARM_cpsr)); + DEFINE(VCPU_HCR, offsetof(struct kvm_vcpu, arch.hcr)); DEFINE(VCPU_IRQ_LINES, offsetof(struct kvm_vcpu, arch.irq_lines)); DEFINE(VCPU_HSR, offsetof(struct kvm_vcpu, arch.fault.hsr)); DEFINE(VCPU_HxFAR, offsetof(struct kvm_vcpu, arch.fault.hxfar)); diff --git 
a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c index 2786eae..b23a59c 100644 --- a/arch/arm/kvm/guest.c +++ b/arch/arm/kvm/guest.c @@ -38,6 +38,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { + vcpu->arch.hcr = HCR_GUEST_MASK; return 0; } diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S index 6f18695..a37270d 100644 --- a/arch/arm/kvm/interrupts_head.S +++ b/arch/arm/kvm/interrupts_head.S @@ -597,17 +597,14 @@ vcpu .req r0 @ vcpu pointer always in r0 /* Enable/Disable: stage-2 trans., trap interrupts, trap wfi, trap smc */ .macro configure_hyp_role operation - mrc p15, 4, r2, c1, c1, 0 @ HCR - bic r2, r2, #HCR_VIRT_EXCP_MASK - ldr r3, =HCR_GUEST_MASK .if \operation == vmentry - orr r2, r2, r3 + ldr r2, [vcpu, #VCPU_HCR] ldr r3, [vcpu, #VCPU_IRQ_LINES] orr r2, r2, r3 .else - bic r2, r2, r3 + mov r2, #0 .endif - mcr p15, 4, r2, c1, c1, 0 + mcr p15, 4, r2, c1, c1, 0 @ HCR .endm .macro load_vcpu -- cgit v0.10.2 From af20814ee927ed888288d98917a766b4179c4fe0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 22 Jan 2014 10:20:09 +0000 Subject: ARM: KVM: add world-switch for AMAIR{0,1} HCR.TVM traps (among other things) accesses to AMAIR0 and AMAIR1. In order to minimise the amount of surprise a guest could generate by trying to access these registers with caches off, add them to the list of registers we switch/handle. Signed-off-by: Marc Zyngier Reviewed-by: Christoffer Dall Acked-by: Catalin Marinas diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 661da11..53b3c4a 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -48,7 +48,9 @@ #define c13_TID_URO 26 /* Thread ID, User R/O */ #define c13_TID_PRIV 27 /* Thread ID, Privileged */ #define c14_CNTKCTL 28 /* Timer Control Register (PL1) */ -#define NR_CP15_REGS 29 /* Number of regs (incl. 
invalid) */ +#define c10_AMAIR0 29 /* Auxilary Memory Attribute Indirection Reg0 */ +#define c10_AMAIR1 30 /* Auxilary Memory Attribute Indirection Reg1 */ +#define NR_CP15_REGS 31 /* Number of regs (incl. invalid) */ #define ARM_EXCEPTION_RESET 0 #define ARM_EXCEPTION_UNDEFINED 1 diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index 126c90d..a5a54a4 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -328,6 +328,12 @@ static const struct coproc_reg cp15_regs[] = { { CRn(10), CRm( 2), Op1( 0), Op2( 1), is32, NULL, reset_unknown, c10_NMRR}, + /* AMAIR0/AMAIR1: swapped by interrupt.S. */ + { CRn(10), CRm( 3), Op1( 0), Op2( 0), is32, + access_vm_reg, reset_unknown, c10_AMAIR0}, + { CRn(10), CRm( 3), Op1( 0), Op2( 1), is32, + access_vm_reg, reset_unknown, c10_AMAIR1}, + /* VBAR: swapped by interrupt.S. */ { CRn(12), CRm( 0), Op1( 0), Op2( 0), is32, NULL, reset_val, c12_VBAR, 0x00000000 }, diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S index a37270d..76af9302 100644 --- a/arch/arm/kvm/interrupts_head.S +++ b/arch/arm/kvm/interrupts_head.S @@ -303,13 +303,17 @@ vcpu .req r0 @ vcpu pointer always in r0 mrc p15, 0, r2, c14, c1, 0 @ CNTKCTL mrrc p15, 0, r4, r5, c7 @ PAR + mrc p15, 0, r6, c10, c3, 0 @ AMAIR0 + mrc p15, 0, r7, c10, c3, 1 @ AMAIR1 .if \store_to_vcpu == 0 - push {r2,r4-r5} + push {r2,r4-r7} .else str r2, [vcpu, #CP15_OFFSET(c14_CNTKCTL)] add r12, vcpu, #CP15_OFFSET(c7_PAR) strd r4, r5, [r12] + str r6, [vcpu, #CP15_OFFSET(c10_AMAIR0)] + str r7, [vcpu, #CP15_OFFSET(c10_AMAIR1)] .endif .endm @@ -322,15 +326,19 @@ vcpu .req r0 @ vcpu pointer always in r0 */ .macro write_cp15_state read_from_vcpu .if \read_from_vcpu == 0 - pop {r2,r4-r5} + pop {r2,r4-r7} .else ldr r2, [vcpu, #CP15_OFFSET(c14_CNTKCTL)] add r12, vcpu, #CP15_OFFSET(c7_PAR) ldrd r4, r5, [r12] + ldr r6, [vcpu, #CP15_OFFSET(c10_AMAIR0)] + ldr r7, [vcpu, #CP15_OFFSET(c10_AMAIR1)] .endif mcr p15, 0, r2, c14, c1, 0 @ CNTKCTL mcrr p15, 0, r4, r5, c7 @ PAR 
+ mcr p15, 0, r6, c10, c3, 0 @ AMAIR0 + mcr p15, 0, r7, c10, c3, 1 @ AMAIR1 .if \read_from_vcpu == 0 pop {r2-r12} -- cgit v0.10.2 From 8034699a42d68043b495c7e0cfafccd920707ec8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 14 Jan 2014 18:00:55 +0000 Subject: ARM: KVM: trap VM system registers until MMU and caches are ON In order to be able to detect the point where the guest enables its MMU and caches, trap all the VM related system registers. Once we see the guest enabling both the MMU and the caches, we can go back to a saner mode of operation, which is to leave these registers in complete control of the guest. Signed-off-by: Marc Zyngier Acked-by: Catalin Marinas Reviewed-by: Christoffer Dall diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h index a843e74..816db0b 100644 --- a/arch/arm/include/asm/kvm_arm.h +++ b/arch/arm/include/asm/kvm_arm.h @@ -55,6 +55,7 @@ * The bits we set in HCR: * TAC: Trap ACTLR * TSC: Trap SMC + * TVM: Trap VM ops (until MMU and caches are on) * TSW: Trap cache operations by set/way * TWI: Trap WFI * TWE: Trap WFE @@ -68,7 +69,7 @@ */ #define HCR_GUEST_MASK (HCR_TSC | HCR_TSW | HCR_TWI | HCR_VM | HCR_BSU_IS | \ HCR_FB | HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | \ - HCR_TWE | HCR_SWIO | HCR_TIDCP) + HCR_TVM | HCR_TWE | HCR_SWIO | HCR_TIDCP) /* System Control Register (SCTLR) bits */ #define SCTLR_TE (1 << 30) diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index a5a54a4..c58a351 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -205,6 +206,44 @@ done: } /* + * Generic accessor for VM registers. Only called as long as HCR_TVM + * is set. 
+ */ +static bool access_vm_reg(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + const struct coproc_reg *r) +{ + BUG_ON(!p->is_write); + + vcpu->arch.cp15[r->reg] = *vcpu_reg(vcpu, p->Rt1); + if (p->is_64bit) + vcpu->arch.cp15[r->reg + 1] = *vcpu_reg(vcpu, p->Rt2); + + return true; +} + +/* + * SCTLR accessor. Only called as long as HCR_TVM is set. If the + * guest enables the MMU, we stop trapping the VM sys_regs and leave + * it in complete control of the caches. + * + * Used by the cpu-specific code. + */ +bool access_sctlr(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + const struct coproc_reg *r) +{ + access_vm_reg(vcpu, p, r); + + if (vcpu_has_cache_enabled(vcpu)) { /* MMU+Caches enabled? */ + vcpu->arch.hcr &= ~HCR_TVM; + stage2_flush_vm(vcpu->kvm); + } + + return true; +} + +/* * We could trap ID_DFR0 and tell the guest we don't support performance * monitoring. Unfortunately the patch to make the kernel check ID_DFR0 was * NAKed, so it will read the PMCR anyway. @@ -261,33 +300,36 @@ static const struct coproc_reg cp15_regs[] = { { CRn( 1), CRm( 0), Op1( 0), Op2( 2), is32, NULL, reset_val, c1_CPACR, 0x00000000 }, - /* TTBR0/TTBR1: swapped by interrupt.S. */ - { CRm64( 2), Op1( 0), is64, NULL, reset_unknown64, c2_TTBR0 }, - { CRm64( 2), Op1( 1), is64, NULL, reset_unknown64, c2_TTBR1 }, - - /* TTBCR: swapped by interrupt.S. */ + /* TTBR0/TTBR1/TTBCR: swapped by interrupt.S. */ + { CRm64( 2), Op1( 0), is64, access_vm_reg, reset_unknown64, c2_TTBR0 }, + { CRn(2), CRm( 0), Op1( 0), Op2( 0), is32, + access_vm_reg, reset_unknown, c2_TTBR0 }, + { CRn(2), CRm( 0), Op1( 0), Op2( 1), is32, + access_vm_reg, reset_unknown, c2_TTBR1 }, { CRn( 2), CRm( 0), Op1( 0), Op2( 2), is32, - NULL, reset_val, c2_TTBCR, 0x00000000 }, + access_vm_reg, reset_val, c2_TTBCR, 0x00000000 }, + { CRm64( 2), Op1( 1), is64, access_vm_reg, reset_unknown64, c2_TTBR1 }, + /* DACR: swapped by interrupt.S. 
*/ { CRn( 3), CRm( 0), Op1( 0), Op2( 0), is32, - NULL, reset_unknown, c3_DACR }, + access_vm_reg, reset_unknown, c3_DACR }, /* DFSR/IFSR/ADFSR/AIFSR: swapped by interrupt.S. */ { CRn( 5), CRm( 0), Op1( 0), Op2( 0), is32, - NULL, reset_unknown, c5_DFSR }, + access_vm_reg, reset_unknown, c5_DFSR }, { CRn( 5), CRm( 0), Op1( 0), Op2( 1), is32, - NULL, reset_unknown, c5_IFSR }, + access_vm_reg, reset_unknown, c5_IFSR }, { CRn( 5), CRm( 1), Op1( 0), Op2( 0), is32, - NULL, reset_unknown, c5_ADFSR }, + access_vm_reg, reset_unknown, c5_ADFSR }, { CRn( 5), CRm( 1), Op1( 0), Op2( 1), is32, - NULL, reset_unknown, c5_AIFSR }, + access_vm_reg, reset_unknown, c5_AIFSR }, /* DFAR/IFAR: swapped by interrupt.S. */ { CRn( 6), CRm( 0), Op1( 0), Op2( 0), is32, - NULL, reset_unknown, c6_DFAR }, + access_vm_reg, reset_unknown, c6_DFAR }, { CRn( 6), CRm( 0), Op1( 0), Op2( 2), is32, - NULL, reset_unknown, c6_IFAR }, + access_vm_reg, reset_unknown, c6_IFAR }, /* PAR swapped by interrupt.S */ { CRm64( 7), Op1( 0), is64, NULL, reset_unknown64, c7_PAR }, @@ -324,9 +366,9 @@ static const struct coproc_reg cp15_regs[] = { /* PRRR/NMRR (aka MAIR0/MAIR1): swapped by interrupt.S. */ { CRn(10), CRm( 2), Op1( 0), Op2( 0), is32, - NULL, reset_unknown, c10_PRRR}, + access_vm_reg, reset_unknown, c10_PRRR}, { CRn(10), CRm( 2), Op1( 0), Op2( 1), is32, - NULL, reset_unknown, c10_NMRR}, + access_vm_reg, reset_unknown, c10_NMRR}, /* AMAIR0/AMAIR1: swapped by interrupt.S. */ { CRn(10), CRm( 3), Op1( 0), Op2( 0), is32, @@ -340,7 +382,7 @@ static const struct coproc_reg cp15_regs[] = { /* CONTEXTIDR/TPIDRURW/TPIDRURO/TPIDRPRW: swapped by interrupt.S. 
*/ { CRn(13), CRm( 0), Op1( 0), Op2( 1), is32, - NULL, reset_val, c13_CID, 0x00000000 }, + access_vm_reg, reset_val, c13_CID, 0x00000000 }, { CRn(13), CRm( 0), Op1( 0), Op2( 2), is32, NULL, reset_unknown, c13_TID_URW }, { CRn(13), CRm( 0), Op1( 0), Op2( 3), is32, diff --git a/arch/arm/kvm/coproc.h b/arch/arm/kvm/coproc.h index 8dda870..1a44bbe 100644 --- a/arch/arm/kvm/coproc.h +++ b/arch/arm/kvm/coproc.h @@ -153,4 +153,8 @@ static inline int cmp_reg(const struct coproc_reg *i1, #define is64 .is_64 = true #define is32 .is_64 = false +bool access_sctlr(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + const struct coproc_reg *r); + #endif /* __ARM_KVM_COPROC_LOCAL_H__ */ diff --git a/arch/arm/kvm/coproc_a15.c b/arch/arm/kvm/coproc_a15.c index bb0cac1..e6f4ae4 100644 --- a/arch/arm/kvm/coproc_a15.c +++ b/arch/arm/kvm/coproc_a15.c @@ -34,7 +34,7 @@ static const struct coproc_reg a15_regs[] = { /* SCTLR: swapped by interrupt.S. */ { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32, - NULL, reset_val, c1_SCTLR, 0x00C50078 }, + access_sctlr, reset_val, c1_SCTLR, 0x00C50078 }, }; static struct kvm_coproc_target_table a15_target_table = { diff --git a/arch/arm/kvm/coproc_a7.c b/arch/arm/kvm/coproc_a7.c index 1df7673..17fc7cd 100644 --- a/arch/arm/kvm/coproc_a7.c +++ b/arch/arm/kvm/coproc_a7.c @@ -37,7 +37,7 @@ static const struct coproc_reg a7_regs[] = { /* SCTLR: swapped by interrupt.S. 
*/ { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32, - NULL, reset_val, c1_SCTLR, 0x00C50878 }, + access_sctlr, reset_val, c1_SCTLR, 0x00C50878 }, }; static struct kvm_coproc_target_table a7_target_table = { -- cgit v0.10.2 From 56041bf920d2937b7cadcb30cb206f0372eee814 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jan 2014 17:38:33 +0000 Subject: ARM: KVM: fix warning in mmu.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling with THP enabled leads to the following warning: arch/arm/kvm/mmu.c: In function ‘unmap_range’: arch/arm/kvm/mmu.c:177:39: warning: ‘pte’ may be used uninitialized in this function [-Wmaybe-uninitialized] if (kvm_pmd_huge(*pmd) || page_empty(pte)) { ^ Code inspection reveals that these two cases are mutually exclusive, so GCC is a bit overzealous here. Silence it anyway by initializing pte to NULL and testing it later on. Signed-off-by: Marc Zyngier Acked-by: Christoffer Dall diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index d7e998c..80bb1e6 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -144,6 +144,7 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, while (addr < end) { pgd = pgdp + pgd_index(addr); pud = pud_offset(pgd, addr); + pte = NULL; if (pud_none(*pud)) { addr = kvm_pud_addr_end(addr, end); continue; @@ -174,7 +175,7 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, /* * If the pmd entry is to be cleared, walk back up the ladder */ - if (kvm_pmd_huge(*pmd) || page_empty(pte)) { + if (kvm_pmd_huge(*pmd) || (pte && page_empty(pte))) { clear_pmd_entry(kvm, pmd, addr); next = kvm_pmd_addr_end(addr, end); if (page_empty(pmd) && !page_empty(pud)) { -- cgit v0.10.2 From ccf9844e5d99c1ee9a5b8c4f1332ac5211cbce03 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 27 Feb 2014 22:54:11 +0100 Subject: kvm, vmx: Really fix lazy FPU on nested guest Commit e504c9098ed6 (kvm, vmx: Fix lazy FPU on nested guest, 2013-11-13) highlighted a real problem, but 
the fix was subtly wrong. nested_read_cr0 is the CR0 as read by L2, but here we want to look at the CR0 value reflecting L1's setup. In other words, L2 might think that TS=0 (so nested_read_cr0 has the bit clear); but if L1 is actually running it with TS=1, we should inject the fault into L1. The effective value of CR0 in L2 is contained in vmcs12->guest_cr0, use it. Fixes: e504c9098ed6acd9e1079c5e10e4910724ad429f Reported-by: Kashyap Chamarty Reported-by: Stefan Bader Tested-by: Kashyap Chamarty Tested-by: Anthoine Bourgeois Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 83ee24f..53c324f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6699,7 +6699,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) else if (is_page_fault(intr_info)) return enable_ept; else if (is_no_device(intr_info) && - !(nested_read_cr0(vmcs12) & X86_CR0_TS)) + !(vmcs12->guest_cr0 & X86_CR0_TS)) return 0; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); -- cgit v0.10.2 From 672550fb682e9935e1a318bf4ac3f611a057dee1 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 10 Feb 2014 15:32:19 +0100 Subject: KVM: s390: Provide access to program parameter commit d208c79d63e06457eef077af770d23dc4cde4d43 (KVM: s390: Enable the LPP facility for guests) enabled the LPP instruction for guests. We should expose the program parameter as a pseudo register for migration/reset etc. Lets also reset this value on initial CPU reset. Signed-off-by: Christian Borntraeger Reviewed-by: Thomas Huth Reviewed-by: Jason J. 
Herne diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 2c69ba2..062b78c 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -107,7 +107,9 @@ struct kvm_s390_sie_block { __u64 gbea; /* 0x0180 */ __u8 reserved188[24]; /* 0x0188 */ __u32 fac; /* 0x01a0 */ - __u8 reserved1a4[68]; /* 0x01a4 */ + __u8 reserved1a4[58]; /* 0x01a4 */ + __u64 pp; /* 0x01de */ + __u8 reserved1e6[2]; /* 0x01e6 */ __u64 itdba; /* 0x01e8 */ __u8 reserved1f0[16]; /* 0x01f0 */ } __attribute__((packed)); diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index cb4c1eb8..7663244 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -76,4 +76,5 @@ struct kvm_sync_regs { #define KVM_REG_S390_PFTOKEN (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x5) #define KVM_REG_S390_PFCOMPARE (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x6) #define KVM_REG_S390_PFSELECT (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x7) +#define KVM_REG_S390_PP (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x8) #endif diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 18959bb..0262936 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -386,6 +386,7 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) vcpu->arch.guest_fpregs.fpc = 0; asm volatile("lfpc %0" : : "Q" (vcpu->arch.guest_fpregs.fpc)); vcpu->arch.sie_block->gbea = 1; + vcpu->arch.sie_block->pp = 0; vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; kvm_clear_async_pf_completion_queue(vcpu); atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); @@ -571,6 +572,10 @@ static int kvm_arch_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, r = put_user(vcpu->arch.pfault_select, (u64 __user *)reg->addr); break; + case KVM_REG_S390_PP: + r = put_user(vcpu->arch.sie_block->pp, + (u64 __user *)reg->addr); + break; default: break; } @@ -612,6 +617,10 @@ static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, r = 
get_user(vcpu->arch.pfault_select, (u64 __user *)reg->addr); break; + case KVM_REG_S390_PP: + r = get_user(vcpu->arch.sie_block->pp, + (u64 __user *)reg->addr); + break; default: break; } -- cgit v0.10.2 From afa45ff521130cee79a50b565693388be8c8c9c2 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 10 Feb 2014 15:39:23 +0100 Subject: KVM: s390: expose gbea register to userspace For migration/reset we want to expose the guest breaking event address register to userspace. Lets use ONE_REG for that purpose. Signed-off-by: Christian Borntraeger Reviewed-by: Jason J. Herne diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 7663244..2f0ade2 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -77,4 +77,5 @@ struct kvm_sync_regs { #define KVM_REG_S390_PFCOMPARE (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x6) #define KVM_REG_S390_PFSELECT (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x7) #define KVM_REG_S390_PP (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x8) +#define KVM_REG_S390_GBEA (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x9) #endif diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0262936..a3e4c07 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -576,6 +576,10 @@ static int kvm_arch_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, r = put_user(vcpu->arch.sie_block->pp, (u64 __user *)reg->addr); break; + case KVM_REG_S390_GBEA: + r = put_user(vcpu->arch.sie_block->gbea, + (u64 __user *)reg->addr); + break; default: break; } @@ -621,6 +625,10 @@ static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, r = get_user(vcpu->arch.sie_block->pp, (u64 __user *)reg->addr); break; + case KVM_REG_S390_GBEA: + r = get_user(vcpu->arch.sie_block->gbea, + (u64 __user *)reg->addr); + break; default: break; } -- cgit v0.10.2 From ff520a6327e83ef55515d1be3d0a1b10c084f59c Mon Sep 17 00:00:00 2001 From: Jens Freimann Date: Mon, 24 Feb 2014 10:11:41 +0100 Subject: KVM: s390: Simplify 
online vcpus counting for stsi We don't need to loop over all cpus to get the number of vcpus. Let's use the available counter online_vcpus instead. Signed-off-by: Jens Freimann Signed-off-by: Christian Borntraeger diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 75beea6..ae9e8ee 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -396,15 +396,10 @@ static int handle_stidp(struct kvm_vcpu *vcpu) static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem) { - struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; int cpus = 0; int n; - spin_lock(&fi->lock); - for (n = 0; n < KVM_MAX_VCPUS; n++) - if (fi->local_int[n]) - cpus++; - spin_unlock(&fi->lock); + cpus = atomic_read(&vcpu->kvm->online_vcpus); /* deal with other level 3 hypervisors */ if (stsi(mem, 3, 2, 2)) -- cgit v0.10.2 From 13b191ae4afc0c29a5cd768f521ede5c72a608cb Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Mon, 27 Jan 2014 12:06:19 +0100 Subject: KVM: s390: Fixed CC of SIGP SET_PREFIX handler When SIGP SET_PREFIX is called with an illegal CPU id, it must return the condition code 3 ("not operational") instead of 1. Also fixed the order in which the checks are done - CC3 has a higher priority than CC1. And while we're at it, this patch also get rid of the floating interrupt lock here by using kvm_get_vcpu() to get the local_int struct of the destination CPU. 
Signed-off-by: Thomas Huth Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index fe9442d..466eefa 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -249,12 +249,18 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter) static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address, u64 *reg) { - struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; - struct kvm_s390_local_interrupt *li = NULL; + struct kvm_s390_local_interrupt *li; + struct kvm_vcpu *dst_vcpu = NULL; struct kvm_s390_interrupt_info *inti; int rc; u8 tmp; + if (cpu_addr < KVM_MAX_VCPUS) + dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); + if (!dst_vcpu) + return SIGP_CC_NOT_OPERATIONAL; + li = &dst_vcpu->arch.local_int; + /* make sure that the new value is valid memory */ address = address & 0x7fffe000u; if (copy_from_guest_absolute(vcpu, &tmp, address, 1) || @@ -268,18 +274,6 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address, if (!inti) return SIGP_CC_BUSY; - spin_lock(&fi->lock); - if (cpu_addr < KVM_MAX_VCPUS) - li = fi->local_int[cpu_addr]; - - if (li == NULL) { - *reg &= 0xffffffff00000000UL; - *reg |= SIGP_STATUS_INCORRECT_STATE; - rc = SIGP_CC_STATUS_STORED; - kfree(inti); - goto out_fi; - } - spin_lock_bh(&li->lock); /* cpu must be in stopped state */ if (!(atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) { @@ -302,8 +296,6 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address, VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", cpu_addr, address); out_li: spin_unlock_bh(&li->lock); -out_fi: - spin_unlock(&fi->lock); return rc; } -- cgit v0.10.2 From 1ee0bc559dc34fe36a29494faf7b7c91533bd31c Mon Sep 17 00:00:00 2001 From: Jens Freimann Date: Tue, 25 Feb 2014 15:36:45 +0100 Subject: KVM: s390: get rid of local_int array We can use kvm_get_vcpu() now and don't need the local_int array in the floating_int struct anymore. 
This also means we don't have to hold the float_int.lock in some places. Signed-off-by: Jens Freimann Signed-off-by: Christian Borntraeger diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 062b78c..734d302 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -215,7 +215,6 @@ struct kvm_s390_float_interrupt { int next_rr_cpu; unsigned long idle_mask[(KVM_MAX_VCPUS + sizeof(long) - 1) / sizeof(long)]; - struct kvm_s390_local_interrupt *local_int[KVM_MAX_VCPUS]; unsigned int irq_count; }; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index fff070b..1d0f9d5 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -692,6 +692,7 @@ static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) struct kvm_s390_local_interrupt *li; struct kvm_s390_float_interrupt *fi; struct kvm_s390_interrupt_info *iter; + struct kvm_vcpu *dst_vcpu = NULL; int sigcpu; int rc = 0; @@ -726,9 +727,10 @@ static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) sigcpu = fi->next_rr_cpu++; if (sigcpu == KVM_MAX_VCPUS) sigcpu = fi->next_rr_cpu = 0; - } while (fi->local_int[sigcpu] == NULL); + } while (kvm_get_vcpu(kvm, sigcpu) == NULL); } - li = fi->local_int[sigcpu]; + dst_vcpu = kvm_get_vcpu(kvm, sigcpu); + li = &dst_vcpu->arch.local_int; spin_lock_bh(&li->lock); atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); if (waitqueue_active(li->wq)) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index a3e4c07..9136f8d 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -460,11 +460,8 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, spin_lock_init(&vcpu->arch.local_int.lock); INIT_LIST_HEAD(&vcpu->arch.local_int.list); vcpu->arch.local_int.float_int = &kvm->arch.float_int; - spin_lock(&kvm->arch.float_int.lock); - kvm->arch.float_int.local_int[id] = &vcpu->arch.local_int; vcpu->arch.local_int.wq = &vcpu->wq; 
vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags; - spin_unlock(&kvm->arch.float_int.lock); rc = kvm_vcpu_init(vcpu, kvm, id); if (rc) @@ -952,7 +949,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); - BUG_ON(vcpu->kvm->arch.float_int.local_int[vcpu->vcpu_id] == NULL); + BUG_ON(kvm_get_vcpu(vcpu->kvm, vcpu->vcpu_id) == NULL); switch (kvm_run->exit_reason) { case KVM_EXIT_S390_SIEIC: diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 466eefa..3fe44c4 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -23,29 +23,30 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr, u64 *reg) { - struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; + struct kvm_s390_local_interrupt *li; + struct kvm_vcpu *dst_vcpu = NULL; + int cpuflags; int rc; if (cpu_addr >= KVM_MAX_VCPUS) return SIGP_CC_NOT_OPERATIONAL; - spin_lock(&fi->lock); - if (fi->local_int[cpu_addr] == NULL) - rc = SIGP_CC_NOT_OPERATIONAL; - else if (!(atomic_read(fi->local_int[cpu_addr]->cpuflags) - & (CPUSTAT_ECALL_PEND | CPUSTAT_STOPPED))) + dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); + if (!dst_vcpu) + return SIGP_CC_NOT_OPERATIONAL; + li = &dst_vcpu->arch.local_int; + + cpuflags = atomic_read(li->cpuflags); + if (!(cpuflags & (CPUSTAT_ECALL_PEND | CPUSTAT_STOPPED))) rc = SIGP_CC_ORDER_CODE_ACCEPTED; else { *reg &= 0xffffffff00000000UL; - if (atomic_read(fi->local_int[cpu_addr]->cpuflags) - & CPUSTAT_ECALL_PEND) + if (cpuflags & CPUSTAT_ECALL_PEND) *reg |= SIGP_STATUS_EXT_CALL_PENDING; - if (atomic_read(fi->local_int[cpu_addr]->cpuflags) - & CPUSTAT_STOPPED) + if (cpuflags & CPUSTAT_STOPPED) *reg |= SIGP_STATUS_STOPPED; rc = SIGP_CC_STATUS_STORED; } - spin_unlock(&fi->lock); VCPU_EVENT(vcpu, 4, "sensed status of cpu %x rc %x", cpu_addr, rc); return rc; @@ -53,10 +54,9 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr, static int 
__sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr) { - struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; struct kvm_s390_local_interrupt *li; struct kvm_s390_interrupt_info *inti; - int rc; + struct kvm_vcpu *dst_vcpu = NULL; if (cpu_addr >= KVM_MAX_VCPUS) return SIGP_CC_NOT_OPERATIONAL; @@ -68,13 +68,10 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr) inti->type = KVM_S390_INT_EMERGENCY; inti->emerg.code = vcpu->vcpu_id; - spin_lock(&fi->lock); - li = fi->local_int[cpu_addr]; - if (li == NULL) { - rc = SIGP_CC_NOT_OPERATIONAL; - kfree(inti); - goto unlock; - } + dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); + if (!dst_vcpu) + return SIGP_CC_NOT_OPERATIONAL; + li = &dst_vcpu->arch.local_int; spin_lock_bh(&li->lock); list_add_tail(&inti->list, &li->list); atomic_set(&li->active, 1); @@ -82,11 +79,9 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr) if (waitqueue_active(li->wq)) wake_up_interruptible(li->wq); spin_unlock_bh(&li->lock); - rc = SIGP_CC_ORDER_CODE_ACCEPTED; VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr); -unlock: - spin_unlock(&fi->lock); - return rc; + + return SIGP_CC_ORDER_CODE_ACCEPTED; } static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr, @@ -122,10 +117,9 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr, static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr) { - struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; struct kvm_s390_local_interrupt *li; struct kvm_s390_interrupt_info *inti; - int rc; + struct kvm_vcpu *dst_vcpu = NULL; if (cpu_addr >= KVM_MAX_VCPUS) return SIGP_CC_NOT_OPERATIONAL; @@ -137,13 +131,10 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr) inti->type = KVM_S390_INT_EXTERNAL_CALL; inti->extcall.code = vcpu->vcpu_id; - spin_lock(&fi->lock); - li = fi->local_int[cpu_addr]; - if (li == NULL) { - rc = SIGP_CC_NOT_OPERATIONAL; - kfree(inti); - goto unlock; - } + 
dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); + if (!dst_vcpu) + return SIGP_CC_NOT_OPERATIONAL; + li = &dst_vcpu->arch.local_int; spin_lock_bh(&li->lock); list_add_tail(&inti->list, &li->list); atomic_set(&li->active, 1); @@ -151,11 +142,9 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr) if (waitqueue_active(li->wq)) wake_up_interruptible(li->wq); spin_unlock_bh(&li->lock); - rc = SIGP_CC_ORDER_CODE_ACCEPTED; VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr); -unlock: - spin_unlock(&fi->lock); - return rc; + + return SIGP_CC_ORDER_CODE_ACCEPTED; } static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action) @@ -189,31 +178,26 @@ out: static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action) { - struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; struct kvm_s390_local_interrupt *li; + struct kvm_vcpu *dst_vcpu = NULL; int rc; if (cpu_addr >= KVM_MAX_VCPUS) return SIGP_CC_NOT_OPERATIONAL; - spin_lock(&fi->lock); - li = fi->local_int[cpu_addr]; - if (li == NULL) { - rc = SIGP_CC_NOT_OPERATIONAL; - goto unlock; - } + dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); + if (!dst_vcpu) + return SIGP_CC_NOT_OPERATIONAL; + li = &dst_vcpu->arch.local_int; rc = __inject_sigp_stop(li, action); -unlock: - spin_unlock(&fi->lock); VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", cpu_addr); if ((action & ACTION_STORE_ON_STOP) != 0 && rc == -ESHUTDOWN) { /* If the CPU has already been stopped, we still have * to save the status when doing stop-and-store. This * has to be done after unlocking all spinlocks. 
*/ - struct kvm_vcpu *dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); rc = kvm_s390_store_status_unloaded(dst_vcpu, KVM_S390_STORE_STATUS_NOADDR); } @@ -333,28 +317,26 @@ static int __sigp_store_status_at_addr(struct kvm_vcpu *vcpu, u16 cpu_id, static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr, u64 *reg) { + struct kvm_s390_local_interrupt *li; + struct kvm_vcpu *dst_vcpu = NULL; int rc; - struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; if (cpu_addr >= KVM_MAX_VCPUS) return SIGP_CC_NOT_OPERATIONAL; - spin_lock(&fi->lock); - if (fi->local_int[cpu_addr] == NULL) - rc = SIGP_CC_NOT_OPERATIONAL; - else { - if (atomic_read(fi->local_int[cpu_addr]->cpuflags) - & CPUSTAT_RUNNING) { - /* running */ - rc = SIGP_CC_ORDER_CODE_ACCEPTED; - } else { - /* not running */ - *reg &= 0xffffffff00000000UL; - *reg |= SIGP_STATUS_NOT_RUNNING; - rc = SIGP_CC_STATUS_STORED; - } + dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); + if (!dst_vcpu) + return SIGP_CC_NOT_OPERATIONAL; + li = &dst_vcpu->arch.local_int; + if (atomic_read(li->cpuflags) & CPUSTAT_RUNNING) { + /* running */ + rc = SIGP_CC_ORDER_CODE_ACCEPTED; + } else { + /* not running */ + *reg &= 0xffffffff00000000UL; + *reg |= SIGP_STATUS_NOT_RUNNING; + rc = SIGP_CC_STATUS_STORED; } - spin_unlock(&fi->lock); VCPU_EVENT(vcpu, 4, "sensed running status of cpu %x rc %x", cpu_addr, rc); @@ -365,26 +347,22 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr, /* Test whether the destination CPU is available and not busy */ static int sigp_check_callable(struct kvm_vcpu *vcpu, u16 cpu_addr) { - struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; struct kvm_s390_local_interrupt *li; int rc = SIGP_CC_ORDER_CODE_ACCEPTED; + struct kvm_vcpu *dst_vcpu = NULL; if (cpu_addr >= KVM_MAX_VCPUS) return SIGP_CC_NOT_OPERATIONAL; - spin_lock(&fi->lock); - li = fi->local_int[cpu_addr]; - if (li == NULL) { - rc = SIGP_CC_NOT_OPERATIONAL; - goto out; - } - + dst_vcpu = 
kvm_get_vcpu(vcpu->kvm, cpu_addr); + if (!dst_vcpu) + return SIGP_CC_NOT_OPERATIONAL; + li = &dst_vcpu->arch.local_int; spin_lock_bh(&li->lock); if (li->action_bits & ACTION_STOP_ON_STOP) rc = SIGP_CC_BUSY; spin_unlock_bh(&li->lock); -out: - spin_unlock(&fi->lock); + return rc; } -- cgit v0.10.2 From 2e0210432d34bc7f01644905c2bb2d5d9be5b6ac Mon Sep 17 00:00:00 2001 From: Heinz Graalfs Date: Thu, 27 Feb 2014 14:34:35 +0100 Subject: virtio_ccw: fix vcdev pointer handling issues The interrupt handler virtio_ccw_int_handler() using the vcdev pointer is protected by the ccw_device lock. Resetting the pointer within the ccw_device structure should be done when holding this lock. Also resetting the vcdev pointer (under the ccw_device lock) prior to freeing the vcdev pointer memory removes a critical path. Signed-off-by: Heinz Graalfs Acked-by: Cornelia Huck Signed-off-by: Christian Borntraeger diff --git a/drivers/s390/kvm/virtio_ccw.c b/drivers/s390/kvm/virtio_ccw.c index 0fc5848..413c630 100644 --- a/drivers/s390/kvm/virtio_ccw.c +++ b/drivers/s390/kvm/virtio_ccw.c @@ -636,6 +636,8 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev, struct virtqueue *vq; struct virtio_driver *drv; + if (!vcdev) + return; /* Check if it's a notification from the host. 
*/ if ((intparm == 0) && (scsw_stctl(&irb->scsw) == @@ -734,23 +736,37 @@ static int virtio_ccw_probe(struct ccw_device *cdev) return 0; } +static struct virtio_ccw_device *virtio_grab_drvdata(struct ccw_device *cdev) +{ + unsigned long flags; + struct virtio_ccw_device *vcdev; + + spin_lock_irqsave(get_ccwdev_lock(cdev), flags); + vcdev = dev_get_drvdata(&cdev->dev); + if (!vcdev) { + spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); + return NULL; + } + dev_set_drvdata(&cdev->dev, NULL); + spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); + return vcdev; +} + static void virtio_ccw_remove(struct ccw_device *cdev) { - struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev); + struct virtio_ccw_device *vcdev = virtio_grab_drvdata(cdev); - if (cdev->online) { + if (vcdev && cdev->online) unregister_virtio_device(&vcdev->vdev); - dev_set_drvdata(&cdev->dev, NULL); - } cdev->handler = NULL; } static int virtio_ccw_offline(struct ccw_device *cdev) { - struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev); + struct virtio_ccw_device *vcdev = virtio_grab_drvdata(cdev); - unregister_virtio_device(&vcdev->vdev); - dev_set_drvdata(&cdev->dev, NULL); + if (vcdev) + unregister_virtio_device(&vcdev->vdev); return 0; } @@ -759,6 +775,7 @@ static int virtio_ccw_online(struct ccw_device *cdev) { int ret; struct virtio_ccw_device *vcdev; + unsigned long flags; vcdev = kzalloc(sizeof(*vcdev), GFP_KERNEL); if (!vcdev) { @@ -786,7 +803,9 @@ static int virtio_ccw_online(struct ccw_device *cdev) INIT_LIST_HEAD(&vcdev->virtqueues); spin_lock_init(&vcdev->lock); + spin_lock_irqsave(get_ccwdev_lock(cdev), flags); dev_set_drvdata(&cdev->dev, vcdev); + spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); vcdev->vdev.id.vendor = cdev->id.cu_type; vcdev->vdev.id.device = cdev->id.cu_model; ret = register_virtio_device(&vcdev->vdev); @@ -797,7 +816,9 @@ static int virtio_ccw_online(struct ccw_device *cdev) } return 0; out_put: + spin_lock_irqsave(get_ccwdev_lock(cdev), 
flags); dev_set_drvdata(&cdev->dev, NULL); + spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); put_device(&vcdev->vdev.dev); return ret; out_free: -- cgit v0.10.2 From 84ec96a6150477b9509664557bc6ad4eaa21f72a Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 13 Feb 2014 13:02:32 +0100 Subject: s390/airq: add support for irq ranges Add airq_iv_alloc and airq_iv_free to allocate and free consecutive ranges of irqs from the interrupt vector. Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger diff --git a/arch/s390/include/asm/airq.h b/arch/s390/include/asm/airq.h index 4bbb595..bd93ff6 100644 --- a/arch/s390/include/asm/airq.h +++ b/arch/s390/include/asm/airq.h @@ -44,11 +44,21 @@ struct airq_iv { struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags); void airq_iv_release(struct airq_iv *iv); -unsigned long airq_iv_alloc_bit(struct airq_iv *iv); -void airq_iv_free_bit(struct airq_iv *iv, unsigned long bit); +unsigned long airq_iv_alloc(struct airq_iv *iv, unsigned long num); +void airq_iv_free(struct airq_iv *iv, unsigned long bit, unsigned long num); unsigned long airq_iv_scan(struct airq_iv *iv, unsigned long start, unsigned long end); +static inline unsigned long airq_iv_alloc_bit(struct airq_iv *iv) +{ + return airq_iv_alloc(iv, 1); +} + +static inline void airq_iv_free_bit(struct airq_iv *iv, unsigned long bit) +{ + airq_iv_free(iv, bit, 1); +} + static inline unsigned long airq_iv_end(struct airq_iv *iv) { return iv->end; diff --git a/drivers/s390/cio/airq.c b/drivers/s390/cio/airq.c index f055df0..445564c 100644 --- a/drivers/s390/cio/airq.c +++ b/drivers/s390/cio/airq.c @@ -186,55 +186,71 @@ void airq_iv_release(struct airq_iv *iv) EXPORT_SYMBOL(airq_iv_release); /** - * airq_iv_alloc_bit - allocate an irq bit from an interrupt vector + * airq_iv_alloc - allocate irq bits from an interrupt vector * @iv: pointer to an interrupt vector structure + * @num: number of consecutive irq bits to allocate * - * 
Returns the bit number of the allocated irq, or -1UL if no bit - * is available or the AIRQ_IV_ALLOC flag has not been specified + * Returns the bit number of the first irq in the allocated block of irqs, + * or -1UL if no bit is available or the AIRQ_IV_ALLOC flag has not been + * specified */ -unsigned long airq_iv_alloc_bit(struct airq_iv *iv) +unsigned long airq_iv_alloc(struct airq_iv *iv, unsigned long num) { - unsigned long bit; + unsigned long bit, i; - if (!iv->avail) + if (!iv->avail || num == 0) return -1UL; spin_lock(&iv->lock); bit = find_first_bit_inv(iv->avail, iv->bits); - if (bit < iv->bits) { - clear_bit_inv(bit, iv->avail); - if (bit >= iv->end) - iv->end = bit + 1; - } else + while (bit + num <= iv->bits) { + for (i = 1; i < num; i++) + if (!test_bit_inv(bit + i, iv->avail)) + break; + if (i >= num) { + /* Found a suitable block of irqs */ + for (i = 0; i < num; i++) + clear_bit_inv(bit + i, iv->avail); + if (bit + num >= iv->end) + iv->end = bit + num + 1; + break; + } + bit = find_next_bit_inv(iv->avail, iv->bits, bit + i + 1); + } + if (bit + num > iv->bits) bit = -1UL; spin_unlock(&iv->lock); return bit; } -EXPORT_SYMBOL(airq_iv_alloc_bit); +EXPORT_SYMBOL(airq_iv_alloc); /** - * airq_iv_free_bit - free an irq bit of an interrupt vector + * airq_iv_free - free irq bits of an interrupt vector * @iv: pointer to interrupt vector structure - * @bit: number of the irq bit to free + * @bit: number of the first irq bit to free + * @num: number of consecutive irq bits to free */ -void airq_iv_free_bit(struct airq_iv *iv, unsigned long bit) +void airq_iv_free(struct airq_iv *iv, unsigned long bit, unsigned long num) { - if (!iv->avail) + unsigned long i; + + if (!iv->avail || num == 0) return; spin_lock(&iv->lock); - /* Clear (possibly left over) interrupt bit */ - clear_bit_inv(bit, iv->vector); - /* Make the bit position available again */ - set_bit_inv(bit, iv->avail); - if (bit == iv->end - 1) { + for (i = 0; i < num; i++) { + /* Clear (possibly 
left over) interrupt bit */ + clear_bit_inv(bit + i, iv->vector); + /* Make the bit positions available again */ + set_bit_inv(bit + i, iv->avail); + } + if (bit + num >= iv->end) { /* Find new end of bit-field */ - while (--iv->end > 0) - if (!test_bit_inv(iv->end - 1, iv->avail)) - break; + while (iv->end > 0 && !test_bit_inv(iv->end - 1, iv->avail)) + iv->end--; } spin_unlock(&iv->lock); } -EXPORT_SYMBOL(airq_iv_free_bit); +EXPORT_SYMBOL(airq_iv_free); /** * airq_iv_scan - scan interrupt vector for non-zero bits -- cgit v0.10.2 From 96b14536d935848cffd904f583f67c66169002d8 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Wed, 6 Feb 2013 10:23:39 +0100 Subject: virtio-ccw: virtio-ccw adapter interrupt support. Implement the new CCW_CMD_SET_IND_ADAPTER command and try to enable adapter interrupts for every device on the first startup. If the host does not support adapter interrupts, fall back to normal I/O interrupts. virtio-ccw adapter interrupts use the same isc as normal I/O subchannels and share a summary indicator for all devices sharing the same indicator area. Indicator bits for the individual virtqueues may be contained in the same indicator area for different devices. 
Signed-off-by: Cornelia Huck Signed-off-by: Christian Borntraeger diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h index 5f8bcc5..35f0faa 100644 --- a/arch/s390/include/asm/irq.h +++ b/arch/s390/include/asm/irq.h @@ -53,6 +53,7 @@ enum interruption_class { IRQIO_PCI, IRQIO_MSI, IRQIO_VIR, + IRQIO_VAI, NMI_NMI, CPU_RST, NR_ARCH_IRQS diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index bb27a26..c288ef7 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -84,6 +84,7 @@ static const struct irq_class irqclass_sub_desc[NR_ARCH_IRQS] = { [IRQIO_PCI] = {.name = "PCI", .desc = "[I/O] PCI Interrupt" }, [IRQIO_MSI] = {.name = "MSI", .desc = "[I/O] MSI Interrupt" }, [IRQIO_VIR] = {.name = "VIR", .desc = "[I/O] Virtual I/O Devices"}, + [IRQIO_VAI] = {.name = "VAI", .desc = "[I/O] Virtual I/O Devices AI"}, [NMI_NMI] = {.name = "NMI", .desc = "[NMI] Machine Check"}, [CPU_RST] = {.name = "RST", .desc = "[CPU] CPU Restart"}, }; diff --git a/drivers/s390/kvm/virtio_ccw.c b/drivers/s390/kvm/virtio_ccw.c index 413c630..6a2b5fd 100644 --- a/drivers/s390/kvm/virtio_ccw.c +++ b/drivers/s390/kvm/virtio_ccw.c @@ -1,7 +1,7 @@ /* * ccw based virtio transport * - * Copyright IBM Corp. 2012 + * Copyright IBM Corp. 
2012, 2014 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License (version 2 only) @@ -32,6 +32,8 @@ #include #include #include +#include +#include /* * virtio related functions @@ -58,6 +60,8 @@ struct virtio_ccw_device { unsigned long indicators; unsigned long indicators2; struct vq_config_block *config_block; + bool is_thinint; + void *airq_info; }; struct vq_info_block { @@ -72,15 +76,38 @@ struct virtio_feature_desc { __u8 index; } __packed; +struct virtio_thinint_area { + unsigned long summary_indicator; + unsigned long indicator; + u64 bit_nr; + u8 isc; +} __packed; + struct virtio_ccw_vq_info { struct virtqueue *vq; int num; void *queue; struct vq_info_block *info_block; + int bit_nr; struct list_head node; long cookie; }; +#define VIRTIO_AIRQ_ISC IO_SCH_ISC /* inherit from subchannel */ + +#define VIRTIO_IV_BITS (L1_CACHE_BYTES * 8) +#define MAX_AIRQ_AREAS 20 + +static int virtio_ccw_use_airq = 1; + +struct airq_info { + rwlock_t lock; + u8 summary_indicator; + struct airq_struct airq; + struct airq_iv *aiv; +}; +static struct airq_info *airq_areas[MAX_AIRQ_AREAS]; + #define CCW_CMD_SET_VQ 0x13 #define CCW_CMD_VDEV_RESET 0x33 #define CCW_CMD_SET_IND 0x43 @@ -91,6 +118,7 @@ struct virtio_ccw_vq_info { #define CCW_CMD_WRITE_CONF 0x21 #define CCW_CMD_WRITE_STATUS 0x31 #define CCW_CMD_READ_VQ_CONF 0x32 +#define CCW_CMD_SET_IND_ADAPTER 0x73 #define VIRTIO_CCW_DOING_SET_VQ 0x00010000 #define VIRTIO_CCW_DOING_RESET 0x00040000 @@ -102,6 +130,7 @@ struct virtio_ccw_vq_info { #define VIRTIO_CCW_DOING_SET_IND 0x01000000 #define VIRTIO_CCW_DOING_READ_VQ_CONF 0x02000000 #define VIRTIO_CCW_DOING_SET_CONF_IND 0x04000000 +#define VIRTIO_CCW_DOING_SET_IND_ADAPTER 0x08000000 #define VIRTIO_CCW_INTPARM_MASK 0xffff0000 static struct virtio_ccw_device *to_vc_device(struct virtio_device *vdev) @@ -109,6 +138,125 @@ static struct virtio_ccw_device *to_vc_device(struct virtio_device *vdev) return 
container_of(vdev, struct virtio_ccw_device, vdev); } +static void drop_airq_indicator(struct virtqueue *vq, struct airq_info *info) +{ + unsigned long i, flags; + + write_lock_irqsave(&info->lock, flags); + for (i = 0; i < airq_iv_end(info->aiv); i++) { + if (vq == (void *)airq_iv_get_ptr(info->aiv, i)) { + airq_iv_free_bit(info->aiv, i); + airq_iv_set_ptr(info->aiv, i, 0); + break; + } + } + write_unlock_irqrestore(&info->lock, flags); +} + +static void virtio_airq_handler(struct airq_struct *airq) +{ + struct airq_info *info = container_of(airq, struct airq_info, airq); + unsigned long ai; + + inc_irq_stat(IRQIO_VAI); + read_lock(&info->lock); + /* Walk through indicators field, summary indicator active. */ + for (ai = 0;;) { + ai = airq_iv_scan(info->aiv, ai, airq_iv_end(info->aiv)); + if (ai == -1UL) + break; + vring_interrupt(0, (void *)airq_iv_get_ptr(info->aiv, ai)); + } + info->summary_indicator = 0; + smp_wmb(); + /* Walk through indicators field, summary indicator not active. 
*/ + for (ai = 0;;) { + ai = airq_iv_scan(info->aiv, ai, airq_iv_end(info->aiv)); + if (ai == -1UL) + break; + vring_interrupt(0, (void *)airq_iv_get_ptr(info->aiv, ai)); + } + read_unlock(&info->lock); +} + +static struct airq_info *new_airq_info(void) +{ + struct airq_info *info; + int rc; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return NULL; + rwlock_init(&info->lock); + info->aiv = airq_iv_create(VIRTIO_IV_BITS, AIRQ_IV_ALLOC | AIRQ_IV_PTR); + if (!info->aiv) { + kfree(info); + return NULL; + } + info->airq.handler = virtio_airq_handler; + info->airq.lsi_ptr = &info->summary_indicator; + info->airq.lsi_mask = 0xff; + info->airq.isc = VIRTIO_AIRQ_ISC; + rc = register_adapter_interrupt(&info->airq); + if (rc) { + airq_iv_release(info->aiv); + kfree(info); + return NULL; + } + return info; +} + +static void destroy_airq_info(struct airq_info *info) +{ + if (!info) + return; + + unregister_adapter_interrupt(&info->airq); + airq_iv_release(info->aiv); + kfree(info); +} + +static unsigned long get_airq_indicator(struct virtqueue *vqs[], int nvqs, + u64 *first, void **airq_info) +{ + int i, j; + struct airq_info *info; + unsigned long indicator_addr = 0; + unsigned long bit, flags; + + for (i = 0; i < MAX_AIRQ_AREAS && !indicator_addr; i++) { + if (!airq_areas[i]) + airq_areas[i] = new_airq_info(); + info = airq_areas[i]; + if (!info) + return 0; + write_lock_irqsave(&info->lock, flags); + bit = airq_iv_alloc(info->aiv, nvqs); + if (bit == -1UL) { + /* Not enough vacancies. 
*/ + write_unlock_irqrestore(&info->lock, flags); + continue; + } + *first = bit; + *airq_info = info; + indicator_addr = (unsigned long)info->aiv->vector; + for (j = 0; j < nvqs; j++) { + airq_iv_set_ptr(info->aiv, bit + j, + (unsigned long)vqs[j]); + } + write_unlock_irqrestore(&info->lock, flags); + } + return indicator_addr; +} + +static void virtio_ccw_drop_indicators(struct virtio_ccw_device *vcdev) +{ + struct virtio_ccw_vq_info *info; + + list_for_each_entry(info, &vcdev->virtqueues, node) + drop_airq_indicator(info->vq, vcdev->airq_info); +} + static int doing_io(struct virtio_ccw_device *vcdev, __u32 flag) { unsigned long flags; @@ -145,6 +293,51 @@ static int ccw_io_helper(struct virtio_ccw_device *vcdev, return ret ? ret : vcdev->err; } +static void virtio_ccw_drop_indicator(struct virtio_ccw_device *vcdev, + struct ccw1 *ccw) +{ + int ret; + unsigned long *indicatorp = NULL; + struct virtio_thinint_area *thinint_area = NULL; + struct airq_info *airq_info = vcdev->airq_info; + + if (vcdev->is_thinint) { + thinint_area = kzalloc(sizeof(*thinint_area), + GFP_DMA | GFP_KERNEL); + if (!thinint_area) + return; + thinint_area->summary_indicator = + (unsigned long) &airq_info->summary_indicator; + thinint_area->isc = VIRTIO_AIRQ_ISC; + ccw->cmd_code = CCW_CMD_SET_IND_ADAPTER; + ccw->count = sizeof(*thinint_area); + ccw->cda = (__u32)(unsigned long) thinint_area; + } else { + indicatorp = kmalloc(sizeof(&vcdev->indicators), + GFP_DMA | GFP_KERNEL); + if (!indicatorp) + return; + *indicatorp = 0; + ccw->cmd_code = CCW_CMD_SET_IND; + ccw->count = sizeof(vcdev->indicators); + ccw->cda = (__u32)(unsigned long) indicatorp; + } + /* Deregister indicators from host. */ + vcdev->indicators = 0; + ccw->flags = 0; + ret = ccw_io_helper(vcdev, ccw, + vcdev->is_thinint ? 
+ VIRTIO_CCW_DOING_SET_IND_ADAPTER : + VIRTIO_CCW_DOING_SET_IND); + if (ret && (ret != -ENODEV)) + dev_info(&vcdev->cdev->dev, + "Failed to deregister indicators (%d)\n", ret); + else if (vcdev->is_thinint) + virtio_ccw_drop_indicators(vcdev); + kfree(indicatorp); + kfree(thinint_area); +} + static inline long do_kvm_notify(struct subchannel_id schid, unsigned long queue_index, long cookie) @@ -232,11 +425,13 @@ static void virtio_ccw_del_vqs(struct virtio_device *vdev) { struct virtqueue *vq, *n; struct ccw1 *ccw; + struct virtio_ccw_device *vcdev = to_vc_device(vdev); ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL); if (!ccw) return; + virtio_ccw_drop_indicator(vcdev, ccw); list_for_each_entry_safe(vq, n, &vdev->vqs, list) virtio_ccw_del_vq(vq, ccw); @@ -326,6 +521,54 @@ out_err: return ERR_PTR(err); } +static int virtio_ccw_register_adapter_ind(struct virtio_ccw_device *vcdev, + struct virtqueue *vqs[], int nvqs, + struct ccw1 *ccw) +{ + int ret; + struct virtio_thinint_area *thinint_area = NULL; + struct airq_info *info; + + thinint_area = kzalloc(sizeof(*thinint_area), GFP_DMA | GFP_KERNEL); + if (!thinint_area) { + ret = -ENOMEM; + goto out; + } + /* Try to get an indicator. */ + thinint_area->indicator = get_airq_indicator(vqs, nvqs, + &thinint_area->bit_nr, + &vcdev->airq_info); + if (!thinint_area->indicator) { + ret = -ENOSPC; + goto out; + } + info = vcdev->airq_info; + thinint_area->summary_indicator = + (unsigned long) &info->summary_indicator; + thinint_area->isc = VIRTIO_AIRQ_ISC; + ccw->cmd_code = CCW_CMD_SET_IND_ADAPTER; + ccw->flags = CCW_FLAG_SLI; + ccw->count = sizeof(*thinint_area); + ccw->cda = (__u32)(unsigned long)thinint_area; + ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_IND_ADAPTER); + if (ret) { + if (ret == -EOPNOTSUPP) { + /* + * The host does not support adapter interrupts + * for virtio-ccw, stop trying. 
+ */ + virtio_ccw_use_airq = 0; + pr_info("Adapter interrupts unsupported on host\n"); + } else + dev_warn(&vcdev->cdev->dev, + "enabling adapter interrupts = %d\n", ret); + virtio_ccw_drop_indicators(vcdev); + } +out: + kfree(thinint_area); + return ret; +} + static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], @@ -355,15 +598,23 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs, if (!indicatorp) goto out; *indicatorp = (unsigned long) &vcdev->indicators; - /* Register queue indicators with host. */ - vcdev->indicators = 0; - ccw->cmd_code = CCW_CMD_SET_IND; - ccw->flags = 0; - ccw->count = sizeof(vcdev->indicators); - ccw->cda = (__u32)(unsigned long) indicatorp; - ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_IND); - if (ret) - goto out; + if (vcdev->is_thinint) { + ret = virtio_ccw_register_adapter_ind(vcdev, vqs, nvqs, ccw); + if (ret) + /* no error, just fall back to legacy interrupts */ + vcdev->is_thinint = 0; + } + if (!vcdev->is_thinint) { + /* Register queue indicators with host. 
*/ + vcdev->indicators = 0; + ccw->cmd_code = CCW_CMD_SET_IND; + ccw->flags = 0; + ccw->count = sizeof(vcdev->indicators); + ccw->cda = (__u32)(unsigned long) indicatorp; + ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_IND); + if (ret) + goto out; + } /* Register indicators2 with host for config changes */ *indicatorp = (unsigned long) &vcdev->indicators2; vcdev->indicators2 = 0; @@ -665,6 +916,7 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev, case VIRTIO_CCW_DOING_SET_CONF_IND: case VIRTIO_CCW_DOING_RESET: case VIRTIO_CCW_DOING_READ_VQ_CONF: + case VIRTIO_CCW_DOING_SET_IND_ADAPTER: vcdev->curr_io &= ~activity; wake_up(&vcdev->wait_q); break; @@ -795,6 +1047,8 @@ static int virtio_ccw_online(struct ccw_device *cdev) goto out_free; } + vcdev->is_thinint = virtio_ccw_use_airq; /* at least try */ + vcdev->vdev.dev.parent = &cdev->dev; vcdev->vdev.dev.release = virtio_ccw_release_dev; vcdev->vdev.config = &virtio_ccw_config_ops; @@ -956,6 +1210,10 @@ module_init(virtio_ccw_init); static void __exit virtio_ccw_exit(void) { + int i; + ccw_driver_unregister(&virtio_ccw_driver); + for (i = 0; i < MAX_AIRQ_AREAS; i++) + destroy_airq_info(airq_areas[i]); } module_exit(virtio_ccw_exit); -- cgit v0.10.2 From 7e44e4495a398eb553ce561f29f9148f40a3448f Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 28 Feb 2014 12:52:54 +0100 Subject: x86: kvm: rate-limit global clock updates When we update a vcpu's local clock it may pick up an NTP correction. We can't wait an indeterminate amount of time for other vcpus to pick up that correction, so commit 0061d53daf26f introduced a global clock update. However, we can't request a global clock update on every vcpu load either (which is what happens if the tsc is marked as unstable). The solution is to rate-limit the global clock updates. 
Marcelo calculated that we should delay the global clock updates no more than 0.1s as follows: Assume an NTP correction c is applied to one vcpu, but not the other, then in n seconds the delta of the vcpu system_timestamps will be c * n. If we assume a correction of 500ppm (worst-case), then the two vcpus will diverge 50us in 0.1s, which is a considerable amount. Signed-off-by: Andrew Jones Signed-off-by: Paolo Bonzini diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e714f8c..9aa09d3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -598,6 +598,7 @@ struct kvm_arch { bool use_master_clock; u64 master_kernel_ns; cycle_t master_cycle_now; + struct delayed_work kvmclock_update_work; struct kvm_xen_hvm_config xen_hvm_config; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 773eba7..5ed9293 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1628,14 +1628,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * the others. * * So in those cases, request a kvmclock update for all vcpus. - * The worst case for a remote vcpu to update its kvmclock - * is then bounded by maximum nohz sleep latency. + * We need to rate-limit these requests though, as they can + * considerably slow guests that have a large number of vcpus. + * The time for a remote vcpu to update its kvmclock is bound + * by the delay we use to rate-limit the updates. 
*/ -static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) +#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) + +static void kvmclock_update_fn(struct work_struct *work) { int i; - struct kvm *kvm = v->kvm; + struct delayed_work *dwork = to_delayed_work(work); + struct kvm_arch *ka = container_of(dwork, struct kvm_arch, + kvmclock_update_work); + struct kvm *kvm = container_of(ka, struct kvm, arch); struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { @@ -1644,6 +1651,15 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) } } +static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) +{ + struct kvm *kvm = v->kvm; + + set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests); + schedule_delayed_work(&kvm->arch.kvmclock_update_work, + KVMCLOCK_UPDATE_DELAY); +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -7022,6 +7038,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) pvclock_update_vm_gtod_copy(kvm); + INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); + return 0; } @@ -7059,6 +7077,7 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_sync_events(struct kvm *kvm) { + cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); kvm_free_all_assigned_devices(kvm); kvm_free_pit(kvm); } -- cgit v0.10.2 From 332967a3eac06f6379283cf155c84fe7cd0537c2 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 28 Feb 2014 12:52:55 +0100 Subject: x86: kvm: introduce periodic global clock updates commit 0061d53daf26f introduced a mechanism to execute a global clock update for a vm. We can apply this periodically in order to propagate host NTP corrections. Also, if all vcpus of a vm are pinned, then without an additional trigger, no guest NTP corrections can propagate either, as the current trigger is only vcpu cpu migration. 
Signed-off-by: Andrew Jones Signed-off-by: Paolo Bonzini diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9aa09d3..85be627 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -599,6 +599,7 @@ struct kvm_arch { u64 master_kernel_ns; cycle_t master_cycle_now; struct delayed_work kvmclock_update_work; + struct delayed_work kvmclock_sync_work; struct kvm_xen_hvm_config xen_hvm_config; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5ed9293..1e91a24 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1660,6 +1660,20 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) KVMCLOCK_UPDATE_DELAY); } +#define KVMCLOCK_SYNC_PERIOD (300 * HZ) + +static void kvmclock_sync_fn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct kvm_arch *ka = container_of(dwork, struct kvm_arch, + kvmclock_sync_work); + struct kvm *kvm = container_of(ka, struct kvm, arch); + + schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, + KVMCLOCK_SYNC_PERIOD); +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -6736,6 +6750,7 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { int r; struct msr_data msr; + struct kvm *kvm = vcpu->kvm; r = vcpu_load(vcpu); if (r) @@ -6746,6 +6761,9 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) kvm_write_tsc(vcpu, &msr); vcpu_put(vcpu); + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, + KVMCLOCK_SYNC_PERIOD); + return r; } @@ -7039,6 +7057,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) pvclock_update_vm_gtod_copy(kvm); INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); + INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); return 0; } @@ -7077,6 +7096,7 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_sync_events(struct kvm *kvm) { + 
cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); kvm_free_all_assigned_devices(kvm); kvm_free_pit(kvm); -- cgit v0.10.2 From 79629b208fc0484ee448c4acfa3762f0350e97ce Mon Sep 17 00:00:00 2001 From: Heinz Graalfs Date: Wed, 5 Mar 2014 15:23:54 +0100 Subject: virtio_ccw: fix hang in set offline processing During set offline processing virtio_grab_drvdata() incorrectly calls dev_set_drvdata() to remove the virtio_ccw_device from the parent ccw_device's driver data. This is wrong and ends up in a hang during virtio_ccw_reset(), as the interrupt handler still has need of the virtio_ccw_device. A new field 'going_away' is introduced in struct virtio_ccw_device to control the usage of the ccw_device's driver data pointer in virtio_grab_drvdata(). Signed-off-by: Heinz Graalfs Reviewed-by: Cornelia Huck Signed-off-by: Cornelia Huck diff --git a/drivers/s390/kvm/virtio_ccw.c b/drivers/s390/kvm/virtio_ccw.c index 6a2b5fd..1e1fc67 100644 --- a/drivers/s390/kvm/virtio_ccw.c +++ b/drivers/s390/kvm/virtio_ccw.c @@ -61,6 +61,7 @@ struct virtio_ccw_device { unsigned long indicators2; struct vq_config_block *config_block; bool is_thinint; + bool going_away; void *airq_info; }; @@ -995,30 +996,39 @@ static struct virtio_ccw_device *virtio_grab_drvdata(struct ccw_device *cdev) spin_lock_irqsave(get_ccwdev_lock(cdev), flags); vcdev = dev_get_drvdata(&cdev->dev); - if (!vcdev) { + if (!vcdev || vcdev->going_away) { spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); return NULL; } - dev_set_drvdata(&cdev->dev, NULL); + vcdev->going_away = true; spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); return vcdev; } static void virtio_ccw_remove(struct ccw_device *cdev) { + unsigned long flags; struct virtio_ccw_device *vcdev = virtio_grab_drvdata(cdev); if (vcdev && cdev->online) unregister_virtio_device(&vcdev->vdev); + spin_lock_irqsave(get_ccwdev_lock(cdev), flags); + dev_set_drvdata(&cdev->dev, NULL); + 
spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); cdev->handler = NULL; } static int virtio_ccw_offline(struct ccw_device *cdev) { + unsigned long flags; struct virtio_ccw_device *vcdev = virtio_grab_drvdata(cdev); - if (vcdev) + if (vcdev) { unregister_virtio_device(&vcdev->vdev); + spin_lock_irqsave(get_ccwdev_lock(cdev), flags); + dev_set_drvdata(&cdev->dev, NULL); + spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); + } return 0; } -- cgit v0.10.2 From b6b8a1451fc40412c57d10c94b62e22acab28f94 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 7 Mar 2014 20:03:12 +0100 Subject: KVM: nVMX: Rework interception of IRQs and NMIs Move the check for leaving L2 on pending and intercepted IRQs or NMIs from the *_allowed handler into a dedicated callback. Invoke this callback at the relevant points before KVM checks if IRQs/NMIs can be injected. The callback has the task to switch from L2 to L1 if needed and inject the proper vmexit events. The rework fixes L2 wakeups from HLT and provides the foundation for preemption timer emulation. 
Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 85be627..461d00a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -767,6 +767,8 @@ struct kvm_x86_ops { enum x86_intercept_stage stage); void (*handle_external_intr)(struct kvm_vcpu *vcpu); bool (*mpx_supported)(void); + + int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 53c324f..11718b44 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4631,22 +4631,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) { - if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; - if (nested_exit_on_nmi(vcpu)) { - nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, - NMI_VECTOR | INTR_TYPE_NMI_INTR | - INTR_INFO_VALID_MASK, 0); - /* - * The NMI-triggered VM exit counts as injection: - * clear this one and block further NMIs. 
- */ - vcpu->arch.nmi_pending = 0; - vmx_set_nmi_mask(vcpu, true); - return 0; - } - } + if (to_vmx(vcpu)->nested.nested_run_pending) + return 0; if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) return 0; @@ -4658,19 +4644,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) { - if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; - if (nested_exit_on_intr(vcpu)) { - nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, - 0, 0); - /* - * fall through to normal code, but now in L1, not L2 - */ - } - } - - return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + return (!to_vmx(vcpu)->nested.nested_run_pending && + vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); } @@ -8172,6 +8147,35 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, } } +static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { + if (vmx->nested.nested_run_pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, + NMI_VECTOR | INTR_TYPE_NMI_INTR | + INTR_INFO_VALID_MASK, 0); + /* + * The NMI-triggered VM exit counts as injection: + * clear this one and block further NMIs. + */ + vcpu->arch.nmi_pending = 0; + vmx_set_nmi_mask(vcpu, true); + return 0; + } + + if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && + nested_exit_on_intr(vcpu)) { + if (vmx->nested.nested_run_pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + } + + return 0; +} + /* * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits * and we want to prepare to run its L1 parent. 
L1 keeps a vmcs for L2 (vmcs12), @@ -8512,6 +8516,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, nested_vmx_succeed(vcpu); if (enable_shadow_vmcs) vmx->nested.sync_shadow_vmcs = true; + + /* in case we halted in L2 */ + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } /* @@ -8652,6 +8659,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_intercept = vmx_check_intercept, .handle_external_intr = vmx_handle_external_intr, .mpx_supported = vmx_mpx_supported, + + .check_nested_events = vmx_check_nested_events, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a45bcac45..7382625 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5821,8 +5821,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); } -static void inject_pending_event(struct kvm_vcpu *vcpu) +static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) { + int r; + /* try to reinject previous events if any */ if (vcpu->arch.exception.pending) { trace_kvm_inj_exception(vcpu->arch.exception.nr, @@ -5832,17 +5834,23 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code, vcpu->arch.exception.reinject); - return; + return 0; } if (vcpu->arch.nmi_injected) { kvm_x86_ops->set_nmi(vcpu); - return; + return 0; } if (vcpu->arch.interrupt.pending) { kvm_x86_ops->set_irq(vcpu); - return; + return 0; + } + + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { + r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); + if (r != 0) + return r; } /* try to inject new event if pending */ @@ -5859,6 +5867,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) kvm_x86_ops->set_irq(vcpu); } } + return 0; } static void process_nmi(struct kvm_vcpu *vcpu) @@ -5963,10 +5972,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } - inject_pending_event(vcpu); - + if 
(inject_pending_event(vcpu, req_int_win) != 0) + req_immediate_exit = true; /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) + else if (vcpu->arch.nmi_pending) req_immediate_exit = kvm_x86_ops->enable_nmi_window(vcpu) != 0; else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) @@ -7296,6 +7305,9 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) + kvm_x86_ops->check_nested_events(vcpu, false); + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted) || !list_empty_careful(&vcpu->async_pf.done) -- cgit v0.10.2 From f4124500c2c13eb1208c6143b3f6d469709dea10 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 7 Mar 2014 20:03:13 +0100 Subject: KVM: nVMX: Fully emulate preemption timer We cannot rely on the hardware-provided preemption timer support because we are holding L2 in HLT outside non-root mode. Furthermore, emulating the preemption will resolve tick rate errata on older Intel CPUs. The emulation is based on hrtimer which is started on L2 entry, stopped on L2 exit and evaluated via the new check_nested_events hook. As we no longer rely on hardware features, we can enable both the preemption timer support and value saving unconditionally. 
Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 11718b44..e559675 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "kvm_cache_regs.h" #include "x86.h" @@ -110,6 +111,8 @@ module_param(nested, bool, S_IRUGO); #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) +#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -374,6 +377,9 @@ struct nested_vmx { */ struct page *apic_access_page; u64 msr_ia32_feature_control; + + struct hrtimer preemption_timer; + bool preemption_timer_expired; }; #define POSTED_INTR_ON 0 @@ -1048,6 +1054,12 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; } +static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & + PIN_BASED_VMX_PREEMPTION_TIMER; +} + static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); @@ -2253,9 +2265,9 @@ static __init void nested_vmx_setup_ctls_msrs(void) */ nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | - PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS | + PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; + nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; - nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; /* * Exit controls @@ -2270,15 +2282,10 @@ static __init void nested_vmx_setup_ctls_msrs(void) #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif - VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | + VM_EXIT_LOAD_IA32_PAT | 
VM_EXIT_SAVE_IA32_PAT; + nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; - if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) || - !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) { - nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; - nested_vmx_pinbased_ctls_high &= ~PIN_BASED_VMX_PREEMPTION_TIMER; - } - nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | - VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER); /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2347,9 +2354,9 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); - nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | - VMX_MISC_SAVE_EFER_LMA; - nested_vmx_misc_low |= VMX_MISC_ACTIVITY_HLT; + nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; + nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | + VMX_MISC_ACTIVITY_HLT; nested_vmx_misc_high = 0; } @@ -5713,6 +5720,18 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu, */ } +static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) +{ + struct vcpu_vmx *vmx = + container_of(timer, struct vcpu_vmx, nested.preemption_timer); + + vmx->nested.preemption_timer_expired = true; + kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); + kvm_vcpu_kick(&vmx->vcpu); + + return HRTIMER_NORESTART; +} + /* * Emulate the VMXON instruction. 
* Currently, we just remember that VMX is active, and do not save or even @@ -5777,6 +5796,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu) INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); vmx->nested.vmcs02_num = 0; + hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + vmx->nested.vmxon = true; skip_emulated_instruction(vcpu); @@ -6753,9 +6776,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * table is L0's fault. */ return 0; - case EXIT_REASON_PREEMPTION_TIMER: - return vmcs12->pin_based_vm_exec_control & - PIN_BASED_VMX_PREEMPTION_TIMER; case EXIT_REASON_WBINVD: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); case EXIT_REASON_XSETBV: @@ -6771,27 +6791,6 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } -static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu) -{ - u64 delta_tsc_l1; - u32 preempt_val_l1, preempt_val_l2, preempt_scale; - - if (!(get_vmcs12(vcpu)->pin_based_vm_exec_control & - PIN_BASED_VMX_PREEMPTION_TIMER)) - return; - preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) & - MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE; - preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE); - delta_tsc_l1 = vmx_read_l1_tsc(vcpu, native_read_tsc()) - - vcpu->arch.last_guest_tsc; - preempt_val_l1 = delta_tsc_l1 >> preempt_scale; - if (preempt_val_l2 <= preempt_val_l1) - preempt_val_l2 = 0; - else - preempt_val_l2 -= preempt_val_l1; - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2); -} - /* * The guest has exited. See if we can fix it or if we need userspace * assistance. 
@@ -7210,8 +7209,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) atomic_switch_perf_msrs(vmx); debugctlmsr = get_debugctlmsr(); - if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) - nested_adjust_preemption_timer(vcpu); vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ @@ -7608,6 +7605,28 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, kvm_inject_page_fault(vcpu, fault); } +static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) +{ + u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vcpu->arch.virtual_tsc_khz == 0) + return; + + /* Make sure short timeouts reliably trigger an immediate vmexit. + * hrtimer_start does not guarantee this. */ + if (preemption_timeout <= 1) { + vmx_preemption_timer_fn(&vmx->nested.preemption_timer); + return; + } + + preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; + preemption_timeout *= 1000000; + do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); + hrtimer_start(&vmx->nested.preemption_timer, + ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. 
L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -7621,7 +7640,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control; - u32 exit_control; vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); @@ -7679,13 +7697,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(VMCS_LINK_POINTER, -1ull); - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, - (vmcs_config.pin_based_exec_ctrl | - vmcs12->pin_based_vm_exec_control)); + exec_control = vmcs12->pin_based_vm_exec_control; + exec_control |= vmcs_config.pin_based_exec_ctrl; + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); - if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, - vmcs12->vmx_preemption_timer_value); + vmx->nested.preemption_timer_expired = false; + if (nested_cpu_has_preemption_timer(vmcs12)) + vmx_start_preemption_timer(vcpu); /* * Whether page-faults are trapped is determined by a combination of @@ -7713,7 +7732,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) enable_ept ? vmcs12->page_fault_error_code_match : 0); if (cpu_has_secondary_exec_ctrls()) { - u32 exec_control = vmx_secondary_exec_control(vmx); + exec_control = vmx_secondary_exec_control(vmx); if (!vmx->rdtscp_enabled) exec_control &= ~SECONDARY_EXEC_RDTSCP; /* Take the following fields only from vmcs12 */ @@ -7800,10 +7819,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER * bits are further modified by vmx_set_efer() below. 
*/ - exit_control = vmcs_config.vmexit_ctrl; - if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) - exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; - vm_exit_controls_init(vmx, exit_control); + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are * emulated by vmx_set_efer(), below. @@ -8151,6 +8167,14 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && + vmx->nested.preemption_timer_expired) { + if (vmx->nested.nested_run_pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); + return 0; + } + if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { if (vmx->nested.nested_run_pending) return -EBUSY; @@ -8176,6 +8200,20 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) return 0; } +static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) +{ + ktime_t remaining = + hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); + u64 value; + + if (ktime_to_ns(remaining) <= 0) + return 0; + + value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; + do_div(value, 1000000); + return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; +} + /* * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits * and we want to prepare to run its L1 parent. 
L1 keeps a vmcs for L2 (vmcs12), @@ -8246,10 +8284,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, else vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; - if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) && - (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) - vmcs12->vmx_preemption_timer_value = - vmcs_read32(VMX_PREEMPTION_TIMER_VALUE); + if (nested_cpu_has_preemption_timer(vmcs12)) { + if (vmcs12->vm_exit_controls & + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) + vmcs12->vmx_preemption_timer_value = + vmx_get_preemption_timer_value(vcpu); + hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); + } /* * In some cases (usually, nested EPT), L2 is allowed to change its -- cgit v0.10.2 From 220c56729766444f3dd823f740a147ca6d82c4c6 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 7 Mar 2014 20:03:14 +0100 Subject: KVM: nVMX: Do not inject NMI vmexits when L2 has a pending interrupt According to SDM 27.2.3, IDT vectoring information will not be valid on vmexits caused by external NMIs. So we have to avoid creating such scenarios by delaying EXIT_REASON_EXCEPTION_NMI injection as long as we have a pending interrupt because that one would be migrated to L1's IDT vectoring info on nested exit. 
Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e559675..2c9d21e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8176,7 +8176,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) } if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { - if (vmx->nested.nested_run_pending) + if (vmx->nested.nested_run_pending || + vcpu->arch.interrupt.pending) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, NMI_VECTOR | INTR_TYPE_NMI_INTR | -- cgit v0.10.2 From c9a7953f09bbe2b66050ebf97e0532eaeefbc9f3 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 7 Mar 2014 20:03:15 +0100 Subject: KVM: x86: Remove return code from enable_irq/nmi_window It's no longer possible to enter enable_irq_window in guest mode when L1 intercepts external interrupts and we are entering L2. This is now caught in vcpu_enter_guest. So we can remove the check from the VMX version of enable_irq_window, thus the need to return an error code from both enable_irq_window and enable_nmi_window. 
Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 461d00a..7930c29 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -729,8 +729,8 @@ struct kvm_x86_ops { int (*nmi_allowed)(struct kvm_vcpu *vcpu); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); - int (*enable_nmi_window)(struct kvm_vcpu *vcpu); - int (*enable_irq_window)(struct kvm_vcpu *vcpu); + void (*enable_nmi_window)(struct kvm_vcpu *vcpu); + void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); int (*vm_has_apicv)(struct kvm *kvm); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 64d9bb9..1e8616e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3650,7 +3650,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) return ret; } -static int enable_irq_window(struct kvm_vcpu *vcpu) +static void enable_irq_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3664,16 +3664,15 @@ static int enable_irq_window(struct kvm_vcpu *vcpu) svm_set_vintr(svm); svm_inject_irq(svm, 0x0); } - return 0; } -static int enable_nmi_window(struct kvm_vcpu *vcpu) +static void enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK) - return 0; /* IRET will cause a vm exit */ + return; /* IRET will cause a vm exit */ /* * Something prevents NMI from been injected. 
Single step over possible @@ -3682,7 +3681,6 @@ static int enable_nmi_window(struct kvm_vcpu *vcpu) svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); update_db_bp_intercept(vcpu); - return 0; } static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2c9d21e..fcc1947 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4514,39 +4514,28 @@ static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) PIN_BASED_NMI_EXITING; } -static int enable_irq_window(struct kvm_vcpu *vcpu) +static void enable_irq_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) - /* - * We get here if vmx_interrupt_allowed() said we can't - * inject to L1 now because L2 must run. The caller will have - * to make L2 exit right after entry, so we can inject to L1 - * more promptly. - */ - return -EBUSY; - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - return 0; } -static int enable_nmi_window(struct kvm_vcpu *vcpu) +static void enable_nmi_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - if (!cpu_has_virtual_nmis()) - return enable_irq_window(vcpu); - - if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) - return enable_irq_window(vcpu); + if (!cpu_has_virtual_nmis() || + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { + enable_irq_window(vcpu); + return; + } cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - return 0; } static void vmx_inject_irq(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7382625..6223121 100644 --- a/arch/x86/kvm/x86.c +++ 
b/arch/x86/kvm/x86.c @@ -5976,11 +5976,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) req_immediate_exit = true; /* enable NMI/IRQ window open exits if needed */ else if (vcpu->arch.nmi_pending) - req_immediate_exit = - kvm_x86_ops->enable_nmi_window(vcpu) != 0; + kvm_x86_ops->enable_nmi_window(vcpu); else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) - req_immediate_exit = - kvm_x86_ops->enable_irq_window(vcpu) != 0; + kvm_x86_ops->enable_irq_window(vcpu); if (kvm_lapic_enabled(vcpu)) { /* -- cgit v0.10.2 From c845f9c646e646e6a5fe416c2e835342984249f7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 21 Feb 2014 10:55:44 +0100 Subject: KVM: vmx: we do rely on loading DR7 on entry Currently, this works even if the bit is not in "min", because the bit is always set in MSR_IA32_VMX_ENTRY_CTLS. Mention it for the sake of documentation, and to avoid surprises if we later switch to MSR_IA32_VMX_TRUE_ENTRY_CTLS. Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fcc1947..b2a913b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2871,7 +2871,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; - min = 0; + min = VM_ENTRY_LOAD_DEBUG_CONTROLS; opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, &_vmentry_control) < 0) -- cgit v0.10.2 From 360b948d88bf30ef4b10b693adf497f51fb46a08 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 21 Feb 2014 09:55:56 +0100 Subject: KVM: x86: change vcpu->arch.switch_db_regs to a bit mask The next patch will add another bit that we can test with the same "if". 
Signed-off-by: Paolo Bonzini diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7930c29..35f538b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -337,6 +337,10 @@ struct kvm_pmu { u64 reprogram_pmi; }; +enum { + KVM_DEBUGREG_BP_ENABLED = 1, +}; + struct kvm_vcpu_arch { /* * rip and regs accesses must go through @@ -463,7 +467,7 @@ struct kvm_vcpu_arch { struct mtrr_state_type mtrr_state; u32 pat; - int switch_db_regs; + unsigned switch_db_regs; unsigned long db[KVM_NR_DB_REGS]; unsigned long dr6; unsigned long dr7; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6223121..85c74e7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -759,7 +759,9 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu) else dr7 = vcpu->arch.dr7; kvm_x86_ops->set_dr7(vcpu, dr7); - vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK); + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; + if (dr7 & DR7_BP_EN_MASK) + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; } static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) -- cgit v0.10.2 From c77fb5fe6f031bee9403397ae7b94ea22ea19aa7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 21 Feb 2014 10:17:24 +0100 Subject: KVM: x86: Allow the guest to run with dirty debug registers When not running in guest-debug mode, the guest controls the debug registers and having to take an exit for each DR access is a waste of time. If the guest gets into a state where each context switch causes DR to be saved and restored, this can take away as much as 40% of the execution time from the guest. After this patch, VMX- and SVM-specific code can set a flag in switch_db_regs, telling vcpu_enter_guest that on the next exit the debug registers might be dirty and need to be reloaded (syncing will be taken care of by a new callback in kvm_x86_ops). 
This flag can be set on the first access to a debug register, so that multiple accesses to the debug registers only cause one vmexit. Note that since the guest will be able to read debug registers and enable breakpoints in DR7, we need to ensure that they are synchronized on entry to the guest---including DR6 that was not synced before. Signed-off-by: Paolo Bonzini diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 35f538b..fcaf9c9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -339,6 +339,7 @@ struct kvm_pmu { enum { KVM_DEBUGREG_BP_ENABLED = 1, + KVM_DEBUGREG_WONT_EXIT = 2, }; struct kvm_vcpu_arch { @@ -707,6 +708,7 @@ struct kvm_x86_ops { void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); u64 (*get_dr6)(struct kvm_vcpu *vcpu); void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value); + void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu); void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 85c74e7..d906391 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6040,12 +6040,28 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); + set_debugreg(vcpu->arch.dr6, 6); } trace_kvm_entry(vcpu->vcpu_id); kvm_x86_ops->run(vcpu); /* + * Do this here before restoring debug registers on the host. And + * since we do this before handling the vmexit, a DR access vmexit + * can (a) read the correct value of the debug registers, (b) set + * KVM_DEBUGREG_WONT_EXIT again.
+ */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { + int i; + + WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); + kvm_x86_ops->sync_dirty_debug_regs(vcpu); + for (i = 0; i < KVM_NR_DB_REGS; i++) + vcpu->arch.eff_db[i] = vcpu->arch.db[i]; + } + + /* * If the guest has used debug registers, at least dr7 * will be disabled while returning to the host. * If we don't have active breakpoints in the host, we don't -- cgit v0.10.2 From 81908bf44340eb5ebc9969f67e6c8be0c92f2857 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 21 Feb 2014 10:32:27 +0100 Subject: KVM: vmx: Allow the guest to run with dirty debug registers When not running in guest-debug mode (i.e. the guest controls the debug registers), having to take an exit for each DR access is a waste of time. If the guest gets into a state where each context switch causes DR to be saved and restored, this can take away as much as 40% of the execution time from the guest. If the guest is running with vcpu->arch.db == vcpu->arch.eff_db, we can let it write freely to the debug registers and reload them on the next exit. We still need to exit on the first access, so that the KVM_DEBUGREG_WONT_EXIT flag is set in switch_db_regs; after that, further accesses to the debug registers will not cause a vmexit.
Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b2a913b..a9940ec 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include "trace.h" @@ -2850,7 +2851,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmx_capability.ept, vmx_capability.vpid); } - min = 0; + min = VM_EXIT_SAVE_DEBUG_CONTROLS; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif @@ -5084,6 +5085,22 @@ static int handle_dr(struct kvm_vcpu *vcpu) } } + if (vcpu->guest_debug == 0) { + u32 cpu_based_vm_exec_control; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + + /* + * No more DR vmexits; force a reload of the debug registers + * and reenter on this instruction. The next vmexit will + * retrieve the full state of the debug registers. 
+ */ + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + return 1; + } + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); dr = exit_qualification & DEBUG_REG_ACCESS_NUM; reg = DEBUG_REG_ACCESS_REG(exit_qualification); @@ -5110,6 +5127,24 @@ static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) { } +static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + get_debugreg(vcpu->arch.db[0], 0); + get_debugreg(vcpu->arch.db[1], 1); + get_debugreg(vcpu->arch.db[2], 2); + get_debugreg(vcpu->arch.db[3], 3); + get_debugreg(vcpu->arch.dr6, 6); + vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); + + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} + static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); @@ -8628,6 +8663,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_dr6 = vmx_get_dr6, .set_dr6 = vmx_set_dr6, .set_dr7 = vmx_set_dr7, + .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, -- cgit v0.10.2 From d16c293e4ecbddedfc1d64095ce56f0569adc12b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 21 Feb 2014 10:36:37 +0100 Subject: KVM: nVMX: Allow nested guests to run with dirty debug registers When preparing the VMCS02, the CPU-based execution controls is computed by vmx_exec_control. Turn off DR access exits there, too, if the KVM_DEBUGREG_WONT_EXIT bit is set in switch_db_regs. 
Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a9940ec..f4e5aed 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4242,6 +4242,10 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; + + if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) + exec_control &= ~CPU_BASED_MOV_DR_EXITING; + if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { exec_control &= ~CPU_BASED_TPR_SHADOW; #ifdef CONFIG_X86_64 -- cgit v0.10.2 From 5315c716b69f47e1751d09e16c7bd5b559419531 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 3 Mar 2014 13:08:29 +0100 Subject: KVM: svm: set/clear all DR intercepts in one swoop Unlike other intercepts, debug register intercepts will be modified in hot paths if the guest OS is bad or otherwise gets tricked into doing so. Avoid calling recalc_intercepts 16 times for debug registers. Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1e8616e..86d802b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -303,20 +303,35 @@ static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit) return vmcb->control.intercept_cr & (1U << bit); } -static inline void set_dr_intercept(struct vcpu_svm *svm, int bit) +static inline void set_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_dr |= (1U << bit); + vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ) + | (1 << INTERCEPT_DR1_READ) + | (1 << INTERCEPT_DR2_READ) + | (1 << INTERCEPT_DR3_READ) + | (1 << INTERCEPT_DR4_READ) + | (1 << INTERCEPT_DR5_READ) + | (1 << INTERCEPT_DR6_READ) + | (1 << INTERCEPT_DR7_READ) + | (1 << INTERCEPT_DR0_WRITE) + | (1 << INTERCEPT_DR1_WRITE) + | (1 << INTERCEPT_DR2_WRITE) + | (1 << INTERCEPT_DR3_WRITE) + | (1 << INTERCEPT_DR4_WRITE) + | (1 << INTERCEPT_DR5_WRITE) + | (1 << INTERCEPT_DR6_WRITE) + | (1 << 
INTERCEPT_DR7_WRITE); recalc_intercepts(svm); } -static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit) +static inline void clr_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_dr &= ~(1U << bit); + vmcb->control.intercept_dr = 0; recalc_intercepts(svm); } @@ -1080,23 +1095,7 @@ static void init_vmcb(struct vcpu_svm *svm) set_cr_intercept(svm, INTERCEPT_CR4_WRITE); set_cr_intercept(svm, INTERCEPT_CR8_WRITE); - set_dr_intercept(svm, INTERCEPT_DR0_READ); - set_dr_intercept(svm, INTERCEPT_DR1_READ); - set_dr_intercept(svm, INTERCEPT_DR2_READ); - set_dr_intercept(svm, INTERCEPT_DR3_READ); - set_dr_intercept(svm, INTERCEPT_DR4_READ); - set_dr_intercept(svm, INTERCEPT_DR5_READ); - set_dr_intercept(svm, INTERCEPT_DR6_READ); - set_dr_intercept(svm, INTERCEPT_DR7_READ); - - set_dr_intercept(svm, INTERCEPT_DR0_WRITE); - set_dr_intercept(svm, INTERCEPT_DR1_WRITE); - set_dr_intercept(svm, INTERCEPT_DR2_WRITE); - set_dr_intercept(svm, INTERCEPT_DR3_WRITE); - set_dr_intercept(svm, INTERCEPT_DR4_WRITE); - set_dr_intercept(svm, INTERCEPT_DR5_WRITE); - set_dr_intercept(svm, INTERCEPT_DR6_WRITE); - set_dr_intercept(svm, INTERCEPT_DR7_WRITE); + set_dr_intercepts(svm); set_exception_intercept(svm, PF_VECTOR); set_exception_intercept(svm, UD_VECTOR); -- cgit v0.10.2 From facb0139698923dc7b7d15cafbb319219969f4fd Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 21 Feb 2014 10:32:27 +0100 Subject: KVM: svm: Allow the guest to run with dirty debug registers When not running in guest-debug mode (i.e. the guest controls the debug registers), having to take an exit for each DR access is a waste of time. If the guest gets into a state where each context switch causes DR to be saved and restored, this can take away as much as 40% of the execution time from the guest. If the guest is running with vcpu->arch.db == vcpu->arch.eff_db, we can let it write freely to the debug registers and reload them on the next exit.
We still need to exit on the first access, so that the KVM_DEBUGREG_WONT_EXIT flag is set in switch_db_regs; after that, further accesses to the debug registers will not cause a vmexit. Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 86d802b..a449c3d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -1683,6 +1684,21 @@ static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) mark_dirty(svm->vmcb, VMCB_DR); } +static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + get_debugreg(vcpu->arch.db[0], 0); + get_debugreg(vcpu->arch.db[1], 1); + get_debugreg(vcpu->arch.db[2], 2); + get_debugreg(vcpu->arch.db[3], 3); + vcpu->arch.dr6 = svm_get_dr6(vcpu); + vcpu->arch.dr7 = svm->vmcb->save.dr7; + + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; + set_dr_intercepts(svm); +} + static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2974,6 +2990,17 @@ static int dr_interception(struct vcpu_svm *svm) unsigned long val; int err; + if (svm->vcpu.guest_debug == 0) { + /* + * No more DR vmexits; force a reload of the debug registers + * and reenter on this instruction. The next vmexit will + * retrieve the full state of the debug registers. + */ + clr_dr_intercepts(svm); + svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + return 1; + } + if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) return emulate_on_interception(svm); @@ -4300,6 +4327,7 @@ static struct kvm_x86_ops svm_x86_ops = { .get_dr6 = svm_get_dr6, .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, + .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, -- cgit v0.10.2 From 100943c54e0947a07d2c0185368fc2fd848f7f28 Mon Sep 17 00:00:00 2001 From: "Gabriel L. 
Somlo" Date: Thu, 27 Feb 2014 23:06:17 -0500 Subject: kvm: x86: ignore ioapic polarity Both QEMU and KVM have already accumulated a significant number of optimizations based on the hard-coded assumption that ioapic polarity will always use the ActiveHigh convention, where the logical and physical states of level-triggered irq lines always match (i.e., active(asserted) == high == 1, inactive == low == 0). QEMU guests are expected to follow directions given via ACPI and configure the ioapic with polarity 0 (ActiveHigh). However, even when misbehaving guests (e.g. OS X <= 10.9) set the ioapic polarity to 1 (ActiveLow), QEMU will still use the ActiveHigh signaling convention when interfacing with KVM. This patch modifies KVM to completely ignore ioapic polarity as set by the guest OS, enabling misbehaving guests to work alongside those which comply with the ActiveHigh polarity specified by QEMU's ACPI tables. Signed-off-by: Michael S. Tsirkin Signed-off-by: Gabriel L. Somlo [Move documentation to KVM_IRQ_LINE, add ia64. - Paolo] Signed-off-by: Paolo Bonzini diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 6cd63a9..4714f28 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -612,6 +612,20 @@ On some architectures it is required that an interrupt controller model has been previously created with KVM_CREATE_IRQCHIP. Note that edge-triggered interrupts require the level to be set to 1 and then back to 0. +On real hardware, interrupt pins can be active-low or active-high. This +does not matter for the level field of struct kvm_irq_level: 1 always +means active (asserted), 0 means inactive (deasserted). + +x86 allows the operating system to program the interrupt polarity +(active-low/active-high) for level-triggered interrupts, and KVM used +to consider the polarity. However, due to bitrot in the handling of +active-low interrupts, the above convention is now valid on x86 too. 
+This is signaled by KVM_CAP_IOAPIC_POLARITY_IGNORED. Userspace +should not present interrupts to the guest as active-low unless this +capability is present (or unless it is not using the in-kernel irqchip, +of course). + + ARM/arm64 can signal an interrupt either at the CPU level, or at the in-kernel irqchip (GIC), and for in-kernel irqchip can tell the GIC to use PPIs designated for specific cpus. The irq field is interpreted @@ -628,7 +642,7 @@ The irq_type field has the following values: (The irq_id field thus corresponds nicely to the IRQ ID in the ARM GIC specs) -In both cases, level is used to raise/lower the line. +In both cases, level is used to assert/deassert the line. struct kvm_irq_level { union { diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 53f44be..6a4309b 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -199,6 +199,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IRQCHIP: case KVM_CAP_MP_STATE: case KVM_CAP_IRQ_INJECT_STATUS: + case KVM_CAP_IOAPIC_POLARITY_IGNORED: r = 1; break; case KVM_CAP_COALESCED_MMIO: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d906391..a37da6b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2657,6 +2657,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_KVMCLOCK_CTRL: case KVM_CAP_READONLY_MEM: case KVM_CAP_HYPERV_TIME: + case KVM_CAP_IOAPIC_POLARITY_IGNORED: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7d76401..a7518be 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -740,6 +740,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_SPAPR_MULTITCE 94 #define KVM_CAP_EXT_EMUL_CPUID 95 #define KVM_CAP_HYPERV_TIME 96 +#define KVM_CAP_IOAPIC_POLARITY_IGNORED 97 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index ce9ed99..1539d37 100644 --- a/virt/kvm/ioapic.c +++
b/virt/kvm/ioapic.c @@ -328,7 +328,6 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq], irq_source_id, level); entry = ioapic->redirtbl[irq]; - irq_level ^= entry.fields.polarity; if (!irq_level) { ioapic->irr &= ~mask; ret = 1; -- cgit v0.10.2 From fed495d25e0137028c37678a14cd1d8466c02dd3 Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Mon, 10 Mar 2014 15:23:34 +0100 Subject: KVM: s390: Removing untriggerable BUG_ONs The BUG_ON in kvm-s390.c is unreachable, as we get the vcpu per common code, which itself does this from the private_data field of the file descriptor, and there is no KVM_UNCREATE_VCPU. The __{set,unset}_cpu_idle BUG_ONs are not triggerable because the vcpu creation code already checks against KVM_MAX_VCPUS. Signed-off-by: Dominik Dingel Acked-by: Cornelia Huck Signed-off-by: Christian Borntraeger diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 1d0f9d5..79d2e4f 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -114,14 +114,12 @@ static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu, static void __set_cpu_idle(struct kvm_vcpu *vcpu) { - BUG_ON(vcpu->vcpu_id > KVM_MAX_VCPUS - 1); atomic_set_mask(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags); set_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask); } static void __unset_cpu_idle(struct kvm_vcpu *vcpu) { - BUG_ON(vcpu->vcpu_id > KVM_MAX_VCPUS - 1); atomic_clear_mask(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags); clear_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask); } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 9136f8d..6268357 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -949,8 +949,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); - BUG_ON(kvm_get_vcpu(vcpu->kvm, vcpu->vcpu_id) == 
NULL); - switch (kvm_run->exit_reason) { case KVM_EXIT_S390_SIEIC: case KVM_EXIT_UNKNOWN: -- cgit v0.10.2 From 2955c83f72801245afd0fe5c560cc75b82bea9aa Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 6 Mar 2014 16:01:38 +0100 Subject: KVM: s390: Optimize ucontrol path Since commit 7c470539c95630c1f2a10f109e96f249730b75eb (s390/kvm: avoid automatic sie reentry) we will run through the C code of KVM on host interrupts instead of just reentering the guest. This will result in additional ucontrol exits (at least HZ per second). Let's handle a 0 intercept in the kernel and don't return to userspace, even if in ucontrol mode. Signed-off-by: Christian Borntraeger Reviewed-by: Cornelia Huck CC: stable@vger.kernel.org diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 6268357..7337c57 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -896,7 +896,8 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) if (rc == 0) { if (kvm_is_ucontrol(vcpu->kvm)) - rc = -EOPNOTSUPP; + /* Don't exit for host interrupts. */ + rc = vcpu->arch.sie_block->icptcode ? -EOPNOTSUPP : 0; else rc = kvm_handle_sie_intercept(vcpu); } -- cgit v0.10.2 From 27ce825823a145eb72bd5a5832c6dbb3168b720e Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Sat, 15 Mar 2014 21:01:59 +0100 Subject: KVM: x86 emulator: emulate MOVAPS HCK memory driver test fails when testing 32-bit Windows 8.1 with balloon driver.
tracing KVM shows error: reason EXIT_ERR rip 0x81c18326 info 0 0 x/10i 0x81c18326-20 0x0000000081c18312: add %al,(%eax) 0x0000000081c18314: add %cl,-0x7127711d(%esi) 0x0000000081c1831a: rolb $0x0,0x80ec(%ecx) 0x0000000081c18321: and $0xfffffff0,%esp 0x0000000081c18324: mov %esp,%esi 0x0000000081c18326: movaps %xmm0,(%esi) 0x0000000081c18329: movaps %xmm1,0x10(%esi) 0x0000000081c1832d: movaps %xmm2,0x20(%esi) 0x0000000081c18331: movaps %xmm3,0x30(%esi) 0x0000000081c18335: movaps %xmm4,0x40(%esi) which points to MOVAPS instruction currently not emulated by KVM. Fix it by adding appropriate entries to opcode table in KVM's emulator. Signed-off-by: Igor Mammedov Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 07ffca0..a26d075 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3668,6 +3668,10 @@ static const struct gprefix pfx_vmovntpx = { I(0, em_mov), N, N, N, }; +static const struct gprefix pfx_0f_28_0f_29 = { + I(Aligned, em_mov), N, N, N, +}; + static const struct escape escape_d9 = { { N, N, N, N, N, N, N, I(DstMem, em_fnstcw), }, { @@ -3870,7 +3874,9 @@ static const struct opcode twobyte_table[256] = { IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write), IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write), N, N, N, N, - N, N, N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx), + GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29), + GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29), + N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx), N, N, N, N, /* 0x30 - 0x3F */ II(ImplicitOps | Priv, em_wrmsr, wrmsr), -- cgit v0.10.2 From 6fec27d80feb12f88babcfe75f70f955c51723e8 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Sat, 15 Mar 2014 21:02:00 +0100 Subject: KVM: x86 emulator: emulate MOVAPD Add emulation for 0x66 prefixed instruction of 0f 28 opcode that has been added earlier.
Signed-off-by: Igor Mammedov Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a26d075..205b17e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3669,7 +3669,7 @@ static const struct gprefix pfx_vmovntpx = { }; static const struct gprefix pfx_0f_28_0f_29 = { - I(Aligned, em_mov), N, N, N, + I(Aligned, em_mov), I(Aligned, em_mov), N, N, }; static const struct escape escape_d9 = { { -- cgit v0.10.2 From 4ff417320c2dfc984ec1939a7da888976441a881 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 24 Feb 2014 12:15:16 +0100 Subject: KVM: x86: introduce kvm_supported_xcr0() XSAVE support for KVM is already using host_xcr0 & KVM_SUPPORTED_XCR0 as a "dynamic" version of KVM_SUPPORTED_XCR0. However, this is not enough because the MPX bits should not be presented to the guest unless kvm_x86_ops confirms the support. So, replace all instances of host_xcr0 & KVM_SUPPORTED_XCR0 with a new function kvm_supported_xcr0() that also has this check. Note that here: if (xstate_bv & ~KVM_SUPPORTED_XCR0) return -EINVAL; if (xstate_bv & ~host_xcr0) return -EINVAL; the code is equivalent to if ((xstate_bv & ~KVM_SUPPORTED_XCR0) || (xstate_bv & ~host_xcr0)) return -EINVAL; i.e. "xstate_bv & (~KVM_SUPPORTED_XCR0 | ~host_xcr0)" which is in turn equal to "xstate_bv & ~(KVM_SUPPORTED_XCR0 & host_xcr0)". So we should also use the new function there.
Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ddc8a7e..18aefb4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -43,6 +43,16 @@ static u32 xstate_required_size(u64 xstate_bv) return ret; } +u64 kvm_supported_xcr0(void) +{ + u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0; + + if (!kvm_x86_ops->mpx_supported || !kvm_x86_ops->mpx_supported()) + xcr0 &= ~(XSTATE_BNDREGS | XSTATE_BNDCSR); + + return xcr0; +} + void kvm_update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -73,7 +83,7 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu) } else { vcpu->arch.guest_supported_xcr0 = (best->eax | ((u64)best->edx << 32)) & - host_xcr0 & KVM_SUPPORTED_XCR0; + kvm_supported_xcr0(); vcpu->arch.guest_xstate_size = best->ebx = xstate_required_size(vcpu->arch.xcr0); } @@ -210,13 +220,6 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags = 0; } -static bool supported_xcr0_bit(unsigned bit) -{ - u64 mask = ((u64)1 << bit); - - return mask & KVM_SUPPORTED_XCR0 & host_xcr0; -} - #define F(x) bit(X86_FEATURE_##x) static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, @@ -439,16 +442,18 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } case 0xd: { int idx, i; + u64 supported = kvm_supported_xcr0(); - entry->eax &= host_xcr0 & KVM_SUPPORTED_XCR0; - entry->edx &= (host_xcr0 & KVM_SUPPORTED_XCR0) >> 32; + entry->eax &= supported; + entry->edx &= supported >> 32; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; for (idx = 1, i = 1; idx < 64; ++idx) { + u64 mask = ((u64)1 << idx); if (*nent >= maxnent) goto out; do_cpuid_1_ent(&entry[i], function, idx); - if (entry[i].eax == 0 || !supported_xcr0_bit(idx)) + if (entry[i].eax == 0 || !(supported & mask)) continue; entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a37da6b..3f5fb45 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c 
@@ -3084,9 +3084,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility * with old userspace. */ - if (xstate_bv & ~KVM_SUPPORTED_XCR0) - return -EINVAL; - if (xstate_bv & ~host_xcr0) + if (xstate_bv & ~kvm_supported_xcr0()) return -EINVAL; memcpy(&vcpu->arch.guest_fpu.state->xsave, guest_xsave->region, vcpu->arch.guest_xstate_size); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 392ecbf..8c97bac 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -126,6 +126,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, | XSTATE_BNDREGS | XSTATE_BNDCSR) extern u64 host_xcr0; +extern u64 kvm_supported_xcr0(void); + extern unsigned int min_timer_period_us; extern struct static_key kvm_no_apic_vcpu; -- cgit v0.10.2 From 36be0b9deb23161e9eba962c215aece551113a15 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 24 Feb 2014 12:30:04 +0100 Subject: KVM: x86: Add nested virtualization support for MPX This is simple to do, the "host" BNDCFGS is either 0 or the guest value. However, both controls have to be present. We cannot provide MPX if we only have one of the "load BNDCFGS" or "clear BNDCFGS" controls. 
Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f4e5aed..c95bea1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -206,6 +206,7 @@ struct __packed vmcs12 { u64 guest_pdptr1; u64 guest_pdptr2; u64 guest_pdptr3; + u64 guest_bndcfgs; u64 host_ia32_pat; u64 host_ia32_efer; u64 host_ia32_perf_global_ctrl; @@ -541,6 +542,7 @@ static const unsigned long shadow_read_write_fields[] = { GUEST_CS_LIMIT, GUEST_CS_BASE, GUEST_ES_BASE, + GUEST_BNDCFGS, CR0_GUEST_HOST_MASK, CR0_READ_SHADOW, CR4_READ_SHADOW, @@ -596,6 +598,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(GUEST_PDPTR1, guest_pdptr1), FIELD64(GUEST_PDPTR2, guest_pdptr2), FIELD64(GUEST_PDPTR3, guest_pdptr3), + FIELD64(GUEST_BNDCFGS, guest_bndcfgs), FIELD64(HOST_IA32_PAT, host_ia32_pat), FIELD64(HOST_IA32_EFER, host_ia32_efer), FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), @@ -736,6 +739,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var); static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); +static bool vmx_mpx_supported(void); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -2287,6 +2291,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; + if (vmx_mpx_supported()) + nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2300,6 +2306,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) VM_ENTRY_LOAD_IA32_PAT; nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); + if (vmx_mpx_supported()) + nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* cpu-based controls */ 
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, @@ -7866,6 +7874,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) set_cr4_guest_host_mask(vmx); + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); @@ -8351,6 +8362,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); + if (vmx_mpx_supported()) + vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); /* update exit information fields: */ @@ -8460,6 +8473,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); + /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ + if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) + vmcs_write64(GUEST_BNDCFGS, 0); + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); vcpu->arch.pat = vmcs12->host_ia32_pat; -- cgit v0.10.2 From 93c4adc7afedf9b0ec190066d45b6d67db5270da Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 5 Mar 2014 23:19:52 +0100 Subject: KVM: x86: handle missing MPX in nested virtualization When doing nested virtualization, we may be able to read BNDCFGS but still not be allowed to write to GUEST_BNDCFGS in the VMCS. Guard writes to the field with vmx_mpx_supported(), and similarly hide the MSR from userspace if the processor does not support the field. 
We could work around this with the generic MSR save/load machinery, but there is only a limited number of MSR save/load slots and it is not really worthwhile to waste one for a scenario that should not happen except in the nested virtualization case. Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 18aefb4..64fae65 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -47,7 +47,7 @@ u64 kvm_supported_xcr0(void) { u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0; - if (!kvm_x86_ops->mpx_supported || !kvm_x86_ops->mpx_supported()) + if (!kvm_x86_ops->mpx_supported()) xcr0 &= ~(XSTATE_BNDREGS | XSTATE_BNDCSR); return xcr0; @@ -259,8 +259,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, #endif unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; - unsigned f_mpx = kvm_x86_ops->mpx_supported ? - (kvm_x86_ops->mpx_supported() ? F(MPX) : 0) : 0; + unsigned f_mpx = kvm_x86_ops->mpx_supported() ? 
F(MPX) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a449c3d..2136cb6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4089,6 +4089,11 @@ static bool svm_invpcid_supported(void) return false; } +static bool svm_mpx_supported(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ -4371,6 +4376,7 @@ static struct kvm_x86_ops svm_x86_ops = { .rdtscp_supported = svm_rdtscp_supported, .invpcid_supported = svm_invpcid_supported, + .mpx_supported = svm_mpx_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c95bea1..1320e0f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -729,6 +729,7 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); +static bool vmx_mpx_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -2501,6 +2502,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) data = vmcs_readl(GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: + if (!vmx_mpx_supported()) + return 1; data = vmcs_read64(GUEST_BNDCFGS); break; case MSR_IA32_FEATURE_CONTROL: @@ -2572,6 +2575,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_BNDCFGS: + if (!vmx_mpx_supported()) + return 1; vmcs_write64(GUEST_BNDCFGS, data); break; case MSR_IA32_TSC: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3f5fb45..aa98695 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3937,6 +3937,23 @@ static void kvm_init_msr_list(void) for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { if 
(rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) continue; + + /* + * Even MSRs that are valid in the host may not be exposed + * to the guests in some cases. We could work around this + * in VMX with the generic MSR save/load machinery, but it + * is not really worthwhile since it will really only + * happen with nested virtualization. + */ + switch (msrs_to_save[i]) { + case MSR_IA32_BNDCFGS: + if (!kvm_x86_ops->mpx_supported()) + continue; + break; + default: + break; + } + if (j < i) msrs_to_save[j] = msrs_to_save[i]; j++; -- cgit v0.10.2 From 684a0b719ddbbafe1c7e6646b9bc239453a1773d Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Mon, 17 Mar 2014 19:11:35 +0100 Subject: KVM: eventfd: Fix lock order inversion. When registering a new irqfd, we call its ->poll method to collect any event that might have previously been pending so that we can trigger it. This is done under the kvm->irqfds.lock, which means the eventfd's ctx lock is taken under it. However, if we get a POLLHUP in irqfd_wakeup, we will be called with the ctx lock held before getting the irqfds.lock to deactivate the irqfd, causing lockdep to complain. Calling the ->poll method does not really need the irqfds.lock, so let's just move it after we've given up the irqfds.lock in kvm_irqfd_assign(). Signed-off-by: Cornelia Huck Signed-off-by: Paolo Bonzini diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index abe4d60..29c2a04 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -391,19 +391,19 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) lockdep_is_held(&kvm->irqfds.lock)); irqfd_update(kvm, irqfd, irq_rt); - events = f.file->f_op->poll(f.file, &irqfd->pt); - list_add_tail(&irqfd->list, &kvm->irqfds.items); + spin_unlock_irq(&kvm->irqfds.lock); + /* * Check if there was an event already pending on the eventfd * before we registered, and trigger it as if we didn't miss it. 
*/ + events = f.file->f_op->poll(f.file, &irqfd->pt); + if (events & POLLIN) schedule_work(&irqfd->inject); - spin_unlock_irq(&kvm->irqfds.lock); - /* * do not drop the file until the irqfd is fully initialized, otherwise * we might race against the POLLHUP -- cgit v0.10.2 From 22027945482303573b3600c0e3d7445020c2f29b Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 14 Mar 2014 13:06:08 +0000 Subject: MIPS: KVM: asm/kvm_host.h: Clean up whitespace The whitespace in asm/kvm_host.h is quite inconsistent in places. Clean up the whole file to use tabs more consistently. When you use the --ignore-space-change argument to git diff this patch only changes line wrapping in TLB_IS_GLOBAL and TLB_IS_VALID macros. Signed-off-by: James Hogan Cc: Ralf Baechle Cc: Gleb Natapov Cc: Paolo Bonzini Cc: Sanjay Lal Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index a995fce..502c8da 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -30,16 +30,16 @@ /* Special address that contains the comm page, used for reducing # of traps */ -#define KVM_GUEST_COMMPAGE_ADDR 0x0 +#define KVM_GUEST_COMMPAGE_ADDR 0x0 #define KVM_GUEST_KERNEL_MODE(vcpu) ((kvm_read_c0_guest_status(vcpu->arch.cop0) & (ST0_EXL | ST0_ERL)) || \ ((kvm_read_c0_guest_status(vcpu->arch.cop0) & KSU_USER) == 0)) -#define KVM_GUEST_KUSEG 0x00000000UL -#define KVM_GUEST_KSEG0 0x40000000UL -#define KVM_GUEST_KSEG23 0x60000000UL -#define KVM_GUEST_KSEGX(a) ((_ACAST32_(a)) & 0x60000000) -#define KVM_GUEST_CPHYSADDR(a) ((_ACAST32_(a)) & 0x1fffffff) +#define KVM_GUEST_KUSEG 0x00000000UL +#define KVM_GUEST_KSEG0 0x40000000UL +#define KVM_GUEST_KSEG23 0x60000000UL +#define KVM_GUEST_KSEGX(a) ((_ACAST32_(a)) & 0x60000000) +#define KVM_GUEST_CPHYSADDR(a) ((_ACAST32_(a)) & 0x1fffffff) #define KVM_GUEST_CKSEG0ADDR(a) (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG0) #define 
KVM_GUEST_CKSEG1ADDR(a) (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG1) @@ -52,17 +52,17 @@ #define KVM_GUEST_KSEG1ADDR(a) (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG1) #define KVM_GUEST_KSEG23ADDR(a) (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG23) -#define KVM_INVALID_PAGE 0xdeadbeef -#define KVM_INVALID_INST 0xdeadbeef -#define KVM_INVALID_ADDR 0xdeadbeef +#define KVM_INVALID_PAGE 0xdeadbeef +#define KVM_INVALID_INST 0xdeadbeef +#define KVM_INVALID_ADDR 0xdeadbeef -#define KVM_MALTA_GUEST_RTC_ADDR 0xb8000070UL +#define KVM_MALTA_GUEST_RTC_ADDR 0xb8000070UL -#define GUEST_TICKS_PER_JIFFY (40000000/HZ) -#define MS_TO_NS(x) (x * 1E6L) +#define GUEST_TICKS_PER_JIFFY (40000000/HZ) +#define MS_TO_NS(x) (x * 1E6L) -#define CAUSEB_DC 27 -#define CAUSEF_DC (_ULCAST_(1) << 27) +#define CAUSEB_DC 27 +#define CAUSEF_DC (_ULCAST_(1) << 27) struct kvm; struct kvm_run; @@ -126,8 +126,8 @@ struct kvm_arch { int commpage_tlb; }; -#define N_MIPS_COPROC_REGS 32 -#define N_MIPS_COPROC_SEL 8 +#define N_MIPS_COPROC_REGS 32 +#define N_MIPS_COPROC_SEL 8 struct mips_coproc { unsigned long reg[N_MIPS_COPROC_REGS][N_MIPS_COPROC_SEL]; @@ -139,124 +139,124 @@ struct mips_coproc { /* * Coprocessor 0 register names */ -#define MIPS_CP0_TLB_INDEX 0 -#define MIPS_CP0_TLB_RANDOM 1 -#define MIPS_CP0_TLB_LOW 2 -#define MIPS_CP0_TLB_LO0 2 -#define MIPS_CP0_TLB_LO1 3 -#define MIPS_CP0_TLB_CONTEXT 4 -#define MIPS_CP0_TLB_PG_MASK 5 -#define MIPS_CP0_TLB_WIRED 6 -#define MIPS_CP0_HWRENA 7 -#define MIPS_CP0_BAD_VADDR 8 -#define MIPS_CP0_COUNT 9 -#define MIPS_CP0_TLB_HI 10 -#define MIPS_CP0_COMPARE 11 -#define MIPS_CP0_STATUS 12 -#define MIPS_CP0_CAUSE 13 -#define MIPS_CP0_EXC_PC 14 -#define MIPS_CP0_PRID 15 -#define MIPS_CP0_CONFIG 16 -#define MIPS_CP0_LLADDR 17 -#define MIPS_CP0_WATCH_LO 18 -#define MIPS_CP0_WATCH_HI 19 -#define MIPS_CP0_TLB_XCONTEXT 20 -#define MIPS_CP0_ECC 26 -#define MIPS_CP0_CACHE_ERR 27 -#define MIPS_CP0_TAG_LO 28 -#define MIPS_CP0_TAG_HI 29 -#define MIPS_CP0_ERROR_PC 30 -#define 
MIPS_CP0_DEBUG 23 -#define MIPS_CP0_DEPC 24 -#define MIPS_CP0_PERFCNT 25 -#define MIPS_CP0_ERRCTL 26 -#define MIPS_CP0_DATA_LO 28 -#define MIPS_CP0_DATA_HI 29 -#define MIPS_CP0_DESAVE 31 - -#define MIPS_CP0_CONFIG_SEL 0 -#define MIPS_CP0_CONFIG1_SEL 1 -#define MIPS_CP0_CONFIG2_SEL 2 -#define MIPS_CP0_CONFIG3_SEL 3 +#define MIPS_CP0_TLB_INDEX 0 +#define MIPS_CP0_TLB_RANDOM 1 +#define MIPS_CP0_TLB_LOW 2 +#define MIPS_CP0_TLB_LO0 2 +#define MIPS_CP0_TLB_LO1 3 +#define MIPS_CP0_TLB_CONTEXT 4 +#define MIPS_CP0_TLB_PG_MASK 5 +#define MIPS_CP0_TLB_WIRED 6 +#define MIPS_CP0_HWRENA 7 +#define MIPS_CP0_BAD_VADDR 8 +#define MIPS_CP0_COUNT 9 +#define MIPS_CP0_TLB_HI 10 +#define MIPS_CP0_COMPARE 11 +#define MIPS_CP0_STATUS 12 +#define MIPS_CP0_CAUSE 13 +#define MIPS_CP0_EXC_PC 14 +#define MIPS_CP0_PRID 15 +#define MIPS_CP0_CONFIG 16 +#define MIPS_CP0_LLADDR 17 +#define MIPS_CP0_WATCH_LO 18 +#define MIPS_CP0_WATCH_HI 19 +#define MIPS_CP0_TLB_XCONTEXT 20 +#define MIPS_CP0_ECC 26 +#define MIPS_CP0_CACHE_ERR 27 +#define MIPS_CP0_TAG_LO 28 +#define MIPS_CP0_TAG_HI 29 +#define MIPS_CP0_ERROR_PC 30 +#define MIPS_CP0_DEBUG 23 +#define MIPS_CP0_DEPC 24 +#define MIPS_CP0_PERFCNT 25 +#define MIPS_CP0_ERRCTL 26 +#define MIPS_CP0_DATA_LO 28 +#define MIPS_CP0_DATA_HI 29 +#define MIPS_CP0_DESAVE 31 + +#define MIPS_CP0_CONFIG_SEL 0 +#define MIPS_CP0_CONFIG1_SEL 1 +#define MIPS_CP0_CONFIG2_SEL 2 +#define MIPS_CP0_CONFIG3_SEL 3 /* Config0 register bits */ -#define CP0C0_M 31 -#define CP0C0_K23 28 -#define CP0C0_KU 25 -#define CP0C0_MDU 20 -#define CP0C0_MM 17 -#define CP0C0_BM 16 -#define CP0C0_BE 15 -#define CP0C0_AT 13 -#define CP0C0_AR 10 -#define CP0C0_MT 7 -#define CP0C0_VI 3 -#define CP0C0_K0 0 +#define CP0C0_M 31 +#define CP0C0_K23 28 +#define CP0C0_KU 25 +#define CP0C0_MDU 20 +#define CP0C0_MM 17 +#define CP0C0_BM 16 +#define CP0C0_BE 15 +#define CP0C0_AT 13 +#define CP0C0_AR 10 +#define CP0C0_MT 7 +#define CP0C0_VI 3 +#define CP0C0_K0 0 /* Config1 register bits */ -#define CP0C1_M 31 
-#define CP0C1_MMU 25 -#define CP0C1_IS 22 -#define CP0C1_IL 19 -#define CP0C1_IA 16 -#define CP0C1_DS 13 -#define CP0C1_DL 10 -#define CP0C1_DA 7 -#define CP0C1_C2 6 -#define CP0C1_MD 5 -#define CP0C1_PC 4 -#define CP0C1_WR 3 -#define CP0C1_CA 2 -#define CP0C1_EP 1 -#define CP0C1_FP 0 +#define CP0C1_M 31 +#define CP0C1_MMU 25 +#define CP0C1_IS 22 +#define CP0C1_IL 19 +#define CP0C1_IA 16 +#define CP0C1_DS 13 +#define CP0C1_DL 10 +#define CP0C1_DA 7 +#define CP0C1_C2 6 +#define CP0C1_MD 5 +#define CP0C1_PC 4 +#define CP0C1_WR 3 +#define CP0C1_CA 2 +#define CP0C1_EP 1 +#define CP0C1_FP 0 /* Config2 Register bits */ -#define CP0C2_M 31 -#define CP0C2_TU 28 -#define CP0C2_TS 24 -#define CP0C2_TL 20 -#define CP0C2_TA 16 -#define CP0C2_SU 12 -#define CP0C2_SS 8 -#define CP0C2_SL 4 -#define CP0C2_SA 0 +#define CP0C2_M 31 +#define CP0C2_TU 28 +#define CP0C2_TS 24 +#define CP0C2_TL 20 +#define CP0C2_TA 16 +#define CP0C2_SU 12 +#define CP0C2_SS 8 +#define CP0C2_SL 4 +#define CP0C2_SA 0 /* Config3 Register bits */ -#define CP0C3_M 31 -#define CP0C3_ISA_ON_EXC 16 -#define CP0C3_ULRI 13 -#define CP0C3_DSPP 10 -#define CP0C3_LPA 7 -#define CP0C3_VEIC 6 -#define CP0C3_VInt 5 -#define CP0C3_SP 4 -#define CP0C3_MT 2 -#define CP0C3_SM 1 -#define CP0C3_TL 0 +#define CP0C3_M 31 +#define CP0C3_ISA_ON_EXC 16 +#define CP0C3_ULRI 13 +#define CP0C3_DSPP 10 +#define CP0C3_LPA 7 +#define CP0C3_VEIC 6 +#define CP0C3_VInt 5 +#define CP0C3_SP 4 +#define CP0C3_MT 2 +#define CP0C3_SM 1 +#define CP0C3_TL 0 /* Have config1, Cacheable, noncoherent, write-back, write allocate*/ -#define MIPS_CONFIG0 \ +#define MIPS_CONFIG0 \ ((1 << CP0C0_M) | (0x3 << CP0C0_K0)) /* Have config2, no coprocessor2 attached, no MDMX support attached, no performance counters, watch registers present, no code compression, EJTAG present, no FPU, no watch registers */ -#define MIPS_CONFIG1 \ -((1 << CP0C1_M) | \ - (0 << CP0C1_C2) | (0 << CP0C1_MD) | (0 << CP0C1_PC) | \ - (0 << CP0C1_WR) | (0 << CP0C1_CA) | (1 << CP0C1_EP) | 
\ +#define MIPS_CONFIG1 \ +((1 << CP0C1_M) | \ + (0 << CP0C1_C2) | (0 << CP0C1_MD) | (0 << CP0C1_PC) | \ + (0 << CP0C1_WR) | (0 << CP0C1_CA) | (1 << CP0C1_EP) | \ (0 << CP0C1_FP)) /* Have config3, no tertiary/secondary caches implemented */ -#define MIPS_CONFIG2 \ +#define MIPS_CONFIG2 \ ((1 << CP0C2_M)) /* No config4, no DSP ASE, no large physaddr (PABITS), no external interrupt controller, no vectored interrupts, no 1kb pages, no SmartMIPS ASE, no trace logic */ -#define MIPS_CONFIG3 \ -((0 << CP0C3_M) | (0 << CP0C3_DSPP) | (0 << CP0C3_LPA) | \ - (0 << CP0C3_VEIC) | (0 << CP0C3_VInt) | (0 << CP0C3_SP) | \ +#define MIPS_CONFIG3 \ +((0 << CP0C3_M) | (0 << CP0C3_DSPP) | (0 << CP0C3_LPA) | \ + (0 << CP0C3_VEIC) | (0 << CP0C3_VInt) | (0 << CP0C3_SP) | \ (0 << CP0C3_SM) | (0 << CP0C3_TL)) /* MMU types, the first four entries have the same layout as the @@ -274,36 +274,36 @@ enum mips_mmu_types { /* * Trap codes */ -#define T_INT 0 /* Interrupt pending */ -#define T_TLB_MOD 1 /* TLB modified fault */ -#define T_TLB_LD_MISS 2 /* TLB miss on load or ifetch */ -#define T_TLB_ST_MISS 3 /* TLB miss on a store */ -#define T_ADDR_ERR_LD 4 /* Address error on a load or ifetch */ -#define T_ADDR_ERR_ST 5 /* Address error on a store */ -#define T_BUS_ERR_IFETCH 6 /* Bus error on an ifetch */ -#define T_BUS_ERR_LD_ST 7 /* Bus error on a load or store */ -#define T_SYSCALL 8 /* System call */ -#define T_BREAK 9 /* Breakpoint */ -#define T_RES_INST 10 /* Reserved instruction exception */ -#define T_COP_UNUSABLE 11 /* Coprocessor unusable */ -#define T_OVFLOW 12 /* Arithmetic overflow */ +#define T_INT 0 /* Interrupt pending */ +#define T_TLB_MOD 1 /* TLB modified fault */ +#define T_TLB_LD_MISS 2 /* TLB miss on load or ifetch */ +#define T_TLB_ST_MISS 3 /* TLB miss on a store */ +#define T_ADDR_ERR_LD 4 /* Address error on a load or ifetch */ +#define T_ADDR_ERR_ST 5 /* Address error on a store */ +#define T_BUS_ERR_IFETCH 6 /* Bus error on an ifetch */ +#define T_BUS_ERR_LD_ST 7 /* 
Bus error on a load or store */ +#define T_SYSCALL 8 /* System call */ +#define T_BREAK 9 /* Breakpoint */ +#define T_RES_INST 10 /* Reserved instruction exception */ +#define T_COP_UNUSABLE 11 /* Coprocessor unusable */ +#define T_OVFLOW 12 /* Arithmetic overflow */ /* * Trap definitions added for r4000 port. */ -#define T_TRAP 13 /* Trap instruction */ -#define T_VCEI 14 /* Virtual coherency exception */ -#define T_FPE 15 /* Floating point exception */ -#define T_WATCH 23 /* Watch address reference */ -#define T_VCED 31 /* Virtual coherency data */ +#define T_TRAP 13 /* Trap instruction */ +#define T_VCEI 14 /* Virtual coherency exception */ +#define T_FPE 15 /* Floating point exception */ +#define T_WATCH 23 /* Watch address reference */ +#define T_VCED 31 /* Virtual coherency data */ /* Resume Flags */ -#define RESUME_FLAG_DR (1<<0) /* Reload guest nonvolatile state? */ -#define RESUME_FLAG_HOST (1<<1) /* Resume host? */ +#define RESUME_FLAG_DR (1<<0) /* Reload guest nonvolatile state? */ +#define RESUME_FLAG_HOST (1<<1) /* Resume host? 
*/ -#define RESUME_GUEST 0 -#define RESUME_GUEST_DR RESUME_FLAG_DR -#define RESUME_HOST RESUME_FLAG_HOST +#define RESUME_GUEST 0 +#define RESUME_GUEST_DR RESUME_FLAG_DR +#define RESUME_HOST RESUME_FLAG_HOST enum emulation_result { EMULATE_DONE, /* no further processing */ @@ -313,24 +313,27 @@ enum emulation_result { EMULATE_PRIV_FAIL, }; -#define MIPS3_PG_G 0x00000001 /* Global; ignore ASID if in lo0 & lo1 */ -#define MIPS3_PG_V 0x00000002 /* Valid */ -#define MIPS3_PG_NV 0x00000000 -#define MIPS3_PG_D 0x00000004 /* Dirty */ +#define MIPS3_PG_G 0x00000001 /* Global; ignore ASID if in lo0 & lo1 */ +#define MIPS3_PG_V 0x00000002 /* Valid */ +#define MIPS3_PG_NV 0x00000000 +#define MIPS3_PG_D 0x00000004 /* Dirty */ #define mips3_paddr_to_tlbpfn(x) \ - (((unsigned long)(x) >> MIPS3_PG_SHIFT) & MIPS3_PG_FRAME) + (((unsigned long)(x) >> MIPS3_PG_SHIFT) & MIPS3_PG_FRAME) #define mips3_tlbpfn_to_paddr(x) \ - ((unsigned long)((x) & MIPS3_PG_FRAME) << MIPS3_PG_SHIFT) + ((unsigned long)((x) & MIPS3_PG_FRAME) << MIPS3_PG_SHIFT) -#define MIPS3_PG_SHIFT 6 -#define MIPS3_PG_FRAME 0x3fffffc0 +#define MIPS3_PG_SHIFT 6 +#define MIPS3_PG_FRAME 0x3fffffc0 -#define VPN2_MASK 0xffffe000 -#define TLB_IS_GLOBAL(x) (((x).tlb_lo0 & MIPS3_PG_G) && ((x).tlb_lo1 & MIPS3_PG_G)) -#define TLB_VPN2(x) ((x).tlb_hi & VPN2_MASK) -#define TLB_ASID(x) ((x).tlb_hi & ASID_MASK) -#define TLB_IS_VALID(x, va) (((va) & (1 << PAGE_SHIFT)) ? ((x).tlb_lo1 & MIPS3_PG_V) : ((x).tlb_lo0 & MIPS3_PG_V)) +#define VPN2_MASK 0xffffe000 +#define TLB_IS_GLOBAL(x) (((x).tlb_lo0 & MIPS3_PG_G) && \ + ((x).tlb_lo1 & MIPS3_PG_G)) +#define TLB_VPN2(x) ((x).tlb_hi & VPN2_MASK) +#define TLB_ASID(x) ((x).tlb_hi & ASID_MASK) +#define TLB_IS_VALID(x, va) (((va) & (1 << PAGE_SHIFT)) \ + ? 
((x).tlb_lo1 & MIPS3_PG_V) \ + : ((x).tlb_lo0 & MIPS3_PG_V)) struct kvm_mips_tlb { long tlb_mask; @@ -339,7 +342,7 @@ struct kvm_mips_tlb { long tlb_lo1; }; -#define KVM_MIPS_GUEST_TLB_SIZE 64 +#define KVM_MIPS_GUEST_TLB_SIZE 64 struct kvm_vcpu_arch { void *host_ebase, *guest_ebase; unsigned long host_stack; @@ -400,65 +403,65 @@ struct kvm_vcpu_arch { }; -#define kvm_read_c0_guest_index(cop0) (cop0->reg[MIPS_CP0_TLB_INDEX][0]) -#define kvm_write_c0_guest_index(cop0, val) (cop0->reg[MIPS_CP0_TLB_INDEX][0] = val) -#define kvm_read_c0_guest_entrylo0(cop0) (cop0->reg[MIPS_CP0_TLB_LO0][0]) -#define kvm_read_c0_guest_entrylo1(cop0) (cop0->reg[MIPS_CP0_TLB_LO1][0]) -#define kvm_read_c0_guest_context(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0]) -#define kvm_write_c0_guest_context(cop0, val) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0] = (val)) -#define kvm_read_c0_guest_userlocal(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][2]) -#define kvm_read_c0_guest_pagemask(cop0) (cop0->reg[MIPS_CP0_TLB_PG_MASK][0]) -#define kvm_write_c0_guest_pagemask(cop0, val) (cop0->reg[MIPS_CP0_TLB_PG_MASK][0] = (val)) -#define kvm_read_c0_guest_wired(cop0) (cop0->reg[MIPS_CP0_TLB_WIRED][0]) -#define kvm_write_c0_guest_wired(cop0, val) (cop0->reg[MIPS_CP0_TLB_WIRED][0] = (val)) -#define kvm_read_c0_guest_badvaddr(cop0) (cop0->reg[MIPS_CP0_BAD_VADDR][0]) -#define kvm_write_c0_guest_badvaddr(cop0, val) (cop0->reg[MIPS_CP0_BAD_VADDR][0] = (val)) -#define kvm_read_c0_guest_count(cop0) (cop0->reg[MIPS_CP0_COUNT][0]) -#define kvm_write_c0_guest_count(cop0, val) (cop0->reg[MIPS_CP0_COUNT][0] = (val)) -#define kvm_read_c0_guest_entryhi(cop0) (cop0->reg[MIPS_CP0_TLB_HI][0]) -#define kvm_write_c0_guest_entryhi(cop0, val) (cop0->reg[MIPS_CP0_TLB_HI][0] = (val)) -#define kvm_read_c0_guest_compare(cop0) (cop0->reg[MIPS_CP0_COMPARE][0]) -#define kvm_write_c0_guest_compare(cop0, val) (cop0->reg[MIPS_CP0_COMPARE][0] = (val)) -#define kvm_read_c0_guest_status(cop0) (cop0->reg[MIPS_CP0_STATUS][0]) -#define 
kvm_write_c0_guest_status(cop0, val) (cop0->reg[MIPS_CP0_STATUS][0] = (val)) -#define kvm_read_c0_guest_intctl(cop0) (cop0->reg[MIPS_CP0_STATUS][1]) -#define kvm_write_c0_guest_intctl(cop0, val) (cop0->reg[MIPS_CP0_STATUS][1] = (val)) -#define kvm_read_c0_guest_cause(cop0) (cop0->reg[MIPS_CP0_CAUSE][0]) -#define kvm_write_c0_guest_cause(cop0, val) (cop0->reg[MIPS_CP0_CAUSE][0] = (val)) -#define kvm_read_c0_guest_epc(cop0) (cop0->reg[MIPS_CP0_EXC_PC][0]) -#define kvm_write_c0_guest_epc(cop0, val) (cop0->reg[MIPS_CP0_EXC_PC][0] = (val)) -#define kvm_read_c0_guest_prid(cop0) (cop0->reg[MIPS_CP0_PRID][0]) -#define kvm_write_c0_guest_prid(cop0, val) (cop0->reg[MIPS_CP0_PRID][0] = (val)) -#define kvm_read_c0_guest_ebase(cop0) (cop0->reg[MIPS_CP0_PRID][1]) -#define kvm_write_c0_guest_ebase(cop0, val) (cop0->reg[MIPS_CP0_PRID][1] = (val)) -#define kvm_read_c0_guest_config(cop0) (cop0->reg[MIPS_CP0_CONFIG][0]) -#define kvm_read_c0_guest_config1(cop0) (cop0->reg[MIPS_CP0_CONFIG][1]) -#define kvm_read_c0_guest_config2(cop0) (cop0->reg[MIPS_CP0_CONFIG][2]) -#define kvm_read_c0_guest_config3(cop0) (cop0->reg[MIPS_CP0_CONFIG][3]) -#define kvm_read_c0_guest_config7(cop0) (cop0->reg[MIPS_CP0_CONFIG][7]) -#define kvm_write_c0_guest_config(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][0] = (val)) -#define kvm_write_c0_guest_config1(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][1] = (val)) -#define kvm_write_c0_guest_config2(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][2] = (val)) -#define kvm_write_c0_guest_config3(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][3] = (val)) -#define kvm_write_c0_guest_config7(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][7] = (val)) -#define kvm_read_c0_guest_errorepc(cop0) (cop0->reg[MIPS_CP0_ERROR_PC][0]) -#define kvm_write_c0_guest_errorepc(cop0, val) (cop0->reg[MIPS_CP0_ERROR_PC][0] = (val)) - -#define kvm_set_c0_guest_status(cop0, val) (cop0->reg[MIPS_CP0_STATUS][0] |= (val)) -#define kvm_clear_c0_guest_status(cop0, val) (cop0->reg[MIPS_CP0_STATUS][0] &= ~(val)) -#define 
kvm_set_c0_guest_cause(cop0, val) (cop0->reg[MIPS_CP0_CAUSE][0] |= (val)) -#define kvm_clear_c0_guest_cause(cop0, val) (cop0->reg[MIPS_CP0_CAUSE][0] &= ~(val)) -#define kvm_change_c0_guest_cause(cop0, change, val) \ -{ \ - kvm_clear_c0_guest_cause(cop0, change); \ - kvm_set_c0_guest_cause(cop0, ((val) & (change))); \ +#define kvm_read_c0_guest_index(cop0) (cop0->reg[MIPS_CP0_TLB_INDEX][0]) +#define kvm_write_c0_guest_index(cop0, val) (cop0->reg[MIPS_CP0_TLB_INDEX][0] = val) +#define kvm_read_c0_guest_entrylo0(cop0) (cop0->reg[MIPS_CP0_TLB_LO0][0]) +#define kvm_read_c0_guest_entrylo1(cop0) (cop0->reg[MIPS_CP0_TLB_LO1][0]) +#define kvm_read_c0_guest_context(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0]) +#define kvm_write_c0_guest_context(cop0, val) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0] = (val)) +#define kvm_read_c0_guest_userlocal(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][2]) +#define kvm_read_c0_guest_pagemask(cop0) (cop0->reg[MIPS_CP0_TLB_PG_MASK][0]) +#define kvm_write_c0_guest_pagemask(cop0, val) (cop0->reg[MIPS_CP0_TLB_PG_MASK][0] = (val)) +#define kvm_read_c0_guest_wired(cop0) (cop0->reg[MIPS_CP0_TLB_WIRED][0]) +#define kvm_write_c0_guest_wired(cop0, val) (cop0->reg[MIPS_CP0_TLB_WIRED][0] = (val)) +#define kvm_read_c0_guest_badvaddr(cop0) (cop0->reg[MIPS_CP0_BAD_VADDR][0]) +#define kvm_write_c0_guest_badvaddr(cop0, val) (cop0->reg[MIPS_CP0_BAD_VADDR][0] = (val)) +#define kvm_read_c0_guest_count(cop0) (cop0->reg[MIPS_CP0_COUNT][0]) +#define kvm_write_c0_guest_count(cop0, val) (cop0->reg[MIPS_CP0_COUNT][0] = (val)) +#define kvm_read_c0_guest_entryhi(cop0) (cop0->reg[MIPS_CP0_TLB_HI][0]) +#define kvm_write_c0_guest_entryhi(cop0, val) (cop0->reg[MIPS_CP0_TLB_HI][0] = (val)) +#define kvm_read_c0_guest_compare(cop0) (cop0->reg[MIPS_CP0_COMPARE][0]) +#define kvm_write_c0_guest_compare(cop0, val) (cop0->reg[MIPS_CP0_COMPARE][0] = (val)) +#define kvm_read_c0_guest_status(cop0) (cop0->reg[MIPS_CP0_STATUS][0]) +#define kvm_write_c0_guest_status(cop0, val) 
(cop0->reg[MIPS_CP0_STATUS][0] = (val)) +#define kvm_read_c0_guest_intctl(cop0) (cop0->reg[MIPS_CP0_STATUS][1]) +#define kvm_write_c0_guest_intctl(cop0, val) (cop0->reg[MIPS_CP0_STATUS][1] = (val)) +#define kvm_read_c0_guest_cause(cop0) (cop0->reg[MIPS_CP0_CAUSE][0]) +#define kvm_write_c0_guest_cause(cop0, val) (cop0->reg[MIPS_CP0_CAUSE][0] = (val)) +#define kvm_read_c0_guest_epc(cop0) (cop0->reg[MIPS_CP0_EXC_PC][0]) +#define kvm_write_c0_guest_epc(cop0, val) (cop0->reg[MIPS_CP0_EXC_PC][0] = (val)) +#define kvm_read_c0_guest_prid(cop0) (cop0->reg[MIPS_CP0_PRID][0]) +#define kvm_write_c0_guest_prid(cop0, val) (cop0->reg[MIPS_CP0_PRID][0] = (val)) +#define kvm_read_c0_guest_ebase(cop0) (cop0->reg[MIPS_CP0_PRID][1]) +#define kvm_write_c0_guest_ebase(cop0, val) (cop0->reg[MIPS_CP0_PRID][1] = (val)) +#define kvm_read_c0_guest_config(cop0) (cop0->reg[MIPS_CP0_CONFIG][0]) +#define kvm_read_c0_guest_config1(cop0) (cop0->reg[MIPS_CP0_CONFIG][1]) +#define kvm_read_c0_guest_config2(cop0) (cop0->reg[MIPS_CP0_CONFIG][2]) +#define kvm_read_c0_guest_config3(cop0) (cop0->reg[MIPS_CP0_CONFIG][3]) +#define kvm_read_c0_guest_config7(cop0) (cop0->reg[MIPS_CP0_CONFIG][7]) +#define kvm_write_c0_guest_config(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][0] = (val)) +#define kvm_write_c0_guest_config1(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][1] = (val)) +#define kvm_write_c0_guest_config2(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][2] = (val)) +#define kvm_write_c0_guest_config3(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][3] = (val)) +#define kvm_write_c0_guest_config7(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][7] = (val)) +#define kvm_read_c0_guest_errorepc(cop0) (cop0->reg[MIPS_CP0_ERROR_PC][0]) +#define kvm_write_c0_guest_errorepc(cop0, val) (cop0->reg[MIPS_CP0_ERROR_PC][0] = (val)) + +#define kvm_set_c0_guest_status(cop0, val) (cop0->reg[MIPS_CP0_STATUS][0] |= (val)) +#define kvm_clear_c0_guest_status(cop0, val) (cop0->reg[MIPS_CP0_STATUS][0] &= ~(val)) +#define kvm_set_c0_guest_cause(cop0, val) 
(cop0->reg[MIPS_CP0_CAUSE][0] |= (val)) +#define kvm_clear_c0_guest_cause(cop0, val) (cop0->reg[MIPS_CP0_CAUSE][0] &= ~(val)) +#define kvm_change_c0_guest_cause(cop0, change, val) \ +{ \ + kvm_clear_c0_guest_cause(cop0, change); \ + kvm_set_c0_guest_cause(cop0, ((val) & (change))); \ } -#define kvm_set_c0_guest_ebase(cop0, val) (cop0->reg[MIPS_CP0_PRID][1] |= (val)) -#define kvm_clear_c0_guest_ebase(cop0, val) (cop0->reg[MIPS_CP0_PRID][1] &= ~(val)) -#define kvm_change_c0_guest_ebase(cop0, change, val) \ -{ \ - kvm_clear_c0_guest_ebase(cop0, change); \ - kvm_set_c0_guest_ebase(cop0, ((val) & (change))); \ +#define kvm_set_c0_guest_ebase(cop0, val) (cop0->reg[MIPS_CP0_PRID][1] |= (val)) +#define kvm_clear_c0_guest_ebase(cop0, val) (cop0->reg[MIPS_CP0_PRID][1] &= ~(val)) +#define kvm_change_c0_guest_ebase(cop0, change, val) \ +{ \ + kvm_clear_c0_guest_ebase(cop0, change); \ + kvm_set_c0_guest_ebase(cop0, ((val) & (change))); \ } -- cgit v0.10.2 From 15505679362270d02c449626385cb74af8905514 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 14 Mar 2014 13:06:07 +0000 Subject: MIPS: KVM: Pass reserved instruction exceptions to guest Previously a reserved instruction exception while in guest code would cause a KVM internal error if kvm_mips_handle_ri() didn't recognise the instruction (including a RDHWR from an unrecognised hardware register). However the guest OS should really have the opportunity to catch the exception so that it can take the appropriate actions such as sending a SIGILL to the guest user process or emulating the instruction itself. Therefore in these cases emulate a guest RI exception and only return EMULATE_FAIL if that fails, being careful to revert the PC first in case the exception occurred in a branch delay slot in which case the PC will already point to the branch target. Also turn the printk messages relating to these cases into kvm_debug messages so that they aren't usually visible. 
This allows crashme to run in the guest without killing the entire VM. Signed-off-by: James Hogan Cc: Ralf Baechle Cc: Gleb Natapov Cc: Paolo Bonzini Cc: Sanjay Lal Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini diff --git a/arch/mips/kvm/kvm_mips_emul.c b/arch/mips/kvm/kvm_mips_emul.c index 4b6274b..e75ef82 100644 --- a/arch/mips/kvm/kvm_mips_emul.c +++ b/arch/mips/kvm/kvm_mips_emul.c @@ -1571,17 +1571,17 @@ kvm_mips_handle_ri(unsigned long cause, uint32_t *opc, arch->gprs[rt] = kvm_read_c0_guest_userlocal(cop0); #else /* UserLocal not implemented */ - er = kvm_mips_emulate_ri_exc(cause, opc, run, vcpu); + er = EMULATE_FAIL; #endif break; default: - printk("RDHWR not supported\n"); + kvm_debug("RDHWR %#x not supported @ %p\n", rd, opc); er = EMULATE_FAIL; break; } } else { - printk("Emulate RI not supported @ %p: %#x\n", opc, inst); + kvm_debug("Emulate RI not supported @ %p: %#x\n", opc, inst); er = EMULATE_FAIL; } @@ -1590,6 +1590,7 @@ kvm_mips_handle_ri(unsigned long cause, uint32_t *opc, */ if (er == EMULATE_FAIL) { vcpu->arch.pc = curr_pc; + er = kvm_mips_emulate_ri_exc(cause, opc, run, vcpu); } return er; } -- cgit v0.10.2 From 26f4f3b57862642296a2e613674e7f00d91c022f Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 14 Mar 2014 13:06:09 +0000 Subject: MIPS: KVM: Consult HWREna before emulating RDHWR The ability to read hardware registers from userland with the RDHWR instruction should depend upon the corresponding bit of the HWREna register being set, otherwise a reserved instruction exception should be generated. However KVM's current emulation ignores the guest's HWREna and always emulates RDHWR instructions even if the guest OS has disallowed them. Therefore rework the RDHWR emulation code to check for privilege or the corresponding bit in the guest HWREna bit. Also remove the #if 0 case for the UserLocal register. 
I presume it was there for debug purposes but it seems unnecessary now that the guest can control whether it causes a guest exception. Signed-off-by: James Hogan Cc: Ralf Baechle Cc: Gleb Natapov Cc: Paolo Bonzini Cc: Sanjay Lal Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 502c8da..060aaa6 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -414,6 +414,8 @@ struct kvm_vcpu_arch { #define kvm_write_c0_guest_pagemask(cop0, val) (cop0->reg[MIPS_CP0_TLB_PG_MASK][0] = (val)) #define kvm_read_c0_guest_wired(cop0) (cop0->reg[MIPS_CP0_TLB_WIRED][0]) #define kvm_write_c0_guest_wired(cop0, val) (cop0->reg[MIPS_CP0_TLB_WIRED][0] = (val)) +#define kvm_read_c0_guest_hwrena(cop0) (cop0->reg[MIPS_CP0_HWRENA][0]) +#define kvm_write_c0_guest_hwrena(cop0, val) (cop0->reg[MIPS_CP0_HWRENA][0] = (val)) #define kvm_read_c0_guest_badvaddr(cop0) (cop0->reg[MIPS_CP0_BAD_VADDR][0]) #define kvm_write_c0_guest_badvaddr(cop0, val) (cop0->reg[MIPS_CP0_BAD_VADDR][0] = (val)) #define kvm_read_c0_guest_count(cop0) (cop0->reg[MIPS_CP0_COUNT][0]) diff --git a/arch/mips/kvm/kvm_mips_emul.c b/arch/mips/kvm/kvm_mips_emul.c index e75ef82..d562572 100644 --- a/arch/mips/kvm/kvm_mips_emul.c +++ b/arch/mips/kvm/kvm_mips_emul.c @@ -1542,8 +1542,15 @@ kvm_mips_handle_ri(unsigned long cause, uint32_t *opc, } if ((inst & OPCODE) == SPEC3 && (inst & FUNC) == RDHWR) { + int usermode = !KVM_GUEST_KERNEL_MODE(vcpu); int rd = (inst & RD) >> 11; int rt = (inst & RT) >> 16; + /* If usermode, check RDHWR rd is allowed by guest HWREna */ + if (usermode && !(kvm_read_c0_guest_hwrena(cop0) & BIT(rd))) { + kvm_debug("RDHWR %#x disallowed by HWREna @ %p\n", + rd, opc); + goto emulate_ri; + } switch (rd) { case 0: /* CPU number */ arch->gprs[rt] = 0; @@ -1567,32 +1574,27 @@ kvm_mips_handle_ri(unsigned long cause, uint32_t *opc, } break; case 29: -#if 1 
arch->gprs[rt] = kvm_read_c0_guest_userlocal(cop0); -#else - /* UserLocal not implemented */ - er = EMULATE_FAIL; -#endif break; default: kvm_debug("RDHWR %#x not supported @ %p\n", rd, opc); - er = EMULATE_FAIL; - break; + goto emulate_ri; } } else { kvm_debug("Emulate RI not supported @ %p: %#x\n", opc, inst); - er = EMULATE_FAIL; + goto emulate_ri; } + return EMULATE_DONE; + +emulate_ri: /* - * Rollback PC only if emulation was unsuccessful + * Rollback PC (if in branch delay slot then the PC already points to + * branch target), and pass the RI exception to the guest OS. */ - if (er == EMULATE_FAIL) { - vcpu->arch.pc = curr_pc; - er = kvm_mips_emulate_ri_exc(cause, opc, run, vcpu); - } - return er; + vcpu->arch.pc = curr_pc; + return kvm_mips_emulate_ri_exc(cause, opc, run, vcpu); } enum emulation_result -- cgit v0.10.2 From 36c95494609cd6d2541c08e806b4d6bc401bc53f Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 14 Mar 2014 13:06:10 +0000 Subject: MIPS: KVM: Remove dead code in CP0 emulation The code to check whether rd > MIPS_CP0_DESAVE is dead code, since MIPS_CP0_DESAVE = 31 and rd is already masked with 0x1f. Remove it. 
Signed-off-by: James Hogan Cc: Ralf Baechle Cc: Gleb Natapov Cc: Paolo Bonzini Cc: Sanjay Lal Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini diff --git a/arch/mips/kvm/kvm_mips_emul.c b/arch/mips/kvm/kvm_mips_emul.c index d562572..e3fec99 100644 --- a/arch/mips/kvm/kvm_mips_emul.c +++ b/arch/mips/kvm/kvm_mips_emul.c @@ -436,13 +436,6 @@ kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, uint32_t cause, sel = inst & 0x7; co_bit = (inst >> 25) & 1; - /* Verify that the register is valid */ - if (rd > MIPS_CP0_DESAVE) { - printk("Invalid rd: %d\n", rd); - er = EMULATE_FAIL; - goto done; - } - if (co_bit) { op = (inst) & 0xff; -- cgit v0.10.2 From 0b10a1c87a2b0fb459baaefba9cb163dbb8d3344 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 18 Mar 2014 11:51:29 +0100 Subject: KVM: ioapic: merge ioapic_deliver into ioapic_service Commonize the handling of masking, which was absent for kvm_ioapic_set_irq. Setting remote_irr does not need a separate function either, and merging the two functions avoids confusion. Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 1539d37..0b49141 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -50,7 +50,7 @@ #else #define ioapic_debug(fmt, arg...) 
#endif -static int ioapic_deliver(struct kvm_ioapic *vioapic, int irq, +static int ioapic_service(struct kvm_ioapic *vioapic, int irq, bool line_status); static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, @@ -163,23 +163,6 @@ static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic) return false; } -static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx, - bool line_status) -{ - union kvm_ioapic_redirect_entry *pent; - int injected = -1; - - pent = &ioapic->redirtbl[idx]; - - if (!pent->fields.mask) { - injected = ioapic_deliver(ioapic, idx, line_status); - if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG) - pent->fields.remote_irr = 1; - } - - return injected; -} - static void update_handled_vectors(struct kvm_ioapic *ioapic) { DECLARE_BITMAP(handled_vectors, 256); @@ -282,12 +265,15 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) } } -static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq, bool line_status) +static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) { union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq]; struct kvm_lapic_irq irqe; int ret; + if (entry->fields.mask) + return -1; + ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " "vector=%x trig_mode=%x\n", entry->fields.dest_id, entry->fields.dest_mode, @@ -310,6 +296,9 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq, bool line_status) } else ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL); + if (ret && irqe.trig_mode == IOAPIC_LEVEL_TRIG) + entry->fields.remote_irr = 1; + return ret; } @@ -393,7 +382,7 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); ent->fields.remote_irr = 0; - if (!ent->fields.mask && (ioapic->irr & (1 << i))) + if (ioapic->irr & (1 << i)) ioapic_service(ioapic, i, false); } } -- cgit v0.10.2 From 0bc830b05c667218d703f2026ec866c49df974fc Mon Sep 17 00:00:00 2001 From: 
Paolo Bonzini Date: Tue, 18 Mar 2014 10:47:17 +0100 Subject: KVM: ioapic: clear IRR for edge-triggered interrupts at delivery This ensures that IRR bits are set in the KVM_GET_IRQCHIP result only if the interrupt is still sitting in the IOAPIC. After the next patches, it avoids spurious reinjection of the interrupt when KVM_SET_IRQCHIP is called. Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 0b49141..25e16a6 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -288,6 +288,9 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) irqe.level = 1; irqe.shorthand = 0; + if (irqe.trig_mode == IOAPIC_EDGE_TRIG) + ioapic->irr &= ~(1 << irq); + if (irq == RTC_GSI && line_status) { BUG_ON(ioapic->rtc_status.pending_eoi != 0); ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, -- cgit v0.10.2 From 44847dea79751e95665a439f8c63a65e51da8e1f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 18 Mar 2014 12:00:14 +0100 Subject: KVM: ioapic: extract body of kvm_ioapic_set_irq We will reuse it to process a nonzero IRR that is passed to KVM_SET_IRQCHIP. Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 25e16a6..270f7fe 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -163,6 +163,55 @@ static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic) return false; } +static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq, + int irq_level, bool line_status) +{ + union kvm_ioapic_redirect_entry entry; + u32 mask = 1 << irq; + u32 old_irr; + int edge, ret; + + entry = ioapic->redirtbl[irq]; + edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); + + if (!irq_level) { + ioapic->irr &= ~mask; + ret = 1; + goto out; + } + + /* + * Return 0 for coalesced interrupts; for edge-triggered interrupts, + * this only happens if a previous edge has not been delivered due + * to masking.
For level interrupts, the remote_irr field tells + * us if the interrupt is waiting for an EOI. + * + * RTC is special: it is edge-triggered, but userspace likes to know + * if it has been already ack-ed via EOI because coalesced RTC + * interrupts lead to time drift in Windows guests. So we track + * EOI manually for the RTC interrupt. + */ + if (irq == RTC_GSI && line_status && + rtc_irq_check_coalesced(ioapic)) { + ret = 0; + goto out; + } + + old_irr = ioapic->irr; + ioapic->irr |= mask; + if ((edge && old_irr == ioapic->irr) || + (!edge && entry.fields.remote_irr)) { + ret = 0; + goto out; + } + + ret = ioapic_service(ioapic, irq, line_status); + +out: + trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); + return ret; +} + static void update_handled_vectors(struct kvm_ioapic *ioapic) { DECLARE_BITMAP(handled_vectors, 256); @@ -308,38 +357,15 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, int level, bool line_status) { - u32 old_irr; - u32 mask = 1 << irq; - union kvm_ioapic_redirect_entry entry; int ret, irq_level; BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS); spin_lock(&ioapic->lock); - old_irr = ioapic->irr; irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq], irq_source_id, level); - entry = ioapic->redirtbl[irq]; - if (!irq_level) { - ioapic->irr &= ~mask; - ret = 1; - } else { - int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); + ret = ioapic_set_irq(ioapic, irq, irq_level, line_status); - if (irq == RTC_GSI && line_status && - rtc_irq_check_coalesced(ioapic)) { - ret = 0; /* coalesced */ - goto out; - } - ioapic->irr |= mask; - if ((edge && old_irr != ioapic->irr) || - (!edge && !entry.fields.remote_irr)) - ret = ioapic_service(ioapic, irq, line_status); - else - ret = 0; /* report coalesced interrupt */ - } -out: - trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); spin_unlock(&ioapic->lock); return ret; -- cgit v0.10.2 From 
d938dc55225a7212e7f31c5a8571da304cc3de16 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Wed, 23 Oct 2013 18:26:34 +0200 Subject: KVM: Add per-vm capability enablement. Allow KVM_ENABLE_CAP to act on a vm as well as on a vcpu. This makes more sense when the caller wants to enable a vm-related capability. s390 will be the first user; wire it up. Reviewed-by: Thomas Huth Reviewed-by: Christian Borntraeger Signed-off-by: Cornelia Huck diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 4714f28..faf6fe9 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -932,9 +932,9 @@ documentation when it pops into existence). 4.37 KVM_ENABLE_CAP -Capability: KVM_CAP_ENABLE_CAP +Capability: KVM_CAP_ENABLE_CAP, KVM_CAP_ENABLE_CAP_VM Architectures: ppc, s390 -Type: vcpu ioctl +Type: vcpu ioctl, vm ioctl (with KVM_CAP_ENABLE_CAP_VM) Parameters: struct kvm_enable_cap (in) Returns: 0 on success; -1 on error @@ -965,6 +965,8 @@ function properly, this is the place to put them. __u8 pad[64]; }; +The vcpu ioctl should be used for vcpu-specific capabilities, the vm ioctl +for vm-wide capabilities. 
4.38 KVM_GET_MP_STATE diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 7337c57..9f1e99f 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -159,6 +159,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_S390_CSS_SUPPORT: case KVM_CAP_IOEVENTFD: case KVM_CAP_DEVICE_CTRL: + case KVM_CAP_ENABLE_CAP_VM: r = 1; break; case KVM_CAP_NR_VCPUS: @@ -187,6 +188,21 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, return 0; } +static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) +{ + int r; + + if (cap->flags) + return -EINVAL; + + switch (cap->cap) { + default: + r = -EINVAL; + break; + } + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -204,6 +220,14 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_s390_inject_vm(kvm, &s390int); break; } + case KVM_ENABLE_CAP: { + struct kvm_enable_cap cap; + r = -EFAULT; + if (copy_from_user(&cap, argp, sizeof(cap))) + break; + r = kvm_vm_ioctl_enable_cap(kvm, &cap); + break; + } default: r = -ENOTTY; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a7518be..46ea1b4 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -741,6 +741,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_EXT_EMUL_CPUID 95 #define KVM_CAP_HYPERV_TIME 96 #define KVM_CAP_IOAPIC_POLARITY_IGNORED 97 +#define KVM_CAP_ENABLE_CAP_VM 98 #ifdef KVM_CAP_IRQ_ROUTING @@ -1076,6 +1077,10 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_DEBUGREGS */ #define KVM_GET_DEBUGREGS _IOR(KVMIO, 0xa1, struct kvm_debugregs) #define KVM_SET_DEBUGREGS _IOW(KVMIO, 0xa2, struct kvm_debugregs) +/* + * vcpu version available with KVM_ENABLE_CAP + * vm version available with KVM_CAP_ENABLE_CAP_VM + */ #define KVM_ENABLE_CAP _IOW(KVMIO, 0xa3, struct kvm_enable_cap) /* Available with KVM_CAP_XSAVE */ #define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave) -- cgit v0.10.2 From 
841b91c584b6d1e2a2cb508bd2d0236cd37e1750 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Mon, 15 Jul 2013 13:36:01 +0200 Subject: KVM: s390: adapter interrupt sources Add a new interface to register/deregister sources of adapter interrupts identified by an unique id via the flic. Adapters may also be maskable and carry a list of pinned pages. These adapters will be used by irq routing later. Acked-by: Christian Borntraeger Signed-off-by: Cornelia Huck diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt index 410fa67..4ceef53 100644 --- a/Documentation/virtual/kvm/devices/s390_flic.txt +++ b/Documentation/virtual/kvm/devices/s390_flic.txt @@ -12,6 +12,7 @@ FLIC provides support to - inspect currently pending interrupts (KVM_FLIC_GET_ALL_IRQS) - purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS) - enable/disable for the guest transparent async page faults +- register and modify adapter interrupt sources (KVM_DEV_FLIC_ADAPTER_*) Groups: KVM_DEV_FLIC_ENQUEUE @@ -44,3 +45,47 @@ Groups: Disables async page faults for the guest and waits until already pending async page faults are done. This is necessary to trigger a completion interrupt for every init interrupt before migrating the interrupt list. + + KVM_DEV_FLIC_ADAPTER_REGISTER + Register an I/O adapter interrupt source. Takes a kvm_s390_io_adapter + describing the adapter to register: + +struct kvm_s390_io_adapter { + __u32 id; + __u8 isc; + __u8 maskable; + __u8 swap; + __u8 pad; +}; + + id contains the unique id for the adapter, isc the I/O interruption subclass + to use, maskable whether this adapter may be masked (interrupts turned off) + and swap whether the indicators need to be byte swapped. + + + KVM_DEV_FLIC_ADAPTER_MODIFY + Modifies attributes of an existing I/O adapter interrupt source. 
Takes + a kvm_s390_io_adapter_req specifying the adapter and the operation: + +struct kvm_s390_io_adapter_req { + __u32 id; + __u8 type; + __u8 mask; + __u16 pad0; + __u64 addr; +}; + + id specifies the adapter and type the operation. The supported operations + are: + + KVM_S390_IO_ADAPTER_MASK + mask or unmask the adapter, as specified in mask + + KVM_S390_IO_ADAPTER_MAP + perform a gmap translation for the guest address provided in addr, + pin a userspace page for the translated address and add it to the + list of mappings + + KVM_S390_IO_ADAPTER_UNMAP + release a userspace page for the translated address specified in addr + from the list of mappings diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 734d302..0d52352 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -19,6 +19,7 @@ #include #include #include +#include #define KVM_MAX_VCPUS 64 #define KVM_USER_MEM_SLOTS 32 @@ -245,6 +246,27 @@ struct kvm_vm_stat { struct kvm_arch_memory_slot { }; +struct s390_map_info { + struct list_head list; + __u64 guest_addr; + __u64 addr; + struct page *page; +}; + +struct s390_io_adapter { + unsigned int id; + int isc; + bool maskable; + bool masked; + bool swap; + struct rw_semaphore maps_lock; + struct list_head maps; + atomic_t nr_maps; +}; + +#define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8) +#define MAX_S390_ADAPTER_MAPS 256 + struct kvm_arch{ struct sca_block *sca; debug_info_t *dbf; @@ -252,6 +274,7 @@ struct kvm_arch{ struct kvm_device *flic; struct gmap *gmap; int css_support; + struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS]; }; #define KVM_HVA_ERR_BAD (-1UL) diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 2f0ade2..c003c6a 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -22,6 +22,8 @@ #define KVM_DEV_FLIC_CLEAR_IRQS 3 #define KVM_DEV_FLIC_APF_ENABLE 4 #define KVM_DEV_FLIC_APF_DISABLE_WAIT 5 +#define
KVM_DEV_FLIC_ADAPTER_REGISTER 6 +#define KVM_DEV_FLIC_ADAPTER_MODIFY 7 /* * We can have up to 4*64k pending subchannels + 8 adapter interrupts, * as well as up to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts. @@ -32,6 +34,26 @@ #define KVM_S390_MAX_FLOAT_IRQS 266250 #define KVM_S390_FLIC_MAX_BUFFER 0x2000000 +struct kvm_s390_io_adapter { + __u32 id; + __u8 isc; + __u8 maskable; + __u8 swap; + __u8 pad; +}; + +#define KVM_S390_IO_ADAPTER_MASK 1 +#define KVM_S390_IO_ADAPTER_MAP 2 +#define KVM_S390_IO_ADAPTER_UNMAP 3 + +struct kvm_s390_io_adapter_req { + __u32 id; + __u8 type; + __u8 mask; + __u16 pad0; + __u64 addr; +}; + /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { /* general purpose regs for s390 */ diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 79d2e4f..7ecef5a 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -1,7 +1,7 @@ /* * handling kvm guest interrupts * - * Copyright IBM Corp. 2008 + * Copyright IBM Corp. 2008,2014 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License (version 2 only) @@ -1054,6 +1054,171 @@ static int enqueue_floating_irq(struct kvm_device *dev, return r; } +static struct s390_io_adapter *get_io_adapter(struct kvm *kvm, unsigned int id) +{ + if (id >= MAX_S390_IO_ADAPTERS) + return NULL; + return kvm->arch.adapters[id]; +} + +static int register_io_adapter(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + struct s390_io_adapter *adapter; + struct kvm_s390_io_adapter adapter_info; + + if (copy_from_user(&adapter_info, + (void __user *)attr->addr, sizeof(adapter_info))) + return -EFAULT; + + if ((adapter_info.id >= MAX_S390_IO_ADAPTERS) || + (dev->kvm->arch.adapters[adapter_info.id] != NULL)) + return -EINVAL; + + adapter = kzalloc(sizeof(*adapter), GFP_KERNEL); + if (!adapter) + return -ENOMEM; + + INIT_LIST_HEAD(&adapter->maps); + init_rwsem(&adapter->maps_lock); + 
atomic_set(&adapter->nr_maps, 0); + adapter->id = adapter_info.id; + adapter->isc = adapter_info.isc; + adapter->maskable = adapter_info.maskable; + adapter->masked = false; + adapter->swap = adapter_info.swap; + dev->kvm->arch.adapters[adapter->id] = adapter; + + return 0; +} + +int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked) +{ + int ret; + struct s390_io_adapter *adapter = get_io_adapter(kvm, id); + + if (!adapter || !adapter->maskable) + return -EINVAL; + ret = adapter->masked; + adapter->masked = masked; + return ret; +} + +static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr) +{ + struct s390_io_adapter *adapter = get_io_adapter(kvm, id); + struct s390_map_info *map; + int ret; + + if (!adapter || !addr) + return -EINVAL; + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (!map) { + ret = -ENOMEM; + goto out; + } + INIT_LIST_HEAD(&map->list); + map->guest_addr = addr; + map->addr = gmap_translate(addr, kvm->arch.gmap); + if (map->addr == -EFAULT) { + ret = -EFAULT; + goto out; + } + ret = get_user_pages_fast(map->addr, 1, 1, &map->page); + if (ret < 0) + goto out; + BUG_ON(ret != 1); + down_write(&adapter->maps_lock); + if (atomic_inc_return(&adapter->nr_maps) < MAX_S390_ADAPTER_MAPS) { + list_add_tail(&map->list, &adapter->maps); + ret = 0; + } else { + put_page(map->page); + ret = -EINVAL; + } + up_write(&adapter->maps_lock); +out: + if (ret) + kfree(map); + return ret; +} + +static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr) +{ + struct s390_io_adapter *adapter = get_io_adapter(kvm, id); + struct s390_map_info *map, *tmp; + int found = 0; + + if (!adapter || !addr) + return -EINVAL; + + down_write(&adapter->maps_lock); + list_for_each_entry_safe(map, tmp, &adapter->maps, list) { + if (map->guest_addr == addr) { + found = 1; + atomic_dec(&adapter->nr_maps); + list_del(&map->list); + put_page(map->page); + kfree(map); + break; + } + } + up_write(&adapter->maps_lock); + + return 
found ? 0 : -EINVAL; +} + +void kvm_s390_destroy_adapters(struct kvm *kvm) +{ + int i; + struct s390_map_info *map, *tmp; + + for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) { + if (!kvm->arch.adapters[i]) + continue; + list_for_each_entry_safe(map, tmp, + &kvm->arch.adapters[i]->maps, list) { + list_del(&map->list); + put_page(map->page); + kfree(map); + } + kfree(kvm->arch.adapters[i]); + } +} + +static int modify_io_adapter(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + struct kvm_s390_io_adapter_req req; + struct s390_io_adapter *adapter; + int ret; + + if (copy_from_user(&req, (void __user *)attr->addr, sizeof(req))) + return -EFAULT; + + adapter = get_io_adapter(dev->kvm, req.id); + if (!adapter) + return -EINVAL; + switch (req.type) { + case KVM_S390_IO_ADAPTER_MASK: + ret = kvm_s390_mask_adapter(dev->kvm, req.id, req.mask); + if (ret > 0) + ret = 0; + break; + case KVM_S390_IO_ADAPTER_MAP: + ret = kvm_s390_adapter_map(dev->kvm, req.id, req.addr); + break; + case KVM_S390_IO_ADAPTER_UNMAP: + ret = kvm_s390_adapter_unmap(dev->kvm, req.id, req.addr); + break; + default: + ret = -EINVAL; + } + + return ret; +} + static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { int r = 0; @@ -1082,6 +1247,12 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) kvm_for_each_vcpu(i, vcpu, dev->kvm) kvm_clear_async_pf_completion_queue(vcpu); break; + case KVM_DEV_FLIC_ADAPTER_REGISTER: + r = register_io_adapter(dev, attr); + break; + case KVM_DEV_FLIC_ADAPTER_MODIFY: + r = modify_io_adapter(dev, attr); + break; default: r = -EINVAL; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 9f1e99f..2e6fbb0 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -343,6 +343,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) debug_unregister(kvm->arch.dbf); if (!kvm_is_ucontrol(kvm)) gmap_free(kvm->arch.gmap); + kvm_s390_destroy_adapters(kvm); } /* Section: vcpu related */ diff --git 
a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index ed4750a..5502cc95 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -136,6 +136,7 @@ int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, int __must_check kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, u64 cr6, u64 schid); +int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked); /* implemented in priv.c */ int kvm_s390_handle_b2(struct kvm_vcpu *vcpu); @@ -162,5 +163,6 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); /* implemented in interrupt.c */ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int psw_extint_disabled(struct kvm_vcpu *vcpu); +void kvm_s390_destroy_adapters(struct kvm *kvm); #endif -- cgit v0.10.2 From 84223598778ba08041f4297fda485df83414d57e Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Mon, 15 Jul 2013 13:36:01 +0200 Subject: KVM: s390: irq routing for adapter interrupts. Introduce a new interrupt class for s390 adapter interrupts and enable irqfds for s390. This is depending on a new s390 specific vm capability, KVM_CAP_S390_IRQCHIP, that needs to be enabled by userspace. Acked-by: Christian Borntraeger Signed-off-by: Cornelia Huck diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index faf6fe9..2cb1640 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -586,8 +586,8 @@ struct kvm_fpu { 4.24 KVM_CREATE_IRQCHIP -Capability: KVM_CAP_IRQCHIP -Architectures: x86, ia64, ARM, arm64 +Capability: KVM_CAP_IRQCHIP, KVM_CAP_S390_IRQCHIP (s390) +Architectures: x86, ia64, ARM, arm64, s390 Type: vm ioctl Parameters: none Returns: 0 on success, -1 on error @@ -596,7 +596,10 @@ Creates an interrupt controller model in the kernel. On x86, creates a virtual ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to have a local APIC. 
IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC; GSI 16-23 only go to the IOAPIC. On ia64, a IOSAPIC is created. On ARM/arm64, a GIC is -created. +created. On s390, a dummy irq routing table is created. + +Note that on s390 the KVM_CAP_S390_IRQCHIP vm capability needs to be enabled +before KVM_CREATE_IRQCHIP can be used. 4.25 KVM_IRQ_LINE @@ -1336,7 +1339,7 @@ KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or guest IRQ is allowed. 4.52 KVM_SET_GSI_ROUTING Capability: KVM_CAP_IRQ_ROUTING -Architectures: x86 ia64 +Architectures: x86 ia64 s390 Type: vm ioctl Parameters: struct kvm_irq_routing (in) Returns: 0 on success, -1 on error @@ -1359,6 +1362,7 @@ struct kvm_irq_routing_entry { union { struct kvm_irq_routing_irqchip irqchip; struct kvm_irq_routing_msi msi; + struct kvm_irq_routing_s390_adapter adapter; __u32 pad[8]; } u; }; @@ -1366,6 +1370,7 @@ struct kvm_irq_routing_entry { /* gsi routing entry types */ #define KVM_IRQ_ROUTING_IRQCHIP 1 #define KVM_IRQ_ROUTING_MSI 2 +#define KVM_IRQ_ROUTING_S390_ADAPTER 3 No flags are specified so far, the corresponding field must be set to zero. @@ -1381,6 +1386,14 @@ struct kvm_irq_routing_msi { __u32 pad; }; +struct kvm_irq_routing_s390_adapter { + __u64 ind_addr; + __u64 summary_addr; + __u64 ind_offset; + __u32 summary_offset; + __u32 adapter_id; +}; + 4.53 KVM_ASSIGN_SET_MSIX_NR diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 0d52352..dd39337 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -24,6 +24,14 @@ #define KVM_MAX_VCPUS 64 #define KVM_USER_MEM_SLOTS 32 +/* + * These seem to be used for allocating ->chip in the routing table, + * which we don't use. 4096 is an out-of-thin-air value. If we need + * to look at ->chip later on, we'll need to revisit this. 
+ */ +#define KVM_NR_IRQCHIPS 1 +#define KVM_IRQCHIP_NUM_PINS 4096 + struct sca_entry { atomic_t scn; __u32 reserved; @@ -274,6 +282,7 @@ struct kvm_arch{ struct kvm_device *flic; struct gmap *gmap; int css_support; + int use_irqchip; struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS]; }; diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index c8bacbc..10d529a 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -25,6 +25,8 @@ config KVM select HAVE_KVM_EVENTFD select KVM_ASYNC_PF select KVM_ASYNC_PF_SYNC + select HAVE_KVM_IRQCHIP + select HAVE_KVM_IRQ_ROUTING ---help--- Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index a47d2c3..d3adb37 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -7,7 +7,7 @@ # as published by the Free Software Foundation. KVM := ../../../virt/kvm -common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o +common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqchip.o ccflags-y := -Ivirt/kvm -Iarch/s390/kvm diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 7ecef5a..2e2814e 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -1284,3 +1285,123 @@ struct kvm_device_ops kvm_flic_ops = { .create = flic_create, .destroy = flic_destroy, }; + +static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap) +{ + unsigned long bit; + + bit = bit_nr + (addr % PAGE_SIZE) * 8; + + return swap ? 
(bit ^ (BITS_PER_LONG - 1)) : bit; +} + +static struct s390_map_info *get_map_info(struct s390_io_adapter *adapter, + u64 addr) +{ + struct s390_map_info *map; + + if (!adapter) + return NULL; + + list_for_each_entry(map, &adapter->maps, list) { + if (map->guest_addr == addr) + return map; + } + return NULL; +} + +static int adapter_indicators_set(struct kvm *kvm, + struct s390_io_adapter *adapter, + struct kvm_s390_adapter_int *adapter_int) +{ + unsigned long bit; + int summary_set, idx; + struct s390_map_info *info; + void *map; + + info = get_map_info(adapter, adapter_int->ind_addr); + if (!info) + return -1; + map = page_address(info->page); + bit = get_ind_bit(info->addr, adapter_int->ind_offset, adapter->swap); + set_bit(bit, map); + idx = srcu_read_lock(&kvm->srcu); + mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT); + set_page_dirty_lock(info->page); + info = get_map_info(adapter, adapter_int->summary_addr); + if (!info) { + srcu_read_unlock(&kvm->srcu, idx); + return -1; + } + map = page_address(info->page); + bit = get_ind_bit(info->addr, adapter_int->summary_offset, + adapter->swap); + summary_set = test_and_set_bit(bit, map); + mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT); + set_page_dirty_lock(info->page); + srcu_read_unlock(&kvm->srcu, idx); + return summary_set ? 0 : 1; +} + +/* + * < 0 - not injected due to error + * = 0 - coalesced, summary indicator already active + * > 0 - injected interrupt + */ +static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + int ret; + struct s390_io_adapter *adapter; + + /* We're only interested in the 0->1 transition. 
*/ + if (!level) + return 0; + adapter = get_io_adapter(kvm, e->adapter.adapter_id); + if (!adapter) + return -1; + down_read(&adapter->maps_lock); + ret = adapter_indicators_set(kvm, adapter, &e->adapter); + up_read(&adapter->maps_lock); + if ((ret > 0) && !adapter->masked) { + struct kvm_s390_interrupt s390int = { + .type = KVM_S390_INT_IO(1, 0, 0, 0), + .parm = 0, + .parm64 = (adapter->isc << 27) | 0x80000000, + }; + ret = kvm_s390_inject_vm(kvm, &s390int); + if (ret == 0) + ret = 1; + } + return ret; +} + +int kvm_set_routing_entry(struct kvm_irq_routing_table *rt, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + int ret; + + switch (ue->type) { + case KVM_IRQ_ROUTING_S390_ADAPTER: + e->set = set_adapter_int; + e->adapter.summary_addr = ue->u.adapter.summary_addr; + e->adapter.ind_addr = ue->u.adapter.ind_addr; + e->adapter.summary_offset = ue->u.adapter.summary_offset; + e->adapter.ind_offset = ue->u.adapter.ind_offset; + e->adapter.adapter_id = ue->u.adapter.adapter_id; + ret = 0; + break; + default: + ret = -EINVAL; + } + + return ret; +} + +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status) +{ + return -EINVAL; +} diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h new file mode 100644 index 0000000..d98e415 --- /dev/null +++ b/arch/s390/kvm/irq.h @@ -0,0 +1,22 @@ +/* + * s390 irqchip routines + * + * Copyright IBM Corp. 2014 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2 only) + * as published by the Free Software Foundation. 
+ * + * Author(s): Cornelia Huck + */ +#ifndef __KVM_IRQ_H +#define __KVM_IRQ_H + +#include + +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + return 1; +} + +#endif diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 2e6fbb0..ce5b659 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -196,6 +196,10 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) return -EINVAL; switch (cap->cap) { + case KVM_CAP_S390_IRQCHIP: + kvm->arch.use_irqchip = 1; + r = 0; + break; default: r = -EINVAL; break; @@ -228,6 +232,18 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_enable_cap(kvm, &cap); break; } + case KVM_CREATE_IRQCHIP: { + struct kvm_irq_routing_entry routing; + + r = -EINVAL; + if (kvm->arch.use_irqchip) { + /* Set up dummy routing. */ + memset(&routing, 0, sizeof(routing)); + kvm_set_irq_routing(kvm, &routing, 0, 0); + r = 0; + } + break; + } default: r = -ENOTTY; } @@ -284,6 +300,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) } kvm->arch.css_support = 0; + kvm->arch.use_irqchip = 0; return 0; out_nogmap: diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9816b68..da7510b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -297,6 +297,14 @@ static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memsl return ALIGN(memslot->npages, BITS_PER_LONG) / 8; } +struct kvm_s390_adapter_int { + u64 ind_addr; + u64 summary_addr; + u64 ind_offset; + u32 summary_offset; + u32 adapter_id; +}; + struct kvm_kernel_irq_routing_entry { u32 gsi; u32 type; @@ -309,6 +317,7 @@ struct kvm_kernel_irq_routing_entry { unsigned pin; } irqchip; struct msi_msg msi; + struct kvm_s390_adapter_int adapter; }; struct hlist_node link; }; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 46ea1b4..a8f4ee5 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -742,6 +742,7 @@ struct kvm_ppc_smmu_info 
{ #define KVM_CAP_HYPERV_TIME 96 #define KVM_CAP_IOAPIC_POLARITY_IGNORED 97 #define KVM_CAP_ENABLE_CAP_VM 98 +#define KVM_CAP_S390_IRQCHIP 99 #ifdef KVM_CAP_IRQ_ROUTING @@ -757,9 +758,18 @@ struct kvm_irq_routing_msi { __u32 pad; }; +struct kvm_irq_routing_s390_adapter { + __u64 ind_addr; + __u64 summary_addr; + __u64 ind_offset; + __u32 summary_offset; + __u32 adapter_id; +}; + /* gsi routing entry types */ #define KVM_IRQ_ROUTING_IRQCHIP 1 #define KVM_IRQ_ROUTING_MSI 2 +#define KVM_IRQ_ROUTING_S390_ADAPTER 3 struct kvm_irq_routing_entry { __u32 gsi; @@ -769,6 +779,7 @@ struct kvm_irq_routing_entry { union { struct kvm_irq_routing_irqchip irqchip; struct kvm_irq_routing_msi msi; + struct kvm_irq_routing_s390_adapter adapter; __u32 pad[8]; } u; }; -- cgit v0.10.2 From f3f710bc64e121c10c67ce58c893d3bc8c72abe4 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Tue, 25 Feb 2014 12:48:01 +0100 Subject: KVM: Bump KVM_MAX_IRQ_ROUTES for s390 The maximum number for irq routes is currently 1024, which is a bit on the small side for s390: We support up to 4 x 64k virtual devices with up to 64 queues, and we need one route for each of the queues if we want to operate it via irqfd. Let's bump this to 4k on s390 for now, as this at least covers the saner setups. We need to find a more general solution, though, as we can't just grow the routing table indefinitely. Acked-by: Paolo Bonzini Signed-off-by: Cornelia Huck diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index da7510b..7d21cf9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -922,7 +922,11 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING +#ifdef CONFIG_S390 +#define KVM_MAX_IRQ_ROUTES 4096 //FIXME: we can have more than that...
+#else #define KVM_MAX_IRQ_ROUTES 1024 +#endif int kvm_setup_default_irq_routing(struct kvm *kvm); int kvm_set_irq_routing(struct kvm *kvm, -- cgit v0.10.2 From 673f7b4257a1fe7b181e1a1182ecc2b6b2b795f1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 18 Mar 2014 11:39:23 +0100 Subject: KVM: ioapic: reinject pending interrupts on KVM_SET_IRQCHIP After the previous patches, an interrupt whose bit is set in the IRR register will never be in the LAPIC's IRR and has never been injected on the migration source. So inject it on the destination. This fixes migration of Windows guests without HPET (they use the RTC to trigger the scheduler tick, and lose it after migration). Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 270f7fe..d4b6015 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -212,6 +212,18 @@ out: return ret; } +static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr) +{ + u32 idx; + + rtc_irq_eoi_tracking_reset(ioapic); + for_each_set_bit(idx, &irr, IOAPIC_NUM_PINS) + ioapic_set_irq(ioapic, idx, 1, true); + + kvm_rtc_eoi_tracking_restore_all(ioapic); +} + + static void update_handled_vectors(struct kvm_ioapic *ioapic) { DECLARE_BITMAP(handled_vectors, 256); @@ -612,9 +624,10 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) spin_lock(&ioapic->lock); memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); + ioapic->irr = 0; update_handled_vectors(ioapic); kvm_vcpu_request_scan_ioapic(kvm); - kvm_rtc_eoi_tracking_restore_all(ioapic); + kvm_ioapic_inject_all(ioapic, state->irr); spin_unlock(&ioapic->lock); return 0; } -- cgit v0.10.2 From f6c137ff00a478ae619deea8650829dd2f8e71b9 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 19 Mar 2014 11:18:29 +0100 Subject: KVM: s390: randomize sca address We allocate a page for the 2k sca, so lets use the space to improve hit rate of some internal cpu caches. 
No need to change the freeing of the page, as this will shift away the page offset bits anyway. Signed-off-by: Christian Borntraeger Reviewed-by: David Hildenbrand diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 7337c57..a02979f 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -215,6 +215,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int rc; char debug_name[16]; + static unsigned long sca_offset; rc = -EINVAL; #ifdef CONFIG_KVM_S390_UCONTROL @@ -236,6 +237,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL); if (!kvm->arch.sca) goto out_err; + spin_lock(&kvm_lock); + sca_offset = (sca_offset + 16) & 0x7f0; + kvm->arch.sca = (struct sca_block *) ((char *) kvm->arch.sca + sca_offset); + spin_unlock(&kvm_lock); sprintf(debug_name, "kvm-%u", current->pid); -- cgit v0.10.2 From 609433fbed4f25e11e8b058ab7e9478b212879a9 Mon Sep 17 00:00:00 2001 From: Jens Freimann Date: Tue, 18 Mar 2014 16:34:18 +0100 Subject: KVM: s390: fix calculation of idle_mask array size We need BITS_TO_LONGS, not sizeof(long) to calculate the correct size. idle_mask is a bitmask, each bit representing the state of a cpu. The desired outcome is an array of unsigned long fields that can fit KVM_MAX_VCPUS bits. 
We should not use sizeof(long) which returns the size in bytes, but BITS_TO_LONGS Signed-off-by: Jens Freimann Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 734d302..c36cd35 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -213,8 +213,7 @@ struct kvm_s390_float_interrupt { struct list_head list; atomic_t active; int next_rr_cpu; - unsigned long idle_mask[(KVM_MAX_VCPUS + sizeof(long) - 1) - / sizeof(long)]; + unsigned long idle_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)]; unsigned int irq_count; }; -- cgit v0.10.2 From 91880d07fc9b2b6b7a726765039897af99d9ca78 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 20 Mar 2014 13:20:46 +0100 Subject: KVM: s390: Fix possible memory leak in SIGP functions When kvm_get_vcpu() returned NULL for the destination CPU in __sigp_emergency() or __sigp_external_call(), the memory for the "inti" structure was not released anymore. This patch fixes this issue by moving the check for !dst_vcpu before the kzalloc() call.
Signed-off-by: Thomas Huth Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 3fe44c4..26caeb5 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -58,7 +58,9 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr) struct kvm_s390_interrupt_info *inti; struct kvm_vcpu *dst_vcpu = NULL; - if (cpu_addr >= KVM_MAX_VCPUS) + if (cpu_addr < KVM_MAX_VCPUS) + dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); + if (!dst_vcpu) return SIGP_CC_NOT_OPERATIONAL; inti = kzalloc(sizeof(*inti), GFP_KERNEL); @@ -68,9 +70,6 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr) inti->type = KVM_S390_INT_EMERGENCY; inti->emerg.code = vcpu->vcpu_id; - dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); - if (!dst_vcpu) - return SIGP_CC_NOT_OPERATIONAL; li = &dst_vcpu->arch.local_int; spin_lock_bh(&li->lock); list_add_tail(&inti->list, &li->list); @@ -121,7 +120,9 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr) struct kvm_s390_interrupt_info *inti; struct kvm_vcpu *dst_vcpu = NULL; - if (cpu_addr >= KVM_MAX_VCPUS) + if (cpu_addr < KVM_MAX_VCPUS) + dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); + if (!dst_vcpu) return SIGP_CC_NOT_OPERATIONAL; inti = kzalloc(sizeof(*inti), GFP_KERNEL); @@ -131,9 +132,6 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr) inti->type = KVM_S390_INT_EXTERNAL_CALL; inti->extcall.code = vcpu->vcpu_id; - dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); - if (!dst_vcpu) - return SIGP_CC_NOT_OPERATIONAL; li = &dst_vcpu->arch.local_int; spin_lock_bh(&li->lock); list_add_tail(&inti->list, &li->list); -- cgit v0.10.2 From 2ed10cc15e7edf2daf22ce807a877a1266e97711 Mon Sep 17 00:00:00 2001 From: Jens Freimann Date: Tue, 11 Feb 2014 13:48:07 +0100 Subject: KVM: s390: clear local interrupts at cpu initial reset Empty list of local interrupts when vcpu goes through initial reset to provide a clean state Signed-off-by: Jens Freimann 
Signed-off-by: Christian Borntraeger diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 79d2e4f..05bffd7 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -509,6 +509,20 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer) return HRTIMER_NORESTART; } +void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + struct kvm_s390_interrupt_info *n, *inti = NULL; + + spin_lock_bh(&li->lock); + list_for_each_entry_safe(inti, n, &li->list, list) { + list_del(&inti->list); + kfree(inti); + } + atomic_set(&li->active, 0); + spin_unlock_bh(&li->lock); +} + void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index a02979f..83b7944 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -395,6 +395,7 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; kvm_clear_async_pf_completion_queue(vcpu); atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); + kvm_s390_clear_local_irqs(vcpu); } int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index ed4750a..6311170 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -129,6 +129,7 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer); void kvm_s390_tasklet(unsigned long parm); void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu); void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu); +void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu); int __must_check kvm_s390_inject_vm(struct kvm *kvm, struct kvm_s390_interrupt *s390int); int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, -- cgit v0.10.2 From e59d24e61269de34d79d2f39d3d581c219ac7a94 Mon Sep 17 
00:00:00 2001 From: Greg Kurz Date: Thu, 6 Feb 2014 17:36:56 +0100 Subject: KVM: PPC: Book3S HV: Fix incorrect userspace exit on ioeventfd write When the guest does an MMIO write which is handled successfully by an ioeventfd, ioeventfd_write() returns 0 (success) and kvmppc_handle_store() returns EMULATE_DONE. Then kvmppc_emulate_mmio() converts EMULATE_DONE to RESUME_GUEST_NV and this causes an exit from the loop in kvmppc_vcpu_run_hv(), causing an exit back to userspace with a bogus exit reason code, typically causing userspace (e.g. qemu) to crash with a message about an unknown exit code. This adds handling of RESUME_GUEST_NV in kvmppc_vcpu_run_hv() in order to fix that. For generality, we define a helper to check for either of the return-to-guest codes we use, RESUME_GUEST and RESUME_GUEST_NV, to make it easy to check for either and provide one place to update if any other return-to-guest code gets defined in future. Since it only affects Book3S HV for now, the helper is added to the kvm_book3s.h header file. We use the helper in two places in kvmppc_run_core() as well for future-proofing, though we don't see RESUME_GUEST_NV in either place at present. 
[paulus@samba.org - combined 4 patches into one, rewrote description] Suggested-by: Paul Mackerras Signed-off-by: Alexey Kardashevskiy Signed-off-by: Greg Kurz Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 83851aa..bb1e38a2 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -304,6 +304,11 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) return vcpu->arch.fault_dar; } +static inline bool is_kvmppc_resume_guest(int r) +{ + return (r == RESUME_GUEST || r == RESUME_GUEST_NV); +} + /* Magic register values loaded into r3 and r4 before the 'sc' assembly * instruction for the OSI hypercalls */ #define OSI_SC_MAGIC_R3 0x113724FA diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 17fc949..3b498d9 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1530,7 +1530,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) vcpu->arch.trap = 0; if (vcpu->arch.ceded) { - if (ret != RESUME_GUEST) + if (!is_kvmppc_resume_guest(ret)) kvmppc_end_cede(vcpu); else kvmppc_set_timer(vcpu); @@ -1541,7 +1541,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) vc->vcore_state = VCORE_INACTIVE; list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, arch.run_list) { - if (vcpu->arch.ret != RESUME_GUEST) { + if (!is_kvmppc_resume_guest(vcpu->arch.ret)) { kvmppc_remove_runnable(vc, vcpu); wake_up(&vcpu->arch.cpu_run); } @@ -1731,7 +1731,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); } - } while (r == RESUME_GUEST); + } while (is_kvmppc_resume_guest(r)); out: vcpu->arch.state = KVMPPC_VCPU_NOTREADY; -- cgit v0.10.2 From 69e9fbb278af8de3059f1d1017b52a32b5f9f0bd Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Fri, 21 Feb 2014 16:31:10 +0100 Subject: KVM: PPC: 
Book3S: Introduce hypervisor call H_GET_TCE This introduces the H_GET_TCE hypervisor call, which is basically the reverse of H_PUT_TCE, as defined in the Power Architecture Platform Requirements (PAPR). The hcall H_GET_TCE is required by the kdump kernel, which uses it to retrieve TCEs set up by the previous (panicked) kernel. Signed-off-by: Laurent Dufour Signed-off-by: Alexander Graf Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index fcd53f0..4096f16 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -129,6 +129,8 @@ extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce); +extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, + unsigned long ioba); extern struct kvm_rma_info *kvm_alloc_rma(void); extern void kvm_release_rma(struct kvm_rma_info *ri); extern struct page *kvm_alloc_hpt(unsigned long nr_pages); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 2c25f54..89e96b3 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -75,3 +75,31 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, return H_TOO_HARD; } EXPORT_SYMBOL_GPL(kvmppc_h_put_tce); + +long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, + unsigned long ioba) +{ + struct kvm *kvm = vcpu->kvm; + struct kvmppc_spapr_tce_table *stt; + + list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { + if (stt->liobn == liobn) { + unsigned long idx = ioba >> SPAPR_TCE_SHIFT; + struct page *page; + u64 *tbl; + + if (ioba >= stt->window_size) + return H_PARAMETER; + + page = stt->pages[idx / TCES_PER_PAGE]; + tbl = (u64 *)page_address(page); + + vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE]; + return H_SUCCESS; + } + } + 
+ /* Didn't find the liobn, punt it to userspace */ + return H_TOO_HARD; +} +EXPORT_SYMBOL_GPL(kvmppc_h_get_tce); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 818dce3..7c5788c 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1691,7 +1691,7 @@ hcall_real_table: .long 0 /* 0x10 - H_CLEAR_MOD */ .long 0 /* 0x14 - H_CLEAR_REF */ .long .kvmppc_h_protect - hcall_real_table - .long 0 /* 0x1c - H_GET_TCE */ + .long .kvmppc_h_get_tce - hcall_real_table .long .kvmppc_h_put_tce - hcall_real_table .long 0 /* 0x24 - H_SET_SPRG0 */ .long .kvmppc_h_set_dabr - hcall_real_table -- cgit v0.10.2 From 7505258c5fcb0a1cc3c76a47b4cf9506d21d10e6 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 25 Mar 2014 10:47:01 +1100 Subject: KVM: PPC: Book3S HV: Fix KVM hang with CONFIG_KVM_XICS=n I noticed KVM is broken when KVM in-kernel XICS emulation (CONFIG_KVM_XICS) is disabled. The problem was introduced in 48eaef05 (KVM: PPC: Book3S HV: use xics_wake_cpu only when defined). It used CONFIG_KVM_XICS to wrap xics_wake_cpu, where CONFIG_PPC_ICP_NATIVE should have been used. 
Signed-off-by: Anton Blanchard Cc: stable@vger.kernel.org Signed-off-by: Paul Mackerras Acked-by: Scott Wood diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 3b498d9..e0a535c 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -86,7 +86,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) /* CPU points to the first thread of the core */ if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) { -#ifdef CONFIG_KVM_XICS +#ifdef CONFIG_PPC_ICP_NATIVE int real_cpu = cpu + vcpu->arch.ptid; if (paca[real_cpu].kvm_hstate.xics_phys) xics_wake_cpu(real_cpu); @@ -1360,9 +1360,7 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu) smp_wmb(); #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) if (cpu != smp_processor_id()) { -#ifdef CONFIG_KVM_XICS xics_wake_cpu(cpu); -#endif if (vcpu->arch.ptid) ++vc->n_woken; } -- cgit v0.10.2 From 920c837785699bcc48f4a729ba9ee3492f620b95 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 26 Mar 2014 15:54:00 +0100 Subject: KVM: vmx: fix MPX detection kvm_x86_ops is still NULL at this point. Since kvm_init_msr_list cannot fail, it is safe to initialize it before the call. 
Fixes: 93c4adc7afedf9b0ec190066d45b6d67db5270da Reported-by: Fengguang Wu Tested-by: Jet Chen Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index aa98695..d1c55f8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5594,9 +5594,10 @@ int kvm_arch_init(void *opaque) goto out_free_percpu; kvm_set_mmio_spte_mask(); - kvm_init_msr_list(); kvm_x86_ops = ops; + kvm_init_msr_list(); + kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); -- cgit v0.10.2 From 6acdb1603a7db4c1b5d91863a13d2c3f1b9188b0 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 28 Jan 2014 08:28:42 -0800 Subject: KVM: Specify byte order for KVM_EXIT_MMIO The KVM API documentation is not clear about the semantics of the data field on the mmio struct on the kvm_run struct. This has become problematic when supporting ARM guests on big-endian host systems with guests of both endianness types, because it is unclear how the data should be exported to user space. This should not break with existing implementations as all supported existing implementations of known user space applications (QEMU and kvmtools for virtio) only support default endianness of the architectures on the host side. Cc: Marc Zyngier Cc: Peter Maydell Cc: Alexander Graf Signed-off-by: Christoffer Dall Signed-off-by: Paolo Bonzini diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 2cb1640..c24211d 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2595,6 +2595,10 @@ executed a memory-mapped I/O instruction which could not be satisfied by kvm. The 'data' member contains the written data if 'is_write' is true, and should be filled by application code otherwise. +The 'data' member contains, in its first 'len' bytes, the value as it would +appear if the VCPU performed a load or store of the appropriate width directly +to the byte array. 
+ NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR, KVM_EXIT_PAPR and KVM_EXIT_EPR the corresponding operations are complete (and guest state is consistent) only after userspace -- cgit v0.10.2 From e4e38121507a27d2ccc4b28d9e7fc4818a12c44c Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Tue, 25 Mar 2014 10:47:02 +1100 Subject: KVM: PPC: Book3S HV: Add transactional memory support This adds saving of the transactional memory (TM) checkpointed state on guest entry and exit. We only do this if we see that the guest has an active transaction. It also adds emulation of the TM state changes when delivering IRQs into the guest. According to the architecture, if we are transactional when an IRQ occurs, the TM state is changed to suspended, otherwise it's left unchanged. Signed-off-by: Michael Neuling Signed-off-by: Paul Mackerras Acked-by: Scott Wood diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 6ba8d4a..af21e87 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -213,6 +213,7 @@ #define SPRN_ACOP 0x1F /* Available Coprocessor Register */ #define SPRN_TFIAR 0x81 /* Transaction Failure Inst Addr */ #define SPRN_TEXASR 0x82 /* Transaction EXception & Summary */ +#define TEXASR_FS __MASK(63-36) /* Transaction Failure Summary */ #define SPRN_TEXASRU 0x83 /* '' '' '' Upper 32 */ #define SPRN_TFHAR 0x80 /* Transaction Failure Handler Addr */ #define SPRN_CTRLF 0x088 diff --git a/arch/powerpc/include/asm/tm.h b/arch/powerpc/include/asm/tm.h index 9dfbc34..386a3ef 100644 --- a/arch/powerpc/include/asm/tm.h +++ b/arch/powerpc/include/asm/tm.h @@ -7,6 +7,8 @@ #include +#ifndef __ASSEMBLY__ + #ifdef CONFIG_PPC_TRANSACTIONAL_MEM extern void do_load_up_transact_fpu(struct thread_struct *thread); extern void do_load_up_transact_altivec(struct thread_struct *thread); @@ -20,3 +22,5 @@ extern void tm_recheckpoint(struct thread_struct *thread, extern void tm_abort(uint8_t cause); extern void 
tm_save_sprs(struct thread_struct *thread); extern void tm_restore_sprs(struct thread_struct *thread); + +#endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 303ece7..fb25ebc 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -262,7 +262,14 @@ int kvmppc_mmu_hv_init(void) static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) { - kvmppc_set_msr(vcpu, vcpu->arch.intr_msr); + unsigned long msr = vcpu->arch.intr_msr; + + /* If transactional, change to suspend mode on IRQ delivery */ + if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr)) + msr |= MSR_TS_S; + else + msr |= vcpu->arch.shregs.msr & MSR_TS_MASK; + kvmppc_set_msr(vcpu, msr); } /* diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 7c5788c..61190dd 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -28,6 +28,9 @@ #include #include #include +#include + +#define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM) #ifdef __LITTLE_ENDIAN__ #error Need to fix lppaca and SLB shadow accesses in little endian mode @@ -597,6 +600,116 @@ BEGIN_FTR_SECTION END_FTR_SECTION_NESTED(CPU_FTR_ARCH_206, CPU_FTR_ARCH_206, 89) END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +BEGIN_FTR_SECTION + b skip_tm +END_FTR_SECTION_IFCLR(CPU_FTR_TM) + + /* Turn on TM/FP/VSX/VMX so we can restore them. */ + mfmsr r5 + li r6, MSR_TM >> 32 + sldi r6, r6, 32 + or r5, r5, r6 + ori r5, r5, MSR_FP + oris r5, r5, (MSR_VEC | MSR_VSX)@h + mtmsrd r5 + + /* + * The user may change these outside of a transaction, so they must + * always be context switched. + */ + ld r5, VCPU_TFHAR(r4) + ld r6, VCPU_TFIAR(r4) + ld r7, VCPU_TEXASR(r4) + mtspr SPRN_TFHAR, r5 + mtspr SPRN_TFIAR, r6 + mtspr SPRN_TEXASR, r7 + + ld r5, VCPU_MSR(r4) + rldicl. 
r5, r5, 64 - MSR_TS_S_LG, 62 + beq skip_tm /* TM not active in guest */ + + /* Make sure the failure summary is set, otherwise we'll program check + * when we trechkpt. It's possible that this might have been not set + * on a kvmppc_set_one_reg() call but we shouldn't let this crash the + * host. + */ + oris r7, r7, (TEXASR_FS)@h + mtspr SPRN_TEXASR, r7 + + /* + * We need to load up the checkpointed state for the guest. + * We need to do this early as it will blow away any GPRs, VSRs and + * some SPRs. + */ + + mr r31, r4 + addi r3, r31, VCPU_FPRS_TM + bl .load_fp_state + addi r3, r31, VCPU_VRS_TM + bl .load_vr_state + mr r4, r31 + lwz r7, VCPU_VRSAVE_TM(r4) + mtspr SPRN_VRSAVE, r7 + + ld r5, VCPU_LR_TM(r4) + lwz r6, VCPU_CR_TM(r4) + ld r7, VCPU_CTR_TM(r4) + ld r8, VCPU_AMR_TM(r4) + ld r9, VCPU_TAR_TM(r4) + mtlr r5 + mtcr r6 + mtctr r7 + mtspr SPRN_AMR, r8 + mtspr SPRN_TAR, r9 + + /* + * Load up PPR and DSCR values but don't put them in the actual SPRs + * till the last moment to avoid running with userspace PPR and DSCR for + * too long. + */ + ld r29, VCPU_DSCR_TM(r4) + ld r30, VCPU_PPR_TM(r4) + + std r2, PACATMSCRATCH(r13) /* Save TOC */ + + /* Clear the MSR RI since r1, r13 are all going to be foobar. */ + li r5, 0 + mtmsrd r5, 1 + + /* Load GPRs r0-r28 */ + reg = 0 + .rept 29 + ld reg, VCPU_GPRS_TM(reg)(r31) + reg = reg + 1 + .endr + + mtspr SPRN_DSCR, r29 + mtspr SPRN_PPR, r30 + + /* Load final GPRs */ + ld 29, VCPU_GPRS_TM(29)(r31) + ld 30, VCPU_GPRS_TM(30)(r31) + ld 31, VCPU_GPRS_TM(31)(r31) + + /* TM checkpointed state is now setup. All GPRs are now volatile. */ + TRECHKPT + + /* Now let's get back the state we need. */ + HMT_MEDIUM + GET_PACA(r13) + ld r29, HSTATE_DSCR(r13) + mtspr SPRN_DSCR, r29 + ld r4, HSTATE_KVM_VCPU(r13) + ld r1, HSTATE_HOST_R1(r13) + ld r2, PACATMSCRATCH(r13) + + /* Set the MSR RI since we have our registers back. 
*/ + li r5, MSR_RI + mtmsrd r5, 1 +skip_tm: +#endif + /* Load guest PMU registers */ /* R4 is live here (vcpu pointer) */ li r3, 1 @@ -704,14 +817,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) ld r6, VCPU_VTB(r4) mtspr SPRN_IC, r5 mtspr SPRN_VTB, r6 -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - ld r5, VCPU_TFHAR(r4) - ld r6, VCPU_TFIAR(r4) - ld r7, VCPU_TEXASR(r4) - mtspr SPRN_TFHAR, r5 - mtspr SPRN_TFIAR, r6 - mtspr SPRN_TEXASR, r7 -#endif ld r8, VCPU_EBBHR(r4) mtspr SPRN_EBBHR, r8 ld r5, VCPU_EBBRR(r4) @@ -817,7 +922,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) 12: mtspr SPRN_SRR0, r10 mr r10,r0 mtspr SPRN_SRR1, r11 - ld r11, VCPU_INTR_MSR(r4) + mr r9, r4 + bl kvmppc_msr_interrupt 5: /* @@ -1103,12 +1209,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201) BEGIN_FTR_SECTION b 8f END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) - /* Turn on TM so we can access TFHAR/TFIAR/TEXASR */ - mfmsr r8 - li r0, 1 - rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG - mtmsrd r8 - /* Save POWER8-specific registers */ mfspr r5, SPRN_IAMR mfspr r6, SPRN_PSPB @@ -1122,14 +1222,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) std r5, VCPU_IC(r9) std r6, VCPU_VTB(r9) std r7, VCPU_TAR(r9) -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - mfspr r5, SPRN_TFHAR - mfspr r6, SPRN_TFIAR - mfspr r7, SPRN_TEXASR - std r5, VCPU_TFHAR(r9) - std r6, VCPU_TFIAR(r9) - std r7, VCPU_TEXASR(r9) -#endif mfspr r8, SPRN_EBBHR std r8, VCPU_EBBHR(r9) mfspr r5, SPRN_EBBRR @@ -1557,7 +1649,7 @@ kvmppc_hdsi: mtspr SPRN_SRR0, r10 mtspr SPRN_SRR1, r11 li r10, BOOK3S_INTERRUPT_DATA_STORAGE - ld r11, VCPU_INTR_MSR(r9) + bl kvmppc_msr_interrupt fast_interrupt_c_return: 6: ld r7, VCPU_CTR(r9) lwz r8, VCPU_XER(r9) @@ -1626,7 +1718,7 @@ kvmppc_hisi: 1: mtspr SPRN_SRR0, r10 mtspr SPRN_SRR1, r11 li r10, BOOK3S_INTERRUPT_INST_STORAGE - ld r11, VCPU_INTR_MSR(r9) + bl kvmppc_msr_interrupt b fast_interrupt_c_return 3: ld r6, VCPU_KVM(r9) /* not relocated, use VRMA */ @@ -1669,7 +1761,7 @@ sc_1_fast_return: mtspr SPRN_SRR0,r10 mtspr SPRN_SRR1,r11 li r10, 
BOOK3S_INTERRUPT_SYSCALL - ld r11, VCPU_INTR_MSR(r9) + bl kvmppc_msr_interrupt mr r4,r9 b fast_guest_return @@ -1997,7 +2089,7 @@ machine_check_realmode: beq mc_cont /* If not, deliver a machine check. SRR0/1 are already set */ li r10, BOOK3S_INTERRUPT_MACHINE_CHECK - ld r11, VCPU_INTR_MSR(r9) + bl kvmppc_msr_interrupt b fast_interrupt_c_return /* @@ -2138,8 +2230,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) mfspr r6,SPRN_VRSAVE stw r6,VCPU_VRSAVE(r31) mtlr r30 - mtmsrd r5 - isync blr /* @@ -2186,3 +2276,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) */ kvmppc_bad_host_intr: b . + +/* + * This mimics the MSR transition on IRQ delivery. The new guest MSR is taken + * from VCPU_INTR_MSR and is modified based on the required TM state changes. + * r11 has the guest MSR value (in/out) + * r9 has a vcpu pointer (in) + * r0 is used as a scratch register + */ +kvmppc_msr_interrupt: + rldicl r0, r11, 64 - MSR_TS_S_LG, 62 + cmpwi r0, 2 /* Check if we are in transactional state.. */ + ld r11, VCPU_INTR_MSR(r9) + bne 1f + /* ... if transactional, change to suspended */ + li r0, 1 +1: rldimi r11, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG + blr -- cgit v0.10.2 From a7d80d01c68ed7d3fbc7bcf4541e6fb7e6b87cd6 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Tue, 25 Mar 2014 10:47:03 +1100 Subject: KVM: PPC: Book3S HV: Add get/set_one_reg for new TM state This adds code to get/set_one_reg to read and write the new transactional memory (TM) state. 
Signed-off-by: Michael Neuling Signed-off-by: Paul Mackerras Acked-by: Scott Wood diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e0a535c..a6d8f01 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -879,17 +879,6 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_IAMR: *val = get_reg_val(id, vcpu->arch.iamr); break; -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - case KVM_REG_PPC_TFHAR: - *val = get_reg_val(id, vcpu->arch.tfhar); - break; - case KVM_REG_PPC_TFIAR: - *val = get_reg_val(id, vcpu->arch.tfiar); - break; - case KVM_REG_PPC_TEXASR: - *val = get_reg_val(id, vcpu->arch.texasr); - break; -#endif case KVM_REG_PPC_FSCR: *val = get_reg_val(id, vcpu->arch.fscr); break; @@ -970,6 +959,69 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_PPR: *val = get_reg_val(id, vcpu->arch.ppr); break; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case KVM_REG_PPC_TFHAR: + *val = get_reg_val(id, vcpu->arch.tfhar); + break; + case KVM_REG_PPC_TFIAR: + *val = get_reg_val(id, vcpu->arch.tfiar); + break; + case KVM_REG_PPC_TEXASR: + *val = get_reg_val(id, vcpu->arch.texasr); + break; + case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31: + i = id - KVM_REG_PPC_TM_GPR0; + *val = get_reg_val(id, vcpu->arch.gpr_tm[i]); + break; + case KVM_REG_PPC_TM_VSR0 ... 
KVM_REG_PPC_TM_VSR63: + { + int j; + i = id - KVM_REG_PPC_TM_VSR0; + if (i < 32) + for (j = 0; j < TS_FPRWIDTH; j++) + val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j]; + else { + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + val->vval = vcpu->arch.vr_tm.vr[i-32]; + else + r = -ENXIO; + } + break; + } + case KVM_REG_PPC_TM_CR: + *val = get_reg_val(id, vcpu->arch.cr_tm); + break; + case KVM_REG_PPC_TM_LR: + *val = get_reg_val(id, vcpu->arch.lr_tm); + break; + case KVM_REG_PPC_TM_CTR: + *val = get_reg_val(id, vcpu->arch.ctr_tm); + break; + case KVM_REG_PPC_TM_FPSCR: + *val = get_reg_val(id, vcpu->arch.fp_tm.fpscr); + break; + case KVM_REG_PPC_TM_AMR: + *val = get_reg_val(id, vcpu->arch.amr_tm); + break; + case KVM_REG_PPC_TM_PPR: + *val = get_reg_val(id, vcpu->arch.ppr_tm); + break; + case KVM_REG_PPC_TM_VRSAVE: + *val = get_reg_val(id, vcpu->arch.vrsave_tm); + break; + case KVM_REG_PPC_TM_VSCR: + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + *val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]); + else + r = -ENXIO; + break; + case KVM_REG_PPC_TM_DSCR: + *val = get_reg_val(id, vcpu->arch.dscr_tm); + break; + case KVM_REG_PPC_TM_TAR: + *val = get_reg_val(id, vcpu->arch.tar_tm); + break; +#endif case KVM_REG_PPC_ARCH_COMPAT: *val = get_reg_val(id, vcpu->arch.vcore->arch_compat); break; @@ -1039,17 +1091,6 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_IAMR: vcpu->arch.iamr = set_reg_val(id, *val); break; -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - case KVM_REG_PPC_TFHAR: - vcpu->arch.tfhar = set_reg_val(id, *val); - break; - case KVM_REG_PPC_TFIAR: - vcpu->arch.tfiar = set_reg_val(id, *val); - break; - case KVM_REG_PPC_TEXASR: - vcpu->arch.texasr = set_reg_val(id, *val); - break; -#endif case KVM_REG_PPC_FSCR: vcpu->arch.fscr = set_reg_val(id, *val); break; @@ -1144,6 +1185,68 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_PPR: vcpu->arch.ppr = set_reg_val(id, *val); break; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case 
KVM_REG_PPC_TFHAR: + vcpu->arch.tfhar = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TFIAR: + vcpu->arch.tfiar = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TEXASR: + vcpu->arch.texasr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31: + i = id - KVM_REG_PPC_TM_GPR0; + vcpu->arch.gpr_tm[i] = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63: + { + int j; + i = id - KVM_REG_PPC_TM_VSR0; + if (i < 32) + for (j = 0; j < TS_FPRWIDTH; j++) + vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j]; + else + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + vcpu->arch.vr_tm.vr[i-32] = val->vval; + else + r = -ENXIO; + break; + } + case KVM_REG_PPC_TM_CR: + vcpu->arch.cr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_LR: + vcpu->arch.lr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_CTR: + vcpu->arch.ctr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_FPSCR: + vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_AMR: + vcpu->arch.amr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_PPR: + vcpu->arch.ppr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_VRSAVE: + vcpu->arch.vrsave_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_VSCR: + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + vcpu->arch.vr_tm.vscr.u[3] = set_reg_val(id, *val); + else + r = -ENXIO; + break; + case KVM_REG_PPC_TM_DSCR: + vcpu->arch.dscr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_TAR: + vcpu->arch.tar_tm = set_reg_val(id, *val); + break; +#endif case KVM_REG_PPC_ARCH_COMPAT: r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val)); break; -- cgit v0.10.2 From b24f36f33ea088771c2bb7c09e84d0ddea35cf55 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 25 Mar 2014 10:47:04 +1100 Subject: KVM: PPC: Book3S: Trim top 4 bits of physical address in RTAS code The in-kernel emulation of RTAS functions needs to read the argument buffer from guest memory
in order to find out what function is being requested. The guest supplies the guest physical address of the buffer, and on a real system the code that reads that buffer would run in guest real mode. In guest real mode, the processor ignores the top 4 bits of the address specified in load and store instructions. In order to emulate that behaviour correctly, we need to mask off those bits before calling kvm_read_guest() or kvm_write_guest(). This adds that masking. Signed-off-by: Paul Mackerras Acked-by: Scott Wood diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c index cf95cde..7a05315 100644 --- a/arch/powerpc/kvm/book3s_rtas.c +++ b/arch/powerpc/kvm/book3s_rtas.c @@ -213,8 +213,11 @@ int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu) gpa_t args_phys; int rc; - /* r4 contains the guest physical address of the RTAS args */ - args_phys = kvmppc_get_gpr(vcpu, 4); + /* + * r4 contains the guest physical address of the RTAS args + * Mask off the top 4 bits since this is a guest real address + */ + args_phys = kvmppc_get_gpr(vcpu, 4) & KVM_PAM; rc = kvm_read_guest(vcpu->kvm, args_phys, &args, sizeof(args)); if (rc) -- cgit v0.10.2 From 739e2425fea6349ac674e93648953b3a08985f2f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 25 Mar 2014 10:47:05 +1100 Subject: KVM: PPC: Book3S HV: Return ENODEV error rather than EIO If an attempt is made to load the kvm-hv module on a machine which doesn't have hypervisor mode available, return an ENODEV error, which is the conventional thing to return to indicate that this module is not applicable to the hardware of the current machine, rather than EIO, which causes a warning to be printed. 
Signed-off-by: Paul Mackerras Acked-by: Scott Wood diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index a6d8f01..8227dba 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2467,7 +2467,7 @@ static int kvmppc_book3s_init_hv(void) */ r = kvmppc_core_check_processor_compat_hv(); if (r < 0) - return r; + return -ENODEV; kvm_ops_hv.owner = THIS_MODULE; kvmppc_hv_ops = &kvm_ops_hv; -- cgit v0.10.2 From 797f9c07eb4cbc2d0ff27fac165a0b885da38840 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 25 Mar 2014 10:47:06 +1100 Subject: KVM: PPC: Book3S HV: Don't use kvm_memslots() in real mode With HV KVM, some high-frequency hypercalls such as H_ENTER are handled in real mode, and need to access the memslots array for the guest. Accessing the memslots array is safe, because we hold the SRCU read lock for the whole time that a guest vcpu is running. However, the checks that kvm_memslots() does when lockdep is enabled are potentially unsafe in real mode, when only the linear mapping is available. Furthermore, kvm_memslots() can be called from a secondary CPU thread, which is an offline CPU from the point of view of the host kernel, and is not running the task which holds the SRCU read lock. To avoid false positives in the checks in kvm_memslots(), and to avoid possible side effects from doing the checks in real mode, this replaces kvm_memslots() with kvm_memslots_raw() in all the places that execute in real mode. kvm_memslots_raw() is a new function that is like kvm_memslots() but uses rcu_dereference_raw_notrace() instead of rcu_dereference_check().
Signed-off-by: Paul Mackerras Acked-by: Scott Wood diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index bf0fa8b0a..51388be 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -289,6 +289,18 @@ static inline void note_hpte_modification(struct kvm *kvm, if (atomic_read(&kvm->arch.hpte_mod_interest)) rev->guest_rpte |= HPTE_GR_MODIFIED; } + +/* + * Like kvm_memslots(), but for use in real mode when we can't do + * any RCU stuff (since the secondary threads are offline from the + * kernel's point of view), and we can't print anything. + * Thus we use rcu_dereference_raw_notrace(), not rcu_dereference_check(). + */ +static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm) +{ + return rcu_dereference_raw_notrace(kvm->memslots); +} + #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 37fb3ca..1d6c56a 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -111,7 +111,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, rcbits = hpte_r & (HPTE_R_R | HPTE_R_C); ptel = rev->guest_rpte |= rcbits; gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel)); - memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); if (!memslot) return; @@ -192,7 +192,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* Find the memslot (if any) for this address */ gpa = (ptel & HPTE_R_RPN) & ~(psize - 1); gfn = gpa >> PAGE_SHIFT; - memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); pa = 0; is_io = ~0ul; rmap = NULL; @@ -670,7 +670,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, psize = hpte_page_size(v, r); gfn = ((r & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT; -
memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); if (memslot) { hva = __gfn_to_hva_memslot(memslot, gfn); pte = lookup_linux_pte_and_update(pgdir, hva, -- cgit v0.10.2 From c5fb80d3b24f6280bd6f608d8f2a02139a0fabaf Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 25 Mar 2014 10:47:07 +1100 Subject: KVM: PPC: Book3S HV: Fix decrementer timeouts with non-zero TB offset Commit c7699822bc21 ("KVM: PPC: Book3S HV: Make physical thread 0 do the MMU switching") reordered the guest entry/exit code so that most of the guest register save/restore code happened in guest MMU context. A side effect of that is that the timebase still contains the guest timebase value at the point where we compute and use vcpu->arch.dec_expires, and therefore that is now a guest timebase value rather than a host timebase value. That in turn means that the timeouts computed in kvmppc_set_timer() are wrong if the timebase offset for the guest is non-zero. The consequence of that is things such as "sleep 1" in a guest after migration may sleep for much longer than they should. This fixes the problem by converting between guest and host timebase values as necessary, by adding or subtracting the timebase offset. This also fixes an incorrect comment. Signed-off-by: Paul Mackerras Acked-by: Scott Wood diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 61190dd..42bd2e6 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -841,6 +841,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) * Set the decrementer to the guest decrementer. 
*/ ld r8,VCPU_DEC_EXPIRES(r4) + /* r8 is a host timebase value here, convert to guest TB */ + ld r5,HSTATE_KVM_VCORE(r13) + ld r6,VCORE_TB_OFFSET(r5) + add r8,r8,r6 mftb r7 subf r3,r7,r8 mtspr SPRN_DEC,r3 @@ -1204,6 +1208,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201) mftb r6 extsw r5,r5 add r5,r5,r6 + /* r5 is a guest timebase value here, convert to host TB */ + ld r3,HSTATE_KVM_VCORE(r13) + ld r4,VCORE_TB_OFFSET(r3) + subf r5,r4,r5 std r5,VCPU_DEC_EXPIRES(r9) BEGIN_FTR_SECTION @@ -1479,7 +1487,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) ld r8,VCORE_TB_OFFSET(r5) cmpdi r8,0 beq 17f - mftb r6 /* current host timebase */ + mftb r6 /* current guest timebase */ subf r8,r8,r6 mtspr SPRN_TBU40,r8 /* update upper 40 bits */ mftb r7 /* check if lower 24 bits overflowed */ -- cgit v0.10.2 From 72cde5a88d37ba88ad1d47aecf957a9e528636d7 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 25 Mar 2014 10:47:08 +1100 Subject: KVM: PPC: Book3S HV: Save/restore host PMU registers that are new in POWER8 Currently we save the host PMU configuration, counter values, etc., when entering a guest, and restore it on return from the guest. (We have to do this because the guest has control of the PMU while it is executing.) However, we missed saving/restoring the SIAR and SDAR registers, as well as the registers which are new on POWER8, namely SIER and MMCR2. This adds code to save the values of these registers when entering the guest and restore them on exit. This also works around the bug in POWER8 where setting PMAE with a counter already negative doesn't generate an interrupt. 
Signed-off-by: Paul Mackerras Acked-by: Scott Wood diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index f3a91dc..821725c 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -94,7 +94,7 @@ struct kvmppc_host_state { unsigned long xics_phys; u32 saved_xirr; u64 dabr; - u64 host_mmcr[3]; + u64 host_mmcr[7]; /* MMCR 0,1,A, SIAR, SDAR, MMCR2, SIER */ u32 host_pmc[8]; u64 host_purr; u64 host_spurr; diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S index e873796..e18e3cf 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupts.S +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -71,6 +71,14 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) mtmsrd r10,1 /* Save host PMU registers */ +BEGIN_FTR_SECTION + /* Work around P8 PMAE bug */ + li r3, -1 + clrrdi r3, r3, 10 + mfspr r8, SPRN_MMCR2 + mtspr SPRN_MMCR2, r3 /* freeze all counters using MMCR2 */ + isync +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) li r3, 1 sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ mfspr r7, SPRN_MMCR0 /* save MMCR0 */ @@ -87,9 +95,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) cmpwi r5, 0 beq 31f /* skip if not */ mfspr r5, SPRN_MMCR1 + mfspr r9, SPRN_SIAR + mfspr r10, SPRN_SDAR std r7, HSTATE_MMCR(r13) std r5, HSTATE_MMCR + 8(r13) std r6, HSTATE_MMCR + 16(r13) + std r9, HSTATE_MMCR + 24(r13) + std r10, HSTATE_MMCR + 32(r13) +BEGIN_FTR_SECTION + mfspr r9, SPRN_SIER + std r8, HSTATE_MMCR + 40(r13) + std r9, HSTATE_MMCR + 48(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mfspr r3, SPRN_PMC1 mfspr r5, SPRN_PMC2 mfspr r6, SPRN_PMC3 @@ -110,6 +127,11 @@ BEGIN_FTR_SECTION stw r10, HSTATE_PMC + 24(r13) stw r11, HSTATE_PMC + 28(r13) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) +BEGIN_FTR_SECTION + mfspr r9, SPRN_SIER + std r8, HSTATE_MMCR + 40(r13) + std r9, HSTATE_MMCR + 48(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 31: /* diff --git 
a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 42bd2e6..4963335 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -109,8 +109,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) ld r3, HSTATE_MMCR(r13) ld r4, HSTATE_MMCR + 8(r13) ld r5, HSTATE_MMCR + 16(r13) + ld r6, HSTATE_MMCR + 24(r13) + ld r7, HSTATE_MMCR + 32(r13) mtspr SPRN_MMCR1, r4 mtspr SPRN_MMCRA, r5 + mtspr SPRN_SIAR, r6 + mtspr SPRN_SDAR, r7 +BEGIN_FTR_SECTION + ld r8, HSTATE_MMCR + 40(r13) + ld r9, HSTATE_MMCR + 48(r13) + mtspr SPRN_MMCR2, r8 + mtspr SPRN_SIER, r9 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtspr SPRN_MMCR0, r3 isync 23: -- cgit v0.10.2