From ca7d58f375c650cf36900cb1da1ca2cc99b13393 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 13 Jul 2011 14:31:08 +0800 Subject: KVM: x86: fix broken read emulation spans a page boundary If the range spans a page boundary, the mmio access can be broke, fix it as write emulation. And we already get the guest physical address, so use it to read guest data directly to avoid walking guest page table again Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 84a28ea..1453248 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4045,13 +4045,12 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, return 0; } -static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, - void *val, - unsigned int bytes, - struct x86_exception *exception) +static int emulator_read_emulated_onepage(unsigned long addr, + void *val, + unsigned int bytes, + struct x86_exception *exception, + struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); gpa_t gpa; int handled, ret; @@ -4071,8 +4070,7 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, if (ret) goto mmio; - if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception) - == X86EMUL_CONTINUE) + if (!kvm_read_guest(vcpu->kvm, gpa, val, bytes)) return X86EMUL_CONTINUE; mmio: @@ -4101,6 +4099,31 @@ mmio: return X86EMUL_IO_NEEDED; } +static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, + void *val, + unsigned int bytes, + struct x86_exception *exception) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + + /* Crossing a page boundary? */ + if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { + int rc, now; + + now = -addr & ~PAGE_MASK; + rc = emulator_read_emulated_onepage(addr, val, now, exception, + vcpu); + if (rc != X86EMUL_CONTINUE) + return rc; + addr += now; + val += now; + bytes -= now; + } + return emulator_read_emulated_onepage(addr, val, bytes, exception, + vcpu); +} + int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes) { -- cgit v0.10.2 From 77d197b2ca37b33b0461ab1e2dbe40cbe4a6fd6a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 13 Jul 2011 14:31:50 +0800 Subject: KVM: x86: abstract the operation for read/write emulation The operations of read emulation and write emulation are very similar, so we can abstract the operation of them, in larter patch, it is used to cleanup the same code Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1453248..f5c60a8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4136,6 +4136,78 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, return 1; } +struct read_write_emulator_ops { + int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val, + int bytes); + int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes); + int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa, + int bytes, void *val); + int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes); + bool write; +}; + +static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) +{ + if (vcpu->mmio_read_completed) { + memcpy(val, vcpu->mmio_data, bytes); + trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, + vcpu->mmio_phys_addr, *(u64 *)val); + vcpu->mmio_read_completed = 0; + return 1; + } + + return 0; +} + +static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes) +{ + return 
!kvm_read_guest(vcpu->kvm, gpa, val, bytes); +} + +static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes) +{ + return emulator_write_phys(vcpu, gpa, val, bytes); +} + +static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val) +{ + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + return vcpu_mmio_write(vcpu, gpa, bytes, val); +} + +static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes) +{ + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); + return X86EMUL_IO_NEEDED; +} + +static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes) +{ + memcpy(vcpu->mmio_data, val, bytes); + memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); + return X86EMUL_CONTINUE; +} + +static struct read_write_emulator_ops read_emultor = { + .read_write_prepare = read_prepare, + .read_write_emulate = read_emulate, + .read_write_mmio = vcpu_mmio_read, + .read_write_exit_mmio = read_exit_mmio, +}; + +static struct read_write_emulator_ops write_emultor = { + .read_write_emulate = write_emulate, + .read_write_mmio = write_mmio, + .read_write_exit_mmio = write_exit_mmio, + .write = true, +}; + static int emulator_write_emulated_onepage(unsigned long addr, const void *val, unsigned int bytes, -- cgit v0.10.2 From 22388a3c8ce2a2a004ce764194cce8a2f9b13d66 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 13 Jul 2011 14:32:31 +0800 Subject: KVM: x86: cleanup the code of read/write emulation Using the read/write operation to remove the same code Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f5c60a8..2b76ae3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4045,85 +4045,6 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, return 0; } -static int emulator_read_emulated_onepage(unsigned long addr, - void *val, - unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu) -{ - gpa_t gpa; - int handled, ret; - - if (vcpu->mmio_read_completed) { - memcpy(val, vcpu->mmio_data, bytes); - trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, - vcpu->mmio_phys_addr, *(u64 *)val); - vcpu->mmio_read_completed = 0; - return X86EMUL_CONTINUE; - } - - ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false); - - if (ret < 0) - return X86EMUL_PROPAGATE_FAULT; - - if (ret) - goto mmio; - - if (!kvm_read_guest(vcpu->kvm, gpa, val, bytes)) - return X86EMUL_CONTINUE; - -mmio: - /* - * Is this MMIO handled locally? - */ - handled = vcpu_mmio_read(vcpu, gpa, bytes, val); - - if (handled == bytes) - return X86EMUL_CONTINUE; - - gpa += handled; - bytes -= handled; - val += handled; - - trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); - - vcpu->mmio_needed = 1; - vcpu->run->exit_reason = KVM_EXIT_MMIO; - vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->run->mmio.len = min(vcpu->mmio_size, 8); - vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; - vcpu->mmio_index = 0; - - return X86EMUL_IO_NEEDED; -} - -static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, - void *val, - unsigned int bytes, - struct x86_exception *exception) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - - /* Crossing a page boundary? 
*/ - if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { - int rc, now; - - now = -addr & ~PAGE_MASK; - rc = emulator_read_emulated_onepage(addr, val, now, exception, - vcpu); - if (rc != X86EMUL_CONTINUE) - return rc; - addr += now; - val += now; - bytes -= now; - } - return emulator_read_emulated_onepage(addr, val, bytes, exception, - vcpu); -} - int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes) { @@ -4208,16 +4129,21 @@ static struct read_write_emulator_ops write_emultor = { .write = true, }; -static int emulator_write_emulated_onepage(unsigned long addr, - const void *val, - unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu) +static int emulator_read_write_onepage(unsigned long addr, void *val, + unsigned int bytes, + struct x86_exception *exception, + struct kvm_vcpu *vcpu, + struct read_write_emulator_ops *ops) { gpa_t gpa; int handled, ret; + bool write = ops->write; + + if (ops->read_write_prepare && + ops->read_write_prepare(vcpu, val, bytes)) + return X86EMUL_CONTINUE; - ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true); + ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); if (ret < 0) return X86EMUL_PROPAGATE_FAULT; @@ -4226,15 +4152,14 @@ static int emulator_write_emulated_onepage(unsigned long addr, if (ret) goto mmio; - if (emulator_write_phys(vcpu, gpa, val, bytes)) + if (ops->read_write_emulate(vcpu, gpa, val, bytes)) return X86EMUL_CONTINUE; mmio: - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); /* * Is this MMIO handled locally? */ - handled = vcpu_mmio_write(vcpu, gpa, bytes, val); + handled = ops->read_write_mmio(vcpu, gpa, bytes, val); if (handled == bytes) return X86EMUL_CONTINUE; @@ -4243,23 +4168,20 @@ mmio: val += handled; vcpu->mmio_needed = 1; - memcpy(vcpu->mmio_data, val, bytes); vcpu->run->exit_reason = KVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; vcpu->mmio_size = bytes; vcpu->run->mmio.len = min(vcpu->mmio_size, 8); - vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; - memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); + vcpu->run->mmio.is_write = vcpu->mmio_is_write = write; vcpu->mmio_index = 0; - return X86EMUL_CONTINUE; + return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); } -int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, - const void *val, - unsigned int bytes, - struct x86_exception *exception) +int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, + void *val, unsigned int bytes, + struct x86_exception *exception, + struct read_write_emulator_ops *ops) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); @@ -4268,16 +4190,38 @@ int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, int rc, now; now = -addr & ~PAGE_MASK; - rc = emulator_write_emulated_onepage(addr, val, now, exception, - vcpu); + rc = emulator_read_write_onepage(addr, val, now, exception, + vcpu, ops); + if (rc != X86EMUL_CONTINUE) return rc; addr += now; val += now; bytes -= now; } - return emulator_write_emulated_onepage(addr, val, bytes, exception, - vcpu); + + return emulator_read_write_onepage(addr, val, bytes, exception, + vcpu, ops); +} + +static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, + void *val, + unsigned int bytes, + struct x86_exception *exception) +{ + return emulator_read_write(ctxt, addr, val, bytes, + exception, &read_emultor); +} + +int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, + const void *val, + unsigned int 
bytes, + struct x86_exception *exception) +{ + return emulator_read_write(ctxt, addr, (void *)val, bytes, + exception, &write_emultor); } #define CMPXCHG_TYPE(t, ptr, old, new) \ -- cgit v0.10.2 From c298125f4bc30fdbe4b7c33460ef57271cc51a7d Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 18 Jul 2011 17:17:14 +0300 Subject: KVM: MMIO: Lock coalesced device when checking for available entry Move the check whether there are available entries to within the spinlock. This allows working with larger amount of VCPUs and reduces premature exits when using a large number of VCPUs. Cc: Avi Kivity Cc: Ingo Molnar Cc: Marcelo Tosatti Cc: Pekka Enberg Signed-off-by: Sasha Levin Signed-off-by: Marcelo Tosatti diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index fc84875..ae075dc 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -25,23 +25,8 @@ static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev, gpa_t addr, int len) { struct kvm_coalesced_mmio_zone *zone; - struct kvm_coalesced_mmio_ring *ring; - unsigned avail; int i; - /* Are we able to batch it ? */ - - /* last is the first free entry - * check if we don't meet the first used entry - * there is always one unused entry in the buffer - */ - ring = dev->kvm->coalesced_mmio_ring; - avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX; - if (avail < KVM_MAX_VCPUS) { - /* full */ - return 0; - } - /* is it in a batchable area ? */ for (i = 0; i < dev->nb_zones; i++) { @@ -58,16 +43,43 @@ static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev, return 0; } +static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev) +{ + struct kvm_coalesced_mmio_ring *ring; + unsigned avail; + + /* Are we able to batch it ? */ + + /* last is the first free entry + * check if we don't meet the first used entry + * there is always one unused entry in the buffer + */ + ring = dev->kvm->coalesced_mmio_ring; + avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX; + if (avail == 0) { + /* full */ + return 0; + } + + return 1; +} + static int coalesced_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, const void *val) { struct kvm_coalesced_mmio_dev *dev = to_mmio(this); struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; + if (!coalesced_mmio_in_range(dev, addr, len)) return -EOPNOTSUPP; spin_lock(&dev->lock); + if (!coalesced_mmio_has_room(dev)) { + spin_unlock(&dev->lock); + return -EOPNOTSUPP; + } + /* copy data in first free entry of the ring */ ring->coalesced_mmio[ring->last].phys_addr = addr; -- cgit v0.10.2 From 8c3ba334f8588e1d5099f8602cf01897720e0eca Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 18 Jul 2011 17:17:15 +0300 Subject: KVM: x86: Raise the hard VCPU count limit The patch raises the hard limit of VCPU count to 254. This will allow developers to easily work on scalability and will allow users to test high VCPU setups easily without patching the kernel. To prevent possible issues with current setups, KVM_CAP_NR_VCPUS now returns the recommended VCPU limit (which is still 64) - this should be a safe value for everybody, while a new KVM_CAP_MAX_VCPUS returns the hard limit which is now 254. 
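As an aside (not part of the patch), a minimal userspace sketch of the probing order the documentation describes: query the recommended limit first, then the hard limit, and fall back as specified when a capability is absent. It assumes a uapi header that already defines KVM_CAP_MAX_VCPUS and a kvm_fd obtained from open("/dev/kvm", O_RDWR); KVM_CHECK_EXTENSION returns 0 for unknown capabilities, which drives the fallbacks.

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical helper; kvm_fd is the /dev/kvm file descriptor. */
static int probe_max_vcpus(int kvm_fd)
{
	int nr  = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_NR_VCPUS);
	int max = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);

	if (nr <= 0)	/* capability missing: assume 4, per the documentation */
		nr = 4;
	if (max <= 0)	/* capability missing: same as KVM_CAP_NR_VCPUS */
		max = nr;
	return max;
}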
Cc: Avi Kivity Cc: Ingo Molnar Cc: Marcelo Tosatti Cc: Pekka Enberg Suggested-by: Pekka Enberg Signed-off-by: Sasha Levin Signed-off-by: Marcelo Tosatti diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index b0e4b9c..75cd8fb 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -175,10 +175,17 @@ Parameters: vcpu id (apic id on x86) Returns: vcpu fd on success, -1 on error This API adds a vcpu to a virtual machine. The vcpu id is a small integer -in the range [0, max_vcpus). You can use KVM_CAP_NR_VCPUS of the -KVM_CHECK_EXTENSION ioctl() to determine the value for max_vcpus at run-time. +in the range [0, max_vcpus). + +The recommended max_vcpus value can be retrieved using the KVM_CAP_NR_VCPUS of +the KVM_CHECK_EXTENSION ioctl() at run-time. +The maximum possible value for max_vcpus can be retrieved using the +KVM_CAP_MAX_VCPUS of the KVM_CHECK_EXTENSION ioctl() at run-time. + If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4 cpus max. +If the KVM_CAP_MAX_VCPUS does not exist, you should assume that max_vcpus is +same as the value returned from KVM_CAP_NR_VCPUS. On powerpc using book3s_hv mode, the vcpus are mapped onto virtual threads in one or more virtual CPU cores. (This is because the diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dd51c83..c00ec28 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -26,7 +26,8 @@ #include #include -#define KVM_MAX_VCPUS 64 +#define KVM_MAX_VCPUS 254 +#define KVM_SOFT_MAX_VCPUS 64 #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2b76ae3..41dfebe 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2086,6 +2086,9 @@ int kvm_dev_ioctl_check_extension(long ext) r = !kvm_x86_ops->cpu_has_accelerated_tpr(); break; case KVM_CAP_NR_VCPUS: + r = KVM_SOFT_MAX_VCPUS; + break; + case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; case KVM_CAP_NR_MEMSLOTS: diff --git a/include/linux/kvm.h b/include/linux/kvm.h index aace6b8..2069798 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -463,7 +463,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_VAPIC 6 #define KVM_CAP_EXT_CPUID 7 #define KVM_CAP_CLOCKSOURCE 8 -#define KVM_CAP_NR_VCPUS 9 /* returns max vcpus per vm */ +#define KVM_CAP_NR_VCPUS 9 /* returns recommended max vcpus per vm */ #define KVM_CAP_NR_MEMSLOTS 10 /* returns max memory slots per vm */ #define KVM_CAP_PIT 11 #define KVM_CAP_NOP_IO_DELAY 12 @@ -553,6 +553,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_SPAPR_TCE 63 #define KVM_CAP_PPC_SMT 64 #define KVM_CAP_PPC_RMA 65 +#define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ #define KVM_CAP_S390_GMAP 71 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v0.10.2 From 2b3c246a682c50f5415c71fc5387a114a6f0d643 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 20 Jul 2011 20:59:00 +0300 Subject: KVM: Make coalesced mmio use a device per zone This patch changes coalesced mmio to create one mmio device per zone instead of handling all zones in one device. Doing so enables us to take advantage of existing locking and prevents a race condition between coalesced mmio registration/unregistration and lookups. 
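For context, the ioctl interface is unchanged by this patch; a hedged sketch of a caller (the helper name, vm_fd and the addr/size parameters are illustrative only) shows that each registered zone now becomes its own device on the MMIO bus rather than a slot in one shared device.

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* vm_fd is an existing VM file descriptor obtained via KVM_CREATE_VM. */
static int add_coalesced_zone(int vm_fd, __u64 addr, __u32 size)
{
	struct kvm_coalesced_mmio_zone zone = {
		.addr = addr,	/* guest-physical base of the zone */
		.size = size,
	};

	/* Internally this now allocates one coalesced MMIO device per zone. */
	return ioctl(vm_fd, KVM_REGISTER_COALESCED_MMIO, &zone);
}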
Suggested-by: Avi Kivity Signed-off-by: Sasha Levin Signed-off-by: Marcelo Tosatti diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index eabb21a..ff4d406 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -63,7 +63,7 @@ extern struct kmem_cache *kvm_vcpu_cache; */ struct kvm_io_bus { int dev_count; -#define NR_IOBUS_DEVS 200 +#define NR_IOBUS_DEVS 300 struct kvm_io_device *devs[NR_IOBUS_DEVS]; }; @@ -256,8 +256,9 @@ struct kvm { struct kvm_arch arch; atomic_t users_count; #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET - struct kvm_coalesced_mmio_dev *coalesced_mmio_dev; struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; + spinlock_t ring_lock; + struct list_head coalesced_zones; #endif struct mutex irq_lock; diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index ae075dc..2316ec1 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -24,23 +24,13 @@ static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev) static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev, gpa_t addr, int len) { - struct kvm_coalesced_mmio_zone *zone; - int i; - - /* is it in a batchable area ? */ - - for (i = 0; i < dev->nb_zones; i++) { - zone = &dev->zone[i]; - - /* (addr,len) is fully included in - * (zone->addr, zone->size) - */ + /* is it in a batchable area ? + * (addr,len) is fully included in + * (zone->addr, zone->size) + */ - if (zone->addr <= addr && - addr + len <= zone->addr + zone->size) - return 1; - } - return 0; + return (dev->zone.addr <= addr && + addr + len <= dev->zone.addr + dev->zone.size); } static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev) @@ -73,10 +63,10 @@ static int coalesced_mmio_write(struct kvm_io_device *this, if (!coalesced_mmio_in_range(dev, addr, len)) return -EOPNOTSUPP; - spin_lock(&dev->lock); + spin_lock(&dev->kvm->ring_lock); if (!coalesced_mmio_has_room(dev)) { - spin_unlock(&dev->lock); + spin_unlock(&dev->kvm->ring_lock); return -EOPNOTSUPP; } @@ -87,7 +77,7 @@ static int coalesced_mmio_write(struct kvm_io_device *this, memcpy(ring->coalesced_mmio[ring->last].data, val, len); smp_wmb(); ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX; - spin_unlock(&dev->lock); + spin_unlock(&dev->kvm->ring_lock); return 0; } @@ -95,6 +85,8 @@ static void coalesced_mmio_destructor(struct kvm_io_device *this) { struct kvm_coalesced_mmio_dev *dev = to_mmio(this); + list_del(&dev->list); + kfree(dev); } @@ -105,7 +97,6 @@ static const struct kvm_io_device_ops coalesced_mmio_ops = { int kvm_coalesced_mmio_init(struct kvm *kvm) { - struct kvm_coalesced_mmio_dev *dev; struct page *page; int ret; @@ -113,31 +104,18 @@ int kvm_coalesced_mmio_init(struct kvm *kvm) page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) goto out_err; - kvm->coalesced_mmio_ring = page_address(page); - ret = -ENOMEM; - dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); - if (!dev) - goto out_free_page; - spin_lock_init(&dev->lock); - kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops); - dev->kvm = kvm; - kvm->coalesced_mmio_dev = dev; - - mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev); - mutex_unlock(&kvm->slots_lock); - if (ret < 0) - goto out_free_dev; + ret = 0; + kvm->coalesced_mmio_ring = page_address(page); - return ret; + /* + * We're using this spinlock to sync access to the coalesced ring. 
+ * The list doesn't need it's own lock since device registration and + * unregistration should only happen when kvm->slots_lock is held. + */ + spin_lock_init(&kvm->ring_lock); + INIT_LIST_HEAD(&kvm->coalesced_zones); -out_free_dev: - kvm->coalesced_mmio_dev = NULL; - kfree(dev); -out_free_page: - kvm->coalesced_mmio_ring = NULL; - __free_page(page); out_err: return ret; } @@ -151,51 +129,49 @@ void kvm_coalesced_mmio_free(struct kvm *kvm) int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, struct kvm_coalesced_mmio_zone *zone) { - struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev; + int ret; + struct kvm_coalesced_mmio_dev *dev; - if (dev == NULL) - return -ENXIO; + dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops); + dev->kvm = kvm; + dev->zone = *zone; mutex_lock(&kvm->slots_lock); - if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) { - mutex_unlock(&kvm->slots_lock); - return -ENOBUFS; - } + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev); + if (ret < 0) + goto out_free_dev; + list_add_tail(&dev->list, &kvm->coalesced_zones); + mutex_unlock(&kvm->slots_lock); - dev->zone[dev->nb_zones] = *zone; - dev->nb_zones++; + return ret; +out_free_dev: mutex_unlock(&kvm->slots_lock); + + kfree(dev); + + if (dev == NULL) + return -ENXIO; + return 0; } int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, struct kvm_coalesced_mmio_zone *zone) { - int i; - struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev; - struct kvm_coalesced_mmio_zone *z; - - if (dev == NULL) - return -ENXIO; + struct kvm_coalesced_mmio_dev *dev, *tmp; mutex_lock(&kvm->slots_lock); - i = dev->nb_zones; - while (i) { - z = &dev->zone[i - 1]; - - /* unregister all zones - * included in (zone->addr, zone->size) - */ - - if (zone->addr <= z->addr && - z->addr + z->size <= zone->addr + zone->size) { - dev->nb_zones--; - *z = dev->zone[dev->nb_zones]; + list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list) + if (coalesced_mmio_in_range(dev, zone->addr, zone->size)) { + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dev->dev); + kvm_iodevice_destructor(&dev->dev); } - i--; - } mutex_unlock(&kvm->slots_lock); diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h index 8a5959e..b280c20 100644 --- a/virt/kvm/coalesced_mmio.h +++ b/virt/kvm/coalesced_mmio.h @@ -12,14 +12,13 @@ #ifdef CONFIG_KVM_MMIO -#define KVM_COALESCED_MMIO_ZONE_MAX 100 +#include struct kvm_coalesced_mmio_dev { + struct list_head list; struct kvm_io_device dev; struct kvm *kvm; - spinlock_t lock; - int nb_zones; - struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX]; + struct kvm_coalesced_mmio_zone zone; }; int kvm_coalesced_mmio_init(struct kvm *kvm); -- cgit v0.10.2 From 14fa67ee95d4f7313fbf149fe37faccb903857c8 Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Thu, 21 Jul 2011 15:38:10 -0700 Subject: KVM: x86: get_msr support for HV_X64_MSR_APIC_ASSIST_PAGE "get" support for the HV_X64_MSR_APIC_ASSIST_PAGE msr was missing, even though it is explicitly enumerated as something the vmm should save in msrs_to_save and reported to userland via the KVM_GET_MSR_INDEX_LIST ioctl. Add "get" support for HV_X64_MSR_APIC_ASSIST_PAGE. We simply return the guest visible value of this register, which seems to be correct as a set on the register is validated for us already. 
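A hedged sketch of why the "get" path matters in practice: a save/migration path in userspace reads the register back through KVM_GET_MSRS (the helper name, vcpu_fd and the error handling are illustrative; the MSR number is taken from the kernel's Hyper-V header).

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#ifndef HV_X64_MSR_APIC_ASSIST_PAGE
#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073	/* value from asm/hyperv.h */
#endif

static void save_apic_assist_page(int vcpu_fd)
{
	struct {
		struct kvm_msrs		hdr;
		struct kvm_msr_entry	entry;
	} msrs = {
		.hdr.nmsrs   = 1,
		.entry.index = HV_X64_MSR_APIC_ASSIST_PAGE,
	};

	/* KVM_GET_MSRS returns the number of MSRs actually read. */
	if (ioctl(vcpu_fd, KVM_GET_MSRS, &msrs) != 1)
		fprintf(stderr, "reading the APIC assist page MSR failed\n");
	else
		printf("HV_X64_MSR_APIC_ASSIST_PAGE = 0x%llx\n",
		       (unsigned long long)msrs.entry.data);
}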
Signed-off-by: Mike Waychison Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 41dfebe..e80f0d7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1825,6 +1825,8 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); + case HV_X64_MSR_APIC_ASSIST_PAGE: + return vcpu->arch.hv_vapic; default: pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; -- cgit v0.10.2 From d1613ad5d0018a009bd4865b0fa5930abb5ed259 Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Sat, 23 Jul 2011 00:31:45 -0700 Subject: KVM: Really fix HV_X64_MSR_APIC_ASSIST_PAGE Commit 0945d4b228 tried to fix the get_msr path for the HV_X64_MSR_APIC_ASSIST_PAGE msr, but was poorly tested. We should be returning 0 if the read succeeded, and passing the value back to the caller via the pdata out argument, not returning the value directly. Signed-off-by: Mike Waychison Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e80f0d7..6cb353c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1826,7 +1826,8 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); case HV_X64_MSR_APIC_ASSIST_PAGE: - return vcpu->arch.hv_vapic; + data = vcpu->arch.hv_vapic; + break; default: pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; -- cgit v0.10.2 From e097e5ffd69cbd7be61466e2d54c145468d48073 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 22 Jul 2011 12:46:52 +0100 Subject: KVM: Record instruction set in all vmexit tracepoints The kvm_exit tracepoint recently added the isa argument to aid decoding exit_reason. The semantics of exit_reason depend on the instruction set (vmx or svm) and the isa argument allows traces to be analyzed on other machines. Add the isa argument to kvm_nested_vmexit and kvm_nested_vmexit_inject so these tracepoints can also be self-describing. 
Signed-off-by: Stefan Hajnoczi Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 475d1c9..6adb7ba 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2182,7 +2182,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) vmcb->control.exit_info_1, vmcb->control.exit_info_2, vmcb->control.exit_int_info, - vmcb->control.exit_int_info_err); + vmcb->control.exit_int_info_err, + KVM_ISA_SVM); nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page); if (!nested_vmcb) @@ -3335,7 +3336,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) svm->vmcb->control.exit_info_1, svm->vmcb->control.exit_info_2, svm->vmcb->control.exit_int_info, - svm->vmcb->control.exit_int_info_err); + svm->vmcb->control.exit_int_info_err, + KVM_ISA_SVM); vmexit = nested_svm_exit_special(svm); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 3ff898c..4e1716b 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -486,9 +486,9 @@ TRACE_EVENT(kvm_nested_intercepts, TRACE_EVENT(kvm_nested_vmexit, TP_PROTO(__u64 rip, __u32 exit_code, __u64 exit_info1, __u64 exit_info2, - __u32 exit_int_info, __u32 exit_int_info_err), + __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), TP_ARGS(rip, exit_code, exit_info1, exit_info2, - exit_int_info, exit_int_info_err), + exit_int_info, exit_int_info_err, isa), TP_STRUCT__entry( __field( __u64, rip ) @@ -497,6 +497,7 @@ TRACE_EVENT(kvm_nested_vmexit, __field( __u64, exit_info2 ) __field( __u32, exit_int_info ) __field( __u32, exit_int_info_err ) + __field( __u32, isa ) ), TP_fast_assign( @@ -506,6 +507,7 @@ TRACE_EVENT(kvm_nested_vmexit, __entry->exit_info2 = exit_info2; __entry->exit_int_info = exit_int_info; __entry->exit_int_info_err = exit_int_info_err; + __entry->isa = isa; ), TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", @@ -522,9 +524,9 @@ TRACE_EVENT(kvm_nested_vmexit, TRACE_EVENT(kvm_nested_vmexit_inject, TP_PROTO(__u32 exit_code, __u64 exit_info1, __u64 exit_info2, - __u32 exit_int_info, __u32 exit_int_info_err), + __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), TP_ARGS(exit_code, exit_info1, exit_info2, - exit_int_info, exit_int_info_err), + exit_int_info, exit_int_info_err, isa), TP_STRUCT__entry( __field( __u32, exit_code ) @@ -532,6 +534,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject, __field( __u64, exit_info2 ) __field( __u32, exit_int_info ) __field( __u32, exit_int_info_err ) + __field( __u32, isa ) ), TP_fast_assign( @@ -540,6 +543,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject, __entry->exit_info2 = exit_info2; __entry->exit_int_info = exit_int_info; __entry->exit_int_info_err = exit_int_info_err; + __entry->isa = isa; ), TP_printk("reason: %s ext_inf1: 0x%016llx " -- cgit v0.10.2 From 0d460ffc0956d2dbe12ca9f5f6aa0f8701ea9d73 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 22 Jul 2011 12:46:53 +0100 Subject: KVM: Use __print_symbolic() for vmexit tracepoints The vmexit tracepoints format the exit_reason to make it human-readable. Since the exit_reason depends on the instruction set (vmx or svm), formatting is handled with ftrace_print_symbols_seq() by referring to the appropriate exit reason table. However, the ftrace_print_symbols_seq() function is not meant to be used directly in tracepoints since it does not export the formatting table which userspace tools like trace-cmd and perf use to format traces. 
In practice perf dies when formatting vmexit-related events and trace-cmd falls back to printing the numeric value (with extra formatting code in the kvm plugin to paper over this limitation). Other userspace consumers of vmexit-related tracepoints would be in similar trouble. To avoid significant changes to the kvm_exit tracepoint, this patch moves the vmx and svm exit reason tables into arch/x86/kvm/trace.h and selects the right table with __print_symbolic() depending on the instruction set. Note that __print_symbolic() is designed for exporting the formatting table to userspace and allows trace-cmd and perf to work. Signed-off-by: Stefan Hajnoczi Signed-off-by: Avi Kivity diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c00ec28..307e3cf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -635,8 +635,6 @@ struct kvm_x86_ops { int (*check_intercept)(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage); - - const struct trace_print_flags *exit_reasons_str; }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6adb7ba..2b24a88 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3899,60 +3899,6 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) } } -static const struct trace_print_flags svm_exit_reasons_str[] = { - { SVM_EXIT_READ_CR0, "read_cr0" }, - { SVM_EXIT_READ_CR3, "read_cr3" }, - { SVM_EXIT_READ_CR4, "read_cr4" }, - { SVM_EXIT_READ_CR8, "read_cr8" }, - { SVM_EXIT_WRITE_CR0, "write_cr0" }, - { SVM_EXIT_WRITE_CR3, "write_cr3" }, - { SVM_EXIT_WRITE_CR4, "write_cr4" }, - { SVM_EXIT_WRITE_CR8, "write_cr8" }, - { SVM_EXIT_READ_DR0, "read_dr0" }, - { SVM_EXIT_READ_DR1, "read_dr1" }, - { SVM_EXIT_READ_DR2, "read_dr2" }, - { SVM_EXIT_READ_DR3, "read_dr3" }, - { SVM_EXIT_WRITE_DR0, "write_dr0" }, - { SVM_EXIT_WRITE_DR1, "write_dr1" }, - { SVM_EXIT_WRITE_DR2, "write_dr2" }, - { SVM_EXIT_WRITE_DR3, "write_dr3" }, - { SVM_EXIT_WRITE_DR5, "write_dr5" }, - { SVM_EXIT_WRITE_DR7, "write_dr7" }, - { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, - { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, - { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, - { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, - { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, - { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, - { SVM_EXIT_INTR, "interrupt" }, - { SVM_EXIT_NMI, "nmi" }, - { SVM_EXIT_SMI, "smi" }, - { SVM_EXIT_INIT, "init" }, - { SVM_EXIT_VINTR, "vintr" }, - { SVM_EXIT_CPUID, "cpuid" }, - { SVM_EXIT_INVD, "invd" }, - { SVM_EXIT_HLT, "hlt" }, - { SVM_EXIT_INVLPG, "invlpg" }, - { SVM_EXIT_INVLPGA, "invlpga" }, - { SVM_EXIT_IOIO, "io" }, - { SVM_EXIT_MSR, "msr" }, - { SVM_EXIT_TASK_SWITCH, "task_switch" }, - { SVM_EXIT_SHUTDOWN, "shutdown" }, - { SVM_EXIT_VMRUN, "vmrun" }, - { SVM_EXIT_VMMCALL, "hypercall" }, - { SVM_EXIT_VMLOAD, "vmload" }, - { SVM_EXIT_VMSAVE, "vmsave" }, - { SVM_EXIT_STGI, "stgi" }, - { SVM_EXIT_CLGI, "clgi" }, - { SVM_EXIT_SKINIT, "skinit" }, - { SVM_EXIT_WBINVD, "wbinvd" }, - { SVM_EXIT_MONITOR, "monitor" }, - { SVM_EXIT_MWAIT, "mwait" }, - { SVM_EXIT_XSETBV, "xsetbv" }, - { SVM_EXIT_NPF, "npf" }, - { -1, NULL } -}; - static int svm_get_lpage_level(void) { return PT_PDPE_LEVEL; @@ -4225,7 +4171,6 @@ static struct kvm_x86_ops svm_x86_ops = { .get_mt_mask = svm_get_mt_mask, .get_exit_info = svm_get_exit_info, - .exit_reasons_str = svm_exit_reasons_str, .get_lpage_level = svm_get_lpage_level, diff --git a/arch/x86/kvm/trace.h 
b/arch/x86/kvm/trace.h index 4e1716b..911d264 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -2,6 +2,8 @@ #define _TRACE_KVM_H #include +#include +#include #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm @@ -181,6 +183,95 @@ TRACE_EVENT(kvm_apic, #define KVM_ISA_VMX 1 #define KVM_ISA_SVM 2 +#define VMX_EXIT_REASONS \ + { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ + { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ + { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ + { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ + { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ + { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ + { EXIT_REASON_CPUID, "CPUID" }, \ + { EXIT_REASON_HLT, "HLT" }, \ + { EXIT_REASON_INVLPG, "INVLPG" }, \ + { EXIT_REASON_RDPMC, "RDPMC" }, \ + { EXIT_REASON_RDTSC, "RDTSC" }, \ + { EXIT_REASON_VMCALL, "VMCALL" }, \ + { EXIT_REASON_VMCLEAR, "VMCLEAR" }, \ + { EXIT_REASON_VMLAUNCH, "VMLAUNCH" }, \ + { EXIT_REASON_VMPTRLD, "VMPTRLD" }, \ + { EXIT_REASON_VMPTRST, "VMPTRST" }, \ + { EXIT_REASON_VMREAD, "VMREAD" }, \ + { EXIT_REASON_VMRESUME, "VMRESUME" }, \ + { EXIT_REASON_VMWRITE, "VMWRITE" }, \ + { EXIT_REASON_VMOFF, "VMOFF" }, \ + { EXIT_REASON_VMON, "VMON" }, \ + { EXIT_REASON_CR_ACCESS, "CR_ACCESS" }, \ + { EXIT_REASON_DR_ACCESS, "DR_ACCESS" }, \ + { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \ + { EXIT_REASON_MSR_READ, "MSR_READ" }, \ + { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ + { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ + { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ + { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \ + { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ + { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ + { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ + { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ + { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ + { EXIT_REASON_WBINVD, "WBINVD" } + +#define SVM_EXIT_REASONS \ + { SVM_EXIT_READ_CR0, "read_cr0" }, \ + { SVM_EXIT_READ_CR3, "read_cr3" }, \ + { SVM_EXIT_READ_CR4, "read_cr4" }, \ + { SVM_EXIT_READ_CR8, "read_cr8" }, \ + { SVM_EXIT_WRITE_CR0, "write_cr0" }, \ + { SVM_EXIT_WRITE_CR3, "write_cr3" }, \ + { SVM_EXIT_WRITE_CR4, "write_cr4" }, \ + { SVM_EXIT_WRITE_CR8, "write_cr8" }, \ + { SVM_EXIT_READ_DR0, "read_dr0" }, \ + { SVM_EXIT_READ_DR1, "read_dr1" }, \ + { SVM_EXIT_READ_DR2, "read_dr2" }, \ + { SVM_EXIT_READ_DR3, "read_dr3" }, \ + { SVM_EXIT_WRITE_DR0, "write_dr0" }, \ + { SVM_EXIT_WRITE_DR1, "write_dr1" }, \ + { SVM_EXIT_WRITE_DR2, "write_dr2" }, \ + { SVM_EXIT_WRITE_DR3, "write_dr3" }, \ + { SVM_EXIT_WRITE_DR5, "write_dr5" }, \ + { SVM_EXIT_WRITE_DR7, "write_dr7" }, \ + { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \ + { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \ + { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \ + { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ + { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \ + { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \ + { SVM_EXIT_INTR, "interrupt" }, \ + { SVM_EXIT_NMI, "nmi" }, \ + { SVM_EXIT_SMI, "smi" }, \ + { SVM_EXIT_INIT, "init" }, \ + { SVM_EXIT_VINTR, "vintr" }, \ + { SVM_EXIT_CPUID, "cpuid" }, \ + { SVM_EXIT_INVD, "invd" }, \ + { SVM_EXIT_HLT, "hlt" }, \ + { SVM_EXIT_INVLPG, "invlpg" }, \ + { SVM_EXIT_INVLPGA, "invlpga" }, \ + { SVM_EXIT_IOIO, "io" }, \ + { SVM_EXIT_MSR, "msr" }, \ + { SVM_EXIT_TASK_SWITCH, "task_switch" }, \ + { SVM_EXIT_SHUTDOWN, "shutdown" }, \ + { SVM_EXIT_VMRUN, "vmrun" }, \ + { SVM_EXIT_VMMCALL, "hypercall" }, \ + { 
SVM_EXIT_VMLOAD, "vmload" }, \ + { SVM_EXIT_VMSAVE, "vmsave" }, \ + { SVM_EXIT_STGI, "stgi" }, \ + { SVM_EXIT_CLGI, "clgi" }, \ + { SVM_EXIT_SKINIT, "skinit" }, \ + { SVM_EXIT_WBINVD, "wbinvd" }, \ + { SVM_EXIT_MONITOR, "monitor" }, \ + { SVM_EXIT_MWAIT, "mwait" }, \ + { SVM_EXIT_XSETBV, "xsetbv" }, \ + { SVM_EXIT_NPF, "npf" } + /* * Tracepoint for kvm guest exit: */ @@ -205,8 +296,9 @@ TRACE_EVENT(kvm_exit, ), TP_printk("reason %s rip 0x%lx info %llx %llx", - ftrace_print_symbols_seq(p, __entry->exit_reason, - kvm_x86_ops->exit_reasons_str), + (__entry->isa == KVM_ISA_VMX) ? + __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) : + __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS), __entry->guest_rip, __entry->info1, __entry->info2) ); @@ -512,8 +604,9 @@ TRACE_EVENT(kvm_nested_vmexit, TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", __entry->rip, - ftrace_print_symbols_seq(p, __entry->exit_code, - kvm_x86_ops->exit_reasons_str), + (__entry->isa == KVM_ISA_VMX) ? + __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) : + __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS), __entry->exit_info1, __entry->exit_info2, __entry->exit_int_info, __entry->exit_int_info_err) ); @@ -548,8 +641,9 @@ TRACE_EVENT(kvm_nested_vmexit_inject, TP_printk("reason: %s ext_inf1: 0x%016llx " "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", - ftrace_print_symbols_seq(p, __entry->exit_code, - kvm_x86_ops->exit_reasons_str), + (__entry->isa == KVM_ISA_VMX) ? + __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) : + __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS), __entry->exit_info1, __entry->exit_info2, __entry->exit_int_info, __entry->exit_int_info_err) ); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e65a158..e26629f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6241,49 +6241,6 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return ret; } -#define _ER(x) { EXIT_REASON_##x, #x } - -static const struct trace_print_flags vmx_exit_reasons_str[] = { - _ER(EXCEPTION_NMI), - _ER(EXTERNAL_INTERRUPT), - _ER(TRIPLE_FAULT), - _ER(PENDING_INTERRUPT), - _ER(NMI_WINDOW), - _ER(TASK_SWITCH), - _ER(CPUID), - _ER(HLT), - _ER(INVLPG), - _ER(RDPMC), - _ER(RDTSC), - _ER(VMCALL), - _ER(VMCLEAR), - _ER(VMLAUNCH), - _ER(VMPTRLD), - _ER(VMPTRST), - _ER(VMREAD), - _ER(VMRESUME), - _ER(VMWRITE), - _ER(VMOFF), - _ER(VMON), - _ER(CR_ACCESS), - _ER(DR_ACCESS), - _ER(IO_INSTRUCTION), - _ER(MSR_READ), - _ER(MSR_WRITE), - _ER(MWAIT_INSTRUCTION), - _ER(MONITOR_INSTRUCTION), - _ER(PAUSE_INSTRUCTION), - _ER(MCE_DURING_VMENTRY), - _ER(TPR_BELOW_THRESHOLD), - _ER(APIC_ACCESS), - _ER(EPT_VIOLATION), - _ER(EPT_MISCONFIG), - _ER(WBINVD), - { -1, NULL } -}; - -#undef _ER - static int vmx_get_lpage_level(void) { if (enable_ept && !cpu_has_vmx_ept_1g_page()) @@ -7039,7 +6996,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_mt_mask = vmx_get_mt_mask, .get_exit_info = vmx_get_exit_info, - .exit_reasons_str = vmx_exit_reasons_str, .get_lpage_level = vmx_get_lpage_level, -- cgit v0.10.2 From 743eeb0b01d2fbf4154bf87bff1ebb6fb18aeb7a Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 27 Jul 2011 16:00:48 +0300 Subject: KVM: Intelligent device lookup on I/O bus Currently the method of dealing with an IO operation on a bus (PIO/MMIO) is to call the read or write callback for each device registered on the bus until we find a device which handles it. 
Since the number of devices on a bus can be significant due to ioeventfds and coalesced MMIO zones, this leads to a lot of overhead on each IO operation. Instead of registering devices, we now register ranges which points to a device. Lookup is done using an efficient bsearch instead of a linear search. Performance test was conducted by comparing exit count per second with 200 ioeventfds created on one byte and the guest is trying to access a different byte continuously (triggering usermode exits). Before the patch the guest has achieved 259k exits per second, after the patch the guest does 274k exits per second. Cc: Avi Kivity Cc: Marcelo Tosatti Signed-off-by: Sasha Levin Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index efad723..76e3f1c 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -713,14 +713,16 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); kvm_iodevice_init(&pit->dev, &pit_dev_ops); - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS, + KVM_PIT_MEM_LENGTH, &pit->dev); if (ret < 0) goto fail; if (flags & KVM_PIT_SPEAKER_DUMMY) { kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, - &pit->speaker_dev); + KVM_SPEAKER_BASE_ADDRESS, 4, + &pit->speaker_dev); if (ret < 0) goto fail_unregister; } diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 19fe855..6b869ce 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -459,15 +459,9 @@ static int picdev_in_range(gpa_t addr) } } -static inline struct kvm_pic *to_pic(struct kvm_io_device *dev) -{ - return container_of(dev, struct kvm_pic, dev); -} - -static int picdev_write(struct kvm_io_device *this, +static int picdev_write(struct kvm_pic *s, gpa_t addr, int len, const void *val) { - struct kvm_pic *s = to_pic(this); unsigned char data = *(unsigned char *)val; if (!picdev_in_range(addr)) return -EOPNOTSUPP; @@ -494,10 +488,9 @@ static int picdev_write(struct kvm_io_device *this, return 0; } -static int picdev_read(struct kvm_io_device *this, +static int picdev_read(struct kvm_pic *s, gpa_t addr, int len, void *val) { - struct kvm_pic *s = to_pic(this); unsigned char data = 0; if (!picdev_in_range(addr)) return -EOPNOTSUPP; @@ -525,6 +518,48 @@ static int picdev_read(struct kvm_io_device *this, return 0; } +static int picdev_master_write(struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + return picdev_write(container_of(dev, struct kvm_pic, dev_master), + addr, len, val); +} + +static int picdev_master_read(struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + return picdev_read(container_of(dev, struct kvm_pic, dev_master), + addr, len, val); +} + +static int picdev_slave_write(struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + return picdev_write(container_of(dev, struct kvm_pic, dev_slave), + addr, len, val); +} + +static int picdev_slave_read(struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + return picdev_read(container_of(dev, struct kvm_pic, dev_slave), + addr, len, val); +} + +static int picdev_eclr_write(struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + return picdev_write(container_of(dev, struct kvm_pic, dev_eclr), + addr, len, val); +} + +static int picdev_eclr_read(struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + return 
picdev_read(container_of(dev, struct kvm_pic, dev_eclr), + addr, len, val); +} + /* * callback when PIC0 irq status changed */ @@ -537,9 +572,19 @@ static void pic_irq_request(struct kvm *kvm, int level) s->output = level; } -static const struct kvm_io_device_ops picdev_ops = { - .read = picdev_read, - .write = picdev_write, +static const struct kvm_io_device_ops picdev_master_ops = { + .read = picdev_master_read, + .write = picdev_master_write, +}; + +static const struct kvm_io_device_ops picdev_slave_ops = { + .read = picdev_slave_read, + .write = picdev_slave_write, +}; + +static const struct kvm_io_device_ops picdev_eclr_ops = { + .read = picdev_eclr_read, + .write = picdev_eclr_write, }; struct kvm_pic *kvm_create_pic(struct kvm *kvm) @@ -560,16 +605,39 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) /* * Initialize PIO device */ - kvm_iodevice_init(&s->dev, &picdev_ops); + kvm_iodevice_init(&s->dev_master, &picdev_master_ops); + kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops); + kvm_iodevice_init(&s->dev_eclr, &picdev_eclr_ops); mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2, + &s->dev_master); + if (ret < 0) + goto fail_unlock; + + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0xa0, 2, &s->dev_slave); + if (ret < 0) + goto fail_unreg_2; + + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr); + if (ret < 0) + goto fail_unreg_1; + mutex_unlock(&kvm->slots_lock); - if (ret < 0) { - kfree(s); - return NULL; - } return s; + +fail_unreg_1: + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave); + +fail_unreg_2: + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_master); + +fail_unlock: + mutex_unlock(&kvm->slots_lock); + + kfree(s); + + return NULL; } void kvm_destroy_pic(struct kvm *kvm) @@ -577,7 +645,9 @@ void kvm_destroy_pic(struct kvm *kvm) struct kvm_pic *vpic = kvm->arch.vpic; if (vpic) { - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_master); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_slave); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_eclr); kvm->arch.vpic = NULL; kfree(vpic); } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 53e2d08..2086f2b 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -66,7 +66,9 @@ struct kvm_pic { struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ int output; /* intr from master PIC */ - struct kvm_io_device dev; + struct kvm_io_device dev_master; + struct kvm_io_device dev_slave; + struct kvm_io_device dev_eclr; void (*ack_notifier)(void *opaque, int irq); unsigned long irq_states[16]; }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6cb353c..d28dff7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3562,7 +3562,11 @@ long kvm_arch_vm_ioctl(struct file *filp, if (r) { mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &vpic->dev); + &vpic->dev_master); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &vpic->dev_slave); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &vpic->dev_eclr); mutex_unlock(&kvm->slots_lock); kfree(vpic); goto create_irqchip_unlock; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ff4d406..d0e42f3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -55,16 +55,16 @@ struct kvm; struct kvm_vcpu; extern struct kmem_cache *kvm_vcpu_cache; -/* - * It would be 
nice to use something smarter than a linear search, TBD... - * Thankfully we dont expect many devices to register (famous last words :), - * so until then it will suffice. At least its abstracted so we can change - * in one place. - */ +struct kvm_io_range { + gpa_t addr; + int len; + struct kvm_io_device *dev; +}; + struct kvm_io_bus { int dev_count; #define NR_IOBUS_DEVS 300 - struct kvm_io_device *devs[NR_IOBUS_DEVS]; + struct kvm_io_range range[NR_IOBUS_DEVS]; }; enum kvm_bus { @@ -77,8 +77,8 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val); int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, void *val); -int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, - struct kvm_io_device *dev); +int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, + int len, struct kvm_io_device *dev); int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_io_device *dev); diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 2316ec1..a6ec206 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -141,7 +141,8 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, dev->zone = *zone; mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, zone->addr, + zone->size, &dev->dev); if (ret < 0) goto out_free_dev; list_add_tail(&dev->list, &kvm->coalesced_zones); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 73358d2..f59c1e8 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -586,7 +586,8 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) kvm_iodevice_init(&p->dev, &ioeventfd_ops); - ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev); + ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length, + &p->dev); if (ret < 0) goto unlock_fail; diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 8df1ca1..3eed61e 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -394,7 +394,8 @@ int kvm_ioapic_init(struct kvm *kvm) kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); ioapic->kvm = kvm; mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address, + IOAPIC_MEM_LENGTH, &ioapic->dev); mutex_unlock(&kvm->slots_lock); if (ret < 0) { kvm->arch.vioapic = NULL; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index aefdda3..d9cfb78 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -47,6 +47,8 @@ #include #include #include +#include +#include #include #include @@ -2391,24 +2393,92 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus) int i; for (i = 0; i < bus->dev_count; i++) { - struct kvm_io_device *pos = bus->devs[i]; + struct kvm_io_device *pos = bus->range[i].dev; kvm_iodevice_destructor(pos); } kfree(bus); } +int kvm_io_bus_sort_cmp(const void *p1, const void *p2) +{ + const struct kvm_io_range *r1 = p1; + const struct kvm_io_range *r2 = p2; + + if (r1->addr < r2->addr) + return -1; + if (r1->addr + r1->len > r2->addr + r2->len) + return 1; + return 0; +} + +int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev, + gpa_t addr, int len) +{ + if (bus->dev_count == NR_IOBUS_DEVS) + return -ENOSPC; + + bus->range[bus->dev_count++] = (struct kvm_io_range) { + .addr = addr, + .len = len, + .dev = dev, + }; + + sort(bus->range, bus->dev_count, sizeof(struct 
kvm_io_range), + kvm_io_bus_sort_cmp, NULL); + + return 0; +} + +int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, + gpa_t addr, int len) +{ + struct kvm_io_range *range, key; + int off; + + key = (struct kvm_io_range) { + .addr = addr, + .len = len, + }; + + range = bsearch(&key, bus->range, bus->dev_count, + sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); + if (range == NULL) + return -ENOENT; + + off = range - bus->range; + + while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0) + off--; + + return off; +} + /* kvm_io_bus_write - called under kvm->slots_lock */ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val) { - int i; + int idx; struct kvm_io_bus *bus; + struct kvm_io_range range; + + range = (struct kvm_io_range) { + .addr = addr, + .len = len, + }; bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); - for (i = 0; i < bus->dev_count; i++) - if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) + idx = kvm_io_bus_get_first_dev(bus, addr, len); + if (idx < 0) + return -EOPNOTSUPP; + + while (idx < bus->dev_count && + kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) { + if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val)) return 0; + idx++; + } + return -EOPNOTSUPP; } @@ -2416,19 +2486,33 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, void *val) { - int i; + int idx; struct kvm_io_bus *bus; + struct kvm_io_range range; + + range = (struct kvm_io_range) { + .addr = addr, + .len = len, + }; bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); - for (i = 0; i < bus->dev_count; i++) - if (!kvm_iodevice_read(bus->devs[i], addr, len, val)) + idx = kvm_io_bus_get_first_dev(bus, addr, len); + if (idx < 0) + return -EOPNOTSUPP; + + while (idx < bus->dev_count && + kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) { + if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val)) return 0; + idx++; + } + return -EOPNOTSUPP; } /* Caller must hold slots_lock. */ -int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, - struct kvm_io_device *dev) +int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, + int len, struct kvm_io_device *dev) { struct kvm_io_bus *new_bus, *bus; @@ -2440,7 +2524,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, if (!new_bus) return -ENOMEM; memcpy(new_bus, bus, sizeof(struct kvm_io_bus)); - new_bus->devs[new_bus->dev_count++] = dev; + kvm_io_bus_insert_dev(new_bus, dev, addr, len); rcu_assign_pointer(kvm->buses[bus_idx], new_bus); synchronize_srcu_expedited(&kvm->srcu); kfree(bus); @@ -2464,9 +2548,13 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, r = -ENOENT; for (i = 0; i < new_bus->dev_count; i++) - if (new_bus->devs[i] == dev) { + if (new_bus->range[i].dev == dev) { r = 0; - new_bus->devs[i] = new_bus->devs[--new_bus->dev_count]; + new_bus->dev_count--; + new_bus->range[i] = new_bus->range[new_bus->dev_count]; + sort(new_bus->range, new_bus->dev_count, + sizeof(struct kvm_io_range), + kvm_io_bus_sort_cmp, NULL); break; } -- cgit v0.10.2 From 807941b121cf77e70eec8db308b8c1f496cc79e9 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sat, 30 Jul 2011 18:00:17 +0900 Subject: KVM: x86 emulator: Use ctxt->_eip directly in do_insn_fetch_byte() Instead of passing ctxt->_eip from insn_fetch() call sites, get it from ctxt in do_insn_fetch_byte(). 
This is done by replacing the argument _eip of insn_fetch() with _ctxt, which should be better than letting the macro use ctxt silently in its body. Though this changes the place where ctxt->_eip is incremented from insn_fetch() to do_insn_fetch_byte(), this does not have any real effect. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8b4cc5f..ce48dc4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -651,18 +651,26 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); } -static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, - unsigned long eip, u8 *dest) +/* + * Fetch the next byte of the instruction being emulated which is pointed to + * by ctxt->_eip, then increment ctxt->_eip. + * + * Also prefetch the remaining bytes of the instruction without crossing page + * boundary if they are not in fetch_cache yet. + */ +static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, u8 *dest) { struct fetch_cache *fc = &ctxt->fetch; int rc; int size, cur_size; - if (eip == fc->end) { + if (ctxt->_eip == fc->end) { unsigned long linear; - struct segmented_address addr = { .seg=VCPU_SREG_CS, .ea=eip}; + struct segmented_address addr = { .seg = VCPU_SREG_CS, + .ea = ctxt->_eip }; cur_size = fc->end - fc->start; - size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); + size = min(15UL - cur_size, + PAGE_SIZE - offset_in_page(ctxt->_eip)); rc = __linearize(ctxt, addr, size, false, true, &linear); if (rc != X86EMUL_CONTINUE) return rc; @@ -672,20 +680,21 @@ static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, return rc; fc->end += size; } - *dest = fc->data[eip - fc->start]; + *dest = fc->data[ctxt->_eip - fc->start]; + ctxt->_eip++; return X86EMUL_CONTINUE; } static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, - unsigned long eip, void *dest, unsigned size) + void *dest, unsigned size) { int rc; /* x86 instructions are limited to 15 bytes. */ - if (eip + size - ctxt->eip > 15) + if (ctxt->_eip + size - ctxt->eip > 15) return X86EMUL_UNHANDLEABLE; while (size--) { - rc = do_insn_fetch_byte(ctxt, eip++, dest++); + rc = do_insn_fetch_byte(ctxt, dest++); if (rc != X86EMUL_CONTINUE) return rc; } @@ -693,20 +702,18 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, } /* Fetch next part of the instruction being emulated. 
*/ -#define insn_fetch(_type, _size, _eip) \ +#define insn_fetch(_type, _size, _ctxt) \ ({ unsigned long _x; \ - rc = do_insn_fetch(ctxt, (_eip), &_x, (_size)); \ + rc = do_insn_fetch(_ctxt, &_x, (_size)); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ - (_eip) += (_size); \ (_type)_x; \ }) -#define insn_fetch_arr(_arr, _size, _eip) \ -({ rc = do_insn_fetch(ctxt, (_eip), _arr, (_size)); \ +#define insn_fetch_arr(_arr, _size, _ctxt) \ +({ rc = do_insn_fetch(_ctxt, _arr, (_size)); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ - (_eip) += (_size); \ }) /* @@ -894,7 +901,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */ } - ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); + ctxt->modrm = insn_fetch(u8, 1, ctxt); ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6; ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; ctxt->modrm_rm |= (ctxt->modrm & 0x07); @@ -928,13 +935,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, switch (ctxt->modrm_mod) { case 0: if (ctxt->modrm_rm == 6) - modrm_ea += insn_fetch(u16, 2, ctxt->_eip); + modrm_ea += insn_fetch(u16, 2, ctxt); break; case 1: - modrm_ea += insn_fetch(s8, 1, ctxt->_eip); + modrm_ea += insn_fetch(s8, 1, ctxt); break; case 2: - modrm_ea += insn_fetch(u16, 2, ctxt->_eip); + modrm_ea += insn_fetch(u16, 2, ctxt); break; } switch (ctxt->modrm_rm) { @@ -971,13 +978,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, } else { /* 32/64-bit ModR/M decode. */ if ((ctxt->modrm_rm & 7) == 4) { - sib = insn_fetch(u8, 1, ctxt->_eip); + sib = insn_fetch(u8, 1, ctxt); index_reg |= (sib >> 3) & 7; base_reg |= sib & 7; scale = sib >> 6; if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) - modrm_ea += insn_fetch(s32, 4, ctxt->_eip); + modrm_ea += insn_fetch(s32, 4, ctxt); else modrm_ea += ctxt->regs[base_reg]; if (index_reg != 4) @@ -990,13 +997,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, switch (ctxt->modrm_mod) { case 0: if (ctxt->modrm_rm == 5) - modrm_ea += insn_fetch(s32, 4, ctxt->_eip); + modrm_ea += insn_fetch(s32, 4, ctxt); break; case 1: - modrm_ea += insn_fetch(s8, 1, ctxt->_eip); + modrm_ea += insn_fetch(s8, 1, ctxt); break; case 2: - modrm_ea += insn_fetch(s32, 4, ctxt->_eip); + modrm_ea += insn_fetch(s32, 4, ctxt); break; } } @@ -1013,13 +1020,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt, op->type = OP_MEM; switch (ctxt->ad_bytes) { case 2: - op->addr.mem.ea = insn_fetch(u16, 2, ctxt->_eip); + op->addr.mem.ea = insn_fetch(u16, 2, ctxt); break; case 4: - op->addr.mem.ea = insn_fetch(u32, 4, ctxt->_eip); + op->addr.mem.ea = insn_fetch(u32, 4, ctxt); break; case 8: - op->addr.mem.ea = insn_fetch(u64, 8, ctxt->_eip); + op->addr.mem.ea = insn_fetch(u64, 8, ctxt); break; } done: @@ -3309,13 +3316,13 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, /* NB. Immediates are sign-extended as necessary. */ switch (op->bytes) { case 1: - op->val = insn_fetch(s8, 1, ctxt->_eip); + op->val = insn_fetch(s8, 1, ctxt); break; case 2: - op->val = insn_fetch(s16, 2, ctxt->_eip); + op->val = insn_fetch(s16, 2, ctxt); break; case 4: - op->val = insn_fetch(s32, 4, ctxt->_eip); + op->val = insn_fetch(s32, 4, ctxt); break; } if (!sign_extension) { @@ -3374,7 +3381,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) /* Legacy prefixes. 
*/ for (;;) { - switch (ctxt->b = insn_fetch(u8, 1, ctxt->_eip)) { + switch (ctxt->b = insn_fetch(u8, 1, ctxt)) { case 0x66: /* operand-size override */ op_prefix = true; /* switch between 2/4 bytes */ @@ -3430,7 +3437,7 @@ done_prefixes: /* Two-byte opcode? */ if (ctxt->b == 0x0f) { ctxt->twobyte = 1; - ctxt->b = insn_fetch(u8, 1, ctxt->_eip); + ctxt->b = insn_fetch(u8, 1, ctxt); opcode = twobyte_table[ctxt->b]; } ctxt->d = opcode.flags; @@ -3438,13 +3445,13 @@ done_prefixes: while (ctxt->d & GroupMask) { switch (ctxt->d & GroupMask) { case Group: - ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); + ctxt->modrm = insn_fetch(u8, 1, ctxt); --ctxt->_eip; goffset = (ctxt->modrm >> 3) & 7; opcode = opcode.u.group[goffset]; break; case GroupDual: - ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); + ctxt->modrm = insn_fetch(u8, 1, ctxt); --ctxt->_eip; goffset = (ctxt->modrm >> 3) & 7; if ((ctxt->modrm >> 6) == 3) @@ -3577,7 +3584,7 @@ done_prefixes: ctxt->src.type = OP_IMM; ctxt->src.addr.mem.ea = ctxt->_eip; ctxt->src.bytes = ctxt->op_bytes + 2; - insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt->_eip); + insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt); break; case SrcMemFAddr: memop.bytes = ctxt->op_bytes + 2; @@ -3630,7 +3637,7 @@ done_prefixes: ctxt->dst.type = OP_IMM; ctxt->dst.addr.mem.ea = ctxt->_eip; ctxt->dst.bytes = 1; - ctxt->dst.val = insn_fetch(u8, 1, ctxt->_eip); + ctxt->dst.val = insn_fetch(u8, 1, ctxt); break; case DstMem: case DstMem64: -- cgit v0.10.2 From e85a10852c26d7d509ad17bac1a0d5264224b2d2 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sat, 30 Jul 2011 18:01:26 +0900 Subject: KVM: x86 emulator: Drop _size argument from insn_fetch() _type is enough to know the size. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ce48dc4..d4cc8af 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -702,9 +702,9 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, } /* Fetch next part of the instruction being emulated. */ -#define insn_fetch(_type, _size, _ctxt) \ +#define insn_fetch(_type, _ctxt) \ ({ unsigned long _x; \ - rc = do_insn_fetch(_ctxt, &_x, (_size)); \ + rc = do_insn_fetch(_ctxt, &_x, sizeof(_type)); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ (_type)_x; \ @@ -901,7 +901,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */ } - ctxt->modrm = insn_fetch(u8, 1, ctxt); + ctxt->modrm = insn_fetch(u8, ctxt); ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6; ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; ctxt->modrm_rm |= (ctxt->modrm & 0x07); @@ -935,13 +935,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, switch (ctxt->modrm_mod) { case 0: if (ctxt->modrm_rm == 6) - modrm_ea += insn_fetch(u16, 2, ctxt); + modrm_ea += insn_fetch(u16, ctxt); break; case 1: - modrm_ea += insn_fetch(s8, 1, ctxt); + modrm_ea += insn_fetch(s8, ctxt); break; case 2: - modrm_ea += insn_fetch(u16, 2, ctxt); + modrm_ea += insn_fetch(u16, ctxt); break; } switch (ctxt->modrm_rm) { @@ -978,13 +978,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, } else { /* 32/64-bit ModR/M decode. 
*/ if ((ctxt->modrm_rm & 7) == 4) { - sib = insn_fetch(u8, 1, ctxt); + sib = insn_fetch(u8, ctxt); index_reg |= (sib >> 3) & 7; base_reg |= sib & 7; scale = sib >> 6; if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) - modrm_ea += insn_fetch(s32, 4, ctxt); + modrm_ea += insn_fetch(s32, ctxt); else modrm_ea += ctxt->regs[base_reg]; if (index_reg != 4) @@ -997,13 +997,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, switch (ctxt->modrm_mod) { case 0: if (ctxt->modrm_rm == 5) - modrm_ea += insn_fetch(s32, 4, ctxt); + modrm_ea += insn_fetch(s32, ctxt); break; case 1: - modrm_ea += insn_fetch(s8, 1, ctxt); + modrm_ea += insn_fetch(s8, ctxt); break; case 2: - modrm_ea += insn_fetch(s32, 4, ctxt); + modrm_ea += insn_fetch(s32, ctxt); break; } } @@ -1020,13 +1020,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt, op->type = OP_MEM; switch (ctxt->ad_bytes) { case 2: - op->addr.mem.ea = insn_fetch(u16, 2, ctxt); + op->addr.mem.ea = insn_fetch(u16, ctxt); break; case 4: - op->addr.mem.ea = insn_fetch(u32, 4, ctxt); + op->addr.mem.ea = insn_fetch(u32, ctxt); break; case 8: - op->addr.mem.ea = insn_fetch(u64, 8, ctxt); + op->addr.mem.ea = insn_fetch(u64, ctxt); break; } done: @@ -3316,13 +3316,13 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, /* NB. Immediates are sign-extended as necessary. */ switch (op->bytes) { case 1: - op->val = insn_fetch(s8, 1, ctxt); + op->val = insn_fetch(s8, ctxt); break; case 2: - op->val = insn_fetch(s16, 2, ctxt); + op->val = insn_fetch(s16, ctxt); break; case 4: - op->val = insn_fetch(s32, 4, ctxt); + op->val = insn_fetch(s32, ctxt); break; } if (!sign_extension) { @@ -3381,7 +3381,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) /* Legacy prefixes. */ for (;;) { - switch (ctxt->b = insn_fetch(u8, 1, ctxt)) { + switch (ctxt->b = insn_fetch(u8, ctxt)) { case 0x66: /* operand-size override */ op_prefix = true; /* switch between 2/4 bytes */ @@ -3437,7 +3437,7 @@ done_prefixes: /* Two-byte opcode? */ if (ctxt->b == 0x0f) { ctxt->twobyte = 1; - ctxt->b = insn_fetch(u8, 1, ctxt); + ctxt->b = insn_fetch(u8, ctxt); opcode = twobyte_table[ctxt->b]; } ctxt->d = opcode.flags; @@ -3445,13 +3445,13 @@ done_prefixes: while (ctxt->d & GroupMask) { switch (ctxt->d & GroupMask) { case Group: - ctxt->modrm = insn_fetch(u8, 1, ctxt); + ctxt->modrm = insn_fetch(u8, ctxt); --ctxt->_eip; goffset = (ctxt->modrm >> 3) & 7; opcode = opcode.u.group[goffset]; break; case GroupDual: - ctxt->modrm = insn_fetch(u8, 1, ctxt); + ctxt->modrm = insn_fetch(u8, ctxt); --ctxt->_eip; goffset = (ctxt->modrm >> 3) & 7; if ((ctxt->modrm >> 6) == 3) @@ -3637,7 +3637,7 @@ done_prefixes: ctxt->dst.type = OP_IMM; ctxt->dst.addr.mem.ea = ctxt->_eip; ctxt->dst.bytes = 1; - ctxt->dst.val = insn_fetch(u8, 1, ctxt); + ctxt->dst.val = insn_fetch(u8, ctxt); break; case DstMem: case DstMem64: -- cgit v0.10.2 From 7d88bb4803d62f6056b079ade6333a026fd11684 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sat, 30 Jul 2011 18:02:29 +0900 Subject: KVM: x86 emulator: Let compiler know insn_fetch() rarely fails Fetching the instruction which was to be executed by the guest cannot fail normally. So compiler should always predict that it will succeed. 
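(Illustrative aside, not part of the patch: the unlikely() hints added below compile down to __builtin_expect(). A minimal stand-alone sketch — sketch_unlikely() and fetch_one_byte() are made-up names, and the kernel's real macros in <linux/compiler.h> differ in detail — looks like this.)

#define sketch_unlikely(x)	__builtin_expect(!!(x), 0)

static int fetch_one_byte(int rc)
{
	/* the failure path is hinted cold, so the compiler lays it out of line */
	if (sketch_unlikely(rc != 0))
		return rc;

	/* hot path: the fetch succeeded and execution falls straight through */
	return 0;
}

With the hint, the straight-line (successful-fetch) path stays free of taken branches, which is what the per-byte fetch loop in the hunk below benefits from.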
Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d4cc8af..191bc9b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -672,11 +672,11 @@ static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, u8 *dest) size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(ctxt->_eip)); rc = __linearize(ctxt, addr, size, false, true, &linear); - if (rc != X86EMUL_CONTINUE) + if (unlikely(rc != X86EMUL_CONTINUE)) return rc; rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size, size, &ctxt->exception); - if (rc != X86EMUL_CONTINUE) + if (unlikely(rc != X86EMUL_CONTINUE)) return rc; fc->end += size; } @@ -691,7 +691,7 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, int rc; /* x86 instructions are limited to 15 bytes. */ - if (ctxt->_eip + size - ctxt->eip > 15) + if (unlikely(ctxt->_eip + size - ctxt->eip > 15)) return X86EMUL_UNHANDLEABLE; while (size--) { rc = do_insn_fetch_byte(ctxt, dest++); -- cgit v0.10.2 From 1d2887e2d849969f58ce79203f9785ebe065d494 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sat, 30 Jul 2011 18:03:34 +0900 Subject: KVM: x86 emulator: Make x86_decode_insn() return proper macros Return EMULATION_OK/FAILED consistently. Also treat instruction fetch errors, not restricted to X86EMUL_UNHANDLEABLE, as EMULATION_FAILED; although this cannot happen in practice, the current logic will continue the emulation even if the decoder fails to fetch the instruction. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 191bc9b..fe5eb6d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3373,7 +3373,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) break; #endif default: - return -1; + return EMULATION_FAILED; } ctxt->op_bytes = def_op_bytes; @@ -3465,7 +3465,7 @@ done_prefixes: break; case Prefix: if (ctxt->rep_prefix && op_prefix) - return X86EMUL_UNHANDLEABLE; + return EMULATION_FAILED; simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix; switch (simd_prefix) { case 0x00: opcode = opcode.u.gprefix->pfx_no; break; @@ -3475,7 +3475,7 @@ done_prefixes: } break; default: - return X86EMUL_UNHANDLEABLE; + return EMULATION_FAILED; } ctxt->d &= ~GroupMask; @@ -3488,10 +3488,10 @@ done_prefixes: /* Unrecognised? */ if (ctxt->d == 0 || (ctxt->d & Undefined)) - return -1; + return EMULATION_FAILED; if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) - return -1; + return EMULATION_FAILED; if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) ctxt->op_bytes = 8; @@ -3683,7 +3683,7 @@ done: if (memopp && memopp->type == OP_MEM && ctxt->rip_relative) memopp->addr.mem.ea += ctxt->_eip; - return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; + return (rc != X86EMUL_CONTINUE) ? 
EMULATION_FAILED : EMULATION_OK; } static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d28dff7..1fe9637 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4837,7 +4837,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, trace_kvm_emulate_insn_start(vcpu); ++vcpu->stat.insn_emulation; - if (r) { + if (r != EMULATION_OK) { if (emulation_type & EMULTYPE_TRAP_UD) return EMULATE_FAIL; if (reexecute_instruction(vcpu, cr2)) -- cgit v0.10.2 From 742bc67042e34a9fe1fed0b46e4cb1431a72c4bf Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 29 Jul 2011 19:44:21 -0300 Subject: KVM: x86: report valid microcode update ID Windows Server 2008 SP2 checked build with smp > 1 BSOD's during boot due to lack of microcode update: *** Assertion failed: The system BIOS on this machine does not properly support the processor. The system BIOS did not load any microcode update. A BIOS containing the latest microcode update is needed for system reliability. (CurrentUpdateRevision != 0) *** Source File: d:\longhorn\base\hals\update\intelupd\update.c, line 440 Report a non-zero microcode update signature to make it happy. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1fe9637..ea8f9f0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1842,7 +1842,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) switch (msr) { case MSR_IA32_PLATFORM_ID: - case MSR_IA32_UCODE_REV: case MSR_IA32_EBL_CR_POWERON: case MSR_IA32_DEBUGCTLMSR: case MSR_IA32_LASTBRANCHFROMIP: @@ -1863,6 +1862,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_FAM10H_MMIO_CONF_BASE: data = 0; break; + case MSR_IA32_UCODE_REV: + data = 0x100000000ULL; + break; case MSR_MTRRcap: data = 0x500 | KVM_NR_VAR_MTRR; break; -- cgit v0.10.2 From cf3ace79c065d65e9636f719a9df1382725410e3 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 2 Aug 2011 12:34:57 +0200 Subject: KVM: VMX: trivial: use BUG_ON Use BUG_ON(x) rather than if(x) BUG(); The semantic patch that fixes this problem is as follows: (http://coccinelle.lip6.fr/) // @@ identifier x; @@ -if (x) BUG(); +BUG_ON(x); @@ identifier x; @@ -if (!x) BUG(); +BUG_ON(!x); // Signed-off-by: Julia Lawall Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e26629f..03df703 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4115,8 +4115,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { /* EPT won't cause page fault directly */ - if (enable_ept) - BUG(); + BUG_ON(enable_ept); cr2 = vmcs_readl(EXIT_QUALIFICATION); trace_kvm_page_fault(cr2, error_code); -- cgit v0.10.2 From e4e517b4be019787ada4cbbce2f04570c21b0cbd Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 28 Jul 2011 11:36:17 +0300 Subject: KVM: MMU: Do not unconditionally read PDPTE from guest memory Architecturally, PDPTEs are cached in the PDPTRs when CR3 is reloaded. On SVM, it is not possible to implement this, but on VMX this is possible and was indeed implemented until nested SVM changed this to unconditionally read PDPTEs dynamically. This has noticable impact when running PAE guests. Fix by changing the MMU to read PDPTRs from the cache, falling back to reading from memory for the nested MMU. 
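(A simplified sketch of the indirection being introduced — struct sketch_mmu, pdptr_from_cache() and walk_pae_root() are invented names standing in for the get_pdptr member added to struct kvm_mmu: the walker asks a per-MMU callback for each PDPTE, so one walker serves both the cached-PDPTR case and the nested case that must re-read guest memory.)

typedef unsigned long long sketch_u64;

struct sketch_mmu {
	/* chosen once at MMU-init time, used blindly by the walker */
	sketch_u64 (*get_pdptr)(void *ctx, int index);
};

static sketch_u64 pdptr_from_cache(void *ctx, int index)
{
	const sketch_u64 *cached = ctx;		/* e.g. the cached pdptrs[] */

	return cached[index];
}

static sketch_u64 walk_pae_root(const struct sketch_mmu *mmu, void *ctx, int i)
{
	return mmu->get_pdptr(ctx, i);		/* cached value or guest read */
}

The nested MMU simply installs a callback that performs the guest-memory read instead, as the svm.c hunk below does.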
Signed-off-by: Avi Kivity Tested-by: Joerg Roedel Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 307e3cf..b31a341 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -265,6 +265,7 @@ struct kvm_mmu { void (*new_cr3)(struct kvm_vcpu *vcpu); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); + u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err, bool prefault); void (*inject_page_fault)(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 3377d53..544076c 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -45,13 +45,6 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) return vcpu->arch.walk_mmu->pdptrs[index]; } -static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index) -{ - load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu)); - - return mmu->pdptrs[index]; -} - static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) { ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8e8da79..f1b36cf 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2770,7 +2770,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { - pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i); + pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); if (!is_present_gpte(pdptr)) { vcpu->arch.mmu.pae_root[i] = 0; continue; @@ -3318,6 +3318,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->direct_map = true; context->set_cr3 = kvm_x86_ops->set_tdp_cr3; context->get_cr3 = get_cr3; + context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; context->nx = is_nx(vcpu); @@ -3376,6 +3377,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; vcpu->arch.walk_mmu->get_cr3 = get_cr3; + vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read; vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; return r; @@ -3386,6 +3388,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; g_context->get_cr3 = get_cr3; + g_context->get_pdptr = kvm_pdptr_read; g_context->inject_page_fault = kvm_inject_page_fault; /* diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 507e2b8..f6dd9fe 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -163,7 +163,7 @@ retry_walk: #if PTTYPE == 64 if (walker->level == PT32E_ROOT_LEVEL) { - pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3); + pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); if (!is_present_gpte(pte)) goto error; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2b24a88..f043168 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1844,6 +1844,20 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) return svm->nested.nested_cr3; } +static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 cr3 = svm->nested.nested_cr3; + u64 pdpte; + int ret; + + ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte, + offset_in_page(cr3) + index * 8, 8); + if (ret) + return 0; 
+ return pdpte; +} + static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) { @@ -1875,6 +1889,7 @@ static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; + vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; vcpu->arch.mmu.shadow_root_level = get_npt_level(); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; -- cgit v0.10.2 From cd46868c7f088baaaba8f81251c5929f6002d050 Mon Sep 17 00:00:00 2001 From: "Yang, Wei Y" Date: Tue, 9 Aug 2011 18:14:01 +0800 Subject: KVM: MMU: Fix SMEP failure during fetch This patch fix kvm-unit-tests hanging and incorrect PT_ACCESSED_MASK bit set in the case of SMEP fault. The code updated 'eperm' after the variable was checked. Signed-off-by: Yang, Wei Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index f6dd9fe..9299410 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -147,7 +147,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, gfn_t table_gfn; unsigned index, pt_access, uninitialized_var(pte_access); gpa_t pte_gpa; - bool eperm; + bool eperm, last_gpte; int offset; const int write_fault = access & PFERR_WRITE_MASK; const int user_fault = access & PFERR_USER_MASK; @@ -221,6 +221,17 @@ retry_walk: eperm = true; #endif + last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte); + if (last_gpte) { + pte_access = pt_access & + FNAME(gpte_access)(vcpu, pte, true); + /* check if the kernel is fetching from user page */ + if (unlikely(pte_access & PT_USER_MASK) && + kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) + if (fetch_fault && !user_fault) + eperm = true; + } + if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) { int ret; trace_kvm_mmu_set_accessed_bit(table_gfn, index, @@ -238,18 +249,12 @@ retry_walk: walker->ptes[walker->level - 1] = pte; - if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) { + if (last_gpte) { int lvl = walker->level; gpa_t real_gpa; gfn_t gfn; u32 ac; - /* check if the kernel is fetching from user page */ - if (unlikely(pte_access & PT_USER_MASK) && - kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) - if (fetch_fault && !user_fault) - eperm = true; - gfn = gpte_to_gfn_lvl(pte, lvl); gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; @@ -295,7 +300,6 @@ retry_walk: walker->ptes[walker->level - 1] = pte; } - pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true); walker->pt_access = pt_access; walker->pte_access = pte_access; pgprintk("%s: pte %llx pte_access %x pt_access %x\n", -- cgit v0.10.2 From e1a72ae287f3ea4fea63b6f14a662a15e8c11960 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 9 Aug 2011 14:28:35 +0300 Subject: KVM: nVMX: Document 'nested' parameter Add documentation of the new 'nested' parameter to 'Documentation/kernel-parameters.txt'. Cc: Avi Kivity Cc: Marcelo Tosatti Cc: Nadav Har'El Signed-off-by: Sasha Levin Signed-off-by: Avi Kivity diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 854ed5ca..4711a88 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1181,6 +1181,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. [KVM,Intel] Disable FlexPriority feature (TPR shadow). Default is 1 (enabled) + kvm-intel.nested= + [KVM,Intel] Enable VMX nesting (nVMX). 
+ Default is 0 (disabled) + kvm-intel.unrestricted_guest= [KVM,Intel] Disable unrestricted guest feature (virtualized real and unpaged mode) on capable -- cgit v0.10.2 From d5c1785d2f3aabe284d91bc7fc8f0abc58525dc9 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Tue, 2 Aug 2011 15:54:20 +0300 Subject: KVM: L1 TSC handling KVM assumed in several places that reading the TSC MSR returns the value for L1. This is incorrect, because when L2 is running, the correct TSC read exit emulation is to return L2's value. We therefore add a new x86_ops function, read_l1_tsc, to use in places that specifically need to read the L1 TSC, NOT the TSC of the current level of guest. Note that one change, of one line in kvm_arch_vcpu_load, is made redundant by a different patch sent by Zachary Amsden (and not yet applied): kvm_arch_vcpu_load() should not read the guest TSC, and if it didn't, of course we didn't have to change the call of kvm_get_msr() to read_l1_tsc(). [avi: moved callback to kvm_x86_ops tsc block] Signed-off-by: Nadav Har'El Acked-by: Zachary Amsdem Signed-off-by: Avi Kivity diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b31a341..6ab4241 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -630,6 +630,7 @@ struct kvm_x86_ops { void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); + u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu); void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f043168..590d1d2 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2910,6 +2910,13 @@ static int cr8_write_interception(struct vcpu_svm *svm) return 0; } +u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu) +{ + struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); + return vmcb->control.tsc_offset + + svm_scale_tsc(vcpu, native_read_tsc()); +} + static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4201,6 +4208,7 @@ static struct kvm_x86_ops svm_x86_ops = { .write_tsc_offset = svm_write_tsc_offset, .adjust_tsc_offset = svm_adjust_tsc_offset, .compute_tsc_offset = svm_compute_tsc_offset, + .read_l1_tsc = svm_read_l1_tsc, .set_tdp_cr3 = set_tdp_cr3, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 03df703..97b6454 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1748,6 +1748,21 @@ static u64 guest_read_tsc(void) } /* + * Like guest_read_tsc, but always returns L1's notion of the timestamp + * counter, even if a nested guest (L2) is currently running. + */ +u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu) +{ + u64 host_tsc, tsc_offset; + + rdtscll(host_tsc); + tsc_offset = is_guest_mode(vcpu) ? + to_vmx(vcpu)->nested.vmcs01_tsc_offset : + vmcs_read64(TSC_OFFSET); + return host_tsc + tsc_offset; +} + +/* * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ * ioctl. In this case the call-back should update internal vmx state to make * the changes effective. 
@@ -7010,6 +7025,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .write_tsc_offset = vmx_write_tsc_offset, .adjust_tsc_offset = vmx_adjust_tsc_offset, .compute_tsc_offset = vmx_compute_tsc_offset, + .read_l1_tsc = vmx_read_l1_tsc, .set_tdp_cr3 = vmx_set_cr3, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ea8f9f0..6b37f18 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1098,7 +1098,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); + tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); kernel_ns = get_kernel_ns(); this_tsc_khz = vcpu_tsc_khz(v); if (unlikely(this_tsc_khz == 0)) { @@ -2218,7 +2218,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) s64 tsc_delta; u64 tsc; - kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc); + tsc = kvm_x86_ops->read_l1_tsc(vcpu); tsc_delta = !vcpu->arch.last_guest_tsc ? 0 : tsc - vcpu->arch.last_guest_tsc; @@ -2242,7 +2242,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); - kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); + vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); } static int is_efer_nx(void) @@ -5729,7 +5729,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); - kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); + vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); -- cgit v0.10.2 From 27fc51b21cea3386a6672699631975d1097f9d39 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Tue, 2 Aug 2011 15:54:52 +0300 Subject: KVM: nVMX: Fix nested VMX TSC emulation This patch fixes two corner cases in nested (L2) handling of TSC-related issues: 1. Somewhat suprisingly, according to the Intel spec, if L1 allows WRMSR to the TSC MSR without an exit, then this should set L1's TSC value itself - not offset by vmcs12.TSC_OFFSET (like was wrongly done in the previous code). 2. Allow L1 to disable the TSC_OFFSETING control, and then correctly ignore the vmcs12.TSC_OFFSET. Signed-off-by: Nadav Har'El Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 97b6454..5e8d411 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1777,15 +1777,23 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) */ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { - vmcs_write64(TSC_OFFSET, offset); - if (is_guest_mode(vcpu)) + if (is_guest_mode(vcpu)) { /* - * We're here if L1 chose not to trap the TSC MSR. Since - * prepare_vmcs12() does not copy tsc_offset, we need to also - * set the vmcs12 field here. + * We're here if L1 chose not to trap WRMSR to TSC. According + * to the spec, this should set L1's TSC; The offset that L1 + * set for L2 remains unchanged, and still needs to be added + * to the newly set TSC to get L2's TSC. */ - get_vmcs12(vcpu)->tsc_offset = offset - - to_vmx(vcpu)->nested.vmcs01_tsc_offset; + struct vmcs12 *vmcs12; + to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset; + /* recalculate vmcs02.TSC_OFFSET: */ + vmcs12 = get_vmcs12(vcpu); + vmcs_write64(TSC_OFFSET, offset + + (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ? 
+ vmcs12->tsc_offset : 0)); + } else { + vmcs_write64(TSC_OFFSET, offset); + } } static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) @@ -6485,8 +6493,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) set_cr4_guest_host_mask(vmx); - vmcs_write64(TSC_OFFSET, - vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + vmcs_write64(TSC_OFFSET, + vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); + else + vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); if (enable_vpid) { /* @@ -6893,7 +6904,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) load_vmcs12_host_state(vcpu, vmcs12); - /* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */ + /* Update TSC_OFFSET if TSC was changed while L2 ran */ vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); /* This is needed for same reason as it was needed in prepare_vmcs02 */ -- cgit v0.10.2 From 45133ecaaec7aea447afc98cc2c24aac638bbe5c Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Tue, 2 Aug 2011 15:55:23 +0300 Subject: KVM: SVM: Fix TSC MSR read in nested SVM When the TSC MSR is read by an L2 guest (when L1 allowed this MSR to be read without exit), we need to return L2's notion of the TSC, not L1's. The current code incorrectly returned L1 TSC, because svm_get_msr() was also used in x86.c where this was assumed, but now that these places call the new svm_read_l1_tsc(), the MSR read can be fixed. Signed-off-by: Nadav Har'El Tested-by: Joerg Roedel Acked-by: Joerg Roedel Signed-off-by: Avi Kivity diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 590d1d2..8277f32 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2923,9 +2923,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) switch (ecx) { case MSR_IA32_TSC: { - struct vmcb *vmcb = get_host_vmcb(svm); - - *data = vmcb->control.tsc_offset + + *data = svm->vmcb->control.tsc_offset + svm_scale_tsc(vcpu, native_read_tsc()); break; -- cgit v0.10.2 From 58fbbf26eb01cf6d92cf18da8d14b3a4af9c4b47 Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Tue, 30 Aug 2011 13:56:17 +0300 Subject: KVM: APIC: avoid instruction emulation for EOI writes Instruction emulation for EOI writes can be skipped, since sane guest simply uses MOV instead of string operations. This is a nice improvement when guest doesn't support x2apic or hyper-V EOI support. a single VM bandwidth is observed with ~8% bandwidth improvement (7.4Gbps->8Gbps), by saving ~5% cycles from EOI emulation. Signed-off-by: Kevin Tian : Signed-off-by: Eddie Dong Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 2caf290..31f180c 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -350,6 +350,18 @@ enum vmcs_field { #define DEBUG_REG_ACCESS_REG(eq) (((eq) >> 8) & 0xf) /* 11:8, general purpose reg. 
*/ +/* + * Exit Qualifications for APIC-Access + */ +#define APIC_ACCESS_OFFSET 0xfff /* 11:0, offset within the APIC page */ +#define APIC_ACCESS_TYPE 0xf000 /* 15:12, access type */ +#define TYPE_LINEAR_APIC_INST_READ (0 << 12) +#define TYPE_LINEAR_APIC_INST_WRITE (1 << 12) +#define TYPE_LINEAR_APIC_INST_FETCH (2 << 12) +#define TYPE_LINEAR_APIC_EVENT (3 << 12) +#define TYPE_PHYSICAL_APIC_EVENT (10 << 12) +#define TYPE_PHYSICAL_APIC_INST (15 << 12) + /* segment AR */ #define SEGMENT_AR_L_MASK (1 << 13) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 57dcbd4..52645f2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -864,6 +864,15 @@ static int apic_mmio_write(struct kvm_io_device *this, return 0; } +void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (apic) + apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); +} +EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); + void kvm_free_lapic(struct kvm_vcpu *vcpu) { if (!vcpu->arch.apic) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 52c9e6b..8287243 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -26,6 +26,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); void kvm_lapic_reset(struct kvm_vcpu *vcpu); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); +void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5e8d411..47419d6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -71,6 +71,9 @@ module_param(vmm_exclusive, bool, S_IRUGO); static int __read_mostly yield_on_hlt = 1; module_param(yield_on_hlt, bool, S_IRUGO); +static int __read_mostly fasteoi = 1; +module_param(fasteoi, bool, S_IRUGO); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -4540,6 +4543,24 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) static int handle_apic_access(struct kvm_vcpu *vcpu) { + if (likely(fasteoi)) { + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + int access_type, offset; + + access_type = exit_qualification & APIC_ACCESS_TYPE; + offset = exit_qualification & APIC_ACCESS_OFFSET; + /* + * Sane guest uses MOV to write EOI, with written value + * not cared. So make a short-circuit here by avoiding + * heavy instruction emulation. + */ + if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && + (offset == APIC_EOI)) { + kvm_lapic_set_eoi(vcpu); + skip_emulated_instruction(vcpu); + return 1; + } + } return emulate_instruction(vcpu, 0) == EMULATE_DONE; } -- cgit v0.10.2 From 364426871ca33752a6e8fcfccec4d89e2eaf06f7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 29 Aug 2011 16:27:08 +0300 Subject: KVM: Restore missing powerpc API docs Commit 371fefd6 lost a doc hunk somehow, restore it. Signed-off-by: Avi Kivity diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 75cd8fb..2d510b6 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -200,6 +200,19 @@ allocation of vcpu ids. For example, if userspace wants single-threaded guest vcpus, it should make all vcpu ids be a multiple of the number of vcpus per vcore. 
+On powerpc using book3s_hv mode, the vcpus are mapped onto virtual +threads in one or more virtual CPU cores. (This is because the +hardware requires all the hardware threads in a CPU core to be in the +same partition.) The KVM_CAP_PPC_SMT capability indicates the number +of vcpus per virtual core (vcore). The vcore id is obtained by +dividing the vcpu id by the number of vcpus per vcore. The vcpus in a +given vcore will always be in the same physical core as each other +(though that might be a different physical core from time to time). +Userspace can control the threading (SMT) mode of the guest by its +allocation of vcpu ids. For example, if userspace wants +single-threaded guest vcpus, it should make all vcpu ids be a multiple +of the number of vcpus per vcore. + 4.8 KVM_GET_DIRTY_LOG (vm ioctl) Capability: basic -- cgit v0.10.2 From db507c300ed6ce6e9fc71d4e19975d5abe01a7de Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 8 Jul 2011 13:40:10 +0200 Subject: KVM: PPC: move compute_tlbie_rb to book3s common header We need the compute_tlbie_rb in _pr and _hv implementations for papr soon, so let's move it over to a common header file that both implementations can leverage. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 98da010..37dd748 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -382,6 +382,39 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) } #endif +static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, + unsigned long pte_index) +{ + unsigned long rb, va_low; + + rb = (v & ~0x7fUL) << 16; /* AVA field */ + va_low = pte_index >> 3; + if (v & HPTE_V_SECONDARY) + va_low = ~va_low; + /* xor vsid from AVA */ + if (!(v & HPTE_V_1TB_SEG)) + va_low ^= v >> 12; + else + va_low ^= v >> 24; + va_low &= 0x7ff; + if (v & HPTE_V_LARGE) { + rb |= 1; /* L field */ + if (cpu_has_feature(CPU_FTR_ARCH_206) && + (r & 0xff000)) { + /* non-16MB large page, must be 64k */ + /* (masks depend on page size) */ + rb |= 0x1000; /* page encoding in LP field */ + rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */ + rb |= (va_low & 0xfe); /* AVAL field (P7 doesn't seem to care) */ + } + } else { + /* 4kB page */ + rb |= (va_low & 0x7ff) << 12; /* remaining 11b of VA */ + } + rb |= (v >> 54) & 0x300; /* B field */ + return rb; +} + /* Magic register values loaded into r3 and r4 before the 'sc' assembly * instruction for the OSI hypercalls */ #define OSI_SC_MAGIC_R3 0x113724FA diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index fcfe6b0..bacb0cf 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -110,39 +110,6 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, return H_SUCCESS; } -static unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, - unsigned long pte_index) -{ - unsigned long rb, va_low; - - rb = (v & ~0x7fUL) << 16; /* AVA field */ - va_low = pte_index >> 3; - if (v & HPTE_V_SECONDARY) - va_low = ~va_low; - /* xor vsid from AVA */ - if (!(v & HPTE_V_1TB_SEG)) - va_low ^= v >> 12; - else - va_low ^= v >> 24; - va_low &= 0x7ff; - if (v & HPTE_V_LARGE) { - rb |= 1; /* L field */ - if (cpu_has_feature(CPU_FTR_ARCH_206) && - (r & 0xff000)) { - /* non-16MB large page, must be 64k */ - /* (masks depend on page size) */ - rb |= 0x1000; /* page encoding in LP field */ - rb |= (va_low & 0x7f) << 16; /* 7b of VA in 
AVA/LP field */ - rb |= (va_low & 0xfe); /* AVAL field (P7 doesn't seem to care) */ - } - } else { - /* 4kB page */ - rb |= (va_low & 0x7ff) << 12; /* remaining 11b of VA */ - } - rb |= (v >> 54) & 0x300; /* B field */ - return rb; -} - #define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token)) static inline int try_lock_tlbie(unsigned int *lock) -- cgit v0.10.2 From 9432ba6015371f186926cd62e2395718217a17a1 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 8 Aug 2011 16:08:55 +0200 Subject: KVM: PPC: Add papr_enabled flag When running a PAPR guest, some things change. The privilege level drops from hypervisor to supervisor, SDR1 gets treated differently and we interpret hypercalls. For bisectability sake, add the flag now, but only enable it when all the support code is there. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index cc22b28..e681302 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -389,6 +389,7 @@ struct kvm_vcpu_arch { u8 dcr_is_write; u8 osi_needed; u8 osi_enabled; + u8 papr_enabled; u8 hcall_needed; u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */ -- cgit v0.10.2 From 317a8fa304f8ba3cd8b67b0f5f38e366e8c3cf1d Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 8 Aug 2011 16:07:16 +0200 Subject: KVM: PPC: Check privilege level on SPRs We have 3 privilege levels: problem state, supervisor state and hypervisor state. Each of them can access different SPRs, so we need to check on every SPR if it's accessible in the respective mode. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 4668465..bf0ddcd 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -63,6 +63,25 @@ * function pointers, so let's just disable the define. 
*/ #undef mfsrin +enum priv_level { + PRIV_PROBLEM = 0, + PRIV_SUPER = 1, + PRIV_HYPER = 2, +}; + +static bool spr_allowed(struct kvm_vcpu *vcpu, enum priv_level level) +{ + /* PAPR VMs only access supervisor SPRs */ + if (vcpu->arch.papr_enabled && (level > PRIV_SUPER)) + return false; + + /* Limit user space to its own small SPR set */ + if ((vcpu->arch.shared->msr & MSR_PR) && level > PRIV_PROBLEM) + return false; + + return true; +} + int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance) { @@ -296,6 +315,8 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) switch (sprn) { case SPRN_SDR1: + if (!spr_allowed(vcpu, PRIV_HYPER)) + goto unprivileged; to_book3s(vcpu)->sdr1 = spr_val; break; case SPRN_DSISR: @@ -390,6 +411,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_PMC4_GEKKO: case SPRN_WPAR_GEKKO: break; +unprivileged: default: printk(KERN_INFO "KVM: invalid SPR write: %d\n", sprn); #ifndef DEBUG_SPR @@ -421,6 +443,8 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) break; } case SPRN_SDR1: + if (!spr_allowed(vcpu, PRIV_HYPER)) + goto unprivileged; kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1); break; case SPRN_DSISR: @@ -476,6 +500,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) kvmppc_set_gpr(vcpu, rt, 0); break; default: +unprivileged: printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn); #ifndef DEBUG_SPR emulated = EMULATE_FAIL; -- cgit v0.10.2 From 04fcc11bb5653c421f11f109d8b79ea5eb9fedb4 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 8 Aug 2011 15:06:55 +0200 Subject: KVM: PPC: Interpret SDR1 as HVA in PAPR mode When running a PAPR guest, the guest is not allowed to set SDR1 - instead the HTAB information is held in internal hypervisor structures. But all of our current code relies on SDR1 and walking the HTAB like on real hardware. So in order to not be too intrusive, we simply set SDR1 to the HTAB we hold in host memory. That way we can keep the HTAB in user space, but use it from kernel space to map the guest. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index c6d3e19..b871721 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -128,7 +128,13 @@ static hva_t kvmppc_mmu_book3s_64_get_pteg( dprintk("MMU: page=0x%x sdr1=0x%llx pteg=0x%llx vsid=0x%llx\n", page, vcpu_book3s->sdr1, pteg, slbe->vsid); - r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT); + /* When running a PAPR guest, SDR1 contains a HVA address instead + of a GPA */ + if (vcpu_book3s->vcpu.arch.papr_enabled) + r = pteg; + else + r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT); + if (kvm_is_error_hva(r)) return r; return r | (pteg & ~PAGE_MASK); -- cgit v0.10.2 From 77e675ad825d1106f973afd353e8af84cd8d3960 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 8 Aug 2011 16:11:36 +0200 Subject: KVM: PPC: Read out syscall instruction on trap We have a few traps where we cache the instruction that cause the trap for analysis later on. Since we now need to be able to distinguish between SC 0 and SC 1 system calls and the only way to find out which is which is by looking at the instruction, we also read out the instruction causing the system call. 
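(A hedged sketch of the decode this enables — last_insn_was_sc1() and guest_text are illustrative names, not kernel symbols: because SRR0 points past the trapping 'sc', the instruction of interest sits four bytes earlier, and the PAPR hypercall form "sc 1" is recognised by its encoding, the same 0x44000022 constant that the later PR-PAPR patch in this series checks for.)

#include <stdbool.h>
#include <stdint.h>

static bool last_insn_was_sc1(const uint32_t *guest_text, uint32_t srr0)
{
	/* back up one 4-byte PowerPC instruction, as the assembly hunk below does */
	uint32_t inst = guest_text[(srr0 - 4) / 4];

	return inst == 0x44000022;	/* "sc 1"; anything else is a normal syscall */
}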
Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index aed32e5..678b6be 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -213,11 +213,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) beq ld_last_inst cmpwi r12, BOOK3S_INTERRUPT_PROGRAM beq ld_last_inst + cmpwi r12, BOOK3S_INTERRUPT_SYSCALL + beq ld_last_prev_inst cmpwi r12, BOOK3S_INTERRUPT_ALIGNMENT beq- ld_last_inst b no_ld_last_inst +ld_last_prev_inst: + addi r3, r3, -4 + ld_last_inst: /* Save off the guest instruction we're at */ -- cgit v0.10.2 From a15bd354f083f20f257db450488db52ac27df439 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 8 Aug 2011 17:17:09 +0200 Subject: KVM: PPC: Add support for explicit HIOR setting Until now, we always set HIOR based on the PVR, but this is just wrong. Instead, we should be setting HIOR explicitly, so user space can decide what the initial HIOR value is - just like on real hardware. We keep the old PVR based way around for backwards compatibility, but once user space uses the SREGS based method, we drop the PVR logic. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index a4f6c85..a6a253e 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -149,6 +149,12 @@ struct kvm_regs { #define KVM_SREGS_E_UPDATE_DBSR (1 << 3) /* + * Book3S special bits to indicate contents in the struct by maintaining + * backwards compatibility with older structs. If adding a new field, + * please make sure to add a flag for that new field */ +#define KVM_SREGS_S_HIOR (1 << 0) + +/* * In KVM_SET_SREGS, reserved/pad fields must be left untouched from a * previous KVM_GET_REGS. * @@ -173,6 +179,8 @@ struct kvm_sregs { __u64 ibat[8]; __u64 dbat[8]; } ppc32; + __u64 flags; /* KVM_SREGS_S_ */ + __u64 hior; } s; struct { union { diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 37dd748..472437b 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -90,6 +90,8 @@ struct kvmppc_vcpu_book3s { #endif int context_id[SID_CONTEXTS]; + bool hior_sregs; /* HIOR is set by SREGS, not PVR */ + struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE]; struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG]; struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE]; diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 0c0d3f2..78dcf65 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -150,13 +150,15 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) #ifdef CONFIG_PPC_BOOK3S_64 if ((pvr >= 0x330000) && (pvr < 0x70330000)) { kvmppc_mmu_book3s_64_init(vcpu); - to_book3s(vcpu)->hior = 0xfff00000; + if (!to_book3s(vcpu)->hior_sregs) + to_book3s(vcpu)->hior = 0xfff00000; to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL; } else #endif { kvmppc_mmu_book3s_32_init(vcpu); - to_book3s(vcpu)->hior = 0; + if (!to_book3s(vcpu)->hior_sregs) + to_book3s(vcpu)->hior = 0; to_book3s(vcpu)->msr_mask = 0xffffffffULL; } @@ -770,6 +772,9 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, } } + if (sregs->u.s.flags & KVM_SREGS_S_HIOR) + sregs->u.s.hior = to_book3s(vcpu)->hior; + return 0; } @@ -806,6 +811,11 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, /* Flush the MMU after messing with the segments */ kvmppc_mmu_pte_flush(vcpu, 0, 0); + if (sregs->u.s.flags & KVM_SREGS_S_HIOR) { + 
to_book3s(vcpu)->hior_sregs = true; + to_book3s(vcpu)->hior = sregs->u.s.hior; + } + return 0; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index a107c9b..17a5c83 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -188,6 +188,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PPC_BOOKE_SREGS: #else case KVM_CAP_PPC_SEGSTATE: + case KVM_CAP_PPC_HIOR: #endif case KVM_CAP_PPC_UNSET_IRQ: case KVM_CAP_PPC_IRQ_LEVEL: diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 2069798..490b041 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -554,6 +554,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_SMT 64 #define KVM_CAP_PPC_RMA 65 #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ +#define KVM_CAP_PPC_HIOR 67 #define KVM_CAP_S390_GMAP 71 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v0.10.2 From 0254f0742998dc61fcf68a3488e2d93636031263 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 8 Aug 2011 17:21:15 +0200 Subject: KVM: PPC: Add PAPR hypercall code for PR mode When running a PAPR guest, we need to handle a few hypercalls in kernel space, most prominently the page table invalidation (to sync the shadows). So this patch adds handling for a few PAPR hypercalls to PR mode KVM. I tried to share the code with HV mode, but it ended up being a lot easier this way around, as the two differ too much in those details. Signed-off-by: Alexander Graf --- v1 -> v2: - whitespace fix diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 472437b..91d41fa 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -150,6 +150,7 @@ extern void kvmppc_load_up_altivec(void); extern void kvmppc_load_up_vsx(void); extern u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst); extern ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst); +extern int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd); static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu) { diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 08428e2..4c66d51 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -43,6 +43,7 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \ fpu.o \ book3s_paired_singles.o \ book3s_pr.o \ + book3s_pr_papr.o \ book3s_emulate.o \ book3s_interrupts.o \ book3s_mmu_hpte.o \ diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c new file mode 100644 index 0000000..b958932 --- /dev/null +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -0,0 +1,158 @@ +/* + * Copyright (C) 2011. Freescale Inc. All rights reserved. + * + * Authors: + * Alexander Graf + * Paul Mackerras + * + * Description: + * + * Hypercall handling for running PAPR guests in PR KVM on Book 3S + * processors. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include + +static unsigned long get_pteg_addr(struct kvm_vcpu *vcpu, long pte_index) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + unsigned long pteg_addr; + + pte_index <<= 4; + pte_index &= ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1) << 7 | 0x70; + pteg_addr = vcpu_book3s->sdr1 & 0xfffffffffffc0000ULL; + pteg_addr |= pte_index; + + return pteg_addr; +} + +static int kvmppc_h_pr_enter(struct kvm_vcpu *vcpu) +{ + long flags = kvmppc_get_gpr(vcpu, 4); + long pte_index = kvmppc_get_gpr(vcpu, 5); + unsigned long pteg[2 * 8]; + unsigned long pteg_addr, i, *hpte; + + pte_index &= ~7UL; + pteg_addr = get_pteg_addr(vcpu, pte_index); + + copy_from_user(pteg, (void __user *)pteg_addr, sizeof(pteg)); + hpte = pteg; + + if (likely((flags & H_EXACT) == 0)) { + pte_index &= ~7UL; + for (i = 0; ; ++i) { + if (i == 8) + return H_PTEG_FULL; + if ((*hpte & HPTE_V_VALID) == 0) + break; + hpte += 2; + } + } else { + i = kvmppc_get_gpr(vcpu, 5) & 7UL; + hpte += i * 2; + } + + hpte[0] = kvmppc_get_gpr(vcpu, 6); + hpte[1] = kvmppc_get_gpr(vcpu, 7); + copy_to_user((void __user *)pteg_addr, pteg, sizeof(pteg)); + kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + kvmppc_set_gpr(vcpu, 4, pte_index | i); + + return EMULATE_DONE; +} + +static int kvmppc_h_pr_remove(struct kvm_vcpu *vcpu) +{ + unsigned long flags= kvmppc_get_gpr(vcpu, 4); + unsigned long pte_index = kvmppc_get_gpr(vcpu, 5); + unsigned long avpn = kvmppc_get_gpr(vcpu, 6); + unsigned long v = 0, pteg, rb; + unsigned long pte[2]; + + pteg = get_pteg_addr(vcpu, pte_index); + copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + + if ((pte[0] & HPTE_V_VALID) == 0 || + ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn) || + ((flags & H_ANDCOND) && (pte[0] & avpn) != 0)) { + kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND); + return EMULATE_DONE; + } + + copy_to_user((void __user *)pteg, &v, sizeof(v)); + + rb = compute_tlbie_rb(pte[0], pte[1], pte_index); + vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false); + + kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + kvmppc_set_gpr(vcpu, 4, pte[0]); + kvmppc_set_gpr(vcpu, 5, pte[1]); + + return EMULATE_DONE; +} + +static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu) +{ + unsigned long flags = kvmppc_get_gpr(vcpu, 4); + unsigned long pte_index = kvmppc_get_gpr(vcpu, 5); + unsigned long avpn = kvmppc_get_gpr(vcpu, 6); + unsigned long rb, pteg, r, v; + unsigned long pte[2]; + + pteg = get_pteg_addr(vcpu, pte_index); + copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + + if ((pte[0] & HPTE_V_VALID) == 0 || + ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn)) { + kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND); + return EMULATE_DONE; + } + + v = pte[0]; + r = pte[1]; + r &= ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_HI | + HPTE_R_KEY_LO); + r |= (flags << 55) & HPTE_R_PP0; + r |= (flags << 48) & HPTE_R_KEY_HI; + r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); + + pte[1] = r; + + rb = compute_tlbie_rb(v, r, pte_index); + vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? 
true : false); + copy_to_user((void __user *)pteg, pte, sizeof(pte)); + + kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + + return EMULATE_DONE; +} + +int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) +{ + switch (cmd) { + case H_ENTER: + return kvmppc_h_pr_enter(vcpu); + case H_REMOVE: + return kvmppc_h_pr_remove(vcpu); + case H_PROTECT: + return kvmppc_h_pr_protect(vcpu); + case H_BULK_REMOVE: + /* We just flush all PTEs, so user space can + handle the HPT modifications */ + kvmppc_mmu_pte_flush(vcpu, 0, 0); + break; + case H_CEDE: + kvm_vcpu_block(vcpu); + vcpu->stat.halt_wakeup++; + return EMULATE_DONE; + } + + return EMULATE_FAIL; +} -- cgit v0.10.2 From aacf9aa3a7762a878c40fddef6e68262b9f9f383 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 8 Aug 2011 17:22:59 +0200 Subject: KVM: PPC: Stub emulate CFAR and PURR SPRs Recent Linux versions use the CFAR and PURR SPRs, but don't really care about their contents (yet). So for now, we can simply return 0 when the guest wants to read them. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index bf0ddcd..0c9dc62 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -473,6 +473,10 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) case SPRN_HID5: kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[5]); break; + case SPRN_CFAR: + case SPRN_PURR: + kvmppc_set_gpr(vcpu, rt, 0); + break; case SPRN_GQR0: case SPRN_GQR1: case SPRN_GQR2: -- cgit v0.10.2 From a668f2bd3f14ce7f92e119f4b5d9b50cdc59e855 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 8 Aug 2011 17:26:24 +0200 Subject: KVM: PPC: Support SC1 hypercalls for PAPR in PR mode PAPR defines hypercalls as SC1 instructions. Using these, the guest modifies page tables and does other privileged operations that it wouldn't be allowed to do in supervisor mode. This patch adds support for PR KVM to trap these instructions and route them through the same PAPR hypercall interface that we already use for HV style KVM. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 78dcf65..48558f6 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -648,7 +648,27 @@ program_interrupt: break; } case BOOK3S_INTERRUPT_SYSCALL: - if (vcpu->arch.osi_enabled && + if (vcpu->arch.papr_enabled && + (kvmppc_get_last_inst(vcpu) == 0x44000022) && + !(vcpu->arch.shared->msr & MSR_PR)) { + /* SC 1 papr hypercalls */ + ulong cmd = kvmppc_get_gpr(vcpu, 3); + int i; + + if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) { + r = RESUME_GUEST; + break; + } + + run->papr_hcall.nr = cmd; + for (i = 0; i < 9; ++i) { + ulong gpr = kvmppc_get_gpr(vcpu, 4 + i); + run->papr_hcall.args[i] = gpr; + } + run->exit_reason = KVM_EXIT_PAPR_HCALL; + vcpu->arch.hcall_needed = 1; + r = RESUME_HOST; + } else if (vcpu->arch.osi_enabled && (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) && (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) { /* MOL hypercalls */ -- cgit v0.10.2 From 930b412a005bde2ea3f05911eaaeeb10f11d79ab Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 8 Aug 2011 17:29:42 +0200 Subject: KVM: PPC: Enable the PAPR CAP for Book3S Now that Book3S PV mode can also run PAPR guests, we can add a PAPR cap and enable it for all Book3S targets. Enabling that CAP switches KVM into PAPR mode. 
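(Userspace-side sketch, assuming a VMM that already holds a vcpu file descriptor in vcpu_fd — enable_papr() is an invented helper name: once the kernel advertises KVM_CAP_PPC_PAPR, the VMM opts the vcpu into PAPR mode through the KVM_ENABLE_CAP ioctl.)

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int enable_papr(int vcpu_fd)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_PPC_PAPR;

	return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);	/* 0 on success */
}

A VMM would typically issue this once per vcpu right after creating it; if the ioctl fails, the guest simply runs without the PAPR interface.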
Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 17a5c83..13bc798 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -189,6 +189,7 @@ int kvm_dev_ioctl_check_extension(long ext) #else case KVM_CAP_PPC_SEGSTATE: case KVM_CAP_PPC_HIOR: + case KVM_CAP_PPC_PAPR: #endif case KVM_CAP_PPC_UNSET_IRQ: case KVM_CAP_PPC_IRQ_LEVEL: @@ -572,6 +573,10 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, r = 0; vcpu->arch.osi_enabled = true; break; + case KVM_CAP_PPC_PAPR: + r = 0; + vcpu->arch.papr_enabled = true; + break; default: r = -EINVAL; break; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 490b041..6884054 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -555,6 +555,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_RMA 65 #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ #define KVM_CAP_PPC_HIOR 67 +#define KVM_CAP_PPC_PAPR 68 #define KVM_CAP_S390_GMAP 71 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v0.10.2 From af8f38b3499f0d4a3c354df2435f0fb2dded250a Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Wed, 10 Aug 2011 13:57:08 +0200 Subject: KVM: PPC: Add sanity checking to vcpu_run There are multiple features in PowerPC KVM that can now be enabled depending on the user's wishes. Some of the combinations don't make sense or don't work though. So this patch adds a way to check if the executing environment would actually be able to run the guest properly. It also adds sanity checks if PVR is set (should always be true given the current code flow), if PAPR is only used with book3s_64 where it works and that HV KVM is only used in PAPR mode. Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index a6a253e..08fe69e 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -284,6 +284,11 @@ struct kvm_guest_debug_arch { #define KVM_INTERRUPT_UNSET -2U #define KVM_INTERRUPT_SET_LEVEL -3U +#define KVM_CPU_440 1 +#define KVM_CPU_E500V2 2 +#define KVM_CPU_3S_32 3 +#define KVM_CPU_3S_64 4 + /* for KVM_CAP_SPAPR_TCE */ struct kvm_create_spapr_tce { __u64 liobn; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index e681302..2b8284f 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -390,6 +390,8 @@ struct kvm_vcpu_arch { u8 osi_needed; u8 osi_enabled; u8 papr_enabled; + u8 sane; + u8 cpu_type; u8 hcall_needed; u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index d121f49..46efd1a 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -66,6 +66,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run, extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); +extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); /* Core-specific hooks */ diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index da3a122..ca1f88b 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -78,6 +78,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++) vcpu_44x->shadow_refs[i].gtlb_index = -1; + vcpu->arch.cpu_type = KVM_CPU_440; + return 0; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c 
index cc0d7f1..bf66ec7 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -510,6 +510,9 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) spin_unlock(&vcore->lock); vcpu->arch.vcore = vcore; + vcpu->arch.cpu_type = KVM_CPU_3S_64; + kvmppc_sanity_check(vcpu); + return vcpu; free_vcpu: @@ -800,6 +803,11 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) { int r; + if (!vcpu->arch.sane) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return -EINVAL; + } + do { r = kvmppc_run_vcpu(run, vcpu); diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 48558f6..6e3488b 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -153,6 +153,7 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) if (!to_book3s(vcpu)->hior_sregs) to_book3s(vcpu)->hior = 0xfff00000; to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL; + vcpu->arch.cpu_type = KVM_CPU_3S_64; } else #endif { @@ -160,8 +161,11 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) if (!to_book3s(vcpu)->hior_sregs) to_book3s(vcpu)->hior = 0; to_book3s(vcpu)->msr_mask = 0xffffffffULL; + vcpu->arch.cpu_type = KVM_CPU_3S_32; } + kvmppc_sanity_check(vcpu); + /* If we are in hypervisor level on 970, we can tell the CPU to * treat DCBZ as 32 bytes store */ vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32; @@ -938,6 +942,12 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) #endif ulong ext_msr; + /* Check if we can run the vcpu at all */ + if (!vcpu->arch.sane) { + kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return -EINVAL; + } + /* No need to go into the guest when all we do is going out */ if (signal_pending(current)) { kvm_run->exit_reason = KVM_EXIT_INTR; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index ee45fa0..bb6c988 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -316,6 +316,11 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int ret; + if (!vcpu->arch.sane) { + kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return -EINVAL; + } + local_irq_disable(); kvm_guest_enter(); ret = __kvmppc_vcpu_run(kvm_run, vcpu); @@ -618,6 +623,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { int i; + int r; vcpu->arch.pc = 0; vcpu->arch.shared->msr = 0; @@ -634,7 +640,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) kvmppc_init_timing_stats(vcpu); - return kvmppc_core_vcpu_setup(vcpu); + r = kvmppc_core_vcpu_setup(vcpu); + kvmppc_sanity_check(vcpu); + return r; } int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 797a744..26d2090 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -73,6 +73,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) /* Since booke kvm only support one core, update all vcpus' PIR to 0 */ vcpu->vcpu_id = 0; + vcpu->arch.cpu_type = KVM_CPU_E500V2; + return 0; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 13bc798..a8000ce 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -95,6 +95,31 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) return r; } +int kvmppc_sanity_check(struct kvm_vcpu *vcpu) +{ + int r = false; + + /* We have to know what CPU to virtualize */ + if (!vcpu->arch.pvr) + goto out; + + /* PAPR only works with book3s_64 */ + if ((vcpu->arch.cpu_type != KVM_CPU_3S_64) && 
vcpu->arch.papr_enabled) + goto out; + +#ifdef CONFIG_KVM_BOOK3S_64_HV + /* HV KVM can only do PAPR mode for now */ + if (!vcpu->arch.papr_enabled) + goto out; +#endif + + r = true; + +out: + vcpu->arch.sane = r; + return r ? 0 : -EINVAL; +} + int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu) { enum emulation_result er; @@ -582,6 +607,9 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, break; } + if (!r) + r = kvmppc_sanity_check(vcpu); + return r; } -- cgit v0.10.2 From 177339d7f7c99a25ecfdb6baeea6a2508fb2349f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 23 Jul 2011 17:41:11 +1000 Subject: KVM: PPC: Assemble book3s{,_hv}_rmhandlers.S separately This makes arch/powerpc/kvm/book3s_rmhandlers.S and arch/powerpc/kvm/book3s_hv_rmhandlers.S be assembled as separate compilation units rather than having them #included in arch/powerpc/kernel/exceptions-64s.S. We no longer have any conditional branches between the exception prologs in exceptions-64s.S and the KVM handlers, so there is no need to keep their contents close together in the vmlinux image. In their current location, they are using up part of the limited space between the first-level interrupt handlers and the firmware NMI data area at offset 0x7000, and with some kernel configurations this area will overflow (e.g. allyesconfig), leading to an "attempt to .org backwards" error when compiling exceptions-64s.S. Moving them out requires that we add some #includes that the book3s_{,hv_}rmhandlers.S code was previously getting implicitly via exceptions-64s.S. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 41b02c7..29ddd8b 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -427,16 +427,6 @@ slb_miss_user_pseries: b . /* prevent spec. 
execution */ #endif /* __DISABLED__ */ -/* KVM's trampoline code needs to be close to the interrupt handlers */ - -#ifdef CONFIG_KVM_BOOK3S_64_HANDLER -#ifdef CONFIG_KVM_BOOK3S_PR -#include "../kvm/book3s_rmhandlers.S" -#else -#include "../kvm/book3s_hv_rmhandlers.S" -#endif -#endif - .align 7 .globl __end_interrupts __end_interrupts: diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 4c66d51..3688aee 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -50,12 +50,15 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \ book3s_64_mmu_host.o \ book3s_64_mmu.o \ book3s_32_mmu.o +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \ + book3s_rmhandlers.o kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ book3s_hv.o \ book3s_hv_interrupts.o \ book3s_64_mmu_hv.o kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ + book3s_hv_rmhandlers.o \ book3s_hv_rm_mmu.o \ book3s_64_vio_hv.o \ book3s_hv_builtin.o diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index de29501..bc6ade9 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -20,7 +20,10 @@ #include #include #include +#include #include +#include +#include #include #include diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index c1f877c..5ee66ed 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -39,6 +40,7 @@ #define MSR_NOIRQ MSR_KERNEL & ~(MSR_IR | MSR_DR) #define FUNC(name) GLUE(.,name) + .globl kvmppc_skip_interrupt kvmppc_skip_interrupt: /* * Here all GPRs are unchanged from when the interrupt happened @@ -51,6 +53,7 @@ kvmppc_skip_interrupt: rfid b . + .globl kvmppc_skip_Hinterrupt kvmppc_skip_Hinterrupt: /* * Here all GPRs are unchanged from when the interrupt happened -- cgit v0.10.2 From 02143947603fe90237a0423d34dd8943de229f78 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 23 Jul 2011 17:41:44 +1000 Subject: KVM: PPC: book3s_pr: Simplify transitions between virtual and real mode This simplifies the way that the book3s_pr makes the transition to real mode when entering the guest. We now call kvmppc_entry_trampoline (renamed from kvmppc_rmcall) in the base kernel using a normal function call instead of doing an indirect call through a pointer in the vcpu. If kvm is a module, the module loader takes care of generating a trampoline as it does for other calls to functions outside the module. kvmppc_entry_trampoline then disables interrupts and jumps to kvmppc_handler_trampoline_enter in real mode using an rfi[d]. That then uses the link register as the address to return to (potentially in module space) when the guest exits. This also simplifies the way that we call the Linux interrupt handler when we exit the guest due to an external, decrementer or performance monitor interrupt. Instead of turning on the MMU, then deciding that we need to call the Linux handler and turning the MMU back off again, we now go straight to the handler at the point where we would turn the MMU on. The handler will then return to the virtual-mode code (potentially in the module). Along the way, this moves the setting and clearing of the HID5 DCBZ32 bit into real-mode interrupts-off code, and also makes sure that we clear the MSR[RI] bit before loading values into SRR0/1. 
The net result is that we no longer need any code addresses to be stored in vcpu->arch. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 91d41fa..a384ffd 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -141,9 +141,7 @@ extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu); extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); -extern void kvmppc_handler_lowmem_trampoline(void); -extern void kvmppc_handler_trampoline_enter(void); -extern void kvmppc_rmcall(ulong srr0, ulong srr1); +extern void kvmppc_entry_trampoline(void); extern void kvmppc_hv_entry_trampoline(void); extern void kvmppc_load_up_fpu(void); extern void kvmppc_load_up_altivec(void); diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index ef7b368..af73469 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -75,6 +75,7 @@ struct kvmppc_host_state { ulong scratch0; ulong scratch1; u8 in_guest; + u8 restore_hid5; #ifdef CONFIG_KVM_BOOK3S_64_HV struct kvm_vcpu *kvm_vcpu; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 2b8284f..dec3054 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -258,14 +258,6 @@ struct kvm_vcpu_arch { ulong host_stack; u32 host_pid; #ifdef CONFIG_PPC_BOOK3S - ulong host_msr; - ulong host_r2; - void *host_retip; - ulong trampoline_lowmem; - ulong trampoline_enter; - ulong highmem_handler; - ulong rmcall; - ulong host_paca_phys; struct kvmppc_slb slb[64]; int slb_max; /* 1 + index of last valid entry in slb[] */ int slb_nr; /* total number of entries in SLB */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 5f078bc..e069c76 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -449,8 +449,6 @@ int main(void) #ifdef CONFIG_PPC_BOOK3S DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm)); DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id)); - DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip)); - DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr)); DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr)); DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr)); DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr)); @@ -458,10 +456,6 @@ int main(void) DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor)); DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl)); DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr)); - DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem)); - DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter)); - DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler)); - DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall)); DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags)); DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec)); DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires)); @@ -537,6 +531,7 @@ int main(void) HSTATE_FIELD(HSTATE_SCRATCH0, scratch0); HSTATE_FIELD(HSTATE_SCRATCH1, scratch1); HSTATE_FIELD(HSTATE_IN_GUEST, in_guest); + HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5); #ifdef CONFIG_KVM_BOOK3S_64_HV 
HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu); diff --git a/arch/powerpc/kvm/book3s_32_sr.S b/arch/powerpc/kvm/book3s_32_sr.S index 3608471..7e06a6f 100644 --- a/arch/powerpc/kvm/book3s_32_sr.S +++ b/arch/powerpc/kvm/book3s_32_sr.S @@ -31,7 +31,7 @@ * R1 = host R1 * R2 = host R2 * R3 = shadow vcpu - * all other volatile GPRS = free + * all other volatile GPRS = free except R4, R6 * SVCPU[CR] = guest CR * SVCPU[XER] = guest XER * SVCPU[CTR] = guest CTR diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S index 04e7d3b..f2e6e48 100644 --- a/arch/powerpc/kvm/book3s_64_slb.S +++ b/arch/powerpc/kvm/book3s_64_slb.S @@ -53,7 +53,7 @@ slb_exit_skip_ ## num: * R1 = host R1 * R2 = host R2 * R3 = shadow vcpu - * all other volatile GPRS = free + * all other volatile GPRS = free except R4, R6 * SVCPU[CR] = guest CR * SVCPU[XER] = guest XER * SVCPU[CTR] = guest CTR diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c index 88c8f26..f7f63a0 100644 --- a/arch/powerpc/kvm/book3s_exports.c +++ b/arch/powerpc/kvm/book3s_exports.c @@ -23,9 +23,7 @@ #ifdef CONFIG_KVM_BOOK3S_64_HV EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline); #else -EXPORT_SYMBOL_GPL(kvmppc_handler_trampoline_enter); -EXPORT_SYMBOL_GPL(kvmppc_handler_lowmem_trampoline); -EXPORT_SYMBOL_GPL(kvmppc_rmcall); +EXPORT_SYMBOL_GPL(kvmppc_entry_trampoline); EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu); #ifdef CONFIG_ALTIVEC EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec); diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S index c54b0e3..0a8515a 100644 --- a/arch/powerpc/kvm/book3s_interrupts.S +++ b/arch/powerpc/kvm/book3s_interrupts.S @@ -29,27 +29,11 @@ #define ULONG_SIZE 8 #define FUNC(name) GLUE(.,name) -#define GET_SHADOW_VCPU_R13 - -#define DISABLE_INTERRUPTS \ - mfmsr r0; \ - rldicl r0,r0,48,1; \ - rotldi r0,r0,16; \ - mtmsrd r0,1; \ - #elif defined(CONFIG_PPC_BOOK3S_32) #define ULONG_SIZE 4 #define FUNC(name) name -#define GET_SHADOW_VCPU_R13 \ - lwz r13, (THREAD + THREAD_KVM_SVCPU)(r2) - -#define DISABLE_INTERRUPTS \ - mfmsr r0; \ - rlwinm r0,r0,0,17,15; \ - mtmsr r0; \ - #endif /* CONFIG_PPC_BOOK3S_XX */ @@ -108,44 +92,17 @@ kvm_start_entry: kvm_start_lightweight: - GET_SHADOW_VCPU_R13 - PPC_LL r3, VCPU_HIGHMEM_HANDLER(r4) - PPC_STL r3, HSTATE_VMHANDLER(r13) - - PPC_LL r10, VCPU_SHADOW_MSR(r4) /* r10 = vcpu->arch.shadow_msr */ - - DISABLE_INTERRUPTS - #ifdef CONFIG_PPC_BOOK3S_64 - /* Some guests may need to have dcbz set to 32 byte length. - * - * Usually we ensure that by patching the guest's instructions - * to trap on dcbz and emulate it in the hypervisor. - * - * If we can, we should tell the CPU to use 32 byte dcbz though, - * because that's a lot faster. - */ - PPC_LL r3, VCPU_HFLAGS(r4) - rldicl. r3, r3, 0, 63 /* CR = ((r3 & 1) == 0) */ - beq no_dcbz32_on - - mfspr r3,SPRN_HID5 - ori r3, r3, 0x80 /* XXX HID5_dcbz32 = 0x80 */ - mtspr SPRN_HID5,r3 - -no_dcbz32_on: - + rldicl r3, r3, 0, 63 /* r3 &= 1 */ + stb r3, HSTATE_RESTORE_HID5(r13) #endif /* CONFIG_PPC_BOOK3S_64 */ - PPC_LL r6, VCPU_RMCALL(r4) - mtctr r6 - - PPC_LL r3, VCPU_TRAMPOLINE_ENTER(r4) - LOAD_REG_IMMEDIATE(r4, MSR_KERNEL & ~(MSR_IR | MSR_DR)) + PPC_LL r4, VCPU_SHADOW_MSR(r4) /* get shadow_msr */ /* Jump to segment patching handler and into our guest */ - bctr + bl FUNC(kvmppc_entry_trampoline) + nop /* * This is the handler in module memory. 
It gets jumped at from the @@ -170,21 +127,6 @@ kvmppc_handler_highmem: /* R7 = vcpu */ PPC_LL r7, GPR4(r1) -#ifdef CONFIG_PPC_BOOK3S_64 - - PPC_LL r5, VCPU_HFLAGS(r7) - rldicl. r5, r5, 0, 63 /* CR = ((r5 & 1) == 0) */ - beq no_dcbz32_off - - li r4, 0 - mfspr r5,SPRN_HID5 - rldimi r5,r4,6,56 - mtspr SPRN_HID5,r5 - -no_dcbz32_off: - -#endif /* CONFIG_PPC_BOOK3S_64 */ - PPC_STL r14, VCPU_GPR(r14)(r7) PPC_STL r15, VCPU_GPR(r15)(r7) PPC_STL r16, VCPU_GPR(r16)(r7) @@ -204,67 +146,6 @@ no_dcbz32_off: PPC_STL r30, VCPU_GPR(r30)(r7) PPC_STL r31, VCPU_GPR(r31)(r7) - /* Restore host msr -> SRR1 */ - PPC_LL r6, VCPU_HOST_MSR(r7) - - /* - * For some interrupts, we need to call the real Linux - * handler, so it can do work for us. This has to happen - * as if the interrupt arrived from the kernel though, - * so let's fake it here where most state is restored. - * - * Call Linux for hardware interrupts/decrementer - * r3 = address of interrupt handler (exit reason) - */ - - cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL - beq call_linux_handler - cmpwi r12, BOOK3S_INTERRUPT_DECREMENTER - beq call_linux_handler - cmpwi r12, BOOK3S_INTERRUPT_PERFMON - beq call_linux_handler - - /* Back to EE=1 */ - mtmsr r6 - sync - b kvm_return_point - -call_linux_handler: - - /* - * If we land here we need to jump back to the handler we - * came from. - * - * We have a page that we can access from real mode, so let's - * jump back to that and use it as a trampoline to get back into the - * interrupt handler! - * - * R3 still contains the exit code, - * R5 VCPU_HOST_RETIP and - * R6 VCPU_HOST_MSR - */ - - /* Restore host IP -> SRR0 */ - PPC_LL r5, VCPU_HOST_RETIP(r7) - - /* XXX Better move to a safe function? - * What if we get an HTAB flush in between mtsrr0 and mtsrr1? */ - - mtlr r12 - - PPC_LL r4, VCPU_TRAMPOLINE_LOWMEM(r7) - mtsrr0 r4 - LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR)) - mtsrr1 r3 - - RFI - -.global kvm_return_point -kvm_return_point: - - /* Jump back to lightweight entry if we're supposed to */ - /* go back into the guest */ - /* Pass the exit number as 3rd argument to kvmppc_handle_exit */ mr r5, r12 diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 6e3488b..d417511 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -875,8 +875,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) if (!p) goto uninit_vcpu; - vcpu->arch.host_retip = kvm_return_point; - vcpu->arch.host_msr = mfmsr(); #ifdef CONFIG_PPC_BOOK3S_64 /* default to book3s_64 (970fx) */ vcpu->arch.pvr = 0x3C0301; @@ -887,16 +885,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) kvmppc_set_pvr(vcpu, vcpu->arch.pvr); vcpu->arch.slb_nr = 64; - /* remember where some real-mode handlers are */ - vcpu->arch.trampoline_lowmem = __pa(kvmppc_handler_lowmem_trampoline); - vcpu->arch.trampoline_enter = __pa(kvmppc_handler_trampoline_enter); - vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem; -#ifdef CONFIG_PPC_BOOK3S_64 - vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall; -#else - vcpu->arch.rmcall = (ulong)kvmppc_rmcall; -#endif - vcpu->arch.shadow_msr = MSR_USER64; err = kvmppc_mmu_init(vcpu); diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index 5ee66ed..3418758 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -36,9 +36,8 @@ #if defined(CONFIG_PPC_BOOK3S_64) -#define LOAD_SHADOW_VCPU(reg) GET_PACA(reg) -#define MSR_NOIRQ MSR_KERNEL & ~(MSR_IR | 
MSR_DR) #define FUNC(name) GLUE(.,name) +#define MTMSR_EERI(reg) mtmsrd (reg),1 .globl kvmppc_skip_interrupt kvmppc_skip_interrupt: @@ -68,8 +67,8 @@ kvmppc_skip_Hinterrupt: #elif defined(CONFIG_PPC_BOOK3S_32) -#define MSR_NOIRQ MSR_KERNEL #define FUNC(name) name +#define MTMSR_EERI(reg) mtmsr (reg) .macro INTERRUPT_TRAMPOLINE intno @@ -170,40 +169,24 @@ kvmppc_handler_skip_ins: #endif /* - * This trampoline brings us back to a real mode handler - * - * Input Registers: - * - * R5 = SRR0 - * R6 = SRR1 - * LR = real-mode IP + * Call kvmppc_handler_trampoline_enter in real mode * + * On entry, r4 contains the guest shadow MSR */ -.global kvmppc_handler_lowmem_trampoline -kvmppc_handler_lowmem_trampoline: - - mtsrr0 r5 +_GLOBAL(kvmppc_entry_trampoline) + mfmsr r5 + LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter) + toreal(r7) + + li r9, MSR_RI + ori r9, r9, MSR_EE + andc r9, r5, r9 /* Clear EE and RI in MSR value */ + li r6, MSR_IR | MSR_DR + ori r6, r6, MSR_EE + andc r6, r5, r6 /* Clear EE, DR and IR in MSR value */ + MTMSR_EERI(r9) /* Clear EE and RI in MSR */ + mtsrr0 r7 /* before we set srr0/1 */ mtsrr1 r6 - blr -kvmppc_handler_lowmem_trampoline_end: - -/* - * Call a function in real mode - * - * Input Registers: - * - * R3 = function - * R4 = MSR - * R5 = scratch register - * - */ -_GLOBAL(kvmppc_rmcall) - LOAD_REG_IMMEDIATE(r5, MSR_NOIRQ) - mtmsr r5 /* Disable relocation and interrupts, so mtsrr - doesn't get interrupted */ - sync - mtsrr0 r3 - mtsrr1 r4 RFI #if defined(CONFIG_PPC_BOOK3S_32) diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index 678b6be..0676ae2 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -23,6 +23,7 @@ #define GET_SHADOW_VCPU(reg) \ mr reg, r13 +#define MTMSR_EERI(reg) mtmsrd (reg),1 #elif defined(CONFIG_PPC_BOOK3S_32) @@ -30,6 +31,7 @@ tophys(reg, r2); \ lwz reg, (THREAD + THREAD_KVM_SVCPU)(reg); \ tophys(reg, reg) +#define MTMSR_EERI(reg) mtmsr (reg) #endif @@ -57,10 +59,12 @@ kvmppc_handler_trampoline_enter: /* Required state: * * MSR = ~IR|DR - * R13 = PACA * R1 = host R1 * R2 = host R2 - * R10 = guest MSR + * R4 = guest shadow MSR + * R5 = normal host MSR + * R6 = current host MSR (EE, IR, DR off) + * LR = highmem guest exit code * all other volatile GPRS = free * SVCPU[CR] = guest CR * SVCPU[XER] = guest XER @@ -71,15 +75,15 @@ kvmppc_handler_trampoline_enter: /* r3 = shadow vcpu */ GET_SHADOW_VCPU(r3) + /* Save guest exit handler address and MSR */ + mflr r0 + PPC_STL r0, HSTATE_VMHANDLER(r3) + PPC_STL r5, HSTATE_HOST_MSR(r3) + /* Save R1/R2 in the PACA (64-bit) or shadow_vcpu (32-bit) */ PPC_STL r1, HSTATE_HOST_R1(r3) PPC_STL r2, HSTATE_HOST_R2(r3) - /* Move SRR0 and SRR1 into the respective regs */ - PPC_LL r9, SVCPU_PC(r3) - mtsrr0 r9 - mtsrr1 r10 - /* Activate guest mode, so faults get handled by KVM */ li r11, KVM_GUEST_MODE_GUEST stb r11, HSTATE_IN_GUEST(r3) @@ -87,17 +91,46 @@ kvmppc_handler_trampoline_enter: /* Switch to guest segment. This is subarch specific. */ LOAD_GUEST_SEGMENTS +#ifdef CONFIG_PPC_BOOK3S_64 + /* Some guests may need to have dcbz set to 32 byte length. + * + * Usually we ensure that by patching the guest's instructions + * to trap on dcbz and emulate it in the hypervisor. + * + * If we can, we should tell the CPU to use 32 byte dcbz though, + * because that's a lot faster. 
+ */ + lbz r0, HSTATE_RESTORE_HID5(r3) + cmpwi r0, 0 + beq no_dcbz32_on + + mfspr r0,SPRN_HID5 + ori r0, r0, 0x80 /* XXX HID5_dcbz32 = 0x80 */ + mtspr SPRN_HID5,r0 +no_dcbz32_on: + +#endif /* CONFIG_PPC_BOOK3S_64 */ + /* Enter guest */ - PPC_LL r4, SVCPU_CTR(r3) - PPC_LL r5, SVCPU_LR(r3) - lwz r6, SVCPU_CR(r3) - lwz r7, SVCPU_XER(r3) + PPC_LL r8, SVCPU_CTR(r3) + PPC_LL r9, SVCPU_LR(r3) + lwz r10, SVCPU_CR(r3) + lwz r11, SVCPU_XER(r3) + + mtctr r8 + mtlr r9 + mtcr r10 + mtxer r11 - mtctr r4 - mtlr r5 - mtcr r6 - mtxer r7 + /* Move SRR0 and SRR1 into the respective regs */ + PPC_LL r9, SVCPU_PC(r3) + /* First clear RI in our current MSR value */ + li r0, MSR_RI + andc r6, r6, r0 + MTMSR_EERI(r6) + mtsrr0 r9 + mtsrr1 r4 PPC_LL r0, SVCPU_R0(r3) PPC_LL r1, SVCPU_R1(r3) @@ -259,6 +292,43 @@ no_ld_last_inst: /* Switch back to host MMU */ LOAD_HOST_SEGMENTS +#ifdef CONFIG_PPC_BOOK3S_64 + + lbz r5, HSTATE_RESTORE_HID5(r13) + cmpwi r5, 0 + beq no_dcbz32_off + + li r4, 0 + mfspr r5,SPRN_HID5 + rldimi r5,r4,6,56 + mtspr SPRN_HID5,r5 + +no_dcbz32_off: + +#endif /* CONFIG_PPC_BOOK3S_64 */ + + /* + * For some interrupts, we need to call the real Linux + * handler, so it can do work for us. This has to happen + * as if the interrupt arrived from the kernel though, + * so let's fake it here where most state is restored. + * + * Having set up SRR0/1 with the address where we want + * to continue with relocation on (potentially in module + * space), we either just go straight there with rfi[d], + * or we jump to an interrupt handler with bctr if there + * is an interrupt to be handled first. In the latter + * case, the rfi[d] at the end of the interrupt handler + * will get us back to where we want to continue. + */ + + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL + beq 1f + cmpwi r12, BOOK3S_INTERRUPT_DECREMENTER + beq 1f + cmpwi r12, BOOK3S_INTERRUPT_PERFMON +1: mtctr r12 + /* Register usage at this point: * * R1 = host R1 @@ -269,13 +339,15 @@ no_ld_last_inst: * */ - /* RFI into the highmem handler */ - mfmsr r7 - ori r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME /* Enable paging */ - mtsrr1 r7 - /* Load highmem handler address */ + PPC_LL r6, HSTATE_HOST_MSR(r13) PPC_LL r8, HSTATE_VMHANDLER(r13) + + /* Restore host msr -> SRR1 */ + mtsrr1 r6 + /* Load highmem handler address */ mtsrr0 r8 + /* RFI into the highmem handler, or jump to interrupt handler */ + beqctr RFI kvmppc_handler_trampoline_exit_end: -- cgit v0.10.2 From 19ccb76a1938ab364a412253daec64613acbf3df Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 23 Jul 2011 17:42:46 +1000 Subject: KVM: PPC: Implement H_CEDE hcall for book3s_hv in real-mode code With a KVM guest operating in SMT4 mode (i.e. 4 hardware threads per core), whenever a CPU goes idle, we have to pull all the other hardware threads in the core out of the guest, because the H_CEDE hcall is handled in the kernel. This is inefficient. This adds code to book3s_hv_rmhandlers.S to handle the H_CEDE hcall in real mode. When a guest vcpu does an H_CEDE hcall, we now only exit to the kernel if all the other vcpus in the same core are also idle. Otherwise we mark this vcpu as napping, save state that could be lost in nap mode (mainly GPRs and FPRs), and execute the nap instruction. When the thread wakes up, because of a decrementer or external interrupt, we come back in at kvm_start_guest (from the system reset interrupt vector), find the `napping' flag set in the paca, and go to the resume path. This has some other ramifications. 
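(For reference, the decision the new real-mode H_CEDE handler makes can be sketched roughly in C. The helper below is illustrative only, not part of the patch; it uses the vcore fields and the VCORE_ENTRY_COUNT() macro introduced here, with the lwarx/stwcx. atomics, the 970 fallback and the state saving elided.)

static int h_cede_decision(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
{
	u32 me = 1u << vcpu->arch.ptid;

	vcpu->arch.ceded = 1;
	smp_mb();			/* order setting ceded vs. testing prodded */
	if (vcpu->arch.prodded) {
		/* Someone already prodded us: cancel the cede and keep running. */
		vcpu->arch.prodded = 0;
		vcpu->arch.ceded = 0;
		return H_SUCCESS;
	}

	/*
	 * If marking ourselves napping would leave no thread that entered
	 * the guest still running, hand the hcall up to the host so the
	 * whole core exits; otherwise record that we are napping, save the
	 * non-volatile GPRs and FP state, and execute the nap instruction.
	 */
	if (hweight32(vc->napping_threads | me) >= VCORE_ENTRY_COUNT(vc))
		return H_TOO_HARD;

	vc->napping_threads |= me;	/* done with lwarx/stwcx. in the real code */
	/* ... save state, nap, resume at kvm_end_cede when woken ... */
	return H_SUCCESS;
}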
First, when starting a core, we now start all the threads, both those that are immediately runnable and those that are idle. This is so that we don't have to pull all the threads out of the guest when an idle thread gets a decrementer interrupt and wants to start running. In fact the idle threads will all start with the H_CEDE hcall returning; being idle they will just do another H_CEDE immediately and go to nap mode. This required some changes to kvmppc_run_core() and kvmppc_run_vcpu(). These functions have been restructured to make them simpler and clearer. We introduce a level of indirection in the wait queue that gets woken when external and decrementer interrupts get generated for a vcpu, so that we can have the 4 vcpus in a vcore using the same wait queue. We need this because the 4 vcpus are being handled by one thread. Secondly, when we need to exit from the guest to the kernel, we now have to generate an IPI for any napping threads, because an HDEC interrupt doesn't wake up a napping thread. Thirdly, we now need to be able to handle virtual external interrupts and decrementer interrupts becoming pending while a thread is napping, and deliver those interrupts to the guest when the thread wakes. This is done in kvmppc_cede_reentry, just before fast_guest_return. Finally, since we are not using the generic kvm_vcpu_block for book3s_hv, and hence not calling kvm_arch_vcpu_runnable, we can remove the #ifdef from kvm_arch_vcpu_runnable. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index af73469..1f2f5b6 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -76,6 +76,7 @@ struct kvmppc_host_state { ulong scratch1; u8 in_guest; u8 restore_hid5; + u8 napping; #ifdef CONFIG_KVM_BOOK3S_64_HV struct kvm_vcpu *kvm_vcpu; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index dec3054..bf8af5d 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -198,21 +198,29 @@ struct kvm_arch { */ struct kvmppc_vcore { int n_runnable; - int n_blocked; + int n_busy; int num_threads; int entry_exit_count; int n_woken; int nap_count; + int napping_threads; u16 pcpu; - u8 vcore_running; + u8 vcore_state; u8 in_guest; struct list_head runnable_threads; spinlock_t lock; + wait_queue_head_t wq; }; #define VCORE_ENTRY_COUNT(vc) ((vc)->entry_exit_count & 0xff) #define VCORE_EXIT_COUNT(vc) ((vc)->entry_exit_count >> 8) +/* Values for vcore_state */ +#define VCORE_INACTIVE 0 +#define VCORE_RUNNING 1 +#define VCORE_EXITING 2 +#define VCORE_SLEEPING 3 + struct kvmppc_pte { ulong eaddr; u64 vpage; @@ -403,11 +411,13 @@ struct kvm_vcpu_arch { struct dtl *dtl; struct dtl *dtl_end; + wait_queue_head_t *wqp; struct kvmppc_vcore *vcore; int ret; int trap; int state; int ptid; + bool timer_running; wait_queue_head_t cpu_run; struct kvm_vcpu_arch_shared *shared; @@ -423,8 +433,9 @@ struct kvm_vcpu_arch { #endif }; -#define KVMPPC_VCPU_BUSY_IN_HOST 0 -#define KVMPPC_VCPU_BLOCKED 1 +/* Values for vcpu->arch.state */ +#define KVMPPC_VCPU_STOPPED 0 +#define KVMPPC_VCPU_BUSY_IN_HOST 1 #define KVMPPC_VCPU_RUNNABLE 2 #endif /* __POWERPC_KVM_HOST_H__ */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index e069c76..69f7ffe 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -44,6 +44,7 @@ #include #include #include +#include 
#endif #ifdef CONFIG_PPC_ISERIES #include @@ -460,6 +461,8 @@ int main(void) DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec)); DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires)); DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions)); + DEFINE(VCPU_CEDED, offsetof(struct kvm_vcpu, arch.ceded)); + DEFINE(VCPU_PRODDED, offsetof(struct kvm_vcpu, arch.prodded)); DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa)); DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr)); DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc)); @@ -475,6 +478,7 @@ int main(void) DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count)); DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count)); DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest)); + DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads)); DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) - offsetof(struct kvmppc_vcpu_book3s, vcpu)); DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige)); @@ -532,6 +536,7 @@ int main(void) HSTATE_FIELD(HSTATE_SCRATCH1, scratch1); HSTATE_FIELD(HSTATE_IN_GUEST, in_guest); HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5); + HSTATE_FIELD(HSTATE_NAPPING, napping); #ifdef CONFIG_KVM_BOOK3S_64_HV HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu); @@ -544,6 +549,7 @@ int main(void) HSTATE_FIELD(HSTATE_DSCR, host_dscr); HSTATE_FIELD(HSTATE_DABR, dabr); HSTATE_FIELD(HSTATE_DECEXP, dec_expires); + DEFINE(IPI_PRIORITY, IPI_PRIORITY); #endif /* CONFIG_KVM_BOOK3S_64_HV */ #else /* CONFIG_PPC_BOOK3S */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index bf66ec7..4644c79 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -62,6 +62,8 @@ /* #define EXIT_DEBUG_SIMPLE */ /* #define EXIT_DEBUG_INT */ +static void kvmppc_end_cede(struct kvm_vcpu *vcpu); + void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { local_paca->kvm_hstate.kvm_vcpu = vcpu; @@ -72,40 +74,10 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) { } -static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu); -static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu); - -void kvmppc_vcpu_block(struct kvm_vcpu *vcpu) -{ - u64 now; - unsigned long dec_nsec; - - now = get_tb(); - if (now >= vcpu->arch.dec_expires && !kvmppc_core_pending_dec(vcpu)) - kvmppc_core_queue_dec(vcpu); - if (vcpu->arch.pending_exceptions) - return; - if (vcpu->arch.dec_expires != ~(u64)0) { - dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC / - tb_ticks_per_sec; - hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec), - HRTIMER_MODE_REL); - } - - kvmppc_vcpu_blocked(vcpu); - - kvm_vcpu_block(vcpu); - vcpu->stat.halt_wakeup++; - - if (vcpu->arch.dec_expires != ~(u64)0) - hrtimer_try_to_cancel(&vcpu->arch.dec_timer); - - kvmppc_vcpu_unblocked(vcpu); -} - void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) { vcpu->arch.shregs.msr = msr; + kvmppc_end_cede(vcpu); } void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) @@ -257,15 +229,6 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) switch (req) { case H_CEDE: - vcpu->arch.shregs.msr |= MSR_EE; - vcpu->arch.ceded = 1; - smp_mb(); - if (!vcpu->arch.prodded) - kvmppc_vcpu_block(vcpu); - else - vcpu->arch.prodded = 0; - smp_mb(); - vcpu->arch.ceded = 0; break; case H_PROD: target = kvmppc_get_gpr(vcpu, 4); @@ -388,20 +351,6 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } - - if (!(r & RESUME_HOST)) { - /* To avoid clobbering 
exit_reason, only check for signals if - * we aren't already exiting to userspace for some other - * reason. */ - if (signal_pending(tsk)) { - vcpu->stat.signal_exits++; - run->exit_reason = KVM_EXIT_INTR; - r = -EINTR; - } else { - kvmppc_core_deliver_interrupts(vcpu); - } - } - return r; } @@ -479,13 +428,9 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) kvmppc_mmu_book3s_hv_init(vcpu); /* - * Some vcpus may start out in stopped state. If we initialize - * them to busy-in-host state they will stop other vcpus in the - * vcore from running. Instead we initialize them to blocked - * state, effectively considering them to be stopped until we - * see the first run ioctl for them. + * We consider the vcpu stopped until we see the first run ioctl for it. */ - vcpu->arch.state = KVMPPC_VCPU_BLOCKED; + vcpu->arch.state = KVMPPC_VCPU_STOPPED; init_waitqueue_head(&vcpu->arch.cpu_run); @@ -496,6 +441,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) if (vcore) { INIT_LIST_HEAD(&vcore->runnable_threads); spin_lock_init(&vcore->lock); + init_waitqueue_head(&vcore->wq); } kvm->arch.vcores[core] = vcore; } @@ -506,7 +452,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) spin_lock(&vcore->lock); ++vcore->num_threads; - ++vcore->n_blocked; spin_unlock(&vcore->lock); vcpu->arch.vcore = vcore; @@ -527,30 +472,31 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) kfree(vcpu); } -static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu) +static void kvmppc_set_timer(struct kvm_vcpu *vcpu) { - struct kvmppc_vcore *vc = vcpu->arch.vcore; + unsigned long dec_nsec, now; - spin_lock(&vc->lock); - vcpu->arch.state = KVMPPC_VCPU_BLOCKED; - ++vc->n_blocked; - if (vc->n_runnable > 0 && - vc->n_runnable + vc->n_blocked == vc->num_threads) { - vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu, - arch.run_list); - wake_up(&vcpu->arch.cpu_run); + now = get_tb(); + if (now > vcpu->arch.dec_expires) { + /* decrementer has already gone negative */ + kvmppc_core_queue_dec(vcpu); + kvmppc_core_deliver_interrupts(vcpu); + return; } - spin_unlock(&vc->lock); + dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC + / tb_ticks_per_sec; + hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec), + HRTIMER_MODE_REL); + vcpu->arch.timer_running = 1; } -static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu) +static void kvmppc_end_cede(struct kvm_vcpu *vcpu) { - struct kvmppc_vcore *vc = vcpu->arch.vcore; - - spin_lock(&vc->lock); - vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; - --vc->n_blocked; - spin_unlock(&vc->lock); + vcpu->arch.ceded = 0; + if (vcpu->arch.timer_running) { + hrtimer_try_to_cancel(&vcpu->arch.dec_timer); + vcpu->arch.timer_running = 0; + } } extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); @@ -565,6 +511,7 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, return; vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; --vc->n_runnable; + ++vc->n_busy; /* decrement the physical thread id of each following vcpu */ v = vcpu; list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list) @@ -578,15 +525,20 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu) struct paca_struct *tpaca; struct kvmppc_vcore *vc = vcpu->arch.vcore; + if (vcpu->arch.timer_running) { + hrtimer_try_to_cancel(&vcpu->arch.dec_timer); + vcpu->arch.timer_running = 0; + } cpu = vc->pcpu + vcpu->arch.ptid; tpaca = &paca[cpu]; tpaca->kvm_hstate.kvm_vcpu = vcpu; tpaca->kvm_hstate.kvm_vcore = 
vc; + tpaca->kvm_hstate.napping = 0; + vcpu->cpu = vc->pcpu; smp_wmb(); #ifdef CONFIG_PPC_ICP_NATIVE if (vcpu->arch.ptid) { tpaca->cpu_start = 0x80; - tpaca->kvm_hstate.in_guest = KVM_GUEST_MODE_GUEST; wmb(); xics_wake_cpu(cpu); ++vc->n_woken; @@ -634,9 +586,10 @@ static int on_primary_thread(void) */ static int kvmppc_run_core(struct kvmppc_vcore *vc) { - struct kvm_vcpu *vcpu, *vnext; + struct kvm_vcpu *vcpu, *vcpu0, *vnext; long ret; u64 now; + int ptid; /* don't start if any threads have a signal pending */ list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) @@ -655,29 +608,50 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) goto out; } + /* + * Assign physical thread IDs, first to non-ceded vcpus + * and then to ceded ones. + */ + ptid = 0; + vcpu0 = NULL; + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { + if (!vcpu->arch.ceded) { + if (!ptid) + vcpu0 = vcpu; + vcpu->arch.ptid = ptid++; + } + } + if (!vcpu0) + return 0; /* nothing to run */ + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + if (vcpu->arch.ceded) + vcpu->arch.ptid = ptid++; + vc->n_woken = 0; vc->nap_count = 0; vc->entry_exit_count = 0; - vc->vcore_running = 1; + vc->vcore_state = VCORE_RUNNING; vc->in_guest = 0; vc->pcpu = smp_processor_id(); + vc->napping_threads = 0; list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) kvmppc_start_thread(vcpu); - vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu, - arch.run_list); + preempt_disable(); spin_unlock(&vc->lock); - preempt_disable(); kvm_guest_enter(); - __kvmppc_vcore_entry(NULL, vcpu); + __kvmppc_vcore_entry(NULL, vcpu0); - /* wait for secondary threads to finish writing their state to memory */ spin_lock(&vc->lock); + /* disable sending of IPIs on virtual external irqs */ + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + vcpu->cpu = -1; + /* wait for secondary threads to finish writing their state to memory */ if (vc->nap_count < vc->n_woken) kvmppc_wait_for_nap(vc); /* prevent other vcpu threads from doing kvmppc_start_thread() now */ - vc->vcore_running = 2; + vc->vcore_state = VCORE_EXITING; spin_unlock(&vc->lock); /* make sure updates to secondary vcpu structs are visible now */ @@ -693,22 +667,26 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) if (now < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu)) kvmppc_core_dequeue_dec(vcpu); - if (!vcpu->arch.trap) { - if (signal_pending(vcpu->arch.run_task)) { - vcpu->arch.kvm_run->exit_reason = KVM_EXIT_INTR; - vcpu->arch.ret = -EINTR; - } - continue; /* didn't get to run */ - } - ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu, - vcpu->arch.run_task); + + ret = RESUME_GUEST; + if (vcpu->arch.trap) + ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu, + vcpu->arch.run_task); + vcpu->arch.ret = ret; vcpu->arch.trap = 0; + + if (vcpu->arch.ceded) { + if (ret != RESUME_GUEST) + kvmppc_end_cede(vcpu); + else + kvmppc_set_timer(vcpu); + } } spin_lock(&vc->lock); out: - vc->vcore_running = 0; + vc->vcore_state = VCORE_INACTIVE; list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, arch.run_list) { if (vcpu->arch.ret != RESUME_GUEST) { @@ -720,82 +698,130 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) return 1; } -static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +/* + * Wait for some other vcpu thread to execute us, and + * wake us up when we need to handle something in the host. 
+ */ +static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state) { - int ptid; - int wait_state; - struct kvmppc_vcore *vc; DEFINE_WAIT(wait); - /* No need to go into the guest when all we do is going out */ - if (signal_pending(current)) { - kvm_run->exit_reason = KVM_EXIT_INTR; - return -EINTR; + prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state); + if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) + schedule(); + finish_wait(&vcpu->arch.cpu_run, &wait); +} + +/* + * All the vcpus in this vcore are idle, so wait for a decrementer + * or external interrupt to one of the vcpus. vc->lock is held. + */ +static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) +{ + DEFINE_WAIT(wait); + struct kvm_vcpu *v; + int all_idle = 1; + + prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE); + vc->vcore_state = VCORE_SLEEPING; + spin_unlock(&vc->lock); + list_for_each_entry(v, &vc->runnable_threads, arch.run_list) { + if (!v->arch.ceded || v->arch.pending_exceptions) { + all_idle = 0; + break; + } } + if (all_idle) + schedule(); + finish_wait(&vc->wq, &wait); + spin_lock(&vc->lock); + vc->vcore_state = VCORE_INACTIVE; +} - /* On PPC970, check that we have an RMA region */ - if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201)) - return -EPERM; +static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ + int n_ceded; + int prev_state; + struct kvmppc_vcore *vc; + struct kvm_vcpu *v, *vn; kvm_run->exit_reason = 0; vcpu->arch.ret = RESUME_GUEST; vcpu->arch.trap = 0; - flush_fp_to_thread(current); - flush_altivec_to_thread(current); - flush_vsx_to_thread(current); - /* * Synchronize with other threads in this virtual core */ vc = vcpu->arch.vcore; spin_lock(&vc->lock); - /* This happens the first time this is called for a vcpu */ - if (vcpu->arch.state == KVMPPC_VCPU_BLOCKED) - --vc->n_blocked; - vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; - ptid = vc->n_runnable; + vcpu->arch.ceded = 0; vcpu->arch.run_task = current; vcpu->arch.kvm_run = kvm_run; - vcpu->arch.ptid = ptid; + prev_state = vcpu->arch.state; + vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads); ++vc->n_runnable; - wait_state = TASK_INTERRUPTIBLE; - while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { - if (signal_pending(current)) { - if (!vc->vcore_running) { - kvm_run->exit_reason = KVM_EXIT_INTR; - vcpu->arch.ret = -EINTR; - break; - } - /* have to wait for vcore to stop executing guest */ - wait_state = TASK_UNINTERRUPTIBLE; - smp_send_reschedule(vc->pcpu); + /* + * This happens the first time this is called for a vcpu. + * If the vcore is already running, we may be able to start + * this thread straight away and have it join in. 
+ */ + if (prev_state == KVMPPC_VCPU_STOPPED) { + if (vc->vcore_state == VCORE_RUNNING && + VCORE_EXIT_COUNT(vc) == 0) { + vcpu->arch.ptid = vc->n_runnable - 1; + kvmppc_start_thread(vcpu); } - if (!vc->vcore_running && - vc->n_runnable + vc->n_blocked == vc->num_threads) { - /* we can run now */ - if (kvmppc_run_core(vc)) - continue; - } + } else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST) + --vc->n_busy; - if (vc->vcore_running == 1 && VCORE_EXIT_COUNT(vc) == 0) - kvmppc_start_thread(vcpu); + while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && + !signal_pending(current)) { + if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) { + spin_unlock(&vc->lock); + kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE); + spin_lock(&vc->lock); + continue; + } + n_ceded = 0; + list_for_each_entry(v, &vc->runnable_threads, arch.run_list) + n_ceded += v->arch.ceded; + if (n_ceded == vc->n_runnable) + kvmppc_vcore_blocked(vc); + else + kvmppc_run_core(vc); + + list_for_each_entry_safe(v, vn, &vc->runnable_threads, + arch.run_list) { + kvmppc_core_deliver_interrupts(v); + if (signal_pending(v->arch.run_task)) { + kvmppc_remove_runnable(vc, v); + v->stat.signal_exits++; + v->arch.kvm_run->exit_reason = KVM_EXIT_INTR; + v->arch.ret = -EINTR; + wake_up(&v->arch.cpu_run); + } + } + } - /* wait for other threads to come in, or wait for vcore */ - prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state); - spin_unlock(&vc->lock); - schedule(); - finish_wait(&vcpu->arch.cpu_run, &wait); - spin_lock(&vc->lock); + if (signal_pending(current)) { + if (vc->vcore_state == VCORE_RUNNING || + vc->vcore_state == VCORE_EXITING) { + spin_unlock(&vc->lock); + kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE); + spin_lock(&vc->lock); + } + if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { + kvmppc_remove_runnable(vc, vcpu); + vcpu->stat.signal_exits++; + kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->arch.ret = -EINTR; + } } - if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) - kvmppc_remove_runnable(vc, vcpu); spin_unlock(&vc->lock); - return vcpu->arch.ret; } @@ -808,6 +834,21 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) return -EINVAL; } + /* No need to go into the guest when all we'll do is come back out */ + if (signal_pending(current)) { + run->exit_reason = KVM_EXIT_INTR; + return -EINTR; + } + + /* On PPC970, check that we have an RMA region */ + if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201)) + return -EPERM; + + flush_fp_to_thread(current); + flush_altivec_to_thread(current); + flush_vsx_to_thread(current); + vcpu->arch.wqp = &vcpu->arch.vcore->wq; + do { r = kvmppc_run_vcpu(run, vcpu); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index bc6ade9..f422231 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -52,7 +52,7 @@ kvmppc_skip_Hinterrupt: b . /* - * Call kvmppc_handler_trampoline_enter in real mode. + * Call kvmppc_hv_entry in real mode. * Must be called with interrupts hard-disabled. * * Input Registers: @@ -92,6 +92,12 @@ _GLOBAL(kvmppc_hv_entry_trampoline) kvm_start_guest: ld r1,PACAEMERGSP(r13) subi r1,r1,STACK_FRAME_OVERHEAD + ld r2,PACATOC(r13) + + /* were we napping due to cede? */ + lbz r0,HSTATE_NAPPING(r13) + cmpwi r0,0 + bne kvm_end_cede /* get vcpu pointer */ ld r4, HSTATE_KVM_VCPU(r13) @@ -279,15 +285,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) cmpwi r0,0 beq 20b - /* Set LPCR. Set the MER bit if there is a pending external irq. */ + /* Set LPCR and RMOR. 
*/ 10: ld r8,KVM_LPCR(r9) - ld r0,VCPU_PENDING_EXC(r4) - li r7,(1 << BOOK3S_IRQPRIO_EXTERNAL) - oris r7,r7,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h - and. r0,r0,r7 - beq 11f - ori r8,r8,LPCR_MER -11: mtspr SPRN_LPCR,r8 + mtspr SPRN_LPCR,r8 ld r8,KVM_RMOR(r9) mtspr SPRN_RMOR,r8 isync @@ -451,19 +451,50 @@ toc_tlbie_lock: mtctr r6 mtxer r7 - /* Move SRR0 and SRR1 into the respective regs */ +kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */ ld r6, VCPU_SRR0(r4) ld r7, VCPU_SRR1(r4) - mtspr SPRN_SRR0, r6 - mtspr SPRN_SRR1, r7 - ld r10, VCPU_PC(r4) + ld r11, VCPU_MSR(r4) /* r11 = vcpu->arch.msr & ~MSR_HV */ - ld r11, VCPU_MSR(r4) /* r10 = vcpu->arch.msr & ~MSR_HV */ rldicl r11, r11, 63 - MSR_HV_LG, 1 rotldi r11, r11, 1 + MSR_HV_LG ori r11, r11, MSR_ME + /* Check if we can deliver an external or decrementer interrupt now */ + ld r0,VCPU_PENDING_EXC(r4) + li r8,(1 << BOOK3S_IRQPRIO_EXTERNAL) + oris r8,r8,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h + and r0,r0,r8 + cmpdi cr1,r0,0 + andi. r0,r11,MSR_EE + beq cr1,11f +BEGIN_FTR_SECTION + mfspr r8,SPRN_LPCR + ori r8,r8,LPCR_MER + mtspr SPRN_LPCR,r8 + isync +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + beq 5f + li r0,BOOK3S_INTERRUPT_EXTERNAL +12: mr r6,r10 + mr r10,r0 + mr r7,r11 + li r11,(MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ + rotldi r11,r11,63 + b 5f +11: beq 5f + mfspr r0,SPRN_DEC + cmpwi r0,0 + li r0,BOOK3S_INTERRUPT_DECREMENTER + blt 12b + + /* Move SRR0 and SRR1 into the respective regs */ +5: mtspr SPRN_SRR0, r6 + mtspr SPRN_SRR1, r7 + li r0,0 + stb r0,VCPU_CEDED(r4) /* cancel cede */ + fast_guest_return: mtspr SPRN_HSRR0,r10 mtspr SPRN_HSRR1,r11 @@ -577,21 +608,20 @@ kvmppc_interrupt: /* See if this is something we can handle in real mode */ cmpwi r12,BOOK3S_INTERRUPT_SYSCALL beq hcall_try_real_mode -hcall_real_cont: /* Check for mediated interrupts (could be done earlier really ...) */ BEGIN_FTR_SECTION cmpwi r12,BOOK3S_INTERRUPT_EXTERNAL bne+ 1f - ld r5,VCPU_KVM(r9) - ld r5,KVM_LPCR(r5) andi. r0,r11,MSR_EE beq 1f + mfspr r5,SPRN_LPCR andi. r0,r5,LPCR_MER bne bounce_ext_interrupt 1: END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) +hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ /* Save DEC */ mfspr r5,SPRN_DEC mftb r6 @@ -685,7 +715,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201) slbia ptesync -hdec_soon: +hdec_soon: /* r9 = vcpu, r12 = trap, r13 = paca */ BEGIN_FTR_SECTION b 32f END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) @@ -703,6 +733,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) addi r0,r3,0x100 stwcx. r0,0,r6 bne 41b + lwsync /* * At this point we have an interrupt that we have to pass @@ -716,18 +747,39 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) * interrupt, since the other threads will already be on their * way here in that case. */ + cmpwi r3,0x100 /* Are we the first here? */ + bge 43f + cmpwi r3,1 /* Are any other threads in the guest? */ + ble 43f cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER beq 40f - cmpwi r3,0x100 /* Are we the first here? */ - bge 40f - cmpwi r3,1 - ble 40f li r0,0 mtspr SPRN_HDEC,r0 40: + /* + * Send an IPI to any napping threads, since an HDEC interrupt + * doesn't wake CPUs up from nap. + */ + lwz r3,VCORE_NAPPING_THREADS(r5) + lwz r4,VCPU_PTID(r9) + li r0,1 + sldi r0,r0,r4 + andc. r3,r3,r0 /* no sense IPI'ing ourselves */ + beq 43f + mulli r4,r4,PACA_SIZE /* get paca for thread 0 */ + subf r6,r4,r13 +42: andi. r0,r3,1 + beq 44f + ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */ + li r0,IPI_PRIORITY + li r7,XICS_QIRR + stbcix r0,r7,r8 /* trigger the IPI */ +44: srdi. 
r3,r3,1 + addi r6,r6,PACA_SIZE + bne 42b /* Secondary threads wait for primary to do partition switch */ - ld r4,VCPU_KVM(r9) /* pointer to struct kvm */ +43: ld r4,VCPU_KVM(r9) /* pointer to struct kvm */ ld r5,HSTATE_KVM_VCORE(r13) lwz r3,VCPU_PTID(r9) cmpwi r3,0 @@ -1080,7 +1132,6 @@ hcall_try_real_mode: hcall_real_fallback: li r12,BOOK3S_INTERRUPT_SYSCALL ld r9, HSTATE_KVM_VCPU(r13) - ld r11, VCPU_MSR(r9) b hcall_real_cont @@ -1142,7 +1193,7 @@ hcall_real_table: .long 0 /* 0xd4 */ .long 0 /* 0xd8 */ .long 0 /* 0xdc */ - .long 0 /* 0xe0 */ + .long .kvmppc_h_cede - hcall_real_table .long 0 /* 0xe4 */ .long 0 /* 0xe8 */ .long 0 /* 0xec */ @@ -1171,7 +1222,8 @@ bounce_ext_interrupt: mtspr SPRN_SRR0,r10 mtspr SPRN_SRR1,r11 li r10,BOOK3S_INTERRUPT_EXTERNAL - LOAD_REG_IMMEDIATE(r11,MSR_SF | MSR_ME); + li r11,(MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ + rotldi r11,r11,63 b fast_guest_return _GLOBAL(kvmppc_h_set_dabr) @@ -1180,6 +1232,178 @@ _GLOBAL(kvmppc_h_set_dabr) li r3,0 blr +_GLOBAL(kvmppc_h_cede) + ori r11,r11,MSR_EE + std r11,VCPU_MSR(r3) + li r0,1 + stb r0,VCPU_CEDED(r3) + sync /* order setting ceded vs. testing prodded */ + lbz r5,VCPU_PRODDED(r3) + cmpwi r5,0 + bne 1f + li r0,0 /* set trap to 0 to say hcall is handled */ + stw r0,VCPU_TRAP(r3) + li r0,H_SUCCESS + std r0,VCPU_GPR(r3)(r3) +BEGIN_FTR_SECTION + b 2f /* just send it up to host on 970 */ +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) + + /* + * Set our bit in the bitmask of napping threads unless all the + * other threads are already napping, in which case we send this + * up to the host. + */ + ld r5,HSTATE_KVM_VCORE(r13) + lwz r6,VCPU_PTID(r3) + lwz r8,VCORE_ENTRY_EXIT(r5) + clrldi r8,r8,56 + li r0,1 + sld r0,r0,r6 + addi r6,r5,VCORE_NAPPING_THREADS +31: lwarx r4,0,r6 + or r4,r4,r0 + popcntw r7,r4 + cmpw r7,r8 + bge 2f + stwcx. r4,0,r6 + bne 31b + li r0,1 + stb r0,HSTATE_NAPPING(r13) + /* order napping_threads update vs testing entry_exit_count */ + lwsync + mr r4,r3 + lwz r7,VCORE_ENTRY_EXIT(r5) + cmpwi r7,0x100 + bge 33f /* another thread already exiting */ + +/* + * Although not specifically required by the architecture, POWER7 + * preserves the following registers in nap mode, even if an SMT mode + * switch occurs: SLB entries, PURR, SPURR, AMOR, UAMOR, AMR, SPRG0-3, + * DAR, DSISR, DABR, DABRX, DSCR, PMCx, MMCRx, SIAR, SDAR. + */ + /* Save non-volatile GPRs */ + std r14, VCPU_GPR(r14)(r3) + std r15, VCPU_GPR(r15)(r3) + std r16, VCPU_GPR(r16)(r3) + std r17, VCPU_GPR(r17)(r3) + std r18, VCPU_GPR(r18)(r3) + std r19, VCPU_GPR(r19)(r3) + std r20, VCPU_GPR(r20)(r3) + std r21, VCPU_GPR(r21)(r3) + std r22, VCPU_GPR(r22)(r3) + std r23, VCPU_GPR(r23)(r3) + std r24, VCPU_GPR(r24)(r3) + std r25, VCPU_GPR(r25)(r3) + std r26, VCPU_GPR(r26)(r3) + std r27, VCPU_GPR(r27)(r3) + std r28, VCPU_GPR(r28)(r3) + std r29, VCPU_GPR(r29)(r3) + std r30, VCPU_GPR(r30)(r3) + std r31, VCPU_GPR(r31)(r3) + + /* save FP state */ + bl .kvmppc_save_fp + + /* + * Take a nap until a decrementer or external interrupt occurs, + * with PECE1 (wake on decr) and PECE0 (wake on external) set in LPCR + */ + li r0,0x80 + stb r0,PACAPROCSTART(r13) + mfspr r5,SPRN_LPCR + ori r5,r5,LPCR_PECE0 | LPCR_PECE1 + mtspr SPRN_LPCR,r5 + isync + li r0, 0 + std r0, HSTATE_SCRATCH0(r13) + ptesync + ld r0, HSTATE_SCRATCH0(r13) +1: cmpd r0, r0 + bne 1b + nap + b . 
+ +kvm_end_cede: + /* Woken by external or decrementer interrupt */ + ld r1, HSTATE_HOST_R1(r13) + ld r2, PACATOC(r13) + + /* If we're a secondary thread and we got here by an IPI, ack it */ + ld r4,HSTATE_KVM_VCPU(r13) + lwz r3,VCPU_PTID(r4) + cmpwi r3,0 + beq 27f + mfspr r3,SPRN_SRR1 + rlwinm r3,r3,44-31,0x7 /* extract wake reason field */ + cmpwi r3,4 /* was it an external interrupt? */ + bne 27f + ld r5, HSTATE_XICS_PHYS(r13) + li r0,0xff + li r6,XICS_QIRR + li r7,XICS_XIRR + lwzcix r8,r5,r7 /* ack the interrupt */ + sync + stbcix r0,r5,r6 /* clear it */ + stwcix r8,r5,r7 /* EOI it */ +27: + /* load up FP state */ + bl kvmppc_load_fp + + /* Load NV GPRS */ + ld r14, VCPU_GPR(r14)(r4) + ld r15, VCPU_GPR(r15)(r4) + ld r16, VCPU_GPR(r16)(r4) + ld r17, VCPU_GPR(r17)(r4) + ld r18, VCPU_GPR(r18)(r4) + ld r19, VCPU_GPR(r19)(r4) + ld r20, VCPU_GPR(r20)(r4) + ld r21, VCPU_GPR(r21)(r4) + ld r22, VCPU_GPR(r22)(r4) + ld r23, VCPU_GPR(r23)(r4) + ld r24, VCPU_GPR(r24)(r4) + ld r25, VCPU_GPR(r25)(r4) + ld r26, VCPU_GPR(r26)(r4) + ld r27, VCPU_GPR(r27)(r4) + ld r28, VCPU_GPR(r28)(r4) + ld r29, VCPU_GPR(r29)(r4) + ld r30, VCPU_GPR(r30)(r4) + ld r31, VCPU_GPR(r31)(r4) + + /* clear our bit in vcore->napping_threads */ +33: ld r5,HSTATE_KVM_VCORE(r13) + lwz r3,VCPU_PTID(r4) + li r0,1 + sld r0,r0,r3 + addi r6,r5,VCORE_NAPPING_THREADS +32: lwarx r7,0,r6 + andc r7,r7,r0 + stwcx. r7,0,r6 + bne 32b + li r0,0 + stb r0,HSTATE_NAPPING(r13) + + /* see if any other thread is already exiting */ + lwz r0,VCORE_ENTRY_EXIT(r5) + cmpwi r0,0x100 + blt kvmppc_cede_reentry /* if not go back to guest */ + + /* some threads are exiting, so go to the guest exit path */ + b hcall_real_fallback + + /* cede when already previously prodded case */ +1: li r0,0 + stb r0,VCPU_PRODDED(r3) + sync /* order testing prodded vs. clearing ceded */ + stb r0,VCPU_CEDED(r3) + li r3,H_SUCCESS + blr + + /* we've ceded but we want to give control to the host */ +2: li r3,H_TOO_HARD + blr + secondary_too_late: ld r5,HSTATE_KVM_VCORE(r13) HMT_LOW @@ -1197,14 +1421,20 @@ secondary_too_late: slbmte r6,r5 1: addi r11,r11,16 .endr - b 50f secondary_nap: - /* Clear any pending IPI */ -50: ld r5, HSTATE_XICS_PHYS(r13) + /* Clear any pending IPI - assume we're a secondary thread */ + ld r5, HSTATE_XICS_PHYS(r13) + li r7, XICS_XIRR + lwzcix r3, r5, r7 /* ack any pending interrupt */ + rlwinm. r0, r3, 0, 0xffffff /* any pending? */ + beq 37f + sync li r0, 0xff li r6, XICS_QIRR - stbcix r0, r5, r6 + stbcix r0, r5, r6 /* clear the IPI */ + stwcix r3, r5, r7 /* EOI it */ +37: sync /* increment the nap count and then go to nap mode */ ld r4, HSTATE_KVM_VCORE(r13) @@ -1214,13 +1444,12 @@ secondary_nap: addi r3, r3, 1 stwcx. 
r3, 0, r4 bne 51b - isync + li r3, LPCR_PECE0 mfspr r4, SPRN_LPCR - li r0, LPCR_PECE - andc r4, r4, r0 - ori r4, r4, LPCR_PECE0 /* exit nap on interrupt */ + rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 mtspr SPRN_LPCR, r4 + isync li r0, 0 std r0, HSTATE_SCRATCH0(r13) ptesync diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index a8000ce..0d843c6 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -39,12 +39,8 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { -#ifndef CONFIG_KVM_BOOK3S_64_HV return !(v->arch.shared->msr & MSR_WE) || !!(v->arch.pending_exceptions); -#else - return !(v->arch.ceded) || !!(v->arch.pending_exceptions); -#endif } int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) @@ -285,6 +281,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; vcpu = kvmppc_core_vcpu_create(kvm, id); + vcpu->arch.wqp = &vcpu->wq; if (!IS_ERR(vcpu)) kvmppc_create_vcpu_debugfs(vcpu, id); return vcpu; @@ -316,8 +313,8 @@ static void kvmppc_decrementer_func(unsigned long data) kvmppc_core_queue_dec(vcpu); - if (waitqueue_active(&vcpu->wq)) { - wake_up_interruptible(&vcpu->wq); + if (waitqueue_active(vcpu->arch.wqp)) { + wake_up_interruptible(vcpu->arch.wqp); vcpu->stat.halt_wakeup++; } } @@ -570,13 +567,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { - if (irq->irq == KVM_INTERRUPT_UNSET) + if (irq->irq == KVM_INTERRUPT_UNSET) { kvmppc_core_dequeue_external(vcpu, irq); - else - kvmppc_core_queue_external(vcpu, irq); + return 0; + } + + kvmppc_core_queue_external(vcpu, irq); - if (waitqueue_active(&vcpu->wq)) { - wake_up_interruptible(&vcpu->wq); + if (waitqueue_active(vcpu->arch.wqp)) { + wake_up_interruptible(vcpu->arch.wqp); vcpu->stat.halt_wakeup++; } else if (vcpu->cpu != -1) { smp_send_reschedule(vcpu->cpu); -- cgit v0.10.2 From 821246a5a548858c78a5e8860862e91c9e035d6b Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Wed, 31 Aug 2011 10:58:55 +0200 Subject: KVM: Update documentation to include detailed ENABLE_CAP description We have an ioctl that enables capabilities individually, but no description on what exactly happens when we enable a capability using this ioctl. This patch adds documentation for capability enabling in a new section of the API documentation. Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 2d510b6..7945b0b 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1653,3 +1653,50 @@ developer registration required to access it). char padding[256]; }; }; + +6. Capabilities that can be enabled + +There are certain capabilities that change the behavior of the virtual CPU when +enabled. To enable them, please see section 4.37. Below you can find a list of +capabilities and what their effect on the vCPU is when enabling them. + +The following information is provided along with the description: + + Architectures: which instruction set architectures provide this ioctl. + x86 includes both i386 and x86_64. + + Parameters: what parameters are accepted by the capability. + + Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) + are not detailed, but errors with specific meanings are. 
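As a concrete reference for the capability-enable ioctl described in section 4.37 (KVM_ENABLE_CAP), enabling one of these capabilities from userspace is a single vcpu ioctl. A minimal sketch follows; the vcpu_fd handle and the enable_papr() wrapper are illustrative, not part of the ABI.

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustrative helper: enable KVM_CAP_PPC_PAPR on an already-created vcpu fd. */
static int enable_papr(int vcpu_fd)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_PPC_PAPR;	/* this capability takes no flags or arguments */

	/* Returns 0 on success, -1 with errno set on error. */
	return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
}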
+ +6.1 KVM_CAP_PPC_OSI + +Architectures: ppc +Parameters: none +Returns: 0 on success; -1 on error + +This capability enables interception of OSI hypercalls that otherwise would +be treated as normal system calls to be injected into the guest. OSI hypercalls +were invented by Mac-on-Linux to have a standardized communication mechanism +between the guest and the host. + +When this capability is enabled, KVM_EXIT_OSI can occur. + +6.2 KVM_CAP_PPC_PAPR + +Architectures: ppc +Parameters: none +Returns: 0 on success; -1 on error + +This capability enables interception of PAPR hypercalls. PAPR hypercalls are +done using the hypercall instruction "sc 1". + +It also sets the guest privilege level to "supervisor" mode. Usually the guest +runs in "hypervisor" privilege mode with a few missing features. + +In addition to the above, it changes the semantics of SDR1. In this mode, the +HTAB address part of SDR1 contains an HVA instead of a GPA, as PAPR keeps the +HTAB invisible to the guest. + +When this capability is enabled, KVM_EXIT_PAPR_HCALL can occur. -- cgit v0.10.2 From a31b9ceadb61487042dec92f662736ccddadc75f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 7 Sep 2011 16:41:35 +0300 Subject: KVM: x86 emulator: simplify emulate_2op_SrcV() emulate_2op_SrcV(), and its siblings, emulate_2op_SrcV_nobyte() and emulate_2op_SrcB(), all use the same calling conventions and all get passed exactly the same parameters. Simplify them by passing just the emulation context. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index fe5eb6d..458914d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -205,64 +205,62 @@ struct gprefix { #define ON64(x) #endif -#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \ +#define ____emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype) \ do { \ __asm__ __volatile__ ( \ _PRE_EFLAGS("0", "4", "2") \ _op _suffix " %"_x"3,%1; " \ _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\ + : "=m" ((ctxt)->eflags), \ + "+q" (*(_dsttype*)&(ctxt)->dst.val), \ "=&r" (_tmp) \ - : _y ((_src).val), "i" (EFLAGS_MASK)); \ + : _y ((ctxt)->src.val), "i" (EFLAGS_MASK)); \ } while (0) /* Raw emulation: instruction has two explicit operands. */ -#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ +#define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy) \ do { \ unsigned long _tmp; \ \ - switch ((_dst).bytes) { \ + switch ((ctxt)->dst.bytes) { \ case 2: \ - ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\ + ____emulate_2op(ctxt,_op,_wx,_wy,"w",u16); \ break; \ case 4: \ - ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\ + ____emulate_2op(ctxt,_op,_lx,_ly,"l",u32); \ break; \ case 8: \ - ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \ + ON64(____emulate_2op(ctxt,_op,_qx,_qy,"q",u64)); \ break; \ } \ } while (0) -#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ +#define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ do { \ unsigned long _tmp; \ - switch ((_dst).bytes) { \ + switch ((ctxt)->dst.bytes) { \ case 1: \ - ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \ + ____emulate_2op(ctxt,_op,_bx,_by,"b",u8); \ break; \ default: \ - __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + __emulate_2op_nobyte(ctxt, _op, \ _wx, _wy, _lx, _ly, _qx, _qy); \ break; \ } \ } while (0) /* Source operand is byte-sized and may be restricted to just %cl. 
*/ -#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ - __emulate_2op(_op, _src, _dst, _eflags, \ - "b", "c", "b", "c", "b", "c", "b", "c") +#define emulate_2op_SrcB(ctxt, _op) \ + __emulate_2op(ctxt, _op, "b", "c", "b", "c", "b", "c", "b", "c") /* Source operand is byte, word, long or quad sized. */ -#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ - __emulate_2op(_op, _src, _dst, _eflags, \ - "b", "q", "w", "r", _LO32, "r", "", "r") +#define emulate_2op_SrcV(ctxt, _op) \ + __emulate_2op(ctxt, _op, "b", "q", "w", "r", _LO32, "r", "", "r") /* Source operand is word, long or quad sized. */ -#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ - __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ - "w", "r", _LO32, "r", "", "r") +#define emulate_2op_SrcV_nobyte(ctxt, _op) \ + __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r") /* Instruction has three operands and one operand is stored in ECX register */ #define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ @@ -1681,26 +1679,26 @@ static int em_grp2(struct x86_emulate_ctxt *ctxt) { switch (ctxt->modrm_reg) { case 0: /* rol */ - emulate_2op_SrcB("rol", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcB(ctxt, "rol"); break; case 1: /* ror */ - emulate_2op_SrcB("ror", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcB(ctxt, "ror"); break; case 2: /* rcl */ - emulate_2op_SrcB("rcl", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcB(ctxt, "rcl"); break; case 3: /* rcr */ - emulate_2op_SrcB("rcr", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcB(ctxt, "rcr"); break; case 4: /* sal/shl */ case 6: /* sal/shl */ - emulate_2op_SrcB("sal", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcB(ctxt, "sal"); break; case 5: /* shr */ - emulate_2op_SrcB("shr", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcB(ctxt, "shr"); break; case 7: /* sar */ - emulate_2op_SrcB("sar", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcB(ctxt, "sar"); break; } return X86EMUL_CONTINUE; @@ -1714,7 +1712,7 @@ static int em_grp3(struct x86_emulate_ctxt *ctxt) switch (ctxt->modrm_reg) { case 0 ... 
1: /* test */ - emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "test"); break; case 2: /* not */ ctxt->dst.val = ~ctxt->dst.val; @@ -2459,7 +2457,7 @@ static int em_das(struct x86_emulate_ctxt *ctxt) ctxt->src.type = OP_IMM; ctxt->src.val = 0; ctxt->src.bytes = 1; - emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "or"); ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); if (cf) ctxt->eflags |= X86_EFLAGS_CF; @@ -2509,49 +2507,49 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) static int em_add(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "add"); return X86EMUL_CONTINUE; } static int em_or(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "or"); return X86EMUL_CONTINUE; } static int em_adc(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("adc", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "adc"); return X86EMUL_CONTINUE; } static int em_sbb(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("sbb", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "sbb"); return X86EMUL_CONTINUE; } static int em_and(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("and", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "and"); return X86EMUL_CONTINUE; } static int em_sub(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("sub", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "sub"); return X86EMUL_CONTINUE; } static int em_xor(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("xor", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "xor"); return X86EMUL_CONTINUE; } static int em_cmp(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "cmp"); /* Disable writeback. */ ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; @@ -2559,7 +2557,7 @@ static int em_cmp(struct x86_emulate_ctxt *ctxt) static int em_test(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "test"); return X86EMUL_CONTINUE; } @@ -2577,7 +2575,7 @@ static int em_xchg(struct x86_emulate_ctxt *ctxt) static int em_imul(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV_nobyte("imul", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV_nobyte(ctxt, "imul"); return X86EMUL_CONTINUE; } @@ -4121,7 +4119,7 @@ twobyte_insn: ctxt->dst.type = OP_NONE; /* only subword offset */ ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("bt", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV_nobyte(ctxt, "bt"); break; case 0xa4: /* shld imm8, r, r/m */ case 0xa5: /* shld cl, r, r/m */ @@ -4135,7 +4133,7 @@ twobyte_insn: break; case 0xab: bts: /* bts */ - emulate_2op_SrcV_nobyte("bts", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV_nobyte(ctxt, "bts"); break; case 0xac: /* shrd imm8, r, r/m */ case 0xad: /* shrd cl, r, r/m */ @@ -4150,7 +4148,7 @@ twobyte_insn: */ ctxt->src.orig_val = ctxt->src.val; ctxt->src.val = ctxt->regs[VCPU_REGS_RAX]; - emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "cmp"); if (ctxt->eflags & EFLG_ZF) { /* Success: write back to memory. 
*/ ctxt->dst.val = ctxt->src.orig_val; @@ -4165,7 +4163,7 @@ twobyte_insn: break; case 0xb3: btr: /* btr */ - emulate_2op_SrcV_nobyte("btr", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV_nobyte(ctxt, "btr"); break; case 0xb4: /* lfs */ rc = emulate_load_segment(ctxt, VCPU_SREG_FS); @@ -4192,7 +4190,7 @@ twobyte_insn: break; case 0xbb: btc: /* btc */ - emulate_2op_SrcV_nobyte("btc", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV_nobyte(ctxt, "btc"); break; case 0xbc: { /* bsf */ u8 zf; @@ -4224,7 +4222,7 @@ twobyte_insn: (s16) ctxt->src.val; break; case 0xc0 ... 0xc1: /* xadd */ - emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "add"); /* Write back the register source. */ ctxt->src.val = ctxt->dst.orig_val; write_register_operand(&ctxt->src); -- cgit v0.10.2 From 761441b9f42159409d56f74dcc7ce5538d9efd69 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 7 Sep 2011 16:41:36 +0300 Subject: KVM: x86 emulator: simplify emulate_2op_cl() emulate_2op_cl() is always called with the same parameters. Simplify by passing just the emulation context. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 458914d..0b33884 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -263,40 +263,37 @@ struct gprefix { __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r") /* Instruction has three operands and one operand is stored in ECX register */ -#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ +#define __emulate_2op_cl(_op, ctxt, _suffix, _type) \ do { \ unsigned long _tmp; \ - _type _clv = (_cl).val; \ - _type _srcv = (_src).val; \ - _type _dstv = (_dst).val; \ + _type _clv = (ctxt)->src2.val; \ + _type _srcv = (ctxt)->src.val; \ + _type _dstv = (ctxt)->dst.val; \ \ __asm__ __volatile__ ( \ _PRE_EFLAGS("0", "5", "2") \ _op _suffix " %4,%1 \n" \ _POST_EFLAGS("0", "5", "2") \ - : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ + : "=m" ((ctxt)->eflags), "+r" (_dstv), "=&r" (_tmp) \ : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ ); \ \ - (_cl).val = (unsigned long) _clv; \ - (_src).val = (unsigned long) _srcv; \ - (_dst).val = (unsigned long) _dstv; \ + (ctxt)->src2.val = (unsigned long) _clv; \ + (ctxt)->src2.val = (unsigned long) _srcv; \ + (ctxt)->dst.val = (unsigned long) _dstv; \ } while (0) -#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ +#define emulate_2op_cl(ctxt, _op) \ do { \ - switch ((_dst).bytes) { \ + switch ((ctxt)->dst.bytes) { \ case 2: \ - __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "w", unsigned short); \ + __emulate_2op_cl(_op, ctxt, "w", u16); \ break; \ case 4: \ - __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "l", unsigned int); \ + __emulate_2op_cl(_op, ctxt, "l", u32); \ break; \ case 8: \ - ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "q", unsigned long)); \ + ON64(__emulate_2op_cl(_op, ctxt, "q", ulong)); \ break; \ } \ } while (0) @@ -4123,7 +4120,7 @@ twobyte_insn: break; case 0xa4: /* shld imm8, r, r/m */ case 0xa5: /* shld cl, r, r/m */ - emulate_2op_cl("shld", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_cl(ctxt, "shld"); break; case 0xa8: /* push gs */ rc = emulate_push_sreg(ctxt, VCPU_SREG_GS); @@ -4137,7 +4134,7 @@ twobyte_insn: break; case 0xac: /* shrd imm8, r, r/m */ case 0xad: /* shrd cl, r, r/m */ - emulate_2op_cl("shrd", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_cl(ctxt, "shrd"); break; case 0xae: /* clflush */ break; -- 
cgit v0.10.2 From 29053a60d791a492b4609d87397b70a7a3254eb2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 7 Sep 2011 16:41:37 +0300 Subject: KVM: x86 emulator: simplify emulate_2op_cl() emulate_2op_cl() is always called with the same parameters. Simplify by passing just the emulation context. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0b33884..14b2791 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -263,7 +263,7 @@ struct gprefix { __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r") /* Instruction has three operands and one operand is stored in ECX register */ -#define __emulate_2op_cl(_op, ctxt, _suffix, _type) \ +#define __emulate_2op_cl(ctxt, _op, _suffix, _type) \ do { \ unsigned long _tmp; \ _type _clv = (ctxt)->src2.val; \ @@ -287,13 +287,13 @@ struct gprefix { do { \ switch ((ctxt)->dst.bytes) { \ case 2: \ - __emulate_2op_cl(_op, ctxt, "w", u16); \ + __emulate_2op_cl(ctxt, _op, "w", u16); \ break; \ case 4: \ - __emulate_2op_cl(_op, ctxt, "l", u32); \ + __emulate_2op_cl(ctxt, _op, "l", u32); \ break; \ case 8: \ - ON64(__emulate_2op_cl(_op, ctxt, "q", ulong)); \ + ON64(__emulate_2op_cl(ctxt, _op, "q", ulong)); \ break; \ } \ } while (0) -- cgit v0.10.2 From d1eef45d5906a0738684833cf1e0395b4d098340 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 7 Sep 2011 16:41:38 +0300 Subject: KVM: x86 emulator: simplify emulate_1op() emulate_1op() is always called with the same parameters. Simplify by passing just the emulation context. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 14b2791..5b4e1d4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -298,7 +298,7 @@ struct gprefix { } \ } while (0) -#define __emulate_1op(_op, _dst, _eflags, _suffix) \ +#define __emulate_1op(ctxt, _op, _suffix) \ do { \ unsigned long _tmp; \ \ @@ -306,19 +306,19 @@ struct gprefix { _PRE_EFLAGS("0", "3", "2") \ _op _suffix " %1; " \ _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "+m" ((_dst).val), \ + : "=m" ((ctxt)->eflags), "+m" ((ctxt)->dst.val), \ "=&r" (_tmp) \ : "i" (EFLAGS_MASK)); \ } while (0) /* Instruction has only one explicit operand (no source operand). 
*/ -#define emulate_1op(_op, _dst, _eflags) \ +#define emulate_1op(ctxt, _op) \ do { \ - switch ((_dst).bytes) { \ - case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \ - case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \ - case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \ - case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \ + switch ((ctxt)->dst.bytes) { \ + case 1: __emulate_1op(ctxt, _op, "b"); break; \ + case 2: __emulate_1op(ctxt, _op, "w"); break; \ + case 4: __emulate_1op(ctxt, _op, "l"); break; \ + case 8: ON64(__emulate_1op(ctxt, _op, "q")); break; \ } \ } while (0) @@ -1715,7 +1715,7 @@ static int em_grp3(struct x86_emulate_ctxt *ctxt) ctxt->dst.val = ~ctxt->dst.val; break; case 3: /* neg */ - emulate_1op("neg", ctxt->dst, ctxt->eflags); + emulate_1op(ctxt, "neg"); break; case 4: /* mul */ emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, ctxt->eflags); @@ -1745,10 +1745,10 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt) switch (ctxt->modrm_reg) { case 0: /* inc */ - emulate_1op("inc", ctxt->dst, ctxt->eflags); + emulate_1op(ctxt, "inc"); break; case 1: /* dec */ - emulate_1op("dec", ctxt->dst, ctxt->eflags); + emulate_1op(ctxt, "dec"); break; case 2: /* call near abs */ { long int old_eip; @@ -3849,10 +3849,10 @@ special_insn: rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS); break; case 0x40 ... 0x47: /* inc r16/r32 */ - emulate_1op("inc", ctxt->dst, ctxt->eflags); + emulate_1op(ctxt, "inc"); break; case 0x48 ... 0x4f: /* dec r16/r32 */ - emulate_1op("dec", ctxt->dst, ctxt->eflags); + emulate_1op(ctxt, "dec"); break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) -- cgit v0.10.2 From 9fef72ce10dc8263fec9350a8e2a1c505ebedaae Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 7 Sep 2011 16:41:39 +0300 Subject: KVM: x86 emulator: merge the two emulate_1op_rax_rdx implementations We have two emulate-with-extended-accumulator implementations: once which expect traps (_ex) and one which doesn't (plain). Drop the plain implementation and always use the one which expects traps; it will simply return 0 in the _ex argument and we can happily ignore it. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5b4e1d4..6f2e419 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -322,21 +322,7 @@ struct gprefix { } \ } while (0) -#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \ - do { \ - unsigned long _tmp; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "1") \ - _op _suffix " %5; " \ - _POST_EFLAGS("0", "4", "1") \ - : "=m" (_eflags), "=&r" (_tmp), \ - "+a" (_rax), "+d" (_rdx) \ - : "i" (EFLAGS_MASK), "m" ((_src).val), \ - "a" (_rax), "d" (_rdx)); \ - } while (0) - -#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \ +#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \ do { \ unsigned long _tmp; \ \ @@ -358,46 +344,24 @@ struct gprefix { } while (0) /* instruction has only one source operand, destination is implicit (e.g. 
mul, div, imul, idiv) */ -#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ +#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _ex) \ do { \ switch((_src).bytes) { \ case 1: \ __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ - _eflags, "b"); \ + _eflags, "b", _ex); \ break; \ case 2: \ __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ - _eflags, "w"); \ + _eflags, "w", _ex); \ break; \ case 4: \ __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ - _eflags, "l"); \ - break; \ - case 8: \ - ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ - _eflags, "q")); \ - break; \ - } \ - } while (0) - -#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \ - do { \ - switch((_src).bytes) { \ - case 1: \ - __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ - _eflags, "b", _ex); \ - break; \ - case 2: \ - __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ - _eflags, "w", _ex); \ - break; \ - case 4: \ - __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ - _eflags, "l", _ex); \ + _eflags, "l", _ex); \ break; \ case 8: ON64( \ - __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ - _eflags, "q", _ex)); \ + __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ + _eflags, "q", _ex)); \ break; \ } \ } while (0) @@ -1718,18 +1682,20 @@ static int em_grp3(struct x86_emulate_ctxt *ctxt) emulate_1op(ctxt, "neg"); break; case 4: /* mul */ - emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, ctxt->eflags); + emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, + ctxt->eflags, de); break; case 5: /* imul */ - emulate_1op_rax_rdx("imul", ctxt->src, *rax, *rdx, ctxt->eflags); + emulate_1op_rax_rdx("imul", ctxt->src, *rax, *rdx, + ctxt->eflags, de); break; case 6: /* div */ - emulate_1op_rax_rdx_ex("div", ctxt->src, *rax, *rdx, - ctxt->eflags, de); + emulate_1op_rax_rdx("div", ctxt->src, *rax, *rdx, + ctxt->eflags, de); break; case 7: /* idiv */ - emulate_1op_rax_rdx_ex("idiv", ctxt->src, *rax, *rdx, - ctxt->eflags, de); + emulate_1op_rax_rdx("idiv", ctxt->src, *rax, *rdx, + ctxt->eflags, de); break; default: return X86EMUL_UNHANDLEABLE; -- cgit v0.10.2 From e8f2b1d621e7b73c16aaae2ccf4a64d09a0f56d8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 7 Sep 2011 16:41:40 +0300 Subject: KVM: x86 emulator: simplify emulate_1op_rax_rdx() emulate_1op_rax_rdx() is always called with the same parameters. Simplify by passing just the emulation context. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6f2e419..e10fd37 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -322,9 +322,11 @@ struct gprefix { } \ } while (0) -#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \ +#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ do { \ unsigned long _tmp; \ + ulong *rax = &(ctxt)->regs[VCPU_REGS_RAX]; \ + ulong *rdx = &(ctxt)->regs[VCPU_REGS_RDX]; \ \ __asm__ __volatile__ ( \ _PRE_EFLAGS("0", "5", "1") \ @@ -337,31 +339,27 @@ struct gprefix { "jmp 2b \n\t" \ ".popsection \n\t" \ _ASM_EXTABLE(1b, 3b) \ - : "=m" (_eflags), "=&r" (_tmp), \ - "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \ - : "i" (EFLAGS_MASK), "m" ((_src).val), \ - "a" (_rax), "d" (_rdx)); \ + : "=m" ((ctxt)->eflags), "=&r" (_tmp), \ + "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \ + : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val), \ + "a" (*rax), "d" (*rdx)); \ } while (0) /* instruction has only one source operand, destination is implicit (e.g. 
mul, div, imul, idiv) */ -#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _ex) \ +#define emulate_1op_rax_rdx(ctxt, _op, _ex) \ do { \ - switch((_src).bytes) { \ + switch((ctxt)->src.bytes) { \ case 1: \ - __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ - _eflags, "b", _ex); \ + __emulate_1op_rax_rdx(ctxt, _op, "b", _ex); \ break; \ case 2: \ - __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ - _eflags, "w", _ex); \ + __emulate_1op_rax_rdx(ctxt, _op, "w", _ex); \ break; \ case 4: \ - __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ - _eflags, "l", _ex); \ + __emulate_1op_rax_rdx(ctxt, _op, "l", _ex); \ break; \ case 8: ON64( \ - __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ - _eflags, "q", _ex)); \ + __emulate_1op_rax_rdx(ctxt, _op, "q", _ex)); \ break; \ } \ } while (0) @@ -1667,8 +1665,6 @@ static int em_grp2(struct x86_emulate_ctxt *ctxt) static int em_grp3(struct x86_emulate_ctxt *ctxt) { - unsigned long *rax = &ctxt->regs[VCPU_REGS_RAX]; - unsigned long *rdx = &ctxt->regs[VCPU_REGS_RDX]; u8 de = 0; switch (ctxt->modrm_reg) { @@ -1682,20 +1678,16 @@ static int em_grp3(struct x86_emulate_ctxt *ctxt) emulate_1op(ctxt, "neg"); break; case 4: /* mul */ - emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, - ctxt->eflags, de); + emulate_1op_rax_rdx(ctxt, "mul", de); break; case 5: /* imul */ - emulate_1op_rax_rdx("imul", ctxt->src, *rax, *rdx, - ctxt->eflags, de); + emulate_1op_rax_rdx(ctxt, "imul", de); break; case 6: /* div */ - emulate_1op_rax_rdx("div", ctxt->src, *rax, *rdx, - ctxt->eflags, de); + emulate_1op_rax_rdx(ctxt, "div", de); break; case 7: /* idiv */ - emulate_1op_rax_rdx("idiv", ctxt->src, *rax, *rdx, - ctxt->eflags, de); + emulate_1op_rax_rdx(ctxt, "idiv", de); break; default: return X86EMUL_UNHANDLEABLE; -- cgit v0.10.2 From 9f9f6b78776be3de02e0c7533924e6954b918123 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 11 Sep 2011 10:16:16 +0200 Subject: KVM: Clean up unneeded void pointer casts Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index 4e9eaeb..ea1bf5d 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -143,7 +143,7 @@ static void deassign_host_irq(struct kvm *kvm, for (i = 0; i < assigned_dev->entries_nr; i++) free_irq(assigned_dev->host_msix_entries[i].vector, - (void *)assigned_dev); + assigned_dev); assigned_dev->entries_nr = 0; kfree(assigned_dev->host_msix_entries); @@ -153,7 +153,7 @@ static void deassign_host_irq(struct kvm *kvm, /* Deal with MSI and INTx */ disable_irq(assigned_dev->host_irq); - free_irq(assigned_dev->host_irq, (void *)assigned_dev); + free_irq(assigned_dev->host_irq, assigned_dev); if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) pci_disable_msi(assigned_dev->dev); @@ -237,7 +237,7 @@ static int assigned_device_enable_host_intx(struct kvm *kvm, * are going to be long delays in accepting, acking, etc. 
*/ if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, - IRQF_ONESHOT, dev->irq_name, (void *)dev)) + IRQF_ONESHOT, dev->irq_name, dev)) return -EIO; return 0; } @@ -256,7 +256,7 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, dev->host_irq = dev->dev->irq; if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, - 0, dev->irq_name, (void *)dev)) { + 0, dev->irq_name, dev)) { pci_disable_msi(dev->dev); return -EIO; } @@ -283,7 +283,7 @@ static int assigned_device_enable_host_msix(struct kvm *kvm, for (i = 0; i < dev->entries_nr; i++) { r = request_threaded_irq(dev->host_msix_entries[i].vector, NULL, kvm_assigned_dev_thread, - 0, dev->irq_name, (void *)dev); + 0, dev->irq_name, dev); if (r) goto err; } @@ -291,7 +291,7 @@ static int assigned_device_enable_host_msix(struct kvm *kvm, return 0; err: for (i -= 1; i >= 0; i--) - free_irq(dev->host_msix_entries[i].vector, (void *)dev); + free_irq(dev->host_msix_entries[i].vector, dev); pci_disable_msix(dev->dev); return r; } -- cgit v0.10.2 From c61fa9d63b40b06522c5f1d940b084323a090688 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 11 Sep 2011 10:16:20 +0200 Subject: KVM: Avoid needless registrations of IRQ ack notifier for assigned devices We only perform work in kvm_assigned_dev_ack_irq if the guest IRQ is of INTx type. This completely avoids the callback invocation in non-INTx cases by registering the IRQ ack notifier only for INTx. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index ea1bf5d..84ead54 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -86,13 +86,9 @@ static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) /* Ack the irq line for an assigned device */ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) { - struct kvm_assigned_dev_kernel *dev; - - if (kian->gsi == -1) - return; - - dev = container_of(kian, struct kvm_assigned_dev_kernel, - ack_notifier); + struct kvm_assigned_dev_kernel *dev = + container_of(kian, struct kvm_assigned_dev_kernel, + ack_notifier); kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); @@ -110,8 +106,9 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) static void deassign_guest_irq(struct kvm *kvm, struct kvm_assigned_dev_kernel *assigned_dev) { - kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); - assigned_dev->ack_notifier.gsi = -1; + if (assigned_dev->ack_notifier.gsi != -1) + kvm_unregister_irq_ack_notifier(kvm, + &assigned_dev->ack_notifier); kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, assigned_dev->guest_irq, 0); @@ -404,7 +401,8 @@ static int assign_guest_irq(struct kvm *kvm, if (!r) { dev->irq_requested_type |= guest_irq_type; - kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); + if (dev->ack_notifier.gsi != -1) + kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); } else kvm_free_irq_source_id(kvm, dev->irq_source_id); -- cgit v0.10.2 From caa8a168e35650961b9b0d43b9b6fc2279351949 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 11 Sep 2011 11:23:02 +0300 Subject: KVM: x86 emulator: disable writeback for TEST The TEST instruction doesn't write its destination operand. This could cause problems if an MMIO register was accessed using the TEST instruction. Recently Windows XP was observed to use TEST against the APIC ICR; this can cause spurious IPIs. 
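
To make the failure mode concrete, the following self-contained model (illustrative only; OP_TYPE_NONE and OP_TYPE_MEM stand in for the emulator's OP_NONE and memory operand types) shows how a generic writeback stage would turn an emulated TEST against a memory-mapped register into a spurious store, and how clearing the destination type suppresses it:

  /* Illustrative model only -- not the kernel emulator's real types. */
  enum op_type { OP_TYPE_NONE, OP_TYPE_MEM };

  struct operand_model {
          enum op_type type;
          unsigned long val;
  };

  static int icr_mmio_writes;     /* stands in for the guest's APIC ICR */

  static void mmio_write(unsigned long val)
  {
          icr_mmio_writes++;      /* every store here would send an IPI */
  }

  static void emulate_test(struct operand_model *dst, struct operand_model *src,
                           unsigned long *eflags)
  {
          *eflags = ((dst->val & src->val) == 0);  /* flags only, simplified */
          dst->type = OP_TYPE_NONE;                /* the fix: no writeback */
  }

  static void writeback(struct operand_model *dst)
  {
          if (dst->type == OP_TYPE_MEM)   /* skipped once the type is cleared */
                  mmio_write(dst->val);
  }

  int main(void)
  {
          struct operand_model dst = { OP_TYPE_MEM, 0xff };
          struct operand_model src = { OP_TYPE_MEM, 0x01 };
          unsigned long eflags;

          emulate_test(&dst, &src, &eflags);
          writeback(&dst);
          return icr_mmio_writes;         /* 0: the TEST left the ICR alone */
  }
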
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e10fd37..af06539 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1670,6 +1670,8 @@ static int em_grp3(struct x86_emulate_ctxt *ctxt) switch (ctxt->modrm_reg) { case 0 ... 1: /* test */ emulate_2op_SrcV(ctxt, "test"); + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; break; case 2: /* not */ ctxt->dst.val = ~ctxt->dst.val; @@ -2513,6 +2515,8 @@ static int em_cmp(struct x86_emulate_ctxt *ctxt) static int em_test(struct x86_emulate_ctxt *ctxt) { emulate_2op_SrcV(ctxt, "test"); + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; } -- cgit v0.10.2 From 1e2b1dd797f9356f779268ecc1ba21110d4e341c Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 12 Sep 2011 10:52:24 +0200 Subject: KVM: x86: Move kvm_trace_exit into atomic vmexit section This avoids that events causing the vmexit are recorded before the actual exit reason. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8277f32..e7ed4b1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3335,8 +3335,6 @@ static int handle_exit(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; u32 exit_code = svm->vmcb->control.exit_code; - trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); - if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) vcpu->arch.cr0 = svm->vmcb->save.cr0; if (npt_enabled) @@ -3790,6 +3788,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; + trace_kvm_exit(svm->vmcb->control.exit_code, vcpu, KVM_ISA_SVM); + if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_handle_nmi(&svm->vcpu); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 47419d6..21217b6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5739,8 +5739,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; - trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); - /* If guest state is invalid, start emulating */ if (vmx->emulation_required && emulate_invalid_guest_state) return handle_invalid_guest_state(vcpu); @@ -6144,6 +6142,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->launched = 1; vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); vmx_complete_atomic_exit(vmx); vmx_recover_nmi_blocking(vmx); -- cgit v0.10.2 From 7712de872c8ec00a657b867ab0296913f69addac Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 12 Sep 2011 11:25:51 +0200 Subject: KVM: x86: Avoid guest-triggerable printks in APIC model Convert remaining printks that the guest can trigger to apic_printk. 
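
The pattern behind this and the next patch is worth spelling out: output that a guest can provoke at will must either be compiled out by default, as the lapic.c apic_debug() helper effectively is, or be rate-limited. A sketch of the two idioms follows (not the exact definitions in the tree; APIC_DEBUG is an assumed local switch, while pr_pic_unimpl matches the helper added to i8259.c in the next patch):

  /* 1. Debug-only output: a no-op unless a developer enables it locally. */
  #ifdef APIC_DEBUG
  #define apic_debug(fmt, arg...) printk(KERN_DEBUG fmt, ##arg)
  #else
  #define apic_debug(fmt, arg...) do { } while (0)
  #endif

  /* 2. Rate-limited output for "guest did something unsupported" cases. */
  #define pr_pic_unimpl(fmt, ...) \
          pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__)
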
Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 52645f2..4b53b81 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -316,8 +316,8 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) result = 1; break; default: - printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n", - apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); + apic_debug("Bad DFR vcpu %d: %08x\n", + apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); break; } @@ -354,8 +354,8 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, result = (target != source); break; default: - printk(KERN_WARNING "Bad dest shorthand value %x\n", - short_hand); + apic_debug("kvm: apic: Bad dest shorthand value %x\n", + short_hand); break; } @@ -401,11 +401,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; case APIC_DM_REMRD: - printk(KERN_DEBUG "Ignoring delivery mode 3\n"); + apic_debug("Ignoring delivery mode 3\n"); break; case APIC_DM_SMI: - printk(KERN_DEBUG "Ignoring guest SMI\n"); + apic_debug("Ignoring guest SMI\n"); break; case APIC_DM_NMI: @@ -565,8 +565,7 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) val = kvm_apic_id(apic) << 24; break; case APIC_ARBPRI: - printk(KERN_WARNING "Access APIC ARBPRI register " - "which is for P6\n"); + apic_debug("Access APIC ARBPRI register which is for P6\n"); break; case APIC_TMCCT: /* Timer CCR */ @@ -804,14 +803,14 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_TDCR: if (val & 4) - printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val); + apic_debug("KVM_WRITE:TDCR %x\n", val); apic_set_reg(apic, APIC_TDCR, val); update_divide_count(apic); break; case APIC_ESR: if (apic_x2apic_mode(apic) && val != 0) { - printk(KERN_ERR "KVM_WRITE:ESR not zero %x\n", val); + apic_debug("KVM_WRITE:ESR not zero %x\n", val); ret = 1; } break; -- cgit v0.10.2 From bd80158aff71a80292f96d9baea1a65bc0ce87b3 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 12 Sep 2011 11:26:22 +0200 Subject: KVM: Clean up and extend rate-limited output The use of printk_ratelimit is discouraged, replace it with pr*_ratelimited or __ratelimit. While at it, convert remaining guest-triggerable printks to rate-limited variants. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 6b869ce..cac4746 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -34,6 +34,9 @@ #include #include "trace.h" +#define pr_pic_unimpl(fmt, ...) 
\ + pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__) + static void pic_irq_request(struct kvm *kvm, int level); static void pic_lock(struct kvm_pic *s) @@ -306,10 +309,10 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) } s->init_state = 1; if (val & 0x02) - printk(KERN_ERR "single mode not supported"); + pr_pic_unimpl("single mode not supported"); if (val & 0x08) - printk(KERN_ERR - "level sensitive irq not supported"); + pr_pic_unimpl( + "level sensitive irq not supported"); } else if (val & 0x08) { if (val & 0x04) s->poll = 1; @@ -467,8 +470,7 @@ static int picdev_write(struct kvm_pic *s, return -EOPNOTSUPP; if (len != 1) { - if (printk_ratelimit()) - printk(KERN_ERR "PIC: non byte write\n"); + pr_pic_unimpl("non byte write\n"); return 0; } pic_lock(s); @@ -496,8 +498,7 @@ static int picdev_read(struct kvm_pic *s, return -EOPNOTSUPP; if (len != 1) { - if (printk_ratelimit()) - printk(KERN_ERR "PIC: non byte read\n"); + pr_pic_unimpl("non byte read\n"); return 0; } pic_lock(s); diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 2460a26..746ec25 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -121,16 +121,16 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) { + static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); unsigned long *rmapp; struct kvm_mmu_page *rev_sp; gfn_t gfn; - rev_sp = page_header(__pa(sptep)); gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); if (!gfn_to_memslot(kvm, gfn)) { - if (!printk_ratelimit()) + if (!__ratelimit(&ratelimit_state)) return; audit_printk(kvm, "no memslot for gfn %llx\n", gfn); audit_printk(kvm, "index %ld of sp (gfn=%llx)\n", @@ -141,7 +141,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); if (!*rmapp) { - if (!printk_ratelimit()) + if (!__ratelimit(&ratelimit_state)) return; audit_printk(kvm, "no rmap for writable spte %llx\n", *sptep); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 21217b6..a0d6bd9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2762,8 +2762,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu) guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { - printk(KERN_DEBUG "%s: tss fixup for long mode. \n", - __func__); + pr_debug_ratelimited("%s: tss fixup for long mode. 
\n", + __func__); vmcs_write32(GUEST_TR_AR_BYTES, (guest_tr_ar & ~AR_TYPE_MASK) | AR_TYPE_BUSY_64_TSS); @@ -5634,8 +5634,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return 0; if (unlikely(vmx->fail)) { - printk(KERN_INFO "%s failed vm entry %x\n", - __func__, vmcs_read32(VM_INSTRUCTION_ERROR)); + pr_info_ratelimited("%s failed vm entry %x\n", __func__, + vmcs_read32(VM_INSTRUCTION_ERROR)); return 1; } @@ -6612,9 +6612,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (vmcs12->vm_entry_msr_load_count > 0 || vmcs12->vm_exit_msr_load_count > 0 || vmcs12->vm_exit_msr_store_count > 0) { - if (printk_ratelimit()) - printk(KERN_WARNING - "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__); + pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n", + __func__); nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d0e42f3..2a414f6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -282,11 +283,8 @@ struct kvm { /* The guest did something we don't support. */ #define pr_unimpl(vcpu, fmt, ...) \ - do { \ - if (printk_ratelimit()) \ - printk(KERN_ERR "kvm: %i: cpu%i " fmt, \ - current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \ - } while (0) + pr_err_ratelimited("kvm: %i: cpu%i " fmt, \ + current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__) #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) -- cgit v0.10.2 From 9bc5791d4aa78b72abd76f506c0a391a54abc4d1 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 12 Sep 2011 14:10:22 +0200 Subject: KVM: x86: Add module parameter for lapic periodic timer limit Certain guests, specifically RTOSes, request faster periodic timers than what we allow by default. Add a module parameter to adjust the limit for non-standard setups. Also add a rate-limited warning in case the guest requested more. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4b53b81..2fb20ca 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -68,6 +68,9 @@ #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) +static unsigned int min_timer_period_us = 500; +module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); + static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) { return *((u32 *) (apic->regs + reg_off)); @@ -677,8 +680,16 @@ static void start_apic_timer(struct kvm_lapic *apic) * scheduler. */ if (apic_lvtt_period(apic)) { - if (apic->lapic_timer.period < NSEC_PER_MSEC/2) - apic->lapic_timer.period = NSEC_PER_MSEC/2; + s64 min_period = min_timer_period_us * 1000LL; + + if (apic->lapic_timer.period < min_period) { + pr_info_ratelimited( + "kvm: vcpu %i: requested %lld ns " + "lapic timer period limited to %lld ns\n", + apic->vcpu->vcpu_id, apic->lapic_timer.period, + min_period); + apic->lapic_timer.period = min_period; + } } hrtimer_start(&apic->lapic_timer.timer, -- cgit v0.10.2 From cc0793968a602c9b40f7b99c401674750aa99e06 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 12 Sep 2011 18:57:56 +0200 Subject: KVM: Split up MSI-X assigned device IRQ handler The threaded IRQ handler for MSI-X has almost nothing in common with the INTx/MSI handler. Move its code into a dedicated handler. 
Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index 84ead54..7debe8c 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -58,8 +58,6 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - u32 vector; - int index; if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) { spin_lock(&assigned_dev->intx_lock); @@ -68,20 +66,28 @@ static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) spin_unlock(&assigned_dev->intx_lock); } - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - index = find_index_from_host_irq(assigned_dev, irq); - if (index >= 0) { - vector = assigned_dev-> - guest_msix_entries[index].vector; - kvm_set_irq(assigned_dev->kvm, - assigned_dev->irq_source_id, vector, 1); - } - } else + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, + assigned_dev->guest_irq, 1); + + return IRQ_HANDLED; +} + +#ifdef __KVM_HAVE_MSIX +static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + int index = find_index_from_host_irq(assigned_dev, irq); + u32 vector; + + if (index >= 0) { + vector = assigned_dev->guest_msix_entries[index].vector; kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - assigned_dev->guest_irq, 1); + vector, 1); + } return IRQ_HANDLED; } +#endif /* Ack the irq line for an assigned device */ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) @@ -279,7 +285,7 @@ static int assigned_device_enable_host_msix(struct kvm *kvm, for (i = 0; i < dev->entries_nr; i++) { r = request_threaded_irq(dev->host_msix_entries[i].vector, - NULL, kvm_assigned_dev_thread, + NULL, kvm_assigned_dev_thread_msix, 0, dev->irq_name, dev); if (r) goto err; -- cgit v0.10.2 From 3329ece161ad65ea31d825720e270f3a79ebba92 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:39 +0300 Subject: KVM: x86 emulator: convert group 3 instructions to direct decode Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index af06539..ed819bd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1663,37 +1663,49 @@ static int em_grp2(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_grp3(struct x86_emulate_ctxt *ctxt) +static int em_not(struct x86_emulate_ctxt *ctxt) +{ + ctxt->dst.val = ~ctxt->dst.val; + return X86EMUL_CONTINUE; +} + +static int em_neg(struct x86_emulate_ctxt *ctxt) +{ + emulate_1op(ctxt, "neg"); + return X86EMUL_CONTINUE; +} + +static int em_mul_ex(struct x86_emulate_ctxt *ctxt) +{ + u8 ex = 0; + + emulate_1op_rax_rdx(ctxt, "mul", ex); + return X86EMUL_CONTINUE; +} + +static int em_imul_ex(struct x86_emulate_ctxt *ctxt) +{ + u8 ex = 0; + + emulate_1op_rax_rdx(ctxt, "imul", ex); + return X86EMUL_CONTINUE; +} + +static int em_div_ex(struct x86_emulate_ctxt *ctxt) { u8 de = 0; - switch (ctxt->modrm_reg) { - case 0 ... 1: /* test */ - emulate_2op_SrcV(ctxt, "test"); - /* Disable writeback. 
*/ - ctxt->dst.type = OP_NONE; - break; - case 2: /* not */ - ctxt->dst.val = ~ctxt->dst.val; - break; - case 3: /* neg */ - emulate_1op(ctxt, "neg"); - break; - case 4: /* mul */ - emulate_1op_rax_rdx(ctxt, "mul", de); - break; - case 5: /* imul */ - emulate_1op_rax_rdx(ctxt, "imul", de); - break; - case 6: /* div */ - emulate_1op_rax_rdx(ctxt, "div", de); - break; - case 7: /* idiv */ - emulate_1op_rax_rdx(ctxt, "idiv", de); - break; - default: - return X86EMUL_UNHANDLEABLE; - } + emulate_1op_rax_rdx(ctxt, "div", de); + if (de) + return emulate_de(ctxt); + return X86EMUL_CONTINUE; +} + +static int em_idiv_ex(struct x86_emulate_ctxt *ctxt) +{ + u8 de = 0; + + emulate_1op_rax_rdx(ctxt, "idiv", de); if (de) return emulate_de(ctxt); return X86EMUL_CONTINUE; @@ -2989,9 +3001,14 @@ static struct opcode group1A[] = { }; static struct opcode group3[] = { - D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), - D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), - X4(D(SrcMem | ModRM)), + I(DstMem | SrcImm | ModRM, em_test), + I(DstMem | SrcImm | ModRM, em_test), + I(DstMem | SrcNone | ModRM | Lock, em_not), + I(DstMem | SrcNone | ModRM | Lock, em_neg), + I(SrcMem | ModRM, em_mul_ex), + I(SrcMem | ModRM, em_imul_ex), + I(SrcMem | ModRM, em_div_ex), + I(SrcMem | ModRM, em_idiv_ex), }; static struct opcode group4[] = { @@ -3917,9 +3934,6 @@ special_insn: /* complement carry flag from eflags reg */ ctxt->eflags ^= EFLG_CF; break; - case 0xf6 ... 0xf7: /* Grp3 */ - rc = em_grp3(ctxt); - break; case 0xf8: /* clc */ ctxt->eflags &= ~EFLG_CF; break; -- cgit v0.10.2 From f09ed83e211d253809e575e05bd4de1e335c0cb2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:40 +0300 Subject: KVM: x86 emulator: move memop, memopp into emulation context Simplifies further generalization of decode. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 6040d11..56bac3e 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -275,6 +275,8 @@ struct x86_emulate_ctxt { unsigned long _eip; /* Fields above regs are cleared together. */ unsigned long regs[NR_VCPU_REGS]; + struct operand memop; + struct operand *memopp; struct fetch_cache fetch; struct read_cache io_read; struct read_cache mem_read; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ed819bd..58172fb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3323,8 +3323,9 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) int def_op_bytes, def_ad_bytes, goffset, simd_prefix; bool op_prefix = false; struct opcode opcode; - struct operand memop = { .type = OP_NONE }, *memopp = NULL; + ctxt->memop.type = OP_NONE; + ctxt->memopp = NULL; ctxt->_eip = ctxt->eip; ctxt->fetch.start = ctxt->_eip; ctxt->fetch.end = ctxt->fetch.start + insn_len; @@ -3482,21 +3483,21 @@ done_prefixes: /* ModRM and SIB bytes. 
*/ if (ctxt->d & ModRM) { - rc = decode_modrm(ctxt, &memop); + rc = decode_modrm(ctxt, &ctxt->memop); if (!ctxt->has_seg_override) set_seg_override(ctxt, ctxt->modrm_seg); } else if (ctxt->d & MemAbs) - rc = decode_abs(ctxt, &memop); + rc = decode_abs(ctxt, &ctxt->memop); if (rc != X86EMUL_CONTINUE) goto done; if (!ctxt->has_seg_override) set_seg_override(ctxt, VCPU_SREG_DS); - memop.addr.mem.seg = seg_override(ctxt); + ctxt->memop.addr.mem.seg = seg_override(ctxt); - if (memop.type == OP_MEM && ctxt->ad_bytes != 8) - memop.addr.mem.ea = (u32)memop.addr.mem.ea; + if (ctxt->memop.type == OP_MEM && ctxt->ad_bytes != 8) + ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea; /* * Decode and fetch the source operand: register, memory @@ -3509,17 +3510,16 @@ done_prefixes: decode_register_operand(ctxt, &ctxt->src, 0); break; case SrcMem16: - memop.bytes = 2; + ctxt->memop.bytes = 2; goto srcmem_common; case SrcMem32: - memop.bytes = 4; + ctxt->memop.bytes = 4; goto srcmem_common; case SrcMem: - memop.bytes = (ctxt->d & ByteOp) ? 1 : - ctxt->op_bytes; + ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; srcmem_common: - ctxt->src = memop; - memopp = &ctxt->src; + ctxt->src = ctxt->memop; + ctxt->memopp = &ctxt->src; break; case SrcImmU16: rc = decode_imm(ctxt, &ctxt->src, 2, false); @@ -3561,7 +3561,7 @@ done_prefixes: insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt); break; case SrcMemFAddr: - memop.bytes = ctxt->op_bytes + 2; + ctxt->memop.bytes = ctxt->op_bytes + 2; goto srcmem_common; break; case SrcDX: @@ -3615,8 +3615,8 @@ done_prefixes: break; case DstMem: case DstMem64: - ctxt->dst = memop; - memopp = &ctxt->dst; + ctxt->dst = ctxt->memop; + ctxt->memopp = &ctxt->dst; if ((ctxt->d & DstMask) == DstMem64) ctxt->dst.bytes = 8; else @@ -3654,8 +3654,8 @@ done_prefixes: } done: - if (memopp && memopp->type == OP_MEM && ctxt->rip_relative) - memopp->addr.mem.ea += ctxt->_eip; + if (ctxt->memopp && ctxt->memopp->type == OP_MEM && ctxt->rip_relative) + ctxt->memopp->addr.mem.ea += ctxt->_eip; return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; } -- cgit v0.10.2 From a99455499a86bde28fccd84e2119be9cd7c23a3f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:41 +0300 Subject: KVM: x86 emulator: split dst decode to a generic decode_operand() Instead of decoding each operand using its own code, use a generic function. Start with the destination operand. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 58172fb..6a6aed9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -29,6 +29,22 @@ #include "tss.h" /* + * Operand types + */ +#define OpNone 0 +#define OpImplicit 1 /* No generic decode */ +#define OpReg 2 /* Register */ +#define OpMem 3 /* Memory */ +#define OpAcc 4 /* Accumulator: AL/AX/EAX/RAX */ +#define OpDI 5 /* ES:DI/EDI/RDI */ +#define OpMem64 6 /* Memory, 64-bit */ +#define OpImmUByte 7 /* Zero-extended 8-bit immediate */ +#define OpDX 8 /* DX register */ + +#define OpBits 4 /* Width of operand field */ +#define OpMask ((1 << OpBits) - 1) + +/* * Opcode effective-address decode tables. * Note that we only emulate instructions that have at least one memory * operand (excluding implicit stack references). We assume that stack @@ -40,15 +56,16 @@ /* Operand sizes: 8-bit operands or specified/overridden size. */ #define ByteOp (1<<0) /* 8-bit operands. */ /* Destination operand type. */ -#define ImplicitOps (1<<1) /* Implicit in opcode. 
No generic decode. */ -#define DstReg (2<<1) /* Register operand. */ -#define DstMem (3<<1) /* Memory operand. */ -#define DstAcc (4<<1) /* Destination Accumulator */ -#define DstDI (5<<1) /* Destination is in ES:(E)DI */ -#define DstMem64 (6<<1) /* 64bit memory operand */ -#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */ -#define DstDX (8<<1) /* Destination is in DX register */ -#define DstMask (0xf<<1) +#define DstShift 1 +#define ImplicitOps (OpImplicit << DstShift) +#define DstReg (OpReg << DstShift) +#define DstMem (OpMem << DstShift) +#define DstAcc (OpAcc << DstShift) +#define DstDI (OpDI << DstShift) +#define DstMem64 (OpMem64 << DstShift) +#define DstImmUByte (OpImmUByte << DstShift) +#define DstDX (OpDX << DstShift) +#define DstMask (OpMask << DstShift) /* Source operand type. */ #define SrcNone (0<<5) /* No source operand. */ #define SrcReg (1<<5) /* Register operand. */ @@ -3316,6 +3333,66 @@ done: return rc; } +static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, + unsigned d) +{ + int rc = X86EMUL_CONTINUE; + + switch (d) { + case OpReg: + decode_register_operand(ctxt, op, + ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7)); + break; + case OpImmUByte: + op->type = OP_IMM; + op->addr.mem.ea = ctxt->_eip; + op->bytes = 1; + op->val = insn_fetch(u8, ctxt); + break; + case OpMem: + case OpMem64: + *op = ctxt->memop; + ctxt->memopp = op; + if (d == OpMem64) + op->bytes = 8; + else + op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + if (ctxt->d & BitOp) + fetch_bit_operand(ctxt); + op->orig_val = op->val; + break; + case OpAcc: + op->type = OP_REG; + op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + op->addr.reg = &ctxt->regs[VCPU_REGS_RAX]; + fetch_register_operand(op); + op->orig_val = op->val; + break; + case OpDI: + op->type = OP_MEM; + op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + op->addr.mem.ea = + register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]); + op->addr.mem.seg = VCPU_SREG_ES; + op->val = 0; + break; + case OpDX: + op->type = OP_REG; + op->bytes = 2; + op->addr.reg = &ctxt->regs[VCPU_REGS_RDX]; + fetch_register_operand(op); + break; + case OpImplicit: + /* Special instructions do their own operand decoding. */ + default: + op->type = OP_NONE; /* Disable writeback. */ + break; + } + +done: + return rc; +} + int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) { int rc = X86EMUL_CONTINUE; @@ -3602,56 +3679,7 @@ done_prefixes: goto done; /* Decode and fetch the destination operand: register or memory. */ - switch (ctxt->d & DstMask) { - case DstReg: - decode_register_operand(ctxt, &ctxt->dst, - ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7)); - break; - case DstImmUByte: - ctxt->dst.type = OP_IMM; - ctxt->dst.addr.mem.ea = ctxt->_eip; - ctxt->dst.bytes = 1; - ctxt->dst.val = insn_fetch(u8, ctxt); - break; - case DstMem: - case DstMem64: - ctxt->dst = ctxt->memop; - ctxt->memopp = &ctxt->dst; - if ((ctxt->d & DstMask) == DstMem64) - ctxt->dst.bytes = 8; - else - ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - if (ctxt->d & BitOp) - fetch_bit_operand(ctxt); - ctxt->dst.orig_val = ctxt->dst.val; - break; - case DstAcc: - ctxt->dst.type = OP_REG; - ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RAX]; - fetch_register_operand(&ctxt->dst); - ctxt->dst.orig_val = ctxt->dst.val; - break; - case DstDI: - ctxt->dst.type = OP_MEM; - ctxt->dst.bytes = (ctxt->d & ByteOp) ? 
1 : ctxt->op_bytes; - ctxt->dst.addr.mem.ea = - register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]); - ctxt->dst.addr.mem.seg = VCPU_SREG_ES; - ctxt->dst.val = 0; - break; - case DstDX: - ctxt->dst.type = OP_REG; - ctxt->dst.bytes = 2; - ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; - fetch_register_operand(&ctxt->dst); - break; - case ImplicitOps: - /* Special instructions do their own operand decoding. */ - default: - ctxt->dst.type = OP_NONE; /* Disable writeback. */ - break; - } + rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask); done: if (ctxt->memopp && ctxt->memopp->type == OP_MEM && ctxt->rip_relative) -- cgit v0.10.2 From b1ea50b2b63a95aa5a7944b48ba4d0e9b32211d3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:42 +0300 Subject: KVM: x86 emulator: expand decode flags to 64 bits Unifiying the operands means not taking advantage of the fact that some operand types can only go into certain operands (for example, DI can only be used by the destination), so we need more bits to hold the operand type. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 56bac3e..a026507 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -262,7 +262,7 @@ struct x86_emulate_ctxt { struct operand dst; bool has_seg_override; u8 seg_override; - unsigned int d; + u64 d; int (*execute)(struct x86_emulate_ctxt *ctxt); int (*check_perm)(struct x86_emulate_ctxt *ctxt); /* modrm */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6a6aed9..8c65ff2 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -31,18 +31,18 @@ /* * Operand types */ -#define OpNone 0 -#define OpImplicit 1 /* No generic decode */ -#define OpReg 2 /* Register */ -#define OpMem 3 /* Memory */ -#define OpAcc 4 /* Accumulator: AL/AX/EAX/RAX */ -#define OpDI 5 /* ES:DI/EDI/RDI */ -#define OpMem64 6 /* Memory, 64-bit */ -#define OpImmUByte 7 /* Zero-extended 8-bit immediate */ -#define OpDX 8 /* DX register */ +#define OpNone 0ull +#define OpImplicit 1ull /* No generic decode */ +#define OpReg 2ull /* Register */ +#define OpMem 3ull /* Memory */ +#define OpAcc 4ull /* Accumulator: AL/AX/EAX/RAX */ +#define OpDI 5ull /* ES:DI/EDI/RDI */ +#define OpMem64 6ull /* Memory, 64-bit */ +#define OpImmUByte 7ull /* Zero-extended 8-bit immediate */ +#define OpDX 8ull /* DX register */ #define OpBits 4 /* Width of operand field */ -#define OpMask ((1 << OpBits) - 1) +#define OpMask ((1ull << OpBits) - 1) /* * Opcode effective-address decode tables. @@ -108,12 +108,12 @@ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) /* Source 2 operand type */ -#define Src2None (0<<29) -#define Src2CL (1<<29) -#define Src2ImmByte (2<<29) -#define Src2One (3<<29) -#define Src2Imm (4<<29) -#define Src2Mask (7<<29) +#define Src2None (0u<<29) +#define Src2CL (1u<<29) +#define Src2ImmByte (2u<<29) +#define Src2One (3u<<29) +#define Src2Imm (4u<<29) +#define Src2Mask (7u<<29) #define X2(x...) x, x #define X3(x...) X2(x), x @@ -125,8 +125,8 @@ #define X16(x...) 
X8(x), X8(x) struct opcode { - u32 flags; - u8 intercept; + u64 flags : 56; + u64 intercept : 8; union { int (*execute)(struct x86_emulate_ctxt *ctxt); struct opcode *group; @@ -3530,7 +3530,7 @@ done_prefixes: return EMULATION_FAILED; } - ctxt->d &= ~GroupMask; + ctxt->d &= ~(u64)GroupMask; ctxt->d |= opcode.flags; } -- cgit v0.10.2 From 4dd6a57df7ca9088a4b14664764e7adb9c120bb1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:43 +0300 Subject: KVM: x86 emulator: switch src2 to generic decode_operand() Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8c65ff2..88d32fc 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -40,6 +40,10 @@ #define OpMem64 6ull /* Memory, 64-bit */ #define OpImmUByte 7ull /* Zero-extended 8-bit immediate */ #define OpDX 8ull /* DX register */ +#define OpCL 9ull /* CL register (for shifts) */ +#define OpImmByte 10ull /* 8-bit sign extended immediate */ +#define OpOne 11ull /* Implied 1 */ +#define OpImm 12ull /* Sign extended immediate */ #define OpBits 4 /* Width of operand field */ #define OpMask ((1ull << OpBits) - 1) @@ -108,12 +112,13 @@ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) /* Source 2 operand type */ -#define Src2None (0u<<29) -#define Src2CL (1u<<29) -#define Src2ImmByte (2u<<29) -#define Src2One (3u<<29) -#define Src2Imm (4u<<29) -#define Src2Mask (7u<<29) +#define Src2Shift (29) +#define Src2None (OpNone << Src2Shift) +#define Src2CL (OpCL << Src2Shift) +#define Src2ImmByte (OpImmByte << Src2Shift) +#define Src2One (OpOne << Src2Shift) +#define Src2Imm (OpImm << Src2Shift) +#define Src2Mask (OpMask << Src2Shift) #define X2(x...) x, x #define X3(x...) X2(x), x @@ -3382,6 +3387,20 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->addr.reg = &ctxt->regs[VCPU_REGS_RDX]; fetch_register_operand(op); break; + case OpCL: + op->bytes = 1; + op->val = ctxt->regs[VCPU_REGS_RCX] & 0xff; + break; + case OpImmByte: + rc = decode_imm(ctxt, op, 1, true); + break; + case OpOne: + op->bytes = 1; + op->val = 1; + break; + case OpImm: + rc = decode_imm(ctxt, op, imm_size(ctxt), true); + break; case OpImplicit: /* Special instructions do their own operand decoding. */ default: @@ -3656,25 +3675,7 @@ done_prefixes: * Decode and fetch the second source operand: register, memory * or immediate. */ - switch (ctxt->d & Src2Mask) { - case Src2None: - break; - case Src2CL: - ctxt->src2.bytes = 1; - ctxt->src2.val = ctxt->regs[VCPU_REGS_RCX] & 0xff; - break; - case Src2ImmByte: - rc = decode_imm(ctxt, &ctxt->src2, 1, true); - break; - case Src2One: - ctxt->src2.bytes = 1; - ctxt->src2.val = 1; - break; - case Src2Imm: - rc = decode_imm(ctxt, &ctxt->src2, imm_size(ctxt), true); - break; - } - + rc = decode_operand(ctxt, &ctxt->src2, (ctxt->d >> Src2Shift) & OpMask); if (rc != X86EMUL_CONTINUE) goto done; -- cgit v0.10.2 From 20c29ff205ea9c4e2bf4ef79b0e5b934d6c60aff Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:44 +0300 Subject: KVM: x86 emulator: free up some flag bits near src, dst Op fields are going to grow by a bit, we need two free bits. 
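
Before the flags grow further, the packing scheme can be seen in isolation. The sketch below is self-contained userspace code; the constant names are local stand-ins for the OpBits/OpMask, DstShift and Src2Shift macros visible in the diffs above (OP_BITS is 4 at this point in the series; the final patch of this group widens it to 5 to make room for more operand types), and the unpacking mirrors what x86_decode_insn() does with (ctxt->d >> DstShift) & OpMask:

  #include <stdint.h>
  #include <stdio.h>

  #define OP_BITS         4
  #define OP_MASK         ((1ull << OP_BITS) - 1)
  #define DST_SHIFT       1
  #define SRC2_SHIFT      29

  #define OP_REG          2ull    /* OpReg in the tables above */
  #define OP_CL           9ull    /* OpCL: CL register, used by shld/shrd */

  int main(void)
  {
          /* Pack a destination type and a second-source type into one word. */
          uint64_t d = (OP_REG << DST_SHIFT) | (OP_CL << SRC2_SHIFT);

          /* Unpack the same way the decoder dispatches to decode_operand(). */
          printf("dst  operand type = %llu\n",
                 (unsigned long long)((d >> DST_SHIFT) & OP_MASK));
          printf("src2 operand type = %llu\n",
                 (unsigned long long)((d >> SRC2_SHIFT) & OP_MASK));
          return 0;
  }
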
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 88d32fc..00e0904 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -88,10 +88,6 @@ #define SrcImmU16 (0xe<<5) /* Immediate operand, unsigned, 16 bits */ #define SrcDX (0xf<<5) /* Source is in DX register */ #define SrcMask (0xf<<5) -/* Generic ModRM decode. */ -#define ModRM (1<<9) -/* Destination is only written; never read. */ -#define Mov (1<<10) #define BitOp (1<<11) #define MemAbs (1<<12) /* Memory operand is absolute displacement */ #define String (1<<13) /* String instruction (rep capable) */ @@ -102,6 +98,10 @@ #define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */ #define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ #define Sse (1<<18) /* SSE Vector instruction */ +/* Generic ModRM decode. */ +#define ModRM (1<<19) +/* Destination is only written; never read. */ +#define Mov (1<<20) /* Misc flags */ #define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ #define VendorSpecific (1<<22) /* Vendor specific instruction */ -- cgit v0.10.2 From 608aabe316eab9116ddd73418217277693df3bb8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:45 +0300 Subject: KVM: x86 emulator: switch OpImmUByte decode to decode_imm() Similar to SrcImmUByte. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 00e0904..a0d6ceb4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3349,10 +3349,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7)); break; case OpImmUByte: - op->type = OP_IMM; - op->addr.mem.ea = ctxt->_eip; - op->bytes = 1; - op->val = insn_fetch(u8, ctxt); + rc = decode_imm(ctxt, op, 1, false); break; case OpMem: case OpMem64: -- cgit v0.10.2 From 5217973ef899ffecdd77743aae3b633994a08cde Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:46 +0300 Subject: KVM: x86 emulator: qualify OpReg inhibit_byte_regs hack OpReg decoding has a hack that inhibits byte registers for movsx and movzx instructions. It should be replaced by something better, but meanwhile, qualify that the hack is only active for the destination operand. Note these instructions only use OpReg for the destination, but better to be explicit about it. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a0d6ceb4..17a8910 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3346,6 +3346,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, switch (d) { case OpReg: decode_register_operand(ctxt, op, + op == &ctxt->dst && ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7)); break; case OpImmUByte: -- cgit v0.10.2 From 0fe591288470aebba4104b17513c9ad5050f9d0d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:47 +0300 Subject: KVM: x86 emulator: switch src decode to decode_operand() Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 17a8910..e46809b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -44,8 +44,15 @@ #define OpImmByte 10ull /* 8-bit sign extended immediate */ #define OpOne 11ull /* Implied 1 */ #define OpImm 12ull /* Sign extended immediate */ - -#define OpBits 4 /* Width of operand field */ +#define OpMem16 13ull /* Memory operand (16-bit). */ +#define OpMem32 14ull /* Memory operand (32-bit). */ +#define OpImmU 15ull /* Immediate operand, zero extended */ +#define OpSI 16ull /* SI/ESI/RSI */ +#define OpImmFAddr 17ull /* Immediate far address */ +#define OpMemFAddr 18ull /* Far address in memory */ +#define OpImmU16 19ull /* Immediate operand, 16 bits, zero extended */ + +#define OpBits 5 /* Width of operand field */ #define OpMask ((1ull << OpBits) - 1) /* @@ -71,23 +78,24 @@ #define DstDX (OpDX << DstShift) #define DstMask (OpMask << DstShift) /* Source operand type. */ -#define SrcNone (0<<5) /* No source operand. */ -#define SrcReg (1<<5) /* Register operand. */ -#define SrcMem (2<<5) /* Memory operand. */ -#define SrcMem16 (3<<5) /* Memory operand (16-bit). */ -#define SrcMem32 (4<<5) /* Memory operand (32-bit). */ -#define SrcImm (5<<5) /* Immediate operand. */ -#define SrcImmByte (6<<5) /* 8-bit sign-extended immediate operand. */ -#define SrcOne (7<<5) /* Implied '1' */ -#define SrcImmUByte (8<<5) /* 8-bit unsigned immediate operand. 
*/ -#define SrcImmU (9<<5) /* Immediate operand, unsigned */ -#define SrcSI (0xa<<5) /* Source is in the DS:RSI */ -#define SrcImmFAddr (0xb<<5) /* Source is immediate far address */ -#define SrcMemFAddr (0xc<<5) /* Source is far address in memory */ -#define SrcAcc (0xd<<5) /* Source Accumulator */ -#define SrcImmU16 (0xe<<5) /* Immediate operand, unsigned, 16 bits */ -#define SrcDX (0xf<<5) /* Source is in DX register */ -#define SrcMask (0xf<<5) +#define SrcShift 6 +#define SrcNone (OpNone << SrcShift) +#define SrcReg (OpReg << SrcShift) +#define SrcMem (OpMem << SrcShift) +#define SrcMem16 (OpMem16 << SrcShift) +#define SrcMem32 (OpMem32 << SrcShift) +#define SrcImm (OpImm << SrcShift) +#define SrcImmByte (OpImmByte << SrcShift) +#define SrcOne (OpOne << SrcShift) +#define SrcImmUByte (OpImmUByte << SrcShift) +#define SrcImmU (OpImmU << SrcShift) +#define SrcSI (OpSI << SrcShift) +#define SrcImmFAddr (OpImmFAddr << SrcShift) +#define SrcMemFAddr (OpMemFAddr << SrcShift) +#define SrcAcc (OpAcc << SrcShift) +#define SrcImmU16 (OpImmU16 << SrcShift) +#define SrcDX (OpDX << SrcShift) +#define SrcMask (OpMask << SrcShift) #define BitOp (1<<11) #define MemAbs (1<<12) /* Memory operand is absolute displacement */ #define String (1<<13) /* String instruction (rep capable) */ @@ -3354,13 +3362,14 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, break; case OpMem: case OpMem64: - *op = ctxt->memop; - ctxt->memopp = op; if (d == OpMem64) - op->bytes = 8; + ctxt->memop.bytes = 8; else - op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - if (ctxt->d & BitOp) + ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + mem_common: + *op = ctxt->memop; + ctxt->memopp = op; + if ((ctxt->d & BitOp) && op == &ctxt->dst) fetch_bit_operand(ctxt); op->orig_val = op->val; break; @@ -3399,6 +3408,35 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, case OpImm: rc = decode_imm(ctxt, op, imm_size(ctxt), true); break; + case OpMem16: + ctxt->memop.bytes = 2; + goto mem_common; + case OpMem32: + ctxt->memop.bytes = 4; + goto mem_common; + case OpImmU16: + rc = decode_imm(ctxt, op, 2, false); + break; + case OpImmU: + rc = decode_imm(ctxt, op, imm_size(ctxt), false); + break; + case OpSI: + op->type = OP_MEM; + op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + op->addr.mem.ea = + register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]); + op->addr.mem.seg = seg_override(ctxt); + op->val = 0; + break; + case OpImmFAddr: + op->type = OP_IMM; + op->addr.mem.ea = ctxt->_eip; + op->bytes = ctxt->op_bytes + 2; + insn_fetch_arr(op->valptr, op->bytes, ctxt); + break; + case OpMemFAddr: + ctxt->memop.bytes = ctxt->op_bytes + 2; + goto mem_common; case OpImplicit: /* Special instructions do their own operand decoding. */ default: @@ -3597,75 +3635,7 @@ done_prefixes: * Decode and fetch the source operand: register, memory * or immediate. */ - switch (ctxt->d & SrcMask) { - case SrcNone: - break; - case SrcReg: - decode_register_operand(ctxt, &ctxt->src, 0); - break; - case SrcMem16: - ctxt->memop.bytes = 2; - goto srcmem_common; - case SrcMem32: - ctxt->memop.bytes = 4; - goto srcmem_common; - case SrcMem: - ctxt->memop.bytes = (ctxt->d & ByteOp) ? 
1 : ctxt->op_bytes; - srcmem_common: - ctxt->src = ctxt->memop; - ctxt->memopp = &ctxt->src; - break; - case SrcImmU16: - rc = decode_imm(ctxt, &ctxt->src, 2, false); - break; - case SrcImm: - rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), true); - break; - case SrcImmU: - rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), false); - break; - case SrcImmByte: - rc = decode_imm(ctxt, &ctxt->src, 1, true); - break; - case SrcImmUByte: - rc = decode_imm(ctxt, &ctxt->src, 1, false); - break; - case SrcAcc: - ctxt->src.type = OP_REG; - ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RAX]; - fetch_register_operand(&ctxt->src); - break; - case SrcOne: - ctxt->src.bytes = 1; - ctxt->src.val = 1; - break; - case SrcSI: - ctxt->src.type = OP_MEM; - ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - ctxt->src.addr.mem.ea = - register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]); - ctxt->src.addr.mem.seg = seg_override(ctxt); - ctxt->src.val = 0; - break; - case SrcImmFAddr: - ctxt->src.type = OP_IMM; - ctxt->src.addr.mem.ea = ctxt->_eip; - ctxt->src.bytes = ctxt->op_bytes + 2; - insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt); - break; - case SrcMemFAddr: - ctxt->memop.bytes = ctxt->op_bytes + 2; - goto srcmem_common; - break; - case SrcDX: - ctxt->src.type = OP_REG; - ctxt->src.bytes = 2; - ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; - fetch_register_operand(&ctxt->src); - break; - } - + rc = decode_operand(ctxt, &ctxt->src, (ctxt->d >> SrcShift) & OpMask); if (rc != X86EMUL_CONTINUE) goto done; -- cgit v0.10.2 From 41ddf9784cb91c9e4d3a218eef3551bebe9c7362 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:48 +0300 Subject: KVM: x86 emulator: simplify OpMem64 decode Use the same technique as the other OpMem variants, and goto mem_common. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e46809b..1c95935 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3361,11 +3361,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, rc = decode_imm(ctxt, op, 1, false); break; case OpMem: - case OpMem64: - if (d == OpMem64) - ctxt->memop.bytes = 8; - else - ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; mem_common: *op = ctxt->memop; ctxt->memopp = op; @@ -3373,6 +3369,9 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, fetch_bit_operand(ctxt); op->orig_val = op->val; break; + case OpMem64: + ctxt->memop.bytes = 8; + goto mem_common; case OpAcc: op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; -- cgit v0.10.2 From c191a7a0f4d3b17cc6cee1d3f721dfe23fc7d6c6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:49 +0300 Subject: KVM: x86 emulator: streamline decode of segment registers The opcodes push %seg pop %seg l%seg, %mem, %reg (e.g. lds/les/lss/lfs/lgs) all have an segment register encoded in the instruction. To allow reuse, decode the segment number into src2 during the decode stage instead of the execution stage. 
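A minimal userspace sketch of that idea follows; the table entries, struct fields and opcode coverage are simplified stand-ins for the real opcode_table[] and em_push_sreg(), but the shape is the same: the decode stage fills src2 with the segment implied by the opcode, and one handler serves every variant.

#include <stdio.h>

/*
 * Toy model of decoding an implied segment register into src2 so that a
 * single handler serves every push %seg variant.  The table and context
 * are simplified stand-ins for the emulator's opcode_table[] machinery.
 */
enum seg { SEG_ES, SEG_CS, SEG_SS, SEG_DS };

struct ctxt {
	int src2;              /* decoded implied operand */
};

struct opcode {
	int src2_seg;          /* segment implied by the opcode */
	int (*execute)(struct ctxt *c);
};

static int em_push_sreg(struct ctxt *c)
{
	/* the handler no longer cares which opcode got us here */
	printf("push segment %d\n", c->src2);
	return 0;
}

/* 0x06 push es, 0x0e push cs, 0x16 push ss, 0x1e push ds */
static const struct opcode table[0x20] = {
	[0x06] = { SEG_ES, em_push_sreg },
	[0x0e] = { SEG_CS, em_push_sreg },
	[0x16] = { SEG_SS, em_push_sreg },
	[0x1e] = { SEG_DS, em_push_sreg },
};

int main(void)
{
	unsigned char insn[] = { 0x06, 0x16, 0x1e };

	for (unsigned i = 0; i < sizeof(insn); i++) {
		struct ctxt c = { .src2 = table[insn[i]].src2_seg };

		table[insn[i]].execute(&c);   /* decode stage filled src2 */
	}
	return 0;
}

This is what lets the later patches in the series collapse the per-opcode push/pop/lseg switch cases into shared em_push_sreg/em_pop_sreg/em_lseg handlers.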
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1c95935..ab48611 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -51,6 +51,12 @@ #define OpImmFAddr 17ull /* Immediate far address */ #define OpMemFAddr 18ull /* Far address in memory */ #define OpImmU16 19ull /* Immediate operand, 16 bits, zero extended */ +#define OpES 20ull /* ES */ +#define OpCS 21ull /* CS */ +#define OpSS 22ull /* SS */ +#define OpDS 23ull /* DS */ +#define OpFS 24ull /* FS */ +#define OpGS 25ull /* GS */ #define OpBits 5 /* Width of operand field */ #define OpMask ((1ull << OpBits) - 1) @@ -126,6 +132,12 @@ #define Src2ImmByte (OpImmByte << Src2Shift) #define Src2One (OpOne << Src2Shift) #define Src2Imm (OpImm << Src2Shift) +#define Src2ES (OpES << Src2Shift) +#define Src2CS (OpCS << Src2Shift) +#define Src2SS (OpSS << Src2Shift) +#define Src2DS (OpDS << Src2Shift) +#define Src2FS (OpFS << Src2Shift) +#define Src2GS (OpGS << Src2Shift) #define Src2Mask (OpMask << Src2Shift) #define X2(x...) x, x @@ -3101,16 +3113,19 @@ static struct gprefix pfx_0f_6f_0f_7f = { static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ I6ALU(Lock, em_add), - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + D(ImplicitOps | Stack | No64 | Src2ES), + D(ImplicitOps | Stack | No64 | Src2ES), /* 0x08 - 0x0F */ I6ALU(Lock, em_or), - D(ImplicitOps | Stack | No64), N, + D(ImplicitOps | Stack | No64 | Src2CS), N, /* 0x10 - 0x17 */ I6ALU(Lock, em_adc), - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + D(ImplicitOps | Stack | No64 | Src2SS), + D(ImplicitOps | Stack | No64 | Src2SS), /* 0x18 - 0x1F */ I6ALU(Lock, em_sbb), - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + D(ImplicitOps | Stack | No64 | Src2DS), + D(ImplicitOps | Stack | No64 | Src2DS), /* 0x20 - 0x27 */ I6ALU(Lock, em_and), N, N, /* 0x28 - 0x2F */ @@ -3178,7 +3193,8 @@ static struct opcode opcode_table[256] = { D2bv(DstMem | SrcImmByte | ModRM), I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), I(ImplicitOps | Stack, em_ret), - D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), + D(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES), + D(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS), G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ N, N, N, I(ImplicitOps | Stack, em_ret_far), @@ -3253,20 +3269,22 @@ static struct opcode twobyte_table[256] = { /* 0x90 - 0x9F */ X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ - D(ImplicitOps | Stack), D(ImplicitOps | Stack), + D(Stack | Src2FS), D(Stack | Src2FS), DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), N, N, /* 0xA8 - 0xAF */ - D(ImplicitOps | Stack), D(ImplicitOps | Stack), + D(Stack | Src2GS), D(Stack | Src2GS), DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ D2bv(DstMem | SrcReg | ModRM | Lock), - D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock), - D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM), + D(DstReg | SrcMemFAddr | ModRM | Src2SS), + D(DstMem | SrcReg | ModRM | BitOp | Lock), + D(DstReg | SrcMemFAddr | ModRM | Src2FS), + D(DstReg | SrcMemFAddr | ModRM | Src2GS), D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF 
*/ N, N, @@ -3436,6 +3454,24 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, case OpMemFAddr: ctxt->memop.bytes = ctxt->op_bytes + 2; goto mem_common; + case OpES: + op->val = VCPU_SREG_ES; + break; + case OpCS: + op->val = VCPU_SREG_CS; + break; + case OpSS: + op->val = VCPU_SREG_SS; + break; + case OpDS: + op->val = VCPU_SREG_DS; + break; + case OpFS: + op->val = VCPU_SREG_FS; + break; + case OpGS: + op->val = VCPU_SREG_GS; + break; case OpImplicit: /* Special instructions do their own operand decoding. */ default: @@ -3803,26 +3839,15 @@ special_insn: switch (ctxt->b) { case 0x06: /* push es */ - rc = emulate_push_sreg(ctxt, VCPU_SREG_ES); - break; - case 0x07: /* pop es */ - rc = emulate_pop_sreg(ctxt, VCPU_SREG_ES); - break; case 0x0e: /* push cs */ - rc = emulate_push_sreg(ctxt, VCPU_SREG_CS); - break; case 0x16: /* push ss */ - rc = emulate_push_sreg(ctxt, VCPU_SREG_SS); - break; - case 0x17: /* pop ss */ - rc = emulate_pop_sreg(ctxt, VCPU_SREG_SS); - break; case 0x1e: /* push ds */ - rc = emulate_push_sreg(ctxt, VCPU_SREG_DS); + rc = emulate_push_sreg(ctxt, ctxt->src2.val); break; + case 0x07: /* pop es */ + case 0x17: /* pop ss */ case 0x1f: /* pop ds */ - rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS); - break; + rc = emulate_pop_sreg(ctxt, ctxt->src2.val); case 0x40 ... 0x47: /* inc r16/r32 */ emulate_1op(ctxt, "inc"); break; @@ -3869,10 +3894,8 @@ special_insn: rc = em_grp2(ctxt); break; case 0xc4: /* les */ - rc = emulate_load_segment(ctxt, VCPU_SREG_ES); - break; case 0xc5: /* lds */ - rc = emulate_load_segment(ctxt, VCPU_SREG_DS); + rc = emulate_load_segment(ctxt, ctxt->src2.val); break; case 0xcc: /* int3 */ rc = emulate_int(ctxt, 3); @@ -4078,10 +4101,12 @@ twobyte_insn: ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); break; case 0xa0: /* push fs */ - rc = emulate_push_sreg(ctxt, VCPU_SREG_FS); + case 0xa8: /* push gs */ + rc = emulate_push_sreg(ctxt, ctxt->src2.val); break; case 0xa1: /* pop fs */ - rc = emulate_pop_sreg(ctxt, VCPU_SREG_FS); + case 0xa9: /* pop gs */ + rc = emulate_pop_sreg(ctxt, ctxt->src2.val); break; case 0xa3: bt: /* bt */ @@ -4094,12 +4119,6 @@ twobyte_insn: case 0xa5: /* shld cl, r, r/m */ emulate_2op_cl(ctxt, "shld"); break; - case 0xa8: /* push gs */ - rc = emulate_push_sreg(ctxt, VCPU_SREG_GS); - break; - case 0xa9: /* pop gs */ - rc = emulate_pop_sreg(ctxt, VCPU_SREG_GS); - break; case 0xab: bts: /* bts */ emulate_2op_SrcV_nobyte(ctxt, "bts"); @@ -4128,18 +4147,14 @@ twobyte_insn: } break; case 0xb2: /* lss */ - rc = emulate_load_segment(ctxt, VCPU_SREG_SS); + case 0xb4: /* lfs */ + case 0xb5: /* lgs */ + rc = emulate_load_segment(ctxt, ctxt->src2.val); break; case 0xb3: btr: /* btr */ emulate_2op_SrcV_nobyte(ctxt, "btr"); break; - case 0xb4: /* lfs */ - rc = emulate_load_segment(ctxt, VCPU_SREG_FS); - break; - case 0xb5: /* lgs */ - rc = emulate_load_segment(ctxt, VCPU_SREG_GS); - break; case 0xb6 ... 0xb7: /* movzx */ ctxt->dst.bytes = ctxt->op_bytes; ctxt->dst.val = (ctxt->d & ByteOp) ? 
(u8) ctxt->src.val -- cgit v0.10.2 From d4b4325fdb66739a74148f90da360ff82ce707d4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:50 +0300 Subject: KVM: x86 emulator: switch lds/les/lss/lfs/lgs to direct decode Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ab48611..bd3e488 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1828,8 +1828,9 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) return rc; } -static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, int seg) +static int em_lseg(struct x86_emulate_ctxt *ctxt) { + int seg = ctxt->src2.val; unsigned short sel; int rc; @@ -3193,8 +3194,8 @@ static struct opcode opcode_table[256] = { D2bv(DstMem | SrcImmByte | ModRM), I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), I(ImplicitOps | Stack, em_ret), - D(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES), - D(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS), + I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg), + I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ N, N, N, I(ImplicitOps | Stack, em_ret_far), @@ -3281,10 +3282,10 @@ static struct opcode twobyte_table[256] = { D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ D2bv(DstMem | SrcReg | ModRM | Lock), - D(DstReg | SrcMemFAddr | ModRM | Src2SS), + I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), D(DstMem | SrcReg | ModRM | BitOp | Lock), - D(DstReg | SrcMemFAddr | ModRM | Src2FS), - D(DstReg | SrcMemFAddr | ModRM | Src2GS), + I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), + I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ N, N, @@ -3893,10 +3894,6 @@ special_insn: case 0xc0 ... 
0xc1: rc = em_grp2(ctxt); break; - case 0xc4: /* les */ - case 0xc5: /* lds */ - rc = emulate_load_segment(ctxt, ctxt->src2.val); - break; case 0xcc: /* int3 */ rc = emulate_int(ctxt, 3); break; @@ -4146,11 +4143,6 @@ twobyte_insn: ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; } break; - case 0xb2: /* lss */ - case 0xb4: /* lfs */ - case 0xb5: /* lgs */ - rc = emulate_load_segment(ctxt, ctxt->src2.val); - break; case 0xb3: btr: /* btr */ emulate_2op_SrcV_nobyte(ctxt, "btr"); -- cgit v0.10.2 From 1cd196ea42c526549ded4fd29809c3fdaa4a7f41 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Sep 2011 10:45:51 +0300 Subject: KVM: x86 emulator: convert push %sreg/pop %sreg to direct decode Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index bd3e488..f1e3be18 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1458,15 +1458,18 @@ static int em_popf(struct x86_emulate_ctxt *ctxt) return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); } -static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) +static int em_push_sreg(struct x86_emulate_ctxt *ctxt) { + int seg = ctxt->src2.val; + ctxt->src.val = get_segment_selector(ctxt, seg); return em_push(ctxt); } -static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int seg) +static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) { + int seg = ctxt->src2.val; unsigned long selector; int rc; @@ -3114,19 +3117,20 @@ static struct gprefix pfx_0f_6f_0f_7f = { static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ I6ALU(Lock, em_add), - D(ImplicitOps | Stack | No64 | Src2ES), - D(ImplicitOps | Stack | No64 | Src2ES), + I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), + I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg), /* 0x08 - 0x0F */ I6ALU(Lock, em_or), - D(ImplicitOps | Stack | No64 | Src2CS), N, + I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg), + N, /* 0x10 - 0x17 */ I6ALU(Lock, em_adc), - D(ImplicitOps | Stack | No64 | Src2SS), - D(ImplicitOps | Stack | No64 | Src2SS), + I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg), + I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg), /* 0x18 - 0x1F */ I6ALU(Lock, em_sbb), - D(ImplicitOps | Stack | No64 | Src2DS), - D(ImplicitOps | Stack | No64 | Src2DS), + I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg), + I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg), /* 0x20 - 0x27 */ I6ALU(Lock, em_and), N, N, /* 0x28 - 0x2F */ @@ -3270,12 +3274,12 @@ static struct opcode twobyte_table[256] = { /* 0x90 - 0x9F */ X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ - D(Stack | Src2FS), D(Stack | Src2FS), + I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), N, N, /* 0xA8 - 0xAF */ - D(Stack | Src2GS), D(Stack | Src2GS), + I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), @@ -3839,16 +3843,6 @@ special_insn: goto twobyte_insn; switch (ctxt->b) { - case 0x06: /* push es */ - case 0x0e: /* push cs */ - case 0x16: /* push ss */ - case 0x1e: /* push ds */ - rc = emulate_push_sreg(ctxt, ctxt->src2.val); - break; - case 0x07: /* pop es */ - case 0x17: /* pop ss */ - case 0x1f: /* pop ds */ - rc = emulate_pop_sreg(ctxt, ctxt->src2.val); case 0x40 ... 
0x47: /* inc r16/r32 */ emulate_1op(ctxt, "inc"); break; @@ -4097,14 +4091,6 @@ twobyte_insn: case 0x90 ... 0x9f: /* setcc r/m8 */ ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); break; - case 0xa0: /* push fs */ - case 0xa8: /* push gs */ - rc = emulate_push_sreg(ctxt, ctxt->src2.val); - break; - case 0xa1: /* pop fs */ - case 0xa9: /* pop gs */ - rc = emulate_pop_sreg(ctxt, ctxt->src2.val); - break; case 0xa3: bt: /* bt */ ctxt->dst.type = OP_NONE; -- cgit v0.10.2 From 7460fb4a340033107530df19e7e125bd0969bfb2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 20 Sep 2011 13:43:14 +0300 Subject: KVM: Fix simultaneous NMIs If simultaneous NMIs happen, we're supposed to queue the second and next (collapsing them), but currently we sometimes collapse the second into the first. Fix by using a counter for pending NMIs instead of a bool; since the counter limit depends on whether the processor is currently in an NMI handler, which can only be checked in vcpu context (via the NMI mask), we add a new KVM_REQ_NMI to request recalculation of the counter. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6ab4241..ab62711 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -413,8 +413,9 @@ struct kvm_vcpu_arch { u32 tsc_catchup_mult; s8 tsc_catchup_shift; - bool nmi_pending; - bool nmi_injected; + atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ + unsigned nmi_pending; /* NMI queued after currently running handler */ + bool nmi_injected; /* Trying to inject an NMI this entry */ struct mtrr_state_type mtrr_state; u32 pat; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6b37f18..d51e407 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -83,6 +83,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); static void update_cr8_intercept(struct kvm_vcpu *vcpu); static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); +static void process_nmi(struct kvm_vcpu *vcpu); struct kvm_x86_ops *kvm_x86_ops; EXPORT_SYMBOL_GPL(kvm_x86_ops); @@ -359,8 +360,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) void kvm_inject_nmi(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_EVENT, vcpu); - vcpu->arch.nmi_pending = 1; + atomic_inc(&vcpu->arch.nmi_queued); + kvm_make_request(KVM_REQ_NMI, vcpu); } EXPORT_SYMBOL_GPL(kvm_inject_nmi); @@ -2827,6 +2828,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { + process_nmi(vcpu); events->exception.injected = vcpu->arch.exception.pending && !kvm_exception_is_soft(vcpu->arch.exception.nr); @@ -2844,7 +2846,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI); events->nmi.injected = vcpu->arch.nmi_injected; - events->nmi.pending = vcpu->arch.nmi_pending; + events->nmi.pending = vcpu->arch.nmi_pending != 0; events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); events->nmi.pad = 0; @@ -2864,6 +2866,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SHADOW)) return -EINVAL; + process_nmi(vcpu); vcpu->arch.exception.pending = events->exception.injected; vcpu->arch.exception.nr = events->exception.nr; vcpu->arch.exception.has_error_code = events->exception.has_error_code; @@ -4763,7 +4766,7 @@ int 
kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) kvm_set_rflags(vcpu, ctxt->eflags); if (irq == NMI_VECTOR) - vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_pending = 0; else vcpu->arch.interrupt.pending = false; @@ -5572,7 +5575,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) /* try to inject new event if pending */ if (vcpu->arch.nmi_pending) { if (kvm_x86_ops->nmi_allowed(vcpu)) { - vcpu->arch.nmi_pending = false; + --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; kvm_x86_ops->set_nmi(vcpu); } @@ -5604,10 +5607,26 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) } } +static void process_nmi(struct kvm_vcpu *vcpu) +{ + unsigned limit = 2; + + /* + * x86 is limited to one NMI running, and one NMI pending after it. + * If an NMI is already in progress, limit further NMIs to just one. + * Otherwise, allow two (and we'll inject the first one immediately). + */ + if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected) + limit = 1; + + vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); + vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit); + kvm_make_request(KVM_REQ_EVENT, vcpu); +} + static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; - bool nmi_pending; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; @@ -5647,6 +5666,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) record_steal_time(vcpu); + if (kvm_check_request(KVM_REQ_NMI, vcpu)) + process_nmi(vcpu); } @@ -5654,19 +5675,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (unlikely(r)) goto out; - /* - * An NMI can be injected between local nmi_pending read and - * vcpu->arch.nmi_pending read inside inject_pending_event(). - * But in that case, KVM_REQ_EVENT will be set, which makes - * the race described above benign. - */ - nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending); - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { inject_pending_event(vcpu); /* enable NMI/IRQ window open exits if needed */ - if (nmi_pending) + if (vcpu->arch.nmi_pending) kvm_x86_ops->enable_nmi_window(vcpu); else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) kvm_x86_ops->enable_irq_window(vcpu); @@ -6374,7 +6387,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) { - vcpu->arch.nmi_pending = false; + atomic_set(&vcpu->arch.nmi_queued, 0); + vcpu->arch.nmi_pending = 0; vcpu->arch.nmi_injected = false; vcpu->arch.switch_db_regs = 0; @@ -6649,7 +6663,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) !vcpu->arch.apf.halted) || !list_empty_careful(&vcpu->async_pf.done) || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED - || vcpu->arch.nmi_pending || + || atomic_read(&vcpu->arch.nmi_queued) || (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2a414f6..d526231 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -49,6 +49,7 @@ #define KVM_REQ_EVENT 11 #define KVM_REQ_APF_HALT 12 #define KVM_REQ_STEAL_UPDATE 13 +#define KVM_REQ_NMI 14 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 -- cgit v0.10.2 From b90dfb0419a79a90395e04fee3fbda3c12ba8237 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Thu, 22 Sep 2011 16:53:58 +0800 Subject: x86: TSC deadline definitions This pre-defination is preparing for KVM tsc deadline timer emulation, but theirself are not kvm specific. 
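A small userspace sketch of what these definitions are used for in the lapic patch that follows: the two LVTT mode bits select oneshot, periodic or tsc-deadline operation, and in tsc-deadline mode the distance between the programmed deadline and the current guest TSC is converted into a nanosecond delay armed on the host hrtimer. All concrete values below (vector, TSC rate, deadline) are made up for illustration.

#include <stdint.h>
#include <stdio.h>

/*
 * Toy model: decode the LVTT timer-mode field (bits 17-18) and convert a
 * TSC deadline into the nanosecond delay for the host hrtimer.  The mask
 * name is a stand-in; the kernel keeps the mode mask per-vcpu.
 */
#define APIC_LVT_TIMER_ONESHOT     (0u << 17)
#define APIC_LVT_TIMER_PERIODIC    (1u << 17)
#define APIC_LVT_TIMER_TSCDEADLINE (2u << 17)
#define APIC_LVT_TIMER_MODE_MASK   (3u << 17)

static const char *timer_mode(uint32_t lvtt)
{
	switch (lvtt & APIC_LVT_TIMER_MODE_MASK) {
	case APIC_LVT_TIMER_ONESHOT:     return "oneshot";
	case APIC_LVT_TIMER_PERIODIC:    return "periodic";
	case APIC_LVT_TIMER_TSCDEADLINE: return "tsc-deadline";
	default:                         return "reserved";
	}
}

/* ticks / (khz * 1000 ticks per second) seconds == ticks * 1000000 / khz ns */
static uint64_t deadline_to_ns(uint64_t deadline, uint64_t guest_tsc,
			       uint64_t tsc_khz)
{
	if (deadline <= guest_tsc || !tsc_khz)
		return 0;                    /* already expired, or no rate known */
	return (deadline - guest_tsc) * 1000000ull / tsc_khz;
}

int main(void)
{
	uint32_t lvtt = APIC_LVT_TIMER_TSCDEADLINE | 0xef;  /* vector 0xef */

	printf("LVTT mode: %s\n", timer_mode(lvtt));
	/* 2.5 GHz guest TSC, deadline 5,000,000 ticks ahead -> 2 ms */
	printf("arm hrtimer for %llu ns\n",
	       (unsigned long long)deadline_to_ns(10000000, 5000000, 2500000));
	return 0;
}

The kernel version performs the same multiply and divide with do_div() under local_irq_save(), since it runs against the live guest TSC.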
Signed-off-by: Liu, Jinsong Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 34595d5..3925d80 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -100,7 +100,9 @@ #define APIC_TIMER_BASE_CLKIN 0x0 #define APIC_TIMER_BASE_TMBASE 0x1 #define APIC_TIMER_BASE_DIV 0x2 +#define APIC_LVT_TIMER_ONESHOT (0 << 17) #define APIC_LVT_TIMER_PERIODIC (1 << 17) +#define APIC_LVT_TIMER_TSCDEADLINE (2 << 17) #define APIC_LVT_MASKED (1 << 16) #define APIC_LVT_LEVEL_TRIGGER (1 << 15) #define APIC_LVT_REMOTE_IRR (1 << 14) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 88b23a4..94dfb0a 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -120,6 +120,7 @@ #define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */ #define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */ #define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */ +#define X86_FEATURE_TSC_DEADLINE_TIMER (4*32+24) /* Tsc deadline timer */ #define X86_FEATURE_AES (4*32+25) /* AES instructions */ #define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ #define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index d52609a..a6962d9 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -229,6 +229,8 @@ #define MSR_IA32_APICBASE_ENABLE (1<<11) #define MSR_IA32_APICBASE_BASE (0xfffff<<12) +#define MSR_IA32_TSCDEADLINE 0x000006e0 + #define MSR_IA32_UCODE_WRITE 0x00000079 #define MSR_IA32_UCODE_REV 0x0000008b -- cgit v0.10.2 From a3e06bbe8445f57eb949e6474c5a9b30f24d2057 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Thu, 22 Sep 2011 16:55:52 +0800 Subject: KVM: emulate lapic tsc deadline timer for guest This patch emulate lapic tsc deadline timer for guest: Enumerate tsc deadline timer capability by CPUID; Enable tsc deadline timer mode by lapic MMIO; Start tsc deadline timer by WRMSR; [jan: use do_div()] [avi: fix for !irqchip_in_kernel()] [marcelo: another fix for !irqchip_in_kernel()] Signed-off-by: Liu, Jinsong Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ab62711..b4973f4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -674,6 +674,8 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); extern bool tdp_enabled; +u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); + /* control of guest tsc rate supported? 
*/ extern bool kvm_has_tsc_control; /* minimum supported tsc_khz for guests */ diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h index 64bc6ea..497dbaa 100644 --- a/arch/x86/kvm/kvm_timer.h +++ b/arch/x86/kvm/kvm_timer.h @@ -2,6 +2,8 @@ struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ + u32 timer_mode_mask; + u64 tscdeadline; atomic_t pending; /* accumulated triggered timers */ bool reinject; struct kvm_timer_ops *t_ops; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2fb20ca..54abb40 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -138,9 +138,23 @@ static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; } +static inline int apic_lvtt_oneshot(struct kvm_lapic *apic) +{ + return ((apic_get_reg(apic, APIC_LVTT) & + apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT); +} + static inline int apic_lvtt_period(struct kvm_lapic *apic) { - return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC; + return ((apic_get_reg(apic, APIC_LVTT) & + apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC); +} + +static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic) +{ + return ((apic_get_reg(apic, APIC_LVTT) & + apic->lapic_timer.timer_mode_mask) == + APIC_LVT_TIMER_TSCDEADLINE); } static inline int apic_lvt_nmi_mode(u32 lvt_val) @@ -169,7 +183,7 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic) } static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { - LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ + LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ LVT_MASK | APIC_MODE_MASK, /* LVTPC */ LINT_MASK, LINT_MASK, /* LVT0-1 */ @@ -572,6 +586,9 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) break; case APIC_TMCCT: /* Timer CCR */ + if (apic_lvtt_tscdeadline(apic)) + return 0; + val = apic_get_tmcct(apic); break; @@ -666,37 +683,40 @@ static void update_divide_count(struct kvm_lapic *apic) static void start_apic_timer(struct kvm_lapic *apic) { - ktime_t now = apic->lapic_timer.timer.base->get_time(); - - apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) * - APIC_BUS_CYCLE_NS * apic->divide_count; + ktime_t now; atomic_set(&apic->lapic_timer.pending, 0); - if (!apic->lapic_timer.period) - return; - /* - * Do not allow the guest to program periodic timers with small - * interval, since the hrtimers are not throttled by the host - * scheduler. - */ - if (apic_lvtt_period(apic)) { - s64 min_period = min_timer_period_us * 1000LL; - - if (apic->lapic_timer.period < min_period) { - pr_info_ratelimited( - "kvm: vcpu %i: requested %lld ns " - "lapic timer period limited to %lld ns\n", - apic->vcpu->vcpu_id, apic->lapic_timer.period, - min_period); - apic->lapic_timer.period = min_period; + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { + /* lapic timer in oneshot or peroidic mode */ + now = apic->lapic_timer.timer.base->get_time(); + apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) + * APIC_BUS_CYCLE_NS * apic->divide_count; + + if (!apic->lapic_timer.period) + return; + /* + * Do not allow the guest to program periodic timers with small + * interval, since the hrtimers are not throttled by the host + * scheduler. 
+ */ + if (apic_lvtt_period(apic)) { + s64 min_period = min_timer_period_us * 1000LL; + + if (apic->lapic_timer.period < min_period) { + pr_info_ratelimited( + "kvm: vcpu %i: requested %lld ns " + "lapic timer period limited to %lld ns\n", + apic->vcpu->vcpu_id, + apic->lapic_timer.period, min_period); + apic->lapic_timer.period = min_period; + } } - } - hrtimer_start(&apic->lapic_timer.timer, - ktime_add_ns(now, apic->lapic_timer.period), - HRTIMER_MODE_ABS); + hrtimer_start(&apic->lapic_timer.timer, + ktime_add_ns(now, apic->lapic_timer.period), + HRTIMER_MODE_ABS); - apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" + apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" PRIx64 ", " "timer initial count 0x%x, period %lldns, " "expire @ 0x%016" PRIx64 ".\n", __func__, @@ -705,6 +725,30 @@ static void start_apic_timer(struct kvm_lapic *apic) apic->lapic_timer.period, ktime_to_ns(ktime_add_ns(now, apic->lapic_timer.period))); + } else if (apic_lvtt_tscdeadline(apic)) { + /* lapic timer in tsc deadline mode */ + u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; + u64 ns = 0; + struct kvm_vcpu *vcpu = apic->vcpu; + unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu); + unsigned long flags; + + if (unlikely(!tscdeadline || !this_tsc_khz)) + return; + + local_irq_save(flags); + + now = apic->lapic_timer.timer.base->get_time(); + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); + if (likely(tscdeadline > guest_tsc)) { + ns = (tscdeadline - guest_tsc) * 1000000ULL; + do_div(ns, this_tsc_khz); + } + hrtimer_start(&apic->lapic_timer.timer, + ktime_add_ns(now, ns), HRTIMER_MODE_ABS); + + local_irq_restore(flags); + } } static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) @@ -792,7 +836,6 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LVT0: apic_manage_nmi_watchdog(apic, val); - case APIC_LVTT: case APIC_LVTTHMR: case APIC_LVTPC: case APIC_LVT1: @@ -806,7 +849,22 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; + case APIC_LVTT: + if ((apic_get_reg(apic, APIC_LVTT) & + apic->lapic_timer.timer_mode_mask) != + (val & apic->lapic_timer.timer_mode_mask)) + hrtimer_cancel(&apic->lapic_timer.timer); + + if (!apic_sw_enabled(apic)) + val |= APIC_LVT_MASKED; + val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask); + apic_set_reg(apic, APIC_LVTT, val); + break; + case APIC_TMICT: + if (apic_lvtt_tscdeadline(apic)) + break; + hrtimer_cancel(&apic->lapic_timer.timer); apic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); @@ -902,6 +960,32 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) *---------------------------------------------------------------------- */ +u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + if (!apic) + return 0; + + if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) + return 0; + + return apic->lapic_timer.tscdeadline; +} + +void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + if (!apic) + return; + + if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) + return; + + hrtimer_cancel(&apic->lapic_timer.timer); + apic->lapic_timer.tscdeadline = data; + start_apic_timer(apic); +} + void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) { struct kvm_lapic *apic = vcpu->arch.apic; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 8287243..138e8cc 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -42,6 +42,9 @@ int 
kvm_lapic_enabled(struct kvm_vcpu *vcpu); bool kvm_apic_present(struct kvm_vcpu *vcpu); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); +u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); +void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); + void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d51e407..cf26909 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -600,6 +600,8 @@ static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) static void update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; + struct kvm_lapic *apic = vcpu->arch.apic; + u32 timer_mode_mask; best = kvm_find_cpuid_entry(vcpu, 1, 0); if (!best) @@ -611,6 +613,16 @@ static void update_cpuid(struct kvm_vcpu *vcpu) if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) best->ecx |= bit(X86_FEATURE_OSXSAVE); } + + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + best->function == 0x1) { + best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER); + timer_mode_mask = 3 << 17; + } else + timer_mode_mask = 1 << 17; + + if (apic) + apic->lapic_timer.timer_mode_mask = timer_mode_mask; } int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -826,6 +838,7 @@ static u32 msrs_to_save[] = { static unsigned num_msrs_to_save; static u32 emulated_msrs[] = { + MSR_IA32_TSCDEADLINE, MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, @@ -1001,7 +1014,7 @@ static inline int kvm_tsc_changes_freq(void) return ret; } -static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) +u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) { if (vcpu->arch.virtual_tsc_khz) return vcpu->arch.virtual_tsc_khz; @@ -1565,6 +1578,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: return kvm_x2apic_msr_write(vcpu, msr, data); + case MSR_IA32_TSCDEADLINE: + kvm_set_lapic_tscdeadline_msr(vcpu, data); + break; case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; @@ -1894,6 +1910,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: return kvm_x2apic_msr_read(vcpu, msr, pdata); break; + case MSR_IA32_TSCDEADLINE: + data = kvm_get_lapic_tscdeadline_msr(vcpu); + break; case MSR_IA32_MISC_ENABLE: data = vcpu->arch.ia32_misc_enable_msr; break; -- cgit v0.10.2 From 4d47555a80495657161a7e71ec3014ff2021e450 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Tue, 18 Oct 2011 12:27:12 +0200 Subject: KVM: s390: check cpu_id prior to using it We use the cpu id provided by userspace as array index here. Thus we clearly need to check it first. Ooops. 
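The fix boils down to validating the untrusted index before anything uses it, while keeping the single error exit. A toy userspace version of the pattern is sketched below; the constants and error handling are illustrative, not the s390 code itself.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_VCPUS 64

struct vcpu { unsigned int id; };

/*
 * Toy model of the fixed create path: reject the userspace-supplied id
 * before it is ever used as an array index or triggers an allocation,
 * and keep one error exit.  Error codes mirror the patch.
 */
static struct vcpu *vcpu_create(unsigned int id)
{
	struct vcpu *vcpu;
	int rc = -EINVAL;

	if (id >= MAX_VCPUS)            /* reject before any use of 'id' */
		goto out;

	rc = -ENOMEM;
	vcpu = calloc(1, sizeof(*vcpu));
	if (!vcpu)
		goto out;

	vcpu->id = id;
	return vcpu;
out:
	errno = -rc;
	return NULL;
}

int main(void)
{
	if (!vcpu_create(1000))
		perror("vcpu_create(1000)");   /* rejected: out of range */
	free(vcpu_create(3));                  /* accepted */
	return 0;
}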
CC: Signed-off-by: Carsten Otte Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index dc2b580..0cba935 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -312,11 +312,17 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { - struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); - int rc = -ENOMEM; + struct kvm_vcpu *vcpu; + int rc = -EINVAL; + + if (id >= KVM_MAX_VCPUS) + goto out; + + rc = -ENOMEM; + vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); if (!vcpu) - goto out_nomem; + goto out; vcpu->arch.sie_block = (struct kvm_s390_sie_block *) get_zeroed_page(GFP_KERNEL); @@ -352,7 +358,7 @@ out_free_sie_block: free_page((unsigned long)(vcpu->arch.sie_block)); out_free_cpu: kfree(vcpu); -out_nomem: +out: return ERR_PTR(rc); } -- cgit v0.10.2 From b290411a1321dd937dce4aaa812e5d8fae8a14a5 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Tue, 18 Oct 2011 12:27:13 +0200 Subject: KVM: s390: fix return value of kvm_arch_init_vm This patch fixes the return value of kvm_arch_init_vm in case a memory allocation goes wrong. Signed-off-by: Carsten Otte Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0cba935..397f0cb 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -175,6 +175,8 @@ int kvm_arch_init_vm(struct kvm *kvm) if (rc) goto out_err; + rc = -ENOMEM; + kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL); if (!kvm->arch.sca) goto out_err; -- cgit v0.10.2 From 7eef87dc99e419b1cc051e4417c37e4744d7b661 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Tue, 18 Oct 2011 12:27:14 +0200 Subject: KVM: s390: fix register setting KVM common code does vcpu_load prior to calling our arch ioctls and vcpu_put after we're done here. Via the kvm_arch_vcpu_load/put callbacks we do load the fpu and access register state into the processor, which saves us moving the state on every SIE exit the kernel handles. However this breaks register setting from userspace, because of the following sequence: 1a. vcpu load stores userspace register content 1b. vcpu load loads guest register content 2. kvm_arch_vcpu_ioctl_set_fpu/sregs updates saved guest register content 3a. vcpu put stores the guest registers and overwrites the new content 3b. vcpu put loads the userspace register set again This patch loads the new guest register state into the cpu, so that the correct (new) set of guest registers will be stored in step 3a. 
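A toy single-register model of that sequence follows; one "hardware" register and one saved guest copy stand in for the fpu and access registers, and the restore_*() calls added by the patch correspond to the extra hardware-register store in set_reg_fixed().

#include <stdio.h>

/*
 * Toy model of the load/put ordering problem: vcpu_load puts the saved
 * guest state into the cpu, vcpu_put writes the cpu state back.  If the
 * set ioctl only updates the saved copy, vcpu_put overwrites it with the
 * stale hardware content; the fix also loads the new value into the cpu.
 */
static long hw_reg;          /* register currently live in the cpu */

struct vcpu { long saved_reg; };

static void vcpu_load(struct vcpu *v) { hw_reg = v->saved_reg; } /* step 1b */
static void vcpu_put(struct vcpu *v)  { v->saved_reg = hw_reg; } /* step 3a */

static void set_reg_broken(struct vcpu *v, long val)
{
	v->saved_reg = val;                  /* step 2: lost at vcpu_put time */
}

static void set_reg_fixed(struct vcpu *v, long val)
{
	v->saved_reg = val;
	hw_reg = val;                        /* the restore_*() added by the patch */
}

static long run_ioctl(struct vcpu *v, void (*set)(struct vcpu *, long))
{
	vcpu_load(v);                        /* common code, before the ioctl */
	set(v, 42);                          /* arch ioctl sets new guest state */
	vcpu_put(v);                         /* common code, after the ioctl */
	return v->saved_reg;
}

int main(void)
{
	struct vcpu v = { .saved_reg = 7 };

	printf("broken: %ld\n", run_ioctl(&v, set_reg_broken)); /* still 7 */
	printf("fixed:  %ld\n", run_ioctl(&v, set_reg_fixed));  /* 42 */
	return 0;
}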
Signed-off-by: Carsten Otte Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 397f0cb..2963567 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -394,6 +394,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, { memcpy(&vcpu->arch.guest_acrs, &sregs->acrs, sizeof(sregs->acrs)); memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs)); + restore_access_regs(vcpu->arch.guest_acrs); return 0; } @@ -409,6 +410,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs)); vcpu->arch.guest_fpregs.fpc = fpu->fpc; + restore_fp_regs(&vcpu->arch.guest_fpregs); return 0; } -- cgit v0.10.2 From 7697e71f72b45a1bd0abe70918c383100fcc8514 Mon Sep 17 00:00:00 2001 From: Christian Ehrhardt Date: Tue, 18 Oct 2011 12:27:15 +0200 Subject: KVM: s390: implement sigp external call Implement sigp external call, which might be required for guests that issue an external call instead of an emergency signal for IPI. This fixes an issue with "KVM: unknown SIGP: 0x02" when booting such an SMP guest. Signed-off-by: Christian Ehrhardt Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 00ff00d..1ca5de0 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -119,6 +119,7 @@ struct kvm_vcpu_stat { u32 instruction_lctlg; u32 exit_program_interruption; u32 exit_instr_and_program; + u32 deliver_external_call; u32 deliver_emergency_signal; u32 deliver_service_signal; u32 deliver_virtio_interrupt; @@ -138,6 +139,7 @@ struct kvm_vcpu_stat { u32 instruction_stfl; u32 instruction_tprot; u32 instruction_sigp_sense; + u32 instruction_sigp_external_call; u32 instruction_sigp_emergency; u32 instruction_sigp_stop; u32 instruction_sigp_arch; @@ -174,6 +176,10 @@ struct kvm_s390_prefix_info { __u32 address; }; +struct kvm_s390_extcall_info { + __u16 code; +}; + struct kvm_s390_emerg_info { __u16 code; }; @@ -186,6 +192,7 @@ struct kvm_s390_interrupt_info { struct kvm_s390_ext_info ext; struct kvm_s390_pgm_info pgm; struct kvm_s390_emerg_info emerg; + struct kvm_s390_extcall_info extcall; struct kvm_s390_prefix_info prefix; }; }; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index c9aeb4b..87c1670 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -38,6 +38,11 @@ static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu, struct kvm_s390_interrupt_info *inti) { switch (inti->type) { + case KVM_S390_INT_EXTERNAL_CALL: + if (psw_extint_disabled(vcpu)) + return 0; + if (vcpu->arch.sie_block->gcr[0] & 0x2000ul) + return 1; case KVM_S390_INT_EMERGENCY: if (psw_extint_disabled(vcpu)) return 0; @@ -98,6 +103,7 @@ static void __set_intercept_indicator(struct kvm_vcpu *vcpu, struct kvm_s390_interrupt_info *inti) { switch (inti->type) { + case KVM_S390_INT_EXTERNAL_CALL: case KVM_S390_INT_EMERGENCY: case KVM_S390_INT_SERVICE: case KVM_S390_INT_VIRTIO: @@ -143,6 +149,28 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, exception = 1; break; + case KVM_S390_INT_EXTERNAL_CALL: + VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call"); + vcpu->stat.deliver_external_call++; + rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1202); + if (rc == -EFAULT) + exception = 1; + + rc = put_guest_u16(vcpu, __LC_CPU_ADDRESS, inti->extcall.code); + if (rc == 
-EFAULT) + exception = 1; + + rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + if (rc == -EFAULT) + exception = 1; + + rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, + __LC_EXT_NEW_PSW, sizeof(psw_t)); + if (rc == -EFAULT) + exception = 1; + break; + case KVM_S390_INT_SERVICE: VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x", inti->ext.ext_params); @@ -522,6 +550,7 @@ int kvm_s390_inject_vm(struct kvm *kvm, break; case KVM_S390_PROGRAM_INT: case KVM_S390_SIGP_STOP: + case KVM_S390_INT_EXTERNAL_CALL: case KVM_S390_INT_EMERGENCY: default: kfree(inti); @@ -581,6 +610,7 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, break; case KVM_S390_SIGP_STOP: case KVM_S390_RESTART: + case KVM_S390_INT_EXTERNAL_CALL: case KVM_S390_INT_EMERGENCY: VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type); inti->type = s390int->type; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 2963567..9610ba4 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -46,6 +46,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "instruction_lctlg", VCPU_STAT(instruction_lctlg) }, { "instruction_lctl", VCPU_STAT(instruction_lctl) }, { "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) }, + { "deliver_external_call", VCPU_STAT(deliver_external_call) }, { "deliver_service_signal", VCPU_STAT(deliver_service_signal) }, { "deliver_virtio_interrupt", VCPU_STAT(deliver_virtio_interrupt) }, { "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) }, @@ -64,6 +65,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "instruction_stfl", VCPU_STAT(instruction_stfl) }, { "instruction_tprot", VCPU_STAT(instruction_tprot) }, { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) }, + { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) }, { "instruction_sigp_emergency", VCPU_STAT(instruction_sigp_emergency) }, { "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) }, { "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) }, diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index d6a50c1..f815118 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -87,6 +87,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr) return -ENOMEM; inti->type = KVM_S390_INT_EMERGENCY; + inti->emerg.code = vcpu->vcpu_id; spin_lock(&fi->lock); li = fi->local_int[cpu_addr]; @@ -103,9 +104,47 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr) wake_up_interruptible(&li->wq); spin_unlock_bh(&li->lock); rc = 0; /* order accepted */ + VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr); +unlock: + spin_unlock(&fi->lock); + return rc; +} + +static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr) +{ + struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; + struct kvm_s390_local_interrupt *li; + struct kvm_s390_interrupt_info *inti; + int rc; + + if (cpu_addr >= KVM_MAX_VCPUS) + return 3; /* not operational */ + + inti = kzalloc(sizeof(*inti), GFP_KERNEL); + if (!inti) + return -ENOMEM; + + inti->type = KVM_S390_INT_EXTERNAL_CALL; + inti->extcall.code = vcpu->vcpu_id; + + spin_lock(&fi->lock); + li = fi->local_int[cpu_addr]; + if (li == NULL) { + rc = 3; /* not operational */ + kfree(inti); + goto unlock; + } + spin_lock_bh(&li->lock); + list_add_tail(&inti->list, &li->list); + atomic_set(&li->active, 1); + atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); + if (waitqueue_active(&li->wq)) + wake_up_interruptible(&li->wq); + 
spin_unlock_bh(&li->lock); + rc = 0; /* order accepted */ + VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr); unlock: spin_unlock(&fi->lock); - VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr); return rc; } @@ -267,6 +306,10 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) rc = __sigp_sense(vcpu, cpu_addr, &vcpu->arch.guest_gprs[r1]); break; + case SIGP_EXTERNAL_CALL: + vcpu->stat.instruction_sigp_external_call++; + rc = __sigp_external_call(vcpu, cpu_addr); + break; case SIGP_EMERGENCY: vcpu->stat.instruction_sigp_emergency++; rc = __sigp_emergency(vcpu, cpu_addr); diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 6884054..f47fcd3 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -371,6 +371,7 @@ struct kvm_s390_psw { #define KVM_S390_INT_VIRTIO 0xffff2603u #define KVM_S390_INT_SERVICE 0xffff2401u #define KVM_S390_INT_EMERGENCY 0xffff1201u +#define KVM_S390_INT_EXTERNAL_CALL 0xffff1202u struct kvm_s390_interrupt { __u32 type; -- cgit v0.10.2 From f1c1da2bde712812a3e0f9a7a7ebe7a916a4b5f4 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 18 Oct 2011 18:23:11 +0200 Subject: KVM: SVM: Keep intercepting task switching with NPT enabled AMD processors apparently have a bug in the hardware task switching support when NPT is enabled. If the task switch triggers a NPF, we can get wrong EXITINTINFO along with that fault. On resume, spurious exceptions may then be injected into the guest. We were able to reproduce this bug when our guest triggered #SS and the handler were supposed to run over a separate task with not yet touched stack pages. Work around the issue by continuing to emulate task switches even in NPT mode. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e7ed4b1..e32243e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1084,7 +1084,6 @@ static void init_vmcb(struct vcpu_svm *svm) if (npt_enabled) { /* Setup VMCB for Nested Paging */ control->nested_ctl = 1; - clr_intercept(svm, INTERCEPT_TASK_SWITCH); clr_intercept(svm, INTERCEPT_INVLPG); clr_exception_intercept(svm, PF_VECTOR); clr_cr_intercept(svm, INTERCEPT_CR3_READ); -- cgit v0.10.2
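Finally, looking back at the "KVM: Fix simultaneous NMIs" change earlier in this series, here is a simplified single-threaded model of the nmi_queued/nmi_pending bookkeeping it introduces; the real code uses an atomic counter, KVM_REQ_NMI and runs in vcpu context, but the collapse rule is the same.

#include <stdio.h>

/*
 * Toy model of the NMI accounting: x86 allows one NMI running plus one
 * pending behind it, so the collapse limit is 2 normally and 1 while an
 * NMI is already masked (handler running) or being injected.
 */
struct vcpu {
	int nmi_queued;      /* raised asynchronously by kvm_inject_nmi() */
	int nmi_pending;     /* consumed at injection time */
	int nmi_masked;      /* guest currently inside an NMI handler */
	int nmi_injected;    /* injection of a previous NMI in flight */
};

static int min(int a, int b) { return a < b ? a : b; }

static void inject_nmi(struct vcpu *v)
{
	v->nmi_queued++;                     /* kvm_make_request(KVM_REQ_NMI) */
}

static void process_nmi(struct vcpu *v)
{
	int limit = 2;

	if (v->nmi_masked || v->nmi_injected)
		limit = 1;                   /* only one more may stay queued */

	v->nmi_pending += v->nmi_queued;     /* atomic_xchg(&nmi_queued, 0) */
	v->nmi_queued = 0;
	v->nmi_pending = min(v->nmi_pending, limit);
}

int main(void)
{
	struct vcpu v = { 0 };

	inject_nmi(&v);
	inject_nmi(&v);
	inject_nmi(&v);                      /* three NMIs arrive back to back */
	process_nmi(&v);
	printf("idle vcpu: %d pending\n", v.nmi_pending);        /* 2 */

	v = (struct vcpu){ .nmi_masked = 1 };
	inject_nmi(&v);
	inject_nmi(&v);
	process_nmi(&v);
	printf("in NMI handler: %d pending\n", v.nmi_pending);   /* 1 */
	return 0;
}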