From 83c529151ab0d4a813e3f6a3e293fff75d468519 Mon Sep 17 00:00:00 2001
From: "Liu, Jinsong" <jinsong.liu@intel.com>
Date: Tue, 28 Feb 2012 05:15:46 +0000
Subject: KVM: x86: expose Intel cpu new features (HLE, RTM) to guest

Intel recently release 2 new features, HLE and RTM.
Refer to http://software.intel.com/file/41417.
This patch expose them to guest.

Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 9fed5be..c2134b8 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -247,7 +247,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	/* cpuid 7.0.ebx */
 	const u32 kvm_supported_word9_x86_features =
-		F(FSGSBASE) | F(BMI1) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS);
+		F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
+		F(BMI2) | F(ERMS) | F(RTM);
 
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();
-- 
cgit v0.10.2


From a13007160f1b9ec7c67e28ec9254f197c5c08d7d Mon Sep 17 00:00:00 2001
From: Amos Kong <akong@redhat.com>
Date: Fri, 9 Mar 2012 12:17:32 +0800
Subject: KVM: resize kvm_io_range array dynamically

This patch makes the kvm_io_range array can be resized dynamically.

Signed-off-by: Amos Kong <akong@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 665a260..ba9fb4a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -68,10 +68,11 @@ struct kvm_io_range {
 	struct kvm_io_device *dev;
 };
 
+#define NR_IOBUS_DEVS 300
+
 struct kvm_io_bus {
 	int                   dev_count;
-#define NR_IOBUS_DEVS 300
-	struct kvm_io_range range[NR_IOBUS_DEVS];
+	struct kvm_io_range range[];
 };
 
 enum kvm_bus {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 42b7393..a9565e2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2393,9 +2393,6 @@ int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
 int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
 			  gpa_t addr, int len)
 {
-	if (bus->dev_count == NR_IOBUS_DEVS)
-		return -ENOSPC;
-
 	bus->range[bus->dev_count++] = (struct kvm_io_range) {
 		.addr = addr,
 		.len = len,
@@ -2495,12 +2492,15 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 	struct kvm_io_bus *new_bus, *bus;
 
 	bus = kvm->buses[bus_idx];
-	if (bus->dev_count > NR_IOBUS_DEVS-1)
+	if (bus->dev_count > NR_IOBUS_DEVS - 1)
 		return -ENOSPC;
 
-	new_bus = kmemdup(bus, sizeof(struct kvm_io_bus), GFP_KERNEL);
+	new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) *
+			  sizeof(struct kvm_io_range)), GFP_KERNEL);
 	if (!new_bus)
 		return -ENOMEM;
+	memcpy(new_bus, bus, sizeof(*bus) + (bus->dev_count *
+	       sizeof(struct kvm_io_range)));
 	kvm_io_bus_insert_dev(new_bus, dev, addr, len);
 	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
 	synchronize_srcu_expedited(&kvm->srcu);
@@ -2517,27 +2517,25 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 	struct kvm_io_bus *new_bus, *bus;
 
 	bus = kvm->buses[bus_idx];
-
-	new_bus = kmemdup(bus, sizeof(*bus), GFP_KERNEL);
-	if (!new_bus)
-		return -ENOMEM;
-
 	r = -ENOENT;
-	for (i = 0; i < new_bus->dev_count; i++)
-		if (new_bus->range[i].dev == dev) {
+	for (i = 0; i < bus->dev_count; i++)
+		if (bus->range[i].dev == dev) {
 			r = 0;
-			new_bus->dev_count--;
-			new_bus->range[i] = new_bus->range[new_bus->dev_count];
-			sort(new_bus->range, new_bus->dev_count,
-			     sizeof(struct kvm_io_range),
-			     kvm_io_bus_sort_cmp, NULL);
 			break;
 		}
 
-	if (r) {
-		kfree(new_bus);
+	if (r)
 		return r;
-	}
+
+	new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) *
+			  sizeof(struct kvm_io_range)), GFP_KERNEL);
+	if (!new_bus)
+		return -ENOMEM;
+
+	memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
+	new_bus->dev_count--;
+	memcpy(new_bus->range + i, bus->range + i + 1,
+	       (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
 
 	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
 	synchronize_srcu_expedited(&kvm->srcu);
-- 
cgit v0.10.2


From 786a9f888bfbe70a36d0592b26037ca1e8c8da7f Mon Sep 17 00:00:00 2001
From: Amos Kong <akong@redhat.com>
Date: Fri, 9 Mar 2012 12:17:40 +0800
Subject: KVM: set upper bounds for iobus dev to limit userspace

kvm_io_bus devices are used for ioevent, pit, pic, ioapic,
coalesced_mmio.

Currently Qemu only emulates one PCI bus, it contains 32 slots,
one slot contains 8 functions, maximum of supported PCI devices:
 1 * 32 * 8 = 256. One virtio-blk takes one iobus device,
one virtio-net(vhost=on) takes two iobus devices.
The maximum of coalesced mmio zone is 100, each zone
has an iobus devices. So 300 io_bus devices are not enough.

Set an upper bounds for kvm_io_range to limit userspace.
1000 is a very large limit and not bloat the typical user.

Signed-off-by: Amos Kong <akong@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ba9fb4a..3a2cea6 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -68,7 +68,7 @@ struct kvm_io_range {
 	struct kvm_io_device *dev;
 };
 
-#define NR_IOBUS_DEVS 300
+#define NR_IOBUS_DEVS 1000
 
 struct kvm_io_bus {
 	int                   dev_count;
-- 
cgit v0.10.2


From 675acb758ab2381c72fe3ceb5c091cbd0879d4dd Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Thu, 8 Mar 2012 18:07:56 +0800
Subject: KVM: SVM: count all irq windows exit

Also count the exits of fast-path.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e334389..f316720 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3240,6 +3240,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
 	svm_clear_vintr(svm);
 	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
 	mark_dirty(svm->vmcb, VMCB_INTR);
+	++svm->vcpu.stat.irq_window_exits;
 	/*
 	 * If the user space waits to inject interrupts, exit as soon as
 	 * possible
@@ -3247,7 +3248,6 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
 	if (!irqchip_in_kernel(svm->vcpu.kvm) &&
 	    kvm_run->request_interrupt_window &&
 	    !kvm_cpu_has_interrupt(&svm->vcpu)) {
-		++svm->vcpu.stat.irq_window_exits;
 		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
 		return 0;
 	}
-- 
cgit v0.10.2


From 66ef89315f121cda9bf5b65a4ef02ad1b4fb16d9 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 8 Apr 2012 12:47:32 +0300
Subject: KVM: schedule debugfs statistics for removal

Deprecated in favour of tracepoints.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 709e08e..d6dd84f 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -531,3 +531,10 @@ Why:	There appear to be no production users of the get_robust_list syscall,
 	of ASLR. It was only ever intended for debugging, so it should be
 	removed.
 Who:	Kees Cook <keescook@chromium.org>
+
+----------------------------
+
+What:	KVM debugfs statistics
+When:	2013
+Why:	KVM tracepoints provide mostly equivalent information in a much more
+        flexible fashion.
-- 
cgit v0.10.2


From b6d33834bd4e8bdf4a199812e31b3e36da53c794 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <c.dall@virtualopensystems.com>
Date: Thu, 8 Mar 2012 16:44:24 -0500
Subject: KVM: Factor out kvm_vcpu_kick to arch-generic code

The kvm_vcpu_kick function performs roughly the same funcitonality on
most all architectures, so we shouldn't have separate copies.

PowerPC keeps a pointer to interchanging waitqueues on the vcpu_arch
structure and to accomodate this special need a
__KVM_HAVE_ARCH_VCPU_GET_WQ define and accompanying function
kvm_arch_vcpu_wq have been defined. For all other architectures this
is a generic inline that just returns &vcpu->wq;

Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Christoffer Dall <c.dall@virtualopensystems.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index e35b3a8..c4b4bac 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -365,6 +365,7 @@ struct thash_cb {
 };
 
 struct kvm_vcpu_stat {
+	u32 halt_wakeup;
 };
 
 struct kvm_vcpu_arch {
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index f5104b7..9d80ff8 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1872,21 +1872,6 @@ void kvm_arch_hardware_unsetup(void)
 {
 }
 
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
-{
-	int me;
-	int cpu = vcpu->cpu;
-
-	if (waitqueue_active(&vcpu->wq))
-		wake_up_interruptible(&vcpu->wq);
-
-	me = get_cpu();
-	if (cpu != me && (unsigned) cpu < nr_cpu_ids && cpu_online(cpu))
-		if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
-			smp_send_reschedule(cpu);
-	put_cpu();
-}
-
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
 {
 	return __apic_accept_irq(vcpu, irq->vector);
@@ -1956,6 +1941,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 		(kvm_highest_pending_irq(vcpu) != -1);
 }
 
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
+{
+	return (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests));
+}
+
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 52eb9c1..8893837 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -498,4 +498,10 @@ struct kvm_vcpu_arch {
 #define KVM_MMIO_REG_QPR	0x0040
 #define KVM_MMIO_REG_FQPR	0x0060
 
+#define __KVM_HAVE_ARCH_VCPU_GET_WQ 1
+static inline wait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.wqp;
+}
+
 #endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 00d7e34..b5e9046 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -43,6 +43,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 	       v->requests;
 }
 
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
+{
+	return 1;
+}
+
 int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 {
 	int nr = kvmppc_get_gpr(vcpu, 11);
@@ -588,21 +593,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	return r;
 }
 
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
-{
-	int me;
-	int cpu = vcpu->cpu;
-
-	me = get_cpu();
-	if (waitqueue_active(vcpu->arch.wqp)) {
-		wake_up_interruptible(vcpu->arch.wqp);
-		vcpu->stat.halt_wakeup++;
-	} else if (cpu != me && cpu != -1) {
-		smp_send_reschedule(vcpu->cpu);
-	}
-	put_cpu();
-}
-
 int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
 {
 	if (irq->irq == KVM_INTERRUPT_UNSET) {
@@ -611,6 +601,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
 	}
 
 	kvmppc_core_queue_external(vcpu, irq);
+
 	kvm_vcpu_kick(vcpu);
 
 	return 0;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 217ce44..d30c835 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -423,6 +423,14 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
+{
+	/* kvm common code refers to this, but never calls it */
+	BUG();
+	return 0;
+}
+
+
 static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
 {
 	kvm_s390_vcpu_initial_reset(vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4044ce0..511031d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6403,21 +6403,9 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 		 kvm_cpu_has_interrupt(vcpu));
 }
 
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
 {
-	int me;
-	int cpu = vcpu->cpu;
-
-	if (waitqueue_active(&vcpu->wq)) {
-		wake_up_interruptible(&vcpu->wq);
-		++vcpu->stat.halt_wakeup;
-	}
-
-	me = get_cpu();
-	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
-		if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE)
-			smp_send_reschedule(cpu);
-	put_cpu();
+	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
 }
 
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3a2cea6..5b624e1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -439,6 +439,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			     gfn_t gfn);
 
 void kvm_vcpu_block(struct kvm_vcpu *vcpu);
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
 void kvm_resched(struct kvm_vcpu *vcpu);
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
@@ -507,6 +508,7 @@ int kvm_arch_hardware_setup(void);
 void kvm_arch_hardware_unsetup(void);
 void kvm_arch_check_processor_compat(void *rtn);
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
 
 void kvm_free_physmem(struct kvm *kvm);
 
@@ -522,6 +524,13 @@ static inline void kvm_arch_free_vm(struct kvm *kvm)
 }
 #endif
 
+#ifndef __KVM_HAVE_ARCH_VCPU_GET_WQ
+static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
+{
+	return &vcpu->wq;
+}
+#endif
+
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type);
 void kvm_arch_destroy_vm(struct kvm *kvm);
 void kvm_free_all_assigned_devices(struct kvm *kvm);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a9565e2..7149a2e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1514,6 +1514,28 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 	finish_wait(&vcpu->wq, &wait);
 }
 
+/*
+ * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
+ */
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+	int me;
+	int cpu = vcpu->cpu;
+	wait_queue_head_t *wqp;
+
+	wqp = kvm_arch_vcpu_wq(vcpu);
+	if (waitqueue_active(wqp)) {
+		wake_up_interruptible(wqp);
+		++vcpu->stat.halt_wakeup;
+	}
+
+	me = get_cpu();
+	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
+		if (kvm_arch_vcpu_should_kick(vcpu))
+			smp_send_reschedule(cpu);
+	put_cpu();
+}
+
 void kvm_resched(struct kvm_vcpu *vcpu)
 {
 	if (!need_resched())
-- 
cgit v0.10.2


From 2246f8b56315befa30f3d3d2800e0734c774f70e Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 13 Mar 2012 22:35:01 +0100
Subject: KVM: PPC: Rework wqp conditional code

On PowerPC, we sometimes use a waitqueue per core, not per thread,
so we can't always use the vcpu internal waitqueue.

This code has been generalized by Christoffer Dall recently, but
unfortunately broke compilation for PowerPC. At the time the helper
function is defined, struct kvm_vcpu is not declared yet, so we can't
dereference it.

This patch moves all logic into the generic inline function, at which
time we have all information necessary.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 8893837..20ab5b2 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -498,10 +498,6 @@ struct kvm_vcpu_arch {
 #define KVM_MMIO_REG_QPR	0x0040
 #define KVM_MMIO_REG_FQPR	0x0060
 
-#define __KVM_HAVE_ARCH_VCPU_GET_WQ 1
-static inline wait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.wqp;
-}
+#define __KVM_HAVE_ARCH_WQP
 
 #endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5b624e1..5184817 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -524,12 +524,14 @@ static inline void kvm_arch_free_vm(struct kvm *kvm)
 }
 #endif
 
-#ifndef __KVM_HAVE_ARCH_VCPU_GET_WQ
 static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
 {
+#ifdef __KVM_HAVE_ARCH_WQP
+	return vcpu->arch.wqp;
+#else
 	return &vcpu->wq;
-}
 #endif
+}
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type);
 void kvm_arch_destroy_vm(struct kvm *kvm);
-- 
cgit v0.10.2


From eae3ee7d8a7c59cf63441dedf28674889f5fc477 Mon Sep 17 00:00:00 2001
From: Eric B Munson <emunson@mgebm.net>
Date: Sat, 10 Mar 2012 14:37:25 -0500
Subject: x86: pvclock: Add flag to indicate that a vm was stopped by the host

This flag will be used to check if the vm was stopped by the host when a soft
lockup was detected.  The host will set the flag when it stops the guest.  On
resume, the guest will check this flag if a soft lockup is detected and skip
issuing the warning.

Signed-off-by: Eric B Munson <emunson@mgebm.net>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index 35f2d19..6167fd7 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -40,5 +40,6 @@ struct pvclock_wall_clock {
 } __attribute__((__packed__));
 
 #define PVCLOCK_TSC_STABLE_BIT	(1 << 0)
+#define PVCLOCK_GUEST_STOPPED	(1 << 1)
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_X86_PVCLOCK_ABI_H */
-- 
cgit v0.10.2


From 3b5d56b9317fa7b5407dff1aa7b115bf6cdbd494 Mon Sep 17 00:00:00 2001
From: Eric B Munson <emunson@mgebm.net>
Date: Sat, 10 Mar 2012 14:37:26 -0500
Subject: kvmclock: Add functions to check if the host has stopped the vm

When a host stops or suspends a VM it will set a flag to show this.  The
watchdog will use these functions to determine if a softlockup is real, or the
result of a suspended VM.

Signed-off-by: Eric B Munson <emunson@mgebm.net>
asm-generic changes Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/alpha/include/asm/kvm_para.h b/arch/alpha/include/asm/kvm_para.h
new file mode 100644
index 0000000..14fab8f
--- /dev/null
+++ b/arch/alpha/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/arm/include/asm/kvm_para.h b/arch/arm/include/asm/kvm_para.h
new file mode 100644
index 0000000..14fab8f
--- /dev/null
+++ b/arch/arm/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/avr32/include/asm/kvm_para.h b/arch/avr32/include/asm/kvm_para.h
new file mode 100644
index 0000000..14fab8f
--- /dev/null
+++ b/arch/avr32/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/blackfin/include/asm/kvm_para.h b/arch/blackfin/include/asm/kvm_para.h
new file mode 100644
index 0000000..14fab8f
--- /dev/null
+++ b/arch/blackfin/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/c6x/include/asm/kvm_para.h b/arch/c6x/include/asm/kvm_para.h
new file mode 100644
index 0000000..14fab8f
--- /dev/null
+++ b/arch/c6x/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/frv/include/asm/kvm_para.h b/arch/frv/include/asm/kvm_para.h
new file mode 100644
index 0000000..14fab8f
--- /dev/null
+++ b/arch/frv/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/h8300/include/asm/kvm_para.h b/arch/h8300/include/asm/kvm_para.h
new file mode 100644
index 0000000..14fab8f
--- /dev/null
+++ b/arch/h8300/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/hexagon/include/asm/kvm_para.h b/arch/hexagon/include/asm/kvm_para.h
new file mode 100644
index 0000000..14fab8f
--- /dev/null
+++ b/arch/hexagon/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/ia64/include/asm/kvm_para.h b/arch/ia64/include/asm/kvm_para.h
index 1588aee..2019cb9 100644
--- a/arch/ia64/include/asm/kvm_para.h
+++ b/arch/ia64/include/asm/kvm_para.h
@@ -26,6 +26,11 @@ static inline unsigned int kvm_arch_para_features(void)
 	return 0;
 }
 
+static inline bool kvm_check_and_clear_guest_paused(void)
+{
+	return false;
+}
+
 #endif
 
 #endif
diff --git a/arch/m68k/include/asm/kvm_para.h b/arch/m68k/include/asm/kvm_para.h
new file mode 100644
index 0000000..14fab8f
--- /dev/null
+++ b/arch/m68k/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/microblaze/include/asm/kvm_para.h b/arch/microblaze/include/asm/kvm_para.h
new file mode 100644
index 0000000..14fab8f
--- /dev/null
+++ b/arch/microblaze/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/mips/include/asm/kvm_para.h b/arch/mips/include/asm/kvm_para.h
new file mode 100644
index 0000000..14fab8f
--- /dev/null
+++ b/arch/mips/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/mn10300/include/asm/kvm_para.h b/arch/mn10300/include/asm/kvm_para.h
new file mode 100644
index 0000000..14fab8f
--- /dev/null
+++ b/arch/mn10300/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/openrisc/include/asm/kvm_para.h b/arch/openrisc/include/asm/kvm_para.h
new file mode 100644
index 0000000..14fab8f
--- /dev/null
+++ b/arch/openrisc/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/parisc/include/asm/kvm_para.h b/arch/parisc/include/asm/kvm_para.h
new file mode 100644
index 0000000..14fab8f
--- /dev/null
+++ b/arch/parisc/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 7b754e7..c18916b 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -206,6 +206,11 @@ static inline unsigned int kvm_arch_para_features(void)
 	return r;
 }
 
+static inline bool kvm_check_and_clear_guest_paused(void)
+{
+	return false;
+}
+
 #endif /* __KERNEL__ */
 
 #endif /* __POWERPC_KVM_PARA_H__ */
diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h
index 6964db2..a988329 100644
--- a/arch/s390/include/asm/kvm_para.h
+++ b/arch/s390/include/asm/kvm_para.h
@@ -149,6 +149,11 @@ static inline unsigned int kvm_arch_para_features(void)
 	return 0;
 }
 
+static inline bool kvm_check_and_clear_guest_paused(void)
+{
+	return false;
+}
+
 #endif
 
 #endif /* __S390_KVM_PARA_H */
diff --git a/arch/score/include/asm/kvm_para.h b/arch/score/include/asm/kvm_para.h
new file mode 100644
index 0000000..14fab8f
--- /dev/null
+++ b/arch/score/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/sh/include/asm/kvm_para.h b/arch/sh/include/asm/kvm_para.h
new file mode 100644
index 0000000..14fab8f
--- /dev/null
+++ b/arch/sh/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/sparc/include/asm/kvm_para.h b/arch/sparc/include/asm/kvm_para.h
new file mode 100644
index 0000000..14fab8f
--- /dev/null
+++ b/arch/sparc/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/tile/include/asm/kvm_para.h b/arch/tile/include/asm/kvm_para.h
new file mode 100644
index 0000000..14fab8f
--- /dev/null
+++ b/arch/tile/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/um/include/asm/kvm_para.h b/arch/um/include/asm/kvm_para.h
new file mode 100644
index 0000000..14fab8f
--- /dev/null
+++ b/arch/um/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/unicore32/include/asm/kvm_para.h b/arch/unicore32/include/asm/kvm_para.h
new file mode 100644
index 0000000..14fab8f
--- /dev/null
+++ b/arch/unicore32/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 734c376..99c4bbe 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -95,6 +95,14 @@ struct kvm_vcpu_pv_apf_data {
 extern void kvmclock_init(void);
 extern int kvm_register_clock(char *txt);
 
+#ifdef CONFIG_KVM_CLOCK
+bool kvm_check_and_clear_guest_paused(void);
+#else
+static inline bool kvm_check_and_clear_guest_paused(void)
+{
+	return false;
+}
+#endif /* CONFIG_KVMCLOCK */
 
 /* This instruction is vmcall.  On non-VT architectures, it will generate a
  * trap that we will then rewrite to the appropriate instruction.
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f8492da6..4ba090c 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -22,6 +22,7 @@
 #include <asm/msr.h>
 #include <asm/apic.h>
 #include <linux/percpu.h>
+#include <linux/hardirq.h>
 
 #include <asm/x86_init.h>
 #include <asm/reboot.h>
@@ -114,6 +115,26 @@ static void kvm_get_preset_lpj(void)
 	preset_lpj = lpj;
 }
 
+bool kvm_check_and_clear_guest_paused(void)
+{
+	bool ret = false;
+	struct pvclock_vcpu_time_info *src;
+
+	/*
+	 * per_cpu() is safe here because this function is only called from
+	 * timer functions where preemption is already disabled.
+	 */
+	WARN_ON(!in_atomic());
+	src = &__get_cpu_var(hv_clock);
+	if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
+		__this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED);
+		ret = true;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_check_and_clear_guest_paused);
+
 static struct clocksource kvm_clock = {
 	.name = "kvm-clock",
 	.read = kvm_clock_get_cycles,
diff --git a/arch/xtensa/include/asm/kvm_para.h b/arch/xtensa/include/asm/kvm_para.h
new file mode 100644
index 0000000..14fab8f
--- /dev/null
+++ b/arch/xtensa/include/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/include/asm-generic/kvm_para.h b/include/asm-generic/kvm_para.h
new file mode 100644
index 0000000..05ef7e7
--- /dev/null
+++ b/include/asm-generic/kvm_para.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_GENERIC_KVM_PARA_H
+#define _ASM_GENERIC_KVM_PARA_H
+
+
+/*
+ * This function is used by architectures that support kvm to avoid issuing
+ * false soft lockup messages.
+ */
+static inline bool kvm_check_and_clear_guest_paused(void)
+{
+	return false;
+}
+
+#endif
-- 
cgit v0.10.2


From 1c0b28c2a46d98cd258d96b8c222144b22876c46 Mon Sep 17 00:00:00 2001
From: Eric B Munson <emunson@mgebm.net>
Date: Sat, 10 Mar 2012 14:37:27 -0500
Subject: KVM: x86: Add ioctl for KVM_KVMCLOCK_CTRL

Now that we have a flag that will tell the guest it was suspended, create an
interface for that communication using a KVM ioctl.

Signed-off-by: Eric B Munson <emunson@mgebm.net>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 6386f8c..81ff39f 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1669,6 +1669,26 @@ at the memory location pointed to by "addr".
 The list of registers accessible using this interface is identical to the
 list in 4.64.
 
+4.70 KVM_KVMCLOCK_CTRL
+
+Capability: KVM_CAP_KVMCLOCK_CTRL
+Architectures: Any that implement pvclocks (currently x86 only)
+Type: vcpu ioctl
+Parameters: None
+Returns: 0 on success, -1 on error
+
+This signals to the host kernel that the specified guest is being paused by
+userspace.  The host will set a flag in the pvclock structure that is checked
+from the soft lockup watchdog.  The flag is part of the pvclock structure that
+is shared between guest and host, specifically the second bit of the flags
+field of the pvclock_vcpu_time_info structure.  It will be set exclusively by
+the host and read/cleared exclusively by the guest.  The guest operation of
+checking and clearing the flag must an atomic operation so
+load-link/store-conditional, or equivalent must be used.  There are two cases
+where the guest will clear the flag: when the soft lockup watchdog timer resets
+itself or when a soft lockup is detected.  This ioctl can be called any time
+after pausing the vcpu, but before it is resumed.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt
index 5031780..96b41bd 100644
--- a/Documentation/virtual/kvm/msr.txt
+++ b/Documentation/virtual/kvm/msr.txt
@@ -109,6 +109,10 @@ MSR_KVM_SYSTEM_TIME_NEW:  0x4b564d01
 		     0      |	   24      | multiple cpus are guaranteed to
 			    |		   | be monotonic
 		-------------------------------------------------------------
+			    |		   | guest vcpu has been paused by
+		     1	    |	  N/A	   | the host
+			    |		   | See 4.70 in api.txt
+		-------------------------------------------------------------
 
 	Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid
 	leaf prior to usage.
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 511031d..99b7380 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2147,6 +2147,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_ASYNC_PF:
 	case KVM_CAP_GET_TSC_KHZ:
 	case KVM_CAP_PCI_2_3:
+	case KVM_CAP_KVMCLOCK_CTRL:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -2597,6 +2598,23 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
 	return r;
 }
 
+/*
+ * kvm_set_guest_paused() indicates to the guest kernel that it has been
+ * stopped by the hypervisor.  This function will be called from the host only.
+ * EINVAL is returned when the host attempts to set the flag for a guest that
+ * does not support pv clocks.
+ */
+static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
+{
+	struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock;
+	if (!vcpu->arch.time_page)
+		return -EINVAL;
+	src->flags |= PVCLOCK_GUEST_STOPPED;
+	mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT);
+	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+	return 0;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
@@ -2873,6 +2891,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = vcpu->arch.virtual_tsc_khz;
 		goto out;
 	}
+	case KVM_KVMCLOCK_CTRL: {
+		r = kvm_set_guest_paused(vcpu);
+		goto out;
+	}
 	default:
 		r = -EINVAL;
 	}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 6c322a9..7a9dd4b 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -589,6 +589,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_S390_UCONTROL 73
 #define KVM_CAP_SYNC_REGS 74
 #define KVM_CAP_PCI_2_3 75
+#define KVM_CAP_KVMCLOCK_CTRL 76
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -859,6 +860,8 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_ONE_REG */
 #define KVM_GET_ONE_REG		  _IOW(KVMIO,  0xab, struct kvm_one_reg)
 #define KVM_SET_ONE_REG		  _IOW(KVMIO,  0xac, struct kvm_one_reg)
+/* VM is being stopped by host */
+#define KVM_KVMCLOCK_CTRL	  _IO(KVMIO,   0xad)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
-- 
cgit v0.10.2


From 5d1c0f4a80a6df73395fb3fc2c302510f8f09d36 Mon Sep 17 00:00:00 2001
From: Eric B Munson <emunson@mgebm.net>
Date: Sat, 10 Mar 2012 14:37:28 -0500
Subject: watchdog: add check for suspended vm in softlockup detector

A suspended VM can cause spurious soft lockup warnings.  To avoid these, the
watchdog now checks if the kernel knows it was stopped by the host and skips
the warning if so.  When the watchdog is reset successfully, clear the guest
paused flag.

Signed-off-by: Eric B Munson <emunson@mgebm.net>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index df30ee0..e5e1d85 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,6 +24,7 @@
 #include <linux/sysctl.h>
 
 #include <asm/irq_regs.h>
+#include <linux/kvm_para.h>
 #include <linux/perf_event.h>
 
 int watchdog_enabled = 1;
@@ -280,6 +281,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 			__this_cpu_write(softlockup_touch_sync, false);
 			sched_clock_tick();
 		}
+
+		/* Clear the guest paused flag on watchdog reset */
+		kvm_check_and_clear_guest_paused();
 		__touch_watchdog();
 		return HRTIMER_RESTART;
 	}
@@ -292,6 +296,14 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	 */
 	duration = is_softlockup(touch_ts);
 	if (unlikely(duration)) {
+		/*
+		 * If a virtual machine is stopped by the host it can look to
+		 * the watchdog like a soft lockup, check to see if the host
+		 * stopped the vm before we issue the warning
+		 */
+		if (kvm_check_and_clear_guest_paused())
+			return HRTIMER_RESTART;
+
 		/* only warn once */
 		if (__this_cpu_read(soft_watchdog_warn) == true)
 			return HRTIMER_RESTART;
-- 
cgit v0.10.2


From 8c84780df909433777ea9463b8350bc0aab34940 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Wed, 14 Mar 2012 17:58:48 -0300
Subject: KVM: fix kvm_vcpu_kick build failure on S390

S390's kvm_vcpu_stat does not contain halt_wakeup member.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7149a2e..a612bc8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1514,6 +1514,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 	finish_wait(&vcpu->wq, &wait);
 }
 
+#ifndef CONFIG_S390
 /*
  * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
  */
@@ -1535,6 +1536,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 			smp_send_reschedule(cpu);
 	put_cpu();
 }
+#endif /* !CONFIG_S390 */
 
 void kvm_resched(struct kvm_vcpu *vcpu)
 {
-- 
cgit v0.10.2


From 248997095d652576f1213028a95ca5fff85d089f Mon Sep 17 00:00:00 2001
From: Eric B Munson <emunson@mgebm.net>
Date: Thu, 15 Mar 2012 18:16:49 -0400
Subject: kvmclock: remove unneeded EXPORT macro

check_and_clear_guest_paused does not need to be exported as it isn't used
by any modules, remove the export.

Signed-off-by: Eric B Munson <emunson@mgebm.net>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 4ba090c..086eb58 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -133,7 +133,6 @@ bool kvm_check_and_clear_guest_paused(void)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(kvm_check_and_clear_guest_paused);
 
 static struct clocksource kvm_clock = {
 	.name = "kvm-clock",
-- 
cgit v0.10.2


From a0ed46073c14f66dbf0707aaa7588b78da83d7c6 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Thu, 1 Mar 2012 19:31:22 +0900
Subject: KVM: MMU: Split the main body of rmap_write_protect() off from others

We will use this in the following patch to implement another function
which needs to write protect pages using the rmap information.

Note that there is a small change in debug printing for large pages:
we do not differentiate them from others to avoid duplicating code.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4cb1642..c8b5694 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1010,42 +1010,43 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 		rmap_remove(kvm, sptep);
 }
 
-int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
-			       struct kvm_memory_slot *slot)
+static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
 {
-	unsigned long *rmapp;
-	u64 *spte;
-	int i, write_protected = 0;
+	u64 *spte = NULL;
+	int write_protected = 0;
 
-	rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot);
-	spte = rmap_next(rmapp, NULL);
-	while (spte) {
+	while ((spte = rmap_next(rmapp, spte))) {
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
-		if (is_writable_pte(*spte)) {
+
+		if (!is_writable_pte(*spte))
+			continue;
+
+		if (level == PT_PAGE_TABLE_LEVEL) {
 			mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
-			write_protected = 1;
+		} else {
+			BUG_ON(!is_large_pte(*spte));
+			drop_spte(kvm, spte);
+			--kvm->stat.lpages;
+			spte = NULL;
 		}
-		spte = rmap_next(rmapp, spte);
+
+		write_protected = 1;
 	}
 
-	/* check for huge page mappings */
-	for (i = PT_DIRECTORY_LEVEL;
+	return write_protected;
+}
+
+int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
+			       struct kvm_memory_slot *slot)
+{
+	unsigned long *rmapp;
+	int i, write_protected = 0;
+
+	for (i = PT_PAGE_TABLE_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 		rmapp = __gfn_to_rmap(gfn, i, slot);
-		spte = rmap_next(rmapp, NULL);
-		while (spte) {
-			BUG_ON(!(*spte & PT_PRESENT_MASK));
-			BUG_ON(!is_large_pte(*spte));
-			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
-			if (is_writable_pte(*spte)) {
-				drop_spte(kvm, spte);
-				--kvm->stat.lpages;
-				spte = NULL;
-				write_protected = 1;
-			}
-			spte = rmap_next(rmapp, spte);
-		}
+		write_protected |= __rmap_write_protect(kvm, rmapp, i);
 	}
 
 	return write_protected;
-- 
cgit v0.10.2


From 5dc99b2380d59b8aeafa98791f92b96400ed3187 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Thu, 1 Mar 2012 19:32:16 +0900
Subject: KVM: Avoid checking huge page mappings in get_dirty_log()

Dropped such mappings when we enabled dirty logging and we will never
create new ones until we stop the logging.

For this we introduce a new function which can be used to write protect
a range of PT level pages: although we do not need to care about a range
of pages at this point, the following patch will need this feature to
optimize the write protection of many pages.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e216ba0..f624ca7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -712,8 +712,9 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
-int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
-			       struct kvm_memory_slot *slot);
+void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+				     struct kvm_memory_slot *slot,
+				     gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c8b5694..dc5f245 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1037,27 +1037,47 @@ static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level
 	return write_protected;
 }
 
-int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
-			       struct kvm_memory_slot *slot)
+/**
+ * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
+ * @kvm: kvm instance
+ * @slot: slot to protect
+ * @gfn_offset: start of the BITS_PER_LONG pages we care about
+ * @mask: indicates which pages we should protect
+ *
+ * Used when we do not need to care about huge page mappings: e.g. during dirty
+ * logging we do not have any such mappings.
+ */
+void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+				     struct kvm_memory_slot *slot,
+				     gfn_t gfn_offset, unsigned long mask)
 {
 	unsigned long *rmapp;
-	int i, write_protected = 0;
 
-	for (i = PT_PAGE_TABLE_LEVEL;
-	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-		rmapp = __gfn_to_rmap(gfn, i, slot);
-		write_protected |= __rmap_write_protect(kvm, rmapp, i);
-	}
+	while (mask) {
+		rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
+		__rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL);
 
-	return write_protected;
+		/* clear the first set bit */
+		mask &= mask - 1;
+	}
 }
 
 static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
 	struct kvm_memory_slot *slot;
+	unsigned long *rmapp;
+	int i;
+	int write_protected = 0;
 
 	slot = gfn_to_memslot(kvm, gfn);
-	return kvm_mmu_rmap_write_protect(kvm, gfn, slot);
+
+	for (i = PT_PAGE_TABLE_LEVEL;
+	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+		rmapp = __gfn_to_rmap(gfn, i, slot);
+		write_protected |= __rmap_write_protect(kvm, rmapp, i);
+	}
+
+	return write_protected;
 }
 
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 99b7380..813ebf1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3095,13 +3095,11 @@ static void write_protect_slot(struct kvm *kvm,
 
 	/* Not many dirty pages compared to # of shadow pages. */
 	if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
-		unsigned long gfn_offset;
+		gfn_t offset;
 
-		for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
-			unsigned long gfn = memslot->base_gfn + gfn_offset;
+		for_each_set_bit(offset, dirty_bitmap, memslot->npages)
+			kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, 1);
 
-			kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
-		}
 		kvm_flush_remote_tlbs(kvm);
 	} else
 		kvm_mmu_slot_remove_write_access(kvm, memslot->id);
-- 
cgit v0.10.2


From 60c34612b70711fb14a8dcbc6a79509902450d2e Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <takuya.yoshikawa@gmail.com>
Date: Sat, 3 Mar 2012 14:21:48 +0900
Subject: KVM: Switch to srcu-less get_dirty_log()

We have seen some problems of the current implementation of
get_dirty_log() which uses synchronize_srcu_expedited() for updating
dirty bitmaps; e.g. it is noticeable that this sometimes gives us ms
order of latency when we use VGA displays.

Furthermore the recent discussion on the following thread
    "srcu: Implement call_srcu()"
    http://lkml.org/lkml/2012/1/31/211
also motivated us to implement get_dirty_log() without SRCU.

This patch achieves this goal without sacrificing the performance of
both VGA and live migration: in practice the new code is much faster
than the old one unless we have too many dirty pages.

Implementation:

The key part of the implementation is the use of xchg() operation for
clearing dirty bits atomically.  Since this allows us to update only
BITS_PER_LONG pages at once, we need to iterate over the dirty bitmap
until every dirty bit is cleared again for the next call.

Although some people may worry about the problem of using the atomic
memory instruction many times to the concurrently accessible bitmap,
it is usually accessed with mmu_lock held and we rarely see concurrent
accesses: so what we need to care about is the pure xchg() overheads.

Another point to note is that we do not use for_each_set_bit() to check
which ones in each BITS_PER_LONG pages are actually dirty.  Instead we
simply use __ffs() in a loop.  This is much faster than repeatedly call
find_next_bit().

Performance:

The dirty-log-perf unit test showed nice improvements, some times faster
than before, except for some extreme cases; for such cases the speed of
getting dirty page information is much faster than we process it in the
userspace.

For real workloads, both VGA and live migration, we have observed pure
improvements: when the guest was reading a file during live migration,
we originally saw a few ms of latency, but with the new method the
latency was less than 200us.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 813ebf1..0d9a578 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3067,55 +3067,32 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
 }
 
 /**
- * write_protect_slot - write protect a slot for dirty logging
- * @kvm: the kvm instance
- * @memslot: the slot we protect
- * @dirty_bitmap: the bitmap indicating which pages are dirty
- * @nr_dirty_pages: the number of dirty pages
+ * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
+ * @kvm: kvm instance
+ * @log: slot id and address to which we copy the log
  *
- * We have two ways to find all sptes to protect:
- * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and
- *    checks ones that have a spte mapping a page in the slot.
- * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap.
+ * We need to keep it in mind that VCPU threads can write to the bitmap
+ * concurrently.  So, to avoid losing data, we keep the following order for
+ * each bit:
  *
- * Generally speaking, if there are not so many dirty pages compared to the
- * number of shadow pages, we should use the latter.
+ *   1. Take a snapshot of the bit and clear it if needed.
+ *   2. Write protect the corresponding page.
+ *   3. Flush TLB's if needed.
+ *   4. Copy the snapshot to the userspace.
  *
- * Note that letting others write into a page marked dirty in the old bitmap
- * by using the remaining tlb entry is not a problem.  That page will become
- * write protected again when we flush the tlb and then be reported dirty to
- * the user space by copying the old bitmap.
+ * Between 2 and 3, the guest may write to the page using the remaining TLB
+ * entry.  This is not a problem because the page will be reported dirty at
+ * step 4 using the snapshot taken before and step 3 ensures that successive
+ * writes will be logged for the next call.
  */
-static void write_protect_slot(struct kvm *kvm,
-			       struct kvm_memory_slot *memslot,
-			       unsigned long *dirty_bitmap,
-			       unsigned long nr_dirty_pages)
-{
-	spin_lock(&kvm->mmu_lock);
-
-	/* Not many dirty pages compared to # of shadow pages. */
-	if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
-		gfn_t offset;
-
-		for_each_set_bit(offset, dirty_bitmap, memslot->npages)
-			kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, 1);
-
-		kvm_flush_remote_tlbs(kvm);
-	} else
-		kvm_mmu_slot_remove_write_access(kvm, memslot->id);
-
-	spin_unlock(&kvm->mmu_lock);
-}
-
-/*
- * Get (and clear) the dirty memory log for a memory slot.
- */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
-				      struct kvm_dirty_log *log)
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 {
 	int r;
 	struct kvm_memory_slot *memslot;
-	unsigned long n, nr_dirty_pages;
+	unsigned long n, i;
+	unsigned long *dirty_bitmap;
+	unsigned long *dirty_bitmap_buffer;
+	bool is_dirty = false;
 
 	mutex_lock(&kvm->slots_lock);
 
@@ -3124,49 +3101,42 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 		goto out;
 
 	memslot = id_to_memslot(kvm->memslots, log->slot);
+
+	dirty_bitmap = memslot->dirty_bitmap;
 	r = -ENOENT;
-	if (!memslot->dirty_bitmap)
+	if (!dirty_bitmap)
 		goto out;
 
 	n = kvm_dirty_bitmap_bytes(memslot);
-	nr_dirty_pages = memslot->nr_dirty_pages;
 
-	/* If nothing is dirty, don't bother messing with page tables. */
-	if (nr_dirty_pages) {
-		struct kvm_memslots *slots, *old_slots;
-		unsigned long *dirty_bitmap, *dirty_bitmap_head;
+	dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
+	memset(dirty_bitmap_buffer, 0, n);
 
-		dirty_bitmap = memslot->dirty_bitmap;
-		dirty_bitmap_head = memslot->dirty_bitmap_head;
-		if (dirty_bitmap == dirty_bitmap_head)
-			dirty_bitmap_head += n / sizeof(long);
-		memset(dirty_bitmap_head, 0, n);
+	spin_lock(&kvm->mmu_lock);
 
-		r = -ENOMEM;
-		slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL);
-		if (!slots)
-			goto out;
+	for (i = 0; i < n / sizeof(long); i++) {
+		unsigned long mask;
+		gfn_t offset;
 
-		memslot = id_to_memslot(slots, log->slot);
-		memslot->nr_dirty_pages = 0;
-		memslot->dirty_bitmap = dirty_bitmap_head;
-		update_memslots(slots, NULL);
+		if (!dirty_bitmap[i])
+			continue;
 
-		old_slots = kvm->memslots;
-		rcu_assign_pointer(kvm->memslots, slots);
-		synchronize_srcu_expedited(&kvm->srcu);
-		kfree(old_slots);
+		is_dirty = true;
 
-		write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages);
+		mask = xchg(&dirty_bitmap[i], 0);
+		dirty_bitmap_buffer[i] = mask;
 
-		r = -EFAULT;
-		if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
-			goto out;
-	} else {
-		r = -EFAULT;
-		if (clear_user(log->dirty_bitmap, n))
-			goto out;
+		offset = i * BITS_PER_LONG;
+		kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
 	}
+	if (is_dirty)
+		kvm_flush_remote_tlbs(kvm);
+
+	spin_unlock(&kvm->mmu_lock);
+
+	r = -EFAULT;
+	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
+		goto out;
 
 	r = 0;
 out:
-- 
cgit v0.10.2


From 93474b25af1eabf5b14743793156e8d307bfcd6b Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Thu, 1 Mar 2012 19:34:45 +0900
Subject: KVM: Remove unused dirty_bitmap_head and nr_dirty_pages

Now that we do neither double buffering nor heuristic selection of the
write protection method these are not needed anymore.

Note: some drivers have their own implementation of set_bit_le() and
making it generic needs a bit of work; so we use test_and_set_bit_le()
and will later replace it with generic set_bit_le().

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5184817..49c2f2fd 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -179,8 +179,6 @@ struct kvm_memory_slot {
 	unsigned long flags;
 	unsigned long *rmap;
 	unsigned long *dirty_bitmap;
-	unsigned long *dirty_bitmap_head;
-	unsigned long nr_dirty_pages;
 	struct kvm_arch_memory_slot arch;
 	unsigned long userspace_addr;
 	int user_alloc;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a612bc8..6bd34a6 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -522,12 +522,11 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 		return;
 
 	if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
-		vfree(memslot->dirty_bitmap_head);
+		vfree(memslot->dirty_bitmap);
 	else
-		kfree(memslot->dirty_bitmap_head);
+		kfree(memslot->dirty_bitmap);
 
 	memslot->dirty_bitmap = NULL;
-	memslot->dirty_bitmap_head = NULL;
 }
 
 /*
@@ -611,8 +610,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
 
 /*
  * Allocation size is twice as large as the actual dirty bitmap size.
- * This makes it possible to do double buffering: see x86's
- * kvm_vm_ioctl_get_dirty_log().
+ * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed.
  */
 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
 {
@@ -627,8 +625,6 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
 	if (!memslot->dirty_bitmap)
 		return -ENOMEM;
 
-	memslot->dirty_bitmap_head = memslot->dirty_bitmap;
-	memslot->nr_dirty_pages = 0;
 #endif /* !CONFIG_S390 */
 	return 0;
 }
@@ -1476,8 +1472,8 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
 	if (memslot && memslot->dirty_bitmap) {
 		unsigned long rel_gfn = gfn - memslot->base_gfn;
 
-		if (!test_and_set_bit_le(rel_gfn, memslot->dirty_bitmap))
-			memslot->nr_dirty_pages++;
+		/* TODO: introduce set_bit_le() and use it */
+		test_and_set_bit_le(rel_gfn, memslot->dirty_bitmap);
 	}
 }
 
-- 
cgit v0.10.2


From 52b066fa4e9cbfe45243dd4259b053d3fa7e21f1 Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 20 Dec 2011 15:34:12 +0000
Subject: powerpc/booke: Set CPU_FTR_DEBUG_LVL_EXC on 32-bit

Currently 32-bit only cares about this for choice of exception
vector, which is done in core-specific code.  However, KVM will
want to distinguish as well.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index b9219e9..3be45cd 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -376,7 +376,8 @@ extern const char *powerpc_base_platform;
 #define CPU_FTRS_47X	(CPU_FTRS_440x6)
 #define CPU_FTRS_E200	(CPU_FTR_USE_TB | CPU_FTR_SPE_COMP | \
 	    CPU_FTR_NODSISRALIGN | CPU_FTR_COHERENT_ICACHE | \
-	    CPU_FTR_UNIFIED_ID_CACHE | CPU_FTR_NOEXECUTE)
+	    CPU_FTR_UNIFIED_ID_CACHE | CPU_FTR_NOEXECUTE | \
+	    CPU_FTR_DEBUG_LVL_EXC)
 #define CPU_FTRS_E500	(CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB | \
 	    CPU_FTR_SPE_COMP | CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_NODSISRALIGN | \
 	    CPU_FTR_NOEXECUTE)
@@ -385,7 +386,7 @@ extern const char *powerpc_base_platform;
 	    CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE)
 #define CPU_FTRS_E500MC	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | \
 	    CPU_FTR_L2CSR | CPU_FTR_LWSYNC | CPU_FTR_NOEXECUTE | \
-	    CPU_FTR_DBELL)
+	    CPU_FTR_DBELL | CPU_FTR_DEBUG_LVL_EXC)
 #define CPU_FTRS_E5500	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | \
 	    CPU_FTR_L2CSR | CPU_FTR_LWSYNC | CPU_FTR_NOEXECUTE | \
 	    CPU_FTR_DBELL | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
-- 
cgit v0.10.2


From 06aae86799c1b37f216371e05a1eacb2188bee9d Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 20 Dec 2011 15:34:14 +0000
Subject: powerpc/e500: split CPU_FTRS_ALWAYS/CPU_FTRS_POSSIBLE

Split e500 (v1/v2) and e500mc/e5500 to allow optimization of feature
checks that differ between the two.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index 3be45cd..7108a9c 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -487,8 +487,10 @@ enum {
 	    CPU_FTRS_E200 |
 #endif
 #ifdef CONFIG_E500
-	    CPU_FTRS_E500 | CPU_FTRS_E500_2 | CPU_FTRS_E500MC |
-	    CPU_FTRS_E5500 | CPU_FTRS_E6500 |
+	    CPU_FTRS_E500 | CPU_FTRS_E500_2 |
+#endif
+#ifdef CONFIG_PPC_E500MC
+	    CPU_FTRS_E500MC | CPU_FTRS_E5500 | CPU_FTRS_E6500 |
 #endif
 	    0,
 };
@@ -532,8 +534,10 @@ enum {
 	    CPU_FTRS_E200 &
 #endif
 #ifdef CONFIG_E500
-	    CPU_FTRS_E500 & CPU_FTRS_E500_2 & CPU_FTRS_E500MC &
-	    CPU_FTRS_E5500 & CPU_FTRS_E6500 &
+	    CPU_FTRS_E500 & CPU_FTRS_E500_2 &
+#endif
+#ifdef CONFIG_PPC_E500MC
+	    CPU_FTRS_E500MC & CPU_FTRS_E5500 & CPU_FTRS_E6500 &
 #endif
 	    CPU_FTRS_POSSIBLE,
 };
-- 
cgit v0.10.2


From 043cc4d724da6bb9e4f417c735accec58dfa40bf Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 20 Dec 2011 15:34:20 +0000
Subject: KVM: PPC: factor out lpid allocator from book3s_64_mmu_hv

We'll use it on e500mc as well.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index aa795cc..046041f 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -452,4 +452,7 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
 
 #define INS_DCBZ			0x7c0007ec
 
+/* LPIDs we support with this build -- runtime limit may be lower */
+#define KVMPPC_NR_LPIDS			(LPID_RSVD + 1)
+
 #endif /* __ASM_KVM_BOOK3S_H__ */
diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h
index a90e091..b7cd335 100644
--- a/arch/powerpc/include/asm/kvm_booke.h
+++ b/arch/powerpc/include/asm/kvm_booke.h
@@ -23,6 +23,9 @@
 #include <linux/types.h>
 #include <linux/kvm_host.h>
 
+/* LPIDs we support with this build -- runtime limit may be lower */
+#define KVMPPC_NR_LPIDS                        64
+
 static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
 {
 	vcpu->arch.gpr[num] = val;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 9d6dee0..731e920 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -204,4 +204,9 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
 			     struct kvm_dirty_tlb *cfg);
 
+long kvmppc_alloc_lpid(void);
+void kvmppc_claim_lpid(long lpid);
+void kvmppc_free_lpid(long lpid);
+void kvmppc_init_lpid(unsigned long nr_lpids);
+
 #endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index ddc485a..d031ce1 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -36,13 +36,11 @@
 
 /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
 #define MAX_LPID_970	63
-#define NR_LPIDS	(LPID_RSVD + 1)
-unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)];
 
 long kvmppc_alloc_hpt(struct kvm *kvm)
 {
 	unsigned long hpt;
-	unsigned long lpid;
+	long lpid;
 	struct revmap_entry *rev;
 	struct kvmppc_linear_info *li;
 
@@ -72,14 +70,9 @@ long kvmppc_alloc_hpt(struct kvm *kvm)
 	}
 	kvm->arch.revmap = rev;
 
-	/* Allocate the guest's logical partition ID */
-	do {
-		lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS);
-		if (lpid >= NR_LPIDS) {
-			pr_err("kvm_alloc_hpt: No LPIDs free\n");
-			goto out_freeboth;
-		}
-	} while (test_and_set_bit(lpid, lpid_inuse));
+	lpid = kvmppc_alloc_lpid();
+	if (lpid < 0)
+		goto out_freeboth;
 
 	kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
 	kvm->arch.lpid = lpid;
@@ -96,7 +89,7 @@ long kvmppc_alloc_hpt(struct kvm *kvm)
 
 void kvmppc_free_hpt(struct kvm *kvm)
 {
-	clear_bit(kvm->arch.lpid, lpid_inuse);
+	kvmppc_free_lpid(kvm->arch.lpid);
 	vfree(kvm->arch.revmap);
 	if (kvm->arch.hpt_li)
 		kvm_release_hpt(kvm->arch.hpt_li);
@@ -171,8 +164,7 @@ int kvmppc_mmu_hv_init(void)
 	if (!cpu_has_feature(CPU_FTR_HVMODE))
 		return -EINVAL;
 
-	memset(lpid_inuse, 0, sizeof(lpid_inuse));
-
+	/* POWER7 has 10-bit LPIDs, PPC970 and e500mc have 6-bit LPIDs */
 	if (cpu_has_feature(CPU_FTR_ARCH_206)) {
 		host_lpid = mfspr(SPRN_LPID);	/* POWER7 */
 		rsvd_lpid = LPID_RSVD;
@@ -181,9 +173,11 @@ int kvmppc_mmu_hv_init(void)
 		rsvd_lpid = MAX_LPID_970;
 	}
 
-	set_bit(host_lpid, lpid_inuse);
+	kvmppc_init_lpid(rsvd_lpid + 1);
+
+	kvmppc_claim_lpid(host_lpid);
 	/* rsvd_lpid is reserved for use in partition switching */
-	set_bit(rsvd_lpid, lpid_inuse);
+	kvmppc_claim_lpid(rsvd_lpid);
 
 	return 0;
 }
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index b5e9046..cd53e08 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -799,6 +799,40 @@ out:
 	return r;
 }
 
+static unsigned long lpid_inuse[BITS_TO_LONGS(KVMPPC_NR_LPIDS)];
+static unsigned long nr_lpids;
+
+long kvmppc_alloc_lpid(void)
+{
+	long lpid;
+
+	do {
+		lpid = find_first_zero_bit(lpid_inuse, KVMPPC_NR_LPIDS);
+		if (lpid >= nr_lpids) {
+			pr_err("%s: No LPIDs free\n", __func__);
+			return -ENOMEM;
+		}
+	} while (test_and_set_bit(lpid, lpid_inuse));
+
+	return lpid;
+}
+
+void kvmppc_claim_lpid(long lpid)
+{
+	set_bit(lpid, lpid_inuse);
+}
+
+void kvmppc_free_lpid(long lpid)
+{
+	clear_bit(lpid, lpid_inuse);
+}
+
+void kvmppc_init_lpid(unsigned long nr_lpids_param)
+{
+	nr_lpids = min_t(unsigned long, KVMPPC_NR_LPIDS, nr_lpids_param);
+	memset(lpid_inuse, 0, sizeof(lpid_inuse));
+}
+
 int kvm_arch_init(void *opaque)
 {
 	return 0;
-- 
cgit v0.10.2


From 94fa9d9927627a948cef3eff7ebd228dcab5a316 Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 20 Dec 2011 15:34:22 +0000
Subject: KVM: PPC: booke: add booke-level vcpu load/put

This gives us a place to put load/put actions that correspond to
code that is booke-specific but not specific to a particular core.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 7b612a7..879a1a7 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -29,15 +29,18 @@
 #include <asm/kvm_ppc.h>
 
 #include "44x_tlb.h"
+#include "booke.h"
 
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+	kvmppc_booke_vcpu_load(vcpu, cpu);
 	kvmppc_44x_tlb_load(vcpu);
 }
 
 void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	kvmppc_44x_tlb_put(vcpu);
+	kvmppc_booke_vcpu_put(vcpu);
 }
 
 int kvmppc_core_check_processor_compat(void)
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index ee9e1ee..a2456c7 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -968,6 +968,14 @@ void kvmppc_decrementer_func(unsigned long data)
 	kvmppc_set_tsr_bits(vcpu, TSR_DIS);
 }
 
+void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+}
+
+void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu)
+{
+}
+
 int __init kvmppc_booke_init(void)
 {
 	unsigned long ivor[16];
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index 2fe2027..05d1d99 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -71,4 +71,7 @@ void kvmppc_save_guest_spe(struct kvm_vcpu *vcpu);
 /* high-level function, manages flags, host state */
 void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu);
 
+void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu);
+
 #endif /* __KVM_BOOKE_H__ */
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index ddcd896..2d5fe04 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -36,6 +36,7 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
 
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+	kvmppc_booke_vcpu_load(vcpu, cpu);
 	kvmppc_e500_tlb_load(vcpu, cpu);
 }
 
@@ -47,6 +48,8 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.shadow_msr & MSR_SPE)
 		kvmppc_vcpu_disable_spe(vcpu);
 #endif
+
+	kvmppc_booke_vcpu_put(vcpu);
 }
 
 int kvmppc_core_check_processor_compat(void)
-- 
cgit v0.10.2


From fafd68327858bf30c846d38c7ea144f0827f552e Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 20 Dec 2011 15:34:26 +0000
Subject: KVM: PPC: booke: Move vm core init/destroy out of booke.c

e500mc will want to do lpid allocation/deallocation here.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 879a1a7..50e7dbc 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -163,6 +163,15 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 	kmem_cache_free(kvm_vcpu_cache, vcpu_44x);
 }
 
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+	return 0;
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+}
+
 static int __init kvmppc_44x_init(void)
 {
 	int r;
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index a2456c7..2ee9bae 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -932,15 +932,6 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
 {
 }
 
-int kvmppc_core_init_vm(struct kvm *kvm)
-{
-	return 0;
-}
-
-void kvmppc_core_destroy_vm(struct kvm *kvm)
-{
-}
-
 void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
 {
 	vcpu->arch.tcr = new_tcr;
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index 2d5fe04..ac6c9ae 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -226,6 +226,15 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 	kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
 }
 
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+	return 0;
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+}
+
 static int __init kvmppc_e500_init(void)
 {
 	int r, i;
-- 
cgit v0.10.2


From 29a5a6f9102aed97a06aa984cc294e0e603b3a79 Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 20 Dec 2011 15:34:29 +0000
Subject: KVM: PPC: e500: rename e500_tlb.h to e500.h

This is in preparation for merging in the contents of
arch/powerpc/include/asm/kvm_e500.h.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index ac6c9ae..5c450ba 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -24,7 +24,7 @@
 #include <asm/kvm_ppc.h>
 
 #include "booke.h"
-#include "e500_tlb.h"
+#include "e500.h"
 
 void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
new file mode 100644
index 0000000..02ecde2
--- /dev/null
+++ b/arch/powerpc/kvm/e500.h
@@ -0,0 +1,174 @@
+/*
+ * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Yu Liu, yu.liu@freescale.com
+ *
+ * Description:
+ * This file is based on arch/powerpc/kvm/44x_tlb.h,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef KVM_E500_H
+#define KVM_E500_H
+
+#include <linux/kvm_host.h>
+#include <asm/mmu-book3e.h>
+#include <asm/tlb.h>
+#include <asm/kvm_e500.h>
+
+/* This geometry is the legacy default -- can be overridden by userspace */
+#define KVM_E500_TLB0_WAY_SIZE		128
+#define KVM_E500_TLB0_WAY_NUM		2
+
+#define KVM_E500_TLB0_SIZE  (KVM_E500_TLB0_WAY_SIZE * KVM_E500_TLB0_WAY_NUM)
+#define KVM_E500_TLB1_SIZE  16
+
+#define index_of(tlbsel, esel)	(((tlbsel) << 16) | ((esel) & 0xFFFF))
+#define tlbsel_of(index)	((index) >> 16)
+#define esel_of(index)		((index) & 0xFFFF)
+
+#define E500_TLB_USER_PERM_MASK (MAS3_UX|MAS3_UR|MAS3_UW)
+#define E500_TLB_SUPER_PERM_MASK (MAS3_SX|MAS3_SR|MAS3_SW)
+#define MAS2_ATTRIB_MASK \
+	  (MAS2_X0 | MAS2_X1)
+#define MAS3_ATTRIB_MASK \
+	  (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \
+	   | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK)
+
+extern void kvmppc_dump_tlbs(struct kvm_vcpu *);
+extern int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *, ulong);
+extern int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *);
+extern int kvmppc_e500_emul_tlbre(struct kvm_vcpu *);
+extern int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *, int, int);
+extern int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *, int);
+extern int kvmppc_e500_tlb_search(struct kvm_vcpu *, gva_t, unsigned int, int);
+extern void kvmppc_e500_tlb_put(struct kvm_vcpu *);
+extern void kvmppc_e500_tlb_load(struct kvm_vcpu *, int);
+extern int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *);
+extern void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *);
+extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *);
+extern void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *);
+
+/* TLB helper functions */
+static inline unsigned int
+get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return (tlbe->mas1 >> 7) & 0x1f;
+}
+
+static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return tlbe->mas2 & 0xfffff000;
+}
+
+static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	unsigned int pgsize = get_tlb_size(tlbe);
+	return 1ULL << 10 << pgsize;
+}
+
+static inline gva_t get_tlb_end(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	u64 bytes = get_tlb_bytes(tlbe);
+	return get_tlb_eaddr(tlbe) + bytes - 1;
+}
+
+static inline u64 get_tlb_raddr(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return tlbe->mas7_3 & ~0xfffULL;
+}
+
+static inline unsigned int
+get_tlb_tid(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return (tlbe->mas1 >> 16) & 0xff;
+}
+
+static inline unsigned int
+get_tlb_ts(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return (tlbe->mas1 >> 12) & 0x1;
+}
+
+static inline unsigned int
+get_tlb_v(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return (tlbe->mas1 >> 31) & 0x1;
+}
+
+static inline unsigned int
+get_tlb_iprot(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return (tlbe->mas1 >> 30) & 0x1;
+}
+
+static inline unsigned int get_cur_pid(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.pid & 0xff;
+}
+
+static inline unsigned int get_cur_as(struct kvm_vcpu *vcpu)
+{
+	return !!(vcpu->arch.shared->msr & (MSR_IS | MSR_DS));
+}
+
+static inline unsigned int get_cur_pr(struct kvm_vcpu *vcpu)
+{
+	return !!(vcpu->arch.shared->msr & MSR_PR);
+}
+
+static inline unsigned int get_cur_spid(const struct kvm_vcpu *vcpu)
+{
+	return (vcpu->arch.shared->mas6 >> 16) & 0xff;
+}
+
+static inline unsigned int get_cur_sas(const struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.shared->mas6 & 0x1;
+}
+
+static inline unsigned int get_tlb_tlbsel(const struct kvm_vcpu *vcpu)
+{
+	/*
+	 * Manual says that tlbsel has 2 bits wide.
+	 * Since we only have two TLBs, only lower bit is used.
+	 */
+	return (vcpu->arch.shared->mas0 >> 28) & 0x1;
+}
+
+static inline unsigned int get_tlb_nv_bit(const struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.shared->mas0 & 0xfff;
+}
+
+static inline unsigned int get_tlb_esel_bit(const struct kvm_vcpu *vcpu)
+{
+	return (vcpu->arch.shared->mas0 >> 16) & 0xfff;
+}
+
+static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
+			const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	gpa_t gpa;
+
+	if (!get_tlb_v(tlbe))
+		return 0;
+
+	/* Does it match current guest AS? */
+	/* XXX what about IS != DS? */
+	if (get_tlb_ts(tlbe) != !!(vcpu->arch.shared->msr & MSR_IS))
+		return 0;
+
+	gpa = get_tlb_raddr(tlbe);
+	if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT))
+		/* Mapping is not for RAM. */
+		return 0;
+
+	return 1;
+}
+
+#endif /* KVM_E500_H */
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index 6d0b2bd..2a1a228 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -17,7 +17,7 @@
 #include <asm/kvm_e500.h>
 
 #include "booke.h"
-#include "e500_tlb.h"
+#include "e500.h"
 
 #define XOP_TLBIVAX 786
 #define XOP_TLBSX   914
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 6e53e41..1d623a0 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -29,7 +29,7 @@
 #include <asm/kvm_e500.h>
 
 #include "../mm/mmu_decl.h"
-#include "e500_tlb.h"
+#include "e500.h"
 #include "trace.h"
 #include "timing.h"
 
diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h
deleted file mode 100644
index 5c6d2d7..0000000
--- a/arch/powerpc/kvm/e500_tlb.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
- *
- * Author: Yu Liu, yu.liu@freescale.com
- *
- * Description:
- * This file is based on arch/powerpc/kvm/44x_tlb.h,
- * by Hollis Blanchard <hollisb@us.ibm.com>.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- */
-
-#ifndef __KVM_E500_TLB_H__
-#define __KVM_E500_TLB_H__
-
-#include <linux/kvm_host.h>
-#include <asm/mmu-book3e.h>
-#include <asm/tlb.h>
-#include <asm/kvm_e500.h>
-
-/* This geometry is the legacy default -- can be overridden by userspace */
-#define KVM_E500_TLB0_WAY_SIZE		128
-#define KVM_E500_TLB0_WAY_NUM		2
-
-#define KVM_E500_TLB0_SIZE  (KVM_E500_TLB0_WAY_SIZE * KVM_E500_TLB0_WAY_NUM)
-#define KVM_E500_TLB1_SIZE  16
-
-#define index_of(tlbsel, esel)	(((tlbsel) << 16) | ((esel) & 0xFFFF))
-#define tlbsel_of(index)	((index) >> 16)
-#define esel_of(index)		((index) & 0xFFFF)
-
-#define E500_TLB_USER_PERM_MASK (MAS3_UX|MAS3_UR|MAS3_UW)
-#define E500_TLB_SUPER_PERM_MASK (MAS3_SX|MAS3_SR|MAS3_SW)
-#define MAS2_ATTRIB_MASK \
-	  (MAS2_X0 | MAS2_X1)
-#define MAS3_ATTRIB_MASK \
-	  (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \
-	   | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK)
-
-extern void kvmppc_dump_tlbs(struct kvm_vcpu *);
-extern int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *, ulong);
-extern int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *);
-extern int kvmppc_e500_emul_tlbre(struct kvm_vcpu *);
-extern int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *, int, int);
-extern int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *, int);
-extern int kvmppc_e500_tlb_search(struct kvm_vcpu *, gva_t, unsigned int, int);
-extern void kvmppc_e500_tlb_put(struct kvm_vcpu *);
-extern void kvmppc_e500_tlb_load(struct kvm_vcpu *, int);
-extern int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *);
-extern void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *);
-extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *);
-extern void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *);
-
-/* TLB helper functions */
-static inline unsigned int
-get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe)
-{
-	return (tlbe->mas1 >> 7) & 0x1f;
-}
-
-static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe)
-{
-	return tlbe->mas2 & 0xfffff000;
-}
-
-static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe)
-{
-	unsigned int pgsize = get_tlb_size(tlbe);
-	return 1ULL << 10 << pgsize;
-}
-
-static inline gva_t get_tlb_end(const struct kvm_book3e_206_tlb_entry *tlbe)
-{
-	u64 bytes = get_tlb_bytes(tlbe);
-	return get_tlb_eaddr(tlbe) + bytes - 1;
-}
-
-static inline u64 get_tlb_raddr(const struct kvm_book3e_206_tlb_entry *tlbe)
-{
-	return tlbe->mas7_3 & ~0xfffULL;
-}
-
-static inline unsigned int
-get_tlb_tid(const struct kvm_book3e_206_tlb_entry *tlbe)
-{
-	return (tlbe->mas1 >> 16) & 0xff;
-}
-
-static inline unsigned int
-get_tlb_ts(const struct kvm_book3e_206_tlb_entry *tlbe)
-{
-	return (tlbe->mas1 >> 12) & 0x1;
-}
-
-static inline unsigned int
-get_tlb_v(const struct kvm_book3e_206_tlb_entry *tlbe)
-{
-	return (tlbe->mas1 >> 31) & 0x1;
-}
-
-static inline unsigned int
-get_tlb_iprot(const struct kvm_book3e_206_tlb_entry *tlbe)
-{
-	return (tlbe->mas1 >> 30) & 0x1;
-}
-
-static inline unsigned int get_cur_pid(struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.pid & 0xff;
-}
-
-static inline unsigned int get_cur_as(struct kvm_vcpu *vcpu)
-{
-	return !!(vcpu->arch.shared->msr & (MSR_IS | MSR_DS));
-}
-
-static inline unsigned int get_cur_pr(struct kvm_vcpu *vcpu)
-{
-	return !!(vcpu->arch.shared->msr & MSR_PR);
-}
-
-static inline unsigned int get_cur_spid(const struct kvm_vcpu *vcpu)
-{
-	return (vcpu->arch.shared->mas6 >> 16) & 0xff;
-}
-
-static inline unsigned int get_cur_sas(const struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.shared->mas6 & 0x1;
-}
-
-static inline unsigned int get_tlb_tlbsel(const struct kvm_vcpu *vcpu)
-{
-	/*
-	 * Manual says that tlbsel has 2 bits wide.
-	 * Since we only have two TLBs, only lower bit is used.
-	 */
-	return (vcpu->arch.shared->mas0 >> 28) & 0x1;
-}
-
-static inline unsigned int get_tlb_nv_bit(const struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.shared->mas0 & 0xfff;
-}
-
-static inline unsigned int get_tlb_esel_bit(const struct kvm_vcpu *vcpu)
-{
-	return (vcpu->arch.shared->mas0 >> 16) & 0xfff;
-}
-
-static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
-			const struct kvm_book3e_206_tlb_entry *tlbe)
-{
-	gpa_t gpa;
-
-	if (!get_tlb_v(tlbe))
-		return 0;
-
-	/* Does it match current guest AS? */
-	/* XXX what about IS != DS? */
-	if (get_tlb_ts(tlbe) != !!(vcpu->arch.shared->msr & MSR_IS))
-		return 0;
-
-	gpa = get_tlb_raddr(tlbe);
-	if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT))
-		/* Mapping is not for RAM. */
-		return 0;
-
-	return 1;
-}
-
-#endif /* __KVM_E500_TLB_H__ */
-- 
cgit v0.10.2


From fc6cf99509eb8e5f16e0f81db0c71f5301193005 Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 20 Dec 2011 15:34:31 +0000
Subject: KVM: PPC: e500: merge <asm/kvm_e500.h> into arch/powerpc/kvm/e500.h

Keeping two separate headers for e500-specific things was a
pain, and wasn't even organized along any logical boundary.

There was TLB stuff in <asm/kvm_e500.h> despite the existence of
arch/powerpc/kvm/e500_tlb.h, and nothing in <asm/kvm_e500.h> needed
to be referenced from outside arch/powerpc/kvm.

Signed-off-by: Scott Wood <scottwood@freescale.com>
[agraf: fix bisectability]
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h
deleted file mode 100644
index 8cd50a5..0000000
--- a/arch/powerpc/include/asm/kvm_e500.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
- *
- * Author: Yu Liu, <yu.liu@freescale.com>
- *
- * Description:
- * This file is derived from arch/powerpc/include/asm/kvm_44x.h,
- * by Hollis Blanchard <hollisb@us.ibm.com>.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- */
-
-#ifndef __ASM_KVM_E500_H__
-#define __ASM_KVM_E500_H__
-
-#include <linux/kvm_host.h>
-
-#define BOOKE_INTERRUPT_SIZE 36
-
-#define E500_PID_NUM   3
-#define E500_TLB_NUM   2
-
-#define E500_TLB_VALID 1
-#define E500_TLB_DIRTY 2
-
-struct tlbe_ref {
-	pfn_t pfn;
-	unsigned int flags; /* E500_TLB_* */
-};
-
-struct tlbe_priv {
-	struct tlbe_ref ref; /* TLB0 only -- TLB1 uses tlb_refs */
-};
-
-struct vcpu_id_table;
-
-struct kvmppc_e500_tlb_params {
-	int entries, ways, sets;
-};
-
-struct kvmppc_vcpu_e500 {
-	/* Unmodified copy of the guest's TLB -- shared with host userspace. */
-	struct kvm_book3e_206_tlb_entry *gtlb_arch;
-
-	/* Starting entry number in gtlb_arch[] */
-	int gtlb_offset[E500_TLB_NUM];
-
-	/* KVM internal information associated with each guest TLB entry */
-	struct tlbe_priv *gtlb_priv[E500_TLB_NUM];
-
-	struct kvmppc_e500_tlb_params gtlb_params[E500_TLB_NUM];
-
-	unsigned int gtlb_nv[E500_TLB_NUM];
-
-	/*
-	 * information associated with each host TLB entry --
-	 * TLB1 only for now.  If/when guest TLB1 entries can be
-	 * mapped with host TLB0, this will be used for that too.
-	 *
-	 * We don't want to use this for guest TLB0 because then we'd
-	 * have the overhead of doing the translation again even if
-	 * the entry is still in the guest TLB (e.g. we swapped out
-	 * and back, and our host TLB entries got evicted).
-	 */
-	struct tlbe_ref *tlb_refs[E500_TLB_NUM];
-	unsigned int host_tlb1_nv;
-
-	u32 host_pid[E500_PID_NUM];
-	u32 pid[E500_PID_NUM];
-	u32 svr;
-
-	/* vcpu id table */
-	struct vcpu_id_table *idt;
-
-	u32 l1csr0;
-	u32 l1csr1;
-	u32 hid0;
-	u32 hid1;
-	u32 tlb0cfg;
-	u32 tlb1cfg;
-	u64 mcar;
-
-	struct page **shared_tlb_pages;
-	int num_shared_tlb_pages;
-
-	struct kvm_vcpu vcpu;
-};
-
-static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu)
-{
-	return container_of(vcpu, struct kvmppc_vcpu_e500, vcpu);
-}
-
-#endif /* __ASM_KVM_E500_H__ */
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index 5c450ba..76b35d8 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -20,7 +20,6 @@
 #include <asm/reg.h>
 #include <asm/cputable.h>
 #include <asm/tlbflush.h>
-#include <asm/kvm_e500.h>
 #include <asm/kvm_ppc.h>
 
 #include "booke.h"
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index 02ecde2..51d13bd 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -1,11 +1,12 @@
 /*
  * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
  *
- * Author: Yu Liu, yu.liu@freescale.com
+ * Author: Yu Liu <yu.liu@freescale.com>
  *
  * Description:
- * This file is based on arch/powerpc/kvm/44x_tlb.h,
- * by Hollis Blanchard <hollisb@us.ibm.com>.
+ * This file is based on arch/powerpc/kvm/44x_tlb.h and
+ * arch/powerpc/include/asm/kvm_44x.h by Hollis Blanchard <hollisb@us.ibm.com>,
+ * Copyright IBM Corp. 2007-2008
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License, version 2, as
@@ -18,7 +19,80 @@
 #include <linux/kvm_host.h>
 #include <asm/mmu-book3e.h>
 #include <asm/tlb.h>
-#include <asm/kvm_e500.h>
+
+#define E500_PID_NUM   3
+#define E500_TLB_NUM   2
+
+#define E500_TLB_VALID 1
+#define E500_TLB_DIRTY 2
+
+struct tlbe_ref {
+	pfn_t pfn;
+	unsigned int flags; /* E500_TLB_* */
+};
+
+struct tlbe_priv {
+	struct tlbe_ref ref; /* TLB0 only -- TLB1 uses tlb_refs */
+};
+
+struct vcpu_id_table;
+
+struct kvmppc_e500_tlb_params {
+	int entries, ways, sets;
+};
+
+struct kvmppc_vcpu_e500 {
+	/* Unmodified copy of the guest's TLB -- shared with host userspace. */
+	struct kvm_book3e_206_tlb_entry *gtlb_arch;
+
+	/* Starting entry number in gtlb_arch[] */
+	int gtlb_offset[E500_TLB_NUM];
+
+	/* KVM internal information associated with each guest TLB entry */
+	struct tlbe_priv *gtlb_priv[E500_TLB_NUM];
+
+	struct kvmppc_e500_tlb_params gtlb_params[E500_TLB_NUM];
+
+	unsigned int gtlb_nv[E500_TLB_NUM];
+
+	/*
+	 * information associated with each host TLB entry --
+	 * TLB1 only for now.  If/when guest TLB1 entries can be
+	 * mapped with host TLB0, this will be used for that too.
+	 *
+	 * We don't want to use this for guest TLB0 because then we'd
+	 * have the overhead of doing the translation again even if
+	 * the entry is still in the guest TLB (e.g. we swapped out
+	 * and back, and our host TLB entries got evicted).
+	 */
+	struct tlbe_ref *tlb_refs[E500_TLB_NUM];
+	unsigned int host_tlb1_nv;
+
+	u32 host_pid[E500_PID_NUM];
+	u32 pid[E500_PID_NUM];
+	u32 svr;
+
+	/* vcpu id table */
+	struct vcpu_id_table *idt;
+
+	u32 l1csr0;
+	u32 l1csr1;
+	u32 hid0;
+	u32 hid1;
+	u32 tlb0cfg;
+	u32 tlb1cfg;
+	u64 mcar;
+
+	struct page **shared_tlb_pages;
+	int num_shared_tlb_pages;
+
+	struct kvm_vcpu vcpu;
+};
+
+static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu)
+{
+	return container_of(vcpu, struct kvmppc_vcpu_e500, vcpu);
+}
 
 /* This geometry is the legacy default -- can be overridden by userspace */
 #define KVM_E500_TLB0_WAY_SIZE		128
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index 2a1a228..7e2d592 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -14,7 +14,6 @@
 
 #include <asm/kvm_ppc.h>
 #include <asm/disassemble.h>
-#include <asm/kvm_e500.h>
 
 #include "booke.h"
 #include "e500.h"
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 1d623a0..7d4a918 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -26,7 +26,6 @@
 #include <linux/vmalloc.h>
 #include <linux/hugetlb.h>
 #include <asm/kvm_ppc.h>
-#include <asm/kvm_e500.h>
 
 #include "../mm/mmu_decl.h"
 #include "e500.h"
-- 
cgit v0.10.2


From 52e1718c6fd1a1f54c676c2107dc931e93865fe8 Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 20 Dec 2011 15:34:32 +0000
Subject: KVM: PPC: e500: clean up arch/powerpc/kvm/e500.h

Move vcpu to the beginning of vcpu_e500 to give it appropriate
prominence, especially if more fields end up getting added to the
end of vcpu_e500 (and vcpu ends up in the middle).

Remove gratuitous "extern" and add parameter names to prototypes.

Signed-off-by: Scott Wood <scottwood@freescale.com>
[agraf: fix bisectability]
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index 51d13bd..a48af00 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -42,6 +42,8 @@ struct kvmppc_e500_tlb_params {
 };
 
 struct kvmppc_vcpu_e500 {
+	struct kvm_vcpu vcpu;
+
 	/* Unmodified copy of the guest's TLB -- shared with host userspace. */
 	struct kvm_book3e_206_tlb_entry *gtlb_arch;
 
@@ -85,8 +87,6 @@ struct kvmppc_vcpu_e500 {
 
 	struct page **shared_tlb_pages;
 	int num_shared_tlb_pages;
-
-	struct kvm_vcpu vcpu;
 };
 
 static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu)
@@ -113,19 +113,22 @@ static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu)
 	  (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \
 	   | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK)
 
-extern void kvmppc_dump_tlbs(struct kvm_vcpu *);
-extern int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *, ulong);
-extern int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *);
-extern int kvmppc_e500_emul_tlbre(struct kvm_vcpu *);
-extern int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *, int, int);
-extern int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *, int);
-extern int kvmppc_e500_tlb_search(struct kvm_vcpu *, gva_t, unsigned int, int);
 extern void kvmppc_e500_tlb_put(struct kvm_vcpu *);
 extern void kvmppc_e500_tlb_load(struct kvm_vcpu *, int);
-extern int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *);
-extern void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *);
 extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *);
 extern void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *);
+int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500,
+				ulong value);
+int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu);
+int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu);
+int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb);
+int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb);
+int kvmppc_e500_tlb_search(struct kvm_vcpu *, gva_t, unsigned int, int);
+int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500);
+void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500);
+
+void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
+int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 
 /* TLB helper functions */
 static inline unsigned int
-- 
cgit v0.10.2


From 8fdd21a26876ea6c486c38bfa75fdd18ba299351 Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 20 Dec 2011 15:34:34 +0000
Subject: KVM: PPC: e500: refactor core-specific TLB code

The PID handling is e500v1/v2-specific, and is moved to e500.c.

The MMU sregs code and kvmppc_core_vcpu_translate will be shared with
e500mc, and is moved from e500.c to e500_tlb.c.

Partially based on patches from Liu Yu <yu.liu@freescale.com>.

Signed-off-by: Scott Wood <scottwood@freescale.com>
[agraf: fix bisectability]
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 20ab5b2..5b81cbc 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -426,6 +426,8 @@ struct kvm_vcpu_arch {
 	ulong fault_esr;
 	ulong queued_dear;
 	ulong queued_esr;
+	u32 tlbcfg[4];
+	u32 mmucfg;
 #endif
 	gpa_t paddr_accessed;
 
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index 76b35d8..b479ed7 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -22,9 +22,281 @@
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
 
+#include "../mm/mmu_decl.h"
 #include "booke.h"
 #include "e500.h"
 
+struct id {
+	unsigned long val;
+	struct id **pentry;
+};
+
+#define NUM_TIDS 256
+
+/*
+ * This table provide mappings from:
+ * (guestAS,guestTID,guestPR) --> ID of physical cpu
+ * guestAS	[0..1]
+ * guestTID	[0..255]
+ * guestPR	[0..1]
+ * ID		[1..255]
+ * Each vcpu keeps one vcpu_id_table.
+ */
+struct vcpu_id_table {
+	struct id id[2][NUM_TIDS][2];
+};
+
+/*
+ * This table provide reversed mappings of vcpu_id_table:
+ * ID --> address of vcpu_id_table item.
+ * Each physical core has one pcpu_id_table.
+ */
+struct pcpu_id_table {
+	struct id *entry[NUM_TIDS];
+};
+
+static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids);
+
+/* This variable keeps last used shadow ID on local core.
+ * The valid range of shadow ID is [1..255] */
+static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid);
+
+/*
+ * Allocate a free shadow id and setup a valid sid mapping in given entry.
+ * A mapping is only valid when vcpu_id_table and pcpu_id_table are match.
+ *
+ * The caller must have preemption disabled, and keep it that way until
+ * it has finished with the returned shadow id (either written into the
+ * TLB or arch.shadow_pid, or discarded).
+ */
+static inline int local_sid_setup_one(struct id *entry)
+{
+	unsigned long sid;
+	int ret = -1;
+
+	sid = ++(__get_cpu_var(pcpu_last_used_sid));
+	if (sid < NUM_TIDS) {
+		__get_cpu_var(pcpu_sids).entry[sid] = entry;
+		entry->val = sid;
+		entry->pentry = &__get_cpu_var(pcpu_sids).entry[sid];
+		ret = sid;
+	}
+
+	/*
+	 * If sid == NUM_TIDS, we've run out of sids.  We return -1, and
+	 * the caller will invalidate everything and start over.
+	 *
+	 * sid > NUM_TIDS indicates a race, which we disable preemption to
+	 * avoid.
+	 */
+	WARN_ON(sid > NUM_TIDS);
+
+	return ret;
+}
+
+/*
+ * Check if given entry contain a valid shadow id mapping.
+ * An ID mapping is considered valid only if
+ * both vcpu and pcpu know this mapping.
+ *
+ * The caller must have preemption disabled, and keep it that way until
+ * it has finished with the returned shadow id (either written into the
+ * TLB or arch.shadow_pid, or discarded).
+ */
+static inline int local_sid_lookup(struct id *entry)
+{
+	if (entry && entry->val != 0 &&
+	    __get_cpu_var(pcpu_sids).entry[entry->val] == entry &&
+	    entry->pentry == &__get_cpu_var(pcpu_sids).entry[entry->val])
+		return entry->val;
+	return -1;
+}
+
+/* Invalidate all id mappings on local core -- call with preempt disabled */
+static inline void local_sid_destroy_all(void)
+{
+	__get_cpu_var(pcpu_last_used_sid) = 0;
+	memset(&__get_cpu_var(pcpu_sids), 0, sizeof(__get_cpu_var(pcpu_sids)));
+}
+
+static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	vcpu_e500->idt = kzalloc(sizeof(struct vcpu_id_table), GFP_KERNEL);
+	return vcpu_e500->idt;
+}
+
+static void kvmppc_e500_id_table_free(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	kfree(vcpu_e500->idt);
+	vcpu_e500->idt = NULL;
+}
+
+/* Map guest pid to shadow.
+ * We use PID to keep shadow of current guest non-zero PID,
+ * and use PID1 to keep shadow of guest zero PID.
+ * So that guest tlbe with TID=0 can be accessed at any time */
+static void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	preempt_disable();
+	vcpu_e500->vcpu.arch.shadow_pid = kvmppc_e500_get_sid(vcpu_e500,
+			get_cur_as(&vcpu_e500->vcpu),
+			get_cur_pid(&vcpu_e500->vcpu),
+			get_cur_pr(&vcpu_e500->vcpu), 1);
+	vcpu_e500->vcpu.arch.shadow_pid1 = kvmppc_e500_get_sid(vcpu_e500,
+			get_cur_as(&vcpu_e500->vcpu), 0,
+			get_cur_pr(&vcpu_e500->vcpu), 1);
+	preempt_enable();
+}
+
+/* Invalidate all mappings on vcpu */
+static void kvmppc_e500_id_table_reset_all(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	memset(vcpu_e500->idt, 0, sizeof(struct vcpu_id_table));
+
+	/* Update shadow pid when mappings are changed */
+	kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+}
+
+/* Invalidate one ID mapping on vcpu */
+static inline void kvmppc_e500_id_table_reset_one(
+			       struct kvmppc_vcpu_e500 *vcpu_e500,
+			       int as, int pid, int pr)
+{
+	struct vcpu_id_table *idt = vcpu_e500->idt;
+
+	BUG_ON(as >= 2);
+	BUG_ON(pid >= NUM_TIDS);
+	BUG_ON(pr >= 2);
+
+	idt->id[as][pid][pr].val = 0;
+	idt->id[as][pid][pr].pentry = NULL;
+
+	/* Update shadow pid when mappings are changed */
+	kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+}
+
+/*
+ * Map guest (vcpu,AS,ID,PR) to physical core shadow id.
+ * This function first lookup if a valid mapping exists,
+ * if not, then creates a new one.
+ *
+ * The caller must have preemption disabled, and keep it that way until
+ * it has finished with the returned shadow id (either written into the
+ * TLB or arch.shadow_pid, or discarded).
+ */
+unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500,
+				 unsigned int as, unsigned int gid,
+				 unsigned int pr, int avoid_recursion)
+{
+	struct vcpu_id_table *idt = vcpu_e500->idt;
+	int sid;
+
+	BUG_ON(as >= 2);
+	BUG_ON(gid >= NUM_TIDS);
+	BUG_ON(pr >= 2);
+
+	sid = local_sid_lookup(&idt->id[as][gid][pr]);
+
+	while (sid <= 0) {
+		/* No mapping yet */
+		sid = local_sid_setup_one(&idt->id[as][gid][pr]);
+		if (sid <= 0) {
+			_tlbil_all();
+			local_sid_destroy_all();
+		}
+
+		/* Update shadow pid when mappings are changed */
+		if (!avoid_recursion)
+			kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+	}
+
+	return sid;
+}
+
+unsigned int kvmppc_e500_get_tlb_stid(struct kvm_vcpu *vcpu,
+				      struct kvm_book3e_206_tlb_entry *gtlbe)
+{
+	return kvmppc_e500_get_sid(to_e500(vcpu), get_tlb_ts(gtlbe),
+				   get_tlb_tid(gtlbe), get_cur_pr(vcpu), 0);
+}
+
+void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	if (vcpu->arch.pid != pid) {
+		vcpu_e500->pid[0] = vcpu->arch.pid = pid;
+		kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+	}
+}
+
+/* gtlbe must not be mapped by more than one host tlbe */
+void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500,
+                           struct kvm_book3e_206_tlb_entry *gtlbe)
+{
+	struct vcpu_id_table *idt = vcpu_e500->idt;
+	unsigned int pr, tid, ts, pid;
+	u32 val, eaddr;
+	unsigned long flags;
+
+	ts = get_tlb_ts(gtlbe);
+	tid = get_tlb_tid(gtlbe);
+
+	preempt_disable();
+
+	/* One guest ID may be mapped to two shadow IDs */
+	for (pr = 0; pr < 2; pr++) {
+		/*
+		 * The shadow PID can have a valid mapping on at most one
+		 * host CPU.  In the common case, it will be valid on this
+		 * CPU, in which case we do a local invalidation of the
+		 * specific address.
+		 *
+		 * If the shadow PID is not valid on the current host CPU,
+		 * we invalidate the entire shadow PID.
+		 */
+		pid = local_sid_lookup(&idt->id[ts][tid][pr]);
+		if (pid <= 0) {
+			kvmppc_e500_id_table_reset_one(vcpu_e500, ts, tid, pr);
+			continue;
+		}
+
+		/*
+		 * The guest is invalidating a 4K entry which is in a PID
+		 * that has a valid shadow mapping on this host CPU.  We
+		 * search host TLB to invalidate it's shadow TLB entry,
+		 * similar to __tlbil_va except that we need to look in AS1.
+		 */
+		val = (pid << MAS6_SPID_SHIFT) | MAS6_SAS;
+		eaddr = get_tlb_eaddr(gtlbe);
+
+		local_irq_save(flags);
+
+		mtspr(SPRN_MAS6, val);
+		asm volatile("tlbsx 0, %[eaddr]" : : [eaddr] "r" (eaddr));
+		val = mfspr(SPRN_MAS1);
+		if (val & MAS1_VALID) {
+			mtspr(SPRN_MAS1, val & ~MAS1_VALID);
+			asm volatile("tlbwe");
+		}
+
+		local_irq_restore(flags);
+	}
+
+	preempt_enable();
+}
+
+void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	kvmppc_e500_id_table_reset_all(vcpu_e500);
+}
+
+void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr)
+{
+	/* Recalc shadow pid since MSR changes */
+	kvmppc_e500_recalc_shadow_pid(to_e500(vcpu));
+}
+
 void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu)
 {
 }
@@ -36,13 +308,13 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	kvmppc_booke_vcpu_load(vcpu, cpu);
-	kvmppc_e500_tlb_load(vcpu, cpu);
+
+	/* Shadow PID may be expired on local core */
+	kvmppc_e500_recalc_shadow_pid(to_e500(vcpu));
 }
 
 void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 {
-	kvmppc_e500_tlb_put(vcpu);
-
 #ifdef CONFIG_SPE
 	if (vcpu->arch.shadow_msr & MSR_SPE)
 		kvmppc_vcpu_disable_spe(vcpu);
@@ -63,6 +335,23 @@ int kvmppc_core_check_processor_compat(void)
 	return r;
 }
 
+static void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	struct kvm_book3e_206_tlb_entry *tlbe;
+
+	/* Insert large initial mapping for guest. */
+	tlbe = get_entry(vcpu_e500, 1, 0);
+	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M);
+	tlbe->mas2 = 0;
+	tlbe->mas7_3 = E500_TLB_SUPER_PERM_MASK;
+
+	/* 4K map for serial output. Used by kernel wrapper. */
+	tlbe = get_entry(vcpu_e500, 1, 1);
+	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K);
+	tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G;
+	tlbe->mas7_3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK;
+}
+
 int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
@@ -78,32 +367,6 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-/* 'linear_address' is actually an encoding of AS|PID|EADDR . */
-int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
-                               struct kvm_translation *tr)
-{
-	int index;
-	gva_t eaddr;
-	u8 pid;
-	u8 as;
-
-	eaddr = tr->linear_address;
-	pid = (tr->linear_address >> 32) & 0xff;
-	as = (tr->linear_address >> 40) & 0x1;
-
-	index = kvmppc_e500_tlb_search(vcpu, eaddr, pid, as);
-	if (index < 0) {
-		tr->valid = 0;
-		return 0;
-	}
-
-	tr->physical_address = kvmppc_mmu_xlate(vcpu, index, eaddr);
-	/* XXX what does "writeable" and "usermode" even mean? */
-	tr->valid = 1;
-
-	return 0;
-}
-
 void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
@@ -117,19 +380,6 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 	sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0;
 	sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar;
 
-	sregs->u.e.mas0 = vcpu->arch.shared->mas0;
-	sregs->u.e.mas1 = vcpu->arch.shared->mas1;
-	sregs->u.e.mas2 = vcpu->arch.shared->mas2;
-	sregs->u.e.mas7_3 = vcpu->arch.shared->mas7_3;
-	sregs->u.e.mas4 = vcpu->arch.shared->mas4;
-	sregs->u.e.mas6 = vcpu->arch.shared->mas6;
-
-	sregs->u.e.mmucfg = mfspr(SPRN_MMUCFG);
-	sregs->u.e.tlbcfg[0] = vcpu_e500->tlb0cfg;
-	sregs->u.e.tlbcfg[1] = vcpu_e500->tlb1cfg;
-	sregs->u.e.tlbcfg[2] = 0;
-	sregs->u.e.tlbcfg[3] = 0;
-
 	sregs->u.e.ivor_high[0] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL];
 	sregs->u.e.ivor_high[1] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA];
 	sregs->u.e.ivor_high[2] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND];
@@ -137,11 +387,13 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR];
 
 	kvmppc_get_sregs_ivor(vcpu, sregs);
+	kvmppc_get_sregs_e500_tlb(vcpu, sregs);
 }
 
 int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int ret;
 
 	if (sregs->u.e.impl_id == KVM_SREGS_E_IMPL_FSL) {
 		vcpu_e500->svr = sregs->u.e.impl.fsl.svr;
@@ -149,14 +401,9 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 		vcpu_e500->mcar = sregs->u.e.impl.fsl.mcar;
 	}
 
-	if (sregs->u.e.features & KVM_SREGS_E_ARCH206_MMU) {
-		vcpu->arch.shared->mas0 = sregs->u.e.mas0;
-		vcpu->arch.shared->mas1 = sregs->u.e.mas1;
-		vcpu->arch.shared->mas2 = sregs->u.e.mas2;
-		vcpu->arch.shared->mas7_3 = sregs->u.e.mas7_3;
-		vcpu->arch.shared->mas4 = sregs->u.e.mas4;
-		vcpu->arch.shared->mas6 = sregs->u.e.mas6;
-	}
+	ret = kvmppc_set_sregs_e500_tlb(vcpu, sregs);
+	if (ret < 0)
+		return ret;
 
 	if (!(sregs->u.e.features & KVM_SREGS_E_IVOR))
 		return 0;
@@ -195,9 +442,12 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	if (err)
 		goto free_vcpu;
 
+	if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL)
+		goto uninit_vcpu;
+
 	err = kvmppc_e500_tlb_init(vcpu_e500);
 	if (err)
-		goto uninit_vcpu;
+		goto uninit_id;
 
 	vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO);
 	if (!vcpu->arch.shared)
@@ -207,6 +457,8 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 
 uninit_tlb:
 	kvmppc_e500_tlb_uninit(vcpu_e500);
+uninit_id:
+	kvmppc_e500_id_table_free(vcpu_e500);
 uninit_vcpu:
 	kvm_vcpu_uninit(vcpu);
 free_vcpu:
@@ -220,8 +472,9 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 
 	free_page((unsigned long)vcpu->arch.shared);
-	kvm_vcpu_uninit(vcpu);
 	kvmppc_e500_tlb_uninit(vcpu_e500);
+	kvmppc_e500_id_table_free(vcpu_e500);
+	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
 }
 
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index a48af00..34cef08 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -35,7 +35,9 @@ struct tlbe_priv {
 	struct tlbe_ref ref; /* TLB0 only -- TLB1 uses tlb_refs */
 };
 
+#ifdef CONFIG_KVM_E500
 struct vcpu_id_table;
+#endif
 
 struct kvmppc_e500_tlb_params {
 	int entries, ways, sets;
@@ -70,23 +72,22 @@ struct kvmppc_vcpu_e500 {
 	struct tlbe_ref *tlb_refs[E500_TLB_NUM];
 	unsigned int host_tlb1_nv;
 
-	u32 host_pid[E500_PID_NUM];
-	u32 pid[E500_PID_NUM];
 	u32 svr;
-
-	/* vcpu id table */
-	struct vcpu_id_table *idt;
-
 	u32 l1csr0;
 	u32 l1csr1;
 	u32 hid0;
 	u32 hid1;
-	u32 tlb0cfg;
-	u32 tlb1cfg;
 	u64 mcar;
 
 	struct page **shared_tlb_pages;
 	int num_shared_tlb_pages;
+
+#ifdef CONFIG_KVM_E500
+	u32 pid[E500_PID_NUM];
+
+	/* vcpu id table */
+	struct vcpu_id_table *idt;
+#endif
 };
 
 static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu)
@@ -113,23 +114,25 @@ static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu)
 	  (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \
 	   | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK)
 
-extern void kvmppc_e500_tlb_put(struct kvm_vcpu *);
-extern void kvmppc_e500_tlb_load(struct kvm_vcpu *, int);
-extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *);
-extern void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *);
 int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500,
 				ulong value);
 int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu);
 int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu);
 int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb);
 int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb);
-int kvmppc_e500_tlb_search(struct kvm_vcpu *, gva_t, unsigned int, int);
 int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500);
 void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500);
 
 void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 
+
+#ifdef CONFIG_KVM_E500
+unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500,
+				 unsigned int as, unsigned int gid,
+				 unsigned int pr, int avoid_recursion);
+#endif
+
 /* TLB helper functions */
 static inline unsigned int
 get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe)
@@ -183,6 +186,12 @@ get_tlb_iprot(const struct kvm_book3e_206_tlb_entry *tlbe)
 	return (tlbe->mas1 >> 30) & 0x1;
 }
 
+static inline unsigned int
+get_tlb_tsize(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return (tlbe->mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT;
+}
+
 static inline unsigned int get_cur_pid(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.pid & 0xff;
@@ -248,4 +257,31 @@ static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
 	return 1;
 }
 
+static inline struct kvm_book3e_206_tlb_entry *get_entry(
+	struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int entry)
+{
+	int offset = vcpu_e500->gtlb_offset[tlbsel];
+	return &vcpu_e500->gtlb_arch[offset + entry];
+}
+
+void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500,
+			   struct kvm_book3e_206_tlb_entry *gtlbe);
+void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500);
+
+#ifdef CONFIG_KVM_E500
+unsigned int kvmppc_e500_get_tlb_stid(struct kvm_vcpu *vcpu,
+				      struct kvm_book3e_206_tlb_entry *gtlbe);
+
+static inline unsigned int get_tlbmiss_tid(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	unsigned int tidseld = (vcpu->arch.shared->mas4 >> 16) & 0xf;
+
+	return vcpu_e500->pid[tidseld];
+}
+
+/* Force TS=1 for all guest mappings. */
+#define get_tlb_sts(gtlbe)              (MAS1_TS)
+#endif /* CONFIG_KVM_E500 */
+
 #endif /* KVM_E500_H */
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index 7e2d592..c80794d 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -174,9 +174,9 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 		kvmppc_set_gpr(vcpu, rt, val);
 		break;
 	case SPRN_TLB0CFG:
-		kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb0cfg); break;
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.tlbcfg[0]); break;
 	case SPRN_TLB1CFG:
-		kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb1cfg); break;
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.tlbcfg[1]); break;
 	case SPRN_L1CSR0:
 		kvmppc_set_gpr(vcpu, rt, vcpu_e500->l1csr0); break;
 	case SPRN_L1CSR1:
@@ -192,7 +192,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 		kvmppc_set_gpr(vcpu, rt, 0); break;
 
 	case SPRN_MMUCFG:
-		kvmppc_set_gpr(vcpu, rt, mfspr(SPRN_MMUCFG)); break;
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.mmucfg); break;
 
 	/* extra exceptions */
 	case SPRN_IVOR32:
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 7d4a918..9925fc6 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -27,208 +27,14 @@
 #include <linux/hugetlb.h>
 #include <asm/kvm_ppc.h>
 
-#include "../mm/mmu_decl.h"
 #include "e500.h"
 #include "trace.h"
 #include "timing.h"
 
 #define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1)
 
-struct id {
-	unsigned long val;
-	struct id **pentry;
-};
-
-#define NUM_TIDS 256
-
-/*
- * This table provide mappings from:
- * (guestAS,guestTID,guestPR) --> ID of physical cpu
- * guestAS	[0..1]
- * guestTID	[0..255]
- * guestPR	[0..1]
- * ID		[1..255]
- * Each vcpu keeps one vcpu_id_table.
- */
-struct vcpu_id_table {
-	struct id id[2][NUM_TIDS][2];
-};
-
-/*
- * This table provide reversed mappings of vcpu_id_table:
- * ID --> address of vcpu_id_table item.
- * Each physical core has one pcpu_id_table.
- */
-struct pcpu_id_table {
-	struct id *entry[NUM_TIDS];
-};
-
-static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids);
-
-/* This variable keeps last used shadow ID on local core.
- * The valid range of shadow ID is [1..255] */
-static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid);
-
 static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM];
 
-static struct kvm_book3e_206_tlb_entry *get_entry(
-	struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int entry)
-{
-	int offset = vcpu_e500->gtlb_offset[tlbsel];
-	return &vcpu_e500->gtlb_arch[offset + entry];
-}
-
-/*
- * Allocate a free shadow id and setup a valid sid mapping in given entry.
- * A mapping is only valid when vcpu_id_table and pcpu_id_table are match.
- *
- * The caller must have preemption disabled, and keep it that way until
- * it has finished with the returned shadow id (either written into the
- * TLB or arch.shadow_pid, or discarded).
- */
-static inline int local_sid_setup_one(struct id *entry)
-{
-	unsigned long sid;
-	int ret = -1;
-
-	sid = ++(__get_cpu_var(pcpu_last_used_sid));
-	if (sid < NUM_TIDS) {
-		__get_cpu_var(pcpu_sids).entry[sid] = entry;
-		entry->val = sid;
-		entry->pentry = &__get_cpu_var(pcpu_sids).entry[sid];
-		ret = sid;
-	}
-
-	/*
-	 * If sid == NUM_TIDS, we've run out of sids.  We return -1, and
-	 * the caller will invalidate everything and start over.
-	 *
-	 * sid > NUM_TIDS indicates a race, which we disable preemption to
-	 * avoid.
-	 */
-	WARN_ON(sid > NUM_TIDS);
-
-	return ret;
-}
-
-/*
- * Check if given entry contain a valid shadow id mapping.
- * An ID mapping is considered valid only if
- * both vcpu and pcpu know this mapping.
- *
- * The caller must have preemption disabled, and keep it that way until
- * it has finished with the returned shadow id (either written into the
- * TLB or arch.shadow_pid, or discarded).
- */
-static inline int local_sid_lookup(struct id *entry)
-{
-	if (entry && entry->val != 0 &&
-	    __get_cpu_var(pcpu_sids).entry[entry->val] == entry &&
-	    entry->pentry == &__get_cpu_var(pcpu_sids).entry[entry->val])
-		return entry->val;
-	return -1;
-}
-
-/* Invalidate all id mappings on local core -- call with preempt disabled */
-static inline void local_sid_destroy_all(void)
-{
-	__get_cpu_var(pcpu_last_used_sid) = 0;
-	memset(&__get_cpu_var(pcpu_sids), 0, sizeof(__get_cpu_var(pcpu_sids)));
-}
-
-static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500)
-{
-	vcpu_e500->idt = kzalloc(sizeof(struct vcpu_id_table), GFP_KERNEL);
-	return vcpu_e500->idt;
-}
-
-static void kvmppc_e500_id_table_free(struct kvmppc_vcpu_e500 *vcpu_e500)
-{
-	kfree(vcpu_e500->idt);
-}
-
-/* Invalidate all mappings on vcpu */
-static void kvmppc_e500_id_table_reset_all(struct kvmppc_vcpu_e500 *vcpu_e500)
-{
-	memset(vcpu_e500->idt, 0, sizeof(struct vcpu_id_table));
-
-	/* Update shadow pid when mappings are changed */
-	kvmppc_e500_recalc_shadow_pid(vcpu_e500);
-}
-
-/* Invalidate one ID mapping on vcpu */
-static inline void kvmppc_e500_id_table_reset_one(
-			       struct kvmppc_vcpu_e500 *vcpu_e500,
-			       int as, int pid, int pr)
-{
-	struct vcpu_id_table *idt = vcpu_e500->idt;
-
-	BUG_ON(as >= 2);
-	BUG_ON(pid >= NUM_TIDS);
-	BUG_ON(pr >= 2);
-
-	idt->id[as][pid][pr].val = 0;
-	idt->id[as][pid][pr].pentry = NULL;
-
-	/* Update shadow pid when mappings are changed */
-	kvmppc_e500_recalc_shadow_pid(vcpu_e500);
-}
-
-/*
- * Map guest (vcpu,AS,ID,PR) to physical core shadow id.
- * This function first lookup if a valid mapping exists,
- * if not, then creates a new one.
- *
- * The caller must have preemption disabled, and keep it that way until
- * it has finished with the returned shadow id (either written into the
- * TLB or arch.shadow_pid, or discarded).
- */
-static unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500,
-					unsigned int as, unsigned int gid,
-					unsigned int pr, int avoid_recursion)
-{
-	struct vcpu_id_table *idt = vcpu_e500->idt;
-	int sid;
-
-	BUG_ON(as >= 2);
-	BUG_ON(gid >= NUM_TIDS);
-	BUG_ON(pr >= 2);
-
-	sid = local_sid_lookup(&idt->id[as][gid][pr]);
-
-	while (sid <= 0) {
-		/* No mapping yet */
-		sid = local_sid_setup_one(&idt->id[as][gid][pr]);
-		if (sid <= 0) {
-			_tlbil_all();
-			local_sid_destroy_all();
-		}
-
-		/* Update shadow pid when mappings are changed */
-		if (!avoid_recursion)
-			kvmppc_e500_recalc_shadow_pid(vcpu_e500);
-	}
-
-	return sid;
-}
-
-/* Map guest pid to shadow.
- * We use PID to keep shadow of current guest non-zero PID,
- * and use PID1 to keep shadow of guest zero PID.
- * So that guest tlbe with TID=0 can be accessed at any time */
-void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500)
-{
-	preempt_disable();
-	vcpu_e500->vcpu.arch.shadow_pid = kvmppc_e500_get_sid(vcpu_e500,
-			get_cur_as(&vcpu_e500->vcpu),
-			get_cur_pid(&vcpu_e500->vcpu),
-			get_cur_pr(&vcpu_e500->vcpu), 1);
-	vcpu_e500->vcpu.arch.shadow_pid1 = kvmppc_e500_get_sid(vcpu_e500,
-			get_cur_as(&vcpu_e500->vcpu), 0,
-			get_cur_pr(&vcpu_e500->vcpu), 1);
-	preempt_enable();
-}
-
 static inline unsigned int gtlb0_get_next_victim(
 		struct kvmppc_vcpu_e500 *vcpu_e500)
 {
@@ -336,6 +142,7 @@ static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
 	}
 }
 
+#ifdef CONFIG_KVM_E500
 void kvmppc_map_magic(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
@@ -360,75 +167,21 @@ void kvmppc_map_magic(struct kvm_vcpu *vcpu)
 	__write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index));
 	preempt_enable();
 }
-
-void kvmppc_e500_tlb_load(struct kvm_vcpu *vcpu, int cpu)
-{
-	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-
-	/* Shadow PID may be expired on local core */
-	kvmppc_e500_recalc_shadow_pid(vcpu_e500);
-}
-
-void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu)
-{
-}
+#endif
 
 static void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500,
 				int tlbsel, int esel)
 {
 	struct kvm_book3e_206_tlb_entry *gtlbe =
 		get_entry(vcpu_e500, tlbsel, esel);
-	struct vcpu_id_table *idt = vcpu_e500->idt;
-	unsigned int pr, tid, ts, pid;
-	u32 val, eaddr;
-	unsigned long flags;
-
-	ts = get_tlb_ts(gtlbe);
-	tid = get_tlb_tid(gtlbe);
-
-	preempt_disable();
-
-	/* One guest ID may be mapped to two shadow IDs */
-	for (pr = 0; pr < 2; pr++) {
-		/*
-		 * The shadow PID can have a valid mapping on at most one
-		 * host CPU.  In the common case, it will be valid on this
-		 * CPU, in which case (for TLB0) we do a local invalidation
-		 * of the specific address.
-		 *
-		 * If the shadow PID is not valid on the current host CPU, or
-		 * if we're invalidating a TLB1 entry, we invalidate the
-		 * entire shadow PID.
-		 */
-		if (tlbsel == 1 ||
-		    (pid = local_sid_lookup(&idt->id[ts][tid][pr])) <= 0) {
-			kvmppc_e500_id_table_reset_one(vcpu_e500, ts, tid, pr);
-			continue;
-		}
-
-		/*
-		 * The guest is invalidating a TLB0 entry which is in a PID
-		 * that has a valid shadow mapping on this host CPU.  We
-		 * search host TLB0 to invalidate it's shadow TLB entry,
-		 * similar to __tlbil_va except that we need to look in AS1.
-		 */
-		val = (pid << MAS6_SPID_SHIFT) | MAS6_SAS;
-		eaddr = get_tlb_eaddr(gtlbe);
-
-		local_irq_save(flags);
-
-		mtspr(SPRN_MAS6, val);
-		asm volatile("tlbsx 0, %[eaddr]" : : [eaddr] "r" (eaddr));
-		val = mfspr(SPRN_MAS1);
-		if (val & MAS1_VALID) {
-			mtspr(SPRN_MAS1, val & ~MAS1_VALID);
-			asm volatile("tlbwe");
-		}
 
-		local_irq_restore(flags);
+	if (tlbsel == 1) {
+		kvmppc_e500_tlbil_all(vcpu_e500);
+		return;
 	}
 
-	preempt_enable();
+	/* Guest tlbe is backed by at most one host tlbe per shadow pid. */
+	kvmppc_e500_tlbil_one(vcpu_e500, gtlbe);
 }
 
 static int tlb0_set_base(gva_t addr, int sets, int ways)
@@ -546,7 +299,7 @@ static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500)
 	int stlbsel = 1;
 	int i;
 
-	kvmppc_e500_id_table_reset_all(vcpu_e500);
+	kvmppc_e500_tlbil_all(vcpu_e500);
 
 	for (i = 0; i < host_tlb_params[stlbsel].entries; i++) {
 		struct tlbe_ref *ref =
@@ -561,19 +314,18 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
 		unsigned int eaddr, int as)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-	unsigned int victim, pidsel, tsized;
+	unsigned int victim, tsized;
 	int tlbsel;
 
 	/* since we only have two TLBs, only lower bit is used. */
 	tlbsel = (vcpu->arch.shared->mas4 >> 28) & 0x1;
 	victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0;
-	pidsel = (vcpu->arch.shared->mas4 >> 16) & 0xf;
 	tsized = (vcpu->arch.shared->mas4 >> 7) & 0x1f;
 
 	vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim)
 		| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
 	vcpu->arch.shared->mas1 = MAS1_VALID | (as ? MAS1_TS : 0)
-		| MAS1_TID(vcpu_e500->pid[pidsel])
+		| MAS1_TID(get_tlbmiss_tid(vcpu))
 		| MAS1_TSIZE(tsized);
 	vcpu->arch.shared->mas2 = (eaddr & MAS2_EPN)
 		| (vcpu->arch.shared->mas4 & MAS2_ATTRIB_MASK);
@@ -585,23 +337,22 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
 
 /* TID must be supplied by the caller */
 static inline void kvmppc_e500_setup_stlbe(
-	struct kvmppc_vcpu_e500 *vcpu_e500,
+	struct kvm_vcpu *vcpu,
 	struct kvm_book3e_206_tlb_entry *gtlbe,
 	int tsize, struct tlbe_ref *ref, u64 gvaddr,
 	struct kvm_book3e_206_tlb_entry *stlbe)
 {
 	pfn_t pfn = ref->pfn;
+	u32 pr = vcpu->arch.shared->msr & MSR_PR;
 
 	BUG_ON(!(ref->flags & E500_TLB_VALID));
 
-	/* Force TS=1 IPROT=0 for all guest mappings. */
-	stlbe->mas1 = MAS1_TSIZE(tsize) | MAS1_TS | MAS1_VALID;
-	stlbe->mas2 = (gvaddr & MAS2_EPN)
-		| e500_shadow_mas2_attrib(gtlbe->mas2,
-				vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
-	stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT)
-		| e500_shadow_mas3_attrib(gtlbe->mas7_3,
-				vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
+	/* Force IPROT=0 for all guest mappings. */
+	stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID;
+	stlbe->mas2 = (gvaddr & MAS2_EPN) |
+		      e500_shadow_mas2_attrib(gtlbe->mas2, pr);
+	stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) |
+			e500_shadow_mas3_attrib(gtlbe->mas7_3, pr);
 }
 
 static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
@@ -735,7 +486,8 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	kvmppc_e500_ref_release(ref);
 	kvmppc_e500_ref_setup(ref, gtlbe, pfn);
 
-	kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, tsize, ref, gvaddr, stlbe);
+	kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
+				ref, gvaddr, stlbe);
 }
 
 /* XXX only map the one-one case, for now use TLB0 */
@@ -775,14 +527,6 @@ static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	return victim;
 }
 
-void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr)
-{
-	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-
-	/* Recalc shadow pid since MSR changes */
-	kvmppc_e500_recalc_shadow_pid(vcpu_e500);
-}
-
 static inline int kvmppc_e500_gtlbe_invalidate(
 				struct kvmppc_vcpu_e500 *vcpu_e500,
 				int tlbsel, int esel)
@@ -810,7 +554,7 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value)
 			kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel);
 
 	/* Invalidate all vcpu id mappings */
-	kvmppc_e500_id_table_reset_all(vcpu_e500);
+	kvmppc_e500_tlbil_all(vcpu_e500);
 
 	return EMULATE_DONE;
 }
@@ -843,7 +587,7 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb)
 	}
 
 	/* Invalidate all vcpu id mappings */
-	kvmppc_e500_id_table_reset_all(vcpu_e500);
+	kvmppc_e500_tlbil_all(vcpu_e500);
 
 	return EMULATE_DONE;
 }
@@ -928,9 +672,7 @@ static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
 	int stid;
 
 	preempt_disable();
-	stid = kvmppc_e500_get_sid(vcpu_e500, get_tlb_ts(gtlbe),
-				   get_tlb_tid(gtlbe),
-				   get_cur_pr(&vcpu_e500->vcpu), 0);
+	stid = kvmppc_e500_get_tlb_stid(&vcpu_e500->vcpu, gtlbe);
 
 	stlbe->mas1 |= MAS1_TID(stid);
 	write_host_tlbe(vcpu_e500, stlbsel, sesel, stlbe);
@@ -940,8 +682,8 @@ static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
 int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-	struct kvm_book3e_206_tlb_entry *gtlbe;
-	int tlbsel, esel;
+	struct kvm_book3e_206_tlb_entry *gtlbe, stlbe;
+	int tlbsel, esel, stlbsel, sesel;
 
 	tlbsel = get_tlb_tlbsel(vcpu);
 	esel = get_tlb_esel(vcpu, tlbsel);
@@ -960,8 +702,6 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 
 	/* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
 	if (tlbe_is_host_safe(vcpu, gtlbe)) {
-		struct kvm_book3e_206_tlb_entry stlbe;
-		int stlbsel, sesel;
 		u64 eaddr;
 		u64 raddr;
 
@@ -988,7 +728,7 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 			 * are mapped on the fly. */
 			stlbsel = 1;
 			sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr,
-					raddr >> PAGE_SHIFT, gtlbe, &stlbe);
+				    raddr >> PAGE_SHIFT, gtlbe, &stlbe);
 			break;
 
 		default:
@@ -1002,6 +742,48 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 	return EMULATE_DONE;
 }
 
+static int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu,
+				  gva_t eaddr, unsigned int pid, int as)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int esel, tlbsel;
+
+	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
+		esel = kvmppc_e500_tlb_index(vcpu_e500, eaddr, tlbsel, pid, as);
+		if (esel >= 0)
+			return index_of(tlbsel, esel);
+	}
+
+	return -1;
+}
+
+/* 'linear_address' is actually an encoding of AS|PID|EADDR . */
+int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
+                               struct kvm_translation *tr)
+{
+	int index;
+	gva_t eaddr;
+	u8 pid;
+	u8 as;
+
+	eaddr = tr->linear_address;
+	pid = (tr->linear_address >> 32) & 0xff;
+	as = (tr->linear_address >> 40) & 0x1;
+
+	index = kvmppc_e500_tlb_search(vcpu, eaddr, pid, as);
+	if (index < 0) {
+		tr->valid = 0;
+		return 0;
+	}
+
+	tr->physical_address = kvmppc_mmu_xlate(vcpu, index, eaddr);
+	/* XXX what does "writeable" and "usermode" even mean? */
+	tr->valid = 1;
+
+	return 0;
+}
+
+
 int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
 {
 	unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS);
@@ -1065,7 +847,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
 		sesel = 0; /* unused */
 		priv = &vcpu_e500->gtlb_priv[tlbsel][esel];
 
-		kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, BOOK3E_PAGESZ_4K,
+		kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K,
 					&priv->ref, eaddr, &stlbe);
 		break;
 
@@ -1086,48 +868,6 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
 	write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel);
 }
 
-int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu,
-				gva_t eaddr, unsigned int pid, int as)
-{
-	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-	int esel, tlbsel;
-
-	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
-		esel = kvmppc_e500_tlb_index(vcpu_e500, eaddr, tlbsel, pid, as);
-		if (esel >= 0)
-			return index_of(tlbsel, esel);
-	}
-
-	return -1;
-}
-
-void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid)
-{
-	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-
-	if (vcpu->arch.pid != pid) {
-		vcpu_e500->pid[0] = vcpu->arch.pid = pid;
-		kvmppc_e500_recalc_shadow_pid(vcpu_e500);
-	}
-}
-
-void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500)
-{
-	struct kvm_book3e_206_tlb_entry *tlbe;
-
-	/* Insert large initial mapping for guest. */
-	tlbe = get_entry(vcpu_e500, 1, 0);
-	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M);
-	tlbe->mas2 = 0;
-	tlbe->mas7_3 = E500_TLB_SUPER_PERM_MASK;
-
-	/* 4K map for serial output. Used by kernel wrapper. */
-	tlbe = get_entry(vcpu_e500, 1, 1);
-	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K);
-	tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G;
-	tlbe->mas7_3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK;
-}
-
 static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
 	int i;
@@ -1154,6 +894,36 @@ static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
 	vcpu_e500->gtlb_arch = NULL;
 }
 
+void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	sregs->u.e.mas0 = vcpu->arch.shared->mas0;
+	sregs->u.e.mas1 = vcpu->arch.shared->mas1;
+	sregs->u.e.mas2 = vcpu->arch.shared->mas2;
+	sregs->u.e.mas7_3 = vcpu->arch.shared->mas7_3;
+	sregs->u.e.mas4 = vcpu->arch.shared->mas4;
+	sregs->u.e.mas6 = vcpu->arch.shared->mas6;
+
+	sregs->u.e.mmucfg = vcpu->arch.mmucfg;
+	sregs->u.e.tlbcfg[0] = vcpu->arch.tlbcfg[0];
+	sregs->u.e.tlbcfg[1] = vcpu->arch.tlbcfg[1];
+	sregs->u.e.tlbcfg[2] = 0;
+	sregs->u.e.tlbcfg[3] = 0;
+}
+
+int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	if (sregs->u.e.features & KVM_SREGS_E_ARCH206_MMU) {
+		vcpu->arch.shared->mas0 = sregs->u.e.mas0;
+		vcpu->arch.shared->mas1 = sregs->u.e.mas1;
+		vcpu->arch.shared->mas2 = sregs->u.e.mas2;
+		vcpu->arch.shared->mas7_3 = sregs->u.e.mas7_3;
+		vcpu->arch.shared->mas4 = sregs->u.e.mas4;
+		vcpu->arch.shared->mas6 = sregs->u.e.mas6;
+	}
+
+	return 0;
+}
+
 int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 			      struct kvm_config_tlb *cfg)
 {
@@ -1237,14 +1007,16 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 	vcpu_e500->gtlb_offset[0] = 0;
 	vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0];
 
-	vcpu_e500->tlb0cfg &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+	vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE;
+
+	vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
 	if (params.tlb_sizes[0] <= 2048)
-		vcpu_e500->tlb0cfg |= params.tlb_sizes[0];
-	vcpu_e500->tlb0cfg |= params.tlb_ways[0] << TLBnCFG_ASSOC_SHIFT;
+		vcpu->arch.tlbcfg[0] |= params.tlb_sizes[0];
+	vcpu->arch.tlbcfg[0] |= params.tlb_ways[0] << TLBnCFG_ASSOC_SHIFT;
 
-	vcpu_e500->tlb1cfg &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
-	vcpu_e500->tlb1cfg |= params.tlb_sizes[1];
-	vcpu_e500->tlb1cfg |= params.tlb_ways[1] << TLBnCFG_ASSOC_SHIFT;
+	vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+	vcpu->arch.tlbcfg[1] |= params.tlb_sizes[1];
+	vcpu->arch.tlbcfg[1] |= params.tlb_ways[1] << TLBnCFG_ASSOC_SHIFT;
 
 	vcpu_e500->shared_tlb_pages = pages;
 	vcpu_e500->num_shared_tlb_pages = num_pages;
@@ -1280,6 +1052,7 @@ int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
 
 int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
+	struct kvm_vcpu *vcpu = &vcpu_e500->vcpu;
 	int entry_size = sizeof(struct kvm_book3e_206_tlb_entry);
 	int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE;
 
@@ -1356,20 +1129,17 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 	if (!vcpu_e500->gtlb_priv[1])
 		goto err;
 
-	if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL)
-		goto err;
-
 	/* Init TLB configuration register */
-	vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) &
+	vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) &
 			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
-	vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_params[0].entries;
-	vcpu_e500->tlb0cfg |=
+	vcpu->arch.tlbcfg[0] |= vcpu_e500->gtlb_params[0].entries;
+	vcpu->arch.tlbcfg[0] |=
 		vcpu_e500->gtlb_params[0].ways << TLBnCFG_ASSOC_SHIFT;
 
-	vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) &
+	vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) &
 			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
-	vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_params[1].entries;
-	vcpu_e500->tlb0cfg |=
+	vcpu->arch.tlbcfg[0] |= vcpu_e500->gtlb_params[1].entries;
+	vcpu->arch.tlbcfg[0] |=
 		vcpu_e500->gtlb_params[1].ways << TLBnCFG_ASSOC_SHIFT;
 
 	return 0;
@@ -1384,8 +1154,6 @@ err:
 void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
 	free_gtlb(vcpu_e500);
-	kvmppc_e500_id_table_free(vcpu_e500);
-
 	kfree(vcpu_e500->tlb_refs[0]);
 	kfree(vcpu_e500->tlb_refs[1]);
 }
-- 
cgit v0.10.2


From 4f802fe98bd5bd4fe1dd86df3e5c58546e65ad09 Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 20 Dec 2011 15:34:37 +0000
Subject: KVM: PPC: e500: Track TLB1 entries with a bitmap

Rather than invalidate everything when a TLB1 entry needs to be
taken down, keep track of which host TLB1 entries are used for
a given guest TLB1 entry, and invalidate just those entries.

Based on code from Ashish Kalra <Ashish.Kalra@freescale.com>
and Liu Yu <yu.liu@freescale.com>.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index 34cef08..f4dee55 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -2,6 +2,7 @@
  * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
  *
  * Author: Yu Liu <yu.liu@freescale.com>
+ *         Ashish Kalra <ashish.kalra@freescale.com>
  *
  * Description:
  * This file is based on arch/powerpc/kvm/44x_tlb.h and
@@ -25,6 +26,7 @@
 
 #define E500_TLB_VALID 1
 #define E500_TLB_DIRTY 2
+#define E500_TLB_BITMAP 4
 
 struct tlbe_ref {
 	pfn_t pfn;
@@ -82,6 +84,9 @@ struct kvmppc_vcpu_e500 {
 	struct page **shared_tlb_pages;
 	int num_shared_tlb_pages;
 
+	u64 *g2h_tlb1_map;
+	unsigned int *h2g_tlb1_rmap;
+
 #ifdef CONFIG_KVM_E500
 	u32 pid[E500_PID_NUM];
 
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 9925fc6..c8ce51d 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -2,6 +2,7 @@
  * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
  *
  * Author: Yu Liu, yu.liu@freescale.com
+ *         Ashish Kalra, ashish.kalra@freescale.com
  *
  * Description:
  * This file is based on arch/powerpc/kvm/44x_tlb.c,
@@ -175,8 +176,28 @@ static void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500,
 	struct kvm_book3e_206_tlb_entry *gtlbe =
 		get_entry(vcpu_e500, tlbsel, esel);
 
-	if (tlbsel == 1) {
-		kvmppc_e500_tlbil_all(vcpu_e500);
+	if (tlbsel == 1 &&
+	    vcpu_e500->gtlb_priv[1][esel].ref.flags & E500_TLB_BITMAP) {
+		u64 tmp = vcpu_e500->g2h_tlb1_map[esel];
+		int hw_tlb_indx;
+		unsigned long flags;
+
+		local_irq_save(flags);
+		while (tmp) {
+			hw_tlb_indx = __ilog2_u64(tmp & -tmp);
+			mtspr(SPRN_MAS0,
+			      MAS0_TLBSEL(1) |
+			      MAS0_ESEL(to_htlb1_esel(hw_tlb_indx)));
+			mtspr(SPRN_MAS1, 0);
+			asm volatile("tlbwe");
+			vcpu_e500->h2g_tlb1_rmap[hw_tlb_indx] = 0;
+			tmp &= tmp - 1;
+		}
+		mb();
+		vcpu_e500->g2h_tlb1_map[esel] = 0;
+		vcpu_e500->gtlb_priv[1][esel].ref.flags &= ~E500_TLB_BITMAP;
+		local_irq_restore(flags);
+
 		return;
 	}
 
@@ -282,6 +303,16 @@ static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref)
 	}
 }
 
+static void clear_tlb1_bitmap(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	if (vcpu_e500->g2h_tlb1_map)
+		memset(vcpu_e500->g2h_tlb1_map,
+		       sizeof(u64) * vcpu_e500->gtlb_params[1].entries, 0);
+	if (vcpu_e500->h2g_tlb1_rmap)
+		memset(vcpu_e500->h2g_tlb1_rmap,
+		       sizeof(unsigned int) * host_tlb_params[1].entries, 0);
+}
+
 static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
 	int tlbsel = 0;
@@ -511,7 +542,7 @@ static void kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 /* XXX for both one-one and one-to-many , for now use TLB1 */
 static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 		u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
-		struct kvm_book3e_206_tlb_entry *stlbe)
+		struct kvm_book3e_206_tlb_entry *stlbe, int esel)
 {
 	struct tlbe_ref *ref;
 	unsigned int victim;
@@ -524,6 +555,14 @@ static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	ref = &vcpu_e500->tlb_refs[1][victim];
 	kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe, ref);
 
+	vcpu_e500->g2h_tlb1_map[esel] |= (u64)1 << victim;
+	vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_BITMAP;
+	if (vcpu_e500->h2g_tlb1_rmap[victim]) {
+		unsigned int idx = vcpu_e500->h2g_tlb1_rmap[victim];
+		vcpu_e500->g2h_tlb1_map[idx] &= ~(1ULL << victim);
+	}
+	vcpu_e500->h2g_tlb1_rmap[victim] = esel;
+
 	return victim;
 }
 
@@ -728,7 +767,7 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 			 * are mapped on the fly. */
 			stlbsel = 1;
 			sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr,
-				    raddr >> PAGE_SHIFT, gtlbe, &stlbe);
+				    raddr >> PAGE_SHIFT, gtlbe, &stlbe, esel);
 			break;
 
 		default:
@@ -856,7 +895,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
 
 		stlbsel = 1;
 		sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn,
-					     gtlbe, &stlbe);
+					     gtlbe, &stlbe, esel);
 		break;
 	}
 
@@ -872,6 +911,9 @@ static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
 	int i;
 
+	clear_tlb1_bitmap(vcpu_e500);
+	kfree(vcpu_e500->g2h_tlb1_map);
+
 	clear_tlb_refs(vcpu_e500);
 	kfree(vcpu_e500->gtlb_priv[0]);
 	kfree(vcpu_e500->gtlb_priv[1]);
@@ -932,6 +974,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 	char *virt;
 	struct page **pages;
 	struct tlbe_priv *privs[2] = {};
+	u64 *g2h_bitmap = NULL;
 	size_t array_len;
 	u32 sets;
 	int num_pages, ret, i;
@@ -993,10 +1036,16 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 	if (!privs[0] || !privs[1])
 		goto err_put_page;
 
+	g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1],
+	                     GFP_KERNEL);
+	if (!g2h_bitmap)
+		goto err_put_page;
+
 	free_gtlb(vcpu_e500);
 
 	vcpu_e500->gtlb_priv[0] = privs[0];
 	vcpu_e500->gtlb_priv[1] = privs[1];
+	vcpu_e500->g2h_tlb1_map = g2h_bitmap;
 
 	vcpu_e500->gtlb_arch = (struct kvm_book3e_206_tlb_entry *)
 		(virt + (cfg->array & (PAGE_SIZE - 1)));
@@ -1129,6 +1178,18 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 	if (!vcpu_e500->gtlb_priv[1])
 		goto err;
 
+	vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(unsigned int) *
+					  vcpu_e500->gtlb_params[1].entries,
+					  GFP_KERNEL);
+	if (!vcpu_e500->g2h_tlb1_map)
+		goto err;
+
+	vcpu_e500->h2g_tlb1_rmap = kzalloc(sizeof(unsigned int) *
+					   host_tlb_params[1].entries,
+					   GFP_KERNEL);
+	if (!vcpu_e500->h2g_tlb1_rmap)
+		goto err;
+
 	/* Init TLB configuration register */
 	vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) &
 			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
@@ -1154,6 +1215,7 @@ err:
 void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
 	free_gtlb(vcpu_e500);
+	kfree(vcpu_e500->h2g_tlb1_rmap);
 	kfree(vcpu_e500->tlb_refs[0]);
 	kfree(vcpu_e500->tlb_refs[1]);
 }
-- 
cgit v0.10.2


From ab9fc4056af338248640ddb18497be386360363d Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 20 Dec 2011 15:34:39 +0000
Subject: KVM: PPC: e500: emulate tlbilx

tlbilx is the new, preferred invalidation instruction.  It is not
found on e500 prior to e500mc, but there should be no harm in
supporting it on all e500.

Based on code from Ashish Kalra <Ashish.Kalra@freescale.com>.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index f4dee55..ce3f163 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -124,6 +124,7 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500,
 int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu);
 int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu);
 int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb);
+int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int rt, int ra, int rb);
 int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb);
 int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500);
 void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500);
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index c80794d..af02c18 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -22,6 +22,7 @@
 #define XOP_TLBSX   914
 #define XOP_TLBRE   946
 #define XOP_TLBWE   978
+#define XOP_TLBILX  18
 
 int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                            unsigned int inst, int *advance)
@@ -29,6 +30,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	int emulated = EMULATE_DONE;
 	int ra;
 	int rb;
+	int rt;
 
 	switch (get_op(inst)) {
 	case 31:
@@ -47,6 +49,13 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			emulated = kvmppc_e500_emul_tlbsx(vcpu,rb);
 			break;
 
+		case XOP_TLBILX:
+			ra = get_ra(inst);
+			rb = get_rb(inst);
+			rt = get_rt(inst);
+			emulated = kvmppc_e500_emul_tlbilx(vcpu, rt, ra, rb);
+			break;
+
 		case XOP_TLBIVAX:
 			ra = get_ra(inst);
 			rb = get_rb(inst);
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index c8ce51d..6eb5d65 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -631,6 +631,58 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb)
 	return EMULATE_DONE;
 }
 
+static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
+		       int pid, int rt)
+{
+	struct kvm_book3e_206_tlb_entry *tlbe;
+	int tid, esel;
+
+	/* invalidate all entries */
+	for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; esel++) {
+		tlbe = get_entry(vcpu_e500, tlbsel, esel);
+		tid = get_tlb_tid(tlbe);
+		if (rt == 0 || tid == pid) {
+			inval_gtlbe_on_host(vcpu_e500, tlbsel, esel);
+			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
+		}
+	}
+}
+
+static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid,
+		       int ra, int rb)
+{
+	int tlbsel, esel;
+	gva_t ea;
+
+	ea = kvmppc_get_gpr(&vcpu_e500->vcpu, rb);
+	if (ra)
+		ea += kvmppc_get_gpr(&vcpu_e500->vcpu, ra);
+
+	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
+		esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, -1);
+		if (esel >= 0) {
+			inval_gtlbe_on_host(vcpu_e500, tlbsel, esel);
+			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
+			break;
+		}
+	}
+}
+
+int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int rt, int ra, int rb)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int pid = get_cur_spid(vcpu);
+
+	if (rt == 0 || rt == 1) {
+		tlbilx_all(vcpu_e500, 0, pid, rt);
+		tlbilx_all(vcpu_e500, 1, pid, rt);
+	} else if (rt == 3) {
+		tlbilx_one(vcpu_e500, pid, ra, rb);
+	}
+
+	return EMULATE_DONE;
+}
+
 int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-- 
cgit v0.10.2


From cfac57847a67c4903f34a77e971521531bbc7c77 Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 20 Dec 2011 15:34:40 +0000
Subject: powerpc/booke: Provide exception macros with interrupt name

DO_KVM will need to identify the particular exception type.

There is an existing set of arbitrary numbers that Linux passes,
but it's an undocumented mess that sort of corresponds to server/classic
exception vectors but not really.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S
index 7dd2981..d1192c5 100644
--- a/arch/powerpc/kernel/head_44x.S
+++ b/arch/powerpc/kernel/head_44x.S
@@ -248,10 +248,11 @@ _ENTRY(_start);
 
 interrupt_base:
 	/* Critical Input Interrupt */
-	CRITICAL_EXCEPTION(0x0100, CriticalInput, unknown_exception)
+	CRITICAL_EXCEPTION(0x0100, CRITICAL, CriticalInput, unknown_exception)
 
 	/* Machine Check Interrupt */
-	CRITICAL_EXCEPTION(0x0200, MachineCheck, machine_check_exception)
+	CRITICAL_EXCEPTION(0x0200, MACHINE_CHECK, MachineCheck, \
+			   machine_check_exception)
 	MCHECK_EXCEPTION(0x0210, MachineCheckA, machine_check_exception)
 
 	/* Data Storage Interrupt */
@@ -261,7 +262,8 @@ interrupt_base:
 	INSTRUCTION_STORAGE_EXCEPTION
 
 	/* External Input Interrupt */
-	EXCEPTION(0x0500, ExternalInput, do_IRQ, EXC_XFER_LITE)
+	EXCEPTION(0x0500, BOOKE_INTERRUPT_EXTERNAL, ExternalInput, \
+		  do_IRQ, EXC_XFER_LITE)
 
 	/* Alignment Interrupt */
 	ALIGNMENT_EXCEPTION
@@ -273,29 +275,32 @@ interrupt_base:
 #ifdef CONFIG_PPC_FPU
 	FP_UNAVAILABLE_EXCEPTION
 #else
-	EXCEPTION(0x2010, FloatingPointUnavailable, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2010, BOOKE_INTERRUPT_FP_UNAVAIL, \
+		  FloatingPointUnavailable, unknown_exception, EXC_XFER_EE)
 #endif
 	/* System Call Interrupt */
 	START_EXCEPTION(SystemCall)
-	NORMAL_EXCEPTION_PROLOG
+	NORMAL_EXCEPTION_PROLOG(BOOKE_INTERRUPT_SYSCALL)
 	EXC_XFER_EE_LITE(0x0c00, DoSyscall)
 
 	/* Auxiliary Processor Unavailable Interrupt */
-	EXCEPTION(0x2020, AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2020, BOOKE_INTERRUPT_AP_UNAVAIL, \
+		  AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_EE)
 
 	/* Decrementer Interrupt */
 	DECREMENTER_EXCEPTION
 
 	/* Fixed Internal Timer Interrupt */
 	/* TODO: Add FIT support */
-	EXCEPTION(0x1010, FixedIntervalTimer, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1010, BOOKE_INTERRUPT_FIT, FixedIntervalTimer, \
+		  unknown_exception, EXC_XFER_EE)
 
 	/* Watchdog Timer Interrupt */
 	/* TODO: Add watchdog support */
 #ifdef CONFIG_BOOKE_WDT
-	CRITICAL_EXCEPTION(0x1020, WatchdogTimer, WatchdogException)
+	CRITICAL_EXCEPTION(0x1020, WATCHDOG, WatchdogTimer, WatchdogException)
 #else
-	CRITICAL_EXCEPTION(0x1020, WatchdogTimer, unknown_exception)
+	CRITICAL_EXCEPTION(0x1020, WATCHDOG, WatchdogTimer, unknown_exception)
 #endif
 
 	/* Data TLB Error Interrupt */
diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
index 0e41753..51fd072 100644
--- a/arch/powerpc/kernel/head_booke.h
+++ b/arch/powerpc/kernel/head_booke.h
@@ -2,6 +2,8 @@
 #define __HEAD_BOOKE_H__
 
 #include <asm/ptrace.h>	/* for STACK_FRAME_REGS_MARKER */
+#include <asm/kvm_asm.h>
+
 /*
  * Macros used for common Book-e exception handling
  */
@@ -28,7 +30,7 @@
  */
 #define THREAD_NORMSAVE(offset)	(THREAD_NORMSAVES + (offset * 4))
 
-#define NORMAL_EXCEPTION_PROLOG						     \
+#define NORMAL_EXCEPTION_PROLOG(intno)						     \
 	mtspr	SPRN_SPRG_WSCRATCH0, r10;	/* save one register */	     \
 	mfspr	r10, SPRN_SPRG_THREAD;					     \
 	stw	r11, THREAD_NORMSAVE(0)(r10);				     \
@@ -113,7 +115,7 @@
  * registers as the normal prolog above. Instead we use a portion of the
  * critical/machine check exception stack at low physical addresses.
  */
-#define EXC_LEVEL_EXCEPTION_PROLOG(exc_level, exc_level_srr0, exc_level_srr1) \
+#define EXC_LEVEL_EXCEPTION_PROLOG(exc_level, intno, exc_level_srr0, exc_level_srr1) \
 	mtspr	SPRN_SPRG_WSCRATCH_##exc_level,r8;			     \
 	BOOKE_LOAD_EXC_LEVEL_STACK(exc_level);/* r8 points to the exc_level stack*/ \
 	stw	r9,GPR9(r8);		/* save various registers	   */\
@@ -162,12 +164,13 @@
 	SAVE_4GPRS(3, r11);						     \
 	SAVE_2GPRS(7, r11)
 
-#define CRITICAL_EXCEPTION_PROLOG \
-		EXC_LEVEL_EXCEPTION_PROLOG(CRIT, SPRN_CSRR0, SPRN_CSRR1)
+#define CRITICAL_EXCEPTION_PROLOG(intno) \
+		EXC_LEVEL_EXCEPTION_PROLOG(CRIT, intno, SPRN_CSRR0, SPRN_CSRR1)
 #define DEBUG_EXCEPTION_PROLOG \
-		EXC_LEVEL_EXCEPTION_PROLOG(DBG, SPRN_DSRR0, SPRN_DSRR1)
+		EXC_LEVEL_EXCEPTION_PROLOG(DBG, DEBUG, SPRN_DSRR0, SPRN_DSRR1)
 #define MCHECK_EXCEPTION_PROLOG \
-		EXC_LEVEL_EXCEPTION_PROLOG(MC, SPRN_MCSRR0, SPRN_MCSRR1)
+		EXC_LEVEL_EXCEPTION_PROLOG(MC, MACHINE_CHECK, \
+			SPRN_MCSRR0, SPRN_MCSRR1)
 
 /*
  * Exception vectors.
@@ -181,16 +184,16 @@ label:
 	.long	func;						\
 	.long	ret_from_except_full
 
-#define EXCEPTION(n, label, hdlr, xfer)				\
+#define EXCEPTION(n, intno, label, hdlr, xfer)			\
 	START_EXCEPTION(label);					\
-	NORMAL_EXCEPTION_PROLOG;				\
+	NORMAL_EXCEPTION_PROLOG(intno);				\
 	addi	r3,r1,STACK_FRAME_OVERHEAD;			\
 	xfer(n, hdlr)
 
-#define CRITICAL_EXCEPTION(n, label, hdlr)			\
-	START_EXCEPTION(label);					\
-	CRITICAL_EXCEPTION_PROLOG;				\
-	addi	r3,r1,STACK_FRAME_OVERHEAD;			\
+#define CRITICAL_EXCEPTION(n, intno, label, hdlr)			\
+	START_EXCEPTION(label);						\
+	CRITICAL_EXCEPTION_PROLOG(intno);				\
+	addi	r3,r1,STACK_FRAME_OVERHEAD;				\
 	EXC_XFER_TEMPLATE(hdlr, n+2, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \
 			  NOCOPY, crit_transfer_to_handler, \
 			  ret_from_crit_exc)
@@ -302,7 +305,7 @@ label:
 
 #define DEBUG_CRIT_EXCEPTION						      \
 	START_EXCEPTION(DebugCrit);					      \
-	CRITICAL_EXCEPTION_PROLOG;					      \
+	CRITICAL_EXCEPTION_PROLOG(DEBUG);				      \
 									      \
 	/*								      \
 	 * If there is a single step or branch-taken exception in an	      \
@@ -355,7 +358,7 @@ label:
 
 #define DATA_STORAGE_EXCEPTION						      \
 	START_EXCEPTION(DataStorage)					      \
-	NORMAL_EXCEPTION_PROLOG;					      \
+	NORMAL_EXCEPTION_PROLOG(DATA_STORAGE);		      \
 	mfspr	r5,SPRN_ESR;		/* Grab the ESR and save it */	      \
 	stw	r5,_ESR(r11);						      \
 	mfspr	r4,SPRN_DEAR;		/* Grab the DEAR */		      \
@@ -363,7 +366,7 @@ label:
 
 #define INSTRUCTION_STORAGE_EXCEPTION					      \
 	START_EXCEPTION(InstructionStorage)				      \
-	NORMAL_EXCEPTION_PROLOG;					      \
+	NORMAL_EXCEPTION_PROLOG(INST_STORAGE);		      \
 	mfspr	r5,SPRN_ESR;		/* Grab the ESR and save it */	      \
 	stw	r5,_ESR(r11);						      \
 	mr      r4,r12;                 /* Pass SRR0 as arg2 */		      \
@@ -372,7 +375,7 @@ label:
 
 #define ALIGNMENT_EXCEPTION						      \
 	START_EXCEPTION(Alignment)					      \
-	NORMAL_EXCEPTION_PROLOG;					      \
+	NORMAL_EXCEPTION_PROLOG(ALIGNMENT);		      \
 	mfspr   r4,SPRN_DEAR;           /* Grab the DEAR and save it */	      \
 	stw     r4,_DEAR(r11);						      \
 	addi    r3,r1,STACK_FRAME_OVERHEAD;				      \
@@ -380,7 +383,7 @@ label:
 
 #define PROGRAM_EXCEPTION						      \
 	START_EXCEPTION(Program)					      \
-	NORMAL_EXCEPTION_PROLOG;					      \
+	NORMAL_EXCEPTION_PROLOG(PROGRAM);		      \
 	mfspr	r4,SPRN_ESR;		/* Grab the ESR and save it */	      \
 	stw	r4,_ESR(r11);						      \
 	addi	r3,r1,STACK_FRAME_OVERHEAD;				      \
@@ -388,7 +391,7 @@ label:
 
 #define DECREMENTER_EXCEPTION						      \
 	START_EXCEPTION(Decrementer)					      \
-	NORMAL_EXCEPTION_PROLOG;					      \
+	NORMAL_EXCEPTION_PROLOG(DECREMENTER);		      \
 	lis     r0,TSR_DIS@h;           /* Setup the DEC interrupt mask */    \
 	mtspr   SPRN_TSR,r0;		/* Clear the DEC interrupt */	      \
 	addi    r3,r1,STACK_FRAME_OVERHEAD;				      \
@@ -396,7 +399,7 @@ label:
 
 #define FP_UNAVAILABLE_EXCEPTION					      \
 	START_EXCEPTION(FloatingPointUnavailable)			      \
-	NORMAL_EXCEPTION_PROLOG;					      \
+	NORMAL_EXCEPTION_PROLOG(FP_UNAVAIL);		      \
 	beq	1f;							      \
 	bl	load_up_fpu;		/* if from user, just load it up */   \
 	b	fast_exception_return;					      \
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index 28e6259..7c406dd 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -301,19 +301,20 @@ _ENTRY(__early_start)
 
 interrupt_base:
 	/* Critical Input Interrupt */
-	CRITICAL_EXCEPTION(0x0100, CriticalInput, unknown_exception)
+	CRITICAL_EXCEPTION(0x0100, CRITICAL, CriticalInput, unknown_exception)
 
 	/* Machine Check Interrupt */
 #ifdef CONFIG_E200
 	/* no RFMCI, MCSRRs on E200 */
-	CRITICAL_EXCEPTION(0x0200, MachineCheck, machine_check_exception)
+	CRITICAL_EXCEPTION(0x0200, MACHINE_CHECK, MachineCheck, \
+			   machine_check_exception)
 #else
 	MCHECK_EXCEPTION(0x0200, MachineCheck, machine_check_exception)
 #endif
 
 	/* Data Storage Interrupt */
 	START_EXCEPTION(DataStorage)
-	NORMAL_EXCEPTION_PROLOG
+	NORMAL_EXCEPTION_PROLOG(DATA_STORAGE)
 	mfspr	r5,SPRN_ESR		/* Grab the ESR, save it, pass arg3 */
 	stw	r5,_ESR(r11)
 	mfspr	r4,SPRN_DEAR		/* Grab the DEAR, save it, pass arg2 */
@@ -328,7 +329,7 @@ interrupt_base:
 	INSTRUCTION_STORAGE_EXCEPTION
 
 	/* External Input Interrupt */
-	EXCEPTION(0x0500, ExternalInput, do_IRQ, EXC_XFER_LITE)
+	EXCEPTION(0x0500, EXTERNAL, ExternalInput, do_IRQ, EXC_XFER_LITE)
 
 	/* Alignment Interrupt */
 	ALIGNMENT_EXCEPTION
@@ -342,32 +343,36 @@ interrupt_base:
 #else
 #ifdef CONFIG_E200
 	/* E200 treats 'normal' floating point instructions as FP Unavail exception */
-	EXCEPTION(0x0800, FloatingPointUnavailable, program_check_exception, EXC_XFER_EE)
+	EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, \
+		  program_check_exception, EXC_XFER_EE)
 #else
-	EXCEPTION(0x0800, FloatingPointUnavailable, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, \
+		  unknown_exception, EXC_XFER_EE)
 #endif
 #endif
 
 	/* System Call Interrupt */
 	START_EXCEPTION(SystemCall)
-	NORMAL_EXCEPTION_PROLOG
+	NORMAL_EXCEPTION_PROLOG(SYSCALL)
 	EXC_XFER_EE_LITE(0x0c00, DoSyscall)
 
 	/* Auxiliary Processor Unavailable Interrupt */
-	EXCEPTION(0x2900, AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2900, AP_UNAVAIL, AuxillaryProcessorUnavailable, \
+		  unknown_exception, EXC_XFER_EE)
 
 	/* Decrementer Interrupt */
 	DECREMENTER_EXCEPTION
 
 	/* Fixed Internal Timer Interrupt */
 	/* TODO: Add FIT support */
-	EXCEPTION(0x3100, FixedIntervalTimer, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x3100, FIT, FixedIntervalTimer, \
+		  unknown_exception, EXC_XFER_EE)
 
 	/* Watchdog Timer Interrupt */
 #ifdef CONFIG_BOOKE_WDT
-	CRITICAL_EXCEPTION(0x3200, WatchdogTimer, WatchdogException)
+	CRITICAL_EXCEPTION(0x3200, WATCHDOG, WatchdogTimer, WatchdogException)
 #else
-	CRITICAL_EXCEPTION(0x3200, WatchdogTimer, unknown_exception)
+	CRITICAL_EXCEPTION(0x3200, WATCHDOG, WatchdogTimer, unknown_exception)
 #endif
 
 	/* Data TLB Error Interrupt */
@@ -538,31 +543,38 @@ interrupt_base:
 #ifdef CONFIG_SPE
 	/* SPE Unavailable */
 	START_EXCEPTION(SPEUnavailable)
-	NORMAL_EXCEPTION_PROLOG
+	NORMAL_EXCEPTION_PROLOG(SPE_UNAVAIL)
 	bne	load_up_spe
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	EXC_XFER_EE_LITE(0x2010, KernelSPE)
 #else
-	EXCEPTION(0x2020, SPEUnavailable, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2020, SPE_UNAVAIL, SPEUnavailable, \
+		  unknown_exception, EXC_XFER_EE)
 #endif /* CONFIG_SPE */
 
 	/* SPE Floating Point Data */
 #ifdef CONFIG_SPE
-	EXCEPTION(0x2030, SPEFloatingPointData, SPEFloatingPointException, EXC_XFER_EE);
+	EXCEPTION(0x2030, SPE_FP_DATA, SPEFloatingPointData, \
+		  SPEFloatingPointException, EXC_XFER_EE);
 
 	/* SPE Floating Point Round */
-	EXCEPTION(0x2050, SPEFloatingPointRound, SPEFloatingPointRoundException, EXC_XFER_EE)
+	EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \
+		  SPEFloatingPointRoundException, EXC_XFER_EE)
 #else
-	EXCEPTION(0x2040, SPEFloatingPointData, unknown_exception, EXC_XFER_EE)
-	EXCEPTION(0x2050, SPEFloatingPointRound, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2040, SPE_FP_DATA, SPEFloatingPointData, \
+		  unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \
+		  unknown_exception, EXC_XFER_EE)
 #endif /* CONFIG_SPE */
 
 	/* Performance Monitor */
-	EXCEPTION(0x2060, PerformanceMonitor, performance_monitor_exception, EXC_XFER_STD)
+	EXCEPTION(0x2060, PERFORMANCE_MONITOR, PerformanceMonitor, \
+		  performance_monitor_exception, EXC_XFER_STD)
 
-	EXCEPTION(0x2070, Doorbell, doorbell_exception, EXC_XFER_STD)
+	EXCEPTION(0x2070, DOORBELL, Doorbell, doorbell_exception, EXC_XFER_STD)
 
-	CRITICAL_EXCEPTION(0x2080, CriticalDoorbell, unknown_exception)
+	CRITICAL_EXCEPTION(0x2080, DOORBELL_CRITICAL, \
+			   CriticalDoorbell, unknown_exception)
 
 	/* Debug Interrupt */
 	DEBUG_DEBUG_EXCEPTION
-- 
cgit v0.10.2


From d30f6e480055e5be12e7a03fd11ea912a451daa5 Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 20 Dec 2011 15:34:43 +0000
Subject: KVM: PPC: booke: category E.HV (GS-mode) support

Chips such as e500mc that implement category E.HV in Power ISA 2.06
provide hardware virtualization features, including a new MSR mode for
guest state.  The guest OS can perform many operations without trapping
into the hypervisor, including transitions to and from guest userspace.

Since we can use SRR1[GS] to reliably tell whether an exception came from
guest state, instead of messing around with IVPR, we use DO_KVM similarly
to book3s.

Current issues include:
 - Machine checks from guest state are not routed to the host handler.
 - The guest can cause a host oops by executing an emulated instruction
   in a page that lacks read permission.  Existing e500/4xx support has
   the same problem.

Includes work by Ashish Kalra <Ashish.Kalra@freescale.com>,
Varun Sethi <Varun.Sethi@freescale.com>, and
Liu Yu <yu.liu@freescale.com>.

Signed-off-by: Scott Wood <scottwood@freescale.com>
[agraf: remove pt_regs usage]
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h
index efa74ac..d7365b0 100644
--- a/arch/powerpc/include/asm/dbell.h
+++ b/arch/powerpc/include/asm/dbell.h
@@ -19,6 +19,7 @@
 
 #define PPC_DBELL_MSG_BRDCAST	(0x04000000)
 #define PPC_DBELL_TYPE(x)	(((x) & 0xf) << (63-36))
+#define PPC_DBELL_LPID(x)	((x) << (63 - 49))
 enum ppc_dbell {
 	PPC_DBELL = 0,		/* doorbell */
 	PPC_DBELL_CRIT = 1,	/* critical doorbell */
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 7b1f0e0..0978152 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -48,6 +48,14 @@
 #define BOOKE_INTERRUPT_SPE_FP_DATA 33
 #define BOOKE_INTERRUPT_SPE_FP_ROUND 34
 #define BOOKE_INTERRUPT_PERFORMANCE_MONITOR 35
+#define BOOKE_INTERRUPT_DOORBELL 36
+#define BOOKE_INTERRUPT_DOORBELL_CRITICAL 37
+
+/* booke_hv */
+#define BOOKE_INTERRUPT_GUEST_DBELL 38
+#define BOOKE_INTERRUPT_GUEST_DBELL_CRIT 39
+#define BOOKE_INTERRUPT_HV_SYSCALL 40
+#define BOOKE_INTERRUPT_HV_PRIV 41
 
 /* book3s */
 
diff --git a/arch/powerpc/include/asm/kvm_booke_hv_asm.h b/arch/powerpc/include/asm/kvm_booke_hv_asm.h
new file mode 100644
index 0000000..30a600f
--- /dev/null
+++ b/arch/powerpc/include/asm/kvm_booke_hv_asm.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2010-2011 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef ASM_KVM_BOOKE_HV_ASM_H
+#define ASM_KVM_BOOKE_HV_ASM_H
+
+#ifdef __ASSEMBLY__
+
+/*
+ * All exceptions from guest state must go through KVM
+ * (except for those which are delivered directly to the guest) --
+ * there are no exceptions for which we fall through directly to
+ * the normal host handler.
+ *
+ * Expected inputs (normal exceptions):
+ *   SCRATCH0 = saved r10
+ *   r10 = thread struct
+ *   r11 = appropriate SRR1 variant (currently used as scratch)
+ *   r13 = saved CR
+ *   *(r10 + THREAD_NORMSAVE(0)) = saved r11
+ *   *(r10 + THREAD_NORMSAVE(2)) = saved r13
+ *
+ * Expected inputs (crit/mcheck/debug exceptions):
+ *   appropriate SCRATCH = saved r8
+ *   r8 = exception level stack frame
+ *   r9 = *(r8 + _CCR) = saved CR
+ *   r11 = appropriate SRR1 variant (currently used as scratch)
+ *   *(r8 + GPR9) = saved r9
+ *   *(r8 + GPR10) = saved r10 (r10 not yet clobbered)
+ *   *(r8 + GPR11) = saved r11
+ */
+.macro DO_KVM intno srr1
+#ifdef CONFIG_KVM_BOOKE_HV
+BEGIN_FTR_SECTION
+	mtocrf	0x80, r11	/* check MSR[GS] without clobbering reg */
+	bf	3, kvmppc_resume_\intno\()_\srr1
+	b	kvmppc_handler_\intno\()_\srr1
+kvmppc_resume_\intno\()_\srr1:
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+#endif
+.endm
+
+#endif /*__ASSEMBLY__ */
+#endif /* ASM_KVM_BOOKE_HV_ASM_H */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 5b81cbc..e645623 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -106,6 +106,8 @@ struct kvm_vcpu_stat {
 	u32 dec_exits;
 	u32 ext_intr_exits;
 	u32 halt_wakeup;
+	u32 dbell_exits;
+	u32 gdbell_exits;
 #ifdef CONFIG_PPC_BOOK3S
 	u32 pf_storage;
 	u32 pf_instruc;
@@ -140,6 +142,7 @@ enum kvm_exit_types {
 	EMULATED_TLBSX_EXITS,
 	EMULATED_TLBWE_EXITS,
 	EMULATED_RFI_EXITS,
+	EMULATED_RFCI_EXITS,
 	DEC_EXITS,
 	EXT_INTR_EXITS,
 	HALT_WAKEUP,
@@ -147,6 +150,8 @@ enum kvm_exit_types {
 	FP_UNAVAIL,
 	DEBUG_EXITS,
 	TIMEINGUEST,
+	DBELL_EXITS,
+	GDBELL_EXITS,
 	__NUMBER_OF_KVM_EXIT_TYPES
 };
 
@@ -217,10 +222,10 @@ struct kvm_arch_memory_slot {
 };
 
 struct kvm_arch {
+	unsigned int lpid;
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	unsigned long hpt_virt;
 	struct revmap_entry *revmap;
-	unsigned int lpid;
 	unsigned int host_lpid;
 	unsigned long host_lpcr;
 	unsigned long sdr1;
@@ -345,6 +350,17 @@ struct kvm_vcpu_arch {
 	u64 vsr[64];
 #endif
 
+#ifdef CONFIG_KVM_BOOKE_HV
+	u32 host_mas4;
+	u32 host_mas6;
+	u32 shadow_epcr;
+	u32 epcr;
+	u32 shadow_msrp;
+	u32 eplc;
+	u32 epsc;
+	u32 oldpir;
+#endif
+
 #ifdef CONFIG_PPC_BOOK3S
 	/* For Gekko paired singles */
 	u32 qpr[32];
@@ -428,6 +444,7 @@ struct kvm_vcpu_arch {
 	ulong queued_esr;
 	u32 tlbcfg[4];
 	u32 mmucfg;
+	u32 epr;
 #endif
 	gpa_t paddr_accessed;
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 731e920..e709975 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -139,6 +139,9 @@ extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem);
 
+extern int kvmppc_bookehv_init(void);
+extern void kvmppc_bookehv_exit(void);
+
 /*
  * Cuts out inst bits with ordering according to spec.
  * That means the leftmost bit is zero. All given bits are included.
diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
index cdb54218..eeabcdb 100644
--- a/arch/powerpc/include/asm/mmu-book3e.h
+++ b/arch/powerpc/include/asm/mmu-book3e.h
@@ -104,6 +104,8 @@
 #define MAS4_TSIZED_MASK	0x00000f80	/* Default TSIZE */
 #define MAS4_TSIZED_SHIFT	7
 
+#define MAS5_SGS		0x80000000
+
 #define MAS6_SPID0		0x3FFF0000
 #define MAS6_SPID1		0x00007FFE
 #define MAS6_ISIZE(x)		MAS1_TSIZE(x)
@@ -118,6 +120,10 @@
 
 #define MAS7_RPN		0xFFFFFFFF
 
+#define MAS8_TGS		0x80000000 /* Guest space */
+#define MAS8_VF			0x40000000 /* Virtualization Fault */
+#define MAS8_TLPID		0x000000ff
+
 /* Bit definitions for MMUCFG */
 #define MMUCFG_MAVN	0x00000003	/* MMU Architecture Version Number */
 #define MMUCFG_MAVN_V1	0x00000000	/* v1.0 */
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 8e2d037..2a25ab0 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -243,6 +243,9 @@ struct thread_struct {
 #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
 	void*		kvm_shadow_vcpu; /* KVM internal data */
 #endif /* CONFIG_KVM_BOOK3S_32_HANDLER */
+#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE)
+	struct kvm_vcpu	*kvm_vcpu;
+#endif
 #ifdef CONFIG_PPC64
 	unsigned long	dscr;
 	int		dscr_inherit;
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 9d7f0fb..f0cb7f4 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -257,7 +257,9 @@
 #define   LPCR_LPES_SH	2
 #define   LPCR_RMI     0x00000002      /* real mode is cache inhibit */
 #define   LPCR_HDICE   0x00000001      /* Hyp Decr enable (HV,PR,EE) */
+#ifndef SPRN_LPID
 #define SPRN_LPID	0x13F	/* Logical Partition Identifier */
+#endif
 #define   LPID_RSVD	0x3ff		/* Reserved LPID for partn switching */
 #define	SPRN_HMER	0x150	/* Hardware m? error recovery */
 #define	SPRN_HMEER	0x151	/* Hardware m? enable error recovery */
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index b86faa9..815e404 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -61,18 +61,30 @@ extern u32 booke_wdt_period;
 #define SPRN_SPRG7W	0x117	/* Special Purpose Register General 7 Write */
 #define SPRN_EPCR	0x133	/* Embedded Processor Control Register */
 #define SPRN_DBCR2	0x136	/* Debug Control Register 2 */
+#define SPRN_MSRP	0x137	/* MSR Protect Register */
 #define SPRN_IAC3	0x13A	/* Instruction Address Compare 3 */
 #define SPRN_IAC4	0x13B	/* Instruction Address Compare 4 */
 #define SPRN_DVC1	0x13E	/* Data Value Compare Register 1 */
 #define SPRN_DVC2	0x13F	/* Data Value Compare Register 2 */
+#define SPRN_LPID	0x152	/* Logical Partition ID */
 #define SPRN_MAS8	0x155	/* MMU Assist Register 8 */
 #define SPRN_TLB0PS	0x158	/* TLB 0 Page Size Register */
 #define SPRN_TLB1PS	0x159	/* TLB 1 Page Size Register */
 #define SPRN_MAS5_MAS6	0x15c	/* MMU Assist Register 5 || 6 */
 #define SPRN_MAS8_MAS1	0x15d	/* MMU Assist Register 8 || 1 */
 #define SPRN_EPTCFG	0x15e	/* Embedded Page Table Config */
+#define SPRN_GSPRG0	0x170	/* Guest SPRG0 */
+#define SPRN_GSPRG1	0x171	/* Guest SPRG1 */
+#define SPRN_GSPRG2	0x172	/* Guest SPRG2 */
+#define SPRN_GSPRG3	0x173	/* Guest SPRG3 */
 #define SPRN_MAS7_MAS3	0x174	/* MMU Assist Register 7 || 3 */
 #define SPRN_MAS0_MAS1	0x175	/* MMU Assist Register 0 || 1 */
+#define SPRN_GSRR0	0x17A	/* Guest SRR0 */
+#define SPRN_GSRR1	0x17B	/* Guest SRR1 */
+#define SPRN_GEPR	0x17C	/* Guest EPR */
+#define SPRN_GDEAR	0x17D	/* Guest DEAR */
+#define SPRN_GPIR	0x17E	/* Guest PIR */
+#define SPRN_GESR	0x17F	/* Guest Exception Syndrome Register */
 #define SPRN_IVOR0	0x190	/* Interrupt Vector Offset Register 0 */
 #define SPRN_IVOR1	0x191	/* Interrupt Vector Offset Register 1 */
 #define SPRN_IVOR2	0x192	/* Interrupt Vector Offset Register 2 */
@@ -93,6 +105,13 @@ extern u32 booke_wdt_period;
 #define SPRN_IVOR39	0x1B1	/* Interrupt Vector Offset Register 39 */
 #define SPRN_IVOR40	0x1B2	/* Interrupt Vector Offset Register 40 */
 #define SPRN_IVOR41	0x1B3	/* Interrupt Vector Offset Register 41 */
+#define SPRN_GIVOR2	0x1B8	/* Guest IVOR2 */
+#define SPRN_GIVOR3	0x1B9	/* Guest IVOR3 */
+#define SPRN_GIVOR4	0x1BA	/* Guest IVOR4 */
+#define SPRN_GIVOR8	0x1BB	/* Guest IVOR8 */
+#define SPRN_GIVOR13	0x1BC	/* Guest IVOR13 */
+#define SPRN_GIVOR14	0x1BD	/* Guest IVOR14 */
+#define SPRN_GIVPR	0x1BF	/* Guest IVPR */
 #define SPRN_SPEFSCR	0x200	/* SPE & Embedded FP Status & Control */
 #define SPRN_BBEAR	0x201	/* Branch Buffer Entry Address Register */
 #define SPRN_BBTAR	0x202	/* Branch Buffer Target Address Register */
@@ -245,6 +264,10 @@ extern u32 booke_wdt_period;
 #define MCSR_LDG	0x00002000UL /* Guarded Load */
 #define MCSR_TLBSYNC	0x00000002UL /* Multiple tlbsyncs detected */
 #define MCSR_BSL2_ERR	0x00000001UL /* Backside L2 cache error */
+
+#define MSRP_UCLEP	0x04000000 /* Protect MSR[UCLE] */
+#define MSRP_DEP	0x00000200 /* Protect MSR[DE] */
+#define MSRP_PMMP	0x00000004 /* Protect MSR[PMM] */
 #endif
 
 #ifdef CONFIG_E200
@@ -599,6 +622,17 @@ extern u32 booke_wdt_period;
 #define SPRN_EPCR_DMIUH		0x00400000	/* Disable MAS Interrupt updates
 						 * for hypervisor */
 
+/* Bit definitions for EPLC/EPSC */
+#define EPC_EPR		0x80000000 /* 1 = user, 0 = kernel */
+#define EPC_EPR_SHIFT	31
+#define EPC_EAS		0x40000000 /* Address Space */
+#define EPC_EAS_SHIFT	30
+#define EPC_EGS		0x20000000 /* 1 = guest, 0 = hypervisor */
+#define EPC_EGS_SHIFT	29
+#define EPC_ELPID	0x00ff0000
+#define EPC_ELPID_SHIFT	16
+#define EPC_EPID	0x00003fff
+#define EPC_EPID_SHIFT	0
 
 /*
  * The IBM-403 is an even more odd special case, as it is much
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 34b8afe9..bbede58 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -116,6 +116,9 @@ int main(void)
 #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
 	DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu));
 #endif
+#ifdef CONFIG_KVM_BOOKE_HV
+	DEFINE(THREAD_KVM_VCPU, offsetof(struct thread_struct, kvm_vcpu));
+#endif
 
 	DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
 	DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
@@ -387,6 +390,7 @@ int main(void)
 #ifdef CONFIG_KVM
 	DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
 	DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
+	DEFINE(VCPU_GUEST_PID, offsetof(struct kvm_vcpu, arch.pid));
 	DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
 	DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave));
 	DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fpr));
@@ -429,9 +433,11 @@ int main(void)
 	DEFINE(VCPU_SHARED_MAS4, offsetof(struct kvm_vcpu_arch_shared, mas4));
 	DEFINE(VCPU_SHARED_MAS6, offsetof(struct kvm_vcpu_arch_shared, mas6));
 
+	DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
+	DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid));
+
 	/* book3s */
 #ifdef CONFIG_KVM_BOOK3S_64_HV
-	DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid));
 	DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1));
 	DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid));
 	DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
@@ -446,7 +452,6 @@ int main(void)
 	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
 #endif
 #ifdef CONFIG_PPC_BOOK3S
-	DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
 	DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
 	DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
 	DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
@@ -597,6 +602,12 @@ int main(void)
 	DEFINE(VCPU_HOST_SPEFSCR, offsetof(struct kvm_vcpu, arch.host_spefscr));
 #endif
 
+#ifdef CONFIG_KVM_BOOKE_HV
+	DEFINE(VCPU_HOST_MAS4, offsetof(struct kvm_vcpu, arch.host_mas4));
+	DEFINE(VCPU_HOST_MAS6, offsetof(struct kvm_vcpu, arch.host_mas6));
+	DEFINE(VCPU_EPLC, offsetof(struct kvm_vcpu, arch.eplc));
+#endif
+
 #ifdef CONFIG_KVM_EXIT_TIMING
 	DEFINE(VCPU_TIMING_EXIT_TBU, offsetof(struct kvm_vcpu,
 						arch.timing_exit.tv32.tbu));
diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
index 51fd072..5f051ee 100644
--- a/arch/powerpc/kernel/head_booke.h
+++ b/arch/powerpc/kernel/head_booke.h
@@ -3,6 +3,7 @@
 
 #include <asm/ptrace.h>	/* for STACK_FRAME_REGS_MARKER */
 #include <asm/kvm_asm.h>
+#include <asm/kvm_booke_hv_asm.h>
 
 /*
  * Macros used for common Book-e exception handling
@@ -36,8 +37,9 @@
 	stw	r11, THREAD_NORMSAVE(0)(r10);				     \
 	stw	r13, THREAD_NORMSAVE(2)(r10);				     \
 	mfcr	r13;			/* save CR in r13 for now	   */\
-	mfspr	r11,SPRN_SRR1;		/* check whether user or kernel    */\
-	andi.	r11,r11,MSR_PR;						     \
+	mfspr	r11, SPRN_SRR1;		                                     \
+	DO_KVM	BOOKE_INTERRUPT_##intno SPRN_SRR1;			     \
+	andi.	r11, r11, MSR_PR;	/* check whether user or kernel    */\
 	mr	r11, r1;						     \
 	beq	1f;							     \
 	/* if from user, start at top of this thread's kernel stack */       \
@@ -123,8 +125,9 @@
 	stw	r10,GPR10(r8);						     \
 	stw	r11,GPR11(r8);						     \
 	stw	r9,_CCR(r8);		/* save CR on stack		   */\
-	mfspr	r10,exc_level_srr1;	/* check whether user or kernel    */\
-	andi.	r10,r10,MSR_PR;						     \
+	mfspr	r11,exc_level_srr1;	/* check whether user or kernel    */\
+	DO_KVM	BOOKE_INTERRUPT_##intno exc_level_srr1;		             \
+	andi.	r11,r11,MSR_PR;						     \
 	mfspr	r11,SPRN_SPRG_THREAD;	/* if from user, start at top of   */\
 	lwz	r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\
 	addi	r11,r11,EXC_LVL_FRAME_OVERHEAD;	/* allocate stack frame    */\
@@ -173,6 +176,23 @@
 			SPRN_MCSRR0, SPRN_MCSRR1)
 
 /*
+ * Guest Doorbell -- this is a bit odd in that uses GSRR0/1 despite
+ * being delivered to the host.  This exception can only happen
+ * inside a KVM guest -- so we just handle up to the DO_KVM rather
+ * than try to fit this into one of the existing prolog macros.
+ */
+#define GUEST_DOORBELL_EXCEPTION \
+	START_EXCEPTION(GuestDoorbell);					     \
+	mtspr	SPRN_SPRG_WSCRATCH0, r10;	/* save one register */	     \
+	mfspr	r10, SPRN_SPRG_THREAD;					     \
+	stw	r11, THREAD_NORMSAVE(0)(r10);				     \
+	mfspr	r11, SPRN_SRR1;		                                     \
+	stw	r13, THREAD_NORMSAVE(2)(r10);				     \
+	mfcr	r13;			/* save CR in r13 for now	   */\
+	DO_KVM	BOOKE_INTERRUPT_GUEST_DBELL SPRN_GSRR1;			     \
+	trap
+
+/*
  * Exception vectors.
  */
 #define	START_EXCEPTION(label)						     \
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 8f64709..2c33cd3 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -90,6 +90,9 @@ config KVM_BOOK3S_64_PR
 	depends on KVM_BOOK3S_64 && !KVM_BOOK3S_64_HV
 	select KVM_BOOK3S_PR
 
+config KVM_BOOKE_HV
+	bool
+
 config KVM_440
 	bool "KVM support for PowerPC 440 processors"
 	depends on EXPERIMENTAL && 44x
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 2ee9bae..75dbaeb 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -17,6 +17,8 @@
  *
  * Authors: Hollis Blanchard <hollisb@us.ibm.com>
  *          Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+ *          Scott Wood <scottwood@freescale.com>
+ *          Varun Sethi <varun.sethi@freescale.com>
  */
 
 #include <linux/errno.h>
@@ -30,9 +32,12 @@
 #include <asm/cputable.h>
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
-#include "timing.h"
 #include <asm/cacheflush.h>
+#include <asm/dbell.h>
+#include <asm/hw_irq.h>
+#include <asm/irq.h>
 
+#include "timing.h"
 #include "booke.h"
 
 unsigned long kvmppc_booke_handlers;
@@ -55,6 +60,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "dec",        VCPU_STAT(dec_exits) },
 	{ "ext_intr",   VCPU_STAT(ext_intr_exits) },
 	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
+	{ "doorbell", VCPU_STAT(dbell_exits) },
+	{ "guest doorbell", VCPU_STAT(gdbell_exits) },
 	{ NULL }
 };
 
@@ -121,6 +128,10 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
 {
 	u32 old_msr = vcpu->arch.shared->msr;
 
+#ifdef CONFIG_KVM_BOOKE_HV
+	new_msr |= MSR_GS;
+#endif
+
 	vcpu->arch.shared->msr = new_msr;
 
 	kvmppc_mmu_msr_notify(vcpu, old_msr);
@@ -195,6 +206,75 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
 	clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
 }
 
+static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
+{
+#ifdef CONFIG_KVM_BOOKE_HV
+	mtspr(SPRN_GSRR0, srr0);
+	mtspr(SPRN_GSRR1, srr1);
+#else
+	vcpu->arch.shared->srr0 = srr0;
+	vcpu->arch.shared->srr1 = srr1;
+#endif
+}
+
+static void set_guest_csrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
+{
+	vcpu->arch.csrr0 = srr0;
+	vcpu->arch.csrr1 = srr1;
+}
+
+static void set_guest_dsrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
+{
+	if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC)) {
+		vcpu->arch.dsrr0 = srr0;
+		vcpu->arch.dsrr1 = srr1;
+	} else {
+		set_guest_csrr(vcpu, srr0, srr1);
+	}
+}
+
+static void set_guest_mcsrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
+{
+	vcpu->arch.mcsrr0 = srr0;
+	vcpu->arch.mcsrr1 = srr1;
+}
+
+static unsigned long get_guest_dear(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_KVM_BOOKE_HV
+	return mfspr(SPRN_GDEAR);
+#else
+	return vcpu->arch.shared->dar;
+#endif
+}
+
+static void set_guest_dear(struct kvm_vcpu *vcpu, unsigned long dear)
+{
+#ifdef CONFIG_KVM_BOOKE_HV
+	mtspr(SPRN_GDEAR, dear);
+#else
+	vcpu->arch.shared->dar = dear;
+#endif
+}
+
+static unsigned long get_guest_esr(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_KVM_BOOKE_HV
+	return mfspr(SPRN_GESR);
+#else
+	return vcpu->arch.shared->esr;
+#endif
+}
+
+static void set_guest_esr(struct kvm_vcpu *vcpu, u32 esr)
+{
+#ifdef CONFIG_KVM_BOOKE_HV
+	mtspr(SPRN_GESR, esr);
+#else
+	vcpu->arch.shared->esr = esr;
+#endif
+}
+
 /* Deliver the interrupt of the corresponding priority, if possible. */
 static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
                                         unsigned int priority)
@@ -206,6 +286,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
 	bool crit;
 	bool keep_irq = false;
+	enum int_class int_class;
 
 	/* Truncate crit indicators in 32 bit mode */
 	if (!(vcpu->arch.shared->msr & MSR_SF)) {
@@ -241,16 +322,20 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	case BOOKE_IRQPRIO_AP_UNAVAIL:
 	case BOOKE_IRQPRIO_ALIGNMENT:
 		allowed = 1;
-		msr_mask = MSR_CE|MSR_ME|MSR_DE;
+		msr_mask = MSR_GS | MSR_CE | MSR_ME | MSR_DE;
+		int_class = INT_CLASS_NONCRIT;
 		break;
 	case BOOKE_IRQPRIO_CRITICAL:
-	case BOOKE_IRQPRIO_WATCHDOG:
 		allowed = vcpu->arch.shared->msr & MSR_CE;
-		msr_mask = MSR_ME;
+		allowed = allowed && !crit;
+		msr_mask = MSR_GS | MSR_ME;
+		int_class = INT_CLASS_CRIT;
 		break;
 	case BOOKE_IRQPRIO_MACHINE_CHECK:
 		allowed = vcpu->arch.shared->msr & MSR_ME;
-		msr_mask = 0;
+		allowed = allowed && !crit;
+		msr_mask = MSR_GS;
+		int_class = INT_CLASS_MC;
 		break;
 	case BOOKE_IRQPRIO_DECREMENTER:
 	case BOOKE_IRQPRIO_FIT:
@@ -259,28 +344,62 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	case BOOKE_IRQPRIO_EXTERNAL:
 		allowed = vcpu->arch.shared->msr & MSR_EE;
 		allowed = allowed && !crit;
-		msr_mask = MSR_CE|MSR_ME|MSR_DE;
+		msr_mask = MSR_GS | MSR_CE | MSR_ME | MSR_DE;
+		int_class = INT_CLASS_NONCRIT;
 		break;
 	case BOOKE_IRQPRIO_DEBUG:
 		allowed = vcpu->arch.shared->msr & MSR_DE;
-		msr_mask = MSR_ME;
+		allowed = allowed && !crit;
+		msr_mask = MSR_GS | MSR_ME;
+		int_class = INT_CLASS_CRIT;
 		break;
 	}
 
 	if (allowed) {
-		vcpu->arch.shared->srr0 = vcpu->arch.pc;
-		vcpu->arch.shared->srr1 = vcpu->arch.shared->msr;
+		switch (int_class) {
+		case INT_CLASS_NONCRIT:
+			set_guest_srr(vcpu, vcpu->arch.pc,
+				      vcpu->arch.shared->msr);
+			break;
+		case INT_CLASS_CRIT:
+			set_guest_csrr(vcpu, vcpu->arch.pc,
+				       vcpu->arch.shared->msr);
+			break;
+		case INT_CLASS_DBG:
+			set_guest_dsrr(vcpu, vcpu->arch.pc,
+				       vcpu->arch.shared->msr);
+			break;
+		case INT_CLASS_MC:
+			set_guest_mcsrr(vcpu, vcpu->arch.pc,
+					vcpu->arch.shared->msr);
+			break;
+		}
+
 		vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority];
 		if (update_esr == true)
-			vcpu->arch.shared->esr = vcpu->arch.queued_esr;
+			set_guest_esr(vcpu, vcpu->arch.queued_esr);
 		if (update_dear == true)
-			vcpu->arch.shared->dar = vcpu->arch.queued_dear;
+			set_guest_dear(vcpu, vcpu->arch.queued_dear);
 		kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask);
 
 		if (!keep_irq)
 			clear_bit(priority, &vcpu->arch.pending_exceptions);
 	}
 
+#ifdef CONFIG_KVM_BOOKE_HV
+	/*
+	 * If an interrupt is pending but masked, raise a guest doorbell
+	 * so that we are notified when the guest enables the relevant
+	 * MSR bit.
+	 */
+	if (vcpu->arch.pending_exceptions & BOOKE_IRQMASK_EE)
+		kvmppc_set_pending_interrupt(vcpu, INT_CLASS_NONCRIT);
+	if (vcpu->arch.pending_exceptions & BOOKE_IRQMASK_CE)
+		kvmppc_set_pending_interrupt(vcpu, INT_CLASS_CRIT);
+	if (vcpu->arch.pending_exceptions & BOOKE_IRQPRIO_MACHINE_CHECK)
+		kvmppc_set_pending_interrupt(vcpu, INT_CLASS_MC);
+#endif
+
 	return allowed;
 }
 
@@ -344,6 +463,11 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		return -EINVAL;
 	}
 
+	if (!current->thread.kvm_vcpu) {
+		WARN(1, "no vcpu\n");
+		return -EPERM;
+	}
+
 	local_irq_disable();
 
 	kvmppc_core_prepare_to_enter(vcpu);
@@ -363,6 +487,38 @@ out:
 	return ret;
 }
 
+static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	enum emulation_result er;
+
+	er = kvmppc_emulate_instruction(run, vcpu);
+	switch (er) {
+	case EMULATE_DONE:
+		/* don't overwrite subtypes, just account kvm_stats */
+		kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS);
+		/* Future optimization: only reload non-volatiles if
+		 * they were actually modified by emulation. */
+		return RESUME_GUEST_NV;
+
+	case EMULATE_DO_DCR:
+		run->exit_reason = KVM_EXIT_DCR;
+		return RESUME_HOST;
+
+	case EMULATE_FAIL:
+		/* XXX Deliver Program interrupt to guest. */
+		printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
+		       __func__, vcpu->arch.pc, vcpu->arch.last_inst);
+		/* For debugging, encode the failing instruction and
+		 * report it to userspace. */
+		run->hw.hardware_exit_reason = ~0ULL << 32;
+		run->hw.hardware_exit_reason |= vcpu->arch.last_inst;
+		return RESUME_HOST;
+
+	default:
+		BUG();
+	}
+}
+
 /**
  * kvmppc_handle_exit
  *
@@ -371,12 +527,30 @@ out:
 int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                        unsigned int exit_nr)
 {
-	enum emulation_result er;
 	int r = RESUME_HOST;
 
 	/* update before a new last_exit_type is rewritten */
 	kvmppc_update_timing_stats(vcpu);
 
+	switch (exit_nr) {
+	case BOOKE_INTERRUPT_EXTERNAL:
+		do_IRQ(current->thread.regs);
+		break;
+
+	case BOOKE_INTERRUPT_DECREMENTER:
+		timer_interrupt(current->thread.regs);
+		break;
+
+#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3E_64)
+	case BOOKE_INTERRUPT_DOORBELL:
+		doorbell_exception(current->thread.regs);
+		break;
+#endif
+	case BOOKE_INTERRUPT_MACHINE_CHECK:
+		/* FIXME */
+		break;
+	}
+
 	local_irq_enable();
 
 	run->exit_reason = KVM_EXIT_UNKNOWN;
@@ -384,30 +558,56 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	switch (exit_nr) {
 	case BOOKE_INTERRUPT_MACHINE_CHECK:
-		printk("MACHINE CHECK: %lx\n", mfspr(SPRN_MCSR));
-		kvmppc_dump_vcpu(vcpu);
-		r = RESUME_HOST;
+		kvm_resched(vcpu);
+		r = RESUME_GUEST;
 		break;
 
 	case BOOKE_INTERRUPT_EXTERNAL:
 		kvmppc_account_exit(vcpu, EXT_INTR_EXITS);
-		if (need_resched())
-			cond_resched();
+		kvm_resched(vcpu);
 		r = RESUME_GUEST;
 		break;
 
 	case BOOKE_INTERRUPT_DECREMENTER:
-		/* Since we switched IVPR back to the host's value, the host
-		 * handled this interrupt the moment we enabled interrupts.
-		 * Now we just offer it a chance to reschedule the guest. */
 		kvmppc_account_exit(vcpu, DEC_EXITS);
-		if (need_resched())
-			cond_resched();
+		kvm_resched(vcpu);
 		r = RESUME_GUEST;
 		break;
 
+	case BOOKE_INTERRUPT_DOORBELL:
+		kvmppc_account_exit(vcpu, DBELL_EXITS);
+		kvm_resched(vcpu);
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_GUEST_DBELL_CRIT:
+		kvmppc_account_exit(vcpu, GDBELL_EXITS);
+
+		/*
+		 * We are here because there is a pending guest interrupt
+		 * which could not be delivered as MSR_CE or MSR_ME was not
+		 * set.  Once we break from here we will retry delivery.
+		 */
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_GUEST_DBELL:
+		kvmppc_account_exit(vcpu, GDBELL_EXITS);
+
+		/*
+		 * We are here because there is a pending guest interrupt
+		 * which could not be delivered as MSR_EE was not set.  Once
+		 * we break from here we will retry delivery.
+		 */
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_HV_PRIV:
+		r = emulation_exit(run, vcpu);
+		break;
+
 	case BOOKE_INTERRUPT_PROGRAM:
-		if (vcpu->arch.shared->msr & MSR_PR) {
+		if (vcpu->arch.shared->msr & (MSR_PR | MSR_GS)) {
 			/* Program traps generated by user-level software must be handled
 			 * by the guest kernel. */
 			kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr);
@@ -416,32 +616,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			break;
 		}
 
-		er = kvmppc_emulate_instruction(run, vcpu);
-		switch (er) {
-		case EMULATE_DONE:
-			/* don't overwrite subtypes, just account kvm_stats */
-			kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS);
-			/* Future optimization: only reload non-volatiles if
-			 * they were actually modified by emulation. */
-			r = RESUME_GUEST_NV;
-			break;
-		case EMULATE_DO_DCR:
-			run->exit_reason = KVM_EXIT_DCR;
-			r = RESUME_HOST;
-			break;
-		case EMULATE_FAIL:
-			/* XXX Deliver Program interrupt to guest. */
-			printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
-			       __func__, vcpu->arch.pc, vcpu->arch.last_inst);
-			/* For debugging, encode the failing instruction and
-			 * report it to userspace. */
-			run->hw.hardware_exit_reason = ~0ULL << 32;
-			run->hw.hardware_exit_reason |= vcpu->arch.last_inst;
-			r = RESUME_HOST;
-			break;
-		default:
-			BUG();
-		}
+		r = emulation_exit(run, vcpu);
 		break;
 
 	case BOOKE_INTERRUPT_FP_UNAVAIL:
@@ -506,6 +681,21 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		r = RESUME_GUEST;
 		break;
 
+#ifdef CONFIG_KVM_BOOKE_HV
+	case BOOKE_INTERRUPT_HV_SYSCALL:
+		if (!(vcpu->arch.shared->msr & MSR_PR)) {
+			kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
+		} else {
+			/*
+			 * hcall from guest userspace -- send privileged
+			 * instruction program check.
+			 */
+			kvmppc_core_queue_program(vcpu, ESR_PPR);
+		}
+
+		r = RESUME_GUEST;
+		break;
+#else
 	case BOOKE_INTERRUPT_SYSCALL:
 		if (!(vcpu->arch.shared->msr & MSR_PR) &&
 		    (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
@@ -519,6 +709,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		kvmppc_account_exit(vcpu, SYSCALL_EXITS);
 		r = RESUME_GUEST;
 		break;
+#endif
 
 	case BOOKE_INTERRUPT_DTLB_MISS: {
 		unsigned long eaddr = vcpu->arch.fault_dear;
@@ -659,12 +850,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	int r;
 
 	vcpu->arch.pc = 0;
-	vcpu->arch.shared->msr = 0;
-	vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS;
 	vcpu->arch.shared->pir = vcpu->vcpu_id;
 	kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */
+	kvmppc_set_msr(vcpu, 0);
 
+#ifndef CONFIG_KVM_BOOKE_HV
+	vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS;
 	vcpu->arch.shadow_pid = 1;
+	vcpu->arch.shared->msr = 0;
+#endif
 
 	/* Eye-catching numbers so we know if the guest takes an interrupt
 	 * before it's programmed its own IVPR/IVORs. */
@@ -745,8 +939,8 @@ static void get_sregs_base(struct kvm_vcpu *vcpu,
 	sregs->u.e.csrr0 = vcpu->arch.csrr0;
 	sregs->u.e.csrr1 = vcpu->arch.csrr1;
 	sregs->u.e.mcsr = vcpu->arch.mcsr;
-	sregs->u.e.esr = vcpu->arch.shared->esr;
-	sregs->u.e.dear = vcpu->arch.shared->dar;
+	sregs->u.e.esr = get_guest_esr(vcpu);
+	sregs->u.e.dear = get_guest_dear(vcpu);
 	sregs->u.e.tsr = vcpu->arch.tsr;
 	sregs->u.e.tcr = vcpu->arch.tcr;
 	sregs->u.e.dec = kvmppc_get_dec(vcpu, tb);
@@ -763,8 +957,8 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
 	vcpu->arch.csrr0 = sregs->u.e.csrr0;
 	vcpu->arch.csrr1 = sregs->u.e.csrr1;
 	vcpu->arch.mcsr = sregs->u.e.mcsr;
-	vcpu->arch.shared->esr = sregs->u.e.esr;
-	vcpu->arch.shared->dar = sregs->u.e.dear;
+	set_guest_esr(vcpu, sregs->u.e.esr);
+	set_guest_dear(vcpu, sregs->u.e.dear);
 	vcpu->arch.vrsave = sregs->u.e.vrsave;
 	kvmppc_set_tcr(vcpu, sregs->u.e.tcr);
 
@@ -961,14 +1155,17 @@ void kvmppc_decrementer_func(unsigned long data)
 
 void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+	current->thread.kvm_vcpu = vcpu;
 }
 
 void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu)
 {
+	current->thread.kvm_vcpu = NULL;
 }
 
 int __init kvmppc_booke_init(void)
 {
+#ifndef CONFIG_KVM_BOOKE_HV
 	unsigned long ivor[16];
 	unsigned long max_ivor = 0;
 	int i;
@@ -1011,7 +1208,7 @@ int __init kvmppc_booke_init(void)
 	}
 	flush_icache_range(kvmppc_booke_handlers,
 	                   kvmppc_booke_handlers + max_ivor + kvmppc_handler_len);
-
+#endif /* !BOOKE_HV */
 	return 0;
 }
 
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index 05d1d99..d53bcf2 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -48,7 +48,20 @@
 #define BOOKE_IRQPRIO_PERFORMANCE_MONITOR 19
 /* Internal pseudo-irqprio for level triggered externals */
 #define BOOKE_IRQPRIO_EXTERNAL_LEVEL 20
-#define BOOKE_IRQPRIO_MAX 20
+#define BOOKE_IRQPRIO_DBELL 21
+#define BOOKE_IRQPRIO_DBELL_CRIT 22
+#define BOOKE_IRQPRIO_MAX 23
+
+#define BOOKE_IRQMASK_EE ((1 << BOOKE_IRQPRIO_EXTERNAL_LEVEL) | \
+			  (1 << BOOKE_IRQPRIO_PERFORMANCE_MONITOR) | \
+			  (1 << BOOKE_IRQPRIO_DBELL) | \
+			  (1 << BOOKE_IRQPRIO_DECREMENTER) | \
+			  (1 << BOOKE_IRQPRIO_FIT) | \
+			  (1 << BOOKE_IRQPRIO_EXTERNAL))
+
+#define BOOKE_IRQMASK_CE ((1 << BOOKE_IRQPRIO_DBELL_CRIT) | \
+			  (1 << BOOKE_IRQPRIO_WATCHDOG) | \
+			  (1 << BOOKE_IRQPRIO_CRITICAL))
 
 extern unsigned long kvmppc_booke_handlers;
 
@@ -74,4 +87,13 @@ void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu);
 void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu);
 
+enum int_class {
+	INT_CLASS_NONCRIT,
+	INT_CLASS_CRIT,
+	INT_CLASS_MC,
+	INT_CLASS_DBG,
+};
+
+void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type);
+
 #endif /* __KVM_BOOKE_H__ */
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 3e652da..904412b 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -99,6 +99,12 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	return emulated;
 }
 
+/*
+ * NOTE: some of these registers are not emulated on BOOKE_HV (GS-mode).
+ * Their backing store is in real registers, and these functions
+ * will return the wrong result if called for them in another context
+ * (such as debugging).
+ */
 int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 {
 	int emulated = EMULATE_DONE;
@@ -122,9 +128,11 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 		kvmppc_set_tcr(vcpu, spr_val);
 		break;
 
-	/* Note: SPRG4-7 are user-readable. These values are
-	 * loaded into the real SPRGs when resuming the
-	 * guest. */
+	/*
+	 * Note: SPRG4-7 are user-readable.
+	 * These values are loaded into the real SPRGs when resuming the
+	 * guest (PR-mode only).
+	 */
 	case SPRN_SPRG4:
 		vcpu->arch.shared->sprg4 = spr_val; break;
 	case SPRN_SPRG5:
@@ -136,6 +144,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 
 	case SPRN_IVPR:
 		vcpu->arch.ivpr = spr_val;
+#ifdef CONFIG_KVM_BOOKE_HV
+		mtspr(SPRN_GIVPR, spr_val);
+#endif
 		break;
 	case SPRN_IVOR0:
 		vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = spr_val;
@@ -145,6 +156,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 		break;
 	case SPRN_IVOR2:
 		vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = spr_val;
+#ifdef CONFIG_KVM_BOOKE_HV
+		mtspr(SPRN_GIVOR2, spr_val);
+#endif
 		break;
 	case SPRN_IVOR3:
 		vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = spr_val;
@@ -163,6 +177,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 		break;
 	case SPRN_IVOR8:
 		vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = spr_val;
+#ifdef CONFIG_KVM_BOOKE_HV
+		mtspr(SPRN_GIVOR8, spr_val);
+#endif
 		break;
 	case SPRN_IVOR9:
 		vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = spr_val;
diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
new file mode 100644
index 0000000..9eaeebd
--- /dev/null
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -0,0 +1,587 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) 2010-2011 Freescale Semiconductor, Inc.
+ *
+ * Author: Varun Sethi <varun.sethi@freescale.com>
+ * Author: Scott Wood <scotwood@freescale.com>
+ *
+ * This file is derived from arch/powerpc/kvm/booke_interrupts.S
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/mmu-44x.h>
+#include <asm/page.h>
+#include <asm/asm-compat.h>
+#include <asm/asm-offsets.h>
+#include <asm/bitsperlong.h>
+
+#include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */
+
+#define GET_VCPU(vcpu, thread)	\
+	PPC_LL	vcpu, THREAD_KVM_VCPU(thread)
+
+#define SET_VCPU(vcpu)		\
+        PPC_STL	vcpu, (THREAD + THREAD_KVM_VCPU)(r2)
+
+#define LONGBYTES		(BITS_PER_LONG / 8)
+
+#define VCPU_GPR(n)     	(VCPU_GPRS + (n * LONGBYTES))
+#define VCPU_GUEST_SPRG(n)	(VCPU_GUEST_SPRGS + (n * LONGBYTES))
+
+/* The host stack layout: */
+#define HOST_R1         (0 * LONGBYTES) /* Implied by stwu. */
+#define HOST_CALLEE_LR  (1 * LONGBYTES)
+#define HOST_RUN        (2 * LONGBYTES) /* struct kvm_run */
+/*
+ * r2 is special: it holds 'current', and it made nonvolatile in the
+ * kernel with the -ffixed-r2 gcc option.
+ */
+#define HOST_R2         (3 * LONGBYTES)
+#define HOST_NV_GPRS    (4 * LONGBYTES)
+#define HOST_NV_GPR(n)  (HOST_NV_GPRS + ((n - 14) * LONGBYTES))
+#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(31) + LONGBYTES)
+#define HOST_STACK_SIZE ((HOST_MIN_STACK_SIZE + 15) & ~15) /* Align. */
+#define HOST_STACK_LR   (HOST_STACK_SIZE + LONGBYTES) /* In caller stack frame. */
+
+#define NEED_EMU		0x00000001 /* emulation -- save nv regs */
+#define NEED_DEAR		0x00000002 /* save faulting DEAR */
+#define NEED_ESR		0x00000004 /* save faulting ESR */
+
+/*
+ * On entry:
+ * r4 = vcpu, r5 = srr0, r6 = srr1
+ * saved in vcpu: cr, ctr, r3-r13
+ */
+.macro kvm_handler_common intno, srr0, flags
+	mfspr	r10, SPRN_PID
+	lwz	r8, VCPU_HOST_PID(r4)
+	PPC_LL	r11, VCPU_SHARED(r4)
+	PPC_STL	r14, VCPU_GPR(r14)(r4) /* We need a non-volatile GPR. */
+	li	r14, \intno
+
+	stw	r10, VCPU_GUEST_PID(r4)
+	mtspr	SPRN_PID, r8
+
+	.if	\flags & NEED_EMU
+	lwz	r9, VCPU_KVM(r4)
+	.endif
+
+#ifdef CONFIG_KVM_EXIT_TIMING
+	/* save exit time */
+1:	mfspr	r7, SPRN_TBRU
+	mfspr	r8, SPRN_TBRL
+	mfspr	r9, SPRN_TBRU
+	cmpw	r9, r7
+	PPC_STL	r8, VCPU_TIMING_EXIT_TBL(r4)
+	bne-	1b
+	PPC_STL	r9, VCPU_TIMING_EXIT_TBU(r4)
+#endif
+
+	oris	r8, r6, MSR_CE@h
+#ifndef CONFIG_64BIT
+	stw	r6, (VCPU_SHARED_MSR + 4)(r11)
+#else
+	std	r6, (VCPU_SHARED_MSR)(r11)
+#endif
+	ori	r8, r8, MSR_ME | MSR_RI
+	PPC_STL	r5, VCPU_PC(r4)
+
+	/*
+	 * Make sure CE/ME/RI are set (if appropriate for exception type)
+	 * whether or not the guest had it set.  Since mfmsr/mtmsr are
+	 * somewhat expensive, skip in the common case where the guest
+	 * had all these bits set (and thus they're still set if
+	 * appropriate for the exception type).
+	 */
+	cmpw	r6, r8
+	.if	\flags & NEED_EMU
+	lwz	r9, KVM_LPID(r9)
+	.endif
+	beq	1f
+	mfmsr	r7
+	.if	\srr0 != SPRN_MCSRR0 && \srr0 != SPRN_CSRR0
+	oris	r7, r7, MSR_CE@h
+	.endif
+	.if	\srr0 != SPRN_MCSRR0
+	ori	r7, r7, MSR_ME | MSR_RI
+	.endif
+	mtmsr	r7
+1:
+
+	.if	\flags & NEED_EMU
+	/*
+	 * This assumes you have external PID support.
+	 * To support a bookehv CPU without external PID, you'll
+	 * need to look up the TLB entry and create a temporary mapping.
+	 *
+	 * FIXME: we don't currently handle if the lwepx faults.  PR-mode
+	 * booke doesn't handle it either.  Since Linux doesn't use
+	 * broadcast tlbivax anymore, the only way this should happen is
+	 * if the guest maps its memory execute-but-not-read, or if we
+	 * somehow take a TLB miss in the middle of this entry code and
+	 * evict the relevant entry.  On e500mc, all kernel lowmem is
+	 * bolted into TLB1 large page mappings, and we don't use
+	 * broadcast invalidates, so we should not take a TLB miss here.
+	 *
+	 * Later we'll need to deal with faults here.  Disallowing guest
+	 * mappings that are execute-but-not-read could be an option on
+	 * e500mc, but not on chips with an LRAT if it is used.
+	 */
+
+	mfspr	r3, SPRN_EPLC	/* will already have correct ELPID and EGS */
+	PPC_STL	r15, VCPU_GPR(r15)(r4)
+	PPC_STL	r16, VCPU_GPR(r16)(r4)
+	PPC_STL	r17, VCPU_GPR(r17)(r4)
+	PPC_STL	r18, VCPU_GPR(r18)(r4)
+	PPC_STL	r19, VCPU_GPR(r19)(r4)
+	mr	r8, r3
+	PPC_STL	r20, VCPU_GPR(r20)(r4)
+	rlwimi	r8, r6, EPC_EAS_SHIFT - MSR_IR_LG, EPC_EAS
+	PPC_STL	r21, VCPU_GPR(r21)(r4)
+	rlwimi	r8, r6, EPC_EPR_SHIFT - MSR_PR_LG, EPC_EPR
+	PPC_STL	r22, VCPU_GPR(r22)(r4)
+	rlwimi	r8, r10, EPC_EPID_SHIFT, EPC_EPID
+	PPC_STL	r23, VCPU_GPR(r23)(r4)
+	PPC_STL	r24, VCPU_GPR(r24)(r4)
+	PPC_STL	r25, VCPU_GPR(r25)(r4)
+	PPC_STL	r26, VCPU_GPR(r26)(r4)
+	PPC_STL	r27, VCPU_GPR(r27)(r4)
+	PPC_STL	r28, VCPU_GPR(r28)(r4)
+	PPC_STL	r29, VCPU_GPR(r29)(r4)
+	PPC_STL	r30, VCPU_GPR(r30)(r4)
+	PPC_STL	r31, VCPU_GPR(r31)(r4)
+	mtspr	SPRN_EPLC, r8
+	isync
+	lwepx	r9, 0, r5
+	mtspr	SPRN_EPLC, r3
+	stw	r9, VCPU_LAST_INST(r4)
+	.endif
+
+	.if	\flags & NEED_ESR
+	mfspr	r8, SPRN_ESR
+	PPC_STL	r8, VCPU_FAULT_ESR(r4)
+	.endif
+
+	.if	\flags & NEED_DEAR
+	mfspr	r9, SPRN_DEAR
+	PPC_STL	r9, VCPU_FAULT_DEAR(r4)
+	.endif
+
+	b	kvmppc_resume_host
+.endm
+
+/*
+ * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h
+ */
+.macro kvm_handler intno srr0, srr1, flags
+_GLOBAL(kvmppc_handler_\intno\()_\srr1)
+	GET_VCPU(r11, r10)
+	PPC_STL r3, VCPU_GPR(r3)(r11)
+	mfspr	r3, SPRN_SPRG_RSCRATCH0
+	PPC_STL	r4, VCPU_GPR(r4)(r11)
+	PPC_LL	r4, THREAD_NORMSAVE(0)(r10)
+	PPC_STL	r5, VCPU_GPR(r5)(r11)
+	PPC_STL	r13, VCPU_CR(r11)
+	mfspr	r5, \srr0
+	PPC_STL	r3, VCPU_GPR(r10)(r11)
+	PPC_LL	r3, THREAD_NORMSAVE(2)(r10)
+	PPC_STL	r6, VCPU_GPR(r6)(r11)
+	PPC_STL	r4, VCPU_GPR(r11)(r11)
+	mfspr	r6, \srr1
+	PPC_STL	r7, VCPU_GPR(r7)(r11)
+	PPC_STL	r8, VCPU_GPR(r8)(r11)
+	PPC_STL	r9, VCPU_GPR(r9)(r11)
+	PPC_STL r3, VCPU_GPR(r13)(r11)
+	mfctr	r7
+	PPC_STL	r12, VCPU_GPR(r12)(r11)
+	PPC_STL	r7, VCPU_CTR(r11)
+	mr	r4, r11
+	kvm_handler_common \intno, \srr0, \flags
+.endm
+
+.macro kvm_lvl_handler intno scratch srr0, srr1, flags
+_GLOBAL(kvmppc_handler_\intno\()_\srr1)
+	mfspr	r10, SPRN_SPRG_THREAD
+	GET_VCPU(r11, r10)
+	PPC_STL r3, VCPU_GPR(r3)(r11)
+	mfspr	r3, \scratch
+	PPC_STL	r4, VCPU_GPR(r4)(r11)
+	PPC_LL	r4, GPR9(r8)
+	PPC_STL	r5, VCPU_GPR(r5)(r11)
+	PPC_STL	r9, VCPU_CR(r11)
+	mfspr	r5, \srr0
+	PPC_STL	r3, VCPU_GPR(r8)(r11)
+	PPC_LL	r3, GPR10(r8)
+	PPC_STL	r6, VCPU_GPR(r6)(r11)
+	PPC_STL	r4, VCPU_GPR(r9)(r11)
+	mfspr	r6, \srr1
+	PPC_LL	r4, GPR11(r8)
+	PPC_STL	r7, VCPU_GPR(r7)(r11)
+	PPC_STL	r8, VCPU_GPR(r8)(r11)
+	PPC_STL r3, VCPU_GPR(r10)(r11)
+	mfctr	r7
+	PPC_STL	r12, VCPU_GPR(r12)(r11)
+	PPC_STL	r4, VCPU_GPR(r11)(r11)
+	PPC_STL	r7, VCPU_CTR(r11)
+	mr	r4, r11
+	kvm_handler_common \intno, \srr0, \flags
+.endm
+
+kvm_lvl_handler BOOKE_INTERRUPT_CRITICAL, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_MACHINE_CHECK, \
+	SPRN_SPRG_RSCRATCH_MC, SPRN_MCSRR0, SPRN_MCSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, \
+	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR)
+kvm_handler BOOKE_INTERRUPT_INST_STORAGE, SPRN_SRR0, SPRN_SRR1, NEED_ESR
+kvm_handler BOOKE_INTERRUPT_EXTERNAL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_ALIGNMENT, \
+	SPRN_SRR0, SPRN_SRR1, (NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_PROGRAM, SPRN_SRR0, SPRN_SRR1, NEED_ESR
+kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_SYSCALL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DECREMENTER, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_FIT, SPRN_SRR0, SPRN_SRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_WATCHDOG, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DTLB_MISS, \
+	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_ITLB_MISS, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_SPE_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_SPE_FP_DATA, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_SPE_FP_ROUND, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DOORBELL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_HV_PRIV, SPRN_SRR0, SPRN_SRR1, NEED_EMU
+kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, SPRN_GSRR0, SPRN_GSRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \
+	SPRN_SPRG_RSCRATCH_DBG, SPRN_DSRR0, SPRN_DSRR1, 0
+
+
+/* Registers:
+ *  SPRG_SCRATCH0: guest r10
+ *  r4: vcpu pointer
+ *  r11: vcpu->arch.shared
+ *  r14: KVM exit number
+ */
+_GLOBAL(kvmppc_resume_host)
+	/* Save remaining volatile guest register state to vcpu. */
+	mfspr	r3, SPRN_VRSAVE
+	PPC_STL	r0, VCPU_GPR(r0)(r4)
+	PPC_STL	r1, VCPU_GPR(r1)(r4)
+	mflr	r5
+	mfspr	r6, SPRN_SPRG4
+	PPC_STL	r2, VCPU_GPR(r2)(r4)
+	PPC_STL	r5, VCPU_LR(r4)
+	mfspr	r7, SPRN_SPRG5
+	PPC_STL	r3, VCPU_VRSAVE(r4)
+	PPC_STL	r6, VCPU_SHARED_SPRG4(r11)
+	mfspr	r8, SPRN_SPRG6
+	PPC_STL	r7, VCPU_SHARED_SPRG5(r11)
+	mfspr	r9, SPRN_SPRG7
+	PPC_STL	r8, VCPU_SHARED_SPRG6(r11)
+	mfxer	r3
+	PPC_STL	r9, VCPU_SHARED_SPRG7(r11)
+
+	/* save guest MAS registers and restore host mas4 & mas6 */
+	mfspr	r5, SPRN_MAS0
+	PPC_STL	r3, VCPU_XER(r4)
+	mfspr	r6, SPRN_MAS1
+	stw	r5, VCPU_SHARED_MAS0(r11)
+	mfspr	r7, SPRN_MAS2
+	stw	r6, VCPU_SHARED_MAS1(r11)
+#ifndef CONFIG_64BIT
+	stw	r7, (VCPU_SHARED_MAS2 + 4)(r11)
+#else
+	std	r7, (VCPU_SHARED_MAS2)(r11)
+#endif
+	mfspr	r5, SPRN_MAS3
+	mfspr	r6, SPRN_MAS4
+	stw	r5, VCPU_SHARED_MAS7_3+4(r11)
+	mfspr	r7, SPRN_MAS6
+	stw	r6, VCPU_SHARED_MAS4(r11)
+	mfspr	r5, SPRN_MAS7
+	lwz	r6, VCPU_HOST_MAS4(r4)
+	stw	r7, VCPU_SHARED_MAS6(r11)
+	lwz	r8, VCPU_HOST_MAS6(r4)
+	mtspr	SPRN_MAS4, r6
+	stw	r5, VCPU_SHARED_MAS7_3+0(r11)
+	mtspr	SPRN_MAS6, r8
+	mfspr	r3, SPRN_EPCR
+	rlwinm	r3, r3, 0, ~SPRN_EPCR_DMIUH
+	mtspr	SPRN_EPCR, r3
+	isync
+
+	/* Restore host stack pointer */
+	PPC_LL	r1, VCPU_HOST_STACK(r4)
+	PPC_LL	r2, HOST_R2(r1)
+
+	/* Switch to kernel stack and jump to handler. */
+	PPC_LL	r3, HOST_RUN(r1)
+	mr	r5, r14 /* intno */
+	mr	r14, r4 /* Save vcpu pointer. */
+	bl	kvmppc_handle_exit
+
+	/* Restore vcpu pointer and the nonvolatiles we used. */
+	mr	r4, r14
+	PPC_LL	r14, VCPU_GPR(r14)(r4)
+
+	andi.	r5, r3, RESUME_FLAG_NV
+	beq	skip_nv_load
+	PPC_LL	r15, VCPU_GPR(r15)(r4)
+	PPC_LL	r16, VCPU_GPR(r16)(r4)
+	PPC_LL	r17, VCPU_GPR(r17)(r4)
+	PPC_LL	r18, VCPU_GPR(r18)(r4)
+	PPC_LL	r19, VCPU_GPR(r19)(r4)
+	PPC_LL	r20, VCPU_GPR(r20)(r4)
+	PPC_LL	r21, VCPU_GPR(r21)(r4)
+	PPC_LL	r22, VCPU_GPR(r22)(r4)
+	PPC_LL	r23, VCPU_GPR(r23)(r4)
+	PPC_LL	r24, VCPU_GPR(r24)(r4)
+	PPC_LL	r25, VCPU_GPR(r25)(r4)
+	PPC_LL	r26, VCPU_GPR(r26)(r4)
+	PPC_LL	r27, VCPU_GPR(r27)(r4)
+	PPC_LL	r28, VCPU_GPR(r28)(r4)
+	PPC_LL	r29, VCPU_GPR(r29)(r4)
+	PPC_LL	r30, VCPU_GPR(r30)(r4)
+	PPC_LL	r31, VCPU_GPR(r31)(r4)
+skip_nv_load:
+	/* Should we return to the guest? */
+	andi.	r5, r3, RESUME_FLAG_HOST
+	beq	lightweight_exit
+
+	srawi	r3, r3, 2 /* Shift -ERR back down. */
+
+heavyweight_exit:
+	/* Not returning to guest. */
+	PPC_LL	r5, HOST_STACK_LR(r1)
+
+	/*
+	 * We already saved guest volatile register state; now save the
+	 * non-volatiles.
+	 */
+
+	PPC_STL	r15, VCPU_GPR(r15)(r4)
+	PPC_STL	r16, VCPU_GPR(r16)(r4)
+	PPC_STL	r17, VCPU_GPR(r17)(r4)
+	PPC_STL	r18, VCPU_GPR(r18)(r4)
+	PPC_STL	r19, VCPU_GPR(r19)(r4)
+	PPC_STL	r20, VCPU_GPR(r20)(r4)
+	PPC_STL	r21, VCPU_GPR(r21)(r4)
+	PPC_STL	r22, VCPU_GPR(r22)(r4)
+	PPC_STL	r23, VCPU_GPR(r23)(r4)
+	PPC_STL	r24, VCPU_GPR(r24)(r4)
+	PPC_STL	r25, VCPU_GPR(r25)(r4)
+	PPC_STL	r26, VCPU_GPR(r26)(r4)
+	PPC_STL	r27, VCPU_GPR(r27)(r4)
+	PPC_STL	r28, VCPU_GPR(r28)(r4)
+	PPC_STL	r29, VCPU_GPR(r29)(r4)
+	PPC_STL	r30, VCPU_GPR(r30)(r4)
+	PPC_STL	r31, VCPU_GPR(r31)(r4)
+
+	/* Load host non-volatile register state from host stack. */
+	PPC_LL	r14, HOST_NV_GPR(r14)(r1)
+	PPC_LL	r15, HOST_NV_GPR(r15)(r1)
+	PPC_LL	r16, HOST_NV_GPR(r16)(r1)
+	PPC_LL	r17, HOST_NV_GPR(r17)(r1)
+	PPC_LL	r18, HOST_NV_GPR(r18)(r1)
+	PPC_LL	r19, HOST_NV_GPR(r19)(r1)
+	PPC_LL	r20, HOST_NV_GPR(r20)(r1)
+	PPC_LL	r21, HOST_NV_GPR(r21)(r1)
+	PPC_LL	r22, HOST_NV_GPR(r22)(r1)
+	PPC_LL	r23, HOST_NV_GPR(r23)(r1)
+	PPC_LL	r24, HOST_NV_GPR(r24)(r1)
+	PPC_LL	r25, HOST_NV_GPR(r25)(r1)
+	PPC_LL	r26, HOST_NV_GPR(r26)(r1)
+	PPC_LL	r27, HOST_NV_GPR(r27)(r1)
+	PPC_LL	r28, HOST_NV_GPR(r28)(r1)
+	PPC_LL	r29, HOST_NV_GPR(r29)(r1)
+	PPC_LL	r30, HOST_NV_GPR(r30)(r1)
+	PPC_LL	r31, HOST_NV_GPR(r31)(r1)
+
+	/* Return to kvm_vcpu_run(). */
+	mtlr	r5
+	addi	r1, r1, HOST_STACK_SIZE
+	/* r3 still contains the return code from kvmppc_handle_exit(). */
+	blr
+
+/* Registers:
+ *  r3: kvm_run pointer
+ *  r4: vcpu pointer
+ */
+_GLOBAL(__kvmppc_vcpu_run)
+	stwu	r1, -HOST_STACK_SIZE(r1)
+	PPC_STL	r1, VCPU_HOST_STACK(r4)	/* Save stack pointer to vcpu. */
+
+	/* Save host state to stack. */
+	PPC_STL	r3, HOST_RUN(r1)
+	mflr	r3
+	PPC_STL	r3, HOST_STACK_LR(r1)
+
+	/* Save host non-volatile register state to stack. */
+	PPC_STL	r14, HOST_NV_GPR(r14)(r1)
+	PPC_STL	r15, HOST_NV_GPR(r15)(r1)
+	PPC_STL	r16, HOST_NV_GPR(r16)(r1)
+	PPC_STL	r17, HOST_NV_GPR(r17)(r1)
+	PPC_STL	r18, HOST_NV_GPR(r18)(r1)
+	PPC_STL	r19, HOST_NV_GPR(r19)(r1)
+	PPC_STL	r20, HOST_NV_GPR(r20)(r1)
+	PPC_STL	r21, HOST_NV_GPR(r21)(r1)
+	PPC_STL	r22, HOST_NV_GPR(r22)(r1)
+	PPC_STL	r23, HOST_NV_GPR(r23)(r1)
+	PPC_STL	r24, HOST_NV_GPR(r24)(r1)
+	PPC_STL	r25, HOST_NV_GPR(r25)(r1)
+	PPC_STL	r26, HOST_NV_GPR(r26)(r1)
+	PPC_STL	r27, HOST_NV_GPR(r27)(r1)
+	PPC_STL	r28, HOST_NV_GPR(r28)(r1)
+	PPC_STL	r29, HOST_NV_GPR(r29)(r1)
+	PPC_STL	r30, HOST_NV_GPR(r30)(r1)
+	PPC_STL	r31, HOST_NV_GPR(r31)(r1)
+
+	/* Load guest non-volatiles. */
+	PPC_LL	r14, VCPU_GPR(r14)(r4)
+	PPC_LL	r15, VCPU_GPR(r15)(r4)
+	PPC_LL	r16, VCPU_GPR(r16)(r4)
+	PPC_LL	r17, VCPU_GPR(r17)(r4)
+	PPC_LL	r18, VCPU_GPR(r18)(r4)
+	PPC_LL	r19, VCPU_GPR(r19)(r4)
+	PPC_LL	r20, VCPU_GPR(r20)(r4)
+	PPC_LL	r21, VCPU_GPR(r21)(r4)
+	PPC_LL	r22, VCPU_GPR(r22)(r4)
+	PPC_LL	r23, VCPU_GPR(r23)(r4)
+	PPC_LL	r24, VCPU_GPR(r24)(r4)
+	PPC_LL	r25, VCPU_GPR(r25)(r4)
+	PPC_LL	r26, VCPU_GPR(r26)(r4)
+	PPC_LL	r27, VCPU_GPR(r27)(r4)
+	PPC_LL	r28, VCPU_GPR(r28)(r4)
+	PPC_LL	r29, VCPU_GPR(r29)(r4)
+	PPC_LL	r30, VCPU_GPR(r30)(r4)
+	PPC_LL	r31, VCPU_GPR(r31)(r4)
+
+
+lightweight_exit:
+	PPC_STL	r2, HOST_R2(r1)
+
+	mfspr	r3, SPRN_PID
+	stw	r3, VCPU_HOST_PID(r4)
+	lwz	r3, VCPU_GUEST_PID(r4)
+	mtspr	SPRN_PID, r3
+
+	/* Save vcpu pointer for the exception handlers
+	 * must be done before loading guest r2.
+	 */
+//	SET_VCPU(r4)
+
+	PPC_LL	r11, VCPU_SHARED(r4)
+	/* Save host mas4 and mas6 and load guest MAS registers */
+	mfspr	r3, SPRN_MAS4
+	stw	r3, VCPU_HOST_MAS4(r4)
+	mfspr	r3, SPRN_MAS6
+	stw	r3, VCPU_HOST_MAS6(r4)
+	lwz	r3, VCPU_SHARED_MAS0(r11)
+	lwz	r5, VCPU_SHARED_MAS1(r11)
+#ifndef CONFIG_64BIT
+	lwz	r6, (VCPU_SHARED_MAS2 + 4)(r11)
+#else
+	ld	r6, (VCPU_SHARED_MAS2)(r11)
+#endif
+	lwz	r7, VCPU_SHARED_MAS7_3+4(r11)
+	lwz	r8, VCPU_SHARED_MAS4(r11)
+	mtspr	SPRN_MAS0, r3
+	mtspr	SPRN_MAS1, r5
+	mtspr	SPRN_MAS2, r6
+	mtspr	SPRN_MAS3, r7
+	mtspr	SPRN_MAS4, r8
+	lwz	r3, VCPU_SHARED_MAS6(r11)
+	lwz	r5, VCPU_SHARED_MAS7_3+0(r11)
+	mtspr	SPRN_MAS6, r3
+	mtspr	SPRN_MAS7, r5
+	/* Disable MAS register updates via exception */
+	mfspr	r3, SPRN_EPCR
+	oris	r3, r3, SPRN_EPCR_DMIUH@h
+	mtspr	SPRN_EPCR, r3
+
+	/*
+	 * Host interrupt handlers may have clobbered these guest-readable
+	 * SPRGs, so we need to reload them here with the guest's values.
+	 */
+	lwz	r3, VCPU_VRSAVE(r4)
+	lwz	r5, VCPU_SHARED_SPRG4(r11)
+	mtspr	SPRN_VRSAVE, r3
+	lwz	r6, VCPU_SHARED_SPRG5(r11)
+	mtspr	SPRN_SPRG4W, r5
+	lwz	r7, VCPU_SHARED_SPRG6(r11)
+	mtspr	SPRN_SPRG5W, r6
+	lwz	r8, VCPU_SHARED_SPRG7(r11)
+	mtspr	SPRN_SPRG6W, r7
+	mtspr	SPRN_SPRG7W, r8
+
+	/* Load some guest volatiles. */
+	PPC_LL	r3, VCPU_LR(r4)
+	PPC_LL	r5, VCPU_XER(r4)
+	PPC_LL	r6, VCPU_CTR(r4)
+	PPC_LL	r7, VCPU_CR(r4)
+	PPC_LL	r8, VCPU_PC(r4)
+#ifndef CONFIG_64BIT
+	lwz	r9, (VCPU_SHARED_MSR + 4)(r11)
+#else
+	ld	r9, (VCPU_SHARED_MSR)(r11)
+#endif
+	PPC_LL	r0, VCPU_GPR(r0)(r4)
+	PPC_LL	r1, VCPU_GPR(r1)(r4)
+	PPC_LL	r2, VCPU_GPR(r2)(r4)
+	PPC_LL	r10, VCPU_GPR(r10)(r4)
+	PPC_LL	r11, VCPU_GPR(r11)(r4)
+	PPC_LL	r12, VCPU_GPR(r12)(r4)
+	PPC_LL	r13, VCPU_GPR(r13)(r4)
+	mtlr	r3
+	mtxer	r5
+	mtctr	r6
+	mtcr	r7
+	mtsrr0	r8
+	mtsrr1	r9
+
+#ifdef CONFIG_KVM_EXIT_TIMING
+	/* save enter time */
+1:
+	mfspr	r6, SPRN_TBRU
+	mfspr	r7, SPRN_TBRL
+	mfspr	r8, SPRN_TBRU
+	cmpw	r8, r6
+	PPC_STL	r7, VCPU_TIMING_LAST_ENTER_TBL(r4)
+	bne	1b
+	PPC_STL	r8, VCPU_TIMING_LAST_ENTER_TBU(r4)
+#endif
+
+	/* Finish loading guest volatiles and jump to guest. */
+	PPC_LL	r5, VCPU_GPR(r5)(r4)
+	PPC_LL	r6, VCPU_GPR(r6)(r4)
+	PPC_LL	r7, VCPU_GPR(r7)(r4)
+	PPC_LL	r8, VCPU_GPR(r8)(r4)
+	PPC_LL	r9, VCPU_GPR(r9)(r4)
+
+	PPC_LL	r3, VCPU_GPR(r3)(r4)
+	PPC_LL	r4, VCPU_GPR(r4)(r4)
+	rfi
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index cd53e08..6a530e4 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -114,6 +114,11 @@ int kvmppc_sanity_check(struct kvm_vcpu *vcpu)
 		goto out;
 #endif
 
+#ifdef CONFIG_KVM_BOOKE_HV
+	if (!cpu_has_feature(CPU_FTR_EMB_HV))
+		goto out;
+#endif
+
 	r = true;
 
 out:
diff --git a/arch/powerpc/kvm/timing.h b/arch/powerpc/kvm/timing.h
index 8167d42..bf191e7 100644
--- a/arch/powerpc/kvm/timing.h
+++ b/arch/powerpc/kvm/timing.h
@@ -93,6 +93,12 @@ static inline void kvmppc_account_exit_stat(struct kvm_vcpu *vcpu, int type)
 	case SIGNAL_EXITS:
 		vcpu->stat.signal_exits++;
 		break;
+	case DBELL_EXITS:
+		vcpu->stat.dbell_exits++;
+		break;
+	case GDBELL_EXITS:
+		vcpu->stat.gdbell_exits++;
+		break;
 	}
 }
 
-- 
cgit v0.10.2


From 8fae845f4956de0becc115e926d33eff46722e94 Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 20 Dec 2011 15:34:45 +0000
Subject: KVM: PPC: booke: standard PPC floating point support

e500mc has a normal PPC FPU, rather than SPE which is found
on e500v1/v2.

Based on code from Liu Yu <yu.liu@freescale.com>.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index caf82d0..1622c35 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -17,6 +17,7 @@ extern struct task_struct *_switch(struct thread_struct *prev,
 				   struct thread_struct *next);
 
 extern void giveup_fpu(struct task_struct *);
+extern void load_up_fpu(void);
 extern void disable_kernel_fp(void);
 extern void enable_kernel_fp(void);
 extern void flush_fp_to_thread(struct task_struct *);
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 75dbaeb..0b77be1 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -457,6 +457,11 @@ void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
 int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
 	int ret;
+#ifdef CONFIG_PPC_FPU
+	unsigned int fpscr;
+	int fpexc_mode;
+	u64 fpr[32];
+#endif
 
 	if (!vcpu->arch.sane) {
 		kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -479,7 +484,46 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	}
 
 	kvm_guest_enter();
+
+#ifdef CONFIG_PPC_FPU
+	/* Save userspace FPU state in stack */
+	enable_kernel_fp();
+	memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr));
+	fpscr = current->thread.fpscr.val;
+	fpexc_mode = current->thread.fpexc_mode;
+
+	/* Restore guest FPU state to thread */
+	memcpy(current->thread.fpr, vcpu->arch.fpr, sizeof(vcpu->arch.fpr));
+	current->thread.fpscr.val = vcpu->arch.fpscr;
+
+	/*
+	 * Since we can't trap on MSR_FP in GS-mode, we consider the guest
+	 * as always using the FPU.  Kernel usage of FP (via
+	 * enable_kernel_fp()) in this thread must not occur while
+	 * vcpu->fpu_active is set.
+	 */
+	vcpu->fpu_active = 1;
+
+	kvmppc_load_guest_fp(vcpu);
+#endif
+
 	ret = __kvmppc_vcpu_run(kvm_run, vcpu);
+
+#ifdef CONFIG_PPC_FPU
+	kvmppc_save_guest_fp(vcpu);
+
+	vcpu->fpu_active = 0;
+
+	/* Save guest FPU state from thread */
+	memcpy(vcpu->arch.fpr, current->thread.fpr, sizeof(vcpu->arch.fpr));
+	vcpu->arch.fpscr = current->thread.fpscr.val;
+
+	/* Restore userspace FPU state from stack */
+	memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr));
+	current->thread.fpscr.val = fpscr;
+	current->thread.fpexc_mode = fpexc_mode;
+#endif
+
 	kvm_guest_exit();
 
 out:
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index d53bcf2..62c4fe5 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -23,6 +23,7 @@
 #include <linux/types.h>
 #include <linux/kvm_host.h>
 #include <asm/kvm_ppc.h>
+#include <asm/switch_to.h>
 #include "timing.h"
 
 /* interrupt priortity ordering */
@@ -96,4 +97,34 @@ enum int_class {
 
 void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type);
 
+/*
+ * Load up guest vcpu FP state if it's needed.
+ * It also set the MSR_FP in thread so that host know
+ * we're holding FPU, and then host can help to save
+ * guest vcpu FP state if other threads require to use FPU.
+ * This simulates an FP unavailable fault.
+ *
+ * It requires to be called with preemption disabled.
+ */
+static inline void kvmppc_load_guest_fp(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_PPC_FPU
+	if (vcpu->fpu_active && !(current->thread.regs->msr & MSR_FP)) {
+		load_up_fpu();
+		current->thread.regs->msr |= MSR_FP;
+	}
+#endif
+}
+
+/*
+ * Save guest vcpu FP state into thread.
+ * It requires to be called with preemption disabled.
+ */
+static inline void kvmppc_save_guest_fp(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_PPC_FPU
+	if (vcpu->fpu_active && (current->thread.regs->msr & MSR_FP))
+		giveup_fpu(current);
+#endif
+}
 #endif /* __KVM_BOOKE_H__ */
-- 
cgit v0.10.2


From 73196cd364a2d972d73fa08da9d81ca3215bed68 Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Tue, 20 Dec 2011 15:34:47 +0000
Subject: KVM: PPC: e500mc support

Add processor support for e500mc, using hardware virtualization support
(GS-mode).

Current issues include:
 - No support for external proxy (coreint) interrupt mode in the guest.

Includes work by Ashish Kalra <Ashish.Kalra@freescale.com>,
Varun Sethi <Varun.Sethi@freescale.com>, and
Liu Yu <yu.liu@freescale.com>.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index 7108a9c..67c34af 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -168,6 +168,7 @@ extern const char *powerpc_base_platform;
 #define CPU_FTR_LWSYNC			ASM_CONST(0x0000000008000000)
 #define CPU_FTR_NOEXECUTE		ASM_CONST(0x0000000010000000)
 #define CPU_FTR_INDEXED_DCR		ASM_CONST(0x0000000020000000)
+#define CPU_FTR_EMB_HV			ASM_CONST(0x0000000040000000)
 
 /*
  * Add the 64-bit processor unique features in the top half of the word;
@@ -386,11 +387,11 @@ extern const char *powerpc_base_platform;
 	    CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE)
 #define CPU_FTRS_E500MC	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | \
 	    CPU_FTR_L2CSR | CPU_FTR_LWSYNC | CPU_FTR_NOEXECUTE | \
-	    CPU_FTR_DBELL | CPU_FTR_DEBUG_LVL_EXC)
+	    CPU_FTR_DBELL | CPU_FTR_DEBUG_LVL_EXC | CPU_FTR_EMB_HV)
 #define CPU_FTRS_E5500	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | \
 	    CPU_FTR_L2CSR | CPU_FTR_LWSYNC | CPU_FTR_NOEXECUTE | \
 	    CPU_FTR_DBELL | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
-	    CPU_FTR_DEBUG_LVL_EXC)
+	    CPU_FTR_DEBUG_LVL_EXC | CPU_FTR_EMB_HV)
 #define CPU_FTRS_E6500	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | \
 	    CPU_FTR_L2CSR | CPU_FTR_LWSYNC | CPU_FTR_NOEXECUTE | \
 	    CPU_FTR_DBELL | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
@@ -539,6 +540,7 @@ enum {
 #ifdef CONFIG_PPC_E500MC
 	    CPU_FTRS_E500MC & CPU_FTRS_E5500 & CPU_FTRS_E6500 &
 #endif
+	    ~CPU_FTR_EMB_HV &	/* can be removed at runtime */
 	    CPU_FTRS_POSSIBLE,
 };
 #endif /* __powerpc64__ */
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index b921c3f..1bea4d8 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -277,6 +277,7 @@ struct kvm_sync_regs {
 #define KVM_CPU_E500V2		2
 #define KVM_CPU_3S_32		3
 #define KVM_CPU_3S_64		4
+#define KVM_CPU_E500MC		5
 
 /* for KVM_CAP_SPAPR_TCE */
 struct kvm_create_spapr_tce {
diff --git a/arch/powerpc/kernel/cpu_setup_fsl_booke.S b/arch/powerpc/kernel/cpu_setup_fsl_booke.S
index 8053db0..69fdd23 100644
--- a/arch/powerpc/kernel/cpu_setup_fsl_booke.S
+++ b/arch/powerpc/kernel/cpu_setup_fsl_booke.S
@@ -73,6 +73,7 @@ _GLOBAL(__setup_cpu_e500v2)
 	mtlr	r4
 	blr
 _GLOBAL(__setup_cpu_e500mc)
+	mr	r5, r4
 	mflr	r4
 	bl	__e500_icache_setup
 	bl	__e500_dcache_setup
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index 7c406dd..89c6d6f 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -380,10 +380,16 @@ interrupt_base:
 	mtspr	SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */
 	mfspr	r10, SPRN_SPRG_THREAD
 	stw	r11, THREAD_NORMSAVE(0)(r10)
+#ifdef CONFIG_KVM_BOOKE_HV
+BEGIN_FTR_SECTION
+	mfspr	r11, SPRN_SRR1
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+#endif
 	stw	r12, THREAD_NORMSAVE(1)(r10)
 	stw	r13, THREAD_NORMSAVE(2)(r10)
 	mfcr	r13
 	stw	r13, THREAD_NORMSAVE(3)(r10)
+	DO_KVM	BOOKE_INTERRUPT_DTLB_MISS SPRN_SRR1
 	mfspr	r10, SPRN_DEAR		/* Get faulting address */
 
 	/* If we are faulting a kernel address, we have to use the
@@ -468,10 +474,16 @@ interrupt_base:
 	mtspr	SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */
 	mfspr	r10, SPRN_SPRG_THREAD
 	stw	r11, THREAD_NORMSAVE(0)(r10)
+#ifdef CONFIG_KVM_BOOKE_HV
+BEGIN_FTR_SECTION
+	mfspr	r11, SPRN_SRR1
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+#endif
 	stw	r12, THREAD_NORMSAVE(1)(r10)
 	stw	r13, THREAD_NORMSAVE(2)(r10)
 	mfcr	r13
 	stw	r13, THREAD_NORMSAVE(3)(r10)
+	DO_KVM	BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR1
 	mfspr	r10, SPRN_SRR0		/* Get faulting address */
 
 	/* If we are faulting a kernel address, we have to use the
@@ -580,6 +592,17 @@ interrupt_base:
 	DEBUG_DEBUG_EXCEPTION
 	DEBUG_CRIT_EXCEPTION
 
+	GUEST_DOORBELL_EXCEPTION
+
+	CRITICAL_EXCEPTION(0, GUEST_DBELL_CRIT, CriticalGuestDoorbell, \
+			   unknown_exception)
+
+	/* Hypercall */
+	EXCEPTION(0, HV_SYSCALL, Hypercall, unknown_exception, EXC_XFER_EE)
+
+	/* Embedded Hypervisor Privilege */
+	EXCEPTION(0, HV_PRIV, Ehvpriv, unknown_exception, EXC_XFER_EE)
+
 /*
  * Local functions
  */
@@ -883,8 +906,31 @@ _GLOBAL(__setup_e500mc_ivors)
 	mtspr	SPRN_IVOR36,r3
 	li	r3,CriticalDoorbell@l
 	mtspr	SPRN_IVOR37,r3
+
+	/*
+	 * We only want to touch IVOR38-41 if we're running on hardware
+	 * that supports category E.HV.  The architectural way to determine
+	 * this is MMUCFG[LPIDSIZE].
+	 */
+	mfspr	r3, SPRN_MMUCFG
+	andis.	r3, r3, MMUCFG_LPIDSIZE@h
+	beq	no_hv
+	li	r3,GuestDoorbell@l
+	mtspr	SPRN_IVOR38,r3
+	li	r3,CriticalGuestDoorbell@l
+	mtspr	SPRN_IVOR39,r3
+	li	r3,Hypercall@l
+	mtspr	SPRN_IVOR40,r3
+	li	r3,Ehvpriv@l
+	mtspr	SPRN_IVOR41,r3
+skip_hv_ivors:
 	sync
 	blr
+no_hv:
+	lwz	r3, CPU_SPEC_FEATURES(r5)
+	rlwinm	r3, r3, 0, ~CPU_FTR_EMB_HV
+	stw	r3, CPU_SPEC_FEATURES(r5)
+	b	skip_hv_ivors
 
 /*
  * extern void giveup_altivec(struct task_struct *prev)
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 2c33cd3..58f6e68 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -109,7 +109,7 @@ config KVM_440
 
 config KVM_EXIT_TIMING
 	bool "Detailed exit timing"
-	depends on KVM_440 || KVM_E500
+	depends on KVM_440 || KVM_E500 || KVM_E500MC
 	---help---
 	  Calculate elapsed time for every exit/enter cycle. A per-vcpu
 	  report is available in debugfs kvm/vm#_vcpu#_timing.
@@ -132,6 +132,21 @@ config KVM_E500
 
 	  If unsure, say N.
 
+config KVM_E500MC
+	bool "KVM support for PowerPC E500MC/E5500 processors"
+	depends on EXPERIMENTAL && PPC_E500MC
+	select KVM
+	select KVM_MMIO
+	select KVM_BOOKE_HV
+	---help---
+	  Support running unmodified E500MC/E5500 (32-bit) guest kernels in
+	  virtual machines on E500MC/E5500 host processors.
+
+	  This module provides access to the hardware capabilities through
+	  a character device node named /dev/kvm.
+
+	  If unsure, say N.
+
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 3688aee..62febd7 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -38,6 +38,16 @@ kvm-e500-objs := \
 	e500_emulate.o
 kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs)
 
+kvm-e500mc-objs := \
+	$(common-objs-y) \
+	booke.o \
+	booke_emulate.o \
+	bookehv_interrupts.o \
+	e500mc.o \
+	e500_tlb.o \
+	e500_emulate.o
+kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs)
+
 kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
 	../../../virt/kvm/coalesced_mmio.o \
 	fpu.o \
@@ -89,6 +99,7 @@ kvm-objs := $(kvm-objs-m) $(kvm-objs-y)
 
 obj-$(CONFIG_KVM_440) += kvm.o
 obj-$(CONFIG_KVM_E500) += kvm.o
+obj-$(CONFIG_KVM_E500MC) += kvm.o
 obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o
 obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o
 
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index ce3f163..3143085 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -2,7 +2,9 @@
  * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
  *
  * Author: Yu Liu <yu.liu@freescale.com>
+ *         Scott Wood <scottwood@freescale.com>
  *         Ashish Kalra <ashish.kalra@freescale.com>
+ *         Varun Sethi <varun.sethi@freescale.com>
  *
  * Description:
  * This file is based on arch/powerpc/kvm/44x_tlb.h and
@@ -100,6 +102,7 @@ static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct kvmppc_vcpu_e500, vcpu);
 }
 
+
 /* This geometry is the legacy default -- can be overridden by userspace */
 #define KVM_E500_TLB0_WAY_SIZE		128
 #define KVM_E500_TLB0_WAY_NUM		2
@@ -250,10 +253,12 @@ static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
 	if (!get_tlb_v(tlbe))
 		return 0;
 
+#ifndef CONFIG_KVM_BOOKE_HV
 	/* Does it match current guest AS? */
 	/* XXX what about IS != DS? */
 	if (get_tlb_ts(tlbe) != !!(vcpu->arch.shared->msr & MSR_IS))
 		return 0;
+#endif
 
 	gpa = get_tlb_raddr(tlbe);
 	if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT))
@@ -274,7 +279,11 @@ void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500,
 			   struct kvm_book3e_206_tlb_entry *gtlbe);
 void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500);
 
-#ifdef CONFIG_KVM_E500
+#ifdef CONFIG_KVM_BOOKE_HV
+#define kvmppc_e500_get_tlb_stid(vcpu, gtlbe)       get_tlb_tid(gtlbe)
+#define get_tlbmiss_tid(vcpu)           get_cur_pid(vcpu)
+#define get_tlb_sts(gtlbe)              (gtlbe->mas1 & MAS1_TS)
+#else
 unsigned int kvmppc_e500_get_tlb_stid(struct kvm_vcpu *vcpu,
 				      struct kvm_book3e_206_tlb_entry *gtlbe);
 
@@ -288,6 +297,6 @@ static inline unsigned int get_tlbmiss_tid(struct kvm_vcpu *vcpu)
 
 /* Force TS=1 for all guest mappings. */
 #define get_tlb_sts(gtlbe)              (MAS1_TS)
-#endif /* CONFIG_KVM_E500 */
+#endif /* !BOOKE_HV */
 
 #endif /* KVM_E500_H */
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index af02c18..98b6c1c 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -85,6 +85,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 	ulong spr_val = kvmppc_get_gpr(vcpu, rs);
 
 	switch (sprn) {
+#ifndef CONFIG_KVM_BOOKE_HV
 	case SPRN_PID:
 		kvmppc_set_pid(vcpu, spr_val);
 		break;
@@ -114,6 +115,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 		vcpu->arch.shared->mas7_3 &= (u64)0xffffffff;
 		vcpu->arch.shared->mas7_3 |= (u64)spr_val << 32;
 		break;
+#endif
 	case SPRN_L1CSR0:
 		vcpu_e500->l1csr0 = spr_val;
 		vcpu_e500->l1csr0 &= ~(L1CSR0_DCFI | L1CSR0_CLFC);
@@ -143,7 +145,14 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 	case SPRN_IVOR35:
 		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = spr_val;
 		break;
-
+#ifdef CONFIG_KVM_BOOKE_HV
+	case SPRN_IVOR36:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL] = spr_val;
+		break;
+	case SPRN_IVOR37:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT] = spr_val;
+		break;
+#endif
 	default:
 		emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs);
 	}
@@ -155,9 +164,11 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 	int emulated = EMULATE_DONE;
-	unsigned long val;
 
 	switch (sprn) {
+#ifndef CONFIG_KVM_BOOKE_HV
+		unsigned long val;
+
 	case SPRN_PID:
 		kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[0]); break;
 	case SPRN_PID1:
@@ -182,6 +193,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 		val = vcpu->arch.shared->mas7_3 >> 32;
 		kvmppc_set_gpr(vcpu, rt, val);
 		break;
+#endif
 	case SPRN_TLB0CFG:
 		kvmppc_set_gpr(vcpu, rt, vcpu->arch.tlbcfg[0]); break;
 	case SPRN_TLB1CFG:
@@ -216,6 +228,14 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 	case SPRN_IVOR35:
 		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]);
 		break;
+#ifdef CONFIG_KVM_BOOKE_HV
+	case SPRN_IVOR36:
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL]);
+		break;
+	case SPRN_IVOR37:
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT]);
+		break;
+#endif
 	default:
 		emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt);
 	}
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 6eb5d65..e232bb4 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -2,7 +2,9 @@
  * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
  *
  * Author: Yu Liu, yu.liu@freescale.com
+ *         Scott Wood, scottwood@freescale.com
  *         Ashish Kalra, ashish.kalra@freescale.com
+ *         Varun Sethi, varun.sethi@freescale.com
  *
  * Description:
  * This file is based on arch/powerpc/kvm/44x_tlb.c,
@@ -64,6 +66,7 @@ static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode)
 	/* Mask off reserved bits. */
 	mas3 &= MAS3_ATTRIB_MASK;
 
+#ifndef CONFIG_KVM_BOOKE_HV
 	if (!usermode) {
 		/* Guest is in supervisor mode,
 		 * so we need to translate guest
@@ -71,8 +74,9 @@ static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode)
 		mas3 &= ~E500_TLB_USER_PERM_MASK;
 		mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1;
 	}
-
-	return mas3 | E500_TLB_SUPER_PERM_MASK;
+	mas3 |= E500_TLB_SUPER_PERM_MASK;
+#endif
+	return mas3;
 }
 
 static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode)
@@ -98,7 +102,16 @@ static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe,
 	mtspr(SPRN_MAS2, (unsigned long)stlbe->mas2);
 	mtspr(SPRN_MAS3, (u32)stlbe->mas7_3);
 	mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32));
+#ifdef CONFIG_KVM_BOOKE_HV
+	mtspr(SPRN_MAS8, stlbe->mas8);
+#endif
 	asm volatile("isync; tlbwe" : : : "memory");
+
+#ifdef CONFIG_KVM_BOOKE_HV
+	/* Must clear mas8 for other host tlbwe's */
+	mtspr(SPRN_MAS8, 0);
+	isync();
+#endif
 	local_irq_restore(flags);
 
 	trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1,
@@ -384,6 +397,10 @@ static inline void kvmppc_e500_setup_stlbe(
 		      e500_shadow_mas2_attrib(gtlbe->mas2, pr);
 	stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) |
 			e500_shadow_mas3_attrib(gtlbe->mas7_3, pr);
+
+#ifdef CONFIG_KVM_BOOKE_HV
+	stlbe->mas8 = MAS8_TGS | vcpu->kvm->arch.lpid;
+#endif
 }
 
 static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c
new file mode 100644
index 0000000..fe6c1de
--- /dev/null
+++ b/arch/powerpc/kvm/e500mc.c
@@ -0,0 +1,342 @@
+/*
+ * Copyright (C) 2010 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Varun Sethi, <varun.sethi@freescale.com>
+ *
+ * Description:
+ * This file is derived from arch/powerpc/kvm/e500.c,
+ * by Yu Liu <yu.liu@freescale.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/export.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/dbell.h>
+
+#include "booke.h"
+#include "e500.h"
+
+void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type)
+{
+	enum ppc_dbell dbell_type;
+	unsigned long tag;
+
+	switch (type) {
+	case INT_CLASS_NONCRIT:
+		dbell_type = PPC_G_DBELL;
+		break;
+	case INT_CLASS_CRIT:
+		dbell_type = PPC_G_DBELL_CRIT;
+		break;
+	case INT_CLASS_MC:
+		dbell_type = PPC_G_DBELL_MC;
+		break;
+	default:
+		WARN_ONCE(1, "%s: unknown int type %d\n", __func__, type);
+		return;
+	}
+
+
+	tag = PPC_DBELL_LPID(vcpu->kvm->arch.lpid) | vcpu->vcpu_id;
+	mb();
+	ppc_msgsnd(dbell_type, 0, tag);
+}
+
+/* gtlbe must not be mapped by more than one host tlb entry */
+void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500,
+			   struct kvm_book3e_206_tlb_entry *gtlbe)
+{
+	unsigned int tid, ts;
+	u32 val, eaddr, lpid;
+	unsigned long flags;
+
+	ts = get_tlb_ts(gtlbe);
+	tid = get_tlb_tid(gtlbe);
+	lpid = vcpu_e500->vcpu.kvm->arch.lpid;
+
+	/* We search the host TLB to invalidate its shadow TLB entry */
+	val = (tid << 16) | ts;
+	eaddr = get_tlb_eaddr(gtlbe);
+
+	local_irq_save(flags);
+
+	mtspr(SPRN_MAS6, val);
+	mtspr(SPRN_MAS5, MAS5_SGS | lpid);
+
+	asm volatile("tlbsx 0, %[eaddr]\n" : : [eaddr] "r" (eaddr));
+	val = mfspr(SPRN_MAS1);
+	if (val & MAS1_VALID) {
+		mtspr(SPRN_MAS1, val & ~MAS1_VALID);
+		asm volatile("tlbwe");
+	}
+	mtspr(SPRN_MAS5, 0);
+	/* NOTE: tlbsx also updates mas8, so clear it for host tlbwe */
+	mtspr(SPRN_MAS8, 0);
+	isync();
+
+	local_irq_restore(flags);
+}
+
+void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	mtspr(SPRN_MAS5, MAS5_SGS | vcpu_e500->vcpu.kvm->arch.lpid);
+	asm volatile("tlbilxlpid");
+	mtspr(SPRN_MAS5, 0);
+	local_irq_restore(flags);
+}
+
+void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid)
+{
+	vcpu->arch.pid = pid;
+}
+
+void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr)
+{
+}
+
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	kvmppc_booke_vcpu_load(vcpu, cpu);
+
+	mtspr(SPRN_LPID, vcpu->kvm->arch.lpid);
+	mtspr(SPRN_EPCR, vcpu->arch.shadow_epcr);
+	mtspr(SPRN_GPIR, vcpu->vcpu_id);
+	mtspr(SPRN_MSRP, vcpu->arch.shadow_msrp);
+	mtspr(SPRN_EPLC, vcpu->arch.eplc);
+	mtspr(SPRN_EPSC, vcpu->arch.epsc);
+
+	mtspr(SPRN_GIVPR, vcpu->arch.ivpr);
+	mtspr(SPRN_GIVOR2, vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]);
+	mtspr(SPRN_GIVOR8, vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]);
+	mtspr(SPRN_GSPRG0, (unsigned long)vcpu->arch.shared->sprg0);
+	mtspr(SPRN_GSPRG1, (unsigned long)vcpu->arch.shared->sprg1);
+	mtspr(SPRN_GSPRG2, (unsigned long)vcpu->arch.shared->sprg2);
+	mtspr(SPRN_GSPRG3, (unsigned long)vcpu->arch.shared->sprg3);
+
+	mtspr(SPRN_GSRR0, vcpu->arch.shared->srr0);
+	mtspr(SPRN_GSRR1, vcpu->arch.shared->srr1);
+
+	mtspr(SPRN_GEPR, vcpu->arch.epr);
+	mtspr(SPRN_GDEAR, vcpu->arch.shared->dar);
+	mtspr(SPRN_GESR, vcpu->arch.shared->esr);
+
+	if (vcpu->arch.oldpir != mfspr(SPRN_PIR))
+		kvmppc_e500_tlbil_all(vcpu_e500);
+
+	kvmppc_load_guest_fp(vcpu);
+}
+
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.eplc = mfspr(SPRN_EPLC);
+	vcpu->arch.epsc = mfspr(SPRN_EPSC);
+
+	vcpu->arch.shared->sprg0 = mfspr(SPRN_GSPRG0);
+	vcpu->arch.shared->sprg1 = mfspr(SPRN_GSPRG1);
+	vcpu->arch.shared->sprg2 = mfspr(SPRN_GSPRG2);
+	vcpu->arch.shared->sprg3 = mfspr(SPRN_GSPRG3);
+
+	vcpu->arch.shared->srr0 = mfspr(SPRN_GSRR0);
+	vcpu->arch.shared->srr1 = mfspr(SPRN_GSRR1);
+
+	vcpu->arch.epr = mfspr(SPRN_GEPR);
+	vcpu->arch.shared->dar = mfspr(SPRN_GDEAR);
+	vcpu->arch.shared->esr = mfspr(SPRN_GESR);
+
+	vcpu->arch.oldpir = mfspr(SPRN_PIR);
+
+	kvmppc_booke_vcpu_put(vcpu);
+}
+
+int kvmppc_core_check_processor_compat(void)
+{
+	int r;
+
+	if (strcmp(cur_cpu_spec->cpu_name, "e500mc") == 0)
+		r = 0;
+	else if (strcmp(cur_cpu_spec->cpu_name, "e5500") == 0)
+		r = 0;
+	else
+		r = -ENOTSUPP;
+
+	return r;
+}
+
+int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	vcpu->arch.shadow_epcr = SPRN_EPCR_DSIGS | SPRN_EPCR_DGTMI | \
+				 SPRN_EPCR_DUVD;
+	vcpu->arch.shadow_msrp = MSRP_UCLEP | MSRP_DEP | MSRP_PMMP;
+	vcpu->arch.eplc = EPC_EGS | (vcpu->kvm->arch.lpid << EPC_ELPID_SHIFT);
+	vcpu->arch.epsc = vcpu->arch.eplc;
+
+	vcpu->arch.pvr = mfspr(SPRN_PVR);
+	vcpu_e500->svr = mfspr(SPRN_SVR);
+
+	vcpu->arch.cpu_type = KVM_CPU_E500MC;
+
+	return 0;
+}
+
+void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	sregs->u.e.features |= KVM_SREGS_E_ARCH206_MMU | KVM_SREGS_E_PM |
+			       KVM_SREGS_E_PC;
+	sregs->u.e.impl_id = KVM_SREGS_E_IMPL_FSL;
+
+	sregs->u.e.impl.fsl.features = 0;
+	sregs->u.e.impl.fsl.svr = vcpu_e500->svr;
+	sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0;
+	sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar;
+
+	kvmppc_get_sregs_e500_tlb(vcpu, sregs);
+
+	sregs->u.e.ivor_high[3] =
+		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR];
+	sregs->u.e.ivor_high[4] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL];
+	sregs->u.e.ivor_high[5] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT];
+
+	kvmppc_get_sregs_ivor(vcpu, sregs);
+}
+
+int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int ret;
+
+	if (sregs->u.e.impl_id == KVM_SREGS_E_IMPL_FSL) {
+		vcpu_e500->svr = sregs->u.e.impl.fsl.svr;
+		vcpu_e500->hid0 = sregs->u.e.impl.fsl.hid0;
+		vcpu_e500->mcar = sregs->u.e.impl.fsl.mcar;
+	}
+
+	ret = kvmppc_set_sregs_e500_tlb(vcpu, sregs);
+	if (ret < 0)
+		return ret;
+
+	if (!(sregs->u.e.features & KVM_SREGS_E_IVOR))
+		return 0;
+
+	if (sregs->u.e.features & KVM_SREGS_E_PM) {
+		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] =
+			sregs->u.e.ivor_high[3];
+	}
+
+	if (sregs->u.e.features & KVM_SREGS_E_PC) {
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL] =
+			sregs->u.e.ivor_high[4];
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT] =
+			sregs->u.e.ivor_high[5];
+	}
+
+	return kvmppc_set_sregs_ivor(vcpu, sregs);
+}
+
+struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500;
+	struct kvm_vcpu *vcpu;
+	int err;
+
+	vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+	if (!vcpu_e500) {
+		err = -ENOMEM;
+		goto out;
+	}
+	vcpu = &vcpu_e500->vcpu;
+
+	/* Invalid PIR value -- this LPID dosn't have valid state on any cpu */
+	vcpu->arch.oldpir = 0xffffffff;
+
+	err = kvm_vcpu_init(vcpu, kvm, id);
+	if (err)
+		goto free_vcpu;
+
+	err = kvmppc_e500_tlb_init(vcpu_e500);
+	if (err)
+		goto uninit_vcpu;
+
+	vcpu->arch.shared = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+	if (!vcpu->arch.shared)
+		goto uninit_tlb;
+
+	return vcpu;
+
+uninit_tlb:
+	kvmppc_e500_tlb_uninit(vcpu_e500);
+uninit_vcpu:
+	kvm_vcpu_uninit(vcpu);
+
+free_vcpu:
+	kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
+out:
+	return ERR_PTR(err);
+}
+
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	free_page((unsigned long)vcpu->arch.shared);
+	kvmppc_e500_tlb_uninit(vcpu_e500);
+	kvm_vcpu_uninit(vcpu);
+	kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
+}
+
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+	int lpid;
+
+	lpid = kvmppc_alloc_lpid();
+	if (lpid < 0)
+		return lpid;
+
+	kvm->arch.lpid = lpid;
+	return 0;
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+	kvmppc_free_lpid(kvm->arch.lpid);
+}
+
+static int __init kvmppc_e500mc_init(void)
+{
+	int r;
+
+	r = kvmppc_booke_init();
+	if (r)
+		return r;
+
+	kvmppc_init_lpid(64);
+	kvmppc_claim_lpid(0); /* host */
+
+	return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
+}
+
+static void __exit kvmppc_e500mc_exit(void)
+{
+	kvmppc_booke_exit();
+}
+
+module_init(kvmppc_e500mc_init);
+module_exit(kvmppc_e500mc_exit);
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 6a530e4..14bf8d5 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -235,7 +235,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PPC_PAIRED_SINGLES:
 	case KVM_CAP_PPC_OSI:
 	case KVM_CAP_PPC_GET_PVINFO:
-#ifdef CONFIG_KVM_E500
+#if defined(CONFIG_KVM_E500) || defined(CONFIG_KVM_E500MC)
 	case KVM_CAP_SW_TLB:
 #endif
 		r = 1;
@@ -629,7 +629,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 		r = 0;
 		vcpu->arch.papr_enabled = true;
 		break;
-#ifdef CONFIG_KVM_E500
+#if defined(CONFIG_KVM_E500) || defined(CONFIG_KVM_E500MC)
 	case KVM_CAP_SW_TLB: {
 		struct kvm_config_tlb cfg;
 		void __user *user_ptr = (void __user *)(uintptr_t)cap->args[0];
@@ -706,7 +706,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 
-#ifdef CONFIG_KVM_E500
+#if defined(CONFIG_KVM_E500) || defined(CONFIG_KVM_E500MC)
 	case KVM_DIRTY_TLB: {
 		struct kvm_dirty_tlb dirty;
 		r = -EFAULT;
-- 
cgit v0.10.2


From 4ab969199ec6a14604ceaffb21fe78cc4881d3b8 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Wed, 15 Feb 2012 13:28:48 +0000
Subject: KVM: PPC: e500mc: Add doorbell emulation support

When one vcpu wants to kick another, it can issue a special IPI instruction
called msgsnd. This patch emulates this instruction, its clearing counterpart
and the infrastructure required to actually trigger that interrupt inside
a guest vcpu.

With this patch, SMP guests on e500mc work.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h
index d7365b0..154c067 100644
--- a/arch/powerpc/include/asm/dbell.h
+++ b/arch/powerpc/include/asm/dbell.h
@@ -19,7 +19,9 @@
 
 #define PPC_DBELL_MSG_BRDCAST	(0x04000000)
 #define PPC_DBELL_TYPE(x)	(((x) & 0xf) << (63-36))
+#define PPC_DBELL_TYPE_MASK	PPC_DBELL_TYPE(0xf)
 #define PPC_DBELL_LPID(x)	((x) << (63 - 49))
+#define PPC_DBELL_PIR_MASK	0x3fff
 enum ppc_dbell {
 	PPC_DBELL = 0,		/* doorbell */
 	PPC_DBELL_CRIT = 1,	/* critical doorbell */
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 0b77be1..85bd5b8 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -326,6 +326,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 		int_class = INT_CLASS_NONCRIT;
 		break;
 	case BOOKE_IRQPRIO_CRITICAL:
+	case BOOKE_IRQPRIO_DBELL_CRIT:
 		allowed = vcpu->arch.shared->msr & MSR_CE;
 		allowed = allowed && !crit;
 		msr_mask = MSR_GS | MSR_ME;
@@ -342,6 +343,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 		keep_irq = true;
 		/* fall through */
 	case BOOKE_IRQPRIO_EXTERNAL:
+	case BOOKE_IRQPRIO_DBELL:
 		allowed = vcpu->arch.shared->msr & MSR_EE;
 		allowed = allowed && !crit;
 		msr_mask = MSR_GS | MSR_CE | MSR_ME | MSR_DE;
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index 98b6c1c..99155f8 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -14,16 +14,74 @@
 
 #include <asm/kvm_ppc.h>
 #include <asm/disassemble.h>
+#include <asm/dbell.h>
 
 #include "booke.h"
 #include "e500.h"
 
+#define XOP_MSGSND  206
+#define XOP_MSGCLR  238
 #define XOP_TLBIVAX 786
 #define XOP_TLBSX   914
 #define XOP_TLBRE   946
 #define XOP_TLBWE   978
 #define XOP_TLBILX  18
 
+#ifdef CONFIG_KVM_E500MC
+static int dbell2prio(ulong param)
+{
+	int msg = param & PPC_DBELL_TYPE_MASK;
+	int prio = -1;
+
+	switch (msg) {
+	case PPC_DBELL_TYPE(PPC_DBELL):
+		prio = BOOKE_IRQPRIO_DBELL;
+		break;
+	case PPC_DBELL_TYPE(PPC_DBELL_CRIT):
+		prio = BOOKE_IRQPRIO_DBELL_CRIT;
+		break;
+	default:
+		break;
+	}
+
+	return prio;
+}
+
+static int kvmppc_e500_emul_msgclr(struct kvm_vcpu *vcpu, int rb)
+{
+	ulong param = vcpu->arch.gpr[rb];
+	int prio = dbell2prio(param);
+
+	if (prio < 0)
+		return EMULATE_FAIL;
+
+	clear_bit(prio, &vcpu->arch.pending_exceptions);
+	return EMULATE_DONE;
+}
+
+static int kvmppc_e500_emul_msgsnd(struct kvm_vcpu *vcpu, int rb)
+{
+	ulong param = vcpu->arch.gpr[rb];
+	int prio = dbell2prio(rb);
+	int pir = param & PPC_DBELL_PIR_MASK;
+	int i;
+	struct kvm_vcpu *cvcpu;
+
+	if (prio < 0)
+		return EMULATE_FAIL;
+
+	kvm_for_each_vcpu(i, cvcpu, vcpu->kvm) {
+		int cpir = cvcpu->arch.shared->pir;
+		if ((param & PPC_DBELL_MSG_BRDCAST) || (cpir == pir)) {
+			set_bit(prio, &cvcpu->arch.pending_exceptions);
+			kvm_vcpu_kick(cvcpu);
+		}
+	}
+
+	return EMULATE_DONE;
+}
+#endif
+
 int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                            unsigned int inst, int *advance)
 {
@@ -36,6 +94,16 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	case 31:
 		switch (get_xop(inst)) {
 
+#ifdef CONFIG_KVM_E500MC
+		case XOP_MSGSND:
+			emulated = kvmppc_e500_emul_msgsnd(vcpu, get_rb(inst));
+			break;
+
+		case XOP_MSGCLR:
+			emulated = kvmppc_e500_emul_msgclr(vcpu, get_rb(inst));
+			break;
+#endif
+
 		case XOP_TLBRE:
 			emulated = kvmppc_e500_emul_tlbre(vcpu);
 			break;
-- 
cgit v0.10.2


From 79300f8cb9be201f916d075b3ef2e032d83a0d75 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Wed, 15 Feb 2012 19:12:29 +0000
Subject: KVM: PPC: e500mc: implicitly set MSR_GS

When setting MSR for an e500mc guest, we implicitly always set MSR_GS
to make sure the guest is in guest state. Since we have this implicit
rule there, we don't need to explicitly pass MSR_GS to set_msr().

Remove all explicit setters of MSR_GS.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 85bd5b8..fcbe928 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -280,7 +280,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
                                         unsigned int priority)
 {
 	int allowed = 0;
-	ulong uninitialized_var(msr_mask);
+	ulong msr_mask = 0;
 	bool update_esr = false, update_dear = false;
 	ulong crit_raw = vcpu->arch.shared->critical;
 	ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
@@ -322,20 +322,19 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	case BOOKE_IRQPRIO_AP_UNAVAIL:
 	case BOOKE_IRQPRIO_ALIGNMENT:
 		allowed = 1;
-		msr_mask = MSR_GS | MSR_CE | MSR_ME | MSR_DE;
+		msr_mask = MSR_CE | MSR_ME | MSR_DE;
 		int_class = INT_CLASS_NONCRIT;
 		break;
 	case BOOKE_IRQPRIO_CRITICAL:
 	case BOOKE_IRQPRIO_DBELL_CRIT:
 		allowed = vcpu->arch.shared->msr & MSR_CE;
 		allowed = allowed && !crit;
-		msr_mask = MSR_GS | MSR_ME;
+		msr_mask = MSR_ME;
 		int_class = INT_CLASS_CRIT;
 		break;
 	case BOOKE_IRQPRIO_MACHINE_CHECK:
 		allowed = vcpu->arch.shared->msr & MSR_ME;
 		allowed = allowed && !crit;
-		msr_mask = MSR_GS;
 		int_class = INT_CLASS_MC;
 		break;
 	case BOOKE_IRQPRIO_DECREMENTER:
@@ -346,13 +345,13 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	case BOOKE_IRQPRIO_DBELL:
 		allowed = vcpu->arch.shared->msr & MSR_EE;
 		allowed = allowed && !crit;
-		msr_mask = MSR_GS | MSR_CE | MSR_ME | MSR_DE;
+		msr_mask = MSR_CE | MSR_ME | MSR_DE;
 		int_class = INT_CLASS_NONCRIT;
 		break;
 	case BOOKE_IRQPRIO_DEBUG:
 		allowed = vcpu->arch.shared->msr & MSR_DE;
 		allowed = allowed && !crit;
-		msr_mask = MSR_GS | MSR_ME;
+		msr_mask = MSR_ME;
 		int_class = INT_CLASS_CRIT;
 		break;
 	}
-- 
cgit v0.10.2


From a2723ce7fe4b99bc2df492067c3f81de2ee89aab Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Wed, 15 Feb 2012 23:06:24 +0000
Subject: KVM: PPC: e500mc: Move r1/r2 restoration very early

If we hit any exception whatsoever in the restore path and r1/r2 aren't the
host registers, we don't get a working oops. So it's always a good idea to
restore them as early as possible.

This time, it actually has practical reasons to do so too, since we need to
have the host page fault handler fix up our guest instruction read code. And
for that to work we need r1/r2 restored.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index 9eaeebd..63023ae 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -67,6 +67,12 @@
  * saved in vcpu: cr, ctr, r3-r13
  */
 .macro kvm_handler_common intno, srr0, flags
+	/* Restore host stack pointer */
+	PPC_STL	r1, VCPU_GPR(r1)(r4)
+	PPC_STL	r2, VCPU_GPR(r2)(r4)
+	PPC_LL	r1, VCPU_HOST_STACK(r4)
+	PPC_LL	r2, HOST_R2(r1)
+
 	mfspr	r10, SPRN_PID
 	lwz	r8, VCPU_HOST_PID(r4)
 	PPC_LL	r11, VCPU_SHARED(r4)
@@ -290,10 +296,8 @@ _GLOBAL(kvmppc_resume_host)
 	/* Save remaining volatile guest register state to vcpu. */
 	mfspr	r3, SPRN_VRSAVE
 	PPC_STL	r0, VCPU_GPR(r0)(r4)
-	PPC_STL	r1, VCPU_GPR(r1)(r4)
 	mflr	r5
 	mfspr	r6, SPRN_SPRG4
-	PPC_STL	r2, VCPU_GPR(r2)(r4)
 	PPC_STL	r5, VCPU_LR(r4)
 	mfspr	r7, SPRN_SPRG5
 	PPC_STL	r3, VCPU_VRSAVE(r4)
@@ -334,10 +338,6 @@ _GLOBAL(kvmppc_resume_host)
 	mtspr	SPRN_EPCR, r3
 	isync
 
-	/* Restore host stack pointer */
-	PPC_LL	r1, VCPU_HOST_STACK(r4)
-	PPC_LL	r2, HOST_R2(r1)
-
 	/* Switch to kernel stack and jump to handler. */
 	PPC_LL	r3, HOST_RUN(r1)
 	mr	r5, r14 /* intno */
-- 
cgit v0.10.2


From 1d628af78a28c77143bcdd4ed09e93bb235d4198 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Wed, 15 Feb 2012 23:24:28 +0000
Subject: KVM: PPC: e500mc: add load inst fixup

There's always a chance we're unable to read a guest instruction. The guest
could have its TLB mapped execute-, but not readable, something odd happens
and our TLB gets flushed. So it's a good idea to be prepared for that case
and have a fallback that allows us to fix things up in that case.

Add fixup code that keeps guest code from potentially crashing our host kernel.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index 63023ae..e9e7350 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -28,6 +28,7 @@
 #include <asm/asm-compat.h>
 #include <asm/asm-offsets.h>
 #include <asm/bitsperlong.h>
+#include <asm/thread_info.h>
 
 #include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */
 
@@ -171,9 +172,36 @@
 	PPC_STL	r30, VCPU_GPR(r30)(r4)
 	PPC_STL	r31, VCPU_GPR(r31)(r4)
 	mtspr	SPRN_EPLC, r8
+
+	/* disable preemption, so we are sure we hit the fixup handler */
+#ifdef CONFIG_PPC64
+	clrrdi	r8,r1,THREAD_SHIFT
+#else
+	rlwinm	r8,r1,0,0,31-THREAD_SHIFT       /* current thread_info */
+#endif
+	li	r7, 1
+	stw	r7, TI_PREEMPT(r8)
+
 	isync
-	lwepx	r9, 0, r5
+
+	/*
+	 * In case the read goes wrong, we catch it and write an invalid value
+	 * in LAST_INST instead.
+	 */
+1:	lwepx	r9, 0, r5
+2:
+.section .fixup, "ax"
+3:	li	r9, KVM_INST_FETCH_FAILED
+	b	2b
+.previous
+.section __ex_table,"a"
+	PPC_LONG_ALIGN
+	PPC_LONG 1b,3b
+.previous
+
 	mtspr	SPRN_EPLC, r3
+	li	r7, 0
+	stw	r7, TI_PREEMPT(r8)
 	stw	r9, VCPU_LAST_INST(r4)
 	.endif
 
-- 
cgit v0.10.2


From bf7ca4bdcb8f1eb15d11879efa824b45443ddb69 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Wed, 15 Feb 2012 23:40:00 +0000
Subject: KVM: PPC: rename CONFIG_KVM_E500 -> CONFIG_KVM_E500V2

The CONFIG_KVM_E500 option really indicates that we're running on a V2 machine,
not on a machine of the generic E500 class. So indicate that properly and
change the config name accordingly.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 58f6e68..44a998d 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -109,7 +109,7 @@ config KVM_440
 
 config KVM_EXIT_TIMING
 	bool "Detailed exit timing"
-	depends on KVM_440 || KVM_E500 || KVM_E500MC
+	depends on KVM_440 || KVM_E500V2 || KVM_E500MC
 	---help---
 	  Calculate elapsed time for every exit/enter cycle. A per-vcpu
 	  report is available in debugfs kvm/vm#_vcpu#_timing.
@@ -118,14 +118,14 @@ config KVM_EXIT_TIMING
 
 	  If unsure, say N.
 
-config KVM_E500
-	bool "KVM support for PowerPC E500 processors"
+config KVM_E500V2
+	bool "KVM support for PowerPC E500v2 processors"
 	depends on EXPERIMENTAL && E500
 	select KVM
 	select KVM_MMIO
 	---help---
 	  Support running unmodified E500 guest kernels in virtual machines on
-	  E500 host processors.
+	  E500v2 host processors.
 
 	  This module provides access to the hardware capabilities through
 	  a character device node named /dev/kvm.
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 62febd7..25225ae 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -36,7 +36,7 @@ kvm-e500-objs := \
 	e500.o \
 	e500_tlb.o \
 	e500_emulate.o
-kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs)
+kvm-objs-$(CONFIG_KVM_E500V2) := $(kvm-e500-objs)
 
 kvm-e500mc-objs := \
 	$(common-objs-y) \
@@ -98,7 +98,7 @@ kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs)
 kvm-objs := $(kvm-objs-m) $(kvm-objs-y)
 
 obj-$(CONFIG_KVM_440) += kvm.o
-obj-$(CONFIG_KVM_E500) += kvm.o
+obj-$(CONFIG_KVM_E500V2) += kvm.o
 obj-$(CONFIG_KVM_E500MC) += kvm.o
 obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o
 obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index fcbe928..9fcc760 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -762,7 +762,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		gpa_t gpaddr;
 		gfn_t gfn;
 
-#ifdef CONFIG_KVM_E500
+#ifdef CONFIG_KVM_E500V2
 		if (!(vcpu->arch.shared->msr & MSR_PR) &&
 		    (eaddr & PAGE_MASK) == vcpu->arch.magic_page_ea) {
 			kvmppc_map_magic(vcpu);
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index 3143085..7967f3f 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -39,7 +39,7 @@ struct tlbe_priv {
 	struct tlbe_ref ref; /* TLB0 only -- TLB1 uses tlb_refs */
 };
 
-#ifdef CONFIG_KVM_E500
+#ifdef CONFIG_KVM_E500V2
 struct vcpu_id_table;
 #endif
 
@@ -89,7 +89,7 @@ struct kvmppc_vcpu_e500 {
 	u64 *g2h_tlb1_map;
 	unsigned int *h2g_tlb1_rmap;
 
-#ifdef CONFIG_KVM_E500
+#ifdef CONFIG_KVM_E500V2
 	u32 pid[E500_PID_NUM];
 
 	/* vcpu id table */
@@ -136,7 +136,7 @@ void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 
 
-#ifdef CONFIG_KVM_E500
+#ifdef CONFIG_KVM_E500V2
 unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500,
 				 unsigned int as, unsigned int gid,
 				 unsigned int pr, int avoid_recursion);
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index e232bb4..279e10a 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -156,7 +156,7 @@ static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
 	}
 }
 
-#ifdef CONFIG_KVM_E500
+#ifdef CONFIG_KVM_E500V2
 void kvmppc_map_magic(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 14bf8d5..58ad860 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -79,7 +79,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 	}
 	case HC_VENDOR_KVM | KVM_HC_FEATURES:
 		r = HC_EV_SUCCESS;
-#if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500)
+#if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500V2)
 		/* XXX Missing magic page on 44x */
 		r2 |= (1 << KVM_FEATURE_MAGIC_PAGE);
 #endif
@@ -235,7 +235,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PPC_PAIRED_SINGLES:
 	case KVM_CAP_PPC_OSI:
 	case KVM_CAP_PPC_GET_PVINFO:
-#if defined(CONFIG_KVM_E500) || defined(CONFIG_KVM_E500MC)
+#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
 	case KVM_CAP_SW_TLB:
 #endif
 		r = 1;
@@ -629,7 +629,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 		r = 0;
 		vcpu->arch.papr_enabled = true;
 		break;
-#if defined(CONFIG_KVM_E500) || defined(CONFIG_KVM_E500MC)
+#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
 	case KVM_CAP_SW_TLB: {
 		struct kvm_config_tlb cfg;
 		void __user *user_ptr = (void __user *)(uintptr_t)cap->args[0];
@@ -706,7 +706,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 
-#if defined(CONFIG_KVM_E500) || defined(CONFIG_KVM_E500MC)
+#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
 	case KVM_DIRTY_TLB: {
 		struct kvm_dirty_tlb dirty;
 		r = -EFAULT;
-- 
cgit v0.10.2


From b2e19b20708edb6413dea38e6285a6e546dce06b Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Wed, 15 Feb 2012 23:41:20 +0000
Subject: KVM: PPC: make e500v2 kvm and e500mc cpu mutually exclusive

We can't run e500v2 kvm on e500mc kernels, so indicate that by
making the 2 options mutually exclusive in kconfig.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 44a998d..f4dacb9 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -120,7 +120,7 @@ config KVM_EXIT_TIMING
 
 config KVM_E500V2
 	bool "KVM support for PowerPC E500v2 processors"
-	depends on EXPERIMENTAL && E500
+	depends on EXPERIMENTAL && E500 && !PPC_E500MC
 	select KVM
 	select KVM_MMIO
 	---help---
-- 
cgit v0.10.2


From acab05290696db0a5431a9ad171be649ab56e87b Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 16 Feb 2012 13:07:32 +0000
Subject: KVM: PPC: booke: remove leftover debugging

The e500mc patches left some debug code in that we don't need. Remove it.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 9fcc760..17d5318 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -469,11 +469,6 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		return -EINVAL;
 	}
 
-	if (!current->thread.kvm_vcpu) {
-		WARN(1, "no vcpu\n");
-		return -EPERM;
-	}
-
 	local_irq_disable();
 
 	kvmppc_core_prepare_to_enter(vcpu);
-- 
cgit v0.10.2


From d1ff54992d3008f4253ab3176913bb85d770e935 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 16 Feb 2012 13:24:03 +0000
Subject: KVM: PPC: booke: deliver program int on emulation failure

When we fail to emulate an instruction for the guest, we better go in and
tell it that we failed to emulate it, by throwing an illegal instruction
exception.

Please beware that we basically never get around to telling the guest that
we failed thanks to the debugging code right above it. If user space however
decides that it wants to ignore the debug, we would at least do "the right
thing" afterwards.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 17d5318..9979be1 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -545,13 +545,13 @@ static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		return RESUME_HOST;
 
 	case EMULATE_FAIL:
-		/* XXX Deliver Program interrupt to guest. */
 		printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
 		       __func__, vcpu->arch.pc, vcpu->arch.last_inst);
 		/* For debugging, encode the failing instruction and
 		 * report it to userspace. */
 		run->hw.hardware_exit_reason = ~0ULL << 32;
 		run->hw.hardware_exit_reason |= vcpu->arch.last_inst;
+		kvmppc_core_queue_program(vcpu, ESR_PIL);
 		return RESUME_HOST;
 
 	default:
-- 
cgit v0.10.2


From a8e4ef841429d338b8700998afb3dfc18c1f25d9 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 16 Feb 2012 14:07:37 +0000
Subject: KVM: PPC: booke: rework rescheduling checks

Instead of checking whether we should reschedule only when we exited
due to an interrupt, let's always check before entering the guest back
again. This gets the target more in line with the other archs.

Also while at it, generalize the whole thing so that eventually we could
have a single kvmppc_prepare_to_enter function for all ppc targets that
does signal and reschedule checking for us.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index e709975..7f0a3da 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -95,7 +95,7 @@ extern int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
 extern void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu);
 
-extern void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu);
+extern int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu);
 extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags);
 extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 7d54f4e..c8ead7b 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -258,7 +258,7 @@ static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority)
 	return true;
 }
 
-void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
+int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
 {
 	unsigned long *pending = &vcpu->arch.pending_exceptions;
 	unsigned long old_pending = vcpu->arch.pending_exceptions;
@@ -283,6 +283,8 @@ void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
 
 	/* Tell the guest about our interrupt status */
 	kvmppc_update_int_pending(vcpu, *pending, old_pending);
+
+	return 0;
 }
 
 pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 9979be1..3da0e42 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -439,8 +439,9 @@ static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
 }
 
 /* Check pending exceptions and deliver one, if possible. */
-void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
+int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
 {
+	int r = 0;
 	WARN_ON_ONCE(!irqs_disabled());
 
 	kvmppc_core_check_exceptions(vcpu);
@@ -451,8 +452,46 @@ void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
 		local_irq_disable();
 
 		kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
-		kvmppc_core_check_exceptions(vcpu);
+		r = 1;
 	};
+
+	return r;
+}
+
+/*
+ * Common checks before entering the guest world.  Call with interrupts
+ * disabled.
+ *
+ * returns !0 if a signal is pending and check_signal is true
+ */
+static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu, bool check_signal)
+{
+	int r = 0;
+
+	WARN_ON_ONCE(!irqs_disabled());
+	while (true) {
+		if (need_resched()) {
+			local_irq_enable();
+			cond_resched();
+			local_irq_disable();
+			continue;
+		}
+
+		if (check_signal && signal_pending(current)) {
+			r = 1;
+			break;
+		}
+
+		if (kvmppc_core_prepare_to_enter(vcpu)) {
+			/* interrupts got enabled in between, so we
+			   are back at square 1 */
+			continue;
+		}
+
+		break;
+	}
+
+	return r;
 }
 
 int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
@@ -470,10 +509,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	}
 
 	local_irq_disable();
-
-	kvmppc_core_prepare_to_enter(vcpu);
-
-	if (signal_pending(current)) {
+	if (kvmppc_prepare_to_enter(vcpu, true)) {
 		kvm_run->exit_reason = KVM_EXIT_INTR;
 		ret = -EINTR;
 		goto out;
@@ -598,25 +634,21 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	switch (exit_nr) {
 	case BOOKE_INTERRUPT_MACHINE_CHECK:
-		kvm_resched(vcpu);
 		r = RESUME_GUEST;
 		break;
 
 	case BOOKE_INTERRUPT_EXTERNAL:
 		kvmppc_account_exit(vcpu, EXT_INTR_EXITS);
-		kvm_resched(vcpu);
 		r = RESUME_GUEST;
 		break;
 
 	case BOOKE_INTERRUPT_DECREMENTER:
 		kvmppc_account_exit(vcpu, DEC_EXITS);
-		kvm_resched(vcpu);
 		r = RESUME_GUEST;
 		break;
 
 	case BOOKE_INTERRUPT_DOORBELL:
 		kvmppc_account_exit(vcpu, DBELL_EXITS);
-		kvm_resched(vcpu);
 		r = RESUME_GUEST;
 		break;
 
@@ -865,19 +897,15 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		BUG();
 	}
 
+	/*
+	 * To avoid clobbering exit_reason, only check for signals if we
+	 * aren't already exiting to userspace for some other reason.
+	 */
 	local_irq_disable();
-
-	kvmppc_core_prepare_to_enter(vcpu);
-
-	if (!(r & RESUME_HOST)) {
-		/* To avoid clobbering exit_reason, only check for signals if
-		 * we aren't already exiting to userspace for some other
-		 * reason. */
-		if (signal_pending(current)) {
-			run->exit_reason = KVM_EXIT_INTR;
-			r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
-			kvmppc_account_exit(vcpu, SIGNAL_EXITS);
-		}
+	if (kvmppc_prepare_to_enter(vcpu, !(r & RESUME_HOST))) {
+		run->exit_reason = KVM_EXIT_INTR;
+		r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
+		kvmppc_account_exit(vcpu, SIGNAL_EXITS);
 	}
 
 	return r;
-- 
cgit v0.10.2


From 8b3a00fcd3c9ea4e2cbae12af3cd8c9d7d1e109a Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 16 Feb 2012 14:12:46 +0000
Subject: KVM: PPC: booke: BOOKE_IRQPRIO_MAX is n+1

The semantics of BOOKE_IRQPRIO_MAX changed to denote the highest available
irqprio + 1, so let's reflect that in the code too.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 3da0e42..11b0625 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -425,7 +425,7 @@ static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
 	}
 
 	priority = __ffs(*pending);
-	while (priority <= BOOKE_IRQPRIO_MAX) {
+	while (priority < BOOKE_IRQPRIO_MAX) {
 		if (kvmppc_booke_irqprio_deliver(vcpu, priority))
 			break;
 
-- 
cgit v0.10.2


From 73ede8d32be6adc298fe3c2716e77c352c504c8c Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 16 Feb 2012 14:34:55 +0000
Subject: KVM: PPC: bookehv: fix exit timing

When using exit timing stats, we clobber r9 in the NEED_EMU case,
so better move that part down a few lines and fix it that way.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index e9e7350..e4a1172 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -83,10 +83,6 @@
 	stw	r10, VCPU_GUEST_PID(r4)
 	mtspr	SPRN_PID, r8
 
-	.if	\flags & NEED_EMU
-	lwz	r9, VCPU_KVM(r4)
-	.endif
-
 #ifdef CONFIG_KVM_EXIT_TIMING
 	/* save exit time */
 1:	mfspr	r7, SPRN_TBRU
@@ -98,6 +94,10 @@
 	PPC_STL	r9, VCPU_TIMING_EXIT_TBU(r4)
 #endif
 
+	.if	\flags & NEED_EMU
+	lwz	r9, VCPU_KVM(r4)
+	.endif
+
 	oris	r8, r6, MSR_CE@h
 #ifndef CONFIG_64BIT
 	stw	r6, (VCPU_SHARED_MSR + 4)(r11)
-- 
cgit v0.10.2


From 8764b46ee3873b685a7823fc79388bae7d19e51e Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 16 Feb 2012 14:40:26 +0000
Subject: KVM: PPC: bookehv: remove negation for CONFIG_64BIT

Instead if doing

  #ifndef CONFIG_64BIT
  ...
  #else
  ...
  #endif

we should rather do

  #ifdef CONFIG_64BIT
  ...
  #else
  ...
  #endif

which is a lot easier to read. Change the bookehv implementation to
stick with this rule.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index e4a1172..af771de 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -99,10 +99,10 @@
 	.endif
 
 	oris	r8, r6, MSR_CE@h
-#ifndef CONFIG_64BIT
-	stw	r6, (VCPU_SHARED_MSR + 4)(r11)
-#else
+#ifdef CONFIG_64BIT
 	std	r6, (VCPU_SHARED_MSR)(r11)
+#else
+	stw	r6, (VCPU_SHARED_MSR + 4)(r11)
 #endif
 	ori	r8, r8, MSR_ME | MSR_RI
 	PPC_STL	r5, VCPU_PC(r4)
@@ -344,10 +344,10 @@ _GLOBAL(kvmppc_resume_host)
 	stw	r5, VCPU_SHARED_MAS0(r11)
 	mfspr	r7, SPRN_MAS2
 	stw	r6, VCPU_SHARED_MAS1(r11)
-#ifndef CONFIG_64BIT
-	stw	r7, (VCPU_SHARED_MAS2 + 4)(r11)
-#else
+#ifdef CONFIG_64BIT
 	std	r7, (VCPU_SHARED_MAS2)(r11)
+#else
+	stw	r7, (VCPU_SHARED_MAS2 + 4)(r11)
 #endif
 	mfspr	r5, SPRN_MAS3
 	mfspr	r6, SPRN_MAS4
@@ -530,10 +530,10 @@ lightweight_exit:
 	stw	r3, VCPU_HOST_MAS6(r4)
 	lwz	r3, VCPU_SHARED_MAS0(r11)
 	lwz	r5, VCPU_SHARED_MAS1(r11)
-#ifndef CONFIG_64BIT
-	lwz	r6, (VCPU_SHARED_MAS2 + 4)(r11)
-#else
+#ifdef CONFIG_64BIT
 	ld	r6, (VCPU_SHARED_MAS2)(r11)
+#else
+	lwz	r6, (VCPU_SHARED_MAS2 + 4)(r11)
 #endif
 	lwz	r7, VCPU_SHARED_MAS7_3+4(r11)
 	lwz	r8, VCPU_SHARED_MAS4(r11)
@@ -572,10 +572,10 @@ lightweight_exit:
 	PPC_LL	r6, VCPU_CTR(r4)
 	PPC_LL	r7, VCPU_CR(r4)
 	PPC_LL	r8, VCPU_PC(r4)
-#ifndef CONFIG_64BIT
-	lwz	r9, (VCPU_SHARED_MSR + 4)(r11)
-#else
+#ifdef CONFIG_64BIT
 	ld	r9, (VCPU_SHARED_MSR)(r11)
+#else
+	lwz	r9, (VCPU_SHARED_MSR + 4)(r11)
 #endif
 	PPC_LL	r0, VCPU_GPR(r0)(r4)
 	PPC_LL	r1, VCPU_GPR(r1)(r4)
-- 
cgit v0.10.2


From 8a3da55784cf2aea8ef6acdd1f50e5ad52f76574 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 16 Feb 2012 14:45:33 +0000
Subject: KVM: PPC: bookehv: remove SET_VCPU

The SET_VCPU macro is a leftover from times when the vcpu struct wasn't
stored in the thread on vcpu_load/put. It's not needed anymore. Remove it.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index af771de..dfa606d 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -35,9 +35,6 @@
 #define GET_VCPU(vcpu, thread)	\
 	PPC_LL	vcpu, THREAD_KVM_VCPU(thread)
 
-#define SET_VCPU(vcpu)		\
-        PPC_STL	vcpu, (THREAD + THREAD_KVM_VCPU)(r2)
-
 #define LONGBYTES		(BITS_PER_LONG / 8)
 
 #define VCPU_GPR(n)     	(VCPU_GPRS + (n * LONGBYTES))
@@ -517,11 +514,6 @@ lightweight_exit:
 	lwz	r3, VCPU_GUEST_PID(r4)
 	mtspr	SPRN_PID, r3
 
-	/* Save vcpu pointer for the exception handlers
-	 * must be done before loading guest r2.
-	 */
-//	SET_VCPU(r4)
-
 	PPC_LL	r11, VCPU_SHARED(r4)
 	/* Save host mas4 and mas6 and load guest MAS registers */
 	mfspr	r3, SPRN_MAS4
-- 
cgit v0.10.2


From e9ba39c1f3dff93efddacbd4569ada05633e2a9b Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 16 Feb 2012 14:53:04 +0000
Subject: KVM: PPC: bookehv: disable MAS register updates early

We need to make sure that no MAS updates happen automatically while we
have the guest MAS registers loaded. So move the disabling code a bit
higher up so that it covers the full time we have guest values in MAS
registers.

The race this patch fixes should never occur, but it makes the code a
bit more logical to do it this way around.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index dfa606d..3a1db90 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -358,6 +358,7 @@ _GLOBAL(kvmppc_resume_host)
 	mtspr	SPRN_MAS4, r6
 	stw	r5, VCPU_SHARED_MAS7_3+0(r11)
 	mtspr	SPRN_MAS6, r8
+	/* Enable MAS register updates via exception */
 	mfspr	r3, SPRN_EPCR
 	rlwinm	r3, r3, 0, ~SPRN_EPCR_DMIUH
 	mtspr	SPRN_EPCR, r3
@@ -515,6 +516,11 @@ lightweight_exit:
 	mtspr	SPRN_PID, r3
 
 	PPC_LL	r11, VCPU_SHARED(r4)
+	/* Disable MAS register updates via exception */
+	mfspr	r3, SPRN_EPCR
+	oris	r3, r3, SPRN_EPCR_DMIUH@h
+	mtspr	SPRN_EPCR, r3
+	isync
 	/* Save host mas4 and mas6 and load guest MAS registers */
 	mfspr	r3, SPRN_MAS4
 	stw	r3, VCPU_HOST_MAS4(r4)
@@ -538,10 +544,6 @@ lightweight_exit:
 	lwz	r5, VCPU_SHARED_MAS7_3+0(r11)
 	mtspr	SPRN_MAS6, r3
 	mtspr	SPRN_MAS7, r5
-	/* Disable MAS register updates via exception */
-	mfspr	r3, SPRN_EPCR
-	oris	r3, r3, SPRN_EPCR_DMIUH@h
-	mtspr	SPRN_EPCR, r3
 
 	/*
 	 * Host interrupt handlers may have clobbered these guest-readable
-- 
cgit v0.10.2


From 5fd8505ea4b7456d57eacefbf00b669f15f5f0c0 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 16 Feb 2012 15:04:54 +0000
Subject: KVM: PPC: bookehv: add comment about shadow_msr

For BookE HV the guest visible MSR is shared->msr and is identical to
the MSR that is in use while the guest is running, because we can't trap
reads from/to MSR.

So shadow_msr is unused there. Indicate that with a comment.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e645623..97ecdaf 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -386,6 +386,7 @@ struct kvm_vcpu_arch {
 #endif
 	u32 vrsave; /* also USPRG0 */
 	u32 mmucr;
+	/* shadow_msr is unused for BookE HV */
 	ulong shadow_msr;
 	ulong csrr0;
 	ulong csrr1;
-- 
cgit v0.10.2


From c35c9d84cf141ebf05e2e481c0faccedc4f8f7ff Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 20 Feb 2012 12:21:18 +0100
Subject: KVM: PPC: booke: Readd debug abort code for machine check

When during guest execution we get a machine check interrupt, we don't
know how to handle it yet. So let's add the error printing code back
again that we dropped accidently earlier and tell user space that something
went really wrong.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 11b0625..af02d9d 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -634,7 +634,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	switch (exit_nr) {
 	case BOOKE_INTERRUPT_MACHINE_CHECK:
-		r = RESUME_GUEST;
+		printk("MACHINE CHECK: %lx\n", mfspr(SPRN_MCSR));
+		kvmppc_dump_vcpu(vcpu);
+		/* For debugging, send invalid exit reason to user space */
+		run->hw.hardware_exit_reason = ~1ULL << 32;
+		run->hw.hardware_exit_reason |= mfspr(SPRN_MCSR);
+		r = RESUME_HOST;
 		break;
 
 	case BOOKE_INTERRUPT_EXTERNAL:
-- 
cgit v0.10.2


From 0268597c811ccf55e0bda20907c1a9e6001365cf Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 20 Feb 2012 12:33:22 +0100
Subject: KVM: PPC: booke: add GS documentation for program interrupt

The comment for program interrupts triggered when using bookehv was
misleading. Update it to mention why MSR_GS indicates that we have
to inject an interrupt into the guest again, not emulate it.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index af02d9d..7df3f3a 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -685,8 +685,14 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	case BOOKE_INTERRUPT_PROGRAM:
 		if (vcpu->arch.shared->msr & (MSR_PR | MSR_GS)) {
-			/* Program traps generated by user-level software must be handled
-			 * by the guest kernel. */
+			/*
+			 * Program traps generated by user-level software must
+			 * be handled by the guest kernel.
+			 *
+			 * In GS mode, hypervisor privileged instructions trap
+			 * on BOOKE_INTERRUPT_HV_PRIV, not here, so these are
+			 * actual program interrupts, handled by the guest.
+			 */
 			kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr);
 			r = RESUME_GUEST;
 			kvmppc_account_exit(vcpu, USR_PR_INST);
-- 
cgit v0.10.2


From 55cdf08b9a6c99f1335fa6ad42372dcfb3715b56 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 20 Feb 2012 12:39:36 +0100
Subject: KVM: PPC: bookehv: remove unused code

There was some unused code in the exit code path that must have been
a leftover from earlier iterations. While it did no harm, it's superfluous
and thus should be removed.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index 3a1db90..2d1f56c 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -91,10 +91,6 @@
 	PPC_STL	r9, VCPU_TIMING_EXIT_TBU(r4)
 #endif
 
-	.if	\flags & NEED_EMU
-	lwz	r9, VCPU_KVM(r4)
-	.endif
-
 	oris	r8, r6, MSR_CE@h
 #ifdef CONFIG_64BIT
 	std	r6, (VCPU_SHARED_MSR)(r11)
@@ -112,9 +108,6 @@
 	 * appropriate for the exception type).
 	 */
 	cmpw	r6, r8
-	.if	\flags & NEED_EMU
-	lwz	r9, KVM_LPID(r9)
-	.endif
 	beq	1f
 	mfmsr	r7
 	.if	\srr0 != SPRN_MCSRR0 && \srr0 != SPRN_CSRR0
-- 
cgit v0.10.2


From c6b3733bef2ffece49336dba7220fefdae5fa908 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 20 Feb 2012 17:48:47 +0100
Subject: KVM: PPC: e500: fix typo in tlb code

The tlbncfg registers should be populated with their respective TLB's
values. Fix the obvious typo.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 279e10a..e05232b 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -1268,8 +1268,8 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 
 	vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) &
 			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
-	vcpu->arch.tlbcfg[0] |= vcpu_e500->gtlb_params[1].entries;
-	vcpu->arch.tlbcfg[0] |=
+	vcpu->arch.tlbcfg[1] |= vcpu_e500->gtlb_params[1].entries;
+	vcpu->arch.tlbcfg[1] |=
 		vcpu_e500->gtlb_params[1].ways << TLBnCFG_ASSOC_SHIFT;
 
 	return 0;
-- 
cgit v0.10.2


From 95f2e921446dbc2e9a785734049ee349a67434bd Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 20 Feb 2012 22:45:12 +0100
Subject: KVM: PPC: booke: Support perfmon interrupts

When during guest context we get a performance monitor interrupt, we
currently bail out and oops. Let's route it to its correct handler
instead.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 7df3f3a..ee39c8a 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -679,6 +679,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		r = RESUME_GUEST;
 		break;
 
+	case BOOKE_INTERRUPT_PERFORMANCE_MONITOR:
+		r = RESUME_GUEST;
+		break;
+
 	case BOOKE_INTERRUPT_HV_PRIV:
 		r = emulation_exit(run, vcpu);
 		break;
-- 
cgit v0.10.2


From 4e642ccbd6a3f1410155c7700f54b56b6c7df9a2 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 20 Feb 2012 23:57:26 +0100
Subject: KVM: PPC: booke: expose good state on irq reinject

When reinjecting an interrupt into the host interrupt handler after we're
back in host kernel land, we need to tell the kernel where the interrupt
happened. We can't tell it that we were in guest state, because that might
lead to random code walking host addresses. So instead, we tell it that
we came from the interrupt reinject code.

This helps getting reasonable numbers out of perf.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index ee39c8a..488936b 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -595,37 +595,63 @@ static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	}
 }
 
-/**
- * kvmppc_handle_exit
- *
- * Return value is in the form (errcode<<2 | RESUME_FLAG_HOST | RESUME_FLAG_NV)
- */
-int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                       unsigned int exit_nr)
+static void kvmppc_fill_pt_regs(struct pt_regs *regs)
 {
-	int r = RESUME_HOST;
+	ulong r1, ip, msr, lr;
+
+	asm("mr %0, 1" : "=r"(r1));
+	asm("mflr %0" : "=r"(lr));
+	asm("mfmsr %0" : "=r"(msr));
+	asm("bl 1f; 1: mflr %0" : "=r"(ip));
+
+	memset(regs, 0, sizeof(*regs));
+	regs->gpr[1] = r1;
+	regs->nip = ip;
+	regs->msr = msr;
+	regs->link = lr;
+}
 
-	/* update before a new last_exit_type is rewritten */
-	kvmppc_update_timing_stats(vcpu);
+static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu,
+				     unsigned int exit_nr)
+{
+	struct pt_regs regs;
 
 	switch (exit_nr) {
 	case BOOKE_INTERRUPT_EXTERNAL:
-		do_IRQ(current->thread.regs);
+		kvmppc_fill_pt_regs(&regs);
+		do_IRQ(&regs);
 		break;
-
 	case BOOKE_INTERRUPT_DECREMENTER:
-		timer_interrupt(current->thread.regs);
+		kvmppc_fill_pt_regs(&regs);
+		timer_interrupt(&regs);
 		break;
-
 #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3E_64)
 	case BOOKE_INTERRUPT_DOORBELL:
-		doorbell_exception(current->thread.regs);
+		kvmppc_fill_pt_regs(&regs);
+		doorbell_exception(&regs);
 		break;
 #endif
 	case BOOKE_INTERRUPT_MACHINE_CHECK:
 		/* FIXME */
 		break;
 	}
+}
+
+/**
+ * kvmppc_handle_exit
+ *
+ * Return value is in the form (errcode<<2 | RESUME_FLAG_HOST | RESUME_FLAG_NV)
+ */
+int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                       unsigned int exit_nr)
+{
+	int r = RESUME_HOST;
+
+	/* update before a new last_exit_type is rewritten */
+	kvmppc_update_timing_stats(vcpu);
+
+	/* restart interrupts if they were meant for the host */
+	kvmppc_restart_interrupt(vcpu, exit_nr);
 
 	local_irq_enable();
 
-- 
cgit v0.10.2


From 7cc1e8ee78f469ecff8aa29465325f1e4c5e1b5f Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Wed, 22 Feb 2012 16:26:34 +0100
Subject: KVM: PPC: booke: Reinject performance monitor interrupts

When we get a performance monitor interrupt, we need to make sure that
the host receives it. So reinject it like we reinject the other host
destined interrupts.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index 51010bf..c9aac24 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -33,6 +33,7 @@
 extern void __replay_interrupt(unsigned int vector);
 
 extern void timer_interrupt(struct pt_regs *);
+extern void performance_monitor_exception(struct pt_regs *regs);
 
 #ifdef CONFIG_PPC64
 #include <asm/paca.h>
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 488936b..8e8aa4c 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -634,6 +634,10 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu,
 	case BOOKE_INTERRUPT_MACHINE_CHECK:
 		/* FIXME */
 		break;
+	case BOOKE_INTERRUPT_PERFORMANCE_MONITOR:
+		kvmppc_fill_pt_regs(&regs);
+		performance_monitor_exception(&regs);
+		break;
 	}
 }
 
-- 
cgit v0.10.2


From 03660ba27020250eae0b5a2722e0c7bec4968c3c Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 28 Feb 2012 12:00:41 +0100
Subject: KVM: PPC: Booke: only prepare to enter when we enter

So far, we've always called prepare_to_enter even when all we did was return
to the host. This patch changes that semantic to only call prepare_to_enter
when we actually want to get back into the guest.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 8e8aa4c..9f27258 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -464,7 +464,7 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
  *
  * returns !0 if a signal is pending and check_signal is true
  */
-static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu, bool check_signal)
+static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
 {
 	int r = 0;
 
@@ -477,7 +477,7 @@ static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu, bool check_signal)
 			continue;
 		}
 
-		if (check_signal && signal_pending(current)) {
+		if (signal_pending(current)) {
 			r = 1;
 			break;
 		}
@@ -509,7 +509,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	}
 
 	local_irq_disable();
-	if (kvmppc_prepare_to_enter(vcpu, true)) {
+	if (kvmppc_prepare_to_enter(vcpu)) {
 		kvm_run->exit_reason = KVM_EXIT_INTR;
 		ret = -EINTR;
 		goto out;
@@ -946,11 +946,13 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	 * To avoid clobbering exit_reason, only check for signals if we
 	 * aren't already exiting to userspace for some other reason.
 	 */
-	local_irq_disable();
-	if (kvmppc_prepare_to_enter(vcpu, !(r & RESUME_HOST))) {
-		run->exit_reason = KVM_EXIT_INTR;
-		r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
-		kvmppc_account_exit(vcpu, SIGNAL_EXITS);
+	if (!(r & RESUME_HOST)) {
+		local_irq_disable();
+		if (kvmppc_prepare_to_enter(vcpu)) {
+			run->exit_reason = KVM_EXIT_INTR;
+			r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
+			kvmppc_account_exit(vcpu, SIGNAL_EXITS);
+		}
 	}
 
 	return r;
-- 
cgit v0.10.2


From 3aaefef200f618dc455cdf18053a7aeb262b5a11 Mon Sep 17 00:00:00 2001
From: Matt Evans <matt@ozlabs.org>
Date: Mon, 30 Jan 2012 20:25:31 +0000
Subject: KVM: PPC: Book3s: PR: Add SPAPR H_BULK_REMOVE support

SPAPR support includes various in-kernel hypercalls, improving performance
by cutting out the exit to userspace.  H_BULK_REMOVE is implemented in this
patch.

Signed-off-by: Matt Evans <matt@ozlabs.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index b958932..6d1bfe2 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -98,6 +98,83 @@ static int kvmppc_h_pr_remove(struct kvm_vcpu *vcpu)
 	return EMULATE_DONE;
 }
 
+/* Request defs for kvmppc_h_pr_bulk_remove() */
+#define H_BULK_REMOVE_TYPE             0xc000000000000000ULL
+#define   H_BULK_REMOVE_REQUEST        0x4000000000000000ULL
+#define   H_BULK_REMOVE_RESPONSE       0x8000000000000000ULL
+#define   H_BULK_REMOVE_END            0xc000000000000000ULL
+#define H_BULK_REMOVE_CODE             0x3000000000000000ULL
+#define   H_BULK_REMOVE_SUCCESS        0x0000000000000000ULL
+#define   H_BULK_REMOVE_NOT_FOUND      0x1000000000000000ULL
+#define   H_BULK_REMOVE_PARM           0x2000000000000000ULL
+#define   H_BULK_REMOVE_HW             0x3000000000000000ULL
+#define H_BULK_REMOVE_RC               0x0c00000000000000ULL
+#define H_BULK_REMOVE_FLAGS            0x0300000000000000ULL
+#define   H_BULK_REMOVE_ABSOLUTE       0x0000000000000000ULL
+#define   H_BULK_REMOVE_ANDCOND        0x0100000000000000ULL
+#define   H_BULK_REMOVE_AVPN           0x0200000000000000ULL
+#define H_BULK_REMOVE_PTEX             0x00ffffffffffffffULL
+#define H_BULK_REMOVE_MAX_BATCH        4
+
+static int kvmppc_h_pr_bulk_remove(struct kvm_vcpu *vcpu)
+{
+	int i;
+	int paramnr = 4;
+	int ret = H_SUCCESS;
+
+	for (i = 0; i < H_BULK_REMOVE_MAX_BATCH; i++) {
+		unsigned long tsh = kvmppc_get_gpr(vcpu, paramnr+(2*i));
+		unsigned long tsl = kvmppc_get_gpr(vcpu, paramnr+(2*i)+1);
+		unsigned long pteg, rb, flags;
+		unsigned long pte[2];
+		unsigned long v = 0;
+
+		if ((tsh & H_BULK_REMOVE_TYPE) == H_BULK_REMOVE_END) {
+			break; /* Exit success */
+		} else if ((tsh & H_BULK_REMOVE_TYPE) !=
+			   H_BULK_REMOVE_REQUEST) {
+			ret = H_PARAMETER;
+			break; /* Exit fail */
+		}
+
+		tsh &= H_BULK_REMOVE_PTEX | H_BULK_REMOVE_FLAGS;
+		tsh |= H_BULK_REMOVE_RESPONSE;
+
+		if ((tsh & H_BULK_REMOVE_ANDCOND) &&
+		    (tsh & H_BULK_REMOVE_AVPN)) {
+			tsh |= H_BULK_REMOVE_PARM;
+			kvmppc_set_gpr(vcpu, paramnr+(2*i), tsh);
+			ret = H_PARAMETER;
+			break; /* Exit fail */
+		}
+
+		pteg = get_pteg_addr(vcpu, tsh & H_BULK_REMOVE_PTEX);
+		copy_from_user(pte, (void __user *)pteg, sizeof(pte));
+
+		/* tsl = AVPN */
+		flags = (tsh & H_BULK_REMOVE_FLAGS) >> 26;
+
+		if ((pte[0] & HPTE_V_VALID) == 0 ||
+		    ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != tsl) ||
+		    ((flags & H_ANDCOND) && (pte[0] & tsl) != 0)) {
+			tsh |= H_BULK_REMOVE_NOT_FOUND;
+		} else {
+			/* Splat the pteg in (userland) hpt */
+			copy_to_user((void __user *)pteg, &v, sizeof(v));
+
+			rb = compute_tlbie_rb(pte[0], pte[1],
+					      tsh & H_BULK_REMOVE_PTEX);
+			vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false);
+			tsh |= H_BULK_REMOVE_SUCCESS;
+			tsh |= (pte[1] & (HPTE_R_C | HPTE_R_R)) << 43;
+		}
+		kvmppc_set_gpr(vcpu, paramnr+(2*i), tsh);
+	}
+	kvmppc_set_gpr(vcpu, 3, ret);
+
+	return EMULATE_DONE;
+}
+
 static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu)
 {
 	unsigned long flags = kvmppc_get_gpr(vcpu, 4);
@@ -144,10 +221,7 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 	case H_PROTECT:
 		return kvmppc_h_pr_protect(vcpu);
 	case H_BULK_REMOVE:
-		/* We just flush all PTEs, so user space can
-		   handle the HPT modifications */
-		kvmppc_mmu_pte_flush(vcpu, 0, 0);
-		break;
+		return kvmppc_h_pr_bulk_remove(vcpu);
 	case H_CEDE:
 		kvm_vcpu_block(vcpu);
 		vcpu->stat.halt_wakeup++;
-- 
cgit v0.10.2


From f6127716c346c73ab1513edee53231800188c5ba Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 5 Mar 2012 16:00:28 +0100
Subject: KVM: PPC: Save/Restore CR over vcpu_run

On PPC, CR2-CR4 are nonvolatile, thus have to be saved across function calls.
We didn't respect that for any architecture until Paul spotted it in his
patch for Book3S-HV. This patch saves/restores CR for all KVM capable PPC hosts.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index 2d1f56c..57e2fa4 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -49,7 +49,8 @@
  * kernel with the -ffixed-r2 gcc option.
  */
 #define HOST_R2         (3 * LONGBYTES)
-#define HOST_NV_GPRS    (4 * LONGBYTES)
+#define HOST_CR         (4 * LONGBYTES)
+#define HOST_NV_GPRS    (5 * LONGBYTES)
 #define HOST_NV_GPR(n)  (HOST_NV_GPRS + ((n - 14) * LONGBYTES))
 #define HOST_MIN_STACK_SIZE (HOST_NV_GPR(31) + LONGBYTES)
 #define HOST_STACK_SIZE ((HOST_MIN_STACK_SIZE + 15) & ~15) /* Align. */
@@ -396,6 +397,7 @@ skip_nv_load:
 heavyweight_exit:
 	/* Not returning to guest. */
 	PPC_LL	r5, HOST_STACK_LR(r1)
+	lwz	r6, HOST_CR(r1)
 
 	/*
 	 * We already saved guest volatile register state; now save the
@@ -442,6 +444,7 @@ heavyweight_exit:
 
 	/* Return to kvm_vcpu_run(). */
 	mtlr	r5
+	mtcr	r6
 	addi	r1, r1, HOST_STACK_SIZE
 	/* r3 still contains the return code from kvmppc_handle_exit(). */
 	blr
@@ -457,8 +460,11 @@ _GLOBAL(__kvmppc_vcpu_run)
 	/* Save host state to stack. */
 	PPC_STL	r3, HOST_RUN(r1)
 	mflr	r3
+	mfcr	r5
 	PPC_STL	r3, HOST_STACK_LR(r1)
 
+	stw	r5, HOST_CR(r1)
+
 	/* Save host non-volatile register state to stack. */
 	PPC_STL	r14, HOST_NV_GPR(r14)(r1)
 	PPC_STL	r15, HOST_NV_GPR(r15)(r1)
-- 
cgit v0.10.2


From f0888f70151c7f53de2b45ee20ff1905837943e8 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 3 Feb 2012 00:54:17 +0000
Subject: KVM: PPC: Book3S HV: Make secondary threads more robust against stray
 IPIs

Currently on POWER7, if we are running the guest on a core and we don't
need all the hardware threads, we do nothing to ensure that the unused
threads aren't executing in the kernel (other than checking that they
are offline).  We just assume they're napping and we don't do anything
to stop them trying to enter the kernel while the guest is running.
This means that a stray IPI can wake up the hardware thread and it will
then try to enter the kernel, but since the core is in guest context,
it will execute code from the guest in hypervisor mode once it turns the
MMU on, which tends to lead to crashes or hangs in the host.

This fixes the problem by adding two new one-byte flags in the
kvmppc_host_state structure in the PACA which are used to interlock
between the primary thread and the unused secondary threads when entering
the guest.  With these flags, the primary thread can ensure that the
unused secondaries are not already in kernel mode (i.e. handling a stray
IPI) and then indicate that they should not try to enter the kernel
if they do get woken for any reason.  Instead they will go into KVM code,
find that there is no vcpu to run, acknowledge and clear the IPI and go
back to nap mode.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 1f2f5b6..88609b2 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -79,6 +79,9 @@ struct kvmppc_host_state {
 	u8 napping;
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
+	u8 hwthread_req;
+	u8 hwthread_state;
+
 	struct kvm_vcpu *kvm_vcpu;
 	struct kvmppc_vcore *kvm_vcore;
 	unsigned long xics_phys;
@@ -122,4 +125,9 @@ struct kvmppc_book3s_shadow_vcpu {
 
 #endif /*__ASSEMBLY__ */
 
+/* Values for kvm_state */
+#define KVM_HWTHREAD_IN_KERNEL	0
+#define KVM_HWTHREAD_IN_NAP	1
+#define KVM_HWTHREAD_IN_KVM	2
+
 #endif /* __ASM_KVM_BOOK3S_ASM_H__ */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index bbede58..2abcf7d 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -540,6 +540,8 @@ int main(void)
 	HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
 	HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5);
 	HSTATE_FIELD(HSTATE_NAPPING, napping);
+	HSTATE_FIELD(HSTATE_HWTHREAD_REQ, hwthread_req);
+	HSTATE_FIELD(HSTATE_HWTHREAD_STATE, hwthread_state);
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index cb705fd..8829b10 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -63,11 +63,13 @@ BEGIN_FTR_SECTION
 	GET_PACA(r13)
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
-	lbz	r0,PACAPROCSTART(r13)
-	cmpwi	r0,0x80
-	bne	1f
-	li	r0,1
-	stb	r0,PACAPROCSTART(r13)
+	li	r0,KVM_HWTHREAD_IN_KERNEL
+	stb	r0,HSTATE_HWTHREAD_STATE(r13)
+	/* Order setting hwthread_state vs. testing hwthread_req */
+	sync
+	lbz	r0,HSTATE_HWTHREAD_REQ(r13)
+	cmpwi	r0,0
+	beq	1f
 	b	kvm_start_guest
 1:
 #endif
diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S
index 0cdc9a3..7140d83 100644
--- a/arch/powerpc/kernel/idle_power7.S
+++ b/arch/powerpc/kernel/idle_power7.S
@@ -16,6 +16,7 @@
 #include <asm/asm-offsets.h>
 #include <asm/ppc-opcode.h>
 #include <asm/hw_irq.h>
+#include <asm/kvm_book3s_asm.h>
 
 #undef DEBUG
 
@@ -81,6 +82,12 @@ _GLOBAL(power7_idle)
 	std	r9,_MSR(r1)
 	std	r1,PACAR1(r13)
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	/* Tell KVM we're napping */
+	li	r4,KVM_HWTHREAD_IN_NAP
+	stb	r4,HSTATE_HWTHREAD_STATE(r13)
+#endif
+
 	/* Magic NAP mode enter sequence */
 	std	r0,0(r1)
 	ptesync
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 01294a5..e87f619 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -569,6 +569,45 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
 	list_del(&vcpu->arch.run_list);
 }
 
+static int kvmppc_grab_hwthread(int cpu)
+{
+	struct paca_struct *tpaca;
+	long timeout = 1000;
+
+	tpaca = &paca[cpu];
+
+	/* Ensure the thread won't go into the kernel if it wakes */
+	tpaca->kvm_hstate.hwthread_req = 1;
+
+	/*
+	 * If the thread is already executing in the kernel (e.g. handling
+	 * a stray interrupt), wait for it to get back to nap mode.
+	 * The smp_mb() is to ensure that our setting of hwthread_req
+	 * is visible before we look at hwthread_state, so if this
+	 * races with the code at system_reset_pSeries and the thread
+	 * misses our setting of hwthread_req, we are sure to see its
+	 * setting of hwthread_state, and vice versa.
+	 */
+	smp_mb();
+	while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) {
+		if (--timeout <= 0) {
+			pr_err("KVM: couldn't grab cpu %d\n", cpu);
+			return -EBUSY;
+		}
+		udelay(1);
+	}
+	return 0;
+}
+
+static void kvmppc_release_hwthread(int cpu)
+{
+	struct paca_struct *tpaca;
+
+	tpaca = &paca[cpu];
+	tpaca->kvm_hstate.hwthread_req = 0;
+	tpaca->kvm_hstate.kvm_vcpu = NULL;
+}
+
 static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
 {
 	int cpu;
@@ -588,8 +627,7 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
 	smp_wmb();
 #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
 	if (vcpu->arch.ptid) {
-		tpaca->cpu_start = 0x80;
-		wmb();
+		kvmppc_grab_hwthread(cpu);
 		xics_wake_cpu(cpu);
 		++vc->n_woken;
 	}
@@ -639,7 +677,7 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
 	struct kvm_vcpu *vcpu, *vcpu0, *vnext;
 	long ret;
 	u64 now;
-	int ptid;
+	int ptid, i;
 
 	/* don't start if any threads have a signal pending */
 	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
@@ -686,12 +724,17 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
 	vc->napping_threads = 0;
 	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
 		kvmppc_start_thread(vcpu);
+	/* Grab any remaining hw threads so they can't go into the kernel */
+	for (i = ptid; i < threads_per_core; ++i)
+		kvmppc_grab_hwthread(vc->pcpu + i);
 
 	preempt_disable();
 	spin_unlock(&vc->lock);
 
 	kvm_guest_enter();
 	__kvmppc_vcore_entry(NULL, vcpu0);
+	for (i = 0; i < threads_per_core; ++i)
+		kvmppc_release_hwthread(vc->pcpu + i);
 
 	spin_lock(&vc->lock);
 	/* disable sending of IPIs on virtual external irqs */
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index b70bf22..d595033 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -26,6 +26,7 @@
 #include <asm/hvcall.h>
 #include <asm/asm-offsets.h>
 #include <asm/exception-64s.h>
+#include <asm/kvm_book3s_asm.h>
 
 /*****************************************************************************
  *                                                                           *
@@ -82,6 +83,7 @@ _GLOBAL(kvmppc_hv_entry_trampoline)
 
 #define XICS_XIRR		4
 #define XICS_QIRR		0xc
+#define XICS_IPI		2	/* interrupt source # for IPIs */
 
 /*
  * We come in here when wakened from nap mode on a secondary hw thread.
@@ -94,26 +96,54 @@ kvm_start_guest:
 	subi	r1,r1,STACK_FRAME_OVERHEAD
 	ld	r2,PACATOC(r13)
 
-	/* were we napping due to cede? */
-	lbz	r0,HSTATE_NAPPING(r13)
-	cmpwi	r0,0
-	bne	kvm_end_cede
+	li	r0,KVM_HWTHREAD_IN_KVM
+	stb	r0,HSTATE_HWTHREAD_STATE(r13)
 
-	/* get vcpu pointer */
-	ld	r4, HSTATE_KVM_VCPU(r13)
+	/* NV GPR values from power7_idle() will no longer be valid */
+	li	r0,1
+	stb	r0,PACA_NAPSTATELOST(r13)
 
-	/* We got here with an IPI; clear it */
-	ld	r5, HSTATE_XICS_PHYS(r13)
-	li	r0, 0xff
-	li	r6, XICS_QIRR
-	li	r7, XICS_XIRR
-	lwzcix	r8, r5, r7		/* ack the interrupt */
+	/* get vcpu pointer, NULL if we have no vcpu to run */
+	ld	r4,HSTATE_KVM_VCPU(r13)
+	cmpdi	cr1,r4,0
+
+	/* Check the wake reason in SRR1 to see why we got here */
+	mfspr	r3,SPRN_SRR1
+	rlwinm	r3,r3,44-31,0x7		/* extract wake reason field */
+	cmpwi	r3,4			/* was it an external interrupt? */
+	bne	27f
+
+	/*
+	 * External interrupt - for now assume it is an IPI, since we
+	 * should never get any other interrupts sent to offline threads.
+	 * Only do this for secondary threads.
+	 */
+	beq	cr1,25f
+	lwz	r3,VCPU_PTID(r4)
+	cmpwi	r3,0
+	beq	27f
+25:	ld	r5,HSTATE_XICS_PHYS(r13)
+	li	r0,0xff
+	li	r6,XICS_QIRR
+	li	r7,XICS_XIRR
+	lwzcix	r8,r5,r7		/* get and ack the interrupt */
 	sync
-	stbcix	r0, r5, r6		/* clear it */
-	stwcix	r8, r5, r7		/* EOI it */
+	clrldi.	r9,r8,40		/* get interrupt source ID. */
+	beq	27f			/* none there? */
+	cmpwi	r9,XICS_IPI
+	bne	26f
+	stbcix	r0,r5,r6		/* clear IPI */
+26:	stwcix	r8,r5,r7		/* EOI the interrupt */
 
-	/* NV GPR values from power7_idle() will no longer be valid */
-	stb	r0, PACA_NAPSTATELOST(r13)
+27:	/* XXX should handle hypervisor maintenance interrupts etc. here */
+
+	/* if we have no vcpu to run, go back to sleep */
+	beq	cr1,kvm_no_guest
+
+	/* were we napping due to cede? */
+	lbz	r0,HSTATE_NAPPING(r13)
+	cmpwi	r0,0
+	bne	kvm_end_cede
 
 .global kvmppc_hv_entry
 kvmppc_hv_entry:
@@ -1445,8 +1475,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
 	 * Take a nap until a decrementer or external interrupt occurs,
 	 * with PECE1 (wake on decr) and PECE0 (wake on external) set in LPCR
 	 */
-	li	r0,0x80
-	stb	r0,PACAPROCSTART(r13)
+	li	r0,1
+	stb	r0,HSTATE_HWTHREAD_REQ(r13)
 	mfspr	r5,SPRN_LPCR
 	ori	r5,r5,LPCR_PECE0 | LPCR_PECE1
 	mtspr	SPRN_LPCR,r5
@@ -1463,26 +1493,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
 kvm_end_cede:
 	/* Woken by external or decrementer interrupt */
 	ld	r1, HSTATE_HOST_R1(r13)
-	ld	r2, PACATOC(r13)
 
-	/* If we're a secondary thread and we got here by an IPI, ack it */
-	ld	r4,HSTATE_KVM_VCPU(r13)
-	lwz	r3,VCPU_PTID(r4)
-	cmpwi	r3,0
-	beq	27f
-	mfspr	r3,SPRN_SRR1
-	rlwinm	r3,r3,44-31,0x7		/* extract wake reason field */
-	cmpwi	r3,4			/* was it an external interrupt? */
-	bne	27f
-	ld	r5, HSTATE_XICS_PHYS(r13)
-	li	r0,0xff
-	li	r6,XICS_QIRR
-	li	r7,XICS_XIRR
-	lwzcix	r8,r5,r7		/* ack the interrupt */
-	sync
-	stbcix	r0,r5,r6		/* clear it */
-	stwcix	r8,r5,r7		/* EOI it */
-27:
 	/* load up FP state */
 	bl	kvmppc_load_fp
 
@@ -1580,12 +1591,17 @@ secondary_nap:
 	stwcx.	r3, 0, r4
 	bne	51b
 
+kvm_no_guest:
+	li	r0, KVM_HWTHREAD_IN_NAP
+	stb	r0, HSTATE_HWTHREAD_STATE(r13)
+	li	r0, 0
+	std	r0, HSTATE_KVM_VCPU(r13)
+
 	li	r3, LPCR_PECE0
 	mfspr	r4, SPRN_LPCR
 	rlwimi	r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
 	mtspr	SPRN_LPCR, r4
 	isync
-	li	r0, 0
 	std	r0, HSTATE_SCRATCH0(r13)
 	ptesync
 	ld	r0, HSTATE_SCRATCH0(r13)
-- 
cgit v0.10.2


From 2e25aa5f64b18a97f35266e51c71ff4dc644db0c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Sun, 19 Feb 2012 17:46:32 +0000
Subject: KVM: PPC: Book3S HV: Make virtual processor area registration more
 robust

The PAPR API allows three sorts of per-virtual-processor areas to be
registered (VPA, SLB shadow buffer, and dispatch trace log), and
furthermore, these can be registered and unregistered for another
virtual CPU.  Currently we just update the vcpu fields pointing to
these areas at the time of registration or unregistration.  If this
is done on another vcpu, there is the possibility that the target vcpu
is using those fields at the time and could end up using a bogus
pointer and corrupting memory.

This fixes the race by making the target cpu itself do the update, so
we can be sure that the update happens at a time when the fields
aren't being used.  Each area now has a struct kvmppc_vpa which is
used to manage these updates.  There is also a spinlock which protects
access to all of the kvmppc_vpa structs, other than to the pinned_addr
fields.  (We could have just taken the spinlock when using the vpa,
slb_shadow or dtl fields, but that would mean taking the spinlock on
every guest entry and exit.)

This also changes 'struct dtl' (which was undefined) to 'struct dtl_entry',
which is what the rest of the kernel uses.

Thanks to Michael Ellerman <michael@ellerman.id.au> for pointing out
the need to initialize vcpu->arch.vpa_update_lock.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 1c324ff..318bac9 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -114,6 +114,16 @@
 #define H_PP1			(1UL<<(63-62))
 #define H_PP2			(1UL<<(63-63))
 
+/* Flags for H_REGISTER_VPA subfunction field */
+#define H_VPA_FUNC_SHIFT	(63-18)	/* Bit posn of subfunction code */
+#define H_VPA_FUNC_MASK		7UL
+#define H_VPA_REG_VPA		1UL	/* Register Virtual Processor Area */
+#define H_VPA_REG_DTL		2UL	/* Register Dispatch Trace Log */
+#define H_VPA_REG_SLB		3UL	/* Register SLB shadow buffer */
+#define H_VPA_DEREG_VPA		5UL	/* Deregister Virtual Processor Area */
+#define H_VPA_DEREG_DTL		6UL	/* Deregister Dispatch Trace Log */
+#define H_VPA_DEREG_SLB		7UL	/* Deregister SLB shadow buffer */
+
 /* VASI States */
 #define H_VASI_INVALID          0
 #define H_VASI_ENABLED          1
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 97ecdaf..93ffd8d 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -82,7 +82,7 @@ struct kvm_vcpu;
 
 struct lppaca;
 struct slb_shadow;
-struct dtl;
+struct dtl_entry;
 
 struct kvm_vm_stat {
 	u32 remote_tlb_flush;
@@ -279,6 +279,19 @@ struct kvmppc_vcore {
 #define VCORE_EXITING	2
 #define VCORE_SLEEPING	3
 
+/*
+ * Struct used to manage memory for a virtual processor area
+ * registered by a PAPR guest.  There are three types of area
+ * that a guest can register.
+ */
+struct kvmppc_vpa {
+	void *pinned_addr;	/* Address in kernel linear mapping */
+	void *pinned_end;	/* End of region */
+	unsigned long next_gpa;	/* Guest phys addr for update */
+	unsigned long len;	/* Number of bytes required */
+	u8 update_pending;	/* 1 => update pinned_addr from next_gpa */
+};
+
 struct kvmppc_pte {
 	ulong eaddr;
 	u64 vpage;
@@ -473,11 +486,6 @@ struct kvm_vcpu_arch {
 	u8 prodded;
 	u32 last_inst;
 
-	struct lppaca *vpa;
-	struct slb_shadow *slb_shadow;
-	struct dtl *dtl;
-	struct dtl *dtl_end;
-
 	wait_queue_head_t *wqp;
 	struct kvmppc_vcore *vcore;
 	int ret;
@@ -502,6 +510,13 @@ struct kvm_vcpu_arch {
 	struct task_struct *run_task;
 	struct kvm_run *kvm_run;
 	pgd_t *pgdir;
+
+	spinlock_t vpa_update_lock;
+	struct kvmppc_vpa vpa;
+	struct kvmppc_vpa dtl;
+	struct dtl_entry *dtl_ptr;
+	unsigned long dtl_index;
+	struct kvmppc_vpa slb_shadow;
 #endif
 };
 
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 2abcf7d..502e038 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -466,7 +466,7 @@ int main(void)
 	DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions));
 	DEFINE(VCPU_CEDED, offsetof(struct kvm_vcpu, arch.ceded));
 	DEFINE(VCPU_PRODDED, offsetof(struct kvm_vcpu, arch.prodded));
-	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa));
+	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
 	DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
 	DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
 	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e87f619..2444a9c 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -134,6 +134,22 @@ static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
 	vpa->yield_count = 1;
 }
 
+/* Length for a per-processor buffer is passed in at offset 4 in the buffer */
+struct reg_vpa {
+	u32 dummy;
+	union {
+		u16 hword;
+		u32 word;
+	} length;
+};
+
+static int vpa_is_registered(struct kvmppc_vpa *vpap)
+{
+	if (vpap->update_pending)
+		return vpap->next_gpa != 0;
+	return vpap->pinned_addr != NULL;
+}
+
 static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
 				       unsigned long flags,
 				       unsigned long vcpuid, unsigned long vpa)
@@ -142,88 +158,153 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
 	unsigned long len, nb;
 	void *va;
 	struct kvm_vcpu *tvcpu;
-	int err = H_PARAMETER;
+	int err;
+	int subfunc;
+	struct kvmppc_vpa *vpap;
 
 	tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
 	if (!tvcpu)
 		return H_PARAMETER;
 
-	flags >>= 63 - 18;
-	flags &= 7;
-	if (flags == 0 || flags == 4)
-		return H_PARAMETER;
-	if (flags < 4) {
-		if (vpa & 0x7f)
+	subfunc = (flags >> H_VPA_FUNC_SHIFT) & H_VPA_FUNC_MASK;
+	if (subfunc == H_VPA_REG_VPA || subfunc == H_VPA_REG_DTL ||
+	    subfunc == H_VPA_REG_SLB) {
+		/* Registering new area - address must be cache-line aligned */
+		if ((vpa & (L1_CACHE_BYTES - 1)) || !vpa)
 			return H_PARAMETER;
-		if (flags >= 2 && !tvcpu->arch.vpa)
-			return H_RESOURCE;
-		/* registering new area; convert logical addr to real */
+
+		/* convert logical addr to kernel addr and read length */
 		va = kvmppc_pin_guest_page(kvm, vpa, &nb);
 		if (va == NULL)
 			return H_PARAMETER;
-		if (flags <= 1)
-			len = *(unsigned short *)(va + 4);
+		if (subfunc == H_VPA_REG_VPA)
+			len = ((struct reg_vpa *)va)->length.hword;
 		else
-			len = *(unsigned int *)(va + 4);
-		if (len > nb)
-			goto out_unpin;
-		switch (flags) {
-		case 1:		/* register VPA */
-			if (len < 640)
-				goto out_unpin;
-			if (tvcpu->arch.vpa)
-				kvmppc_unpin_guest_page(kvm, vcpu->arch.vpa);
-			tvcpu->arch.vpa = va;
-			init_vpa(vcpu, va);
-			break;
-		case 2:		/* register DTL */
-			if (len < 48)
-				goto out_unpin;
-			len -= len % 48;
-			if (tvcpu->arch.dtl)
-				kvmppc_unpin_guest_page(kvm, vcpu->arch.dtl);
-			tvcpu->arch.dtl = va;
-			tvcpu->arch.dtl_end = va + len;
+			len = ((struct reg_vpa *)va)->length.word;
+		kvmppc_unpin_guest_page(kvm, va);
+
+		/* Check length */
+		if (len > nb || len < sizeof(struct reg_vpa))
+			return H_PARAMETER;
+	} else {
+		vpa = 0;
+		len = 0;
+	}
+
+	err = H_PARAMETER;
+	vpap = NULL;
+	spin_lock(&tvcpu->arch.vpa_update_lock);
+
+	switch (subfunc) {
+	case H_VPA_REG_VPA:		/* register VPA */
+		if (len < sizeof(struct lppaca))
 			break;
-		case 3:		/* register SLB shadow buffer */
-			if (len < 16)
-				goto out_unpin;
-			if (tvcpu->arch.slb_shadow)
-				kvmppc_unpin_guest_page(kvm, vcpu->arch.slb_shadow);
-			tvcpu->arch.slb_shadow = va;
+		vpap = &tvcpu->arch.vpa;
+		err = 0;
+		break;
+
+	case H_VPA_REG_DTL:		/* register DTL */
+		if (len < sizeof(struct dtl_entry))
 			break;
-		}
-	} else {
-		switch (flags) {
-		case 5:		/* unregister VPA */
-			if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl)
-				return H_RESOURCE;
-			if (!tvcpu->arch.vpa)
-				break;
-			kvmppc_unpin_guest_page(kvm, tvcpu->arch.vpa);
-			tvcpu->arch.vpa = NULL;
+		len -= len % sizeof(struct dtl_entry);
+
+		/* Check that they have previously registered a VPA */
+		err = H_RESOURCE;
+		if (!vpa_is_registered(&tvcpu->arch.vpa))
 			break;
-		case 6:		/* unregister DTL */
-			if (!tvcpu->arch.dtl)
-				break;
-			kvmppc_unpin_guest_page(kvm, tvcpu->arch.dtl);
-			tvcpu->arch.dtl = NULL;
+
+		vpap = &tvcpu->arch.dtl;
+		err = 0;
+		break;
+
+	case H_VPA_REG_SLB:		/* register SLB shadow buffer */
+		/* Check that they have previously registered a VPA */
+		err = H_RESOURCE;
+		if (!vpa_is_registered(&tvcpu->arch.vpa))
 			break;
-		case 7:		/* unregister SLB shadow buffer */
-			if (!tvcpu->arch.slb_shadow)
-				break;
-			kvmppc_unpin_guest_page(kvm, tvcpu->arch.slb_shadow);
-			tvcpu->arch.slb_shadow = NULL;
+
+		vpap = &tvcpu->arch.slb_shadow;
+		err = 0;
+		break;
+
+	case H_VPA_DEREG_VPA:		/* deregister VPA */
+		/* Check they don't still have a DTL or SLB buf registered */
+		err = H_RESOURCE;
+		if (vpa_is_registered(&tvcpu->arch.dtl) ||
+		    vpa_is_registered(&tvcpu->arch.slb_shadow))
 			break;
-		}
+
+		vpap = &tvcpu->arch.vpa;
+		err = 0;
+		break;
+
+	case H_VPA_DEREG_DTL:		/* deregister DTL */
+		vpap = &tvcpu->arch.dtl;
+		err = 0;
+		break;
+
+	case H_VPA_DEREG_SLB:		/* deregister SLB shadow buffer */
+		vpap = &tvcpu->arch.slb_shadow;
+		err = 0;
+		break;
 	}
-	return H_SUCCESS;
 
- out_unpin:
-	kvmppc_unpin_guest_page(kvm, va);
+	if (vpap) {
+		vpap->next_gpa = vpa;
+		vpap->len = len;
+		vpap->update_pending = 1;
+	}
+
+	spin_unlock(&tvcpu->arch.vpa_update_lock);
+
 	return err;
 }
 
+static void kvmppc_update_vpa(struct kvm *kvm, struct kvmppc_vpa *vpap)
+{
+	void *va;
+	unsigned long nb;
+
+	vpap->update_pending = 0;
+	va = NULL;
+	if (vpap->next_gpa) {
+		va = kvmppc_pin_guest_page(kvm, vpap->next_gpa, &nb);
+		if (nb < vpap->len) {
+			/*
+			 * If it's now too short, it must be that userspace
+			 * has changed the mappings underlying guest memory,
+			 * so unregister the region.
+			 */
+			kvmppc_unpin_guest_page(kvm, va);
+			va = NULL;
+		}
+	}
+	if (vpap->pinned_addr)
+		kvmppc_unpin_guest_page(kvm, vpap->pinned_addr);
+	vpap->pinned_addr = va;
+	if (va)
+		vpap->pinned_end = va + vpap->len;
+}
+
+static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+
+	spin_lock(&vcpu->arch.vpa_update_lock);
+	if (vcpu->arch.vpa.update_pending) {
+		kvmppc_update_vpa(kvm, &vcpu->arch.vpa);
+		init_vpa(vcpu, vcpu->arch.vpa.pinned_addr);
+	}
+	if (vcpu->arch.dtl.update_pending) {
+		kvmppc_update_vpa(kvm, &vcpu->arch.dtl);
+		vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr;
+		vcpu->arch.dtl_index = 0;
+	}
+	if (vcpu->arch.slb_shadow.update_pending)
+		kvmppc_update_vpa(kvm, &vcpu->arch.slb_shadow);
+	spin_unlock(&vcpu->arch.vpa_update_lock);
+}
+
 int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 {
 	unsigned long req = kvmppc_get_gpr(vcpu, 3);
@@ -468,6 +549,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	/* default to host PVR, since we can't spoof it */
 	vcpu->arch.pvr = mfspr(SPRN_PVR);
 	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
+	spin_lock_init(&vcpu->arch.vpa_update_lock);
 
 	kvmppc_mmu_book3s_hv_init(vcpu);
 
@@ -512,12 +594,14 @@ out:
 
 void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 {
-	if (vcpu->arch.dtl)
-		kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.dtl);
-	if (vcpu->arch.slb_shadow)
-		kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.slb_shadow);
-	if (vcpu->arch.vpa)
-		kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa);
+	spin_lock(&vcpu->arch.vpa_update_lock);
+	if (vcpu->arch.dtl.pinned_addr)
+		kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.dtl.pinned_addr);
+	if (vcpu->arch.slb_shadow.pinned_addr)
+		kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.slb_shadow.pinned_addr);
+	if (vcpu->arch.vpa.pinned_addr)
+		kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa.pinned_addr);
+	spin_unlock(&vcpu->arch.vpa_update_lock);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
@@ -722,8 +806,13 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
 	vc->in_guest = 0;
 	vc->pcpu = smp_processor_id();
 	vc->napping_threads = 0;
-	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
 		kvmppc_start_thread(vcpu);
+		if (vcpu->arch.vpa.update_pending ||
+		    vcpu->arch.slb_shadow.update_pending ||
+		    vcpu->arch.dtl.update_pending)
+			kvmppc_update_vpas(vcpu);
+	}
 	/* Grab any remaining hw threads so they can't go into the kernel */
 	for (i = ptid; i < threads_per_core; ++i)
 		kvmppc_grab_hwthread(vc->pcpu + i);
-- 
cgit v0.10.2


From 0456ec4ff2b832ab9ff476ed687fea704500f1cd Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 3 Feb 2012 00:56:21 +0000
Subject: KVM: PPC: Book3S HV: Report stolen time to guest through dispatch
 trace log

This adds code to measure "stolen" time per virtual core in units of
timebase ticks, and to report the stolen time to the guest using the
dispatch trace log (DTL).  The guest can register an area of memory
for the DTL for a given vcpu.  The DTL is a ring buffer where KVM
fills in one entry every time it enters the guest for that vcpu.

Stolen time is measured as time when the virtual core is not running,
either because the vcore is not runnable (e.g. some of its vcpus are
executing elsewhere in the kernel or in userspace), or when the vcpu
thread that is running the vcore is preempted.  This includes time
when all the vcpus are idle (i.e. have executed the H_CEDE hypercall),
which is OK because the guest accounts stolen time while idle as idle
time.

Each vcpu keeps a record of how much stolen time has been reported to
the guest for that vcpu so far.  When we are about to enter the guest,
we create a new DTL entry (if the guest vcpu has a DTL) and report the
difference between total stolen time for the vcore and stolen time
reported so far for the vcpu as the "enqueue to dispatch" time in the
DTL entry.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 93ffd8d..014eaf2 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -268,6 +268,9 @@ struct kvmppc_vcore {
 	struct list_head runnable_threads;
 	spinlock_t lock;
 	wait_queue_head_t wq;
+	u64 stolen_tb;
+	u64 preempt_tb;
+	struct kvm_vcpu *runner;
 };
 
 #define VCORE_ENTRY_COUNT(vc)	((vc)->entry_exit_count & 0xff)
@@ -516,6 +519,7 @@ struct kvm_vcpu_arch {
 	struct kvmppc_vpa dtl;
 	struct dtl_entry *dtl_ptr;
 	unsigned long dtl_index;
+	u64 stolen_logged;
 	struct kvmppc_vpa slb_shadow;
 #endif
 };
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 2444a9c..9079357 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -60,12 +60,20 @@ static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu);
 
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
 	local_paca->kvm_hstate.kvm_vcpu = vcpu;
-	local_paca->kvm_hstate.kvm_vcore = vcpu->arch.vcore;
+	local_paca->kvm_hstate.kvm_vcore = vc;
+	if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE)
+		vc->stolen_tb += mftb() - vc->preempt_tb;
 }
 
 void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 {
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE)
+		vc->preempt_tb = mftb();
 }
 
 void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
@@ -305,6 +313,35 @@ static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
 	spin_unlock(&vcpu->arch.vpa_update_lock);
 }
 
+static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
+				    struct kvmppc_vcore *vc)
+{
+	struct dtl_entry *dt;
+	struct lppaca *vpa;
+	unsigned long old_stolen;
+
+	dt = vcpu->arch.dtl_ptr;
+	vpa = vcpu->arch.vpa.pinned_addr;
+	old_stolen = vcpu->arch.stolen_logged;
+	vcpu->arch.stolen_logged = vc->stolen_tb;
+	if (!dt || !vpa)
+		return;
+	memset(dt, 0, sizeof(struct dtl_entry));
+	dt->dispatch_reason = 7;
+	dt->processor_id = vc->pcpu + vcpu->arch.ptid;
+	dt->timebase = mftb();
+	dt->enqueue_to_dispatch_time = vc->stolen_tb - old_stolen;
+	dt->srr0 = kvmppc_get_pc(vcpu);
+	dt->srr1 = vcpu->arch.shregs.msr;
+	++dt;
+	if (dt == vcpu->arch.dtl.pinned_end)
+		dt = vcpu->arch.dtl.pinned_addr;
+	vcpu->arch.dtl_ptr = dt;
+	/* order writing *dt vs. writing vpa->dtl_idx */
+	smp_wmb();
+	vpa->dtl_idx = ++vcpu->arch.dtl_index;
+}
+
 int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 {
 	unsigned long req = kvmppc_get_gpr(vcpu, 3);
@@ -568,6 +605,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 			INIT_LIST_HEAD(&vcore->runnable_threads);
 			spin_lock_init(&vcore->lock);
 			init_waitqueue_head(&vcore->wq);
+			vcore->preempt_tb = mftb();
 		}
 		kvm->arch.vcores[core] = vcore;
 	}
@@ -580,6 +618,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	++vcore->num_threads;
 	spin_unlock(&vcore->lock);
 	vcpu->arch.vcore = vcore;
+	vcpu->arch.stolen_logged = vcore->stolen_tb;
 
 	vcpu->arch.cpu_type = KVM_CPU_3S_64;
 	kvmppc_sanity_check(vcpu);
@@ -803,6 +842,7 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
 	vc->nap_count = 0;
 	vc->entry_exit_count = 0;
 	vc->vcore_state = VCORE_RUNNING;
+	vc->stolen_tb += mftb() - vc->preempt_tb;
 	vc->in_guest = 0;
 	vc->pcpu = smp_processor_id();
 	vc->napping_threads = 0;
@@ -812,6 +852,7 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
 		    vcpu->arch.slb_shadow.update_pending ||
 		    vcpu->arch.dtl.update_pending)
 			kvmppc_update_vpas(vcpu);
+		kvmppc_create_dtl_entry(vcpu, vc);
 	}
 	/* Grab any remaining hw threads so they can't go into the kernel */
 	for (i = ptid; i < threads_per_core; ++i)
@@ -869,6 +910,7 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
 	spin_lock(&vc->lock);
  out:
 	vc->vcore_state = VCORE_INACTIVE;
+	vc->preempt_tb = mftb();
 	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
 				 arch.run_list) {
 		if (vcpu->arch.ret != RESUME_GUEST) {
@@ -967,6 +1009,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 			spin_lock(&vc->lock);
 			continue;
 		}
+		vc->runner = vcpu;
 		n_ceded = 0;
 		list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
 			n_ceded += v->arch.ceded;
@@ -986,6 +1029,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 				wake_up(&v->arch.cpu_run);
 			}
 		}
+		vc->runner = NULL;
 	}
 
 	if (signal_pending(current)) {
-- 
cgit v0.10.2


From c0fe7b099931c6c05c98a05c277185ee25254f35 Mon Sep 17 00:00:00 2001
From: Bharat Bhushan <r65777@freescale.com>
Date: Mon, 5 Mar 2012 01:34:08 +0000
Subject: Restore guest CR after exit timing calculation

No instruction which can change Condition Register (CR) should be executed after
Guest CR is loaded. So the guest CR is restored after the Exit Timing in
lightweight_exit executes cmpw, which can clobber CR.

Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index 57e2fa4..909e96e 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -580,7 +580,6 @@ lightweight_exit:
 	mtlr	r3
 	mtxer	r5
 	mtctr	r6
-	mtcr	r7
 	mtsrr0	r8
 	mtsrr1	r9
 
@@ -588,14 +587,20 @@ lightweight_exit:
 	/* save enter time */
 1:
 	mfspr	r6, SPRN_TBRU
-	mfspr	r7, SPRN_TBRL
+	mfspr	r9, SPRN_TBRL
 	mfspr	r8, SPRN_TBRU
 	cmpw	r8, r6
-	PPC_STL	r7, VCPU_TIMING_LAST_ENTER_TBL(r4)
+	PPC_STL	r9, VCPU_TIMING_LAST_ENTER_TBL(r4)
 	bne	1b
 	PPC_STL	r8, VCPU_TIMING_LAST_ENTER_TBU(r4)
 #endif
 
+	/*
+	 * Don't execute any instruction which can change CR after
+	 * below instruction.
+	 */
+	mtcr	r7
+
 	/* Finish loading guest volatiles and jump to guest. */
 	PPC_LL	r5, VCPU_GPR(r5)(r4)
 	PPC_LL	r6, VCPU_GPR(r6)(r4)
-- 
cgit v0.10.2


From 7657f4089b097846cc37bfa2b74fc0bd2bd60e30 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 5 Mar 2012 21:42:25 +0000
Subject: KVM: PPC: Book 3S: Fix compilation for !HV configs

Commits 2f5cdd5487 ("KVM: PPC: Book3S HV: Make secondary threads more
robust against stray IPIs") and 1c2066b0f7 ("KVM: PPC: Book3S HV: Make
virtual processor area registration more robust") added fields to
struct kvm_vcpu_arch inside #ifdef CONFIG_KVM_BOOK3S_64_HV regions,
and added lines to arch/powerpc/kernel/asm-offsets.c to generate
assembler constants for their offsets.  Unfortunately this led to
compile errors on Book 3S machines for configs that had KVM enabled
but not CONFIG_KVM_BOOK3S_64_HV.  This fixes the problem by moving
the offending lines inside #ifdef CONFIG_KVM_BOOK3S_64_HV regions.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 502e038..694af3e 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -450,6 +450,7 @@ int main(void)
 	DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
 	DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
 	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
+	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
 #endif
 #ifdef CONFIG_PPC_BOOK3S
 	DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
@@ -466,7 +467,6 @@ int main(void)
 	DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions));
 	DEFINE(VCPU_CEDED, offsetof(struct kvm_vcpu, arch.ceded));
 	DEFINE(VCPU_PRODDED, offsetof(struct kvm_vcpu, arch.prodded));
-	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
 	DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
 	DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
 	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
@@ -540,10 +540,10 @@ int main(void)
 	HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
 	HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5);
 	HSTATE_FIELD(HSTATE_NAPPING, napping);
-	HSTATE_FIELD(HSTATE_HWTHREAD_REQ, hwthread_req);
-	HSTATE_FIELD(HSTATE_HWTHREAD_STATE, hwthread_state);
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
+	HSTATE_FIELD(HSTATE_HWTHREAD_REQ, hwthread_req);
+	HSTATE_FIELD(HSTATE_HWTHREAD_STATE, hwthread_state);
 	HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
 	HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore);
 	HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
-- 
cgit v0.10.2


From 8943633cf9b87980d261a022e90d94bc2c55df35 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 2 Mar 2012 01:38:23 +0000
Subject: KVM: PPC: Work around POWER7 DABR corruption problem

It turns out that on POWER7, writing to the DABR can cause a corrupted
value to be written if the PMU is active and updating SDAR in continuous
sampling mode.  To work around this, we make sure that the PMU is inactive
and SDAR updates are disabled (via MMCRA) when we are context-switching
DABR.

When the guest sets DABR via the H_SET_DABR hypercall, we use a slightly
different workaround, which is to read back the DABR and write it again
if it got corrupted.

While we are at it, make it consistent that the saving and restoring
of the guest's non-volatile GPRs and the FPRs are done with the guest
setup of the PMU active.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
index d3fb4df..84035a5 100644
--- a/arch/powerpc/kvm/book3s_hv_interrupts.S
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -68,19 +68,24 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	rotldi  r10,r10,16
 	mtmsrd  r10,1
 
-	/* Save host PMU registers and load guest PMU registers */
+	/* Save host PMU registers */
 	/* R4 is live here (vcpu pointer) but not r3 or r5 */
 	li	r3, 1
 	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
 	mfspr	r7, SPRN_MMCR0		/* save MMCR0 */
 	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable interrupts */
+	mfspr	r6, SPRN_MMCRA
+BEGIN_FTR_SECTION
+	/* On P7, clear MMCRA in order to disable SDAR updates */
+	li	r5, 0
+	mtspr	SPRN_MMCRA, r5
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	isync
 	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? */
 	lbz	r5, LPPACA_PMCINUSE(r3)
 	cmpwi	r5, 0
 	beq	31f			/* skip if not */
 	mfspr	r5, SPRN_MMCR1
-	mfspr	r6, SPRN_MMCRA
 	std	r7, HSTATE_MMCR(r13)
 	std	r5, HSTATE_MMCR + 8(r13)
 	std	r6, HSTATE_MMCR + 16(r13)
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index d595033..a84aafc 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -159,24 +159,15 @@ kvmppc_hv_entry:
 	mflr	r0
 	std	r0, HSTATE_VMHANDLER(r13)
 
-	ld	r14, VCPU_GPR(r14)(r4)
-	ld	r15, VCPU_GPR(r15)(r4)
-	ld	r16, VCPU_GPR(r16)(r4)
-	ld	r17, VCPU_GPR(r17)(r4)
-	ld	r18, VCPU_GPR(r18)(r4)
-	ld	r19, VCPU_GPR(r19)(r4)
-	ld	r20, VCPU_GPR(r20)(r4)
-	ld	r21, VCPU_GPR(r21)(r4)
-	ld	r22, VCPU_GPR(r22)(r4)
-	ld	r23, VCPU_GPR(r23)(r4)
-	ld	r24, VCPU_GPR(r24)(r4)
-	ld	r25, VCPU_GPR(r25)(r4)
-	ld	r26, VCPU_GPR(r26)(r4)
-	ld	r27, VCPU_GPR(r27)(r4)
-	ld	r28, VCPU_GPR(r28)(r4)
-	ld	r29, VCPU_GPR(r29)(r4)
-	ld	r30, VCPU_GPR(r30)(r4)
-	ld	r31, VCPU_GPR(r31)(r4)
+	/* Set partition DABR */
+	/* Do this before re-enabling PMU to avoid P7 DABR corruption bug */
+	li	r5,3
+	ld	r6,VCPU_DABR(r4)
+	mtspr	SPRN_DABRX,r5
+	mtspr	SPRN_DABR,r6
+BEGIN_FTR_SECTION
+	isync
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/* Load guest PMU registers */
 	/* R4 is live here (vcpu pointer) */
@@ -215,6 +206,25 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	/* Load up FP, VMX and VSX registers */
 	bl	kvmppc_load_fp
 
+	ld	r14, VCPU_GPR(r14)(r4)
+	ld	r15, VCPU_GPR(r15)(r4)
+	ld	r16, VCPU_GPR(r16)(r4)
+	ld	r17, VCPU_GPR(r17)(r4)
+	ld	r18, VCPU_GPR(r18)(r4)
+	ld	r19, VCPU_GPR(r19)(r4)
+	ld	r20, VCPU_GPR(r20)(r4)
+	ld	r21, VCPU_GPR(r21)(r4)
+	ld	r22, VCPU_GPR(r22)(r4)
+	ld	r23, VCPU_GPR(r23)(r4)
+	ld	r24, VCPU_GPR(r24)(r4)
+	ld	r25, VCPU_GPR(r25)(r4)
+	ld	r26, VCPU_GPR(r26)(r4)
+	ld	r27, VCPU_GPR(r27)(r4)
+	ld	r28, VCPU_GPR(r28)(r4)
+	ld	r29, VCPU_GPR(r29)(r4)
+	ld	r30, VCPU_GPR(r30)(r4)
+	ld	r31, VCPU_GPR(r31)(r4)
+
 BEGIN_FTR_SECTION
 	/* Switch DSCR to guest value */
 	ld	r5, VCPU_DSCR(r4)
@@ -256,12 +266,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	mtspr	SPRN_DAR, r5
 	mtspr	SPRN_DSISR, r6
 
-	/* Set partition DABR */
-	li	r5,3
-	ld	r6,VCPU_DABR(r4)
-	mtspr	SPRN_DABRX,r5
-	mtspr	SPRN_DABR,r6
-
 BEGIN_FTR_SECTION
 	/* Restore AMR and UAMOR, set AMOR to all 1s */
 	ld	r5,VCPU_AMR(r4)
@@ -955,12 +959,6 @@ BEGIN_FTR_SECTION
 	mtspr	SPRN_AMR,r6
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
-	/* Restore host DABR and DABRX */
-	ld	r5,HSTATE_DABR(r13)
-	li	r6,7
-	mtspr	SPRN_DABR,r5
-	mtspr	SPRN_DABRX,r6
-
 	/* Switch DSCR back to host value */
 BEGIN_FTR_SECTION
 	mfspr	r8, SPRN_DSCR
@@ -999,6 +997,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	std	r5, VCPU_SPRG2(r9)
 	std	r6, VCPU_SPRG3(r9)
 
+	/* save FP state */
+	mr	r3, r9
+	bl	.kvmppc_save_fp
+
 	/* Increment yield count if they have a VPA */
 	ld	r8, VCPU_VPA(r9)	/* do they have a VPA? */
 	cmpdi	r8, 0
@@ -1013,6 +1015,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
 	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */
 	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
+	mfspr	r6, SPRN_MMCRA
+BEGIN_FTR_SECTION
+	/* On P7, clear MMCRA in order to disable SDAR updates */
+	li	r7, 0
+	mtspr	SPRN_MMCRA, r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	isync
 	beq	21f			/* if no VPA, save PMU stuff anyway */
 	lbz	r7, LPPACA_PMCINUSE(r8)
@@ -1021,7 +1029,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	std	r3, VCPU_MMCR(r9)	/* if not, set saved MMCR0 to FC */
 	b	22f
 21:	mfspr	r5, SPRN_MMCR1
-	mfspr	r6, SPRN_MMCRA
 	std	r4, VCPU_MMCR(r9)
 	std	r5, VCPU_MMCR + 8(r9)
 	std	r6, VCPU_MMCR + 16(r9)
@@ -1046,17 +1053,20 @@ BEGIN_FTR_SECTION
 	stw	r11, VCPU_PMC + 28(r9)
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 22:
-	/* save FP state */
-	mr	r3, r9
-	bl	.kvmppc_save_fp
 
 	/* Secondary threads go off to take a nap on POWER7 */
 BEGIN_FTR_SECTION
-	lwz	r0,VCPU_PTID(r3)
+	lwz	r0,VCPU_PTID(r9)
 	cmpwi	r0,0
 	bne	secondary_nap
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
+	/* Restore host DABR and DABRX */
+	ld	r5,HSTATE_DABR(r13)
+	li	r6,7
+	mtspr	SPRN_DABR,r5
+	mtspr	SPRN_DABRX,r6
+
 	/*
 	 * Reload DEC.  HDEC interrupts were disabled when
 	 * we reloaded the host's LPCR value.
@@ -1393,7 +1403,12 @@ bounce_ext_interrupt:
 
 _GLOBAL(kvmppc_h_set_dabr)
 	std	r4,VCPU_DABR(r3)
-	mtspr	SPRN_DABR,r4
+	/* Work around P7 bug where DABR can get corrupted on mtspr */
+1:	mtspr	SPRN_DABR,r4
+	mfspr	r5, SPRN_DABR
+	cmpd	r4, r5
+	bne	1b
+	isync
 	li	r3,0
 	blr
 
@@ -1615,8 +1630,8 @@ kvm_no_guest:
  * r3 = vcpu pointer
  */
 _GLOBAL(kvmppc_save_fp)
-	mfmsr	r9
-	ori	r8,r9,MSR_FP
+	mfmsr	r5
+	ori	r8,r5,MSR_FP
 #ifdef CONFIG_ALTIVEC
 BEGIN_FTR_SECTION
 	oris	r8,r8,MSR_VEC@h
@@ -1665,7 +1680,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 #endif
 	mfspr	r6,SPRN_VRSAVE
 	stw	r6,VCPU_VRSAVE(r3)
-	mtmsrd	r9
+	mtmsrd	r5
 	isync
 	blr
 
-- 
cgit v0.10.2


From 6020c0f6e78888b6023559e9bf633ad0092a1709 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 12 Mar 2012 02:26:30 +0100
Subject: KVM: PPC: Pass EA to updating emulation ops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When emulating updating load/store instructions (lwzu, stwu, ...) we need to
write the effective address of the load/store into a register.

Currently, we write the physical address in there, which is very wrong. So
instead let's save off where the virtual fault was on MMIO and use that
information as value to put into the register.

While at it, also move the XOP variants of the above instructions to the new
scheme of using the already known vaddr instead of calculating it themselves.

Reported-by: Jörg Sommer <joerg@alea.gnuu.de>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 014eaf2..42a527e 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -464,6 +464,7 @@ struct kvm_vcpu_arch {
 	u32 epr;
 #endif
 	gpa_t paddr_accessed;
+	gva_t vaddr_accessed;
 
 	u8 io_gpr; /* GPR used as IO source/target */
 	u8 mmio_is_bigendian;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index d031ce1..8e6401f 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -447,7 +447,7 @@ static int instruction_is_store(unsigned int instr)
 }
 
 static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
-				  unsigned long gpa, int is_store)
+				  unsigned long gpa, gva_t ea, int is_store)
 {
 	int ret;
 	u32 last_inst;
@@ -494,6 +494,7 @@ static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	 */
 
 	vcpu->arch.paddr_accessed = gpa;
+	vcpu->arch.vaddr_accessed = ea;
 	return kvmppc_emulate_mmio(run, vcpu);
 }
 
@@ -547,7 +548,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	/* No memslot means it's an emulated MMIO region */
 	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
 		unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1));
-		return kvmppc_hv_emulate_mmio(run, vcpu, gpa,
+		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
 					      dsisr & DSISR_ISSTORE);
 	}
 
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 7759053..158047f 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -351,6 +351,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		/* MMIO */
 		vcpu->stat.mmio_exits++;
 		vcpu->arch.paddr_accessed = pte.raddr;
+		vcpu->arch.vaddr_accessed = pte.eaddr;
 		r = kvmppc_emulate_mmio(run, vcpu);
 		if ( r == RESUME_HOST_NV )
 			r = RESUME_HOST;
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 9f27258..2675dcb 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -875,6 +875,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			/* Guest has mapped and accessed a page which is not
 			 * actually RAM. */
 			vcpu->arch.paddr_accessed = gpaddr;
+			vcpu->arch.vaddr_accessed = eaddr;
 			r = kvmppc_emulate_mmio(run, vcpu);
 			kvmppc_account_exit(vcpu, MMIO_EXITS);
 		}
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 968f401..e79a620 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -141,7 +141,6 @@ u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb)
 int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
 	u32 inst = kvmppc_get_last_inst(vcpu);
-	u32 ea;
 	int ra;
 	int rb;
 	int rs;
@@ -185,12 +184,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			ra = get_ra(inst);
 			rb = get_rb(inst);
 
-			ea = kvmppc_get_gpr(vcpu, rb);
-			if (ra)
-				ea += kvmppc_get_gpr(vcpu, ra);
-
 			emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
-			kvmppc_set_gpr(vcpu, ra, ea);
+			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 			break;
 
 		case OP_31_XOP_STWX:
@@ -212,14 +207,10 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			ra = get_ra(inst);
 			rb = get_rb(inst);
 
-			ea = kvmppc_get_gpr(vcpu, rb);
-			if (ra)
-				ea += kvmppc_get_gpr(vcpu, ra);
-
 			emulated = kvmppc_handle_store(run, vcpu,
 						       kvmppc_get_gpr(vcpu, rs),
 			                               1, 1);
-			kvmppc_set_gpr(vcpu, rs, ea);
+			kvmppc_set_gpr(vcpu, rs, vcpu->arch.vaddr_accessed);
 			break;
 
 		case OP_31_XOP_LHAX:
@@ -237,12 +228,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			ra = get_ra(inst);
 			rb = get_rb(inst);
 
-			ea = kvmppc_get_gpr(vcpu, rb);
-			if (ra)
-				ea += kvmppc_get_gpr(vcpu, ra);
-
 			emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
-			kvmppc_set_gpr(vcpu, ra, ea);
+			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 			break;
 
 		case OP_31_XOP_MFSPR:
@@ -318,14 +305,10 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			ra = get_ra(inst);
 			rb = get_rb(inst);
 
-			ea = kvmppc_get_gpr(vcpu, rb);
-			if (ra)
-				ea += kvmppc_get_gpr(vcpu, ra);
-
 			emulated = kvmppc_handle_store(run, vcpu,
 						       kvmppc_get_gpr(vcpu, rs),
 			                               2, 1);
-			kvmppc_set_gpr(vcpu, ra, ea);
+			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 			break;
 
 		case OP_31_XOP_MTSPR:
@@ -429,7 +412,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		ra = get_ra(inst);
 		rt = get_rt(inst);
 		emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
-		kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed);
+		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 		break;
 
 	case OP_LBZ:
@@ -441,7 +424,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		ra = get_ra(inst);
 		rt = get_rt(inst);
 		emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
-		kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed);
+		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 		break;
 
 	case OP_STW:
@@ -457,7 +440,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		emulated = kvmppc_handle_store(run, vcpu,
 					       kvmppc_get_gpr(vcpu, rs),
 		                               4, 1);
-		kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed);
+		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 		break;
 
 	case OP_STB:
@@ -473,7 +456,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		emulated = kvmppc_handle_store(run, vcpu,
 					       kvmppc_get_gpr(vcpu, rs),
 		                               1, 1);
-		kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed);
+		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 		break;
 
 	case OP_LHZ:
@@ -485,7 +468,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		ra = get_ra(inst);
 		rt = get_rt(inst);
 		emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
-		kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed);
+		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 		break;
 
 	case OP_LHA:
@@ -497,7 +480,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		ra = get_ra(inst);
 		rt = get_rt(inst);
 		emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1);
-		kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed);
+		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 		break;
 
 	case OP_STH:
@@ -513,7 +496,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		emulated = kvmppc_handle_store(run, vcpu,
 					       kvmppc_get_gpr(vcpu, rs),
 		                               2, 1);
-		kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed);
+		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 		break;
 
 	default:
-- 
cgit v0.10.2


From 6df79df5b27d74e0c9803d7f47bb878370996548 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 13 Mar 2012 22:15:45 +0100
Subject: KVM: PPC: Emulate tw and td instructions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are 4 conditional trapping instructions: tw, twi, td, tdi. The
ones with an i take an immediate comparison, the others compare two
registers. All of them arrive in the emulator when the condition to
trap was successfully fulfilled.

Unfortunately, we were only implementing the i versions so far, so
let's also add support for the other two.

This fixes kernel booting with recents book3s_32 guest kernels.

Reported-by: Jörg Sommer <joerg@alea.gnuu.de>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index e79a620..afc9154 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -35,7 +35,9 @@
 #define OP_TRAP 3
 #define OP_TRAP_64 2
 
+#define OP_31_XOP_TRAP      4
 #define OP_31_XOP_LWZX      23
+#define OP_31_XOP_TRAP_64   68
 #define OP_31_XOP_LBZX      87
 #define OP_31_XOP_STWX      151
 #define OP_31_XOP_STBX      215
@@ -169,6 +171,18 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	case 31:
 		switch (get_xop(inst)) {
 
+		case OP_31_XOP_TRAP:
+#ifdef CONFIG_64BIT
+		case OP_31_XOP_TRAP_64:
+#endif
+#ifdef CONFIG_PPC_BOOK3S
+			kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP);
+#else
+			kvmppc_core_queue_program(vcpu,
+					vcpu->arch.shared->esr | ESR_PTR);
+#endif
+			advance = 0;
+			break;
 		case OP_31_XOP_LWZX:
 			rt = get_rt(inst);
 			emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
-- 
cgit v0.10.2


From 4f225ae06e7f39a523ec500c3cf127e50797983e Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 13 Mar 2012 23:05:16 +0100
Subject: KVM: PPC: Book3s: PR: Add HV traps so we can run in HV=1 mode on p7

When running PR KVM on a p7 system in bare metal, we get HV exits instead
of normal supervisor traps. Semantically they are identical though and the
HSRR vs SRR difference is already taken care of in the exit code.

So all we need to do is handle them in addition to our normal exits.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 158047f..a7f031b 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -618,10 +618,13 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 	/* We're good on these - the host merely wanted to get our attention */
 	case BOOK3S_INTERRUPT_DECREMENTER:
+	case BOOK3S_INTERRUPT_HV_DECREMENTER:
 		vcpu->stat.dec_exits++;
 		r = RESUME_GUEST;
 		break;
 	case BOOK3S_INTERRUPT_EXTERNAL:
+	case BOOK3S_INTERRUPT_EXTERNAL_LEVEL:
+	case BOOK3S_INTERRUPT_EXTERNAL_HV:
 		vcpu->stat.ext_intr_exits++;
 		r = RESUME_GUEST;
 		break;
@@ -629,6 +632,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		r = RESUME_GUEST;
 		break;
 	case BOOK3S_INTERRUPT_PROGRAM:
+	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
 	{
 		enum emulation_result er;
 		struct kvmppc_book3s_shadow_vcpu *svcpu;
-- 
cgit v0.10.2


From 966cd0f3bdd422f0b10686fb59d0d456fbbb6398 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Wed, 14 Mar 2012 16:55:08 +0100
Subject: KVM: PPC: Ignore unhalt request from kvm_vcpu_block

When running kvm_vcpu_block and it realizes that the CPU is actually good
to run, we get a request bit set for KVM_REQ_UNHALT. Right now, there's
nothing we can do with that bit, so let's unset it right after the call
again so we don't get confused in our later checks for pending work.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index a7f031b..912e10f 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -120,6 +120,7 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
 	if (msr & MSR_POW) {
 		if (!vcpu->arch.pending_exceptions) {
 			kvm_vcpu_block(vcpu);
+			clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
 			vcpu->stat.halt_wakeup++;
 
 			/* Unset POW bit after we woke up */
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index 6d1bfe2..60ac0e7 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -224,6 +224,7 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 		return kvmppc_h_pr_bulk_remove(vcpu);
 	case H_CEDE:
 		kvm_vcpu_block(vcpu);
+		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
 		vcpu->stat.halt_wakeup++;
 		return EMULATE_DONE;
 	}
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 2675dcb..72f13f4 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -449,6 +449,7 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.shared->msr & MSR_WE) {
 		local_irq_enable();
 		kvm_vcpu_block(vcpu);
+		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
 		local_irq_disable();
 
 		kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
-- 
cgit v0.10.2


From bbcc9c06695243ea23d30de36842df9200c33857 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Tue, 13 Mar 2012 21:52:44 +0000
Subject: powerpc/kvm: Fix magic page vs. 32-bit RTAS on ppc64

When the kernel calls into RTAS, it switches to 32-bit mode. The
magic page was is longer accessible in that case, causing the
patched instructions in the RTAS call wrapper to crash.

This fixes it by making available a 32-bit mapping of the magic
page in that case. This mapping is flushed whenever we switch
the kernel back to 64-bit mode.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
[agraf: add a check if the magic page is mapped]
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index c8ead7b..3f2a836 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -291,6 +291,9 @@ pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
 	ulong mp_pa = vcpu->arch.magic_page_pa;
 
+	if (!(vcpu->arch.shared->msr & MSR_SF))
+		mp_pa = (uint32_t)mp_pa;
+
 	/* Magic page override */
 	if (unlikely(mp_pa) &&
 	    unlikely(((gfn << PAGE_SHIFT) & KVM_PAM) ==
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 912e10f..dba282e 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -145,6 +145,21 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
 		}
 	}
 
+	/*
+	 * When switching from 32 to 64-bit, we may have a stale 32-bit
+	 * magic page around, we need to flush it. Typically 32-bit magic
+	 * page will be instanciated when calling into RTAS. Note: We
+	 * assume that such transition only happens while in kernel mode,
+	 * ie, we never transition from user 32-bit to kernel 64-bit with
+	 * a 32-bit magic page around.
+	 */
+	if (vcpu->arch.magic_page_pa &&
+	    !(old_msr & MSR_PR) && !(old_msr & MSR_SF) && (msr & MSR_SF)) {
+		/* going from RTAS to normal kernel code */
+		kvmppc_mmu_pte_flush(vcpu, (uint32_t)vcpu->arch.magic_page_pa,
+				     ~0xFFFUL);
+	}
+
 	/* Preload FPU if it's enabled */
 	if (vcpu->arch.shared->msr & MSR_FP)
 		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
@@ -252,6 +267,9 @@ static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
 	ulong mp_pa = vcpu->arch.magic_page_pa;
 
+	if (!(vcpu->arch.shared->msr & MSR_SF))
+		mp_pa = (uint32_t)mp_pa;
+
 	if (unlikely(mp_pa) &&
 	    unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) {
 		return 1;
-- 
cgit v0.10.2


From e9bda3b3d0ce775afe15eaf71922d342cc74991c Mon Sep 17 00:00:00 2001
From: Josh Triplett <josh@joshtriplett.org>
Date: Tue, 20 Mar 2012 23:33:51 -0700
Subject: KVM: VMX: Auto-load on CPUs with VMX

Enable x86 feature-based autoloading for the kvm-intel module on CPUs
with X86_FEATURE_VMX.

Signed-off-by: Josh Triplett <josh@joshtriplett.org>
Acked-By: Kay Sievers <kay@vrfy.org>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ad85adf..52f6856 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -27,6 +27,7 @@
 #include <linux/highmem.h>
 #include <linux/sched.h>
 #include <linux/moduleparam.h>
+#include <linux/mod_devicetable.h>
 #include <linux/ftrace_event.h>
 #include <linux/slab.h>
 #include <linux/tboot.h>
@@ -51,6 +52,12 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
+static const struct x86_cpu_id vmx_cpu_id[] = {
+	X86_FEATURE_MATCH(X86_FEATURE_VMX),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
+
 static bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
-- 
cgit v0.10.2


From c36fc04ef558c95cff46a8c89d2f804f217335f5 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@gnu.org>
Date: Thu, 8 Mar 2012 12:45:54 +0100
Subject: KVM: x86: add paging gcc optimization

Since most guests will have paging enabled for memory management, add likely() optimization
around CR0.PG checks.

Signed-off-by: Davidlohr Bueso <dave@gnu.org>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index cb80c29..3d1134d 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -64,7 +64,7 @@ static inline int is_pse(struct kvm_vcpu *vcpu)
 
 static inline int is_paging(struct kvm_vcpu *vcpu)
 {
-	return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
+	return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG));
 }
 
 static inline u32 bit(int bitno)
-- 
cgit v0.10.2


From 220f773a0013bf6fe2eefd9718ac7471f368fd8e Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Wed, 21 Mar 2012 23:49:39 +0900
Subject: KVM: MMU: Make pte_list_desc fit cache lines well

We have PTE_LIST_EXT + 1 pointers in this structure and these 40/20
bytes do not fit cache lines well.  Furthermore, some allocators may
use 64/32-byte objects for the pte_list_desc cache.

This patch solves this problem by changing PTE_LIST_EXT from 4 to 3.

For shadow paging, the new size is still large enough to hold both the
kernel and process mappings for usual anonymous pages.  For file
mappings, there may be a slight change in the cache usage.

Note: with EPT/NPT we almost always have a single spte in each reverse
mapping and we will not see any change by this.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index dc5f245..3213348 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -135,8 +135,6 @@ module_param(dbg, bool, 0644);
 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
 			| PT64_NX_MASK)
 
-#define PTE_LIST_EXT 4
-
 #define ACC_EXEC_MASK    1
 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
 #define ACC_USER_MASK    PT_USER_MASK
@@ -151,6 +149,9 @@ module_param(dbg, bool, 0644);
 
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
+/* make pte_list_desc fit well in cache line */
+#define PTE_LIST_EXT 3
+
 struct pte_list_desc {
 	u64 *sptes[PTE_LIST_EXT];
 	struct pte_list_desc *more;
-- 
cgit v0.10.2


From 1e3f42f03c38c29c1814199a6f0a2f01b919ea3f Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Wed, 21 Mar 2012 23:50:34 +0900
Subject: KVM: MMU: Improve iteration through sptes from rmap

Iteration using rmap_next(), the actual body is pte_list_next(), is
inefficient: every time we call it we start from checking whether rmap
holds a single spte or points to a descriptor which links more sptes.

In the case of shadow paging, this quadratic total iteration cost is a
problem.  Even for two dimensional paging, with EPT/NPT on, in which we
almost always have a single mapping, the extra checks at the end of the
iteration should be eliminated.

This patch fixes this by introducing rmap_iterator which keeps the
iteration context for the next search.  Furthermore the implementation
of rmap_next() is splitted into two functions, rmap_get_first() and
rmap_get_next(), to avoid repeatedly checking whether the rmap being
iterated on has only one spte.

Although there seemed to be only a slight change for EPT/NPT, the actual
improvement was significant: we observed that GET_DIRTY_LOG for 1GB
dirty memory became 15% faster than before.  This is probably because
the new code is easy to make branch predictions.

Note: we just remove pte_list_next() because we can think of parent_ptes
as a reverse mapping.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3213348..29ad6f9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -842,32 +842,6 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
 	return count;
 }
 
-static u64 *pte_list_next(unsigned long *pte_list, u64 *spte)
-{
-	struct pte_list_desc *desc;
-	u64 *prev_spte;
-	int i;
-
-	if (!*pte_list)
-		return NULL;
-	else if (!(*pte_list & 1)) {
-		if (!spte)
-			return (u64 *)*pte_list;
-		return NULL;
-	}
-	desc = (struct pte_list_desc *)(*pte_list & ~1ul);
-	prev_spte = NULL;
-	while (desc) {
-		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
-			if (prev_spte == spte)
-				return desc->sptes[i];
-			prev_spte = desc->sptes[i];
-		}
-		desc = desc->more;
-	}
-	return NULL;
-}
-
 static void
 pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
 			   int i, struct pte_list_desc *prev_desc)
@@ -988,11 +962,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 	return pte_list_add(vcpu, spte, rmapp);
 }
 
-static u64 *rmap_next(unsigned long *rmapp, u64 *spte)
-{
-	return pte_list_next(rmapp, spte);
-}
-
 static void rmap_remove(struct kvm *kvm, u64 *spte)
 {
 	struct kvm_mmu_page *sp;
@@ -1005,6 +974,67 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	pte_list_remove(spte, rmapp);
 }
 
+/*
+ * Used by the following functions to iterate through the sptes linked by a
+ * rmap.  All fields are private and not assumed to be used outside.
+ */
+struct rmap_iterator {
+	/* private fields */
+	struct pte_list_desc *desc;	/* holds the sptep if not NULL */
+	int pos;			/* index of the sptep */
+};
+
+/*
+ * Iteration must be started by this function.  This should also be used after
+ * removing/dropping sptes from the rmap link because in such cases the
+ * information in the itererator may not be valid.
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter)
+{
+	if (!rmap)
+		return NULL;
+
+	if (!(rmap & 1)) {
+		iter->desc = NULL;
+		return (u64 *)rmap;
+	}
+
+	iter->desc = (struct pte_list_desc *)(rmap & ~1ul);
+	iter->pos = 0;
+	return iter->desc->sptes[iter->pos];
+}
+
+/*
+ * Must be used with a valid iterator: e.g. after rmap_get_first().
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *rmap_get_next(struct rmap_iterator *iter)
+{
+	if (iter->desc) {
+		if (iter->pos < PTE_LIST_EXT - 1) {
+			u64 *sptep;
+
+			++iter->pos;
+			sptep = iter->desc->sptes[iter->pos];
+			if (sptep)
+				return sptep;
+		}
+
+		iter->desc = iter->desc->more;
+
+		if (iter->desc) {
+			iter->pos = 0;
+			/* desc->sptes[0] cannot be NULL */
+			return iter->desc->sptes[iter->pos];
+		}
+	}
+
+	return NULL;
+}
+
 static void drop_spte(struct kvm *kvm, u64 *sptep)
 {
 	if (mmu_spte_clear_track_bits(sptep))
@@ -1013,23 +1043,27 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 
 static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
 {
-	u64 *spte = NULL;
+	u64 *sptep;
+	struct rmap_iterator iter;
 	int write_protected = 0;
 
-	while ((spte = rmap_next(rmapp, spte))) {
-		BUG_ON(!(*spte & PT_PRESENT_MASK));
-		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
+	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
+		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+		rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
 
-		if (!is_writable_pte(*spte))
+		if (!is_writable_pte(*sptep)) {
+			sptep = rmap_get_next(&iter);
 			continue;
+		}
 
 		if (level == PT_PAGE_TABLE_LEVEL) {
-			mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
+			mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
+			sptep = rmap_get_next(&iter);
 		} else {
-			BUG_ON(!is_large_pte(*spte));
-			drop_spte(kvm, spte);
+			BUG_ON(!is_large_pte(*sptep));
+			drop_spte(kvm, sptep);
 			--kvm->stat.lpages;
-			spte = NULL;
+			sptep = rmap_get_first(*rmapp, &iter);
 		}
 
 		write_protected = 1;
@@ -1084,48 +1118,57 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			   unsigned long data)
 {
-	u64 *spte;
+	u64 *sptep;
+	struct rmap_iterator iter;
 	int need_tlb_flush = 0;
 
-	while ((spte = rmap_next(rmapp, NULL))) {
-		BUG_ON(!(*spte & PT_PRESENT_MASK));
-		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
-		drop_spte(kvm, spte);
+	while ((sptep = rmap_get_first(*rmapp, &iter))) {
+		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep);
+
+		drop_spte(kvm, sptep);
 		need_tlb_flush = 1;
 	}
+
 	return need_tlb_flush;
 }
 
 static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			     unsigned long data)
 {
+	u64 *sptep;
+	struct rmap_iterator iter;
 	int need_flush = 0;
-	u64 *spte, new_spte;
+	u64 new_spte;
 	pte_t *ptep = (pte_t *)data;
 	pfn_t new_pfn;
 
 	WARN_ON(pte_huge(*ptep));
 	new_pfn = pte_pfn(*ptep);
-	spte = rmap_next(rmapp, NULL);
-	while (spte) {
-		BUG_ON(!is_shadow_present_pte(*spte));
-		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
+
+	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
+		BUG_ON(!is_shadow_present_pte(*sptep));
+		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep);
+
 		need_flush = 1;
+
 		if (pte_write(*ptep)) {
-			drop_spte(kvm, spte);
-			spte = rmap_next(rmapp, NULL);
+			drop_spte(kvm, sptep);
+			sptep = rmap_get_first(*rmapp, &iter);
 		} else {
-			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
+			new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
 			new_spte |= (u64)new_pfn << PAGE_SHIFT;
 
 			new_spte &= ~PT_WRITABLE_MASK;
 			new_spte &= ~SPTE_HOST_WRITEABLE;
 			new_spte &= ~shadow_accessed_mask;
-			mmu_spte_clear_track_bits(spte);
-			mmu_spte_set(spte, new_spte);
-			spte = rmap_next(rmapp, spte);
+
+			mmu_spte_clear_track_bits(sptep);
+			mmu_spte_set(sptep, new_spte);
+			sptep = rmap_get_next(&iter);
 		}
 	}
+
 	if (need_flush)
 		kvm_flush_remote_tlbs(kvm);
 
@@ -1184,7 +1227,8 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			 unsigned long data)
 {
-	u64 *spte;
+	u64 *sptep;
+	struct rmap_iterator iter;
 	int young = 0;
 
 	/*
@@ -1197,25 +1241,24 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	if (!shadow_accessed_mask)
 		return kvm_unmap_rmapp(kvm, rmapp, data);
 
-	spte = rmap_next(rmapp, NULL);
-	while (spte) {
-		int _young;
-		u64 _spte = *spte;
-		BUG_ON(!(_spte & PT_PRESENT_MASK));
-		_young = _spte & PT_ACCESSED_MASK;
-		if (_young) {
+	for (sptep = rmap_get_first(*rmapp, &iter); sptep;
+	     sptep = rmap_get_next(&iter)) {
+		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+
+		if (*sptep & PT_ACCESSED_MASK) {
 			young = 1;
-			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
+			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep);
 		}
-		spte = rmap_next(rmapp, spte);
 	}
+
 	return young;
 }
 
 static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			      unsigned long data)
 {
-	u64 *spte;
+	u64 *sptep;
+	struct rmap_iterator iter;
 	int young = 0;
 
 	/*
@@ -1226,16 +1269,14 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	if (!shadow_accessed_mask)
 		goto out;
 
-	spte = rmap_next(rmapp, NULL);
-	while (spte) {
-		u64 _spte = *spte;
-		BUG_ON(!(_spte & PT_PRESENT_MASK));
-		young = _spte & PT_ACCESSED_MASK;
-		if (young) {
+	for (sptep = rmap_get_first(*rmapp, &iter); sptep;
+	     sptep = rmap_get_next(&iter)) {
+		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+
+		if (*sptep & PT_ACCESSED_MASK) {
 			young = 1;
 			break;
 		}
-		spte = rmap_next(rmapp, spte);
 	}
 out:
 	return young;
@@ -1887,10 +1928,11 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
 
 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-	u64 *parent_pte;
+	u64 *sptep;
+	struct rmap_iterator iter;
 
-	while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL)))
-		drop_parent_pte(sp, parent_pte);
+	while ((sptep = rmap_get_first(sp->parent_ptes, &iter)))
+		drop_parent_pte(sp, sptep);
 }
 
 static int mmu_zap_unsync_children(struct kvm *kvm,
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 715da5a..7d7d0b9 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -192,7 +192,8 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	struct kvm_memory_slot *slot;
 	unsigned long *rmapp;
-	u64 *spte;
+	u64 *sptep;
+	struct rmap_iterator iter;
 
 	if (sp->role.direct || sp->unsync || sp->role.invalid)
 		return;
@@ -200,13 +201,12 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 	slot = gfn_to_memslot(kvm, sp->gfn);
 	rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
 
-	spte = rmap_next(rmapp, NULL);
-	while (spte) {
-		if (is_writable_pte(*spte))
+	for (sptep = rmap_get_first(*rmapp, &iter); sptep;
+	     sptep = rmap_get_next(&iter)) {
+		if (is_writable_pte(*sptep))
 			audit_printk(kvm, "shadow page has writable "
 				     "mappings: gfn %llx role %x\n",
 				     sp->gfn, sp->role.word);
-		spte = rmap_next(rmapp, spte);
 	}
 }
 
-- 
cgit v0.10.2


From 9de6fe91afcdc38efe398a9d42014a7c920a64db Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Wed, 11 Apr 2012 15:27:52 -0500
Subject: KVM: PPC: add CPU_FTR_EMB_HV to CPU table

e6500 support (commit 10241842fbe900276634fee8d37ec48a7d8a762f,
"powerpc: Add initial e6500 cpu support" and the introduction of
CPU_FTR_EMB_HV (commit 73196cd364a2d972d73fa08da9d81ca3215bed68,
"KVM: PPC: e500mc support") collided during merge, leaving e6500's CPU
table entry missing CPU_FTR_EMB_HV.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index 67c34af..50d82c8 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -395,7 +395,7 @@ extern const char *powerpc_base_platform;
 #define CPU_FTRS_E6500	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | \
 	    CPU_FTR_L2CSR | CPU_FTR_LWSYNC | CPU_FTR_NOEXECUTE | \
 	    CPU_FTR_DBELL | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
-	    CPU_FTR_DEBUG_LVL_EXC)
+	    CPU_FTR_DEBUG_LVL_EXC | CPU_FTR_EMB_HV)
 #define CPU_FTRS_GENERIC_32	(CPU_FTR_COMMON | CPU_FTR_NODSISRALIGN)
 
 /* 64-bit CPUs */
-- 
cgit v0.10.2


From ae75954457eee0a608072368c5b477e40f378d7b Mon Sep 17 00:00:00 2001
From: Josh Triplett <josh@joshtriplett.org>
Date: Wed, 28 Mar 2012 11:32:28 -0700
Subject: KVM: SVM: Auto-load on CPUs with SVM

Enable x86 feature-based autoloading for the kvm-amd module on CPUs
with X86_FEATURE_SVM.

Signed-off-by: Josh Triplett <josh@joshtriplett.org>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f316720..f75af40 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -22,6 +22,7 @@
 #include "x86.h"
 
 #include <linux/module.h>
+#include <linux/mod_devicetable.h>
 #include <linux/kernel.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
@@ -42,6 +43,12 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
+static const struct x86_cpu_id svm_cpu_id[] = {
+	X86_FEATURE_MATCH(X86_FEATURE_SVM),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
+
 #define IOPM_ALLOC_ORDER 2
 #define MSRPM_ALLOC_ORDER 1
 
-- 
cgit v0.10.2


From 1c11b37669a5209bd11fb857a103634afef971e8 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 9 Apr 2012 18:39:59 +0300
Subject: KVM: x86 emulator: add support for vector alignment

x86 defines three classes of vector instructions: explicitly
aligned (#GP(0) if unaligned, explicitly unaligned, and default
(which depends on the encoding: AVX is unaligned, SSE is aligned).

Add support for marking an instruction as explicitly aligned or
unaligned, and mark MOVDQU as unaligned.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8375622..6302e5c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -142,6 +142,9 @@
 #define Src2FS      (OpFS << Src2Shift)
 #define Src2GS      (OpGS << Src2Shift)
 #define Src2Mask    (OpMask << Src2Shift)
+#define Aligned     ((u64)1 << 41)  /* Explicitly aligned (e.g. MOVDQA) */
+#define Unaligned   ((u64)1 << 42)  /* Explicitly unaligned (e.g. MOVDQU) */
+#define Avx         ((u64)1 << 43)  /* Advanced Vector Extensions */
 
 #define X2(x...) x, x
 #define X3(x...) X2(x), x
@@ -557,6 +560,29 @@ static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
 	ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg);
 }
 
+/*
+ * x86 defines three classes of vector instructions: explicitly
+ * aligned, explicitly unaligned, and the rest, which change behaviour
+ * depending on whether they're AVX encoded or not.
+ *
+ * Also included is CMPXCHG16B which is not a vector instruction, yet it is
+ * subject to the same check.
+ */
+static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size)
+{
+	if (likely(size < 16))
+		return false;
+
+	if (ctxt->d & Aligned)
+		return true;
+	else if (ctxt->d & Unaligned)
+		return false;
+	else if (ctxt->d & Avx)
+		return false;
+	else
+		return true;
+}
+
 static int __linearize(struct x86_emulate_ctxt *ctxt,
 		     struct segmented_address addr,
 		     unsigned size, bool write, bool fetch,
@@ -621,6 +647,8 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 	}
 	if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8)
 		la &= (u32)-1;
+	if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
+		return emulate_gp(ctxt, 0);
 	*linear = la;
 	return X86EMUL_CONTINUE;
 bad:
@@ -3415,7 +3443,7 @@ static struct opcode group11[] = {
 };
 
 static struct gprefix pfx_0f_6f_0f_7f = {
-	N, N, N, I(Sse, em_movdqu),
+	N, N, N, I(Sse | Unaligned, em_movdqu),
 };
 
 static struct opcode opcode_table[256] = {
-- 
cgit v0.10.2


From 49597d8116ad70aabb598e606b218ddd9315b0af Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Date: Mon, 9 Apr 2012 18:40:00 +0300
Subject: KVM: x86: emulate movdqa

An Ubuntu 9.10 Karmic Koala guest is unable to boot or install due to
missing movdqa emulation:

kvm_exit: reason EXCEPTION_NMI rip 0x7fef3e025a7b info 7fef3e799000 80000b0e
kvm_page_fault: address 7fef3e799000 error_code f
kvm_emulate_insn: 0:7fef3e025a7b: 66 0f 7f 07 (prot64)

movdqa %xmm0,(%rdi)

[avi: mark it explicitly aligned]

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 6302e5c..b160fb1f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2818,7 +2818,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
 
 static int em_mov(struct x86_emulate_ctxt *ctxt)
 {
-	ctxt->dst.val = ctxt->src.val;
+	memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes);
 	return X86EMUL_CONTINUE;
 }
 
@@ -2898,12 +2898,6 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
 	return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
 }
 
-static int em_movdqu(struct x86_emulate_ctxt *ctxt)
-{
-	memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes);
-	return X86EMUL_CONTINUE;
-}
-
 static int em_invlpg(struct x86_emulate_ctxt *ctxt)
 {
 	int rc;
@@ -3443,7 +3437,7 @@ static struct opcode group11[] = {
 };
 
 static struct gprefix pfx_0f_6f_0f_7f = {
-	N, N, N, I(Sse | Unaligned, em_movdqu),
+	N, I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
 };
 
 static struct opcode opcode_table[256] = {
-- 
cgit v0.10.2


From 3e114eb4db3a33141b8c91bb53dae9ba6b015a32 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 9 Apr 2012 18:40:01 +0300
Subject: KVM: x86 emulator: implement movntps

Used to write to framebuffers (by at least Icaros).

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b160fb1f..fb39e0b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3440,6 +3440,10 @@ static struct gprefix pfx_0f_6f_0f_7f = {
 	N, I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
 };
 
+static struct gprefix pfx_vmovntpx = {
+	I(0, em_mov), N, N, N,
+};
+
 static struct opcode opcode_table[256] = {
 	/* 0x00 - 0x07 */
 	I6ALU(Lock, em_add),
@@ -3571,7 +3575,8 @@ static struct opcode twobyte_table[256] = {
 	IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
 	IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
 	N, N, N, N,
-	N, N, N, N, N, N, N, N,
+	N, N, N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx),
+	N, N, N, N,
 	/* 0x30 - 0x3F */
 	II(ImplicitOps | Priv, em_wrmsr, wrmsr),
 	IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
-- 
cgit v0.10.2


From cbe2c9d30aa69b0551247ddb0fb450b6e8080ec4 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 9 Apr 2012 18:40:02 +0300
Subject: KVM: x86 emulator: MMX support

General support for the MMX instruction set.  Special care is taken
to trap pending x87 exceptions so that they are properly reflected
to the guest.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index c222e1a..1ac46c22 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -200,7 +200,7 @@ typedef u32 __attribute__((vector_size(16))) sse128_t;
 
 /* Type, address-of, and value of an instruction's operand. */
 struct operand {
-	enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_NONE } type;
+	enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_MM, OP_NONE } type;
 	unsigned int bytes;
 	union {
 		unsigned long orig_val;
@@ -213,12 +213,14 @@ struct operand {
 			unsigned seg;
 		} mem;
 		unsigned xmm;
+		unsigned mm;
 	} addr;
 	union {
 		unsigned long val;
 		u64 val64;
 		char valptr[sizeof(unsigned long) + 2];
 		sse128_t vec_val;
+		u64 mm_val;
 	};
 };
 
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index fb39e0b..0011b4a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -142,6 +142,7 @@
 #define Src2FS      (OpFS << Src2Shift)
 #define Src2GS      (OpGS << Src2Shift)
 #define Src2Mask    (OpMask << Src2Shift)
+#define Mmx         ((u64)1 << 40)  /* MMX Vector instruction */
 #define Aligned     ((u64)1 << 41)  /* Explicitly aligned (e.g. MOVDQA) */
 #define Unaligned   ((u64)1 << 42)  /* Explicitly unaligned (e.g. MOVDQU) */
 #define Avx         ((u64)1 << 43)  /* Advanced Vector Extensions */
@@ -887,6 +888,40 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
 	ctxt->ops->put_fpu(ctxt);
 }
 
+static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
+{
+	ctxt->ops->get_fpu(ctxt);
+	switch (reg) {
+	case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
+	case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
+	case 2: asm("movq %%mm2, %0" : "=m"(*data)); break;
+	case 3: asm("movq %%mm3, %0" : "=m"(*data)); break;
+	case 4: asm("movq %%mm4, %0" : "=m"(*data)); break;
+	case 5: asm("movq %%mm5, %0" : "=m"(*data)); break;
+	case 6: asm("movq %%mm6, %0" : "=m"(*data)); break;
+	case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;
+	default: BUG();
+	}
+	ctxt->ops->put_fpu(ctxt);
+}
+
+static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
+{
+	ctxt->ops->get_fpu(ctxt);
+	switch (reg) {
+	case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
+	case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
+	case 2: asm("movq %0, %%mm2" : : "m"(*data)); break;
+	case 3: asm("movq %0, %%mm3" : : "m"(*data)); break;
+	case 4: asm("movq %0, %%mm4" : : "m"(*data)); break;
+	case 5: asm("movq %0, %%mm5" : : "m"(*data)); break;
+	case 6: asm("movq %0, %%mm6" : : "m"(*data)); break;
+	case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;
+	default: BUG();
+	}
+	ctxt->ops->put_fpu(ctxt);
+}
+
 static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 				    struct operand *op)
 {
@@ -903,6 +938,13 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 		read_sse_reg(ctxt, &op->vec_val, reg);
 		return;
 	}
+	if (ctxt->d & Mmx) {
+		reg &= 7;
+		op->type = OP_MM;
+		op->bytes = 8;
+		op->addr.mm = reg;
+		return;
+	}
 
 	op->type = OP_REG;
 	if (ctxt->d & ByteOp) {
@@ -948,6 +990,12 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 			read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm);
 			return rc;
 		}
+		if (ctxt->d & Mmx) {
+			op->type = OP_MM;
+			op->bytes = 8;
+			op->addr.xmm = ctxt->modrm_rm & 7;
+			return rc;
+		}
 		fetch_register_operand(op);
 		return rc;
 	}
@@ -1415,6 +1463,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
 	case OP_XMM:
 		write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
 		break;
+	case OP_MM:
+		write_mmx_reg(ctxt, &ctxt->dst.mm_val, ctxt->dst.addr.mm);
+		break;
 	case OP_NONE:
 		/* no writeback */
 		break;
@@ -3987,6 +4038,8 @@ done_prefixes:
 
 	if (ctxt->d & Sse)
 		ctxt->op_bytes = 16;
+	else if (ctxt->d & Mmx)
+		ctxt->op_bytes = 8;
 
 	/* ModRM and SIB bytes. */
 	if (ctxt->d & ModRM) {
@@ -4057,6 +4110,35 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
 	return false;
 }
 
+static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
+{
+	bool fault = false;
+
+	ctxt->ops->get_fpu(ctxt);
+	asm volatile("1: fwait \n\t"
+		     "2: \n\t"
+		     ".pushsection .fixup,\"ax\" \n\t"
+		     "3: \n\t"
+		     "movb $1, %[fault] \n\t"
+		     "jmp 2b \n\t"
+		     ".popsection \n\t"
+		     _ASM_EXTABLE(1b, 3b)
+		     : [fault]"+rm"(fault));
+	ctxt->ops->put_fpu(ctxt);
+
+	if (unlikely(fault))
+		return emulate_exception(ctxt, MF_VECTOR, 0, false);
+
+	return X86EMUL_CONTINUE;
+}
+
+static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
+				       struct operand *op)
+{
+	if (op->type == OP_MM)
+		read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
+}
+
 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 {
 	struct x86_emulate_ops *ops = ctxt->ops;
@@ -4081,18 +4163,31 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 		goto done;
 	}
 
-	if ((ctxt->d & Sse)
-	    && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)
-		|| !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
+	if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)))
+	    || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
 		rc = emulate_ud(ctxt);
 		goto done;
 	}
 
-	if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
+	if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
 		rc = emulate_nm(ctxt);
 		goto done;
 	}
 
+	if (ctxt->d & Mmx) {
+		rc = flush_pending_x87_faults(ctxt);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
+		/*
+		 * Now that we know the fpu is exception safe, we can fetch
+		 * operands from it.
+		 */
+		fetch_possible_mmx_operand(ctxt, &ctxt->src);
+		fetch_possible_mmx_operand(ctxt, &ctxt->src2);
+		if (!(ctxt->d & Mov))
+			fetch_possible_mmx_operand(ctxt, &ctxt->dst);
+	}
+
 	if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
 		rc = emulator_check_intercept(ctxt, ctxt->intercept,
 					      X86_ICPT_PRE_EXCEPT);
-- 
cgit v0.10.2


From e59717550e5cf0e7159c5b7af1d1ead35fef49dd Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 9 Apr 2012 18:40:03 +0300
Subject: KVM: x86 emulator: implement MMX MOVQ (opcodes 0f 6f, 0f 7f)

Needed by some framebuffer drivers.  See

https://bugzilla.kernel.org/show_bug.cgi?id=42779

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0011b4a..d5729a9 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3488,7 +3488,7 @@ static struct opcode group11[] = {
 };
 
 static struct gprefix pfx_0f_6f_0f_7f = {
-	N, I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
+	I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
 };
 
 static struct gprefix pfx_vmovntpx = {
-- 
cgit v0.10.2


From a0c9a822bf37e6282eb6006b407ec5aec22e08fb Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Wed, 11 Apr 2012 18:49:55 +0300
Subject: KVM: dont clear TMR on EOI

Intel spec says that TMR needs to be set/cleared
when IRR is set, but kvm also clears it on  EOI.

I did some tests on a real (AMD based) system,
and I see same TMR values both before
and after EOI, so I think it's a minor bug in kvm.

This patch fixes TMR to be set/cleared on IRR set
only as per spec.

And now that we don't clear TMR, we can save
an atomic read of TMR on EOI that's not propagated
to ioapic, by checking whether ioapic needs
a specific vector first and calculating
the mode afterwards.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 8584322..992b4ea 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -92,6 +92,11 @@ static inline int apic_test_and_clear_vector(int vec, void *bitmap)
 	return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
 }
 
+static inline int apic_test_vector(int vec, void *bitmap)
+{
+	return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
 static inline void apic_set_vector(int vec, void *bitmap)
 {
 	set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -480,7 +485,6 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
 static void apic_set_eoi(struct kvm_lapic *apic)
 {
 	int vector = apic_find_highest_isr(apic);
-	int trigger_mode;
 	/*
 	 * Not every write EOI will has corresponding ISR,
 	 * one example is when Kernel check timer on setup_IO_APIC
@@ -491,12 +495,15 @@ static void apic_set_eoi(struct kvm_lapic *apic)
 	apic_clear_vector(vector, apic->regs + APIC_ISR);
 	apic_update_ppr(apic);
 
-	if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
-		trigger_mode = IOAPIC_LEVEL_TRIG;
-	else
-		trigger_mode = IOAPIC_EDGE_TRIG;
-	if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
+	if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
+	    kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
+		int trigger_mode;
+		if (apic_test_vector(vector, apic->regs + APIC_TMR))
+			trigger_mode = IOAPIC_LEVEL_TRIG;
+		else
+			trigger_mode = IOAPIC_EDGE_TRIG;
 		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+	}
 	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
 }
 
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index dcaf272c26..26fd54d 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -254,13 +254,17 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
 	}
 }
 
+bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
+{
+	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+	smp_rmb();
+	return test_bit(vector, ioapic->handled_vectors);
+}
+
 void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
 {
 	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
 
-	smp_rmb();
-	if (!test_bit(vector, ioapic->handled_vectors))
-		return;
 	spin_lock(&ioapic->lock);
 	__kvm_ioapic_update_eoi(ioapic, vector, trigger_mode);
 	spin_unlock(&ioapic->lock);
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 0b190c3..32872a0 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -71,6 +71,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 		int short_hand, int dest, int dest_mode);
 int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
 void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
+bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);
 int kvm_ioapic_init(struct kvm *kvm);
 void kvm_ioapic_destroy(struct kvm *kvm);
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
-- 
cgit v0.10.2


From f71fa31f9f7ac33cba12b8897983f950ad2c7a5b Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@gnu.org>
Date: Wed, 18 Apr 2012 12:24:29 +0200
Subject: KVM: MMU: use page table level macro

Its much cleaner to use PT_PAGE_TABLE_LEVEL than its numeric value.

Signed-off-by: Davidlohr Bueso <dave@gnu.org>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 29ad6f9..07424cf 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3618,7 +3618,7 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp)
 	 * Skip write-flooding detected for the sp whose level is 1, because
 	 * it can become unsync, then the guest page is not write-protected.
 	 */
-	if (sp->role.level == 1)
+	if (sp->role.level == PT_PAGE_TABLE_LEVEL)
 		return false;
 
 	return ++sp->write_flooding_count >= 3;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index df5a703..34f9709 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -658,7 +658,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
 {
 	int offset = 0;
 
-	WARN_ON(sp->role.level != 1);
+	WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
 
 	if (PTTYPE == 32)
 		offset = sp->role.quadrant << PT64_LEVEL_BITS;
-- 
cgit v0.10.2


From f78146b0f9230765c6315b2e14f56112513389ad Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 18 Apr 2012 19:22:47 +0300
Subject: KVM: Fix page-crossing MMIO

MMIO that are split across a page boundary are currently broken - the
code does not expect to be aborted by the exit to userspace for the
first MMIO fragment.

This patch fixes the problem by generalizing the current code for handling
16-byte MMIOs to handle a number of "fragments", and changes the MMIO
code to create those fragments.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index c4b4bac..6d6a5ac 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -449,6 +449,8 @@ struct kvm_vcpu_arch {
 	char log_buf[VMM_LOG_LEN];
 	union context host;
 	union context guest;
+
+	char mmio_data[8];
 };
 
 struct kvm_vm_stat {
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 9d80ff8..882ab21 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -232,12 +232,12 @@ static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if ((p->addr & PAGE_MASK) == IOAPIC_DEFAULT_BASE_ADDRESS)
 		goto mmio;
 	vcpu->mmio_needed = 1;
-	vcpu->mmio_phys_addr = kvm_run->mmio.phys_addr = p->addr;
-	vcpu->mmio_size = kvm_run->mmio.len = p->size;
+	vcpu->mmio_fragments[0].gpa = kvm_run->mmio.phys_addr = p->addr;
+	vcpu->mmio_fragments[0].len = kvm_run->mmio.len = p->size;
 	vcpu->mmio_is_write = kvm_run->mmio.is_write = !p->dir;
 
 	if (vcpu->mmio_is_write)
-		memcpy(vcpu->mmio_data, &p->data, p->size);
+		memcpy(vcpu->arch.mmio_data, &p->data, p->size);
 	memcpy(kvm_run->mmio.data, &p->data, p->size);
 	kvm_run->exit_reason = KVM_EXIT_MMIO;
 	return 0;
@@ -719,7 +719,7 @@ static void kvm_set_mmio_data(struct kvm_vcpu *vcpu)
 	struct kvm_mmio_req *p = kvm_get_vcpu_ioreq(vcpu);
 
 	if (!vcpu->mmio_is_write)
-		memcpy(&p->data, vcpu->mmio_data, 8);
+		memcpy(&p->data, vcpu->arch.mmio_data, 8);
 	p->state = STATE_IORESP_READY;
 }
 
@@ -739,7 +739,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	}
 
 	if (vcpu->mmio_needed) {
-		memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
+		memcpy(vcpu->arch.mmio_data, kvm_run->mmio.data, 8);
 		kvm_set_mmio_data(vcpu);
 		vcpu->mmio_read_completed = 1;
 		vcpu->mmio_needed = 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0d9a578..4de705c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3718,9 +3718,8 @@ struct read_write_emulator_ops {
 static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
 {
 	if (vcpu->mmio_read_completed) {
-		memcpy(val, vcpu->mmio_data, bytes);
 		trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
-			       vcpu->mmio_phys_addr, *(u64 *)val);
+			       vcpu->mmio_fragments[0].gpa, *(u64 *)val);
 		vcpu->mmio_read_completed = 0;
 		return 1;
 	}
@@ -3756,8 +3755,9 @@ static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
 static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
 			   void *val, int bytes)
 {
-	memcpy(vcpu->mmio_data, val, bytes);
-	memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
+	struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
+
+	memcpy(vcpu->run->mmio.data, frag->data, frag->len);
 	return X86EMUL_CONTINUE;
 }
 
@@ -3784,10 +3784,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
 	gpa_t gpa;
 	int handled, ret;
 	bool write = ops->write;
-
-	if (ops->read_write_prepare &&
-		  ops->read_write_prepare(vcpu, val, bytes))
-		return X86EMUL_CONTINUE;
+	struct kvm_mmio_fragment *frag;
 
 	ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
 
@@ -3813,15 +3810,19 @@ mmio:
 	bytes -= handled;
 	val += handled;
 
-	vcpu->mmio_needed = 1;
-	vcpu->run->exit_reason = KVM_EXIT_MMIO;
-	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
-	vcpu->mmio_size = bytes;
-	vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
-	vcpu->run->mmio.is_write = vcpu->mmio_is_write = write;
-	vcpu->mmio_index = 0;
+	while (bytes) {
+		unsigned now = min(bytes, 8U);
 
-	return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
+		frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
+		frag->gpa = gpa;
+		frag->data = val;
+		frag->len = now;
+
+		gpa += now;
+		val += now;
+		bytes -= now;
+	}
+	return X86EMUL_CONTINUE;
 }
 
 int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
@@ -3830,10 +3831,18 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
 			struct read_write_emulator_ops *ops)
 {
 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+	gpa_t gpa;
+	int rc;
+
+	if (ops->read_write_prepare &&
+		  ops->read_write_prepare(vcpu, val, bytes))
+		return X86EMUL_CONTINUE;
+
+	vcpu->mmio_nr_fragments = 0;
 
 	/* Crossing a page boundary? */
 	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
-		int rc, now;
+		int now;
 
 		now = -addr & ~PAGE_MASK;
 		rc = emulator_read_write_onepage(addr, val, now, exception,
@@ -3846,8 +3855,25 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
 		bytes -= now;
 	}
 
-	return emulator_read_write_onepage(addr, val, bytes, exception,
-					   vcpu, ops);
+	rc = emulator_read_write_onepage(addr, val, bytes, exception,
+					 vcpu, ops);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	if (!vcpu->mmio_nr_fragments)
+		return rc;
+
+	gpa = vcpu->mmio_fragments[0].gpa;
+
+	vcpu->mmio_needed = 1;
+	vcpu->mmio_cur_fragment = 0;
+
+	vcpu->run->mmio.len = vcpu->mmio_fragments[0].len;
+	vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
+	vcpu->run->exit_reason = KVM_EXIT_MMIO;
+	vcpu->run->mmio.phys_addr = gpa;
+
+	return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
 }
 
 static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
@@ -5446,33 +5472,55 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 	return r;
 }
 
+/*
+ * Implements the following, as a state machine:
+ *
+ * read:
+ *   for each fragment
+ *     write gpa, len
+ *     exit
+ *     copy data
+ *   execute insn
+ *
+ * write:
+ *   for each fragment
+ *      write gpa, len
+ *      copy data
+ *      exit
+ */
 static int complete_mmio(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
+	struct kvm_mmio_fragment *frag;
 	int r;
 
 	if (!(vcpu->arch.pio.count || vcpu->mmio_needed))
 		return 1;
 
 	if (vcpu->mmio_needed) {
-		vcpu->mmio_needed = 0;
+		/* Complete previous fragment */
+		frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
 		if (!vcpu->mmio_is_write)
-			memcpy(vcpu->mmio_data + vcpu->mmio_index,
-			       run->mmio.data, 8);
-		vcpu->mmio_index += 8;
-		if (vcpu->mmio_index < vcpu->mmio_size) {
-			run->exit_reason = KVM_EXIT_MMIO;
-			run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index;
-			memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8);
-			run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8);
-			run->mmio.is_write = vcpu->mmio_is_write;
-			vcpu->mmio_needed = 1;
-			return 0;
+			memcpy(frag->data, run->mmio.data, frag->len);
+		if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
+			vcpu->mmio_needed = 0;
+			if (vcpu->mmio_is_write)
+				return 1;
+			vcpu->mmio_read_completed = 1;
+			goto done;
 		}
+		/* Initiate next fragment */
+		++frag;
+		run->exit_reason = KVM_EXIT_MMIO;
+		run->mmio.phys_addr = frag->gpa;
 		if (vcpu->mmio_is_write)
-			return 1;
-		vcpu->mmio_read_completed = 1;
+			memcpy(run->mmio.data, frag->data, frag->len);
+		run->mmio.len = frag->len;
+		run->mmio.is_write = vcpu->mmio_is_write;
+		return 0;
+
 	}
+done:
 	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 	r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a2d00b1..186ffab 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -35,6 +35,20 @@
 #endif
 
 /*
+ * If we support unaligned MMIO, at most one fragment will be split into two:
+ */
+#ifdef KVM_UNALIGNED_MMIO
+#  define KVM_EXTRA_MMIO_FRAGMENTS 1
+#else
+#  define KVM_EXTRA_MMIO_FRAGMENTS 0
+#endif
+
+#define KVM_USER_MMIO_SIZE 8
+
+#define KVM_MAX_MMIO_FRAGMENTS \
+	(KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS)
+
+/*
  * vcpu->requests bit members
  */
 #define KVM_REQ_TLB_FLUSH          0
@@ -117,6 +131,16 @@ enum {
 	EXITING_GUEST_MODE
 };
 
+/*
+ * Sometimes a large or cross-page mmio needs to be broken up into separate
+ * exits for userspace servicing.
+ */
+struct kvm_mmio_fragment {
+	gpa_t gpa;
+	void *data;
+	unsigned len;
+};
+
 struct kvm_vcpu {
 	struct kvm *kvm;
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -144,10 +168,9 @@ struct kvm_vcpu {
 	int mmio_needed;
 	int mmio_read_completed;
 	int mmio_is_write;
-	int mmio_size;
-	int mmio_index;
-	unsigned char mmio_data[KVM_MMIO_SIZE];
-	gpa_t mmio_phys_addr;
+	int mmio_cur_fragment;
+	int mmio_nr_fragments;
+	struct kvm_mmio_fragment mmio_fragments[KVM_MAX_MMIO_FRAGMENTS];
 #endif
 
 #ifdef CONFIG_KVM_ASYNC_PF
-- 
cgit v0.10.2


From 8281715b4109b5ee26032ff7b77c0d575c4150f7 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 18 Apr 2012 19:23:50 +0300
Subject: KVM: ia64: fix build due to typo

s/kcm/kvm/.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 882ab21..bd77cb5 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1174,7 +1174,7 @@ out:
 
 bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
 {
-	return irqchip_in_kernel(vcpu->kcm) == (vcpu->arch.apic != NULL);
+	return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
 }
 
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
-- 
cgit v0.10.2


From 1f15d10984c854e077da5aa1a23f901496b49773 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Fri, 20 Apr 2012 18:21:46 -0300
Subject: KVM: add kvm_arch_para_features stub to asm-generic/kvm_para.h

Needed by kvm_para_has_feature().

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/include/asm-generic/kvm_para.h b/include/asm-generic/kvm_para.h
index 05ef7e7..9a7bbad 100644
--- a/include/asm-generic/kvm_para.h
+++ b/include/asm-generic/kvm_para.h
@@ -11,4 +11,9 @@ static inline bool kvm_check_and_clear_guest_paused(void)
 	return false;
 }
 
+static inline unsigned int kvm_arch_para_features(void)
+{
+	return 0;
+}
+
 #endif
-- 
cgit v0.10.2


From 07975ad3b30579ca27d880491ad992326b930c63 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Thu, 29 Mar 2012 21:14:12 +0200
Subject: KVM: Introduce direct MSI message injection for in-kernel irqchips

Currently, MSI messages can only be injected to in-kernel irqchips by
defining a corresponding IRQ route for each message. This is not only
unhandy if the MSI messages are generated "on the fly" by user space,
IRQ routes are a limited resource that user space has to manage
carefully.

By providing a direct injection path, we can both avoid using up limited
resources and simplify the necessary steps for user land.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 81ff39f..a155221 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1689,6 +1689,27 @@ where the guest will clear the flag: when the soft lockup watchdog timer resets
 itself or when a soft lockup is detected.  This ioctl can be called any time
 after pausing the vcpu, but before it is resumed.
 
+4.71 KVM_SIGNAL_MSI
+
+Capability: KVM_CAP_SIGNAL_MSI
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_msi (in)
+Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error
+
+Directly inject a MSI message. Only valid with in-kernel irqchip that handles
+MSI messages.
+
+struct kvm_msi {
+	__u32 address_lo;
+	__u32 address_hi;
+	__u32 data;
+	__u32 flags;
+	__u8  pad[16];
+};
+
+No flags are defined so far. The corresponding field must be 0.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 1a7fe86..a28f338 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -36,6 +36,7 @@ config KVM
 	select TASKSTATS
 	select TASK_DELAY_ACCT
 	select PERF_EVENTS
+	select HAVE_KVM_MSI
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions.  You will need a fairly recent
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 7a9dd4b..225b452 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -590,6 +590,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_SYNC_REGS 74
 #define KVM_CAP_PCI_2_3 75
 #define KVM_CAP_KVMCLOCK_CTRL 76
+#define KVM_CAP_SIGNAL_MSI 77
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -715,6 +716,14 @@ struct kvm_one_reg {
 	__u64 addr;
 };
 
+struct kvm_msi {
+	__u32 address_lo;
+	__u32 address_hi;
+	__u32 data;
+	__u32 flags;
+	__u8  pad[16];
+};
+
 /*
  * ioctls for VM fds
  */
@@ -789,6 +798,8 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_PCI_2_3 */
 #define KVM_ASSIGN_SET_INTX_MASK  _IOW(KVMIO,  0xa4, \
 				       struct kvm_assigned_pci_dev)
+/* Available with KVM_CAP_SIGNAL_MSI */
+#define KVM_SIGNAL_MSI            _IOW(KVMIO,  0xa5, struct kvm_msi)
 
 /*
  * ioctls for vcpu fds
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 186ffab..6f34330 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -802,6 +802,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
 			unsigned flags);
 void kvm_free_irq_routing(struct kvm *kvm);
 
+int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
+
 #else
 
 static inline void kvm_free_irq_routing(struct kvm *kvm) {}
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index f63ccb0..28694f4 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -18,3 +18,6 @@ config KVM_MMIO
 
 config KVM_ASYNC_PF
        bool
+
+config HAVE_KVM_MSI
+       bool
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 9f614b4..a6a0365 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -138,6 +138,20 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 	return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
 }
 
+int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+	struct kvm_kernel_irq_routing_entry route;
+
+	if (!irqchip_in_kernel(kvm) || msi->flags != 0)
+		return -EINVAL;
+
+	route.msi.address_lo = msi->address_lo;
+	route.msi.address_hi = msi->address_hi;
+	route.msi.data = msi->data;
+
+	return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1);
+}
+
 /*
  * Return value:
  *  < 0   Interrupt was ignored (masked or not delivered for other reasons)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9eb7936..1847c76 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2060,6 +2060,17 @@ static long kvm_vm_ioctl(struct file *filp,
 		mutex_unlock(&kvm->lock);
 		break;
 #endif
+#ifdef CONFIG_HAVE_KVM_MSI
+	case KVM_SIGNAL_MSI: {
+		struct kvm_msi msi;
+
+		r = -EFAULT;
+		if (copy_from_user(&msi, argp, sizeof msi))
+			goto out;
+		r = kvm_send_userspace_msi(kvm, &msi);
+		break;
+	}
+#endif
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 		if (r == -ENOTTY)
@@ -2188,6 +2199,9 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)
 	case KVM_CAP_SET_BOOT_CPU_ID:
 #endif
 	case KVM_CAP_INTERNAL_ERROR_DATA:
+#ifdef CONFIG_HAVE_KVM_MSI
+	case KVM_CAP_SIGNAL_MSI:
+#endif
 		return 1;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
 	case KVM_CAP_IRQ_ROUTING:
-- 
cgit v0.10.2


From 413837714232b3a4c0705e915d8af75ad521d083 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 19 Apr 2012 14:06:29 +0300
Subject: KVM: Introduce bitmask for apic attention reasons

The patch introduces a bitmap that will hold reasons apic should be
checked during vmexit. This is in a preparation for vp eoi patch
that will add one more check on vmexit. With the bitmap we can do
if(apic_attention) to check everything simultaneously which will
add zero overhead on the fast path.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f624ca7..69e39bc 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -172,6 +172,9 @@ enum {
 #define DR7_FIXED_1	0x00000400
 #define DR7_VOLATILE	0xffff23ff
 
+/* apic attention bits */
+#define KVM_APIC_CHECK_VAPIC	0
+
 /*
  * We don't want allocation failures within the mmu code, so we preallocate
  * enough memory for a single page fault in a cache.
@@ -337,6 +340,7 @@ struct kvm_vcpu_arch {
 	u64 efer;
 	u64 apic_base;
 	struct kvm_lapic *apic;    /* kernel irqchip context */
+	unsigned long apic_attention;
 	int32_t apic_arb_prio;
 	int mp_state;
 	int sipi_vector;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 992b4ea..93c1574 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1088,6 +1088,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 	apic_update_ppr(apic);
 
 	vcpu->arch.apic_arb_prio = 0;
+	vcpu->arch.apic_attention = 0;
 
 	apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
 		   "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
@@ -1287,7 +1288,7 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
 	u32 data;
 	void *vapic;
 
-	if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+	if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
 		return;
 
 	vapic = kmap_atomic(vcpu->arch.apic->vapic_page);
@@ -1304,7 +1305,7 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
 	struct kvm_lapic *apic;
 	void *vapic;
 
-	if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+	if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
 		return;
 
 	apic = vcpu->arch.apic;
@@ -1324,10 +1325,11 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
 
 void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
 {
-	if (!irqchip_in_kernel(vcpu->kvm))
-		return;
-
 	vcpu->arch.apic->vapic_addr = vapic_addr;
+	if (vapic_addr)
+		__set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
+	else
+		__clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
 }
 
 int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-- 
cgit v0.10.2


From 38e8a2ddc9ada5dd1f2def95bebb733bf619bbef Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 22 Apr 2012 15:12:50 +0300
Subject: KVM: x86 emulator: fix asm constraint in flush_pending_x87_faults

'bool' wants 8-bit registers.

Reported-by: Takuya Yoshikawa <takuya.yoshikawa@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d5729a9..0d151e2 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4123,7 +4123,7 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
 		     "jmp 2b \n\t"
 		     ".popsection \n\t"
 		     _ASM_EXTABLE(1b, 3b)
-		     : [fault]"+rm"(fault));
+		     : [fault]"+qm"(fault));
 	ctxt->ops->put_fpu(ctxt);
 
 	if (unlikely(fault))
-- 
cgit v0.10.2


From 414fa985f91bdf61fe2baf8111c0ddbdb94808ea Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 24 Apr 2012 16:40:15 +0200
Subject: KVM: Improve readability of KVM API doc

This helps to identify sections and it also fixes the numbering from
4.54 to 4.61.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index a155221..b48f927 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2,6 +2,7 @@ The Definitive KVM (Kernel-based Virtual Machine) API Documentation
 ===================================================================
 
 1. General description
+----------------------
 
 The kvm API is a set of ioctls that are issued to control various aspects
 of a virtual machine.  The ioctls belong to three classes
@@ -23,7 +24,9 @@ of a virtual machine.  The ioctls belong to three classes
    Only run vcpu ioctls from the same thread that was used to create the
    vcpu.
 
+
 2. File descriptors
+-------------------
 
 The kvm API is centered around file descriptors.  An initial
 open("/dev/kvm") obtains a handle to the kvm subsystem; this handle
@@ -41,7 +44,9 @@ not cause harm to the host, their actual behavior is not guaranteed by
 the API.  The only supported use is one virtual machine per process,
 and one vcpu per thread.
 
+
 3. Extensions
+-------------
 
 As of Linux 2.6.22, the KVM ABI has been stabilized: no backward
 incompatible change are allowed.  However, there is an extension
@@ -53,7 +58,9 @@ Instead, kvm defines extension identifiers and a facility to query
 whether a particular extension identifier is available.  If it is, a
 set of ioctls is available for application use.
 
+
 4. API description
+------------------
 
 This section describes ioctls that can be used to control kvm guests.
 For each ioctl, the following information is provided along with a
@@ -75,6 +82,7 @@ description:
   Returns: the return value.  General error numbers (EBADF, ENOMEM, EINVAL)
       are not detailed, but errors with specific meanings are.
 
+
 4.1 KVM_GET_API_VERSION
 
 Capability: basic
@@ -90,6 +98,7 @@ supported.  Applications should refuse to run if KVM_GET_API_VERSION
 returns a value other than 12.  If this check passes, all ioctls
 described as 'basic' will be available.
 
+
 4.2 KVM_CREATE_VM
 
 Capability: basic
@@ -109,6 +118,7 @@ In order to create user controlled virtual machines on S390, check
 KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as
 privileged user (CAP_SYS_ADMIN).
 
+
 4.3 KVM_GET_MSR_INDEX_LIST
 
 Capability: basic
@@ -135,6 +145,7 @@ Note: if kvm indicates supports MCE (KVM_CAP_MCE), then the MCE bank MSRs are
 not returned in the MSR list, as different vcpus can have a different number
 of banks, as set via the KVM_X86_SETUP_MCE ioctl.
 
+
 4.4 KVM_CHECK_EXTENSION
 
 Capability: basic
@@ -149,6 +160,7 @@ receives an integer that describes the extension availability.
 Generally 0 means no and 1 means yes, but some extensions may report
 additional information in the integer return value.
 
+
 4.5 KVM_GET_VCPU_MMAP_SIZE
 
 Capability: basic
@@ -161,6 +173,7 @@ The KVM_RUN ioctl (cf.) communicates with userspace via a shared
 memory region.  This ioctl returns the size of that region.  See the
 KVM_RUN documentation for details.
 
+
 4.6 KVM_SET_MEMORY_REGION
 
 Capability: basic
@@ -171,6 +184,7 @@ Returns: 0 on success, -1 on error
 
 This ioctl is obsolete and has been removed.
 
+
 4.7 KVM_CREATE_VCPU
 
 Capability: basic
@@ -223,6 +237,7 @@ machines, the resulting vcpu fd can be memory mapped at page offset
 KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual
 cpu's hardware control block.
 
+
 4.8 KVM_GET_DIRTY_LOG (vm ioctl)
 
 Capability: basic
@@ -246,6 +261,7 @@ since the last call to this ioctl.  Bit 0 is the first page in the
 memory slot.  Ensure the entire structure is cleared to avoid padding
 issues.
 
+
 4.9 KVM_SET_MEMORY_ALIAS
 
 Capability: basic
@@ -256,6 +272,7 @@ Returns: 0 (success), -1 (error)
 
 This ioctl is obsolete and has been removed.
 
+
 4.10 KVM_RUN
 
 Capability: basic
@@ -272,6 +289,7 @@ obtained by mmap()ing the vcpu fd at offset 0, with the size given by
 KVM_GET_VCPU_MMAP_SIZE.  The parameter block is formatted as a 'struct
 kvm_run' (see below).
 
+
 4.11 KVM_GET_REGS
 
 Capability: basic
@@ -292,6 +310,7 @@ struct kvm_regs {
 	__u64 rip, rflags;
 };
 
+
 4.12 KVM_SET_REGS
 
 Capability: basic
@@ -304,6 +323,7 @@ Writes the general purpose registers into the vcpu.
 
 See KVM_GET_REGS for the data structure.
 
+
 4.13 KVM_GET_SREGS
 
 Capability: basic
@@ -331,6 +351,7 @@ interrupt_bitmap is a bitmap of pending external interrupts.  At most
 one bit may be set.  This interrupt has been acknowledged by the APIC
 but not yet injected into the cpu core.
 
+
 4.14 KVM_SET_SREGS
 
 Capability: basic
@@ -342,6 +363,7 @@ Returns: 0 on success, -1 on error
 Writes special registers into the vcpu.  See KVM_GET_SREGS for the
 data structures.
 
+
 4.15 KVM_TRANSLATE
 
 Capability: basic
@@ -365,6 +387,7 @@ struct kvm_translation {
 	__u8  pad[5];
 };
 
+
 4.16 KVM_INTERRUPT
 
 Capability: basic
@@ -413,6 +436,7 @@ c) KVM_INTERRUPT_SET_LEVEL
 Note that any value for 'irq' other than the ones stated above is invalid
 and incurs unexpected behavior.
 
+
 4.17 KVM_DEBUG_GUEST
 
 Capability: basic
@@ -423,6 +447,7 @@ Returns: -1 on error
 
 Support for this has been removed.  Use KVM_SET_GUEST_DEBUG instead.
 
+
 4.18 KVM_GET_MSRS
 
 Capability: basic
@@ -451,6 +476,7 @@ Application code should set the 'nmsrs' member (which indicates the
 size of the entries array) and the 'index' member of each array entry.
 kvm will fill in the 'data' member.
 
+
 4.19 KVM_SET_MSRS
 
 Capability: basic
@@ -466,6 +492,7 @@ Application code should set the 'nmsrs' member (which indicates the
 size of the entries array), and the 'index' and 'data' members of each
 array entry.
 
+
 4.20 KVM_SET_CPUID
 
 Capability: basic
@@ -494,6 +521,7 @@ struct kvm_cpuid {
 	struct kvm_cpuid_entry entries[0];
 };
 
+
 4.21 KVM_SET_SIGNAL_MASK
 
 Capability: basic
@@ -516,6 +544,7 @@ struct kvm_signal_mask {
 	__u8  sigset[0];
 };
 
+
 4.22 KVM_GET_FPU
 
 Capability: basic
@@ -541,6 +570,7 @@ struct kvm_fpu {
 	__u32 pad2;
 };
 
+
 4.23 KVM_SET_FPU
 
 Capability: basic
@@ -566,6 +596,7 @@ struct kvm_fpu {
 	__u32 pad2;
 };
 
+
 4.24 KVM_CREATE_IRQCHIP
 
 Capability: KVM_CAP_IRQCHIP
@@ -579,6 +610,7 @@ ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to have a
 local APIC.  IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC; GSI 16-23
 only go to the IOAPIC.  On ia64, a IOSAPIC is created.
 
+
 4.25 KVM_IRQ_LINE
 
 Capability: KVM_CAP_IRQCHIP
@@ -600,6 +632,7 @@ struct kvm_irq_level {
 	__u32 level;           /* 0 or 1 */
 };
 
+
 4.26 KVM_GET_IRQCHIP
 
 Capability: KVM_CAP_IRQCHIP
@@ -621,6 +654,7 @@ struct kvm_irqchip {
 	} chip;
 };
 
+
 4.27 KVM_SET_IRQCHIP
 
 Capability: KVM_CAP_IRQCHIP
@@ -642,6 +676,7 @@ struct kvm_irqchip {
 	} chip;
 };
 
+
 4.28 KVM_XEN_HVM_CONFIG
 
 Capability: KVM_CAP_XEN_HVM
@@ -666,6 +701,7 @@ struct kvm_xen_hvm_config {
 	__u8 pad2[30];
 };
 
+
 4.29 KVM_GET_CLOCK
 
 Capability: KVM_CAP_ADJUST_CLOCK
@@ -684,6 +720,7 @@ struct kvm_clock_data {
 	__u32 pad[9];
 };
 
+
 4.30 KVM_SET_CLOCK
 
 Capability: KVM_CAP_ADJUST_CLOCK
@@ -702,6 +739,7 @@ struct kvm_clock_data {
 	__u32 pad[9];
 };
 
+
 4.31 KVM_GET_VCPU_EVENTS
 
 Capability: KVM_CAP_VCPU_EVENTS
@@ -741,6 +779,7 @@ struct kvm_vcpu_events {
 KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that
 interrupt.shadow contains a valid state. Otherwise, this field is undefined.
 
+
 4.32 KVM_SET_VCPU_EVENTS
 
 Capability: KVM_CAP_VCPU_EVENTS
@@ -767,6 +806,7 @@ If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in
 the flags field to signal that interrupt.shadow contains a valid state and
 shall be written into the VCPU.
 
+
 4.33 KVM_GET_DEBUGREGS
 
 Capability: KVM_CAP_DEBUGREGS
@@ -785,6 +825,7 @@ struct kvm_debugregs {
 	__u64 reserved[9];
 };
 
+
 4.34 KVM_SET_DEBUGREGS
 
 Capability: KVM_CAP_DEBUGREGS
@@ -798,6 +839,7 @@ Writes debug registers into the vcpu.
 See KVM_GET_DEBUGREGS for the data structure. The flags field is unused
 yet and must be cleared on entry.
 
+
 4.35 KVM_SET_USER_MEMORY_REGION
 
 Capability: KVM_CAP_USER_MEM
@@ -844,6 +886,7 @@ It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl.
 The KVM_SET_MEMORY_REGION does not allow fine grained control over memory
 allocation and is deprecated.
 
+
 4.36 KVM_SET_TSS_ADDR
 
 Capability: KVM_CAP_SET_TSS_ADDR
@@ -862,6 +905,7 @@ This ioctl is required on Intel-based hosts.  This is needed on Intel hardware
 because of a quirk in the virtualization implementation (see the internals
 documentation when it pops into existence).
 
+
 4.37 KVM_ENABLE_CAP
 
 Capability: KVM_CAP_ENABLE_CAP
@@ -897,6 +941,7 @@ function properly, this is the place to put them.
        __u8  pad[64];
 };
 
+
 4.38 KVM_GET_MP_STATE
 
 Capability: KVM_CAP_MP_STATE
@@ -927,6 +972,7 @@ Possible values are:
 This ioctl is only useful after KVM_CREATE_IRQCHIP.  Without an in-kernel
 irqchip, the multiprocessing state must be maintained by userspace.
 
+
 4.39 KVM_SET_MP_STATE
 
 Capability: KVM_CAP_MP_STATE
@@ -941,6 +987,7 @@ arguments.
 This ioctl is only useful after KVM_CREATE_IRQCHIP.  Without an in-kernel
 irqchip, the multiprocessing state must be maintained by userspace.
 
+
 4.40 KVM_SET_IDENTITY_MAP_ADDR
 
 Capability: KVM_CAP_SET_IDENTITY_MAP_ADDR
@@ -959,6 +1006,7 @@ This ioctl is required on Intel-based hosts.  This is needed on Intel hardware
 because of a quirk in the virtualization implementation (see the internals
 documentation when it pops into existence).
 
+
 4.41 KVM_SET_BOOT_CPU_ID
 
 Capability: KVM_CAP_SET_BOOT_CPU_ID
@@ -971,6 +1019,7 @@ Define which vcpu is the Bootstrap Processor (BSP).  Values are the same
 as the vcpu id in KVM_CREATE_VCPU.  If this ioctl is not called, the default
 is vcpu 0.
 
+
 4.42 KVM_GET_XSAVE
 
 Capability: KVM_CAP_XSAVE
@@ -985,6 +1034,7 @@ struct kvm_xsave {
 
 This ioctl would copy current vcpu's xsave struct to the userspace.
 
+
 4.43 KVM_SET_XSAVE
 
 Capability: KVM_CAP_XSAVE
@@ -999,6 +1049,7 @@ struct kvm_xsave {
 
 This ioctl would copy userspace's xsave struct to the kernel.
 
+
 4.44 KVM_GET_XCRS
 
 Capability: KVM_CAP_XCRS
@@ -1022,6 +1073,7 @@ struct kvm_xcrs {
 
 This ioctl would copy current vcpu's xcrs to the userspace.
 
+
 4.45 KVM_SET_XCRS
 
 Capability: KVM_CAP_XCRS
@@ -1045,6 +1097,7 @@ struct kvm_xcrs {
 
 This ioctl would set vcpu's xcr to the value userspace specified.
 
+
 4.46 KVM_GET_SUPPORTED_CPUID
 
 Capability: KVM_CAP_EXT_CPUID
@@ -1119,6 +1172,7 @@ support.  Instead it is reported via
 if that returns true and you use KVM_CREATE_IRQCHIP, or if you emulate the
 feature in userspace, then you can enable the feature for KVM_SET_CPUID2.
 
+
 4.47 KVM_PPC_GET_PVINFO
 
 Capability: KVM_CAP_PPC_GET_PVINFO
@@ -1142,6 +1196,7 @@ of 4 instructions that make up a hypercall.
 If any additional field gets added to this structure later on, a bit for that
 additional piece of information will be set in the flags bitmap.
 
+
 4.48 KVM_ASSIGN_PCI_DEVICE
 
 Capability: KVM_CAP_DEVICE_ASSIGNMENT
@@ -1185,6 +1240,7 @@ Only PCI header type 0 devices with PCI BAR resources are supported by
 device assignment.  The user requesting this ioctl must have read/write
 access to the PCI sysfs resource files associated with the device.
 
+
 4.49 KVM_DEASSIGN_PCI_DEVICE
 
 Capability: KVM_CAP_DEVICE_DEASSIGNMENT
@@ -1198,6 +1254,7 @@ Ends PCI device assignment, releasing all associated resources.
 See KVM_CAP_DEVICE_ASSIGNMENT for the data structure. Only assigned_dev_id is
 used in kvm_assigned_pci_dev to identify the device.
 
+
 4.50 KVM_ASSIGN_DEV_IRQ
 
 Capability: KVM_CAP_ASSIGN_DEV_IRQ
@@ -1231,6 +1288,7 @@ The following flags are defined:
 It is not valid to specify multiple types per host or guest IRQ. However, the
 IRQ type of host and guest can differ or can even be null.
 
+
 4.51 KVM_DEASSIGN_DEV_IRQ
 
 Capability: KVM_CAP_ASSIGN_DEV_IRQ
@@ -1245,6 +1303,7 @@ See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified
 by assigned_dev_id, flags must correspond to the IRQ type specified on
 KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or guest IRQ is allowed.
 
+
 4.52 KVM_SET_GSI_ROUTING
 
 Capability: KVM_CAP_IRQ_ROUTING
@@ -1293,6 +1352,7 @@ struct kvm_irq_routing_msi {
 	__u32 pad;
 };
 
+
 4.53 KVM_ASSIGN_SET_MSIX_NR
 
 Capability: KVM_CAP_DEVICE_MSIX
@@ -1314,6 +1374,7 @@ struct kvm_assigned_msix_nr {
 
 #define KVM_MAX_MSIX_PER_DEV		256
 
+
 4.54 KVM_ASSIGN_SET_MSIX_ENTRY
 
 Capability: KVM_CAP_DEVICE_MSIX
@@ -1332,7 +1393,8 @@ struct kvm_assigned_msix_entry {
 	__u16 padding[3];
 };
 
-4.54 KVM_SET_TSC_KHZ
+
+4.55 KVM_SET_TSC_KHZ
 
 Capability: KVM_CAP_TSC_CONTROL
 Architectures: x86
@@ -1343,7 +1405,8 @@ Returns: 0 on success, -1 on error
 Specifies the tsc frequency for the virtual machine. The unit of the
 frequency is KHz.
 
-4.55 KVM_GET_TSC_KHZ
+
+4.56 KVM_GET_TSC_KHZ
 
 Capability: KVM_CAP_GET_TSC_KHZ
 Architectures: x86
@@ -1355,7 +1418,8 @@ Returns the tsc frequency of the guest. The unit of the return value is
 KHz. If the host has unstable tsc this ioctl returns -EIO instead as an
 error.
 
-4.56 KVM_GET_LAPIC
+
+4.57 KVM_GET_LAPIC
 
 Capability: KVM_CAP_IRQCHIP
 Architectures: x86
@@ -1371,7 +1435,8 @@ struct kvm_lapic_state {
 Reads the Local APIC registers and copies them into the input argument.  The
 data format and layout are the same as documented in the architecture manual.
 
-4.57 KVM_SET_LAPIC
+
+4.58 KVM_SET_LAPIC
 
 Capability: KVM_CAP_IRQCHIP
 Architectures: x86
@@ -1387,7 +1452,8 @@ struct kvm_lapic_state {
 Copies the input argument into the the Local APIC registers.  The data format
 and layout are the same as documented in the architecture manual.
 
-4.58 KVM_IOEVENTFD
+
+4.59 KVM_IOEVENTFD
 
 Capability: KVM_CAP_IOEVENTFD
 Architectures: all
@@ -1417,7 +1483,8 @@ The following flags are defined:
 If datamatch flag is set, the event will be signaled only if the written value
 to the registered address is equal to datamatch in struct kvm_ioeventfd.
 
-4.59 KVM_DIRTY_TLB
+
+4.60 KVM_DIRTY_TLB
 
 Capability: KVM_CAP_SW_TLB
 Architectures: ppc
@@ -1449,7 +1516,8 @@ The "num_dirty" field is a performance hint for KVM to determine whether it
 should skip processing the bitmap and just invalidate everything.  It must
 be set to the number of set bits in the bitmap.
 
-4.60 KVM_ASSIGN_SET_INTX_MASK
+
+4.61 KVM_ASSIGN_SET_INTX_MASK
 
 Capability: KVM_CAP_PCI_2_3
 Architectures: x86
@@ -1482,6 +1550,7 @@ See KVM_ASSIGN_DEV_IRQ for the data structure.  The target device is specified
 by assigned_dev_id.  In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is
 evaluated.
 
+
 4.62 KVM_CREATE_SPAPR_TCE
 
 Capability: KVM_CAP_SPAPR_TCE
@@ -1517,6 +1586,7 @@ the entries written by kernel-handled H_PUT_TCE calls, and also lets
 userspace update the TCE table directly which is useful in some
 circumstances.
 
+
 4.63 KVM_ALLOCATE_RMA
 
 Capability: KVM_CAP_PPC_RMA
@@ -1549,6 +1619,7 @@ is supported; 2 if the processor requires all virtual machines to have
 an RMA, or 1 if the processor can use an RMA but doesn't require it,
 because it supports the Virtual RMA (VRMA) facility.
 
+
 4.64 KVM_NMI
 
 Capability: KVM_CAP_USER_NMI
@@ -1574,6 +1645,7 @@ following algorithm:
 Some guests configure the LINT1 NMI input to cause a panic, aiding in
 debugging.
 
+
 4.65 KVM_S390_UCAS_MAP
 
 Capability: KVM_CAP_S390_UCONTROL
@@ -1593,6 +1665,7 @@ This ioctl maps the memory at "user_addr" with the length "length" to
 the vcpu's address space starting at "vcpu_addr". All parameters need to
 be alligned by 1 megabyte.
 
+
 4.66 KVM_S390_UCAS_UNMAP
 
 Capability: KVM_CAP_S390_UCONTROL
@@ -1612,6 +1685,7 @@ This ioctl unmaps the memory in the vcpu's address space starting at
 "vcpu_addr" with the length "length". The field "user_addr" is ignored.
 All parameters need to be alligned by 1 megabyte.
 
+
 4.67 KVM_S390_VCPU_FAULT
 
 Capability: KVM_CAP_S390_UCONTROL
@@ -1628,6 +1702,7 @@ table upfront. This is useful to handle validity intercepts for user
 controlled virtual machines to fault in the virtual cpu's lowcore pages
 prior to calling the KVM_RUN ioctl.
 
+
 4.68 KVM_SET_ONE_REG
 
 Capability: KVM_CAP_ONE_REG
@@ -1653,6 +1728,7 @@ registers, find a list below:
         |                       |
   PPC   | KVM_REG_PPC_HIOR      | 64
 
+
 4.69 KVM_GET_ONE_REG
 
 Capability: KVM_CAP_ONE_REG
@@ -1669,6 +1745,7 @@ at the memory location pointed to by "addr".
 The list of registers accessible using this interface is identical to the
 list in 4.64.
 
+
 4.70 KVM_KVMCLOCK_CTRL
 
 Capability: KVM_CAP_KVMCLOCK_CTRL
@@ -1689,6 +1766,7 @@ where the guest will clear the flag: when the soft lockup watchdog timer resets
 itself or when a soft lockup is detected.  This ioctl can be called any time
 after pausing the vcpu, but before it is resumed.
 
+
 4.71 KVM_SIGNAL_MSI
 
 Capability: KVM_CAP_SIGNAL_MSI
@@ -1710,7 +1788,9 @@ struct kvm_msi {
 
 No flags are defined so far. The corresponding field must be 0.
 
+
 5. The kvm_run structure
+------------------------
 
 Application code obtains a pointer to the kvm_run structure by
 mmap()ing a vcpu fd.  From that point, application code can control
@@ -1951,7 +2031,9 @@ and usually define the validity of a groups of registers. (e.g. one bit
 
 };
 
+
 6. Capabilities that can be enabled
+-----------------------------------
 
 There are certain capabilities that change the behavior of the virtual CPU when
 enabled. To enable them, please see section 4.37. Below you can find a list of
@@ -1967,6 +2049,7 @@ The following information is provided along with the description:
   Returns: the return value.  General error numbers (EBADF, ENOMEM, EINVAL)
       are not detailed, but errors with specific meanings are.
 
+
 6.1 KVM_CAP_PPC_OSI
 
 Architectures: ppc
@@ -1980,6 +2063,7 @@ between the guest and the host.
 
 When this capability is enabled, KVM_EXIT_OSI can occur.
 
+
 6.2 KVM_CAP_PPC_PAPR
 
 Architectures: ppc
@@ -1998,6 +2082,7 @@ HTAB invisible to the guest.
 
 When this capability is enabled, KVM_EXIT_PAPR_HCALL can occur.
 
+
 6.3 KVM_CAP_SW_TLB
 
 Architectures: ppc
-- 
cgit v0.10.2


From 0589ff6c11d8128cf053c3ddc75b0f6d8b71c62b Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 24 Apr 2012 16:40:16 +0200
Subject: KVM: x86: Document in-kernel PIT API

Add descriptions for KVM_CREATE_PIT2 and KVM_GET/SET_PIT2.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index b48f927..a7cb93c 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1789,6 +1789,69 @@ struct kvm_msi {
 No flags are defined so far. The corresponding field must be 0.
 
 
+4.71 KVM_CREATE_PIT2
+
+Capability: KVM_CAP_PIT2
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_pit_config (in)
+Returns: 0 on success, -1 on error
+
+Creates an in-kernel device model for the i8254 PIT. This call is only valid
+after enabling in-kernel irqchip support via KVM_CREATE_IRQCHIP. The following
+parameters have to be passed:
+
+struct kvm_pit_config {
+	__u32 flags;
+	__u32 pad[15];
+};
+
+Valid flags are:
+
+#define KVM_PIT_SPEAKER_DUMMY     1 /* emulate speaker port stub */
+
+This IOCTL replaces the obsolete KVM_CREATE_PIT.
+
+
+4.72 KVM_GET_PIT2
+
+Capability: KVM_CAP_PIT_STATE2
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_pit_state2 (out)
+Returns: 0 on success, -1 on error
+
+Retrieves the state of the in-kernel PIT model. Only valid after
+KVM_CREATE_PIT2. The state is returned in the following structure:
+
+struct kvm_pit_state2 {
+	struct kvm_pit_channel_state channels[3];
+	__u32 flags;
+	__u32 reserved[9];
+};
+
+Valid flags are:
+
+/* disable PIT in HPET legacy mode */
+#define KVM_PIT_FLAGS_HPET_LEGACY  0x00000001
+
+This IOCTL replaces the obsolete KVM_GET_PIT.
+
+
+4.73 KVM_SET_PIT2
+
+Capability: KVM_CAP_PIT_STATE2
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_pit_state2 (in)
+Returns: 0 on success, -1 on error
+
+Sets the state of the in-kernel PIT model. Only valid after KVM_CREATE_PIT2.
+See KVM_GET_PIT2 for details on struct kvm_pit_state2.
+
+This IOCTL replaces the obsolete KVM_SET_PIT.
+
+
 5. The kvm_run structure
 ------------------------
 
-- 
cgit v0.10.2


From b6ddf05ff68d81a7c1736717faf492b70e9bf4f9 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 24 Apr 2012 16:40:17 +0200
Subject: KVM: x86: Run PIT work in own kthread

We can't run PIT IRQ injection work in the interrupt context of the host
timer. This would allow the user to influence the handler complexity by
asking for a broadcast to a large number of VCPUs. Therefore, this work
was pushed into workqueue context in 9d244caf2e. However, this prevents
prioritizing the PIT injection over other task as workqueues share
kernel threads.

This replaces the workqueue with a kthread worker and gives that thread
a name in the format "kvm-pit/<owner-process-pid>". That allows to
identify and adjust the kthread priority according to the VM process
parameters.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index a7cb93c..eb62761 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1810,6 +1810,14 @@ Valid flags are:
 
 #define KVM_PIT_SPEAKER_DUMMY     1 /* emulate speaker port stub */
 
+PIT timer interrupts may use a per-VM kernel thread for injection. If it
+exists, this thread will have a name of the following pattern:
+
+kvm-pit/<owner-process-pid>
+
+When running a guest with elevated priorities, the scheduling parameters of
+this thread may have to be adjusted accordingly.
+
 This IOCTL replaces the obsolete KVM_CREATE_PIT.
 
 
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index d68f99d..adba28f 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -34,7 +34,6 @@
 
 #include <linux/kvm_host.h>
 #include <linux/slab.h>
-#include <linux/workqueue.h>
 
 #include "irq.h"
 #include "i8254.h"
@@ -249,7 +248,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
 		/* in this case, we had multiple outstanding pit interrupts
 		 * that we needed to inject.  Reinject
 		 */
-		queue_work(ps->pit->wq, &ps->pit->expired);
+		queue_kthread_work(&ps->pit->worker, &ps->pit->expired);
 	ps->irq_ack = 1;
 	spin_unlock(&ps->inject_lock);
 }
@@ -270,7 +269,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
 static void destroy_pit_timer(struct kvm_pit *pit)
 {
 	hrtimer_cancel(&pit->pit_state.pit_timer.timer);
-	cancel_work_sync(&pit->expired);
+	flush_kthread_work(&pit->expired);
 }
 
 static bool kpit_is_periodic(struct kvm_timer *ktimer)
@@ -284,7 +283,7 @@ static struct kvm_timer_ops kpit_ops = {
 	.is_periodic = kpit_is_periodic,
 };
 
-static void pit_do_work(struct work_struct *work)
+static void pit_do_work(struct kthread_work *work)
 {
 	struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
 	struct kvm *kvm = pit->kvm;
@@ -328,7 +327,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
 
 	if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
 		atomic_inc(&ktimer->pending);
-		queue_work(pt->wq, &pt->expired);
+		queue_kthread_work(&pt->worker, &pt->expired);
 	}
 
 	if (ktimer->t_ops->is_periodic(ktimer)) {
@@ -353,7 +352,7 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
 
 	/* TODO The new value only affected after the retriggered */
 	hrtimer_cancel(&pt->timer);
-	cancel_work_sync(&ps->pit->expired);
+	flush_kthread_work(&ps->pit->expired);
 	pt->period = interval;
 	ps->is_periodic = is_period;
 
@@ -669,6 +668,8 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 {
 	struct kvm_pit *pit;
 	struct kvm_kpit_state *pit_state;
+	struct pid *pid;
+	pid_t pid_nr;
 	int ret;
 
 	pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
@@ -685,14 +686,20 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 	mutex_lock(&pit->pit_state.lock);
 	spin_lock_init(&pit->pit_state.inject_lock);
 
-	pit->wq = create_singlethread_workqueue("kvm-pit-wq");
-	if (!pit->wq) {
+	pid = get_pid(task_tgid(current));
+	pid_nr = pid_vnr(pid);
+	put_pid(pid);
+
+	init_kthread_worker(&pit->worker);
+	pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker,
+				       "kvm-pit/%d", pid_nr);
+	if (IS_ERR(pit->worker_task)) {
 		mutex_unlock(&pit->pit_state.lock);
 		kvm_free_irq_source_id(kvm, pit->irq_source_id);
 		kfree(pit);
 		return NULL;
 	}
-	INIT_WORK(&pit->expired, pit_do_work);
+	init_kthread_work(&pit->expired, pit_do_work);
 
 	kvm->arch.vpit = pit;
 	pit->kvm = kvm;
@@ -736,7 +743,7 @@ fail:
 	kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
 	kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
 	kvm_free_irq_source_id(kvm, pit->irq_source_id);
-	destroy_workqueue(pit->wq);
+	kthread_stop(pit->worker_task);
 	kfree(pit);
 	return NULL;
 }
@@ -756,10 +763,10 @@ void kvm_free_pit(struct kvm *kvm)
 		mutex_lock(&kvm->arch.vpit->pit_state.lock);
 		timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
 		hrtimer_cancel(timer);
-		cancel_work_sync(&kvm->arch.vpit->expired);
+		flush_kthread_work(&kvm->arch.vpit->expired);
+		kthread_stop(kvm->arch.vpit->worker_task);
 		kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
 		mutex_unlock(&kvm->arch.vpit->pit_state.lock);
-		destroy_workqueue(kvm->arch.vpit->wq);
 		kfree(kvm->arch.vpit);
 	}
 }
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 51a9742..fdf4042 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -1,6 +1,8 @@
 #ifndef __I8254_H
 #define __I8254_H
 
+#include <linux/kthread.h>
+
 #include "iodev.h"
 
 struct kvm_kpit_channel_state {
@@ -39,8 +41,9 @@ struct kvm_pit {
 	struct kvm_kpit_state pit_state;
 	int irq_source_id;
 	struct kvm_irq_mask_notifier mask_notifier;
-	struct workqueue_struct *wq;
-	struct work_struct expired;
+	struct kthread_worker worker;
+	struct task_struct *worker_task;
+	struct kthread_work expired;
 };
 
 #define KVM_PIT_BASE_ADDRESS	    0x40
-- 
cgit v0.10.2


From 41628d334361670d825fb03c04568f5ef9f084dc Mon Sep 17 00:00:00 2001
From: Konstantin Weitz <WEITZKON@de.ibm.com>
Date: Wed, 25 Apr 2012 15:30:38 +0200
Subject: KVM: s390: Implement the directed yield (diag 9c) hypervisor call for
 KVM

This patch implements the directed yield hypercall found on other
System z hypervisors. It delegates execution time to the virtual cpu
specified in the instruction's parameter.

Useful to avoid long spinlock waits in the guest.

Christian Borntraeger: moved common code in virt/kvm/

Signed-off-by: Konstantin Weitz <WEITZKON@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 7343872..dd17537 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -148,6 +148,7 @@ struct kvm_vcpu_stat {
 	u32 instruction_sigp_restart;
 	u32 diagnose_10;
 	u32 diagnose_44;
+	u32 diagnose_9c;
 };
 
 struct kvm_s390_io_info {
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index a353f0e..2d2ae32 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -53,6 +53,29 @@ static int __diag_time_slice_end(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_vcpu *tcpu;
+	int tid;
+	int i;
+
+	tid = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4];
+	vcpu->stat.diagnose_9c++;
+	VCPU_EVENT(vcpu, 5, "diag time slice end directed to %d", tid);
+
+	if (tid == vcpu->vcpu_id)
+		return 0;
+
+	kvm_for_each_vcpu(i, tcpu, kvm)
+		if (tcpu->vcpu_id == tid) {
+			kvm_vcpu_yield_to(tcpu);
+			break;
+		}
+
+	return 0;
+}
+
 static int __diag_ipl_functions(struct kvm_vcpu *vcpu)
 {
 	unsigned int reg = vcpu->arch.sie_block->ipa & 0xf;
@@ -89,6 +112,8 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu)
 		return diag_release_pages(vcpu);
 	case 0x44:
 		return __diag_time_slice_end(vcpu);
+	case 0x9c:
+		return __diag_time_slice_end_directed(vcpu);
 	case 0x308:
 		return __diag_ipl_functions(vcpu);
 	default:
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d30c835..fd98914 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -74,6 +74,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "instruction_sigp_restart", VCPU_STAT(instruction_sigp_restart) },
 	{ "diagnose_10", VCPU_STAT(diagnose_10) },
 	{ "diagnose_44", VCPU_STAT(diagnose_44) },
+	{ "diagnose_9c", VCPU_STAT(diagnose_9c) },
 	{ NULL }
 };
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6f34330..cae342d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -461,6 +461,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
 
 void kvm_vcpu_block(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
+bool kvm_vcpu_yield_to(struct kvm_vcpu *target);
 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
 void kvm_resched(struct kvm_vcpu *vcpu);
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1847c76..7e14068 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1543,6 +1543,31 @@ void kvm_resched(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_resched);
 
+bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
+{
+	struct pid *pid;
+	struct task_struct *task = NULL;
+
+	rcu_read_lock();
+	pid = rcu_dereference(target->pid);
+	if (pid)
+		task = get_pid_task(target->pid, PIDTYPE_PID);
+	rcu_read_unlock();
+	if (!task)
+		return false;
+	if (task->flags & PF_VCPU) {
+		put_task_struct(task);
+		return false;
+	}
+	if (yield_to(task, 1)) {
+		put_task_struct(task);
+		return true;
+	}
+	put_task_struct(task);
+	return false;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
+
 void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 {
 	struct kvm *kvm = me->kvm;
@@ -1561,8 +1586,6 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 	 */
 	for (pass = 0; pass < 2 && !yielded; pass++) {
 		kvm_for_each_vcpu(i, vcpu, kvm) {
-			struct task_struct *task = NULL;
-			struct pid *pid;
 			if (!pass && i < last_boosted_vcpu) {
 				i = last_boosted_vcpu;
 				continue;
@@ -1572,24 +1595,11 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 				continue;
 			if (waitqueue_active(&vcpu->wq))
 				continue;
-			rcu_read_lock();
-			pid = rcu_dereference(vcpu->pid);
-			if (pid)
-				task = get_pid_task(vcpu->pid, PIDTYPE_PID);
-			rcu_read_unlock();
-			if (!task)
-				continue;
-			if (task->flags & PF_VCPU) {
-				put_task_struct(task);
-				continue;
-			}
-			if (yield_to(task, 1)) {
-				put_task_struct(task);
+			if (kvm_vcpu_yield_to(vcpu)) {
 				kvm->last_boosted_vcpu = i;
 				yielded = 1;
 				break;
 			}
-			put_task_struct(task);
 		}
 	}
 }
-- 
cgit v0.10.2


From 8733ac36fc37fe055a7d7daadf5451b4231f0214 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Wed, 25 Apr 2012 15:30:39 +0200
Subject: KVM: s390: use kvm_vcpu_on_spin for diag 0x44

Lets replace the old open coded version of diag 0x44 (which relied on
compat_sched_yield) with kvm_vcpu_on_spin.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 2d2ae32..b23d9ac 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -47,9 +47,7 @@ static int __diag_time_slice_end(struct kvm_vcpu *vcpu)
 {
 	VCPU_EVENT(vcpu, 5, "%s", "diag time slice end");
 	vcpu->stat.diagnose_44++;
-	vcpu_put(vcpu);
-	yield();
-	vcpu_load(vcpu);
+	kvm_vcpu_on_spin(vcpu);
 	return 0;
 }
 
-- 
cgit v0.10.2


From 8c3f61e2ddb6c1845704fd8ef1b999ecd2e4725c Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cornelia.huck@de.ibm.com>
Date: Tue, 24 Apr 2012 09:24:44 +0200
Subject: KVM: s390: Handle sckpf instruction

Handle the mandatory intercept SET CLOCK PROGRAMMABLE FIELD
instruction.

Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 3614565..979cbe5 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -101,6 +101,7 @@ static int handle_lctl(struct kvm_vcpu *vcpu)
 }
 
 static intercept_handler_t instruction_handlers[256] = {
+	[0x01] = kvm_s390_handle_01,
 	[0x83] = kvm_s390_handle_diag,
 	[0xae] = kvm_s390_handle_sigp,
 	[0xb2] = kvm_s390_handle_b2,
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index ff28f9d..2294377 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -79,6 +79,7 @@ int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action);
 /* implemented in priv.c */
 int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_e5(struct kvm_vcpu *vcpu);
+int kvm_s390_handle_01(struct kvm_vcpu *vcpu);
 
 /* implemented in sigp.c */
 int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index e5a45db..68a6b2e 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -380,3 +380,34 @@ int kvm_s390_handle_e5(struct kvm_vcpu *vcpu)
 	return -EOPNOTSUPP;
 }
 
+static int handle_sckpf(struct kvm_vcpu *vcpu)
+{
+	u32 value;
+
+	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+		return kvm_s390_inject_program_int(vcpu,
+						   PGM_PRIVILEGED_OPERATION);
+
+	if (vcpu->run->s.regs.gprs[0] & 0x00000000ffff0000)
+		return kvm_s390_inject_program_int(vcpu,
+						   PGM_SPECIFICATION);
+
+	value = vcpu->run->s.regs.gprs[0] & 0x000000000000ffff;
+	vcpu->arch.sie_block->todpr = value;
+
+	return 0;
+}
+
+static intercept_handler_t x01_handlers[256] = {
+	[0x07] = handle_sckpf,
+};
+
+int kvm_s390_handle_01(struct kvm_vcpu *vcpu)
+{
+	intercept_handler_t handler;
+
+	handler = x01_handlers[vcpu->arch.sie_block->ipa & 0x00ff];
+	if (handler)
+		return handler(vcpu);
+	return -EOPNOTSUPP;
+}
-- 
cgit v0.10.2


From e726b1bd64b0b8945c171d2d4bf749fba9fc0800 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Wed, 2 May 2012 10:50:38 +0200
Subject: KVM: s390: implement KVM_CAP_NR/MAX_VCPUS

Let userspace know the number of max and supported cpus for kvm on s390.
Return KVM_MAX_VCPUS (currently 64) for both values.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index fd98914..e5e3800 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -136,6 +136,10 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_SYNC_REGS:
 		r = 1;
 		break;
+	case KVM_CAP_NR_VCPUS:
+	case KVM_CAP_MAX_VCPUS:
+		r = KVM_MAX_VCPUS;
+		break;
 	default:
 		r = 0;
 	}
-- 
cgit v0.10.2


From 57c22e5f35aa4b9b2fe11f73f3e62bbf9ef36190 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Wed, 2 May 2012 17:55:56 +0300
Subject: KVM: fix cpuid eax for KVM leaf

cpuid eax should return the max leaf so that
guests can find out the valid range.
This matches Xen et al.
Update documentation to match.

Tested with -cpu host.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt
index 8820685..83afe65 100644
--- a/Documentation/virtual/kvm/cpuid.txt
+++ b/Documentation/virtual/kvm/cpuid.txt
@@ -10,11 +10,15 @@ a guest.
 KVM cpuid functions are:
 
 function: KVM_CPUID_SIGNATURE (0x40000000)
-returns : eax = 0,
+returns : eax = 0x40000001,
           ebx = 0x4b4d564b,
           ecx = 0x564b4d56,
           edx = 0x4d.
 Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM".
+The value in eax corresponds to the maximum cpuid function present in this leaf,
+and will be updated if more functions are added in the future.
+Note also that old hosts set eax value to 0x0. This should
+be interpreted as if the value was 0x40000001.
 This function queries the presence of KVM cpuid leafs.
 
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c2134b8..7df1c6d 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -398,7 +398,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	case KVM_CPUID_SIGNATURE: {
 		char signature[12] = "KVMKVMKVM\0\0";
 		u32 *sigptr = (u32 *)signature;
-		entry->eax = 0;
+		entry->eax = KVM_CPUID_FEATURES;
 		entry->ebx = sigptr[0];
 		entry->ecx = sigptr[1];
 		entry->edx = sigptr[2];
-- 
cgit v0.10.2


From 9b72d3b07dd99ac8ab2b84de5004a295af460536 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 30 Apr 2012 14:45:49 +0300
Subject: KVM guest: make kvm_para_available() check hypervisor bit reading
 cpuid leaf

This cpuid range does not exist on real HW and Intel spec says that
"Information returned for highest basic information leaf" will be
returned. Not very well defined.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 99c4bbe..a7a7a94 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -178,14 +178,16 @@ static inline int kvm_para_available(void)
 	unsigned int eax, ebx, ecx, edx;
 	char signature[13];
 
-	cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
-	memcpy(signature + 0, &ebx, 4);
-	memcpy(signature + 4, &ecx, 4);
-	memcpy(signature + 8, &edx, 4);
-	signature[12] = 0;
-
-	if (strcmp(signature, "KVMKVMKVM") == 0)
-		return 1;
+	if (cpu_has_hypervisor) {
+		cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
+		memcpy(signature + 0, &ebx, 4);
+		memcpy(signature + 4, &ecx, 4);
+		memcpy(signature + 8, &edx, 4);
+		signature[12] = 0;
+
+		if (strcmp(signature, "KVMKVMKVM") == 0)
+			return 1;
+	}
 
 	return 0;
 }
-- 
cgit v0.10.2


From 1c2545be05f436523cabc54087c6a60ea10110d3 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Mon, 30 Apr 2012 17:46:31 +0900
Subject: KVM: x86 emulator: Move ModRM flags for groups to top level opcode
 tables

Needed for the following patch which simplifies ModRM fetching code.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0d151e2..8d2c3d0 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3359,8 +3359,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 		      .check_perm = (_p) }
 #define N    D(0)
 #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
-#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
-#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) }
+#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
+#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
 #define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
 #define II(_f, _e, _i) \
 	{ .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
@@ -3380,25 +3380,25 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 		I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
 
 static struct opcode group7_rm1[] = {
-	DI(SrcNone | ModRM | Priv, monitor),
-	DI(SrcNone | ModRM | Priv, mwait),
+	DI(SrcNone | Priv, monitor),
+	DI(SrcNone | Priv, mwait),
 	N, N, N, N, N, N,
 };
 
 static struct opcode group7_rm3[] = {
-	DIP(SrcNone | ModRM | Prot | Priv, vmrun,   check_svme_pa),
-	II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall),
-	DIP(SrcNone | ModRM | Prot | Priv, vmload,  check_svme_pa),
-	DIP(SrcNone | ModRM | Prot | Priv, vmsave,  check_svme_pa),
-	DIP(SrcNone | ModRM | Prot | Priv, stgi,    check_svme),
-	DIP(SrcNone | ModRM | Prot | Priv, clgi,    check_svme),
-	DIP(SrcNone | ModRM | Prot | Priv, skinit,  check_svme),
-	DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme),
+	DIP(SrcNone | Prot | Priv,		vmrun,		check_svme_pa),
+	II(SrcNone  | Prot | VendorSpecific,	em_vmmcall,	vmmcall),
+	DIP(SrcNone | Prot | Priv,		vmload,		check_svme_pa),
+	DIP(SrcNone | Prot | Priv,		vmsave,		check_svme_pa),
+	DIP(SrcNone | Prot | Priv,		stgi,		check_svme),
+	DIP(SrcNone | Prot | Priv,		clgi,		check_svme),
+	DIP(SrcNone | Prot | Priv,		skinit,		check_svme),
+	DIP(SrcNone | Prot | Priv,		invlpga,	check_svme),
 };
 
 static struct opcode group7_rm7[] = {
 	N,
-	DIP(SrcNone | ModRM, rdtscp, check_rdtsc),
+	DIP(SrcNone, rdtscp, check_rdtsc),
 	N, N, N, N, N, N,
 };
 
@@ -3414,76 +3414,77 @@ static struct opcode group1[] = {
 };
 
 static struct opcode group1A[] = {
-	I(DstMem | SrcNone | ModRM | Mov | Stack, em_pop), N, N, N, N, N, N, N,
+	I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
 };
 
 static struct opcode group3[] = {
-	I(DstMem | SrcImm | ModRM, em_test),
-	I(DstMem | SrcImm | ModRM, em_test),
-	I(DstMem | SrcNone | ModRM | Lock, em_not),
-	I(DstMem | SrcNone | ModRM | Lock, em_neg),
-	I(SrcMem | ModRM, em_mul_ex),
-	I(SrcMem | ModRM, em_imul_ex),
-	I(SrcMem | ModRM, em_div_ex),
-	I(SrcMem | ModRM, em_idiv_ex),
+	I(DstMem | SrcImm, em_test),
+	I(DstMem | SrcImm, em_test),
+	I(DstMem | SrcNone | Lock, em_not),
+	I(DstMem | SrcNone | Lock, em_neg),
+	I(SrcMem, em_mul_ex),
+	I(SrcMem, em_imul_ex),
+	I(SrcMem, em_div_ex),
+	I(SrcMem, em_idiv_ex),
 };
 
 static struct opcode group4[] = {
-	I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45),
-	I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45),
+	I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
+	I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
 	N, N, N, N, N, N,
 };
 
 static struct opcode group5[] = {
-	I(DstMem | SrcNone | ModRM | Lock, em_grp45),
-	I(DstMem | SrcNone | ModRM | Lock, em_grp45),
-	I(SrcMem | ModRM | Stack, em_grp45),
-	I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far),
-	I(SrcMem | ModRM | Stack, em_grp45),
-	I(SrcMemFAddr | ModRM | ImplicitOps, em_grp45),
-	I(SrcMem | ModRM | Stack, em_grp45), N,
+	I(DstMem | SrcNone | Lock,		em_grp45),
+	I(DstMem | SrcNone | Lock,		em_grp45),
+	I(SrcMem | Stack,			em_grp45),
+	I(SrcMemFAddr | ImplicitOps | Stack,	em_call_far),
+	I(SrcMem | Stack,			em_grp45),
+	I(SrcMemFAddr | ImplicitOps,		em_grp45),
+	I(SrcMem | Stack,			em_grp45), N,
 };
 
 static struct opcode group6[] = {
-	DI(ModRM | Prot,        sldt),
-	DI(ModRM | Prot,        str),
-	DI(ModRM | Prot | Priv, lldt),
-	DI(ModRM | Prot | Priv, ltr),
+	DI(Prot,	sldt),
+	DI(Prot,	str),
+	DI(Prot | Priv,	lldt),
+	DI(Prot | Priv,	ltr),
 	N, N, N, N,
 };
 
 static struct group_dual group7 = { {
-	DI(ModRM | Mov | DstMem | Priv, sgdt),
-	DI(ModRM | Mov | DstMem | Priv, sidt),
-	II(ModRM | SrcMem | Priv, em_lgdt, lgdt),
-	II(ModRM | SrcMem | Priv, em_lidt, lidt),
-	II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
-	II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw),
-	II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
+	DI(Mov | DstMem | Priv,			sgdt),
+	DI(Mov | DstMem | Priv,			sidt),
+	II(SrcMem | Priv,			em_lgdt, lgdt),
+	II(SrcMem | Priv,			em_lidt, lidt),
+	II(SrcNone | DstMem | Mov,		em_smsw, smsw), N,
+	II(SrcMem16 | Mov | Priv,		em_lmsw, lmsw),
+	II(SrcMem | ByteOp | Priv | NoAccess,	em_invlpg, invlpg),
 }, {
-	I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall),
+	I(SrcNone | Priv | VendorSpecific,	em_vmcall),
 	EXT(0, group7_rm1),
 	N, EXT(0, group7_rm3),
-	II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
-	II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7),
+	II(SrcNone | DstMem | Mov,		em_smsw, smsw), N,
+	II(SrcMem16 | Mov | Priv,		em_lmsw, lmsw),
+	EXT(0, group7_rm7),
 } };
 
 static struct opcode group8[] = {
 	N, N, N, N,
-	I(DstMem | SrcImmByte | ModRM, em_bt),
-	I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_bts),
-	I(DstMem | SrcImmByte | ModRM | Lock, em_btr),
-	I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_btc),
+	I(DstMem | SrcImmByte,				em_bt),
+	I(DstMem | SrcImmByte | Lock | PageTable,	em_bts),
+	I(DstMem | SrcImmByte | Lock,			em_btr),
+	I(DstMem | SrcImmByte | Lock | PageTable,	em_btc),
 };
 
 static struct group_dual group9 = { {
-	N, I(DstMem64 | ModRM | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
+	N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
 }, {
 	N, N, N, N, N, N, N, N,
 } };
 
 static struct opcode group11[] = {
-	I(DstMem | SrcImm | ModRM | Mov | PageTable, em_mov),
+	I(DstMem | SrcImm | Mov | PageTable, em_mov),
 	X7(D(Undefined)),
 };
 
@@ -3541,10 +3542,10 @@ static struct opcode opcode_table[256] = {
 	/* 0x70 - 0x7F */
 	X16(D(SrcImmByte)),
 	/* 0x80 - 0x87 */
-	G(ByteOp | DstMem | SrcImm | ModRM | Group, group1),
-	G(DstMem | SrcImm | ModRM | Group, group1),
-	G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
-	G(DstMem | SrcImmByte | ModRM | Group, group1),
+	G(ByteOp | DstMem | SrcImm, group1),
+	G(DstMem | SrcImm, group1),
+	G(ByteOp | DstMem | SrcImm | No64, group1),
+	G(DstMem | SrcImmByte, group1),
 	I2bv(DstMem | SrcReg | ModRM, em_test),
 	I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
 	/* 0x88 - 0x8F */
-- 
cgit v0.10.2


From 9f4260e73ac43aaa91eb5de95950e1de7002f467 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Mon, 30 Apr 2012 17:48:25 +0900
Subject: KVM: x86 emulator: Avoid pushing back ModRM byte fetched for group
 decoding

Although ModRM byte is fetched for group decoding, it is soon pushed
back to make decode_modrm() fetch it later again.

Now that ModRM flag can be found in the top level opcode tables, fetch
ModRM byte before group decoding to make the code simpler.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8d2c3d0..7fd2576 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -972,7 +972,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 		ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
 	}
 
-	ctxt->modrm = insn_fetch(u8, ctxt);
 	ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
 	ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
 	ctxt->modrm_rm |= (ctxt->modrm & 0x07);
@@ -3976,17 +3975,16 @@ done_prefixes:
 	}
 	ctxt->d = opcode.flags;
 
+	if (ctxt->d & ModRM)
+		ctxt->modrm = insn_fetch(u8, ctxt);
+
 	while (ctxt->d & GroupMask) {
 		switch (ctxt->d & GroupMask) {
 		case Group:
-			ctxt->modrm = insn_fetch(u8, ctxt);
-			--ctxt->_eip;
 			goffset = (ctxt->modrm >> 3) & 7;
 			opcode = opcode.u.group[goffset];
 			break;
 		case GroupDual:
-			ctxt->modrm = insn_fetch(u8, ctxt);
-			--ctxt->_eip;
 			goffset = (ctxt->modrm >> 3) & 7;
 			if ((ctxt->modrm >> 6) == 3)
 				opcode = opcode.u.gdual->mod3[goffset];
-- 
cgit v0.10.2


From cc902ad4f2b7cd3dd2cc268c63f6fb99fb1abf0f Mon Sep 17 00:00:00 2001
From: Bharat Bhushan <r65777@freescale.com>
Date: Thu, 22 Mar 2012 18:39:11 +0000
Subject: KVM: Use minimum and maximum address mapped by TLB1

Keep track of minimum and maximum address mapped by tlb1.
This helps in TLBMISS handling in KVM to quick check whether the address lies in mapped range.
If address does not lies in this range then no need to look in each tlb1 entry of tlb1 array.

Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index 7967f3f..aa8b814 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -89,6 +89,10 @@ struct kvmppc_vcpu_e500 {
 	u64 *g2h_tlb1_map;
 	unsigned int *h2g_tlb1_rmap;
 
+	/* Minimum and maximum address mapped my TLB1 */
+	unsigned long tlb1_min_eaddr;
+	unsigned long tlb1_max_eaddr;
+
 #ifdef CONFIG_KVM_E500V2
 	u32 pid[E500_PID_NUM];
 
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index e05232b..c510fc9 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -261,6 +261,9 @@ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500,
 		set_base = gtlb0_set_base(vcpu_e500, eaddr);
 		size = vcpu_e500->gtlb_params[0].ways;
 	} else {
+		if (eaddr < vcpu_e500->tlb1_min_eaddr ||
+				eaddr > vcpu_e500->tlb1_max_eaddr)
+			return -1;
 		set_base = 0;
 	}
 
@@ -583,6 +586,65 @@ static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	return victim;
 }
 
+static void kvmppc_recalc_tlb1map_range(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	int size = vcpu_e500->gtlb_params[1].entries;
+	unsigned int offset;
+	gva_t eaddr;
+	int i;
+
+	vcpu_e500->tlb1_min_eaddr = ~0UL;
+	vcpu_e500->tlb1_max_eaddr = 0;
+	offset = vcpu_e500->gtlb_offset[1];
+
+	for (i = 0; i < size; i++) {
+		struct kvm_book3e_206_tlb_entry *tlbe =
+			&vcpu_e500->gtlb_arch[offset + i];
+
+		if (!get_tlb_v(tlbe))
+			continue;
+
+		eaddr = get_tlb_eaddr(tlbe);
+		vcpu_e500->tlb1_min_eaddr =
+				min(vcpu_e500->tlb1_min_eaddr, eaddr);
+
+		eaddr = get_tlb_end(tlbe);
+		vcpu_e500->tlb1_max_eaddr =
+				max(vcpu_e500->tlb1_max_eaddr, eaddr);
+	}
+}
+
+static int kvmppc_need_recalc_tlb1map_range(struct kvmppc_vcpu_e500 *vcpu_e500,
+				struct kvm_book3e_206_tlb_entry *gtlbe)
+{
+	unsigned long start, end, size;
+
+	size = get_tlb_bytes(gtlbe);
+	start = get_tlb_eaddr(gtlbe) & ~(size - 1);
+	end = start + size - 1;
+
+	return vcpu_e500->tlb1_min_eaddr == start ||
+			vcpu_e500->tlb1_max_eaddr == end;
+}
+
+/* This function is supposed to be called for a adding a new valid tlb entry */
+static void kvmppc_set_tlb1map_range(struct kvm_vcpu *vcpu,
+				struct kvm_book3e_206_tlb_entry *gtlbe)
+{
+	unsigned long start, end, size;
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	if (!get_tlb_v(gtlbe))
+		return;
+
+	size = get_tlb_bytes(gtlbe);
+	start = get_tlb_eaddr(gtlbe) & ~(size - 1);
+	end = start + size - 1;
+
+	vcpu_e500->tlb1_min_eaddr = min(vcpu_e500->tlb1_min_eaddr, start);
+	vcpu_e500->tlb1_max_eaddr = max(vcpu_e500->tlb1_max_eaddr, end);
+}
+
 static inline int kvmppc_e500_gtlbe_invalidate(
 				struct kvmppc_vcpu_e500 *vcpu_e500,
 				int tlbsel, int esel)
@@ -593,6 +655,9 @@ static inline int kvmppc_e500_gtlbe_invalidate(
 	if (unlikely(get_tlb_iprot(gtlbe)))
 		return -1;
 
+	if (tlbsel == 1 && kvmppc_need_recalc_tlb1map_range(vcpu_e500, gtlbe))
+		kvmppc_recalc_tlb1map_range(vcpu_e500);
+
 	gtlbe->mas1 = 0;
 
 	return 0;
@@ -792,14 +857,19 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 	struct kvm_book3e_206_tlb_entry *gtlbe, stlbe;
 	int tlbsel, esel, stlbsel, sesel;
+	int recal = 0;
 
 	tlbsel = get_tlb_tlbsel(vcpu);
 	esel = get_tlb_esel(vcpu, tlbsel);
 
 	gtlbe = get_entry(vcpu_e500, tlbsel, esel);
 
-	if (get_tlb_v(gtlbe))
+	if (get_tlb_v(gtlbe)) {
 		inval_gtlbe_on_host(vcpu_e500, tlbsel, esel);
+		if ((tlbsel == 1) &&
+			kvmppc_need_recalc_tlb1map_range(vcpu_e500, gtlbe))
+			recal = 1;
+	}
 
 	gtlbe->mas1 = vcpu->arch.shared->mas1;
 	gtlbe->mas2 = vcpu->arch.shared->mas2;
@@ -808,6 +878,18 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 	trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1,
 	                              gtlbe->mas2, gtlbe->mas7_3);
 
+	if (tlbsel == 1) {
+		/*
+		 * If a valid tlb1 entry is overwritten then recalculate the
+		 * min/max TLB1 map address range otherwise no need to look
+		 * in tlb1 array.
+		 */
+		if (recal)
+			kvmppc_recalc_tlb1map_range(vcpu_e500);
+		else
+			kvmppc_set_tlb1map_range(vcpu, gtlbe);
+	}
+
 	/* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
 	if (tlbe_is_host_safe(vcpu, gtlbe)) {
 		u64 eaddr;
@@ -1145,6 +1227,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 	vcpu_e500->gtlb_params[1].ways = params.tlb_sizes[1];
 	vcpu_e500->gtlb_params[1].sets = 1;
 
+	kvmppc_recalc_tlb1map_range(vcpu_e500);
 	return 0;
 
 err_put_page:
@@ -1163,7 +1246,7 @@ int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
 			     struct kvm_dirty_tlb *dirty)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-
+	kvmppc_recalc_tlb1map_range(vcpu_e500);
 	clear_tlb_refs(vcpu_e500);
 	return 0;
 }
@@ -1272,6 +1355,7 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 	vcpu->arch.tlbcfg[1] |=
 		vcpu_e500->gtlb_params[1].ways << TLBnCFG_ASSOC_SHIFT;
 
+	kvmppc_recalc_tlb1map_range(vcpu_e500);
 	return 0;
 
 err:
-- 
cgit v0.10.2


From 6e35994d1f6831af1e5577e28c363c9137d7d597 Mon Sep 17 00:00:00 2001
From: Bharat Bhushan <r65777@freescale.com>
Date: Wed, 18 Apr 2012 06:01:19 +0000
Subject: KVM: PPC: Use clockevent multiplier and shifter for decrementer

Time for which the hrtimer is started for decrementer emulation is calculated
using tb_ticks_per_usec. While hrtimer uses the clockevent for DEC
reprogramming (if needed) and which calculate timebase ticks using the
multiplier and shifter mechanism implemented within clockevent layer.

It was observed that this conversion (timebase->time->timebase) are not
correct because the mechanism are not consistent.
In our setup it adds 2% jitter.

With this patch clockevent multiplier and shifter mechanism are used when
starting hrtimer for decrementer emulation. Now the jitter is < 0.5%.

Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 2136f58..3b4b4a8 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -23,6 +23,7 @@
 extern unsigned long tb_ticks_per_jiffy;
 extern unsigned long tb_ticks_per_usec;
 extern unsigned long tb_ticks_per_sec;
+extern struct clock_event_device decrementer_clockevent;
 
 struct rtc_time;
 extern void to_tm(int tim, struct rtc_time * tm);
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 2c42cd7..99a995c 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -100,7 +100,7 @@ static int decrementer_set_next_event(unsigned long evt,
 static void decrementer_set_mode(enum clock_event_mode mode,
 				 struct clock_event_device *dev);
 
-static struct clock_event_device decrementer_clockevent = {
+struct clock_event_device decrementer_clockevent = {
 	.name           = "decrementer",
 	.rating         = 200,
 	.irq            = 0,
@@ -108,6 +108,7 @@ static struct clock_event_device decrementer_clockevent = {
 	.set_mode       = decrementer_set_mode,
 	.features       = CLOCK_EVT_FEAT_ONESHOT,
 };
+EXPORT_SYMBOL(decrementer_clockevent);
 
 DEFINE_PER_CPU(u64, decrementers_next_tb);
 static DEFINE_PER_CPU(struct clock_event_device, decrementers);
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index afc9154..b5872f6 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -23,6 +23,7 @@
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/kvm_host.h>
+#include <linux/clockchips.h>
 
 #include <asm/reg.h>
 #include <asm/time.h>
@@ -104,8 +105,12 @@ void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
 	 */
 
 	dec_time = vcpu->arch.dec;
-	dec_time *= 1000;
-	do_div(dec_time, tb_ticks_per_usec);
+	/*
+	 * Guest timebase ticks at the same frequency as host decrementer.
+	 * So use the host decrementer calculations for decrementer emulation.
+	 */
+	dec_time = dec_time << decrementer_clockevent.shift;
+	do_div(dec_time, decrementer_clockevent.mult);
 	dec_nsec = do_div(dec_time, NSEC_PER_SEC);
 	hrtimer_start(&vcpu->arch.dec_timer,
 		ktime_set(dec_time, dec_nsec), HRTIMER_MODE_REL);
-- 
cgit v0.10.2


From 185e4188dab6456409cad66c579501dd89487188 Mon Sep 17 00:00:00 2001
From: Varun Sethi <Varun.Sethi@freescale.com>
Date: Wed, 25 Apr 2012 01:26:43 +0000
Subject: KVM: PPC: bookehv: Use a Macro for saving/restoring guest registers
 to/from their 64 bit copies.

Introduced PPC_STD/PPC_LD macros for saving/restoring guest registers to/from their 64 bit copies.

Signed-off-by: Varun Sethi <Varun.Sethi@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 0978152..7d4018d 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -20,6 +20,14 @@
 #ifndef __POWERPC_KVM_ASM_H__
 #define __POWERPC_KVM_ASM_H__
 
+#ifdef CONFIG_64BIT
+#define PPC_STD(sreg, offset, areg)  std sreg, (offset)(areg)
+#define PPC_LD(treg, offset, areg)   ld treg, (offset)(areg)
+#else
+#define PPC_STD(sreg, offset, areg)  stw sreg, (offset+4)(areg)
+#define PPC_LD(treg, offset, areg)   lwz treg, (offset+4)(areg)
+#endif
+
 /* IVPR must be 64KiB-aligned. */
 #define VCPU_SIZE_ORDER 4
 #define VCPU_SIZE_LOG   (VCPU_SIZE_ORDER + 12)
diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index 909e96e..41d3485 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -93,11 +93,7 @@
 #endif
 
 	oris	r8, r6, MSR_CE@h
-#ifdef CONFIG_64BIT
-	std	r6, (VCPU_SHARED_MSR)(r11)
-#else
-	stw	r6, (VCPU_SHARED_MSR + 4)(r11)
-#endif
+	PPC_STD(r6, VCPU_SHARED_MSR, r11)
 	ori	r8, r8, MSR_ME | MSR_RI
 	PPC_STL	r5, VCPU_PC(r4)
 
@@ -335,11 +331,7 @@ _GLOBAL(kvmppc_resume_host)
 	stw	r5, VCPU_SHARED_MAS0(r11)
 	mfspr	r7, SPRN_MAS2
 	stw	r6, VCPU_SHARED_MAS1(r11)
-#ifdef CONFIG_64BIT
-	std	r7, (VCPU_SHARED_MAS2)(r11)
-#else
-	stw	r7, (VCPU_SHARED_MAS2 + 4)(r11)
-#endif
+	PPC_STD(r7, VCPU_SHARED_MAS2, r11)
 	mfspr	r5, SPRN_MAS3
 	mfspr	r6, SPRN_MAS4
 	stw	r5, VCPU_SHARED_MAS7_3+4(r11)
@@ -527,11 +519,7 @@ lightweight_exit:
 	stw	r3, VCPU_HOST_MAS6(r4)
 	lwz	r3, VCPU_SHARED_MAS0(r11)
 	lwz	r5, VCPU_SHARED_MAS1(r11)
-#ifdef CONFIG_64BIT
-	ld	r6, (VCPU_SHARED_MAS2)(r11)
-#else
-	lwz	r6, (VCPU_SHARED_MAS2 + 4)(r11)
-#endif
+	PPC_LD(r6, VCPU_SHARED_MAS2, r11)
 	lwz	r7, VCPU_SHARED_MAS7_3+4(r11)
 	lwz	r8, VCPU_SHARED_MAS4(r11)
 	mtspr	SPRN_MAS0, r3
@@ -565,11 +553,7 @@ lightweight_exit:
 	PPC_LL	r6, VCPU_CTR(r4)
 	PPC_LL	r7, VCPU_CR(r4)
 	PPC_LL	r8, VCPU_PC(r4)
-#ifdef CONFIG_64BIT
-	ld	r9, (VCPU_SHARED_MSR)(r11)
-#else
-	lwz	r9, (VCPU_SHARED_MSR + 4)(r11)
-#endif
+	PPC_LD(r9, VCPU_SHARED_MSR, r11)
 	PPC_LL	r0, VCPU_GPR(r0)(r4)
 	PPC_LL	r1, VCPU_GPR(r1)(r4)
 	PPC_LL	r2, VCPU_GPR(r2)(r4)
-- 
cgit v0.10.2


From 3d4c6826ed2a28e69e8ee14f1d58c4c8622f04b3 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Wed, 25 Apr 2012 13:48:54 +0200
Subject: KVM: PPC: Restrict PPC_[L|ST]D macro to asm code

We only want asm code macros to be accessible from asm code, so #ifdef it
depending on it.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 7d4018d..76fdcfe 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -20,6 +20,7 @@
 #ifndef __POWERPC_KVM_ASM_H__
 #define __POWERPC_KVM_ASM_H__
 
+#ifdef __ASSEMBLY__
 #ifdef CONFIG_64BIT
 #define PPC_STD(sreg, offset, areg)  std sreg, (offset)(areg)
 #define PPC_LD(treg, offset, areg)   ld treg, (offset)(areg)
@@ -27,6 +28,7 @@
 #define PPC_STD(sreg, offset, areg)  stw sreg, (offset+4)(areg)
 #define PPC_LD(treg, offset, areg)   lwz treg, (offset+4)(areg)
 #endif
+#endif
 
 /* IVPR must be 64KiB-aligned. */
 #define VCPU_SIZE_ORDER 4
-- 
cgit v0.10.2


From 30124906db8598255fba32c8bf0adb7e8f1503ab Mon Sep 17 00:00:00 2001
From: Varun Sethi <Varun.Sethi@freescale.com>
Date: Wed, 25 Apr 2012 01:27:34 +0000
Subject: KVM: PPC: booke(hv): Fix save/restore of guest accessible SPRGs.

For Guest accessible SPRGs 4-7, save/restore must be handled differently for 64bit and
non-64 bit case. Use the PPC_STD/PPC_LD macros for saving/restoring to/from these registers.

Signed-off-by: Varun Sethi <Varun.Sethi@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index c8c4b87..8feec2f 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -419,13 +419,13 @@ lightweight_exit:
 	 * written directly to the shared area, so we
 	 * need to reload them here with the guest's values.
 	 */
-	lwz	r3, VCPU_SHARED_SPRG4(r5)
+	PPC_LD(r3, VCPU_SHARED_SPRG4, r5)
 	mtspr	SPRN_SPRG4W, r3
-	lwz	r3, VCPU_SHARED_SPRG5(r5)
+	PPC_LD(r3, VCPU_SHARED_SPRG5, r5)
 	mtspr	SPRN_SPRG5W, r3
-	lwz	r3, VCPU_SHARED_SPRG6(r5)
+	PPC_LD(r3, VCPU_SHARED_SPRG6, r5)
 	mtspr	SPRN_SPRG6W, r3
-	lwz	r3, VCPU_SHARED_SPRG7(r5)
+	PPC_LD(r3, VCPU_SHARED_SPRG7, r5)
 	mtspr	SPRN_SPRG7W, r3
 
 #ifdef CONFIG_KVM_EXIT_TIMING
diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index 41d3485..b7608ac 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -316,13 +316,13 @@ _GLOBAL(kvmppc_resume_host)
 	PPC_STL	r5, VCPU_LR(r4)
 	mfspr	r7, SPRN_SPRG5
 	PPC_STL	r3, VCPU_VRSAVE(r4)
-	PPC_STL	r6, VCPU_SHARED_SPRG4(r11)
+	PPC_STD(r6, VCPU_SHARED_SPRG4, r11)
 	mfspr	r8, SPRN_SPRG6
-	PPC_STL	r7, VCPU_SHARED_SPRG5(r11)
+	PPC_STD(r7, VCPU_SHARED_SPRG5, r11)
 	mfspr	r9, SPRN_SPRG7
-	PPC_STL	r8, VCPU_SHARED_SPRG6(r11)
+	PPC_STD(r8, VCPU_SHARED_SPRG6, r11)
 	mfxer	r3
-	PPC_STL	r9, VCPU_SHARED_SPRG7(r11)
+	PPC_STD(r9, VCPU_SHARED_SPRG7, r11)
 
 	/* save guest MAS registers and restore host mas4 & mas6 */
 	mfspr	r5, SPRN_MAS0
@@ -537,13 +537,13 @@ lightweight_exit:
 	 * SPRGs, so we need to reload them here with the guest's values.
 	 */
 	lwz	r3, VCPU_VRSAVE(r4)
-	lwz	r5, VCPU_SHARED_SPRG4(r11)
+	PPC_LD(r5, VCPU_SHARED_SPRG4, r11)
 	mtspr	SPRN_VRSAVE, r3
-	lwz	r6, VCPU_SHARED_SPRG5(r11)
+	PPC_LD(r6, VCPU_SHARED_SPRG5, r11)
 	mtspr	SPRN_SPRG4W, r5
-	lwz	r7, VCPU_SHARED_SPRG6(r11)
+	PPC_LD(r7, VCPU_SHARED_SPRG6, r11)
 	mtspr	SPRN_SPRG5W, r6
-	lwz	r8, VCPU_SHARED_SPRG7(r11)
+	PPC_LD(r8, VCPU_SHARED_SPRG7, r11)
 	mtspr	SPRN_SPRG6W, r7
 	mtspr	SPRN_SPRG7W, r8
 
-- 
cgit v0.10.2


From 8c2d0be7efb0b92b5e4f89ea4363f3cdc11e2459 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Wed, 25 Apr 2012 14:28:23 +0200
Subject: KVM: PPC: Book3S: PR: Optimize entry path

By shuffling a few instructions around we can execute more memory
loads in parallel, giving us a small performance boost.

With this patch and a simple priviledged SPR access loop guest, I get
a speed bump from 2013052 to 2035607 exits per second.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index 0676ae2..6bae0a9 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -128,24 +128,25 @@ no_dcbz32_on:
 	/* First clear RI in our current MSR value */
 	li	r0, MSR_RI
 	andc	r6, r6, r0
-	MTMSR_EERI(r6)
-	mtsrr0	r9
-	mtsrr1	r4
 
 	PPC_LL	r0, SVCPU_R0(r3)
 	PPC_LL	r1, SVCPU_R1(r3)
 	PPC_LL	r2, SVCPU_R2(r3)
-	PPC_LL	r4, SVCPU_R4(r3)
 	PPC_LL	r5, SVCPU_R5(r3)
-	PPC_LL	r6, SVCPU_R6(r3)
 	PPC_LL	r7, SVCPU_R7(r3)
 	PPC_LL	r8, SVCPU_R8(r3)
-	PPC_LL	r9, SVCPU_R9(r3)
 	PPC_LL	r10, SVCPU_R10(r3)
 	PPC_LL	r11, SVCPU_R11(r3)
 	PPC_LL	r12, SVCPU_R12(r3)
 	PPC_LL	r13, SVCPU_R13(r3)
 
+	MTMSR_EERI(r6)
+	mtsrr0	r9
+	mtsrr1	r4
+
+	PPC_LL	r4, SVCPU_R4(r3)
+	PPC_LL	r6, SVCPU_R6(r3)
+	PPC_LL	r9, SVCPU_R9(r3)
 	PPC_LL	r3, (SVCPU_R3)(r3)
 
 	RFI
-- 
cgit v0.10.2


From af415087d2bbbef3cc25cdf371bfb0460cf66b3b Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Wed, 25 Apr 2012 14:29:57 +0200
Subject: KVM: PPC: Book3S: PR: No isync in slbie path

While messing around with the SLBs we're running in real mode. The
entry to guest space goes through rfid, which is context synchronizing,
so there's no need to manually synchronize anything through isync.

With this patch and a simple priviledged SPR access loop guest, I get
a speed bump from 2035607 to 2181301 exits per second.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S
index f2e6e48..56b983e 100644
--- a/arch/powerpc/kvm/book3s_64_slb.S
+++ b/arch/powerpc/kvm/book3s_64_slb.S
@@ -90,8 +90,6 @@ slb_exit_skip_ ## num:
 	or      r10, r10, r12
 	slbie	r10
 
-	isync
-
 	/* Fill SLB with our shadow */
 
 	lbz	r12, SVCPU_SLB_MAX(r3)
-- 
cgit v0.10.2


From 518f040c826d569daf260153d4f75c21b6d9979b Mon Sep 17 00:00:00 2001
From: Mihai Caraman <mihai.caraman@freescale.com>
Date: Mon, 16 Apr 2012 04:08:54 +0000
Subject: KVM: PPC: bookehv: Use lwz/stw instead of PPC_LL/PPC_STL for 32-bit
 fields

Interrupt code used PPC_LL/PPC_STL macros to load/store some of u32 fields
which led to memory overflow on 64-bit. Use lwz/stw instead.

Signed-off-by: Mihai Caraman <mihai.caraman@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index b7608ac..06750cc 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -87,9 +87,9 @@
 	mfspr	r8, SPRN_TBRL
 	mfspr	r9, SPRN_TBRU
 	cmpw	r9, r7
-	PPC_STL	r8, VCPU_TIMING_EXIT_TBL(r4)
+	stw	r8, VCPU_TIMING_EXIT_TBL(r4)
 	bne-	1b
-	PPC_STL	r9, VCPU_TIMING_EXIT_TBU(r4)
+	stw	r9, VCPU_TIMING_EXIT_TBU(r4)
 #endif
 
 	oris	r8, r6, MSR_CE@h
@@ -216,7 +216,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
 	PPC_STL	r4, VCPU_GPR(r4)(r11)
 	PPC_LL	r4, THREAD_NORMSAVE(0)(r10)
 	PPC_STL	r5, VCPU_GPR(r5)(r11)
-	PPC_STL	r13, VCPU_CR(r11)
+	stw	r13, VCPU_CR(r11)
 	mfspr	r5, \srr0
 	PPC_STL	r3, VCPU_GPR(r10)(r11)
 	PPC_LL	r3, THREAD_NORMSAVE(2)(r10)
@@ -243,7 +243,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
 	PPC_STL	r4, VCPU_GPR(r4)(r11)
 	PPC_LL	r4, GPR9(r8)
 	PPC_STL	r5, VCPU_GPR(r5)(r11)
-	PPC_STL	r9, VCPU_CR(r11)
+	stw	r9, VCPU_CR(r11)
 	mfspr	r5, \srr0
 	PPC_STL	r3, VCPU_GPR(r8)(r11)
 	PPC_LL	r3, GPR10(r8)
@@ -315,7 +315,7 @@ _GLOBAL(kvmppc_resume_host)
 	mfspr	r6, SPRN_SPRG4
 	PPC_STL	r5, VCPU_LR(r4)
 	mfspr	r7, SPRN_SPRG5
-	PPC_STL	r3, VCPU_VRSAVE(r4)
+	stw	r3, VCPU_VRSAVE(r4)
 	PPC_STD(r6, VCPU_SHARED_SPRG4, r11)
 	mfspr	r8, SPRN_SPRG6
 	PPC_STD(r7, VCPU_SHARED_SPRG5, r11)
@@ -551,7 +551,7 @@ lightweight_exit:
 	PPC_LL	r3, VCPU_LR(r4)
 	PPC_LL	r5, VCPU_XER(r4)
 	PPC_LL	r6, VCPU_CTR(r4)
-	PPC_LL	r7, VCPU_CR(r4)
+	lwz	r7, VCPU_CR(r4)
 	PPC_LL	r8, VCPU_PC(r4)
 	PPC_LD(r9, VCPU_SHARED_MSR, r11)
 	PPC_LL	r0, VCPU_GPR(r0)(r4)
@@ -574,9 +574,9 @@ lightweight_exit:
 	mfspr	r9, SPRN_TBRL
 	mfspr	r8, SPRN_TBRU
 	cmpw	r8, r6
-	PPC_STL	r9, VCPU_TIMING_LAST_ENTER_TBL(r4)
+	stw	r9, VCPU_TIMING_LAST_ENTER_TBL(r4)
 	bne	1b
-	PPC_STL	r8, VCPU_TIMING_LAST_ENTER_TBU(r4)
+	stw	r8, VCPU_TIMING_LAST_ENTER_TBU(r4)
 #endif
 
 	/*
-- 
cgit v0.10.2


From 978b4fae45b3fae803a9f56e2262f01f71b7dbc9 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Fri, 27 Apr 2012 01:00:17 +0200
Subject: KVM: PPC: Fix stbux emulation

Stbux writes the address it's operating on to the register specified in ra,
not into the data source register.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index b5872f6..a27d4dc 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -229,7 +229,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			emulated = kvmppc_handle_store(run, vcpu,
 						       kvmppc_get_gpr(vcpu, rs),
 			                               1, 1);
-			kvmppc_set_gpr(vcpu, rs, vcpu->arch.vaddr_accessed);
+			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 			break;
 
 		case OP_31_XOP_LHAX:
-- 
cgit v0.10.2


From 11f7d6c2d1b17abf7b91e0f2d43bfe9de0b9e5cf Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Fri, 27 Apr 2012 16:33:35 +0200
Subject: KVM: PPC: Fix PR KVM on POWER7 bare metal

When running on a system that is HV capable, some interrupts use HSRR
SPRs instead of the normal SRR SPRs. These are also used in the Linux
handlers to jump back to code after an interrupt got processed.

Unfortunately, in our "jump back to the real host handler after we've
done the context switch" code, we were only setting the SRR SPRs,
rendering Linux to jump back to some invalid IP after it's processed
the interrupt.

This fixes random crashes on p7 opal mode with PR KVM for me.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index 6bae0a9..8b2fc66 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -198,6 +198,7 @@ kvmppc_interrupt:
 	/* Save guest PC and MSR */
 #ifdef CONFIG_PPC64
 BEGIN_FTR_SECTION
+	mr	r10, r12
 	andi.	r0,r12,0x2
 	beq	1f
 	mfspr	r3,SPRN_HSRR0
@@ -317,23 +318,17 @@ no_dcbz32_off:
 	 * Having set up SRR0/1 with the address where we want
 	 * to continue with relocation on (potentially in module
 	 * space), we either just go straight there with rfi[d],
-	 * or we jump to an interrupt handler with bctr if there
-	 * is an interrupt to be handled first.  In the latter
-	 * case, the rfi[d] at the end of the interrupt handler
-	 * will get us back to where we want to continue.
+	 * or we jump to an interrupt handler if there is an
+	 * interrupt to be handled first.  In the latter case,
+	 * the rfi[d] at the end of the interrupt handler will
+	 * get us back to where we want to continue.
 	 */
 
-	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
-	beq	1f
-	cmpwi	r12, BOOK3S_INTERRUPT_DECREMENTER
-	beq	1f
-	cmpwi	r12, BOOK3S_INTERRUPT_PERFMON
-1:	mtctr	r12
-
 	/* Register usage at this point:
 	 *
 	 * R1       = host R1
 	 * R2       = host R2
+	 * R10      = raw exit handler id
 	 * R12      = exit handler id
 	 * R13      = shadow vcpu (32-bit) or PACA (64-bit)
 	 * SVCPU.*  = guest *
@@ -343,12 +338,26 @@ no_dcbz32_off:
 	PPC_LL	r6, HSTATE_HOST_MSR(r13)
 	PPC_LL	r8, HSTATE_VMHANDLER(r13)
 
-	/* Restore host msr -> SRR1 */
+#ifdef CONFIG_PPC64
+BEGIN_FTR_SECTION
+	andi.	r0,r10,0x2
+	beq	1f
+	mtspr	SPRN_HSRR1, r6
+	mtspr	SPRN_HSRR0, r8
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
+#endif
+1:	/* Restore host msr -> SRR1 */
 	mtsrr1	r6
 	/* Load highmem handler address */
 	mtsrr0	r8
 
 	/* RFI into the highmem handler, or jump to interrupt handler */
-	beqctr
+	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
+	beqa	BOOK3S_INTERRUPT_EXTERNAL
+	cmpwi	r12, BOOK3S_INTERRUPT_DECREMENTER
+	beqa	BOOK3S_INTERRUPT_DECREMENTER
+	cmpwi	r12, BOOK3S_INTERRUPT_PERFMON
+	beqa	BOOK3S_INTERRUPT_PERFMON
+
 	RFI
 kvmppc_handler_trampoline_exit_end:
-- 
cgit v0.10.2


From 3b1d9d7d95e7c62518160edebd92450b58c6d55f Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 30 Apr 2012 10:56:12 +0200
Subject: KVM: PPC: Book3S: Enable IRQs during exit handling

While handling an exit, we should listen for interrupts and make sure to
receive them when they arrive, to keep our latencies low.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index dba282e..d169a0a 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -548,6 +548,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	run->exit_reason = KVM_EXIT_UNKNOWN;
 	run->ready_for_interrupt_injection = 1;
 
+	/* We get here with MSR.EE=0, so enable it to be a nice citizen */
+	__hard_irq_enable();
+
 	trace_kvm_book3s_exit(exit_nr, vcpu);
 	preempt_enable();
 	kvm_resched(vcpu);
-- 
cgit v0.10.2


From 4444aa5f78eff73a353c8c4784cda2de74dea54b Mon Sep 17 00:00:00 2001
From: Mihai Caraman <mihai.caraman@freescale.com>
Date: Mon, 16 Apr 2012 04:08:53 +0000
Subject: KVM: PPC: bookehv: Fix r8/r13 storing in level exception handler

Guest r8 register is held in the scratch register and stored correctly,
so remove the instruction that clobbers it. Guest r13 was missing from vcpu,
store it there.

Signed-off-by: Mihai Caraman <mihai.caraman@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index 06750cc..6048a00 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -252,10 +252,10 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
 	mfspr	r6, \srr1
 	PPC_LL	r4, GPR11(r8)
 	PPC_STL	r7, VCPU_GPR(r7)(r11)
-	PPC_STL	r8, VCPU_GPR(r8)(r11)
 	PPC_STL r3, VCPU_GPR(r10)(r11)
 	mfctr	r7
 	PPC_STL	r12, VCPU_GPR(r12)(r11)
+	PPC_STL r13, VCPU_GPR(r13)(r11)
 	PPC_STL	r4, VCPU_GPR(r11)(r11)
 	PPC_STL	r7, VCPU_CTR(r11)
 	mr	r4, r11
-- 
cgit v0.10.2


From f31e65e1170edba4a86bd8cba0318e251d3746d0 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 15 Mar 2012 21:58:34 +0000
Subject: kvm/book3s: Make kernel emulated H_PUT_TCE available for "PR" KVM

There is nothing in the code for emulating TCE tables in the kernel
that prevents it from working on "PR" KVM... other than ifdef's and
location of the code.

This and moves the bulk of the code there to a new file called
book3s_64_vio.c.

This speeds things up a bit on my G5.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
[agraf: fix for hv kvm, 32bit, whitespace]
Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 42a527e..d848cdc 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -237,7 +237,6 @@ struct kvm_arch {
 	unsigned long vrma_slb_v;
 	int rma_setup_done;
 	int using_mmu_notifiers;
-	struct list_head spapr_tce_tables;
 	spinlock_t slot_phys_lock;
 	unsigned long *slot_phys[KVM_MEM_SLOTS_NUM];
 	int slot_npages[KVM_MEM_SLOTS_NUM];
@@ -245,6 +244,9 @@ struct kvm_arch {
 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
 	struct kvmppc_linear_info *hpt_li;
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
+#ifdef CONFIG_PPC_BOOK3S_64
+	struct list_head spapr_tce_tables;
+#endif
 };
 
 /*
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 7f0a3da..c1069f6 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -126,6 +126,8 @@ extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 				struct kvm_create_spapr_tce *args);
+extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+			     unsigned long ioba, unsigned long tce);
 extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
 				struct kvm_allocate_rma *rma);
 extern struct kvmppc_linear_info *kvm_alloc_rma(void);
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 25225ae..c2a0863 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -54,6 +54,7 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
 	book3s_paired_singles.o \
 	book3s_pr.o \
 	book3s_pr_papr.o \
+	book3s_64_vio_hv.o \
 	book3s_emulate.o \
 	book3s_interrupts.o \
 	book3s_mmu_hpte.o \
@@ -78,6 +79,7 @@ kvm-book3s_64-module-objs := \
 	powerpc.o \
 	emulate.o \
 	book3s.o \
+	book3s_64_vio.o \
 	$(kvm-book3s_64-objs-y)
 
 kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
new file mode 100644
index 0000000..72ffc89
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -0,0 +1,150 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <linux/list.h>
+#include <linux/anon_inodes.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+#include <asm/kvm_host.h>
+#include <asm/udbg.h>
+
+#define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
+
+static long kvmppc_stt_npages(unsigned long window_size)
+{
+	return ALIGN((window_size >> SPAPR_TCE_SHIFT)
+		     * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
+}
+
+static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
+{
+	struct kvm *kvm = stt->kvm;
+	int i;
+
+	mutex_lock(&kvm->lock);
+	list_del(&stt->list);
+	for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
+		__free_page(stt->pages[i]);
+	kfree(stt);
+	mutex_unlock(&kvm->lock);
+
+	kvm_put_kvm(kvm);
+}
+
+static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
+	struct page *page;
+
+	if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
+		return VM_FAULT_SIGBUS;
+
+	page = stt->pages[vmf->pgoff];
+	get_page(page);
+	vmf->page = page;
+	return 0;
+}
+
+static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
+	.fault = kvm_spapr_tce_fault,
+};
+
+static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &kvm_spapr_tce_vm_ops;
+	return 0;
+}
+
+static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
+{
+	struct kvmppc_spapr_tce_table *stt = filp->private_data;
+
+	release_spapr_tce_table(stt);
+	return 0;
+}
+
+static struct file_operations kvm_spapr_tce_fops = {
+	.mmap           = kvm_spapr_tce_mmap,
+	.release	= kvm_spapr_tce_release,
+};
+
+long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
+				   struct kvm_create_spapr_tce *args)
+{
+	struct kvmppc_spapr_tce_table *stt = NULL;
+	long npages;
+	int ret = -ENOMEM;
+	int i;
+
+	/* Check this LIOBN hasn't been previously allocated */
+	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
+		if (stt->liobn == args->liobn)
+			return -EBUSY;
+	}
+
+	npages = kvmppc_stt_npages(args->window_size);
+
+	stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
+		      GFP_KERNEL);
+	if (!stt)
+		goto fail;
+
+	stt->liobn = args->liobn;
+	stt->window_size = args->window_size;
+	stt->kvm = kvm;
+
+	for (i = 0; i < npages; i++) {
+		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!stt->pages[i])
+			goto fail;
+	}
+
+	kvm_get_kvm(kvm);
+
+	mutex_lock(&kvm->lock);
+	list_add(&stt->list, &kvm->arch.spapr_tce_tables);
+
+	mutex_unlock(&kvm->lock);
+
+	return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
+				stt, O_RDWR);
+
+fail:
+	if (stt) {
+		for (i = 0; i < npages; i++)
+			if (stt->pages[i])
+				__free_page(stt->pages[i]);
+
+		kfree(stt);
+	}
+	return ret;
+}
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index ea0f8c5..30c2f3b 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -38,6 +38,9 @@
 
 #define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
 
+/* WARNING: This will be called in real-mode on HV KVM and virtual
+ *          mode on PR KVM
+ */
 long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 		      unsigned long ioba, unsigned long tce)
 {
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 9079357..59c2967 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1093,115 +1093,6 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	return r;
 }
 
-static long kvmppc_stt_npages(unsigned long window_size)
-{
-	return ALIGN((window_size >> SPAPR_TCE_SHIFT)
-		     * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
-}
-
-static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
-{
-	struct kvm *kvm = stt->kvm;
-	int i;
-
-	mutex_lock(&kvm->lock);
-	list_del(&stt->list);
-	for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
-		__free_page(stt->pages[i]);
-	kfree(stt);
-	mutex_unlock(&kvm->lock);
-
-	kvm_put_kvm(kvm);
-}
-
-static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
-	struct page *page;
-
-	if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
-		return VM_FAULT_SIGBUS;
-
-	page = stt->pages[vmf->pgoff];
-	get_page(page);
-	vmf->page = page;
-	return 0;
-}
-
-static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
-	.fault = kvm_spapr_tce_fault,
-};
-
-static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	vma->vm_ops = &kvm_spapr_tce_vm_ops;
-	return 0;
-}
-
-static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
-{
-	struct kvmppc_spapr_tce_table *stt = filp->private_data;
-
-	release_spapr_tce_table(stt);
-	return 0;
-}
-
-static struct file_operations kvm_spapr_tce_fops = {
-	.mmap           = kvm_spapr_tce_mmap,
-	.release	= kvm_spapr_tce_release,
-};
-
-long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
-				   struct kvm_create_spapr_tce *args)
-{
-	struct kvmppc_spapr_tce_table *stt = NULL;
-	long npages;
-	int ret = -ENOMEM;
-	int i;
-
-	/* Check this LIOBN hasn't been previously allocated */
-	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
-		if (stt->liobn == args->liobn)
-			return -EBUSY;
-	}
-
-	npages = kvmppc_stt_npages(args->window_size);
-
-	stt = kzalloc(sizeof(*stt) + npages* sizeof(struct page *),
-		      GFP_KERNEL);
-	if (!stt)
-		goto fail;
-
-	stt->liobn = args->liobn;
-	stt->window_size = args->window_size;
-	stt->kvm = kvm;
-
-	for (i = 0; i < npages; i++) {
-		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
-		if (!stt->pages[i])
-			goto fail;
-	}
-
-	kvm_get_kvm(kvm);
-
-	mutex_lock(&kvm->lock);
-	list_add(&stt->list, &kvm->arch.spapr_tce_tables);
-
-	mutex_unlock(&kvm->lock);
-
-	return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
-				stt, O_RDWR);
-
-fail:
-	if (stt) {
-		for (i = 0; i < npages; i++)
-			if (stt->pages[i])
-				__free_page(stt->pages[i]);
-
-		kfree(stt);
-	}
-	return ret;
-}
 
 /* Work out RMLS (real mode limit selector) field value for a given RMA size.
    Assumes POWER7 or PPC970. */
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index d169a0a..815ac59 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1171,11 +1171,18 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
 
 int kvmppc_core_init_vm(struct kvm *kvm)
 {
+#ifdef CONFIG_PPC64
+	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+#endif
+
 	return 0;
 }
 
 void kvmppc_core_destroy_vm(struct kvm *kvm)
 {
+#ifdef CONFIG_PPC64
+	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
+#endif
 }
 
 static int kvmppc_book3s_init(void)
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index 60ac0e7..3ff9013 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -15,6 +15,8 @@
  * published by the Free Software Foundation.
  */
 
+#include <linux/anon_inodes.h>
+
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
@@ -211,6 +213,20 @@ static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu)
 	return EMULATE_DONE;
 }
 
+static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
+{
+	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
+	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
+	unsigned long tce = kvmppc_get_gpr(vcpu, 6);
+	long rc;
+
+	rc = kvmppc_h_put_tce(vcpu, liobn, ioba, tce);
+	if (rc == H_TOO_HARD)
+		return EMULATE_FAIL;
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
 int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 {
 	switch (cmd) {
@@ -222,6 +238,8 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 		return kvmppc_h_pr_protect(vcpu);
 	case H_BULK_REMOVE:
 		return kvmppc_h_pr_bulk_remove(vcpu);
+	case H_PUT_TCE:
+		return kvmppc_h_pr_put_tce(vcpu);
 	case H_CEDE:
 		kvm_vcpu_block(vcpu);
 		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 58ad860..6ac3115 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -244,10 +244,12 @@ int kvm_dev_ioctl_check_extension(long ext)
 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 		break;
 #endif
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#ifdef CONFIG_PPC_BOOK3S_64
 	case KVM_CAP_SPAPR_TCE:
 		r = 1;
 		break;
+#endif /* CONFIG_PPC_BOOK3S_64 */
+#ifdef CONFIG_KVM_BOOK3S_64_HV
 	case KVM_CAP_PPC_SMT:
 		r = threads_per_core;
 		break;
@@ -773,7 +775,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 		break;
 	}
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#ifdef CONFIG_PPC_BOOK3S_64
 	case KVM_CREATE_SPAPR_TCE: {
 		struct kvm_create_spapr_tce create_tce;
 		struct kvm *kvm = filp->private_data;
@@ -784,7 +786,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce);
 		goto out;
 	}
+#endif /* CONFIG_PPC_BOOK3S_64 */
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
 	case KVM_ALLOCATE_RMA: {
 		struct kvm *kvm = filp->private_data;
 		struct kvm_allocate_rma rma;
-- 
cgit v0.10.2


From 5b74716ebab10e7bce960d148fe6d8f6920451e5 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 26 Apr 2012 19:43:42 +0000
Subject: kvm/powerpc: Add new ioctl to retreive server MMU infos

This is necessary for qemu to be able to pass the right information
to the guest, such as the supported page sizes and corresponding
encodings in the SLB and hash table, which can vary depending
on the processor type, the type of KVM used (PR vs HV) and the
version of KVM

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
[agraf: fix compilation on hv, adjust for newer ioctl numbers]
Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index eb62761..9301266 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1860,6 +1860,76 @@ See KVM_GET_PIT2 for details on struct kvm_pit_state2.
 This IOCTL replaces the obsolete KVM_SET_PIT.
 
 
+4.74 KVM_PPC_GET_SMMU_INFO
+
+Capability: KVM_CAP_PPC_GET_SMMU_INFO
+Architectures: powerpc
+Type: vm ioctl
+Parameters: None
+Returns: 0 on success, -1 on error
+
+This populates and returns a structure describing the features of
+the "Server" class MMU emulation supported by KVM.
+This can in turn be used by userspace to generate the appropariate
+device-tree properties for the guest operating system.
+
+The structure contains some global informations, followed by an
+array of supported segment page sizes:
+
+      struct kvm_ppc_smmu_info {
+	     __u64 flags;
+	     __u32 slb_size;
+	     __u32 pad;
+	     struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ];
+      };
+
+The supported flags are:
+
+    - KVM_PPC_PAGE_SIZES_REAL:
+        When that flag is set, guest page sizes must "fit" the backing
+        store page sizes. When not set, any page size in the list can
+        be used regardless of how they are backed by userspace.
+
+    - KVM_PPC_1T_SEGMENTS
+        The emulated MMU supports 1T segments in addition to the
+        standard 256M ones.
+
+The "slb_size" field indicates how many SLB entries are supported
+
+The "sps" array contains 8 entries indicating the supported base
+page sizes for a segment in increasing order. Each entry is defined
+as follow:
+
+   struct kvm_ppc_one_seg_page_size {
+	__u32 page_shift;	/* Base page shift of segment (or 0) */
+	__u32 slb_enc;		/* SLB encoding for BookS */
+	struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ];
+   };
+
+An entry with a "page_shift" of 0 is unused. Because the array is
+organized in increasing order, a lookup can stop when encoutering
+such an entry.
+
+The "slb_enc" field provides the encoding to use in the SLB for the
+page size. The bits are in positions such as the value can directly
+be OR'ed into the "vsid" argument of the slbmte instruction.
+
+The "enc" array is a list which for each of those segment base page
+size provides the list of supported actual page sizes (which can be
+only larger or equal to the base page size), along with the
+corresponding encoding in the hash PTE. Similarily, the array is
+8 entries sorted by increasing sizes and an entry with a "0" shift
+is an empty entry and a terminator:
+
+   struct kvm_ppc_one_page_size {
+	__u32 page_shift;	/* Page shift (or 0) */
+	__u32 pte_enc;		/* Encoding in the HPTE (>>12) */
+   };
+
+The "pte_enc" field provides a value that can OR'ed into the hash
+PTE's RPN field (ie, it needs to be shifted left by 12 to OR it
+into the hash PTE second double word).
+
 5. The kvm_run structure
 ------------------------
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index c1069f6..c87e3b5 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -140,6 +140,8 @@ extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem);
 extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem);
+extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm,
+				      struct kvm_ppc_smmu_info *info);
 
 extern int kvmppc_bookehv_init(void);
 extern void kvmppc_bookehv_exit(void);
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index 786a270..d1f2aaf 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -190,3 +190,7 @@ EXPORT_SYMBOL(__arch_hweight16);
 EXPORT_SYMBOL(__arch_hweight32);
 EXPORT_SYMBOL(__arch_hweight64);
 #endif
+
+#ifdef CONFIG_PPC_BOOK3S_64
+EXPORT_SYMBOL_GPL(mmu_psize_defs);
+#endif
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 59c2967..bb5a0f4 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1175,6 +1175,38 @@ long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
 	return fd;
 }
 
+static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
+				     int linux_psize)
+{
+	struct mmu_psize_def *def = &mmu_psize_defs[linux_psize];
+
+	if (!def->shift)
+		return;
+	(*sps)->page_shift = def->shift;
+	(*sps)->slb_enc = def->sllp;
+	(*sps)->enc[0].page_shift = def->shift;
+	(*sps)->enc[0].pte_enc = def->penc;
+	(*sps)++;
+}
+
+int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info)
+{
+	struct kvm_ppc_one_seg_page_size *sps;
+
+	info->flags = KVM_PPC_PAGE_SIZES_REAL;
+	if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
+		info->flags |= KVM_PPC_1T_SEGMENTS;
+	info->slb_size = mmu_slb_size;
+
+	/* We only support these sizes for now, and no muti-size segments */
+	sps = &info->sps[0];
+	kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K);
+	kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K);
+	kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M);
+
+	return 0;
+}
+
 /*
  * Get (and clear) the dirty memory log for a memory slot.
  */
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 815ac59..a1baec3 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1158,6 +1158,31 @@ out:
 	return r;
 }
 
+#ifdef CONFIG_PPC64
+int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info)
+{
+	/* No flags */
+	info->flags = 0;
+
+	/* SLB is always 64 entries */
+	info->slb_size = 64;
+
+	/* Standard 4k base page size segment */
+	info->sps[0].page_shift = 12;
+	info->sps[0].slb_enc = 0;
+	info->sps[0].enc[0].page_shift = 12;
+	info->sps[0].enc[0].pte_enc = 0;
+
+	/* Standard 16M large page size segment */
+	info->sps[1].page_shift = 24;
+	info->sps[1].slb_enc = SLB_VSID_L;
+	info->sps[1].enc[0].page_shift = 24;
+	info->sps[1].enc[0].pte_enc = 0;
+
+	return 0;
+}
+#endif /* CONFIG_PPC64 */
+
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				      struct kvm_userspace_memory_region *mem)
 {
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 6ac3115..1493c8d 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -279,6 +279,11 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_MAX_VCPUS:
 		r = KVM_MAX_VCPUS;
 		break;
+#ifdef CONFIG_PPC_BOOK3S_64
+	case KVM_CAP_PPC_GET_SMMU_INFO:
+		r = 1;
+		break;
+#endif
 	default:
 		r = 0;
 		break;
@@ -718,7 +723,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 #endif
-
 	default:
 		r = -EINVAL;
 	}
@@ -800,6 +804,18 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	}
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 
+#ifdef CONFIG_PPC_BOOK3S_64
+	case KVM_PPC_GET_SMMU_INFO: {
+		struct kvm *kvm = filp->private_data;
+		struct kvm_ppc_smmu_info info;
+
+		memset(&info, 0, sizeof(info));
+		r = kvm_vm_ioctl_get_smmu_info(kvm, &info);
+		if (r >= 0 && copy_to_user(argp, &info, sizeof(info)))
+			r = -EFAULT;
+		break;
+	}
+#endif /* CONFIG_PPC_BOOK3S_64 */
 	default:
 		r = -ENOTTY;
 	}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 225b452..8d696cf 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -449,6 +449,30 @@ struct kvm_ppc_pvinfo {
 	__u8  pad[108];
 };
 
+/* for KVM_PPC_GET_SMMU_INFO */
+#define KVM_PPC_PAGE_SIZES_MAX_SZ	8
+
+struct kvm_ppc_one_page_size {
+	__u32 page_shift;	/* Page shift (or 0) */
+	__u32 pte_enc;		/* Encoding in the HPTE (>>12) */
+};
+
+struct kvm_ppc_one_seg_page_size {
+	__u32 page_shift;	/* Base page shift of segment (or 0) */
+	__u32 slb_enc;		/* SLB encoding for BookS */
+	struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ];
+};
+
+#define KVM_PPC_PAGE_SIZES_REAL		0x00000001
+#define KVM_PPC_1T_SEGMENTS		0x00000002
+
+struct kvm_ppc_smmu_info {
+	__u64 flags;
+	__u32 slb_size;
+	__u32 pad;
+	struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ];
+};
+
 #define KVMIO 0xAE
 
 /* machine type bits, to be used as argument to KVM_CREATE_VM */
@@ -591,6 +615,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PCI_2_3 75
 #define KVM_CAP_KVMCLOCK_CTRL 76
 #define KVM_CAP_SIGNAL_MSI 77
+#define KVM_CAP_PPC_GET_SMMU_INFO 78
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -800,6 +825,8 @@ struct kvm_s390_ucas_mapping {
 				       struct kvm_assigned_pci_dev)
 /* Available with KVM_CAP_SIGNAL_MSI */
 #define KVM_SIGNAL_MSI            _IOW(KVMIO,  0xa5, struct kvm_msi)
+/* Available with KVM_CAP_PPC_GET_SMMU_INFO */
+#define KVM_PPC_GET_SMMU_INFO	  _IOR(KVMIO,  0xa6, struct kvm_ppc_smmu_info)
 
 /*
  * ioctls for vcpu fds
-- 
cgit v0.10.2


From c46dc9a86148bc37c31d67a22a3887144ba7aa81 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Fri, 4 May 2012 14:01:33 +0200
Subject: KVM: PPC: Emulator: clean up instruction parsing

Instructions on PPC are pretty similarly encoded. So instead of
every instruction emulation code decoding the instruction fields
itself, we can move that code to more generic places and rely on
the compiler to optimize the unused bits away.

This has 2 advantages. It makes the code smaller and it makes the
code less error prone, as the instruction fields are always
available, so accidental misusage is reduced.

Functionally, this patch doesn't change anything.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c
index 549bb2c..da81a2d 100644
--- a/arch/powerpc/kvm/44x_emulate.c
+++ b/arch/powerpc/kvm/44x_emulate.c
@@ -37,22 +37,19 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                            unsigned int inst, int *advance)
 {
 	int emulated = EMULATE_DONE;
-	int dcrn;
-	int ra;
-	int rb;
-	int rc;
-	int rs;
-	int rt;
-	int ws;
+	int dcrn = get_dcrn(inst);
+	int ra = get_ra(inst);
+	int rb = get_rb(inst);
+	int rc = get_rc(inst);
+	int rs = get_rs(inst);
+	int rt = get_rt(inst);
+	int ws = get_ws(inst);
 
 	switch (get_op(inst)) {
 	case 31:
 		switch (get_xop(inst)) {
 
 		case XOP_MFDCR:
-			dcrn = get_dcrn(inst);
-			rt = get_rt(inst);
-
 			/* The guest may access CPR0 registers to determine the timebase
 			 * frequency, and it must know the real host frequency because it
 			 * can directly access the timebase registers.
@@ -88,9 +85,6 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			break;
 
 		case XOP_MTDCR:
-			dcrn = get_dcrn(inst);
-			rs = get_rs(inst);
-
 			/* emulate some access in kernel */
 			switch (dcrn) {
 			case DCRN_CPR0_CONFIG_ADDR:
@@ -108,17 +102,10 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			break;
 
 		case XOP_TLBWE:
-			ra = get_ra(inst);
-			rs = get_rs(inst);
-			ws = get_ws(inst);
 			emulated = kvmppc_44x_emul_tlbwe(vcpu, ra, rs, ws);
 			break;
 
 		case XOP_TLBSX:
-			rt = get_rt(inst);
-			ra = get_ra(inst);
-			rb = get_rb(inst);
-			rc = get_rc(inst);
 			emulated = kvmppc_44x_emul_tlbsx(vcpu, rt, ra, rb, rc);
 			break;
 
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index 135663a..c023bcd 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -87,6 +87,10 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                            unsigned int inst, int *advance)
 {
 	int emulated = EMULATE_DONE;
+	int rt = get_rt(inst);
+	int rs = get_rs(inst);
+	int ra = get_ra(inst);
+	int rb = get_rb(inst);
 
 	switch (get_op(inst)) {
 	case 19:
@@ -106,21 +110,22 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	case 31:
 		switch (get_xop(inst)) {
 		case OP_31_XOP_MFMSR:
-			kvmppc_set_gpr(vcpu, get_rt(inst),
-				       vcpu->arch.shared->msr);
+			kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->msr);
 			break;
 		case OP_31_XOP_MTMSRD:
 		{
-			ulong rs = kvmppc_get_gpr(vcpu, get_rs(inst));
+			ulong rs_val = kvmppc_get_gpr(vcpu, rs);
 			if (inst & 0x10000) {
-				vcpu->arch.shared->msr &= ~(MSR_RI | MSR_EE);
-				vcpu->arch.shared->msr |= rs & (MSR_RI | MSR_EE);
+				ulong new_msr = vcpu->arch.shared->msr;
+				new_msr &= ~(MSR_RI | MSR_EE);
+				new_msr |= rs_val & (MSR_RI | MSR_EE);
+				vcpu->arch.shared->msr = new_msr;
 			} else
-				kvmppc_set_msr(vcpu, rs);
+				kvmppc_set_msr(vcpu, rs_val);
 			break;
 		}
 		case OP_31_XOP_MTMSR:
-			kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, get_rs(inst)));
+			kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, rs));
 			break;
 		case OP_31_XOP_MFSR:
 		{
@@ -130,7 +135,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			if (vcpu->arch.mmu.mfsrin) {
 				u32 sr;
 				sr = vcpu->arch.mmu.mfsrin(vcpu, srnum);
-				kvmppc_set_gpr(vcpu, get_rt(inst), sr);
+				kvmppc_set_gpr(vcpu, rt, sr);
 			}
 			break;
 		}
@@ -138,29 +143,29 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		{
 			int srnum;
 
-			srnum = (kvmppc_get_gpr(vcpu, get_rb(inst)) >> 28) & 0xf;
+			srnum = (kvmppc_get_gpr(vcpu, rb) >> 28) & 0xf;
 			if (vcpu->arch.mmu.mfsrin) {
 				u32 sr;
 				sr = vcpu->arch.mmu.mfsrin(vcpu, srnum);
-				kvmppc_set_gpr(vcpu, get_rt(inst), sr);
+				kvmppc_set_gpr(vcpu, rt, sr);
 			}
 			break;
 		}
 		case OP_31_XOP_MTSR:
 			vcpu->arch.mmu.mtsrin(vcpu,
 				(inst >> 16) & 0xf,
-				kvmppc_get_gpr(vcpu, get_rs(inst)));
+				kvmppc_get_gpr(vcpu, rs));
 			break;
 		case OP_31_XOP_MTSRIN:
 			vcpu->arch.mmu.mtsrin(vcpu,
-				(kvmppc_get_gpr(vcpu, get_rb(inst)) >> 28) & 0xf,
-				kvmppc_get_gpr(vcpu, get_rs(inst)));
+				(kvmppc_get_gpr(vcpu, rb) >> 28) & 0xf,
+				kvmppc_get_gpr(vcpu, rs));
 			break;
 		case OP_31_XOP_TLBIE:
 		case OP_31_XOP_TLBIEL:
 		{
 			bool large = (inst & 0x00200000) ? true : false;
-			ulong addr = kvmppc_get_gpr(vcpu, get_rb(inst));
+			ulong addr = kvmppc_get_gpr(vcpu, rb);
 			vcpu->arch.mmu.tlbie(vcpu, addr, large);
 			break;
 		}
@@ -171,15 +176,15 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				return EMULATE_FAIL;
 
 			vcpu->arch.mmu.slbmte(vcpu,
-					kvmppc_get_gpr(vcpu, get_rs(inst)),
-					kvmppc_get_gpr(vcpu, get_rb(inst)));
+					kvmppc_get_gpr(vcpu, rs),
+					kvmppc_get_gpr(vcpu, rb));
 			break;
 		case OP_31_XOP_SLBIE:
 			if (!vcpu->arch.mmu.slbie)
 				return EMULATE_FAIL;
 
 			vcpu->arch.mmu.slbie(vcpu,
-					kvmppc_get_gpr(vcpu, get_rb(inst)));
+					kvmppc_get_gpr(vcpu, rb));
 			break;
 		case OP_31_XOP_SLBIA:
 			if (!vcpu->arch.mmu.slbia)
@@ -191,22 +196,22 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			if (!vcpu->arch.mmu.slbmfee) {
 				emulated = EMULATE_FAIL;
 			} else {
-				ulong t, rb;
+				ulong t, rb_val;
 
-				rb = kvmppc_get_gpr(vcpu, get_rb(inst));
-				t = vcpu->arch.mmu.slbmfee(vcpu, rb);
-				kvmppc_set_gpr(vcpu, get_rt(inst), t);
+				rb_val = kvmppc_get_gpr(vcpu, rb);
+				t = vcpu->arch.mmu.slbmfee(vcpu, rb_val);
+				kvmppc_set_gpr(vcpu, rt, t);
 			}
 			break;
 		case OP_31_XOP_SLBMFEV:
 			if (!vcpu->arch.mmu.slbmfev) {
 				emulated = EMULATE_FAIL;
 			} else {
-				ulong t, rb;
+				ulong t, rb_val;
 
-				rb = kvmppc_get_gpr(vcpu, get_rb(inst));
-				t = vcpu->arch.mmu.slbmfev(vcpu, rb);
-				kvmppc_set_gpr(vcpu, get_rt(inst), t);
+				rb_val = kvmppc_get_gpr(vcpu, rb);
+				t = vcpu->arch.mmu.slbmfev(vcpu, rb_val);
+				kvmppc_set_gpr(vcpu, rt, t);
 			}
 			break;
 		case OP_31_XOP_DCBA:
@@ -214,17 +219,17 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			break;
 		case OP_31_XOP_DCBZ:
 		{
-			ulong rb = kvmppc_get_gpr(vcpu, get_rb(inst));
-			ulong ra = 0;
+			ulong rb_val = kvmppc_get_gpr(vcpu, rb);
+			ulong ra_val = 0;
 			ulong addr, vaddr;
 			u32 zeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
 			u32 dsisr;
 			int r;
 
-			if (get_ra(inst))
-				ra = kvmppc_get_gpr(vcpu, get_ra(inst));
+			if (ra)
+				ra_val = kvmppc_get_gpr(vcpu, ra);
 
-			addr = (ra + rb) & ~31ULL;
+			addr = (ra_val + rb_val) & ~31ULL;
 			if (!(vcpu->arch.shared->msr & MSR_SF))
 				addr &= 0xffffffff;
 			vaddr = addr;
@@ -565,23 +570,22 @@ u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst)
 ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst)
 {
 	ulong dar = 0;
-	ulong ra;
+	ulong ra = get_ra(inst);
+	ulong rb = get_rb(inst);
 
 	switch (get_op(inst)) {
 	case OP_LFS:
 	case OP_LFD:
 	case OP_STFD:
 	case OP_STFS:
-		ra = get_ra(inst);
 		if (ra)
 			dar = kvmppc_get_gpr(vcpu, ra);
 		dar += (s32)((s16)inst);
 		break;
 	case 31:
-		ra = get_ra(inst);
 		if (ra)
 			dar = kvmppc_get_gpr(vcpu, ra);
-		dar += kvmppc_get_gpr(vcpu, get_rb(inst));
+		dar += kvmppc_get_gpr(vcpu, rb);
 		break;
 	default:
 		printk(KERN_INFO "KVM: Unaligned instruction 0x%x\n", inst);
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 904412b..e14f7b2 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -40,8 +40,8 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                             unsigned int inst, int *advance)
 {
 	int emulated = EMULATE_DONE;
-	int rs;
-	int rt;
+	int rs = get_rs(inst);
+	int rt = get_rt(inst);
 
 	switch (get_op(inst)) {
 	case 19:
@@ -62,19 +62,16 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		switch (get_xop(inst)) {
 
 		case OP_31_XOP_MFMSR:
-			rt = get_rt(inst);
 			kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->msr);
 			kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS);
 			break;
 
 		case OP_31_XOP_MTMSR:
-			rs = get_rs(inst);
 			kvmppc_set_exit_type(vcpu, EMULATED_MTMSR_EXITS);
 			kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, rs));
 			break;
 
 		case OP_31_XOP_WRTEE:
-			rs = get_rs(inst);
 			vcpu->arch.shared->msr = (vcpu->arch.shared->msr & ~MSR_EE)
 					| (kvmppc_get_gpr(vcpu, rs) & MSR_EE);
 			kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index 99155f8..9b2dcda 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -86,9 +86,9 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                            unsigned int inst, int *advance)
 {
 	int emulated = EMULATE_DONE;
-	int ra;
-	int rb;
-	int rt;
+	int ra = get_ra(inst);
+	int rb = get_rb(inst);
+	int rt = get_rt(inst);
 
 	switch (get_op(inst)) {
 	case 31:
@@ -96,11 +96,11 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 #ifdef CONFIG_KVM_E500MC
 		case XOP_MSGSND:
-			emulated = kvmppc_e500_emul_msgsnd(vcpu, get_rb(inst));
+			emulated = kvmppc_e500_emul_msgsnd(vcpu, rb);
 			break;
 
 		case XOP_MSGCLR:
-			emulated = kvmppc_e500_emul_msgclr(vcpu, get_rb(inst));
+			emulated = kvmppc_e500_emul_msgclr(vcpu, rb);
 			break;
 #endif
 
@@ -113,20 +113,14 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			break;
 
 		case XOP_TLBSX:
-			rb = get_rb(inst);
 			emulated = kvmppc_e500_emul_tlbsx(vcpu,rb);
 			break;
 
 		case XOP_TLBILX:
-			ra = get_ra(inst);
-			rb = get_rb(inst);
-			rt = get_rt(inst);
 			emulated = kvmppc_e500_emul_tlbilx(vcpu, rt, ra, rb);
 			break;
 
 		case XOP_TLBIVAX:
-			ra = get_ra(inst);
-			rb = get_rb(inst);
 			emulated = kvmppc_e500_emul_tlbivax(vcpu, ra, rb);
 			break;
 
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index a27d4dc..f63b5cb 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -148,11 +148,10 @@ u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb)
 int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
 	u32 inst = kvmppc_get_last_inst(vcpu);
-	int ra;
-	int rb;
-	int rs;
-	int rt;
-	int sprn;
+	int ra = get_ra(inst);
+	int rs = get_rs(inst);
+	int rt = get_rt(inst);
+	int sprn = get_sprn(inst);
 	enum emulation_result emulated = EMULATE_DONE;
 	int advance = 1;
 
@@ -189,43 +188,31 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			advance = 0;
 			break;
 		case OP_31_XOP_LWZX:
-			rt = get_rt(inst);
 			emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
 			break;
 
 		case OP_31_XOP_LBZX:
-			rt = get_rt(inst);
 			emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
 			break;
 
 		case OP_31_XOP_LBZUX:
-			rt = get_rt(inst);
-			ra = get_ra(inst);
-			rb = get_rb(inst);
-
 			emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
 			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 			break;
 
 		case OP_31_XOP_STWX:
-			rs = get_rs(inst);
 			emulated = kvmppc_handle_store(run, vcpu,
 						       kvmppc_get_gpr(vcpu, rs),
 			                               4, 1);
 			break;
 
 		case OP_31_XOP_STBX:
-			rs = get_rs(inst);
 			emulated = kvmppc_handle_store(run, vcpu,
 						       kvmppc_get_gpr(vcpu, rs),
 			                               1, 1);
 			break;
 
 		case OP_31_XOP_STBUX:
-			rs = get_rs(inst);
-			ra = get_ra(inst);
-			rb = get_rb(inst);
-
 			emulated = kvmppc_handle_store(run, vcpu,
 						       kvmppc_get_gpr(vcpu, rs),
 			                               1, 1);
@@ -233,28 +220,19 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			break;
 
 		case OP_31_XOP_LHAX:
-			rt = get_rt(inst);
 			emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1);
 			break;
 
 		case OP_31_XOP_LHZX:
-			rt = get_rt(inst);
 			emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
 			break;
 
 		case OP_31_XOP_LHZUX:
-			rt = get_rt(inst);
-			ra = get_ra(inst);
-			rb = get_rb(inst);
-
 			emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
 			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 			break;
 
 		case OP_31_XOP_MFSPR:
-			sprn = get_sprn(inst);
-			rt = get_rt(inst);
-
 			switch (sprn) {
 			case SPRN_SRR0:
 				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->srr0);
@@ -310,20 +288,12 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			break;
 
 		case OP_31_XOP_STHX:
-			rs = get_rs(inst);
-			ra = get_ra(inst);
-			rb = get_rb(inst);
-
 			emulated = kvmppc_handle_store(run, vcpu,
 						       kvmppc_get_gpr(vcpu, rs),
 			                               2, 1);
 			break;
 
 		case OP_31_XOP_STHUX:
-			rs = get_rs(inst);
-			ra = get_ra(inst);
-			rb = get_rb(inst);
-
 			emulated = kvmppc_handle_store(run, vcpu,
 						       kvmppc_get_gpr(vcpu, rs),
 			                               2, 1);
@@ -331,8 +301,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			break;
 
 		case OP_31_XOP_MTSPR:
-			sprn = get_sprn(inst);
-			rs = get_rs(inst);
 			switch (sprn) {
 			case SPRN_SRR0:
 				vcpu->arch.shared->srr0 = kvmppc_get_gpr(vcpu, rs);
@@ -384,7 +352,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			break;
 
 		case OP_31_XOP_LWBRX:
-			rt = get_rt(inst);
 			emulated = kvmppc_handle_load(run, vcpu, rt, 4, 0);
 			break;
 
@@ -392,25 +359,16 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			break;
 
 		case OP_31_XOP_STWBRX:
-			rs = get_rs(inst);
-			ra = get_ra(inst);
-			rb = get_rb(inst);
-
 			emulated = kvmppc_handle_store(run, vcpu,
 						       kvmppc_get_gpr(vcpu, rs),
 			                               4, 0);
 			break;
 
 		case OP_31_XOP_LHBRX:
-			rt = get_rt(inst);
 			emulated = kvmppc_handle_load(run, vcpu, rt, 2, 0);
 			break;
 
 		case OP_31_XOP_STHBRX:
-			rs = get_rs(inst);
-			ra = get_ra(inst);
-			rb = get_rb(inst);
-
 			emulated = kvmppc_handle_store(run, vcpu,
 						       kvmppc_get_gpr(vcpu, rs),
 			                               2, 0);
@@ -423,39 +381,30 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		break;
 
 	case OP_LWZ:
-		rt = get_rt(inst);
 		emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
 		break;
 
 	case OP_LWZU:
-		ra = get_ra(inst);
-		rt = get_rt(inst);
 		emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
 		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 		break;
 
 	case OP_LBZ:
-		rt = get_rt(inst);
 		emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
 		break;
 
 	case OP_LBZU:
-		ra = get_ra(inst);
-		rt = get_rt(inst);
 		emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
 		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 		break;
 
 	case OP_STW:
-		rs = get_rs(inst);
 		emulated = kvmppc_handle_store(run, vcpu,
 					       kvmppc_get_gpr(vcpu, rs),
 		                               4, 1);
 		break;
 
 	case OP_STWU:
-		ra = get_ra(inst);
-		rs = get_rs(inst);
 		emulated = kvmppc_handle_store(run, vcpu,
 					       kvmppc_get_gpr(vcpu, rs),
 		                               4, 1);
@@ -463,15 +412,12 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		break;
 
 	case OP_STB:
-		rs = get_rs(inst);
 		emulated = kvmppc_handle_store(run, vcpu,
 					       kvmppc_get_gpr(vcpu, rs),
 		                               1, 1);
 		break;
 
 	case OP_STBU:
-		ra = get_ra(inst);
-		rs = get_rs(inst);
 		emulated = kvmppc_handle_store(run, vcpu,
 					       kvmppc_get_gpr(vcpu, rs),
 		                               1, 1);
@@ -479,39 +425,30 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		break;
 
 	case OP_LHZ:
-		rt = get_rt(inst);
 		emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
 		break;
 
 	case OP_LHZU:
-		ra = get_ra(inst);
-		rt = get_rt(inst);
 		emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
 		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 		break;
 
 	case OP_LHA:
-		rt = get_rt(inst);
 		emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1);
 		break;
 
 	case OP_LHAU:
-		ra = get_ra(inst);
-		rt = get_rt(inst);
 		emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1);
 		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 		break;
 
 	case OP_STH:
-		rs = get_rs(inst);
 		emulated = kvmppc_handle_store(run, vcpu,
 					       kvmppc_get_gpr(vcpu, rs),
 		                               2, 1);
 		break;
 
 	case OP_STHU:
-		ra = get_ra(inst);
-		rs = get_rs(inst);
 		emulated = kvmppc_handle_store(run, vcpu,
 					       kvmppc_get_gpr(vcpu, rs),
 		                               2, 1);
-- 
cgit v0.10.2


From 54771e6217ce05a474827d9b23ff03de9d2ef2a0 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Fri, 4 May 2012 14:55:12 +0200
Subject: KVM: PPC: Emulator: clean up SPR reads and writes

When reading and writing SPRs, every SPR emulation piece had to read
or write the respective GPR the value was read from or stored in itself.

This approach is pretty prone to failure. What if we accidentally
implement mfspr emulation where we just do "break" and nothing else?
Suddenly we would get a random value in the return register - which is
always a bad idea.

So let's consolidate the generic code paths and only give the core
specific SPR handling code readily made variables to read/write from/to.

Functionally, this patch doesn't change anything, but it increases the
readability of the code and makes is less prone to bugs.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index c87e3b5..f68c22f 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -107,8 +107,10 @@ extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
 
 extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                   unsigned int op, int *advance);
-extern int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs);
-extern int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt);
+extern int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn,
+				     ulong val);
+extern int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn,
+				     ulong *val);
 
 extern int kvmppc_booke_init(void);
 extern void kvmppc_booke_exit(void);
diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c
index da81a2d..c8c6157 100644
--- a/arch/powerpc/kvm/44x_emulate.c
+++ b/arch/powerpc/kvm/44x_emulate.c
@@ -128,41 +128,41 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	return emulated;
 }
 
-int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 {
 	int emulated = EMULATE_DONE;
 
 	switch (sprn) {
 	case SPRN_PID:
-		kvmppc_set_pid(vcpu, kvmppc_get_gpr(vcpu, rs)); break;
+		kvmppc_set_pid(vcpu, spr_val); break;
 	case SPRN_MMUCR:
-		vcpu->arch.mmucr = kvmppc_get_gpr(vcpu, rs); break;
+		vcpu->arch.mmucr = spr_val; break;
 	case SPRN_CCR0:
-		vcpu->arch.ccr0 = kvmppc_get_gpr(vcpu, rs); break;
+		vcpu->arch.ccr0 = spr_val; break;
 	case SPRN_CCR1:
-		vcpu->arch.ccr1 = kvmppc_get_gpr(vcpu, rs); break;
+		vcpu->arch.ccr1 = spr_val; break;
 	default:
-		emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs);
+		emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, spr_val);
 	}
 
 	return emulated;
 }
 
-int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
 {
 	int emulated = EMULATE_DONE;
 
 	switch (sprn) {
 	case SPRN_PID:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.pid); break;
+		*spr_val = vcpu->arch.pid; break;
 	case SPRN_MMUCR:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.mmucr); break;
+		*spr_val = vcpu->arch.mmucr; break;
 	case SPRN_CCR0:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ccr0); break;
+		*spr_val = vcpu->arch.ccr0; break;
 	case SPRN_CCR1:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ccr1); break;
+		*spr_val = vcpu->arch.ccr1; break;
 	default:
-		emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt);
+		emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, spr_val);
 	}
 
 	return emulated;
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index c023bcd..b9a989d 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -318,10 +318,9 @@ static struct kvmppc_bat *kvmppc_find_bat(struct kvm_vcpu *vcpu, int sprn)
 	return bat;
 }
 
-int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 {
 	int emulated = EMULATE_DONE;
-	ulong spr_val = kvmppc_get_gpr(vcpu, rs);
 
 	switch (sprn) {
 	case SPRN_SDR1:
@@ -433,7 +432,7 @@ unprivileged:
 	return emulated;
 }
 
-int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
 {
 	int emulated = EMULATE_DONE;
 
@@ -446,46 +445,46 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 		struct kvmppc_bat *bat = kvmppc_find_bat(vcpu, sprn);
 
 		if (sprn % 2)
-			kvmppc_set_gpr(vcpu, rt, bat->raw >> 32);
+			*spr_val = bat->raw >> 32;
 		else
-			kvmppc_set_gpr(vcpu, rt, bat->raw);
+			*spr_val = bat->raw;
 
 		break;
 	}
 	case SPRN_SDR1:
 		if (!spr_allowed(vcpu, PRIV_HYPER))
 			goto unprivileged;
-		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1);
+		*spr_val = to_book3s(vcpu)->sdr1;
 		break;
 	case SPRN_DSISR:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dsisr);
+		*spr_val = vcpu->arch.shared->dsisr;
 		break;
 	case SPRN_DAR:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar);
+		*spr_val = vcpu->arch.shared->dar;
 		break;
 	case SPRN_HIOR:
-		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hior);
+		*spr_val = to_book3s(vcpu)->hior;
 		break;
 	case SPRN_HID0:
-		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[0]);
+		*spr_val = to_book3s(vcpu)->hid[0];
 		break;
 	case SPRN_HID1:
-		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[1]);
+		*spr_val = to_book3s(vcpu)->hid[1];
 		break;
 	case SPRN_HID2:
 	case SPRN_HID2_GEKKO:
-		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[2]);
+		*spr_val = to_book3s(vcpu)->hid[2];
 		break;
 	case SPRN_HID4:
 	case SPRN_HID4_GEKKO:
-		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[4]);
+		*spr_val = to_book3s(vcpu)->hid[4];
 		break;
 	case SPRN_HID5:
-		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[5]);
+		*spr_val = to_book3s(vcpu)->hid[5];
 		break;
 	case SPRN_CFAR:
 	case SPRN_PURR:
-		kvmppc_set_gpr(vcpu, rt, 0);
+		*spr_val = 0;
 		break;
 	case SPRN_GQR0:
 	case SPRN_GQR1:
@@ -495,8 +494,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 	case SPRN_GQR5:
 	case SPRN_GQR6:
 	case SPRN_GQR7:
-		kvmppc_set_gpr(vcpu, rt,
-			       to_book3s(vcpu)->gqr[sprn - SPRN_GQR0]);
+		*spr_val = to_book3s(vcpu)->gqr[sprn - SPRN_GQR0];
 		break;
 	case SPRN_THRM1:
 	case SPRN_THRM2:
@@ -511,7 +509,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 	case SPRN_PMC3_GEKKO:
 	case SPRN_PMC4_GEKKO:
 	case SPRN_WPAR_GEKKO:
-		kvmppc_set_gpr(vcpu, rt, 0);
+		*spr_val = 0;
 		break;
 	default:
 unprivileged:
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index bb5a0f4..db36598 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1505,12 +1505,12 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	return EMULATE_FAIL;
 }
 
-int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 {
 	return EMULATE_FAIL;
 }
 
-int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
 {
 	return EMULATE_FAIL;
 }
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index 62c4fe5..ba61974 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -75,8 +75,8 @@ void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits);
 
 int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                             unsigned int inst, int *advance);
-int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt);
-int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs);
+int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val);
+int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val);
 
 /* low-level asm code to transfer guest state */
 void kvmppc_load_guest_spe(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index e14f7b2..6c76397 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -102,22 +102,26 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
  * will return the wrong result if called for them in another context
  * (such as debugging).
  */
-int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 {
 	int emulated = EMULATE_DONE;
-	ulong spr_val = kvmppc_get_gpr(vcpu, rs);
 
 	switch (sprn) {
 	case SPRN_DEAR:
-		vcpu->arch.shared->dar = spr_val; break;
+		vcpu->arch.shared->dar = spr_val;
+		break;
 	case SPRN_ESR:
-		vcpu->arch.shared->esr = spr_val; break;
+		vcpu->arch.shared->esr = spr_val;
+		break;
 	case SPRN_DBCR0:
-		vcpu->arch.dbcr0 = spr_val; break;
+		vcpu->arch.dbcr0 = spr_val;
+		break;
 	case SPRN_DBCR1:
-		vcpu->arch.dbcr1 = spr_val; break;
+		vcpu->arch.dbcr1 = spr_val;
+		break;
 	case SPRN_DBSR:
-		vcpu->arch.dbsr &= ~spr_val; break;
+		vcpu->arch.dbsr &= ~spr_val;
+		break;
 	case SPRN_TSR:
 		kvmppc_clr_tsr_bits(vcpu, spr_val);
 		break;
@@ -131,13 +135,17 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 	 * guest (PR-mode only).
 	 */
 	case SPRN_SPRG4:
-		vcpu->arch.shared->sprg4 = spr_val; break;
+		vcpu->arch.shared->sprg4 = spr_val;
+		break;
 	case SPRN_SPRG5:
-		vcpu->arch.shared->sprg5 = spr_val; break;
+		vcpu->arch.shared->sprg5 = spr_val;
+		break;
 	case SPRN_SPRG6:
-		vcpu->arch.shared->sprg6 = spr_val; break;
+		vcpu->arch.shared->sprg6 = spr_val;
+		break;
 	case SPRN_SPRG7:
-		vcpu->arch.shared->sprg7 = spr_val; break;
+		vcpu->arch.shared->sprg7 = spr_val;
+		break;
 
 	case SPRN_IVPR:
 		vcpu->arch.ivpr = spr_val;
@@ -207,75 +215,83 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 	return emulated;
 }
 
-int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
 {
 	int emulated = EMULATE_DONE;
 
 	switch (sprn) {
 	case SPRN_IVPR:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivpr); break;
+		*spr_val = vcpu->arch.ivpr;
+		break;
 	case SPRN_DEAR:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar); break;
+		*spr_val = vcpu->arch.shared->dar;
+		break;
 	case SPRN_ESR:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->esr); break;
+		*spr_val = vcpu->arch.shared->esr;
+		break;
 	case SPRN_DBCR0:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr0); break;
+		*spr_val = vcpu->arch.dbcr0;
+		break;
 	case SPRN_DBCR1:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr1); break;
+		*spr_val = vcpu->arch.dbcr1;
+		break;
 	case SPRN_DBSR:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbsr); break;
+		*spr_val = vcpu->arch.dbsr;
+		break;
 	case SPRN_TSR:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.tsr); break;
+		*spr_val = vcpu->arch.tsr;
+		break;
 	case SPRN_TCR:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.tcr); break;
+		*spr_val = vcpu->arch.tcr;
+		break;
 
 	case SPRN_IVOR0:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL];
 		break;
 	case SPRN_IVOR1:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK];
 		break;
 	case SPRN_IVOR2:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE];
 		break;
 	case SPRN_IVOR3:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE];
 		break;
 	case SPRN_IVOR4:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL];
 		break;
 	case SPRN_IVOR5:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT];
 		break;
 	case SPRN_IVOR6:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM];
 		break;
 	case SPRN_IVOR7:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL];
 		break;
 	case SPRN_IVOR8:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL];
 		break;
 	case SPRN_IVOR9:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL];
 		break;
 	case SPRN_IVOR10:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER];
 		break;
 	case SPRN_IVOR11:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_FIT]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT];
 		break;
 	case SPRN_IVOR12:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG];
 		break;
 	case SPRN_IVOR13:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS];
 		break;
 	case SPRN_IVOR14:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS];
 		break;
 	case SPRN_IVOR15:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG];
 		break;
 
 	default:
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index 9b2dcda..8b99e07 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -140,11 +140,10 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	return emulated;
 }
 
-int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 	int emulated = EMULATE_DONE;
-	ulong spr_val = kvmppc_get_gpr(vcpu, rs);
 
 	switch (sprn) {
 #ifndef CONFIG_KVM_BOOKE_HV
@@ -154,25 +153,32 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 	case SPRN_PID1:
 		if (spr_val != 0)
 			return EMULATE_FAIL;
-		vcpu_e500->pid[1] = spr_val; break;
+		vcpu_e500->pid[1] = spr_val;
+		break;
 	case SPRN_PID2:
 		if (spr_val != 0)
 			return EMULATE_FAIL;
-		vcpu_e500->pid[2] = spr_val; break;
+		vcpu_e500->pid[2] = spr_val;
+		break;
 	case SPRN_MAS0:
-		vcpu->arch.shared->mas0 = spr_val; break;
+		vcpu->arch.shared->mas0 = spr_val;
+		break;
 	case SPRN_MAS1:
-		vcpu->arch.shared->mas1 = spr_val; break;
+		vcpu->arch.shared->mas1 = spr_val;
+		break;
 	case SPRN_MAS2:
-		vcpu->arch.shared->mas2 = spr_val; break;
+		vcpu->arch.shared->mas2 = spr_val;
+		break;
 	case SPRN_MAS3:
 		vcpu->arch.shared->mas7_3 &= ~(u64)0xffffffff;
 		vcpu->arch.shared->mas7_3 |= spr_val;
 		break;
 	case SPRN_MAS4:
-		vcpu->arch.shared->mas4 = spr_val; break;
+		vcpu->arch.shared->mas4 = spr_val;
+		break;
 	case SPRN_MAS6:
-		vcpu->arch.shared->mas6 = spr_val; break;
+		vcpu->arch.shared->mas6 = spr_val;
+		break;
 	case SPRN_MAS7:
 		vcpu->arch.shared->mas7_3 &= (u64)0xffffffff;
 		vcpu->arch.shared->mas7_3 |= (u64)spr_val << 32;
@@ -183,11 +189,14 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 		vcpu_e500->l1csr0 &= ~(L1CSR0_DCFI | L1CSR0_CLFC);
 		break;
 	case SPRN_L1CSR1:
-		vcpu_e500->l1csr1 = spr_val; break;
+		vcpu_e500->l1csr1 = spr_val;
+		break;
 	case SPRN_HID0:
-		vcpu_e500->hid0 = spr_val; break;
+		vcpu_e500->hid0 = spr_val;
+		break;
 	case SPRN_HID1:
-		vcpu_e500->hid1 = spr_val; break;
+		vcpu_e500->hid1 = spr_val;
+		break;
 
 	case SPRN_MMUCSR0:
 		emulated = kvmppc_e500_emul_mt_mmucsr0(vcpu_e500,
@@ -216,90 +225,103 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 		break;
 #endif
 	default:
-		emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs);
+		emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, spr_val);
 	}
 
 	return emulated;
 }
 
-int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 	int emulated = EMULATE_DONE;
 
 	switch (sprn) {
 #ifndef CONFIG_KVM_BOOKE_HV
-		unsigned long val;
-
 	case SPRN_PID:
-		kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[0]); break;
+		*spr_val = vcpu_e500->pid[0];
+		break;
 	case SPRN_PID1:
-		kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[1]); break;
+		*spr_val = vcpu_e500->pid[1];
+		break;
 	case SPRN_PID2:
-		kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[2]); break;
+		*spr_val = vcpu_e500->pid[2];
+		break;
 	case SPRN_MAS0:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas0); break;
+		*spr_val = vcpu->arch.shared->mas0;
+		break;
 	case SPRN_MAS1:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas1); break;
+		*spr_val = vcpu->arch.shared->mas1;
+		break;
 	case SPRN_MAS2:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas2); break;
+		*spr_val = vcpu->arch.shared->mas2;
+		break;
 	case SPRN_MAS3:
-		val = (u32)vcpu->arch.shared->mas7_3;
-		kvmppc_set_gpr(vcpu, rt, val);
+		*spr_val = (u32)vcpu->arch.shared->mas7_3;
 		break;
 	case SPRN_MAS4:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas4); break;
+		*spr_val = vcpu->arch.shared->mas4;
+		break;
 	case SPRN_MAS6:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas6); break;
+		*spr_val = vcpu->arch.shared->mas6;
+		break;
 	case SPRN_MAS7:
-		val = vcpu->arch.shared->mas7_3 >> 32;
-		kvmppc_set_gpr(vcpu, rt, val);
+		*spr_val = vcpu->arch.shared->mas7_3 >> 32;
 		break;
 #endif
 	case SPRN_TLB0CFG:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.tlbcfg[0]); break;
+		*spr_val = vcpu->arch.tlbcfg[0];
+		break;
 	case SPRN_TLB1CFG:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.tlbcfg[1]); break;
+		*spr_val = vcpu->arch.tlbcfg[1];
+		break;
 	case SPRN_L1CSR0:
-		kvmppc_set_gpr(vcpu, rt, vcpu_e500->l1csr0); break;
+		*spr_val = vcpu_e500->l1csr0;
+		break;
 	case SPRN_L1CSR1:
-		kvmppc_set_gpr(vcpu, rt, vcpu_e500->l1csr1); break;
+		*spr_val = vcpu_e500->l1csr1;
+		break;
 	case SPRN_HID0:
-		kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid0); break;
+		*spr_val = vcpu_e500->hid0;
+		break;
 	case SPRN_HID1:
-		kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid1); break;
+		*spr_val = vcpu_e500->hid1;
+		break;
 	case SPRN_SVR:
-		kvmppc_set_gpr(vcpu, rt, vcpu_e500->svr); break;
+		*spr_val = vcpu_e500->svr;
+		break;
 
 	case SPRN_MMUCSR0:
-		kvmppc_set_gpr(vcpu, rt, 0); break;
+		*spr_val = 0;
+		break;
 
 	case SPRN_MMUCFG:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.mmucfg); break;
+		*spr_val = vcpu->arch.mmucfg;
+		break;
 
 	/* extra exceptions */
 	case SPRN_IVOR32:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL];
 		break;
 	case SPRN_IVOR33:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA];
 		break;
 	case SPRN_IVOR34:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND];
 		break;
 	case SPRN_IVOR35:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR];
 		break;
 #ifdef CONFIG_KVM_BOOKE_HV
 	case SPRN_IVOR36:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL];
 		break;
 	case SPRN_IVOR37:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT]);
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT];
 		break;
 #endif
 	default:
-		emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt);
+		emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, spr_val);
 	}
 
 	return emulated;
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index f63b5cb..f90e86d 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -154,6 +154,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	int sprn = get_sprn(inst);
 	enum emulation_result emulated = EMULATE_DONE;
 	int advance = 1;
+	ulong spr_val = 0;
 
 	/* this default type might be overwritten by subcategories */
 	kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS);
@@ -235,55 +236,59 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		case OP_31_XOP_MFSPR:
 			switch (sprn) {
 			case SPRN_SRR0:
-				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->srr0);
+				spr_val = vcpu->arch.shared->srr0;
 				break;
 			case SPRN_SRR1:
-				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->srr1);
+				spr_val = vcpu->arch.shared->srr1;
 				break;
 			case SPRN_PVR:
-				kvmppc_set_gpr(vcpu, rt, vcpu->arch.pvr); break;
+				spr_val = vcpu->arch.pvr;
+				break;
 			case SPRN_PIR:
-				kvmppc_set_gpr(vcpu, rt, vcpu->vcpu_id); break;
+				spr_val = vcpu->vcpu_id;
+				break;
 			case SPRN_MSSSR0:
-				kvmppc_set_gpr(vcpu, rt, 0); break;
+				spr_val = 0;
+				break;
 
 			/* Note: mftb and TBRL/TBWL are user-accessible, so
 			 * the guest can always access the real TB anyways.
 			 * In fact, we probably will never see these traps. */
 			case SPRN_TBWL:
-				kvmppc_set_gpr(vcpu, rt, get_tb() >> 32); break;
+				spr_val = get_tb() >> 32;
+				break;
 			case SPRN_TBWU:
-				kvmppc_set_gpr(vcpu, rt, get_tb()); break;
+				spr_val = get_tb();
+				break;
 
 			case SPRN_SPRG0:
-				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg0);
+				spr_val = vcpu->arch.shared->sprg0;
 				break;
 			case SPRN_SPRG1:
-				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg1);
+				spr_val = vcpu->arch.shared->sprg1;
 				break;
 			case SPRN_SPRG2:
-				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg2);
+				spr_val = vcpu->arch.shared->sprg2;
 				break;
 			case SPRN_SPRG3:
-				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg3);
+				spr_val = vcpu->arch.shared->sprg3;
 				break;
 			/* Note: SPRG4-7 are user-readable, so we don't get
 			 * a trap. */
 
 			case SPRN_DEC:
-			{
-				kvmppc_set_gpr(vcpu, rt,
-					       kvmppc_get_dec(vcpu, get_tb()));
+				spr_val = kvmppc_get_dec(vcpu, get_tb());
 				break;
-			}
 			default:
-				emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, rt);
-				if (emulated == EMULATE_FAIL) {
-					printk("mfspr: unknown spr %x\n", sprn);
-					kvmppc_set_gpr(vcpu, rt, 0);
+				emulated = kvmppc_core_emulate_mfspr(vcpu, sprn,
+								     &spr_val);
+				if (unlikely(emulated == EMULATE_FAIL)) {
+					printk(KERN_INFO "mfspr: unknown spr "
+						"0x%x\n", sprn);
 				}
 				break;
 			}
+			kvmppc_set_gpr(vcpu, rt, spr_val);
 			kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS);
 			break;
 
@@ -301,12 +306,13 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			break;
 
 		case OP_31_XOP_MTSPR:
+			spr_val = kvmppc_get_gpr(vcpu, rs);
 			switch (sprn) {
 			case SPRN_SRR0:
-				vcpu->arch.shared->srr0 = kvmppc_get_gpr(vcpu, rs);
+				vcpu->arch.shared->srr0 = spr_val;
 				break;
 			case SPRN_SRR1:
-				vcpu->arch.shared->srr1 = kvmppc_get_gpr(vcpu, rs);
+				vcpu->arch.shared->srr1 = spr_val;
 				break;
 
 			/* XXX We need to context-switch the timebase for
@@ -317,27 +323,29 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			case SPRN_MSSSR0: break;
 
 			case SPRN_DEC:
-				vcpu->arch.dec = kvmppc_get_gpr(vcpu, rs);
+				vcpu->arch.dec = spr_val;
 				kvmppc_emulate_dec(vcpu);
 				break;
 
 			case SPRN_SPRG0:
-				vcpu->arch.shared->sprg0 = kvmppc_get_gpr(vcpu, rs);
+				vcpu->arch.shared->sprg0 = spr_val;
 				break;
 			case SPRN_SPRG1:
-				vcpu->arch.shared->sprg1 = kvmppc_get_gpr(vcpu, rs);
+				vcpu->arch.shared->sprg1 = spr_val;
 				break;
 			case SPRN_SPRG2:
-				vcpu->arch.shared->sprg2 = kvmppc_get_gpr(vcpu, rs);
+				vcpu->arch.shared->sprg2 = spr_val;
 				break;
 			case SPRN_SPRG3:
-				vcpu->arch.shared->sprg3 = kvmppc_get_gpr(vcpu, rs);
+				vcpu->arch.shared->sprg3 = spr_val;
 				break;
 
 			default:
-				emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, rs);
+				emulated = kvmppc_core_emulate_mtspr(vcpu, sprn,
+								     spr_val);
 				if (emulated == EMULATE_FAIL)
-					printk("mtspr: unknown spr %x\n", sprn);
+					printk(KERN_INFO "mtspr: unknown spr "
+						"0x%x\n", sprn);
 				break;
 			}
 			kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS);
-- 
cgit v0.10.2


From 5f3fbc342f408199e5cbb4b3dc220569147a99a7 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Mon, 14 May 2012 14:58:58 +0800
Subject: KVM: VMX: unlike vmcs on fail path

fix:

[ 1529.577273] Call Trace:
[ 1529.577289]  [<ffffffffa060d58f>] kvm_arch_hardware_disable+0x13/0x30 [kvm]
[ 1529.577302]  [<ffffffffa05fa2d4>] hardware_disable_nolock+0x35/0x39 [kvm]
[ 1529.577311]  [<ffffffffa05fa29f>] ? cpumask_clear_cpu.constprop.31+0x13/0x13 [kvm]
[ 1529.577315]  [<ffffffff81096ba8>] on_each_cpu+0x44/0x84
[ 1529.577326]  [<ffffffffa05f98b5>] hardware_disable_all_nolock+0x34/0x36 [kvm]
[ 1529.577335]  [<ffffffffa05f98e2>] hardware_disable_all+0x2b/0x39 [kvm]
[ 1529.577349]  [<ffffffffa05fafe5>] kvm_put_kvm+0xed/0x10f [kvm]
[ 1529.577358]  [<ffffffffa05fb3d7>] kvm_vm_release+0x22/0x28 [kvm]

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 61ebdb6..3062ea9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6350,7 +6350,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 	return &vmx->vcpu;
 
 free_vmcs:
-	free_vmcs(vmx->loaded_vmcs->vmcs);
+	free_loaded_vmcs(vmx->loaded_vmcs);
 free_msrs:
 	kfree(vmx->guest_msrs);
 uninit_vcpu:
-- 
cgit v0.10.2


From d54e4237bcbb400fda11c902fd538aa0b4805720 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 7 May 2012 12:12:25 +0200
Subject: KVM: x86 emulator: convert bsf/bsr instructions to
 emulate_2op_SrcV_nobyte()

The instruction emulation for bsrw is broken in KVM because
the code always uses bsr with 32 or 64 bit operand size for
emulation. Fix that by using emulate_2op_SrcV_nobyte() macro
to use guest operand size for emulation.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7fd2576..f95d242 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3133,35 +3133,13 @@ static int em_btc(struct x86_emulate_ctxt *ctxt)
 
 static int em_bsf(struct x86_emulate_ctxt *ctxt)
 {
-	u8 zf;
-
-	__asm__ ("bsf %2, %0; setz %1"
-		 : "=r"(ctxt->dst.val), "=q"(zf)
-		 : "r"(ctxt->src.val));
-
-	ctxt->eflags &= ~X86_EFLAGS_ZF;
-	if (zf) {
-		ctxt->eflags |= X86_EFLAGS_ZF;
-		/* Disable writeback. */
-		ctxt->dst.type = OP_NONE;
-	}
+	emulate_2op_SrcV_nobyte(ctxt, "bsf");
 	return X86EMUL_CONTINUE;
 }
 
 static int em_bsr(struct x86_emulate_ctxt *ctxt)
 {
-	u8 zf;
-
-	__asm__ ("bsr %2, %0; setz %1"
-		 : "=r"(ctxt->dst.val), "=q"(zf)
-		 : "r"(ctxt->src.val));
-
-	ctxt->eflags &= ~X86_EFLAGS_ZF;
-	if (zf) {
-		ctxt->eflags |= X86_EFLAGS_ZF;
-		/* Disable writeback. */
-		ctxt->dst.type = OP_NONE;
-	}
+	emulate_2op_SrcV_nobyte(ctxt, "bsr");
 	return X86EMUL_CONTINUE;
 }
 
-- 
cgit v0.10.2


From 512d5649e8dc3ed36f2ebf0818da64a4d4c2544a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 13 May 2012 19:53:23 +0300
Subject: KVM: VMX: Fix %ds/%es clobber

The vmx exit code unconditionally restores %ds and %es to __USER_DS.  This
can override the user's values, since %ds and %es are not saved and restored
in x86_64 syscalls.  In practice, this isn't dangerous since nobody uses
segment registers in long mode, least of all programs that use KVM.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3062ea9..f2ee016 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6102,7 +6102,10 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u16 _ds, _es;
 
+	savesegment(ds, _ds);
+	savesegment(es, _es);
 	if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
 		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 		if (vmcs12->idt_vectoring_info_field &
@@ -6263,7 +6266,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		}
 	}
 
-	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
+	loadsegment(ds, _ds);
+	loadsegment(es, _es);
 	vmx->loaded_vmcs->launched = 1;
 
 	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
-- 
cgit v0.10.2


From b2da15ac26a0c00fc0d399a2bc5cf3c4e15f0b4f Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 13 May 2012 19:53:24 +0300
Subject: KVM: VMX: Optimize %ds, %es reload

On x86_64, we can defer %ds and %es reload to the heavyweight context switch,
since nothing in the lightweight paths uses the host %ds or %es (they are
ignored by the processor).  Furthermore we can avoid the load if the segments
are null, by letting the hardware load the null segments for us.  This is the
expected case.

On i386, we could avoid the reload entirely, since the entry.S paths take care
of reload, except for the SYSEXIT path which leaves %ds and %es set to __USER_DS.
So we set them to the same values as well.

Saves about 70 cycles out of 1600 (around 4%; noisy measurements).

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f2ee016..32eb588 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -393,6 +393,9 @@ struct vcpu_vmx {
 	struct {
 		int           loaded;
 		u16           fs_sel, gs_sel, ldt_sel;
+#ifdef CONFIG_X86_64
+		u16           ds_sel, es_sel;
+#endif
 		int           gs_ldt_reload_needed;
 		int           fs_reload_needed;
 	} host_state;
@@ -1418,6 +1421,11 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 	}
 
 #ifdef CONFIG_X86_64
+	savesegment(ds, vmx->host_state.ds_sel);
+	savesegment(es, vmx->host_state.es_sel);
+#endif
+
+#ifdef CONFIG_X86_64
 	vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
 	vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
 #else
@@ -1457,6 +1465,19 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 	}
 	if (vmx->host_state.fs_reload_needed)
 		loadsegment(fs, vmx->host_state.fs_sel);
+#ifdef CONFIG_X86_64
+	if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
+		loadsegment(ds, vmx->host_state.ds_sel);
+		loadsegment(es, vmx->host_state.es_sel);
+	}
+#else
+	/*
+	 * The sysexit path does not restore ds/es, so we must set them to
+	 * a reasonable value ourselves.
+	 */
+	loadsegment(ds, __USER_DS);
+	loadsegment(es, __USER_DS);
+#endif
 	reload_tss();
 #ifdef CONFIG_X86_64
 	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
@@ -3640,8 +3661,18 @@ static void vmx_set_constant_host_state(void)
 	vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
 
 	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
+#ifdef CONFIG_X86_64
+	/*
+	 * Load null selectors, so we can avoid reloading them in
+	 * __vmx_load_host_state(), in case userspace uses the null selectors
+	 * too (the expected case).
+	 */
+	vmcs_write16(HOST_DS_SELECTOR, 0);
+	vmcs_write16(HOST_ES_SELECTOR, 0);
+#else
 	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
 	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+#endif
 	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
 	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
 
@@ -6102,10 +6133,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u16 _ds, _es;
 
-	savesegment(ds, _ds);
-	savesegment(es, _es);
 	if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
 		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 		if (vmcs12->idt_vectoring_info_field &
@@ -6266,8 +6294,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		}
 	}
 
-	loadsegment(ds, _ds);
-	loadsegment(es, _es);
 	vmx->loaded_vmcs->launched = 1;
 
 	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
-- 
cgit v0.10.2


From c142786c6291189b5c85f53d91743e1eefbd8fe0 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 14 May 2012 15:44:06 +0300
Subject: KVM: MMU: Don't use RCU for lockless shadow walking

Using RCU for lockless shadow walking can increase the amount of memory
in use by the system, since RCU grace periods are unpredictable.  We also
have an unconditional write to a shared variable (reader_counter), which
isn't good for scaling.

Replace that with a scheme similar to x86's get_user_pages_fast(): disable
interrupts during lockless shadow walk to force the freer
(kvm_mmu_commit_zap_page()) to wait for the TLB flush IPI to find the
processor with interrupts enabled.

We also add a new vcpu->mode, READING_SHADOW_PAGE_TABLES, to prevent
kvm_flush_remote_tlbs() from avoiding the IPI.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 69e39bc..64c8989 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -240,8 +240,6 @@ struct kvm_mmu_page {
 #endif
 
 	int write_flooding_count;
-
-	struct rcu_head rcu;
 };
 
 struct kvm_pio_request {
@@ -540,8 +538,6 @@ struct kvm_arch {
 	u64 hv_guest_os_id;
 	u64 hv_hypercall;
 
-	atomic_t reader_counter;
-
 	#ifdef CONFIG_KVM_MMU_AUDIT
 	int audit_point;
 	#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 07424cf..72102e0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -551,19 +551,29 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
 
 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
 {
-	rcu_read_lock();
-	atomic_inc(&vcpu->kvm->arch.reader_counter);
-
-	/* Increase the counter before walking shadow page table */
-	smp_mb__after_atomic_inc();
+	/*
+	 * Prevent page table teardown by making any free-er wait during
+	 * kvm_flush_remote_tlbs() IPI to all active vcpus.
+	 */
+	local_irq_disable();
+	vcpu->mode = READING_SHADOW_PAGE_TABLES;
+	/*
+	 * Make sure a following spte read is not reordered ahead of the write
+	 * to vcpu->mode.
+	 */
+	smp_mb();
 }
 
 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
 {
-	/* Decrease the counter after walking shadow page table finished */
-	smp_mb__before_atomic_dec();
-	atomic_dec(&vcpu->kvm->arch.reader_counter);
-	rcu_read_unlock();
+	/*
+	 * Make sure the write to vcpu->mode is not reordered in front of
+	 * reads to sptes.  If it does, kvm_commit_zap_page() can see us
+	 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
+	 */
+	smp_mb();
+	vcpu->mode = OUTSIDE_GUEST_MODE;
+	local_irq_enable();
 }
 
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
@@ -1989,30 +1999,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 	return ret;
 }
 
-static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
-{
-	struct kvm_mmu_page *sp;
-
-	list_for_each_entry(sp, invalid_list, link)
-		kvm_mmu_isolate_page(sp);
-}
-
-static void free_pages_rcu(struct rcu_head *head)
-{
-	struct kvm_mmu_page *next, *sp;
-
-	sp = container_of(head, struct kvm_mmu_page, rcu);
-	while (sp) {
-		if (!list_empty(&sp->link))
-			next = list_first_entry(&sp->link,
-				      struct kvm_mmu_page, link);
-		else
-			next = NULL;
-		kvm_mmu_free_page(sp);
-		sp = next;
-	}
-}
-
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 				    struct list_head *invalid_list)
 {
@@ -2021,17 +2007,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 	if (list_empty(invalid_list))
 		return;
 
-	kvm_flush_remote_tlbs(kvm);
-
-	if (atomic_read(&kvm->arch.reader_counter)) {
-		kvm_mmu_isolate_pages(invalid_list);
-		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
-		list_del_init(invalid_list);
+	/*
+	 * wmb: make sure everyone sees our modifications to the page tables
+	 * rmb: make sure we see changes to vcpu->mode
+	 */
+	smp_mb();
 
-		trace_kvm_mmu_delay_free_pages(sp);
-		call_rcu(&sp->rcu, free_pages_rcu);
-		return;
-	}
+	/*
+	 * Wait for all vcpus to exit guest mode and/or lockless shadow
+	 * page table walks.
+	 */
+	kvm_flush_remote_tlbs(kvm);
 
 	do {
 		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
@@ -2039,7 +2025,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 		kvm_mmu_isolate_page(sp);
 		kvm_mmu_free_page(sp);
 	} while (!list_empty(invalid_list));
-
 }
 
 /*
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index cae342d..c446435 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -128,7 +128,8 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 enum {
 	OUTSIDE_GUEST_MODE,
 	IN_GUEST_MODE,
-	EXITING_GUEST_MODE
+	EXITING_GUEST_MODE,
+	READING_SHADOW_PAGE_TABLES,
 };
 
 /*
-- 
cgit v0.10.2


From d8368af8b46b904def42a0f341d2f4f29001fa77 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 14 May 2012 18:07:56 +0300
Subject: KVM: Fix mmu_reload() clash with nested vmx event injection

Currently the inject_pending_event() call during guest entry happens after
kvm_mmu_reload().  This is for historical reasons - we used to
inject_pending_event() in atomic context, while kvm_mmu_reload() needs task
context.

A problem is that nested vmx can cause the mmu context to be reset, if event
injection is intercepted and causes a #VMEXIT instead (the #VMEXIT resets
CR0/CR3/CR4).  If this happens, we end up with invalid root_hpa, and since
kvm_mmu_reload() has already run, no one will fix it and we end up entering
the guest this way.

Fix by reordering event injection to be before kvm_mmu_reload().  Use
->cancel_injection() to undo if kvm_mmu_reload() fails.

https://bugzilla.kernel.org/show_bug.cgi?id=42980

Reported-by: Luke-Jr <luke-jr+linuxbugs@utopios.org>
Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4de705c..b78f89d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5279,10 +5279,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_deliver_pmi(vcpu);
 	}
 
-	r = kvm_mmu_reload(vcpu);
-	if (unlikely(r))
-		goto out;
-
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
 		inject_pending_event(vcpu);
 
@@ -5298,6 +5294,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		}
 	}
 
+	r = kvm_mmu_reload(vcpu);
+	if (unlikely(r)) {
+		kvm_x86_ops->cancel_injection(vcpu);
+		goto out;
+	}
+
 	preempt_disable();
 
 	kvm_x86_ops->prepare_guest_switch(vcpu);
-- 
cgit v0.10.2


From 1526bf9ccf310f1d35c1275b8b477a249d25aaf2 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Tue, 15 May 2012 14:15:25 +0200
Subject: KVM: s390: add capability indicating COW support

Currently qemu/kvm on s390 uses a guest mapping that does not
allow the guest backing page table to be write-protected to
support older systems. On those older systems a host write
protection fault will be delivered to the guest.

Newer systems allow to write-protect the guest backing memory
and let the fault be delivered to the host, thus allowing COW.

Use a capability bit to tell qemu if that is possible.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index fed7bee..bf238c5 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -48,6 +48,7 @@ int sclp_cpu_deconfigure(u8 cpu);
 void sclp_facilities_detect(void);
 unsigned long long sclp_get_rnmax(void);
 unsigned long long sclp_get_rzm(void);
+u8 sclp_get_fac85(void);
 int sclp_sdias_blk_count(void);
 int sclp_sdias_copy(void *dest, int blk_num, int nr_blks);
 int sclp_chp_configure(struct chp_id chpid);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index e5e3800..5c761bf 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -28,6 +28,7 @@
 #include <asm/pgtable.h>
 #include <asm/nmi.h>
 #include <asm/switch_to.h>
+#include <asm/sclp.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
 
@@ -140,6 +141,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_MAX_VCPUS:
 		r = KVM_MAX_VCPUS;
 		break;
+	case KVM_CAP_S390_COW:
+		r = sclp_get_fac85() & 0x2;
+		break;
 	default:
 		r = 0;
 	}
diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c
index 231a1d8..032171e 100644
--- a/drivers/s390/char/sclp_cmd.c
+++ b/drivers/s390/char/sclp_cmd.c
@@ -17,6 +17,7 @@
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/memory.h>
+#include <linux/module.h>
 #include <linux/platform_device.h>
 #include <asm/chpid.h>
 #include <asm/sclp.h>
@@ -38,7 +39,8 @@ struct read_info_sccb {
 	u64	facilities;		/* 48-55 */
 	u8	_reserved2[84 - 56];	/* 56-83 */
 	u8	fac84;			/* 84 */
-	u8	_reserved3[91 - 85];	/* 85-90 */
+	u8	fac85;			/* 85 */
+	u8	_reserved3[91 - 86];	/* 86-90 */
 	u8	flags;			/* 91 */
 	u8	_reserved4[100 - 92];	/* 92-99 */
 	u32	rnsize2;		/* 100-103 */
@@ -51,6 +53,7 @@ static int __initdata early_read_info_sccb_valid;
 
 u64 sclp_facilities;
 static u8 sclp_fac84;
+static u8 sclp_fac85;
 static unsigned long long rzm;
 static unsigned long long rnmax;
 
@@ -112,6 +115,7 @@ void __init sclp_facilities_detect(void)
 	sccb = &early_read_info_sccb;
 	sclp_facilities = sccb->facilities;
 	sclp_fac84 = sccb->fac84;
+	sclp_fac85 = sccb->fac85;
 	rnmax = sccb->rnmax ? sccb->rnmax : sccb->rnmax2;
 	rzm = sccb->rnsize ? sccb->rnsize : sccb->rnsize2;
 	rzm <<= 20;
@@ -127,6 +131,12 @@ unsigned long long sclp_get_rzm(void)
 	return rzm;
 }
 
+u8 sclp_get_fac85(void)
+{
+	return sclp_fac85;
+}
+EXPORT_SYMBOL_GPL(sclp_get_fac85);
+
 /*
  * This function will be called after sclp_facilities_detect(), which gets
  * called from early.c code. Therefore the sccb should have valid contents.
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 8d696cf..09f2b3a 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -616,6 +616,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_KVMCLOCK_CTRL 76
 #define KVM_CAP_SIGNAL_MSI 77
 #define KVM_CAP_PPC_GET_SMMU_INFO 78
+#define KVM_CAP_S390_COW 79
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v0.10.2


From 14eebd917d154f3181d2cb219e40b18eec314db7 Mon Sep 17 00:00:00 2001
From: Carsten Otte <cotte@de.ibm.com>
Date: Tue, 15 May 2012 14:15:26 +0200
Subject: KVM: s390: KVM_GET/SET_ONEREG for s390

This patch enables KVM_CAP_ONE_REG for s390 and implements stubs
for KVM_GET/SET_ONE_REG. This is based on the ppc implementation.

Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 5c761bf..894b3e4 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -135,6 +135,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_S390_UCONTROL:
 #endif
 	case KVM_CAP_SYNC_REGS:
+	case KVM_CAP_ONE_REG:
 		r = 1;
 		break;
 	case KVM_CAP_NR_VCPUS:
@@ -439,6 +440,31 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static int kvm_arch_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu,
+					   struct kvm_one_reg *reg)
+{
+	int r = -EINVAL;
+
+	switch (reg->id) {
+	default:
+		break;
+	}
+
+	return r;
+}
+
+static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu,
+					   struct kvm_one_reg *reg)
+{
+	int r = -EINVAL;
+
+	switch (reg->id) {
+	default:
+		break;
+	}
+
+	return r;
+}
 
 static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
 {
@@ -770,6 +796,18 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	case KVM_S390_INITIAL_RESET:
 		r = kvm_arch_vcpu_ioctl_initial_reset(vcpu);
 		break;
+	case KVM_SET_ONE_REG:
+	case KVM_GET_ONE_REG: {
+		struct kvm_one_reg reg;
+		r = -EFAULT;
+		if (copy_from_user(&reg, argp, sizeof(reg)))
+			break;
+		if (ioctl == KVM_SET_ONE_REG)
+			r = kvm_arch_vcpu_ioctl_set_one_reg(vcpu, &reg);
+		else
+			r = kvm_arch_vcpu_ioctl_get_one_reg(vcpu, &reg);
+		break;
+	}
 #ifdef CONFIG_KVM_S390_UCONTROL
 	case KVM_S390_UCAS_MAP: {
 		struct kvm_s390_ucas_mapping ucasmap;
-- 
cgit v0.10.2


From 29b7c71b5ecf2caaa4c2105ecc0094826db8a8a8 Mon Sep 17 00:00:00 2001
From: Carsten Otte <cotte@de.ibm.com>
Date: Tue, 15 May 2012 14:15:27 +0200
Subject: KVM: s390: epoch difference and TOD programmable field

This patch makes vcpu epoch difference and the TOD programmable
field accessible from userspace. This is needed in order to
implement a couple of instructions that deal with the time of
day clock on s390, such as SET CLOCK and for migration.

Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/s390/include/asm/kvm.h b/arch/s390/include/asm/kvm.h
index 9607667..53a5372 100644
--- a/arch/s390/include/asm/kvm.h
+++ b/arch/s390/include/asm/kvm.h
@@ -52,4 +52,7 @@ struct kvm_sync_regs {
 	__u32 acrs[16];	/* access registers */
 	__u64 crs[16];	/* control registers */
 };
+
+#define KVM_REG_S390_TODPR	(KVM_REG_S390 | KVM_REG_SIZE_U32 | 0x1)
+#define KVM_REG_S390_EPOCHDIFF	(KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x2)
 #endif
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 894b3e4..cc4c013 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -446,6 +446,14 @@ static int kvm_arch_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu,
 	int r = -EINVAL;
 
 	switch (reg->id) {
+	case KVM_REG_S390_TODPR:
+		r = put_user(vcpu->arch.sie_block->todpr,
+			     (u32 __user *)reg->addr);
+		break;
+	case KVM_REG_S390_EPOCHDIFF:
+		r = put_user(vcpu->arch.sie_block->epoch,
+			     (u64 __user *)reg->addr);
+		break;
 	default:
 		break;
 	}
@@ -459,6 +467,14 @@ static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu,
 	int r = -EINVAL;
 
 	switch (reg->id) {
+	case KVM_REG_S390_TODPR:
+		r = get_user(vcpu->arch.sie_block->todpr,
+			     (u32 __user *)reg->addr);
+		break;
+	case KVM_REG_S390_EPOCHDIFF:
+		r = get_user(vcpu->arch.sie_block->epoch,
+			     (u64 __user *)reg->addr);
+		break;
 	default:
 		break;
 	}
-- 
cgit v0.10.2


From 46a6dd1c87c4ff17202574127daf70cc0bb38d6d Mon Sep 17 00:00:00 2001
From: "Jason J. herne" <jjherne@us.ibm.com>
Date: Tue, 15 May 2012 14:15:28 +0200
Subject: KVM: s390: onereg for timer related registers

Enhance the KVM ONE_REG capability within S390 to allow
getting/setting the following special cpu registers: clock comparator
and the cpu timer. These are needed for migration.

Signed-off-by: Jason J. herne <jjherne@us.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/s390/include/asm/kvm.h b/arch/s390/include/asm/kvm.h
index 53a5372..bdcbe0f 100644
--- a/arch/s390/include/asm/kvm.h
+++ b/arch/s390/include/asm/kvm.h
@@ -55,4 +55,6 @@ struct kvm_sync_regs {
 
 #define KVM_REG_S390_TODPR	(KVM_REG_S390 | KVM_REG_SIZE_U32 | 0x1)
 #define KVM_REG_S390_EPOCHDIFF	(KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x2)
+#define KVM_REG_S390_CPU_TIMER  (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x3)
+#define KVM_REG_S390_CLOCK_COMP (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x4)
 #endif
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index cc4c013..664766d 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -454,6 +454,14 @@ static int kvm_arch_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu,
 		r = put_user(vcpu->arch.sie_block->epoch,
 			     (u64 __user *)reg->addr);
 		break;
+	case KVM_REG_S390_CPU_TIMER:
+		r = put_user(vcpu->arch.sie_block->cputm,
+			     (u64 __user *)reg->addr);
+		break;
+	case KVM_REG_S390_CLOCK_COMP:
+		r = put_user(vcpu->arch.sie_block->ckc,
+			     (u64 __user *)reg->addr);
+		break;
 	default:
 		break;
 	}
@@ -475,6 +483,14 @@ static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu,
 		r = get_user(vcpu->arch.sie_block->epoch,
 			     (u64 __user *)reg->addr);
 		break;
+	case KVM_REG_S390_CPU_TIMER:
+		r = get_user(vcpu->arch.sie_block->cputm,
+			     (u64 __user *)reg->addr);
+		break;
+	case KVM_REG_S390_CLOCK_COMP:
+		r = get_user(vcpu->arch.sie_block->ckc,
+			     (u64 __user *)reg->addr);
+		break;
 	default:
 		break;
 	}
-- 
cgit v0.10.2


From 322728e55aa7834e2fab2786b76df183c4843a12 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Fri, 18 May 2012 13:59:39 -0400
Subject: KVM: make asm-generic/kvm_para.h have an ifdef __KERNEL__ block

There are two functions in this asm-generic file.  Looking at
other arch which do not use the generic version, these two fcns
are within an #ifdef __KERNEL__ block, so make the generic one
consistent with those.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/include/asm-generic/kvm_para.h b/include/asm-generic/kvm_para.h
index 9a7bbad..5cba37f 100644
--- a/include/asm-generic/kvm_para.h
+++ b/include/asm-generic/kvm_para.h
@@ -1,6 +1,7 @@
 #ifndef _ASM_GENERIC_KVM_PARA_H
 #define _ASM_GENERIC_KVM_PARA_H
 
+#ifdef __KERNEL__
 
 /*
  * This function is used by architectures that support kvm to avoid issuing
@@ -16,4 +17,6 @@ static inline unsigned int kvm_arch_para_features(void)
 	return 0;
 }
 
+#endif	/* _KERNEL__ */
+
 #endif
-- 
cgit v0.10.2