From ca60dfbb69afb549e33527cbf676e4daf8febfb5 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng.yang@intel.com>
Date: Tue, 24 Jun 2008 17:02:38 +0800
Subject: KVM: VMX: Rename misnamed msr bits

MSR_IA32_FEATURE_LOCKED is just a bit in fact, which shouldn't be prefixed with
MSR_.  So is MSR_IA32_FEATURE_VMXON_ENABLED.

Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7041cc5..81dac72 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1031,9 +1031,9 @@ static __init int vmx_disabled_by_bios(void)
 	u64 msr;
 
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
-	return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED |
-		       MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
-	    == MSR_IA32_FEATURE_CONTROL_LOCKED;
+	return (msr & (IA32_FEATURE_CONTROL_LOCKED_BIT |
+		       IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT))
+	    == IA32_FEATURE_CONTROL_LOCKED_BIT;
 	/* locked but not enabled */
 }
 
@@ -1045,14 +1045,14 @@ static void hardware_enable(void *garbage)
 
 	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
-	if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
-		    MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
-	    != (MSR_IA32_FEATURE_CONTROL_LOCKED |
-		MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
+	if ((old & (IA32_FEATURE_CONTROL_LOCKED_BIT |
+		    IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT))
+	    != (IA32_FEATURE_CONTROL_LOCKED_BIT |
+		IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT))
 		/* enable and lock */
 		wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
-		       MSR_IA32_FEATURE_CONTROL_LOCKED |
-		       MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
+		       IA32_FEATURE_CONTROL_LOCKED_BIT |
+		       IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT);
 	write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
 	asm volatile (ASM_VMX_VMXON_RAX
 		      : : "a"(&phys_addr), "m"(phys_addr)
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 17e2599..86059f4 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -331,8 +331,8 @@ enum vmcs_field {
 
 #define AR_RESERVD_MASK 0xfffe0f00
 
-#define MSR_IA32_FEATURE_CONTROL_LOCKED         0x1
-#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED  0x4
+#define IA32_FEATURE_CONTROL_LOCKED_BIT		0x1
+#define IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT	0x4
 
 #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT	9
 #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT	10
-- 
cgit v0.10.2


From 5fdbf9765b7ba6a45100851154768de703d51e76 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Fri, 27 Jun 2008 14:58:02 -0300
Subject: KVM: x86: accessors for guest registers

As suggested by Avi, introduce accessors to read/write guest registers.
This simplifies the ->cache_regs/->decache_regs interface, and improves
register caching which is important for VMX, where the cost of
vmcs_read/vmcs_write is significant.

[avi: fix warnings]

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
new file mode 100644
index 0000000..1ff819d
--- /dev/null
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -0,0 +1,32 @@
+#ifndef ASM_KVM_CACHE_REGS_H
+#define ASM_KVM_CACHE_REGS_H
+
+static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
+					      enum kvm_reg reg)
+{
+	if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
+		kvm_x86_ops->cache_reg(vcpu, reg);
+
+	return vcpu->arch.regs[reg];
+}
+
+static inline void kvm_register_write(struct kvm_vcpu *vcpu,
+				      enum kvm_reg reg,
+				      unsigned long val)
+{
+	vcpu->arch.regs[reg] = val;
+	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
+{
+	return kvm_register_read(vcpu, VCPU_REGS_RIP);
+}
+
+static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
+{
+	kvm_register_write(vcpu, VCPU_REGS_RIP, val);
+}
+
+#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 73f43de..9fde0ac 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -32,6 +32,7 @@
 #include <asm/current.h>
 #include <asm/apicdef.h>
 #include <asm/atomic.h>
+#include "kvm_cache_regs.h"
 #include "irq.h"
 
 #define PRId64 "d"
@@ -558,8 +559,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
 	struct kvm_run *run = vcpu->run;
 
 	set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
-	kvm_x86_ops->cache_regs(vcpu);
-	run->tpr_access.rip = vcpu->arch.rip;
+	run->tpr_access.rip = kvm_rip_read(vcpu);
 	run->tpr_access.is_write = write;
 }
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8233b86..54b0bf3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -18,6 +18,7 @@
 #include "kvm_svm.h"
 #include "irq.h"
 #include "mmu.h"
+#include "kvm_cache_regs.h"
 
 #include <linux/module.h>
 #include <linux/kernel.h>
@@ -236,13 +237,11 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 		printk(KERN_DEBUG "%s: NOP\n", __func__);
 		return;
 	}
-	if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
-		printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
-		       __func__,
-		       svm->vmcb->save.rip,
-		       svm->next_rip);
+	if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
+		printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
+		       __func__, kvm_rip_read(vcpu), svm->next_rip);
 
-	vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip;
+	kvm_rip_write(vcpu, svm->next_rip);
 	svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 
 	vcpu->arch.interrupt_window_open = 1;
@@ -581,6 +580,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 	save->dr7 = 0x400;
 	save->rflags = 2;
 	save->rip = 0x0000fff0;
+	svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
 
 	/*
 	 * cr0 val on cpu init should be 0x60000010, we enable cpu
@@ -615,10 +615,12 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
 	init_vmcb(svm);
 
 	if (vcpu->vcpu_id != 0) {
-		svm->vmcb->save.rip = 0;
+		kvm_rip_write(vcpu, 0);
 		svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
 		svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
 	}
+	vcpu->arch.regs_avail = ~0;
+	vcpu->arch.regs_dirty = ~0;
 
 	return 0;
 }
@@ -721,23 +723,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 	rdtscll(vcpu->arch.host_tsc);
 }
 
-static void svm_cache_regs(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
-	vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
-	vcpu->arch.rip = svm->vmcb->save.rip;
-}
-
-static void svm_decache_regs(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
-	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
-	svm->vmcb->save.rip = vcpu->arch.rip;
-}
-
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 {
 	return to_svm(vcpu)->vmcb->save.rflags;
@@ -1139,14 +1124,14 @@ static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 
 static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
-	svm->next_rip = svm->vmcb->save.rip + 1;
+	svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
 	skip_emulated_instruction(&svm->vcpu);
 	return kvm_emulate_halt(&svm->vcpu);
 }
 
 static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
-	svm->next_rip = svm->vmcb->save.rip + 3;
+	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
 	skip_emulated_instruction(&svm->vcpu);
 	kvm_emulate_hypercall(&svm->vcpu);
 	return 1;
@@ -1178,7 +1163,7 @@ static int task_switch_interception(struct vcpu_svm *svm,
 
 static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
-	svm->next_rip = svm->vmcb->save.rip + 2;
+	svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
 	kvm_emulate_cpuid(&svm->vcpu);
 	return 1;
 }
@@ -1273,9 +1258,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 		KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data,
 			    (u32)(data >> 32), handler);
 
-		svm->vmcb->save.rax = data & 0xffffffff;
+		svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
 		svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
-		svm->next_rip = svm->vmcb->save.rip + 2;
+		svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
 		skip_emulated_instruction(&svm->vcpu);
 	}
 	return 1;
@@ -1359,13 +1344,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
 	u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
-	u64 data = (svm->vmcb->save.rax & -1u)
+	u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
 		| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
 
 	KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32),
 		    handler);
 
-	svm->next_rip = svm->vmcb->save.rip + 2;
+	svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
 	if (svm_set_msr(&svm->vcpu, ecx, data))
 		kvm_inject_gp(&svm->vcpu, 0);
 	else
@@ -1723,6 +1708,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	u16 gs_selector;
 	u16 ldt_selector;
 
+	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
 	pre_svm_run(svm);
 
 	sync_lapic_to_cr8(vcpu);
@@ -1858,6 +1847,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		load_db_regs(svm->host_db_regs);
 
 	vcpu->arch.cr2 = svm->vmcb->save.cr2;
+	vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+	vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+	vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
 
 	write_dr6(svm->host_dr6);
 	write_dr7(svm->host_dr7);
@@ -1977,8 +1969,6 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.set_gdt = svm_set_gdt,
 	.get_dr = svm_get_dr,
 	.set_dr = svm_set_dr,
-	.cache_regs = svm_cache_regs,
-	.decache_regs = svm_decache_regs,
 	.get_rflags = svm_get_rflags,
 	.set_rflags = svm_set_rflags,
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 81dac72..5a3a032 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -26,6 +26,7 @@
 #include <linux/highmem.h>
 #include <linux/sched.h>
 #include <linux/moduleparam.h>
+#include "kvm_cache_regs.h"
 
 #include <asm/io.h>
 #include <asm/desc.h>
@@ -715,9 +716,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	unsigned long rip;
 	u32 interruptibility;
 
-	rip = vmcs_readl(GUEST_RIP);
+	rip = kvm_rip_read(vcpu);
 	rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
-	vmcs_writel(GUEST_RIP, rip);
+	kvm_rip_write(vcpu, rip);
 
 	/*
 	 * We emulated an instruction, so temporary interrupt blocking
@@ -947,24 +948,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 	return ret;
 }
 
-/*
- * Sync the rsp and rip registers into the vcpu structure.  This allows
- * registers to be accessed by indexing vcpu->arch.regs.
- */
-static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
-{
-	vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
-	vcpu->arch.rip = vmcs_readl(GUEST_RIP);
-}
-
-/*
- * Syncs rsp and rip back into the vmcs.  Should be called after possible
- * modification.
- */
-static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
+static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 {
-	vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
-	vmcs_writel(GUEST_RIP, vcpu->arch.rip);
+	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+	switch (reg) {
+	case VCPU_REGS_RSP:
+		vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
+		break;
+	case VCPU_REGS_RIP:
+		vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
+		break;
+	default:
+		break;
+	}
 }
 
 static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
@@ -2019,6 +2015,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	u64 msr;
 	int ret;
 
+	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
 	down_read(&vcpu->kvm->slots_lock);
 	if (!init_rmode(vmx->vcpu.kvm)) {
 		ret = -ENOMEM;
@@ -2072,10 +2069,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	vmcs_writel(GUEST_RFLAGS, 0x02);
 	if (vmx->vcpu.vcpu_id == 0)
-		vmcs_writel(GUEST_RIP, 0xfff0);
+		kvm_rip_write(vcpu, 0xfff0);
 	else
-		vmcs_writel(GUEST_RIP, 0);
-	vmcs_writel(GUEST_RSP, 0);
+		kvm_rip_write(vcpu, 0);
+	kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
 
 	/* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
 	vmcs_writel(GUEST_DR7, 0x400);
@@ -2139,11 +2136,11 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 	if (vcpu->arch.rmode.active) {
 		vmx->rmode.irq.pending = true;
 		vmx->rmode.irq.vector = irq;
-		vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP);
+		vmx->rmode.irq.rip = kvm_rip_read(vcpu);
 		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 			     irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
 		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
-		vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1);
+		kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
 		return;
 	}
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -2288,7 +2285,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	}
 
 	error_code = 0;
-	rip = vmcs_readl(GUEST_RIP);
+	rip = kvm_rip_read(vcpu);
 	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
 		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
 	if (is_page_fault(intr_info)) {
@@ -2386,27 +2383,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	reg = (exit_qualification >> 8) & 15;
 	switch ((exit_qualification >> 4) & 3) {
 	case 0: /* mov to cr */
-		KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg],
-			    (u32)((u64)vcpu->arch.regs[reg] >> 32), handler);
+		KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr,
+			    (u32)kvm_register_read(vcpu, reg),
+			    (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
+			    handler);
 		switch (cr) {
 		case 0:
-			vcpu_load_rsp_rip(vcpu);
-			kvm_set_cr0(vcpu, vcpu->arch.regs[reg]);
+			kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg));
 			skip_emulated_instruction(vcpu);
 			return 1;
 		case 3:
-			vcpu_load_rsp_rip(vcpu);
-			kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
+			kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg));
 			skip_emulated_instruction(vcpu);
 			return 1;
 		case 4:
-			vcpu_load_rsp_rip(vcpu);
-			kvm_set_cr4(vcpu, vcpu->arch.regs[reg]);
+			kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
 			skip_emulated_instruction(vcpu);
 			return 1;
 		case 8:
-			vcpu_load_rsp_rip(vcpu);
-			kvm_set_cr8(vcpu, vcpu->arch.regs[reg]);
+			kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg));
 			skip_emulated_instruction(vcpu);
 			if (irqchip_in_kernel(vcpu->kvm))
 				return 1;
@@ -2415,7 +2410,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		};
 		break;
 	case 2: /* clts */
-		vcpu_load_rsp_rip(vcpu);
 		vmx_fpu_deactivate(vcpu);
 		vcpu->arch.cr0 &= ~X86_CR0_TS;
 		vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
@@ -2426,21 +2420,17 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	case 1: /*mov from cr*/
 		switch (cr) {
 		case 3:
-			vcpu_load_rsp_rip(vcpu);
-			vcpu->arch.regs[reg] = vcpu->arch.cr3;
-			vcpu_put_rsp_rip(vcpu);
+			kvm_register_write(vcpu, reg, vcpu->arch.cr3);
 			KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
-				    (u32)vcpu->arch.regs[reg],
-				    (u32)((u64)vcpu->arch.regs[reg] >> 32),
+				    (u32)kvm_register_read(vcpu, reg),
+				    (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
 				    handler);
 			skip_emulated_instruction(vcpu);
 			return 1;
 		case 8:
-			vcpu_load_rsp_rip(vcpu);
-			vcpu->arch.regs[reg] = kvm_get_cr8(vcpu);
-			vcpu_put_rsp_rip(vcpu);
+			kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu));
 			KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
-				    (u32)vcpu->arch.regs[reg], handler);
+				    (u32)kvm_register_read(vcpu, reg), handler);
 			skip_emulated_instruction(vcpu);
 			return 1;
 		}
@@ -2472,7 +2462,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	dr = exit_qualification & 7;
 	reg = (exit_qualification >> 8) & 15;
-	vcpu_load_rsp_rip(vcpu);
 	if (exit_qualification & 16) {
 		/* mov from dr */
 		switch (dr) {
@@ -2485,12 +2474,11 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		default:
 			val = 0;
 		}
-		vcpu->arch.regs[reg] = val;
+		kvm_register_write(vcpu, reg, val);
 		KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
 	} else {
 		/* mov to dr */
 	}
-	vcpu_put_rsp_rip(vcpu);
 	skip_emulated_instruction(vcpu);
 	return 1;
 }
@@ -2735,8 +2723,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 vectoring_info = vmx->idt_vectoring_info;
 
-	KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP),
-		    (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit);
+	KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
+		    (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
 
 	/* Access CR3 don't cause VMExit in paging mode, so we need
 	 * to sync with guest real CR3. */
@@ -2922,9 +2910,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 {
 	vmx->rmode.irq.pending = 0;
-	if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip)
+	if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
 		return;
-	vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);
+	kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
 	if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
 		vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
 		vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
@@ -2941,6 +2929,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 intr_info;
 
+	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+
 	/*
 	 * Loading guest fpu may have cleared host cr0.ts
 	 */
@@ -3061,6 +3054,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 #endif
 	      );
 
+	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
+	vcpu->arch.regs_dirty = 0;
+
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 	if (vmx->rmode.irq.pending)
 		fixup_rmode_irq(vmx);
@@ -3224,8 +3220,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.set_idt = vmx_set_idt,
 	.get_gdt = vmx_get_gdt,
 	.set_gdt = vmx_set_gdt,
-	.cache_regs = vcpu_load_rsp_rip,
-	.decache_regs = vcpu_put_rsp_rip,
+	.cache_reg = vmx_cache_reg,
 	.get_rflags = vmx_get_rflags,
 	.set_rflags = vmx_set_rflags,
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0d682fc..2f0696b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -19,6 +19,7 @@
 #include "mmu.h"
 #include "i8254.h"
 #include "tss.h"
+#include "kvm_cache_regs.h"
 
 #include <linux/clocksource.h>
 #include <linux/kvm.h>
@@ -61,6 +62,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 				    struct kvm_cpuid_entry2 __user *entries);
 
 struct kvm_x86_ops *kvm_x86_ops;
+EXPORT_SYMBOL_GPL(kvm_x86_ops);
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "pf_fixed", VCPU_STAT(pf_fixed) },
@@ -2080,7 +2082,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
 {
 	u8 opcodes[4];
-	unsigned long rip = vcpu->arch.rip;
+	unsigned long rip = kvm_rip_read(vcpu);
 	unsigned long rip_linear;
 
 	if (!printk_ratelimit())
@@ -2102,6 +2104,14 @@ static struct x86_emulate_ops emulate_ops = {
 	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
 };
 
+static void cache_all_regs(struct kvm_vcpu *vcpu)
+{
+	kvm_register_read(vcpu, VCPU_REGS_RAX);
+	kvm_register_read(vcpu, VCPU_REGS_RSP);
+	kvm_register_read(vcpu, VCPU_REGS_RIP);
+	vcpu->arch.regs_dirty = ~0;
+}
+
 int emulate_instruction(struct kvm_vcpu *vcpu,
 			struct kvm_run *run,
 			unsigned long cr2,
@@ -2112,7 +2122,13 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	struct decode_cache *c;
 
 	vcpu->arch.mmio_fault_cr2 = cr2;
-	kvm_x86_ops->cache_regs(vcpu);
+	/*
+	 * TODO: fix x86_emulate.c to use guest_read/write_register
+	 * instead of direct ->regs accesses, can save hundred cycles
+	 * on Intel for instructions that don't read/change RSP, for
+	 * for example.
+	 */
+	cache_all_regs(vcpu);
 
 	vcpu->mmio_is_write = 0;
 	vcpu->arch.pio.string = 0;
@@ -2172,7 +2188,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		return EMULATE_DO_MMIO;
 	}
 
-	kvm_x86_ops->decache_regs(vcpu);
 	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
 
 	if (vcpu->mmio_is_write) {
@@ -2225,20 +2240,19 @@ int complete_pio(struct kvm_vcpu *vcpu)
 	struct kvm_pio_request *io = &vcpu->arch.pio;
 	long delta;
 	int r;
-
-	kvm_x86_ops->cache_regs(vcpu);
+	unsigned long val;
 
 	if (!io->string) {
-		if (io->in)
-			memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
-			       io->size);
+		if (io->in) {
+			val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+			memcpy(&val, vcpu->arch.pio_data, io->size);
+			kvm_register_write(vcpu, VCPU_REGS_RAX, val);
+		}
 	} else {
 		if (io->in) {
 			r = pio_copy_data(vcpu);
-			if (r) {
-				kvm_x86_ops->cache_regs(vcpu);
+			if (r)
 				return r;
-			}
 		}
 
 		delta = 1;
@@ -2248,19 +2262,24 @@ int complete_pio(struct kvm_vcpu *vcpu)
 			 * The size of the register should really depend on
 			 * current address size.
 			 */
-			vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
+			val = kvm_register_read(vcpu, VCPU_REGS_RCX);
+			val -= delta;
+			kvm_register_write(vcpu, VCPU_REGS_RCX, val);
 		}
 		if (io->down)
 			delta = -delta;
 		delta *= io->size;
-		if (io->in)
-			vcpu->arch.regs[VCPU_REGS_RDI] += delta;
-		else
-			vcpu->arch.regs[VCPU_REGS_RSI] += delta;
+		if (io->in) {
+			val = kvm_register_read(vcpu, VCPU_REGS_RDI);
+			val += delta;
+			kvm_register_write(vcpu, VCPU_REGS_RDI, val);
+		} else {
+			val = kvm_register_read(vcpu, VCPU_REGS_RSI);
+			val += delta;
+			kvm_register_write(vcpu, VCPU_REGS_RSI, val);
+		}
 	}
 
-	kvm_x86_ops->decache_regs(vcpu);
-
 	io->count -= io->cur_count;
 	io->cur_count = 0;
 
@@ -2313,6 +2332,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 		  int size, unsigned port)
 {
 	struct kvm_io_device *pio_dev;
+	unsigned long val;
 
 	vcpu->run->exit_reason = KVM_EXIT_IO;
 	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -2333,8 +2353,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 		KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
 			    handler);
 
-	kvm_x86_ops->cache_regs(vcpu);
-	memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
+	val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	memcpy(vcpu->arch.pio_data, &val, 4);
 
 	kvm_x86_ops->skip_emulated_instruction(vcpu);
 
@@ -2519,13 +2539,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	unsigned long nr, a0, a1, a2, a3, ret;
 	int r = 1;
 
-	kvm_x86_ops->cache_regs(vcpu);
-
-	nr = vcpu->arch.regs[VCPU_REGS_RAX];
-	a0 = vcpu->arch.regs[VCPU_REGS_RBX];
-	a1 = vcpu->arch.regs[VCPU_REGS_RCX];
-	a2 = vcpu->arch.regs[VCPU_REGS_RDX];
-	a3 = vcpu->arch.regs[VCPU_REGS_RSI];
+	nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
+	a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
+	a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
+	a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
 
 	KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
 
@@ -2548,8 +2566,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 		ret = -KVM_ENOSYS;
 		break;
 	}
-	vcpu->arch.regs[VCPU_REGS_RAX] = ret;
-	kvm_x86_ops->decache_regs(vcpu);
+	kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
 	++vcpu->stat.hypercalls;
 	return r;
 }
@@ -2559,6 +2576,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
 {
 	char instruction[3];
 	int ret = 0;
+	unsigned long rip = kvm_rip_read(vcpu);
 
 
 	/*
@@ -2568,9 +2586,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
 	 */
 	kvm_mmu_zap_all(vcpu->kvm);
 
-	kvm_x86_ops->cache_regs(vcpu);
 	kvm_x86_ops->patch_hypercall(vcpu, instruction);
-	if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
+	if (emulator_write_emulated(rip, instruction, 3, vcpu)
 	    != X86EMUL_CONTINUE)
 		ret = -EFAULT;
 
@@ -2700,13 +2717,12 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 	u32 function, index;
 	struct kvm_cpuid_entry2 *e, *best;
 
-	kvm_x86_ops->cache_regs(vcpu);
-	function = vcpu->arch.regs[VCPU_REGS_RAX];
-	index = vcpu->arch.regs[VCPU_REGS_RCX];
-	vcpu->arch.regs[VCPU_REGS_RAX] = 0;
-	vcpu->arch.regs[VCPU_REGS_RBX] = 0;
-	vcpu->arch.regs[VCPU_REGS_RCX] = 0;
-	vcpu->arch.regs[VCPU_REGS_RDX] = 0;
+	function = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	index = kvm_register_read(vcpu, VCPU_REGS_RCX);
+	kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
+	kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
+	kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
 	best = NULL;
 	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
 		e = &vcpu->arch.cpuid_entries[i];
@@ -2724,18 +2740,17 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 				best = e;
 	}
 	if (best) {
-		vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
-		vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
-		vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
-		vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
+		kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
+		kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
+		kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
+		kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
 	}
-	kvm_x86_ops->decache_regs(vcpu);
 	kvm_x86_ops->skip_emulated_instruction(vcpu);
 	KVMTRACE_5D(CPUID, vcpu, function,
-		    (u32)vcpu->arch.regs[VCPU_REGS_RAX],
-		    (u32)vcpu->arch.regs[VCPU_REGS_RBX],
-		    (u32)vcpu->arch.regs[VCPU_REGS_RCX],
-		    (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler);
+		    (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
+		    (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
+		    (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
+		    (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
 
@@ -2917,8 +2932,8 @@ again:
 	 * Profile KVM exit RIPs:
 	 */
 	if (unlikely(prof_on == KVM_PROFILING)) {
-		kvm_x86_ops->cache_regs(vcpu);
-		profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
+		unsigned long rip = kvm_rip_read(vcpu);
+		profile_hit(KVM_PROFILING, (void *)rip);
 	}
 
 	if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
@@ -2999,11 +3014,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		}
 	}
 #endif
-	if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
-		kvm_x86_ops->cache_regs(vcpu);
-		vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
-		kvm_x86_ops->decache_regs(vcpu);
-	}
+	if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
+		kvm_register_write(vcpu, VCPU_REGS_RAX,
+				     kvm_run->hypercall.ret);
 
 	r = __vcpu_run(vcpu, kvm_run);
 
@@ -3019,28 +3032,26 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 	vcpu_load(vcpu);
 
-	kvm_x86_ops->cache_regs(vcpu);
-
-	regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
-	regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
-	regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
-	regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
-	regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
-	regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
-	regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
-	regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
+	regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+	regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+	regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+	regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
+	regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
+	regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+	regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
 #ifdef CONFIG_X86_64
-	regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
-	regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
-	regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
-	regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
-	regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
-	regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
-	regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
-	regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
+	regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
+	regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
+	regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
+	regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
+	regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
+	regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
+	regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
+	regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
 #endif
 
-	regs->rip = vcpu->arch.rip;
+	regs->rip = kvm_rip_read(vcpu);
 	regs->rflags = kvm_x86_ops->get_rflags(vcpu);
 
 	/*
@@ -3058,29 +3069,29 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 	vcpu_load(vcpu);
 
-	vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
-	vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
-	vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
-	vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
-	vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
-	vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
-	vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
-	vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
+	kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
+	kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
+	kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
+	kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
+	kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
+	kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
 #ifdef CONFIG_X86_64
-	vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
-	vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
-	vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
-	vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
-	vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
-	vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
-	vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
-	vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
+	kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
+	kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
+	kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
+	kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
+	kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
+	kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
+	kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
+	kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
+
 #endif
 
-	vcpu->arch.rip = regs->rip;
+	kvm_rip_write(vcpu, regs->rip);
 	kvm_x86_ops->set_rflags(vcpu, regs->rflags);
 
-	kvm_x86_ops->decache_regs(vcpu);
 
 	vcpu->arch.exception.pending = false;
 
@@ -3316,17 +3327,16 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
 				struct tss_segment_32 *tss)
 {
 	tss->cr3 = vcpu->arch.cr3;
-	tss->eip = vcpu->arch.rip;
+	tss->eip = kvm_rip_read(vcpu);
 	tss->eflags = kvm_x86_ops->get_rflags(vcpu);
-	tss->eax = vcpu->arch.regs[VCPU_REGS_RAX];
-	tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX];
-	tss->edx = vcpu->arch.regs[VCPU_REGS_RDX];
-	tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX];
-	tss->esp = vcpu->arch.regs[VCPU_REGS_RSP];
-	tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP];
-	tss->esi = vcpu->arch.regs[VCPU_REGS_RSI];
-	tss->edi = vcpu->arch.regs[VCPU_REGS_RDI];
-
+	tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+	tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+	tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+	tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+	tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+	tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
+	tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
 	tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
 	tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
 	tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
@@ -3342,17 +3352,17 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
 {
 	kvm_set_cr3(vcpu, tss->cr3);
 
-	vcpu->arch.rip = tss->eip;
+	kvm_rip_write(vcpu, tss->eip);
 	kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
 
-	vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax;
-	vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx;
-	vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx;
-	vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx;
-	vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp;
-	vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp;
-	vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi;
-	vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi;
+	kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
+	kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
+	kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
+	kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
+	kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
+	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
+	kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
 
 	if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
 		return 1;
@@ -3380,16 +3390,16 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
 static void save_state_to_tss16(struct kvm_vcpu *vcpu,
 				struct tss_segment_16 *tss)
 {
-	tss->ip = vcpu->arch.rip;
+	tss->ip = kvm_rip_read(vcpu);
 	tss->flag = kvm_x86_ops->get_rflags(vcpu);
-	tss->ax = vcpu->arch.regs[VCPU_REGS_RAX];
-	tss->cx = vcpu->arch.regs[VCPU_REGS_RCX];
-	tss->dx = vcpu->arch.regs[VCPU_REGS_RDX];
-	tss->bx = vcpu->arch.regs[VCPU_REGS_RBX];
-	tss->sp = vcpu->arch.regs[VCPU_REGS_RSP];
-	tss->bp = vcpu->arch.regs[VCPU_REGS_RBP];
-	tss->si = vcpu->arch.regs[VCPU_REGS_RSI];
-	tss->di = vcpu->arch.regs[VCPU_REGS_RDI];
+	tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+	tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+	tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+	tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+	tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+	tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
+	tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
 
 	tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
 	tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
@@ -3402,16 +3412,16 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
 static int load_state_from_tss16(struct kvm_vcpu *vcpu,
 				 struct tss_segment_16 *tss)
 {
-	vcpu->arch.rip = tss->ip;
+	kvm_rip_write(vcpu, tss->ip);
 	kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
-	vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax;
-	vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx;
-	vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx;
-	vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx;
-	vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp;
-	vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp;
-	vcpu->arch.regs[VCPU_REGS_RSI] = tss->si;
-	vcpu->arch.regs[VCPU_REGS_RDI] = tss->di;
+	kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
+	kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
+	kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
+	kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
+	kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
+	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
+	kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
 
 	if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
 		return 1;
@@ -3534,7 +3544,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
 	}
 
 	kvm_x86_ops->skip_emulated_instruction(vcpu);
-	kvm_x86_ops->cache_regs(vcpu);
 
 	if (nseg_desc.type & 8)
 		ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
@@ -3559,7 +3568,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
 	tr_seg.type = 11;
 	kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
 out:
-	kvm_x86_ops->decache_regs(vcpu);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(kvm_task_switch);
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index f2f9046..d5da7f1 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -26,6 +26,7 @@
 #define DPRINTF(_f, _a ...) printf(_f , ## _a)
 #else
 #include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
 #define DPRINTF(x...) do {} while (0)
 #endif
 #include <linux/module.h>
@@ -839,7 +840,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	/* Shadow copy of register state. Committed on successful emulation. */
 
 	memset(c, 0, sizeof(struct decode_cache));
-	c->eip = ctxt->vcpu->arch.rip;
+	c->eip = kvm_rip_read(ctxt->vcpu);
 	ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
 	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
 
@@ -1267,7 +1268,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	if (c->rep_prefix && (c->d & String)) {
 		/* All REP prefixes have the same first termination condition */
 		if (c->regs[VCPU_REGS_RCX] == 0) {
-			ctxt->vcpu->arch.rip = c->eip;
+			kvm_rip_write(ctxt->vcpu, c->eip);
 			goto done;
 		}
 		/* The second termination condition only applies for REPE
@@ -1281,17 +1282,17 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 				(c->b == 0xae) || (c->b == 0xaf)) {
 			if ((c->rep_prefix == REPE_PREFIX) &&
 				((ctxt->eflags & EFLG_ZF) == 0)) {
-					ctxt->vcpu->arch.rip = c->eip;
+					kvm_rip_write(ctxt->vcpu, c->eip);
 					goto done;
 			}
 			if ((c->rep_prefix == REPNE_PREFIX) &&
 				((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
-				ctxt->vcpu->arch.rip = c->eip;
+				kvm_rip_write(ctxt->vcpu, c->eip);
 				goto done;
 			}
 		}
 		c->regs[VCPU_REGS_RCX]--;
-		c->eip = ctxt->vcpu->arch.rip;
+		c->eip = kvm_rip_read(ctxt->vcpu);
 	}
 
 	if (c->src.type == OP_MEM) {
@@ -1768,7 +1769,7 @@ writeback:
 
 	/* Commit shadow register state. */
 	memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
-	ctxt->vcpu->arch.rip = c->eip;
+	kvm_rip_write(ctxt->vcpu, c->eip);
 
 done:
 	if (rc == X86EMUL_UNHANDLEABLE) {
@@ -1793,7 +1794,7 @@ twobyte_insn:
 				goto done;
 
 			/* Let the processor re-execute the fixed hypercall */
-			c->eip = ctxt->vcpu->arch.rip;
+			c->eip = kvm_rip_read(ctxt->vcpu);
 			/* Disable writeback. */
 			c->dst.type = OP_NONE;
 			break;
@@ -1889,7 +1890,7 @@ twobyte_insn:
 		rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
 		if (rc) {
 			kvm_inject_gp(ctxt->vcpu, 0);
-			c->eip = ctxt->vcpu->arch.rip;
+			c->eip = kvm_rip_read(ctxt->vcpu);
 		}
 		rc = X86EMUL_CONTINUE;
 		c->dst.type = OP_NONE;
@@ -1899,7 +1900,7 @@ twobyte_insn:
 		rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
 		if (rc) {
 			kvm_inject_gp(ctxt->vcpu, 0);
-			c->eip = ctxt->vcpu->arch.rip;
+			c->eip = kvm_rip_read(ctxt->vcpu);
 		} else {
 			c->regs[VCPU_REGS_RAX] = (u32)msr_data;
 			c->regs[VCPU_REGS_RDX] = msr_data >> 32;
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 6979454..8c886be 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -89,7 +89,7 @@ extern struct list_head vm_list;
 struct kvm_vcpu;
 struct kvm;
 
-enum {
+enum kvm_reg {
 	VCPU_REGS_RAX = 0,
 	VCPU_REGS_RCX = 1,
 	VCPU_REGS_RDX = 2,
@@ -108,6 +108,7 @@ enum {
 	VCPU_REGS_R14 = 14,
 	VCPU_REGS_R15 = 15,
 #endif
+	VCPU_REGS_RIP,
 	NR_VCPU_REGS
 };
 
@@ -219,8 +220,13 @@ struct kvm_vcpu_arch {
 	int interrupt_window_open;
 	unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
 	DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
-	unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
-	unsigned long rip;      /* needs vcpu_load_rsp_rip() */
+	/*
+	 * rip and regs accesses must go through
+	 * kvm_{register,rip}_{read,write} functions.
+	 */
+	unsigned long regs[NR_VCPU_REGS];
+	u32 regs_avail;
+	u32 regs_dirty;
 
 	unsigned long cr0;
 	unsigned long cr2;
@@ -414,8 +420,7 @@ struct kvm_x86_ops {
 	unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
 	void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
 		       int *exception);
-	void (*cache_regs)(struct kvm_vcpu *vcpu);
-	void (*decache_regs)(struct kvm_vcpu *vcpu);
+	void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
 	unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
 	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
 
-- 
cgit v0.10.2


From d98e6346350ac909f095768beb28b82368bd126f Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Tue, 1 Jul 2008 16:23:49 -0500
Subject: KVM: Move KVM TRACE DEFINITIONS to common header

Move KVM trace definitions from x86 specific kvm headers to common kvm
headers to create a cross-architecture numbering scheme for trace
events. This means the kvmtrace_format userspace tool won't need to know
which architecture produced the log file being processed.

Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/include/asm-x86/kvm.h b/include/asm-x86/kvm.h
index 78e954d..ba0dd79 100644
--- a/include/asm-x86/kvm.h
+++ b/include/asm-x86/kvm.h
@@ -208,26 +208,4 @@ struct kvm_pit_channel_state {
 struct kvm_pit_state {
 	struct kvm_pit_channel_state channels[3];
 };
-
-#define KVM_TRC_INJ_VIRQ         (KVM_TRC_HANDLER + 0x02)
-#define KVM_TRC_REDELIVER_EVT    (KVM_TRC_HANDLER + 0x03)
-#define KVM_TRC_PEND_INTR        (KVM_TRC_HANDLER + 0x04)
-#define KVM_TRC_IO_READ          (KVM_TRC_HANDLER + 0x05)
-#define KVM_TRC_IO_WRITE         (KVM_TRC_HANDLER + 0x06)
-#define KVM_TRC_CR_READ          (KVM_TRC_HANDLER + 0x07)
-#define KVM_TRC_CR_WRITE         (KVM_TRC_HANDLER + 0x08)
-#define KVM_TRC_DR_READ          (KVM_TRC_HANDLER + 0x09)
-#define KVM_TRC_DR_WRITE         (KVM_TRC_HANDLER + 0x0A)
-#define KVM_TRC_MSR_READ         (KVM_TRC_HANDLER + 0x0B)
-#define KVM_TRC_MSR_WRITE        (KVM_TRC_HANDLER + 0x0C)
-#define KVM_TRC_CPUID            (KVM_TRC_HANDLER + 0x0D)
-#define KVM_TRC_INTR             (KVM_TRC_HANDLER + 0x0E)
-#define KVM_TRC_NMI              (KVM_TRC_HANDLER + 0x0F)
-#define KVM_TRC_VMMCALL          (KVM_TRC_HANDLER + 0x10)
-#define KVM_TRC_HLT              (KVM_TRC_HANDLER + 0x11)
-#define KVM_TRC_CLTS             (KVM_TRC_HANDLER + 0x12)
-#define KVM_TRC_LMSW             (KVM_TRC_HANDLER + 0x13)
-#define KVM_TRC_APIC_ACCESS      (KVM_TRC_HANDLER + 0x14)
-#define KVM_TRC_TDP_FAULT        (KVM_TRC_HANDLER + 0x15)
-
 #endif /* ASM_X86__KVM_H */
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 8c886be..4f2bd88 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -691,24 +691,6 @@ enum {
 	TASK_SWITCH_GATE = 3,
 };
 
-#define KVMTRACE_5D(evt, vcpu, d1, d2, d3, d4, d5, name) \
-	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
-						vcpu, 5, d1, d2, d3, d4, d5)
-#define KVMTRACE_4D(evt, vcpu, d1, d2, d3, d4, name) \
-	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
-						vcpu, 4, d1, d2, d3, d4, 0)
-#define KVMTRACE_3D(evt, vcpu, d1, d2, d3, name) \
-	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
-						vcpu, 3, d1, d2, d3, 0, 0)
-#define KVMTRACE_2D(evt, vcpu, d1, d2, name) \
-	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
-						vcpu, 2, d1, d2, 0, 0, 0)
-#define KVMTRACE_1D(evt, vcpu, d1, name) \
-	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
-						vcpu, 1, d1, 0, 0, 0, 0)
-#define KVMTRACE_0D(evt, vcpu, name) \
-	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
-						vcpu, 0, 0, 0, 0, 0, 0)
 
 #ifdef CONFIG_64BIT
 # define KVM_EX_ENTRY ".quad"
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 70a3065..8a3cead 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -440,4 +440,25 @@ struct kvm_trace_rec {
 #define KVM_GET_MP_STATE          _IOR(KVMIO,  0x98, struct kvm_mp_state)
 #define KVM_SET_MP_STATE          _IOW(KVMIO,  0x99, struct kvm_mp_state)
 
+#define KVM_TRC_INJ_VIRQ         (KVM_TRC_HANDLER + 0x02)
+#define KVM_TRC_REDELIVER_EVT    (KVM_TRC_HANDLER + 0x03)
+#define KVM_TRC_PEND_INTR        (KVM_TRC_HANDLER + 0x04)
+#define KVM_TRC_IO_READ          (KVM_TRC_HANDLER + 0x05)
+#define KVM_TRC_IO_WRITE         (KVM_TRC_HANDLER + 0x06)
+#define KVM_TRC_CR_READ          (KVM_TRC_HANDLER + 0x07)
+#define KVM_TRC_CR_WRITE         (KVM_TRC_HANDLER + 0x08)
+#define KVM_TRC_DR_READ          (KVM_TRC_HANDLER + 0x09)
+#define KVM_TRC_DR_WRITE         (KVM_TRC_HANDLER + 0x0A)
+#define KVM_TRC_MSR_READ         (KVM_TRC_HANDLER + 0x0B)
+#define KVM_TRC_MSR_WRITE        (KVM_TRC_HANDLER + 0x0C)
+#define KVM_TRC_CPUID            (KVM_TRC_HANDLER + 0x0D)
+#define KVM_TRC_INTR             (KVM_TRC_HANDLER + 0x0E)
+#define KVM_TRC_NMI              (KVM_TRC_HANDLER + 0x0F)
+#define KVM_TRC_VMMCALL          (KVM_TRC_HANDLER + 0x10)
+#define KVM_TRC_HLT              (KVM_TRC_HANDLER + 0x11)
+#define KVM_TRC_CLTS             (KVM_TRC_HANDLER + 0x12)
+#define KVM_TRC_LMSW             (KVM_TRC_HANDLER + 0x13)
+#define KVM_TRC_APIC_ACCESS      (KVM_TRC_HANDLER + 0x14)
+#define KVM_TRC_TDP_FAULT        (KVM_TRC_HANDLER + 0x15)
+
 #endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8525afc..a18aaad 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -326,6 +326,25 @@ struct kvm_stats_debugfs_item {
 extern struct kvm_stats_debugfs_item debugfs_entries[];
 extern struct dentry *kvm_debugfs_dir;
 
+#define KVMTRACE_5D(evt, vcpu, d1, d2, d3, d4, d5, name) \
+	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+						vcpu, 5, d1, d2, d3, d4, d5)
+#define KVMTRACE_4D(evt, vcpu, d1, d2, d3, d4, name) \
+	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+						vcpu, 4, d1, d2, d3, d4, 0)
+#define KVMTRACE_3D(evt, vcpu, d1, d2, d3, name) \
+	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+						vcpu, 3, d1, d2, d3, 0, 0)
+#define KVMTRACE_2D(evt, vcpu, d1, d2, name) \
+	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+						vcpu, 2, d1, d2, 0, 0, 0)
+#define KVMTRACE_1D(evt, vcpu, d1, name) \
+	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+						vcpu, 1, d1, 0, 0, 0, 0)
+#define KVMTRACE_0D(evt, vcpu, name) \
+	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+						vcpu, 0, 0, 0, 0, 0, 0)
+
 #ifdef CONFIG_KVM_TRACE
 int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg);
 void kvm_trace_cleanup(void);
-- 
cgit v0.10.2


From 867767a365ee74a3adcfaba27075eefb66b14bfd Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@qumranet.com>
Date: Fri, 27 Jun 2008 15:55:02 +0300
Subject: KVM: Introduce kvm_set_irq to inject interrupts in guests

This function injects an interrupt into the guest given the kvm struct,
the (guest) irq number and the interrupt level.

Signed-off-by: Amit Shah <amit.shah@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 76d736b..0d9e552 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -100,3 +100,14 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
 	__kvm_migrate_apic_timer(vcpu);
 	__kvm_migrate_pit_timer(vcpu);
 }
+
+/* This should be called with the kvm->lock mutex held */
+void kvm_set_irq(struct kvm *kvm, int irq, int level)
+{
+	/* Not possible to detect if the guest uses the PIC or the
+	 * IOAPIC.  So set the bit in both. The guest will ignore
+	 * writes to the unused one.
+	 */
+	kvm_ioapic_set_irq(kvm->arch.vioapic, irq, level);
+	kvm_pic_set_irq(pic_irqchip(kvm), irq, level);
+}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7ca47cb..07ff2ae 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -82,6 +82,8 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
 
 void kvm_pic_reset(struct kvm_kpic_state *s);
 
+void kvm_set_irq(struct kvm *kvm, int irq, int level);
+
 void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
-- 
cgit v0.10.2


From 31aa2b44afd5e73365221b1de66f6081e4616f33 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Fri, 11 Jul 2008 17:59:46 +0300
Subject: KVM: MMU: Separate the code for unlinking a shadow page from its
 parents

Place into own function, in preparation for further cleanups.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3da2508..81016a3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -991,11 +991,10 @@ static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
 			kvm->vcpus[i]->arch.last_pte_updated = NULL;
 }
 
-static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	u64 *parent_pte;
 
-	++kvm->stat.mmu_shadow_zapped;
 	while (sp->multimapped || sp->parent_pte) {
 		if (!sp->multimapped)
 			parent_pte = sp->parent_pte;
@@ -1010,7 +1009,13 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 		kvm_mmu_put_page(sp, parent_pte);
 		set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
 	}
+}
+
+static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	++kvm->stat.mmu_shadow_zapped;
 	kvm_mmu_page_unlink_children(kvm, sp);
+	kvm_mmu_unlink_parents(kvm, sp);
 	if (!sp->root_count) {
 		if (!sp->role.metaphysical && !sp->role.invalid)
 			unaccount_shadowed(kvm, sp->gfn);
-- 
cgit v0.10.2


From 5b5c6a5a60801effb559e787a947885d9850a7da Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Fri, 11 Jul 2008 18:07:26 +0300
Subject: KVM: MMU: Simplify kvm_mmu_zap_page()

The twisty maze of conditionals can be reduced.

[joerg: fix tlb flushing]

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 81016a3..c3afbfe 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -955,7 +955,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 				rmap_remove(kvm, &pt[i]);
 			pt[i] = shadow_trap_nonpresent_pte;
 		}
-		kvm_flush_remote_tlbs(kvm);
 		return;
 	}
 
@@ -974,7 +973,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 		}
 		pt[i] = shadow_trap_nonpresent_pte;
 	}
-	kvm_flush_remote_tlbs(kvm);
 }
 
 static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -1016,18 +1014,16 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	++kvm->stat.mmu_shadow_zapped;
 	kvm_mmu_page_unlink_children(kvm, sp);
 	kvm_mmu_unlink_parents(kvm, sp);
+	kvm_flush_remote_tlbs(kvm);
+	if (!sp->role.invalid && !sp->role.metaphysical)
+		unaccount_shadowed(kvm, sp->gfn);
 	if (!sp->root_count) {
-		if (!sp->role.metaphysical && !sp->role.invalid)
-			unaccount_shadowed(kvm, sp->gfn);
 		hlist_del(&sp->hash_link);
 		kvm_mmu_free_page(kvm, sp);
 	} else {
-		int invalid = sp->role.invalid;
-		list_move(&sp->link, &kvm->arch.active_mmu_pages);
 		sp->role.invalid = 1;
+		list_move(&sp->link, &kvm->arch.active_mmu_pages);
 		kvm_reload_remote_mmus(kvm);
-		if (!sp->role.metaphysical && !invalid)
-			unaccount_shadowed(kvm, sp->gfn);
 	}
 	kvm_mmu_reset_last_pte_updated(kvm);
 }
@@ -1842,7 +1838,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
 	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
-		if (sp->gfn != gfn || sp->role.metaphysical)
+		if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid)
 			continue;
 		pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
 		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
-- 
cgit v0.10.2


From cf393f75661f4b17451377b353833eb5502a9688 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Tue, 1 Jul 2008 16:20:21 +0300
Subject: KVM: Move NMI IRET fault processing to new vmx_complete_interrupts()

Currently most interrupt exit processing is handled on the entry path,
which is confusing.  Move the NMI IRET fault processing to a new function,
vmx_complete_interrupts(), which is called on the vmexit path.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5a3a032..2adfacb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2817,6 +2817,27 @@ static void enable_intr_window(struct kvm_vcpu *vcpu)
 		enable_irq_window(vcpu);
 }
 
+static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
+{
+	u32 exit_intr_info;
+	bool unblock_nmi;
+	u8 vector;
+
+	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	if (cpu_has_virtual_nmis()) {
+		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
+		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+		/*
+		 * SDM 3: 25.7.1.2
+		 * Re-set bit "block by NMI" before VM entry if vmexit caused by
+		 * a guest IRET fault.
+		 */
+		if (unblock_nmi && vector != DF_VECTOR)
+			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+				      GUEST_INTR_STATE_NMI);
+	}
+}
+
 static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2873,23 +2894,12 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 		return;
 	}
 	if (cpu_has_virtual_nmis()) {
-		/*
-		 * SDM 3: 25.7.1.2
-		 * Re-set bit "block by NMI" before VM entry if vmexit caused by
-		 * a guest IRET fault.
-		 */
-		if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) &&
-		    (exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8)
-			vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
-				vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) |
-				GUEST_INTR_STATE_NMI);
-		else if (vcpu->arch.nmi_pending) {
+		if (vcpu->arch.nmi_pending) {
 			if (vmx_nmi_enabled(vcpu))
 				vmx_inject_nmi(vcpu);
 			enable_intr_window(vcpu);
 			return;
 		}
-
 	}
 	if (!kvm_cpu_has_interrupt(vcpu))
 		return;
@@ -3076,6 +3086,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		KVMTRACE_0D(NMI, vcpu, handler);
 		asm("int $2");
 	}
+
+	vmx_complete_interrupts(vmx);
 }
 
 static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
-- 
cgit v0.10.2


From 668f612fa0d8d4120ec5dc0725d7e1ca3152a954 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 2 Jul 2008 09:28:55 +0300
Subject: KVM: VMX: Move nmi injection failure processing to vm exit path

Instead of processing nmi injection failure in the vm entry path, move
it to the vm exit path (vm_complete_interrupts()).  This separates nmi
injection from nmi post-processing, and moves the nmi state from the VT
state into vcpu state (new variable nmi_injected specifying an injection
in progress).

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2adfacb..ce13b53 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2151,7 +2151,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
-	vcpu->arch.nmi_pending = 0;
 }
 
 static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
@@ -2820,8 +2819,11 @@ static void enable_intr_window(struct kvm_vcpu *vcpu)
 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 {
 	u32 exit_intr_info;
+	u32 idt_vectoring_info;
 	bool unblock_nmi;
 	u8 vector;
+	int type;
+	bool idtv_info_valid;
 
 	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 	if (cpu_has_virtual_nmis()) {
@@ -2836,18 +2838,34 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
 				      GUEST_INTR_STATE_NMI);
 	}
+
+	idt_vectoring_info = vmx->idt_vectoring_info;
+	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
+	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
+	if (vmx->vcpu.arch.nmi_injected) {
+		/*
+		 * SDM 3: 25.7.1.2
+		 * Clear bit "block by NMI" before VM entry if a NMI delivery
+		 * faulted.
+		 */
+		if (idtv_info_valid && type == INTR_TYPE_NMI_INTR)
+			vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+					GUEST_INTR_STATE_NMI);
+		else
+			vmx->vcpu.arch.nmi_injected = false;
+	}
 }
 
 static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u32 idtv_info_field, intr_info_field, exit_intr_info_field;
+	u32 idtv_info_field, intr_info_field;
 	int vector;
 
 	update_tpr_threshold(vcpu);
 
 	intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
-	exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO);
 	idtv_info_field = vmx->idt_vectoring_info;
 	if (intr_info_field & INTR_INFO_VALID_MASK) {
 		if (idtv_info_field & INTR_INFO_VALID_MASK) {
@@ -2871,17 +2889,6 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 
 		KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler);
 
-		/*
-		 * SDM 3: 25.7.1.2
-		 * Clear bit "block by NMI" before VM entry if a NMI delivery
-		 * faulted.
-		 */
-		if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
-		    == INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis())
-			vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
-				vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-				~GUEST_INTR_STATE_NMI);
-
 		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field
 				& ~INTR_INFO_RESVD_BITS_MASK);
 		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
@@ -2894,9 +2901,17 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 		return;
 	}
 	if (cpu_has_virtual_nmis()) {
-		if (vcpu->arch.nmi_pending) {
-			if (vmx_nmi_enabled(vcpu))
-				vmx_inject_nmi(vcpu);
+		if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
+			if (vmx_nmi_enabled(vcpu)) {
+				vcpu->arch.nmi_pending = false;
+				vcpu->arch.nmi_injected = true;
+			} else {
+				enable_intr_window(vcpu);
+				return;
+			}
+		}
+		if (vcpu->arch.nmi_injected) {
+			vmx_inject_nmi(vcpu);
 			enable_intr_window(vcpu);
 			return;
 		}
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 4f2bd88..7cf69fd 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -300,6 +300,7 @@ struct kvm_vcpu_arch {
 	struct page *time_page;
 
 	bool nmi_pending;
+	bool nmi_injected;
 
 	u64 mtrr[0x100];
 };
-- 
cgit v0.10.2


From 26eef70c3e8c76e73dff2579c792fc7355f8a291 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 3 Jul 2008 14:59:22 +0300
Subject: KVM: Clear exception queue before emulating an instruction

If we're emulating an instruction, either it will succeed, in which case
any previously queued exception will be spurious, or we will requeue the
same exception.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2f0696b..5620df2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -20,6 +20,7 @@
 #include "i8254.h"
 #include "tss.h"
 #include "kvm_cache_regs.h"
+#include "x86.h"
 
 #include <linux/clocksource.h>
 #include <linux/kvm.h>
@@ -2121,6 +2122,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	int r;
 	struct decode_cache *c;
 
+	kvm_clear_exception_queue(vcpu);
 	vcpu->arch.mmio_fault_cr2 = cr2;
 	/*
 	 * TODO: fix x86_emulate.c to use guest_read/write_register
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
new file mode 100644
index 0000000..c666649
--- /dev/null
+++ b/arch/x86/kvm/x86.h
@@ -0,0 +1,11 @@
+#ifndef ARCH_X86_KVM_X86_H
+#define ARCH_X86_KVM_X86_H
+
+#include <linux/kvm_host.h>
+
+static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.exception.pending = false;
+}
+
+#endif
-- 
cgit v0.10.2


From 35920a356957eea9fd1f9da043f93469e8d72eab Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 3 Jul 2008 14:50:12 +0300
Subject: KVM: VMX: Fix pending exception processing

The vmx code assumes that IDT-Vectoring can only be set when an exception
is injected due to the exception in question.  That's not true, however:
if the exception is injected correctly, and later another exception occurs
but its delivery is blocked due to a fault, then we will incorrectly assume
the first exception was not delivered.

Fix by unconditionally dequeuing the pending exception, and requeuing it
(or the second exception) if we see it in the IDT-Vectoring field.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ce13b53..ed4fe8e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -27,6 +27,7 @@
 #include <linux/sched.h>
 #include <linux/moduleparam.h>
 #include "kvm_cache_regs.h"
+#include "x86.h"
 
 #include <asm/io.h>
 #include <asm/desc.h>
@@ -744,9 +745,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 
 static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
 {
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+	return false;
 }
 
 /*
@@ -2824,6 +2823,7 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	u8 vector;
 	int type;
 	bool idtv_info_valid;
+	u32 error;
 
 	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 	if (cpu_has_virtual_nmis()) {
@@ -2855,6 +2855,15 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 		else
 			vmx->vcpu.arch.nmi_injected = false;
 	}
+	kvm_clear_exception_queue(&vmx->vcpu);
+	if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) {
+		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
+			error = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+			kvm_queue_exception_e(&vmx->vcpu, vector, error);
+		} else
+			kvm_queue_exception(&vmx->vcpu, vector);
+		vmx->idt_vectoring_info = 0;
+	}
 }
 
 static void vmx_intr_assist(struct kvm_vcpu *vcpu)
-- 
cgit v0.10.2


From 937a7eaef9f08342958d17055a350982b7bd92cb Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 3 Jul 2008 15:17:01 +0300
Subject: KVM: Add a pending interrupt queue

Similar to the exception queue, this hold interrupts that have been
accepted by the virtual processor core but not yet injected.

Not yet used.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index c666649..6a4be78 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -8,4 +8,15 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
 	vcpu->arch.exception.pending = false;
 }
 
+static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector)
+{
+	vcpu->arch.interrupt.pending = true;
+	vcpu->arch.interrupt.nr = vector;
+}
+
+static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.interrupt.pending = false;
+}
+
 #endif
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 7cf69fd..65c6a0e 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -275,6 +275,11 @@ struct kvm_vcpu_arch {
 		u32 error_code;
 	} exception;
 
+	struct kvm_queued_interrupt {
+		bool pending;
+		u8 nr;
+	} interrupt;
+
 	struct {
 		int active;
 		u8 save_iopl;
-- 
cgit v0.10.2


From f7d9238f5dc9306fd3bf1653c2939eef72f94d06 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 3 Jul 2008 16:14:28 +0300
Subject: KVM: VMX: Move interrupt post-processing to vmx_complete_interrupts()

Instead of looking at failed injections in the vm entry path, move
processing to the exit path in vmx_complete_interrupts().  This simplifes
the logic and removes any state that is hidden in vmx registers.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ed4fe8e..e8fed9b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1002,17 +1002,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
 
 static int vmx_get_irq(struct kvm_vcpu *vcpu)
 {
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u32 idtv_info_field;
-
-	idtv_info_field = vmx->idt_vectoring_info;
-	if (idtv_info_field & INTR_INFO_VALID_MASK) {
-		if (is_external_interrupt(idtv_info_field))
-			return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
-		else
-			printk(KERN_DEBUG "pending exception: not handled yet\n");
-	}
-	return -1;
+	if (!vcpu->arch.interrupt.pending)
+		return -1;
+	return vcpu->arch.interrupt.nr;
 }
 
 static __init int cpu_has_kvm_support(void)
@@ -2293,7 +2285,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		cr2 = vmcs_readl(EXIT_QUALIFICATION);
 		KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
 			    (u32)((u64)cr2 >> 32), handler);
-		if (vect_info & VECTORING_INFO_VALID_MASK)
+		if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending)
 			kvm_mmu_unprotect_page_virt(vcpu, cr2);
 		return kvm_mmu_page_fault(vcpu, cr2, error_code);
 	}
@@ -2864,51 +2856,20 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 			kvm_queue_exception(&vmx->vcpu, vector);
 		vmx->idt_vectoring_info = 0;
 	}
+	kvm_clear_interrupt_queue(&vmx->vcpu);
+	if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) {
+		kvm_queue_interrupt(&vmx->vcpu, vector);
+		vmx->idt_vectoring_info = 0;
+	}
 }
 
 static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 {
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u32 idtv_info_field, intr_info_field;
-	int vector;
+	u32 intr_info_field;
 
 	update_tpr_threshold(vcpu);
 
 	intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
-	idtv_info_field = vmx->idt_vectoring_info;
-	if (intr_info_field & INTR_INFO_VALID_MASK) {
-		if (idtv_info_field & INTR_INFO_VALID_MASK) {
-			/* TODO: fault when IDT_Vectoring */
-			if (printk_ratelimit())
-				printk(KERN_ERR "Fault when IDT_Vectoring\n");
-		}
-		enable_intr_window(vcpu);
-		return;
-	}
-	if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
-		if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
-		    == INTR_TYPE_EXT_INTR
-		    && vcpu->arch.rmode.active) {
-			u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
-
-			vmx_inject_irq(vcpu, vect);
-			enable_intr_window(vcpu);
-			return;
-		}
-
-		KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler);
-
-		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field
-				& ~INTR_INFO_RESVD_BITS_MASK);
-		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
-				vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
-
-		if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK))
-			vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
-				vmcs_read32(IDT_VECTORING_ERROR_CODE));
-		enable_intr_window(vcpu);
-		return;
-	}
 	if (cpu_has_virtual_nmis()) {
 		if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
 			if (vmx_nmi_enabled(vcpu)) {
@@ -2925,14 +2886,16 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 			return;
 		}
 	}
-	if (!kvm_cpu_has_interrupt(vcpu))
-		return;
-	if (vmx_irq_enabled(vcpu)) {
-		vector = kvm_cpu_get_interrupt(vcpu);
-		vmx_inject_irq(vcpu, vector);
-		kvm_timer_intr_post(vcpu, vector);
-	} else
-		enable_irq_window(vcpu);
+	if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
+		if (vmx_irq_enabled(vcpu))
+			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
+		else
+			enable_irq_window(vcpu);
+	}
+	if (vcpu->arch.interrupt.pending) {
+		vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+		kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
+	}
 }
 
 /*
-- 
cgit v0.10.2


From 60bd83a125030878665684f353c7d36fd54f09fd Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Sat, 12 Jul 2008 16:02:08 +0300
Subject: KVM: VMX: Remove redundant check in handle_rmode_exception

Since checking for vcpu->arch.rmode.active is already done whenever we
call handle_rmode_exception(), checking it inside the function is redundant.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e8fed9b..9591ca7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2224,9 +2224,6 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
 				  int vec, u32 err_code)
 {
-	if (!vcpu->arch.rmode.active)
-		return 0;
-
 	/*
 	 * Instruction with address size override prefix opcode 0x67
 	 * Cause the #SS fault with 0 error code in VM86 mode.
-- 
cgit v0.10.2


From 7edd0ce05892831c77fa4cebe24a6056d33336d5 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Mon, 7 Jul 2008 14:45:39 +0300
Subject: KVM: Consolidate PIC isr clearing into a function

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index c31164e..55e179a 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -30,6 +30,11 @@
 
 #include <linux/kvm_host.h>
 
+static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
+{
+	s->isr &= ~(1 << irq);
+}
+
 /*
  * set irq level. If an edge is detected, then the IRR is set to 1
  */
@@ -141,11 +146,12 @@ void kvm_pic_set_irq(void *opaque, int irq, int level)
  */
 static inline void pic_intack(struct kvm_kpic_state *s, int irq)
 {
+	s->isr |= 1 << irq;
 	if (s->auto_eoi) {
 		if (s->rotate_on_auto_eoi)
 			s->priority_add = (irq + 1) & 7;
-	} else
-		s->isr |= (1 << irq);
+		pic_clear_isr(s, irq);
+	}
 	/*
 	 * We don't clear a level sensitive interrupt here
 	 */
@@ -243,7 +249,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 				priority = get_priority(s, s->isr);
 				if (priority != 8) {
 					irq = (priority + s->priority_add) & 7;
-					s->isr &= ~(1 << irq);
+					pic_clear_isr(s, irq);
 					if (cmd == 5)
 						s->priority_add = (irq + 1) & 7;
 					pic_update_irq(s->pics_state);
@@ -251,7 +257,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 				break;
 			case 3:
 				irq = val & 7;
-				s->isr &= ~(1 << irq);
+				pic_clear_isr(s, irq);
 				pic_update_irq(s->pics_state);
 				break;
 			case 6:
@@ -260,8 +266,8 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 				break;
 			case 7:
 				irq = val & 7;
-				s->isr &= ~(1 << irq);
 				s->priority_add = (irq + 1) & 7;
+				pic_clear_isr(s, irq);
 				pic_update_irq(s->pics_state);
 				break;
 			default:
@@ -303,7 +309,7 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
 			s->pics_state->pics[0].irr &= ~(1 << 2);
 		}
 		s->irr &= ~(1 << ret);
-		s->isr &= ~(1 << ret);
+		pic_clear_isr(s, ret);
 		if (addr1 >> 7 || ret != 2)
 			pic_update_irq(s->pics_state);
 	} else {
-- 
cgit v0.10.2


From 19bd8afdc4e6fbb47e4841f8d771f8cb29916d9f Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@web.de>
Date: Sun, 13 Jul 2008 13:40:55 +0200
Subject: KVM: Consolidate XX_VECTOR defines

Signed-off-by: Jan Kiszka <jan.kiszka@web.de>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 54b0bf3..743c98d 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -36,10 +36,6 @@ MODULE_LICENSE("GPL");
 #define IOPM_ALLOC_ORDER 2
 #define MSRPM_ALLOC_ORDER 1
 
-#define DB_VECTOR 1
-#define UD_VECTOR 6
-#define GP_VECTOR 13
-
 #define DR7_GD_MASK (1 << 13)
 #define DR6_BD_MASK (1 << 13)
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9591ca7..5c9dac5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -470,7 +470,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 	if (!vcpu->fpu_active)
 		eb |= 1u << NM_VECTOR;
 	if (vcpu->guest_debug.enabled)
-		eb |= 1u << 1;
+		eb |= 1u << DB_VECTOR;
 	if (vcpu->arch.rmode.active)
 		eb = ~0;
 	if (vm_need_ept())
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 65c6a0e..da3c197 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -57,6 +57,7 @@
 #define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE)
 
 #define DE_VECTOR 0
+#define DB_VECTOR 1
 #define UD_VECTOR 6
 #define NM_VECTOR 7
 #define DF_VECTOR 8
-- 
cgit v0.10.2


From 77ab6db0a1c403387b403e9351ab3f5ae1df83e6 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@web.de>
Date: Mon, 14 Jul 2008 12:28:51 +0200
Subject: KVM: VMX: Reinject real mode exception

As we execute real mode guests in VM86 mode, exception have to be
reinjected appropriately when the guest triggered them. For this purpose
the patch adopts the real-mode injection pattern used in vmx_inject_irq
to vmx_queue_exception, additionally taking care that the IP is set
correctly for #BP exceptions. Furthermore it extends
handle_rmode_exception to reinject all those exceptions that can be
raised in real mode.

This fixes the execution of himem.exe from FreeDOS and also makes its
debug.com work properly.

Note that guest debugging in real mode is broken now. This has to be
fixed by the scheduled debugging infrastructure rework (will be done
once base patches for QEMU have been accepted).

Signed-off-by: Jan Kiszka <jan.kiszka@web.de>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5c9dac5..2879880 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -735,12 +735,30 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code)
 {
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (has_error_code)
+		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+
+	if (vcpu->arch.rmode.active) {
+		vmx->rmode.irq.pending = true;
+		vmx->rmode.irq.vector = nr;
+		vmx->rmode.irq.rip = kvm_rip_read(vcpu);
+		if (nr == BP_VECTOR)
+			vmx->rmode.irq.rip++;
+		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+			     nr | INTR_TYPE_SOFT_INTR
+			     | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
+			     | INTR_INFO_VALID_MASK);
+		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+		kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
+		return;
+	}
+
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 		     nr | INTR_TYPE_EXCEPTION
 		     | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
 		     | INTR_INFO_VALID_MASK);
-	if (has_error_code)
-		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
 }
 
 static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
@@ -2231,6 +2249,25 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
 	if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
 		if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
 			return 1;
+	/*
+	 * Forward all other exceptions that are valid in real mode.
+	 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
+	 *        the required debugging infrastructure rework.
+	 */
+	switch (vec) {
+	case DE_VECTOR:
+	case DB_VECTOR:
+	case BP_VECTOR:
+	case OF_VECTOR:
+	case BR_VECTOR:
+	case UD_VECTOR:
+	case DF_VECTOR:
+	case SS_VECTOR:
+	case GP_VECTOR:
+	case MF_VECTOR:
+		kvm_queue_exception(vcpu, vec);
+		return 1;
+	}
 	return 0;
 }
 
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index da3c197..83afa10 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -58,6 +58,9 @@
 
 #define DE_VECTOR 0
 #define DB_VECTOR 1
+#define BP_VECTOR 3
+#define OF_VECTOR 4
+#define BR_VECTOR 5
 #define UD_VECTOR 6
 #define NM_VECTOR 7
 #define DF_VECTOR 8
@@ -66,6 +69,7 @@
 #define SS_VECTOR 12
 #define GP_VECTOR 13
 #define PF_VECTOR 14
+#define MF_VECTOR 16
 #define MC_VECTOR 18
 
 #define SELECTOR_TI_MASK (1 << 2)
-- 
cgit v0.10.2


From c801949ddf0a51074937f488a52072825ed50174 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Mon, 14 Jul 2008 14:44:59 +0300
Subject: KVM: VMX: Unify register save/restore across 32 and 64 bit hosts

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2879880..fc8996b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2955,6 +2955,14 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 		| vmx->rmode.irq.vector;
 }
 
+#ifdef CONFIG_X86_64
+#define R "r"
+#define Q "q"
+#else
+#define R "e"
+#define Q "l"
+#endif
+
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2972,26 +2980,21 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	asm(
 		/* Store host registers */
-#ifdef CONFIG_X86_64
-		"push %%rdx; push %%rbp;"
-		"push %%rcx \n\t"
-#else
-		"push %%edx; push %%ebp;"
-		"push %%ecx \n\t"
-#endif
+		"push %%"R"dx; push %%"R"bp;"
+		"push %%"R"cx \n\t"
 		__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
 		/* Check if vmlaunch of vmresume is needed */
 		"cmpl $0, %c[launched](%0) \n\t"
 		/* Load guest registers.  Don't clobber flags. */
+		"mov %c[cr2](%0), %%"R"ax \n\t"
+		"mov %%"R"ax, %%cr2 \n\t"
+		"mov %c[rax](%0), %%"R"ax \n\t"
+		"mov %c[rbx](%0), %%"R"bx \n\t"
+		"mov %c[rdx](%0), %%"R"dx \n\t"
+		"mov %c[rsi](%0), %%"R"si \n\t"
+		"mov %c[rdi](%0), %%"R"di \n\t"
+		"mov %c[rbp](%0), %%"R"bp \n\t"
 #ifdef CONFIG_X86_64
-		"mov %c[cr2](%0), %%rax \n\t"
-		"mov %%rax, %%cr2 \n\t"
-		"mov %c[rax](%0), %%rax \n\t"
-		"mov %c[rbx](%0), %%rbx \n\t"
-		"mov %c[rdx](%0), %%rdx \n\t"
-		"mov %c[rsi](%0), %%rsi \n\t"
-		"mov %c[rdi](%0), %%rdi \n\t"
-		"mov %c[rbp](%0), %%rbp \n\t"
 		"mov %c[r8](%0),  %%r8  \n\t"
 		"mov %c[r9](%0),  %%r9  \n\t"
 		"mov %c[r10](%0), %%r10 \n\t"
@@ -3000,18 +3003,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		"mov %c[r13](%0), %%r13 \n\t"
 		"mov %c[r14](%0), %%r14 \n\t"
 		"mov %c[r15](%0), %%r15 \n\t"
-		"mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
-#else
-		"mov %c[cr2](%0), %%eax \n\t"
-		"mov %%eax,   %%cr2 \n\t"
-		"mov %c[rax](%0), %%eax \n\t"
-		"mov %c[rbx](%0), %%ebx \n\t"
-		"mov %c[rdx](%0), %%edx \n\t"
-		"mov %c[rsi](%0), %%esi \n\t"
-		"mov %c[rdi](%0), %%edi \n\t"
-		"mov %c[rbp](%0), %%ebp \n\t"
-		"mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
 #endif
+		"mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
+
 		/* Enter guest mode */
 		"jne .Llaunched \n\t"
 		__ex(ASM_VMX_VMLAUNCH) "\n\t"
@@ -3019,15 +3013,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
 		".Lkvm_vmx_return: "
 		/* Save guest registers, load host registers, keep flags */
+		"xchg %0,     (%%"R"sp) \n\t"
+		"mov %%"R"ax, %c[rax](%0) \n\t"
+		"mov %%"R"bx, %c[rbx](%0) \n\t"
+		"push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
+		"mov %%"R"dx, %c[rdx](%0) \n\t"
+		"mov %%"R"si, %c[rsi](%0) \n\t"
+		"mov %%"R"di, %c[rdi](%0) \n\t"
+		"mov %%"R"bp, %c[rbp](%0) \n\t"
 #ifdef CONFIG_X86_64
-		"xchg %0,     (%%rsp) \n\t"
-		"mov %%rax, %c[rax](%0) \n\t"
-		"mov %%rbx, %c[rbx](%0) \n\t"
-		"pushq (%%rsp); popq %c[rcx](%0) \n\t"
-		"mov %%rdx, %c[rdx](%0) \n\t"
-		"mov %%rsi, %c[rsi](%0) \n\t"
-		"mov %%rdi, %c[rdi](%0) \n\t"
-		"mov %%rbp, %c[rbp](%0) \n\t"
 		"mov %%r8,  %c[r8](%0) \n\t"
 		"mov %%r9,  %c[r9](%0) \n\t"
 		"mov %%r10, %c[r10](%0) \n\t"
@@ -3036,24 +3030,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		"mov %%r13, %c[r13](%0) \n\t"
 		"mov %%r14, %c[r14](%0) \n\t"
 		"mov %%r15, %c[r15](%0) \n\t"
-		"mov %%cr2, %%rax   \n\t"
-		"mov %%rax, %c[cr2](%0) \n\t"
-
-		"pop  %%rbp; pop  %%rbp; pop  %%rdx \n\t"
-#else
-		"xchg %0, (%%esp) \n\t"
-		"mov %%eax, %c[rax](%0) \n\t"
-		"mov %%ebx, %c[rbx](%0) \n\t"
-		"pushl (%%esp); popl %c[rcx](%0) \n\t"
-		"mov %%edx, %c[rdx](%0) \n\t"
-		"mov %%esi, %c[rsi](%0) \n\t"
-		"mov %%edi, %c[rdi](%0) \n\t"
-		"mov %%ebp, %c[rbp](%0) \n\t"
-		"mov %%cr2, %%eax  \n\t"
-		"mov %%eax, %c[cr2](%0) \n\t"
-
-		"pop %%ebp; pop %%ebp; pop %%edx \n\t"
 #endif
+		"mov %%cr2, %%"R"ax   \n\t"
+		"mov %%"R"ax, %c[cr2](%0) \n\t"
+
+		"pop  %%"R"bp; pop  %%"R"bp; pop  %%"R"dx \n\t"
 		"setbe %c[fail](%0) \n\t"
 	      : : "c"(vmx), "d"((unsigned long)HOST_RSP),
 		[launched]"i"(offsetof(struct vcpu_vmx, launched)),
@@ -3077,11 +3058,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 #endif
 		[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
 	      : "cc", "memory"
+		, R"bx", R"di", R"si"
 #ifdef CONFIG_X86_64
-		, "rbx", "rdi", "rsi"
 		, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
-#else
-		, "ebx", "edi", "rsi"
 #endif
 	      );
 
@@ -3111,6 +3090,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	vmx_complete_interrupts(vmx);
 }
 
+#undef R
+#undef Q
+
 static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-- 
cgit v0.10.2


From 80e31d4f61f69d0e480ae092cda0e590d6a30aeb Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Mon, 14 Jul 2008 14:44:59 +0300
Subject: KVM: SVM: Unify register save/restore across 32 and 64 bit hosts

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 743c98d..179c2e0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1697,6 +1697,12 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
 	svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
 }
 
+#ifdef CONFIG_X86_64
+#define R "r"
+#else
+#define R "e"
+#endif
+
 static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -1735,19 +1741,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	local_irq_enable();
 
 	asm volatile (
+		"push %%"R"bp; \n\t"
+		"mov %c[rbx](%[svm]), %%"R"bx \n\t"
+		"mov %c[rcx](%[svm]), %%"R"cx \n\t"
+		"mov %c[rdx](%[svm]), %%"R"dx \n\t"
+		"mov %c[rsi](%[svm]), %%"R"si \n\t"
+		"mov %c[rdi](%[svm]), %%"R"di \n\t"
+		"mov %c[rbp](%[svm]), %%"R"bp \n\t"
 #ifdef CONFIG_X86_64
-		"push %%rbp; \n\t"
-#else
-		"push %%ebp; \n\t"
-#endif
-
-#ifdef CONFIG_X86_64
-		"mov %c[rbx](%[svm]), %%rbx \n\t"
-		"mov %c[rcx](%[svm]), %%rcx \n\t"
-		"mov %c[rdx](%[svm]), %%rdx \n\t"
-		"mov %c[rsi](%[svm]), %%rsi \n\t"
-		"mov %c[rdi](%[svm]), %%rdi \n\t"
-		"mov %c[rbp](%[svm]), %%rbp \n\t"
 		"mov %c[r8](%[svm]),  %%r8  \n\t"
 		"mov %c[r9](%[svm]),  %%r9  \n\t"
 		"mov %c[r10](%[svm]), %%r10 \n\t"
@@ -1756,41 +1757,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		"mov %c[r13](%[svm]), %%r13 \n\t"
 		"mov %c[r14](%[svm]), %%r14 \n\t"
 		"mov %c[r15](%[svm]), %%r15 \n\t"
-#else
-		"mov %c[rbx](%[svm]), %%ebx \n\t"
-		"mov %c[rcx](%[svm]), %%ecx \n\t"
-		"mov %c[rdx](%[svm]), %%edx \n\t"
-		"mov %c[rsi](%[svm]), %%esi \n\t"
-		"mov %c[rdi](%[svm]), %%edi \n\t"
-		"mov %c[rbp](%[svm]), %%ebp \n\t"
 #endif
 
-#ifdef CONFIG_X86_64
 		/* Enter guest mode */
-		"push %%rax \n\t"
-		"mov %c[vmcb](%[svm]), %%rax \n\t"
+		"push %%"R"ax \n\t"
+		"mov %c[vmcb](%[svm]), %%"R"ax \n\t"
 		__ex(SVM_VMLOAD) "\n\t"
 		__ex(SVM_VMRUN) "\n\t"
 		__ex(SVM_VMSAVE) "\n\t"
-		"pop %%rax \n\t"
-#else
-		/* Enter guest mode */
-		"push %%eax \n\t"
-		"mov %c[vmcb](%[svm]), %%eax \n\t"
-		__ex(SVM_VMLOAD) "\n\t"
-		__ex(SVM_VMRUN) "\n\t"
-		__ex(SVM_VMSAVE) "\n\t"
-		"pop %%eax \n\t"
-#endif
+		"pop %%"R"ax \n\t"
 
 		/* Save guest registers, load host registers */
+		"mov %%"R"bx, %c[rbx](%[svm]) \n\t"
+		"mov %%"R"cx, %c[rcx](%[svm]) \n\t"
+		"mov %%"R"dx, %c[rdx](%[svm]) \n\t"
+		"mov %%"R"si, %c[rsi](%[svm]) \n\t"
+		"mov %%"R"di, %c[rdi](%[svm]) \n\t"
+		"mov %%"R"bp, %c[rbp](%[svm]) \n\t"
 #ifdef CONFIG_X86_64
-		"mov %%rbx, %c[rbx](%[svm]) \n\t"
-		"mov %%rcx, %c[rcx](%[svm]) \n\t"
-		"mov %%rdx, %c[rdx](%[svm]) \n\t"
-		"mov %%rsi, %c[rsi](%[svm]) \n\t"
-		"mov %%rdi, %c[rdi](%[svm]) \n\t"
-		"mov %%rbp, %c[rbp](%[svm]) \n\t"
 		"mov %%r8,  %c[r8](%[svm]) \n\t"
 		"mov %%r9,  %c[r9](%[svm]) \n\t"
 		"mov %%r10, %c[r10](%[svm]) \n\t"
@@ -1799,18 +1783,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		"mov %%r13, %c[r13](%[svm]) \n\t"
 		"mov %%r14, %c[r14](%[svm]) \n\t"
 		"mov %%r15, %c[r15](%[svm]) \n\t"
-
-		"pop  %%rbp; \n\t"
-#else
-		"mov %%ebx, %c[rbx](%[svm]) \n\t"
-		"mov %%ecx, %c[rcx](%[svm]) \n\t"
-		"mov %%edx, %c[rdx](%[svm]) \n\t"
-		"mov %%esi, %c[rsi](%[svm]) \n\t"
-		"mov %%edi, %c[rdi](%[svm]) \n\t"
-		"mov %%ebp, %c[rbp](%[svm]) \n\t"
-
-		"pop  %%ebp; \n\t"
 #endif
+		"pop %%"R"bp"
 		:
 		: [svm]"a"(svm),
 		  [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
@@ -1831,11 +1805,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		  [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
 #endif
 		: "cc", "memory"
+		, R"bx", R"cx", R"dx", R"si", R"di"
 #ifdef CONFIG_X86_64
-		, "rbx", "rcx", "rdx", "rsi", "rdi"
 		, "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
-#else
-		, "ebx", "ecx", "edx" , "esi", "edi"
 #endif
 		);
 
@@ -1867,6 +1839,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	svm->next_rip = 0;
 }
 
+#undef R
+
 static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
-- 
cgit v0.10.2


From e32c8f2c0720fb21c6f4a5f6ccbebdadc878f707 Mon Sep 17 00:00:00 2001
From: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Date: Mon, 14 Jul 2008 14:00:00 +0200
Subject: KVM: kvmtrace: Remove use of bit fields in kvm trace structure

This patch fixes kvmtrace use on big endian systems. When using bit fields the
compiler will lay data out in the wrong order expected when laid down into a
file.
This fixes it by using one variable instead of using bit fields.

Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 8a3cead..8a16b08 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -311,9 +311,13 @@ struct kvm_s390_interrupt {
 
 /* This structure represents a single trace buffer record. */
 struct kvm_trace_rec {
-	__u32 event:28;
-	__u32 extra_u32:3;
-	__u32 cycle_in:1;
+	/* variable rec_val
+	 * is split into:
+	 * bits 0 - 27  -> event id
+	 * bits 28 -30  -> number of extra data args of size u32
+	 * bits 31      -> binary indicator for if tsc is in record
+	 */
+	__u32 rec_val;
 	__u32 pid;
 	__u32 vcpu_id;
 	union {
@@ -327,6 +331,13 @@ struct kvm_trace_rec {
 	} u;
 };
 
+#define TRACE_REC_EVENT_ID(val) \
+		(0x0fffffff & (val))
+#define TRACE_REC_NUM_DATA_ARGS(val) \
+		(0x70000000 & ((val) << 28))
+#define TRACE_REC_TCS(val) \
+		(0x80000000 & ((val) << 31))
+
 #define KVMIO 0xAE
 
 /*
diff --git a/virt/kvm/kvm_trace.c b/virt/kvm/kvm_trace.c
index 58141f3..9acb78b 100644
--- a/virt/kvm/kvm_trace.c
+++ b/virt/kvm/kvm_trace.c
@@ -54,12 +54,13 @@ static void kvm_add_trace(void *probe_private, void *call_data,
 	struct kvm_trace *kt = kvm_trace;
 	struct kvm_trace_rec rec;
 	struct kvm_vcpu *vcpu;
-	int    i, extra, size;
+	int    i, size;
+	u32    extra;
 
 	if (unlikely(kt->trace_state != KVM_TRACE_STATE_RUNNING))
 		return;
 
-	rec.event	= va_arg(*args, u32);
+	rec.rec_val	= TRACE_REC_EVENT_ID(va_arg(*args, u32));
 	vcpu		= va_arg(*args, struct kvm_vcpu *);
 	rec.pid		= current->tgid;
 	rec.vcpu_id	= vcpu->vcpu_id;
@@ -67,21 +68,21 @@ static void kvm_add_trace(void *probe_private, void *call_data,
 	extra   	= va_arg(*args, u32);
 	WARN_ON(!(extra <= KVM_TRC_EXTRA_MAX));
 	extra 		= min_t(u32, extra, KVM_TRC_EXTRA_MAX);
-	rec.extra_u32   = extra;
 
-	rec.cycle_in 	= p->cycle_in;
+	rec.rec_val |= TRACE_REC_TCS(p->cycle_in)
+			| TRACE_REC_NUM_DATA_ARGS(extra);
 
-	if (rec.cycle_in) {
+	if (p->cycle_in) {
 		rec.u.cycle.cycle_u64 = get_cycles();
 
-		for (i = 0; i < rec.extra_u32; i++)
+		for (i = 0; i < extra; i++)
 			rec.u.cycle.extra_u32[i] = va_arg(*args, u32);
 	} else {
-		for (i = 0; i < rec.extra_u32; i++)
+		for (i = 0; i < extra; i++)
 			rec.u.nocycle.extra_u32[i] = va_arg(*args, u32);
 	}
 
-	size = calc_rec_size(rec.cycle_in, rec.extra_u32 * sizeof(u32));
+	size = calc_rec_size(p->cycle_in, extra * sizeof(u32));
 	relay_write(kt->rchan, &rec, size);
 }
 
-- 
cgit v0.10.2


From 3f7f95c65ef6a89472a28da1b9436eaeee288831 Mon Sep 17 00:00:00 2001
From: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Date: Mon, 14 Jul 2008 14:00:01 +0200
Subject: KVM: kvmtrace: replace get_cycles with ktime_get v3

The current kvmtrace code uses get_cycles() while the interpretation would be
easier using using nanoseconds. ktime_get() should give at least the same
accuracy as get_cycles on all architectures (even better on 32bit archs) but
at a better unit (e.g. comparable between hosts with different frequencies.

[avi: avoid ktime_t in public header]

Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 8a16b08..5d08f11b 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -322,12 +322,12 @@ struct kvm_trace_rec {
 	__u32 vcpu_id;
 	union {
 		struct {
-			__u64 cycle_u64;
+			__u64 timestamp;
 			__u32 extra_u32[KVM_TRC_EXTRA_MAX];
-		} __attribute__((packed)) cycle;
+		} __attribute__((packed)) timestamp;
 		struct {
 			__u32 extra_u32[KVM_TRC_EXTRA_MAX];
-		} nocycle;
+		} notimestamp;
 	} u;
 };
 
diff --git a/virt/kvm/kvm_trace.c b/virt/kvm/kvm_trace.c
index 9acb78b..41dcc84 100644
--- a/virt/kvm/kvm_trace.c
+++ b/virt/kvm/kvm_trace.c
@@ -17,6 +17,7 @@
 #include <linux/module.h>
 #include <linux/relay.h>
 #include <linux/debugfs.h>
+#include <linux/ktime.h>
 
 #include <linux/kvm_host.h>
 
@@ -35,16 +36,16 @@ static struct kvm_trace *kvm_trace;
 struct kvm_trace_probe {
 	const char *name;
 	const char *format;
-	u32 cycle_in;
+	u32 timestamp_in;
 	marker_probe_func *probe_func;
 };
 
-static inline int calc_rec_size(int cycle, int extra)
+static inline int calc_rec_size(int timestamp, int extra)
 {
 	int rec_size = KVM_TRC_HEAD_SIZE;
 
 	rec_size += extra;
-	return cycle ? rec_size += KVM_TRC_CYCLE_SIZE : rec_size;
+	return timestamp ? rec_size += KVM_TRC_CYCLE_SIZE : rec_size;
 }
 
 static void kvm_add_trace(void *probe_private, void *call_data,
@@ -69,20 +70,20 @@ static void kvm_add_trace(void *probe_private, void *call_data,
 	WARN_ON(!(extra <= KVM_TRC_EXTRA_MAX));
 	extra 		= min_t(u32, extra, KVM_TRC_EXTRA_MAX);
 
-	rec.rec_val |= TRACE_REC_TCS(p->cycle_in)
+	rec.rec_val |= TRACE_REC_TCS(p->timestamp_in)
 			| TRACE_REC_NUM_DATA_ARGS(extra);
 
-	if (p->cycle_in) {
-		rec.u.cycle.cycle_u64 = get_cycles();
+	if (p->timestamp_in) {
+		rec.u.timestamp.timestamp = ktime_to_ns(ktime_get());
 
 		for (i = 0; i < extra; i++)
-			rec.u.cycle.extra_u32[i] = va_arg(*args, u32);
+			rec.u.timestamp.extra_u32[i] = va_arg(*args, u32);
 	} else {
 		for (i = 0; i < extra; i++)
-			rec.u.nocycle.extra_u32[i] = va_arg(*args, u32);
+			rec.u.notimestamp.extra_u32[i] = va_arg(*args, u32);
 	}
 
-	size = calc_rec_size(p->cycle_in, extra * sizeof(u32));
+	size = calc_rec_size(p->timestamp_in, extra * sizeof(u32));
 	relay_write(kt->rchan, &rec, size);
 }
 
-- 
cgit v0.10.2


From 12f67556023389a6be929a56617142a8e8ab20fe Mon Sep 17 00:00:00 2001
From: Jerone Young <jyoung5@us.ibm.com>
Date: Mon, 14 Jul 2008 14:00:02 +0200
Subject: KVM: ppc: enable KVM_TRACE building for powerpc

This patch enables KVM_TRACE to build for PowerPC arch. This means just
adding sections to Kconfig and Makefile.

Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 6b07601..53aaa66 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -37,6 +37,17 @@ config KVM_BOOKE_HOST
 	  Provides host support for KVM on Book E PowerPC processors. Currently
 	  this works on 440 processors only.
 
+config KVM_TRACE
+	bool "KVM trace support"
+	depends on KVM && MARKERS && SYSFS
+	select RELAY
+	select DEBUG_FS
+	default n
+	---help---
+	  This option allows reading a trace of kvm-related events through
+	  relayfs.  Note the ABI is not considered stable and will be
+	  modified in future updates.
+
 source drivers/virtio/Kconfig
 
 endif # VIRTUALIZATION
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 04e3449..2a5d439 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -4,9 +4,11 @@
 
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/powerpc/kvm
 
-common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
+common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
 
-kvm-objs := $(common-objs) powerpc.o emulate.o booke_guest.o
+common-objs-$(CONFIG_KVM_TRACE)  += $(addprefix ../../../virt/kvm/, kvm_trace.o)
+
+kvm-objs := $(common-objs-y) powerpc.o emulate.o booke_guest.o
 obj-$(CONFIG_KVM) += kvm.o
 
 AFLAGS_booke_interrupts.o := -I$(obj)
-- 
cgit v0.10.2


From 31711f2294b38d8334efaf7dbac6da4781fd151e Mon Sep 17 00:00:00 2001
From: Jerone Young <jyoung5@us.ibm.com>
Date: Mon, 14 Jul 2008 14:00:03 +0200
Subject: KVM: ppc: adds trace points for ppc tlb activity

This patch adds trace points to track powerpc TLB activities using the
KVM_TRACE infrastructure.

Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 5a5602d..a207d16 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -19,6 +19,7 @@
 
 #include <linux/types.h>
 #include <linux/string.h>
+#include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <linux/highmem.h>
 #include <asm/mmu-44x.h>
@@ -175,6 +176,10 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
 	stlbe->word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf);
 	stlbe->word2 = kvmppc_44x_tlb_shadow_attrib(flags,
 	                                            vcpu->arch.msr & MSR_PR);
+
+	KVMTRACE_5D(STLB_WRITE, vcpu, victim,
+			stlbe->tid, stlbe->word0, stlbe->word1, stlbe->word2,
+			handler);
 }
 
 void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
@@ -204,6 +209,9 @@ void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
 
 		kvmppc_44x_shadow_release(vcpu, i);
 		stlbe->word0 = 0;
+		KVMTRACE_5D(STLB_INVAL, vcpu, i,
+				stlbe->tid, stlbe->word0, stlbe->word1,
+				stlbe->word2, handler);
 	}
 	up_write(&current->mm->mmap_sem);
 }
@@ -217,8 +225,13 @@ void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
 	/* XXX Replace loop with fancy data structures. */
 	down_write(&current->mm->mmap_sem);
 	for (i = 0; i <= tlb_44x_hwater; i++) {
+		struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
+
 		kvmppc_44x_shadow_release(vcpu, i);
-		vcpu->arch.shadow_tlb[i].word0 = 0;
+		stlbe->word0 = 0;
+		KVMTRACE_5D(STLB_INVAL, vcpu, i,
+				stlbe->tid, stlbe->word0, stlbe->word1,
+				stlbe->word2, handler);
 	}
 	up_write(&current->mm->mmap_sem);
 }
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 8c605d0..4a3e274 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -170,6 +170,10 @@ static int kvmppc_emul_tlbwe(struct kvm_vcpu *vcpu, u32 inst)
 		kvmppc_mmu_map(vcpu, eaddr, raddr >> PAGE_SHIFT, asid, flags);
 	}
 
+	KVMTRACE_5D(GTLB_WRITE, vcpu, index,
+			tlbe->tid, tlbe->word0, tlbe->word1, tlbe->word2,
+			handler);
+
 	return EMULATE_DONE;
 }
 
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 5d08f11b..e21a505 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -471,5 +471,8 @@ struct kvm_trace_rec {
 #define KVM_TRC_LMSW             (KVM_TRC_HANDLER + 0x13)
 #define KVM_TRC_APIC_ACCESS      (KVM_TRC_HANDLER + 0x14)
 #define KVM_TRC_TDP_FAULT        (KVM_TRC_HANDLER + 0x15)
+#define KVM_TRC_GTLB_WRITE       (KVM_TRC_HANDLER + 0x16)
+#define KVM_TRC_STLB_WRITE       (KVM_TRC_HANDLER + 0x17)
+#define KVM_TRC_STLB_INVAL       (KVM_TRC_HANDLER + 0x18)
 
 #endif
-- 
cgit v0.10.2


From 3b4bd7969f7b61a1ab455bff084ee4f0a2411055 Mon Sep 17 00:00:00 2001
From: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Date: Mon, 14 Jul 2008 14:00:04 +0200
Subject: KVM: ppc: trace powerpc instruction emulation

This patch adds a trace point for the instruction emulation on embedded powerpc
utilizing the KVM_TRACE interface.

Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 4a3e274..c3ed63b 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -769,6 +769,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		break;
 	}
 
+	KVMTRACE_3D(PPC_INSTR, vcpu, inst, vcpu->arch.pc, emulated, entryexit);
+
 	if (advance)
 		vcpu->arch.pc += 4; /* Advance past emulated instruction. */
 
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index e21a505..d29b648 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -474,5 +474,6 @@ struct kvm_trace_rec {
 #define KVM_TRC_GTLB_WRITE       (KVM_TRC_HANDLER + 0x16)
 #define KVM_TRC_STLB_WRITE       (KVM_TRC_HANDLER + 0x17)
 #define KVM_TRC_STLB_INVAL       (KVM_TRC_HANDLER + 0x18)
+#define KVM_TRC_PPC_INSTR        (KVM_TRC_HANDLER + 0x19)
 
 #endif
-- 
cgit v0.10.2


From 313dbd49dc239205b96da79fba09f7637cf84f3c Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 17 Jul 2008 18:04:30 +0300
Subject: KVM: VMX: Avoid vmwrite(HOST_RSP) when possible

Usually HOST_RSP retains its value across guest entries.  Take advantage
of this and avoid a vmwrite() when this is so.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fc8996b..ddb49e3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -58,6 +58,7 @@ struct vmcs {
 struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
 	struct list_head      local_vcpus_link;
+	unsigned long         host_rsp;
 	int                   launched;
 	u8                    fail;
 	u32                   idt_vectoring_info;
@@ -2982,7 +2983,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		/* Store host registers */
 		"push %%"R"dx; push %%"R"bp;"
 		"push %%"R"cx \n\t"
+		"cmp %%"R"sp, %c[host_rsp](%0) \n\t"
+		"je 1f \n\t"
+		"mov %%"R"sp, %c[host_rsp](%0) \n\t"
 		__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
+		"1: \n\t"
 		/* Check if vmlaunch of vmresume is needed */
 		"cmpl $0, %c[launched](%0) \n\t"
 		/* Load guest registers.  Don't clobber flags. */
@@ -3039,6 +3044,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	      : : "c"(vmx), "d"((unsigned long)HOST_RSP),
 		[launched]"i"(offsetof(struct vcpu_vmx, launched)),
 		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
+		[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
 		[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
 		[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
 		[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
-- 
cgit v0.10.2


From b5e2fec0ebc3fcaff954092bb69444a67a904c0a Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 22 Jul 2008 08:00:45 +0200
Subject: KVM: Ignore DEBUGCTL MSRs with no effect

Netware writes to DEBUGCTL and reads from the DEBUGCTL and LAST*IP MSRs
without further checks and is really confused to receive a #GP during that.
To make it happy we should just make them stubs, which is exactly what SVM
already does.

Writes to DEBUGCTL that are vendor-specific are resembled to behave as if the
virtual CPU does not know them.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5620df2..94a2165 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -665,6 +665,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
 			__func__, data);
 		break;
+	case MSR_IA32_DEBUGCTLMSR:
+		if (!data) {
+			/* We support the non-activated case already */
+			break;
+		} else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
+			/* Values other than LBR and BTF are vendor-specific,
+			   thus reserved and should throw a #GP */
+			return 1;
+		}
+		pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+			__func__, data);
+		break;
 	case MSR_IA32_UCODE_REV:
 	case MSR_IA32_UCODE_WRITE:
 		break;
@@ -757,6 +769,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_IA32_MC0_MISC+16:
 	case MSR_IA32_UCODE_REV:
 	case MSR_IA32_EBL_CR_POWERON:
+	case MSR_IA32_DEBUGCTLMSR:
+	case MSR_IA32_LASTBRANCHFROMIP:
+	case MSR_IA32_LASTBRANCHTOIP:
+	case MSR_IA32_LASTINTFROMIP:
+	case MSR_IA32_LASTINTTOIP:
 		data = 0;
 		break;
 	case MSR_MTRRcap:
-- 
cgit v0.10.2


From 6a0ab738ef42d87951b3980f61b1f4cbb14d4171 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Fri, 25 Jul 2008 13:54:49 -0500
Subject: KVM: ppc: guest breakpoint support

Allow host userspace to program hardware debug registers to set breakpoints
inside guests.

Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 2655e2a..23bad40 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -86,6 +86,11 @@ struct kvm_vcpu_arch {
 
 	u32 host_stack;
 	u32 host_pid;
+	u32 host_dbcr0;
+	u32 host_dbcr1;
+	u32 host_dbcr2;
+	u32 host_iac[4];
+	u32 host_msr;
 
 	u64 fpr[32];
 	u32 gpr[32];
diff --git a/arch/powerpc/kvm/booke_guest.c b/arch/powerpc/kvm/booke_guest.c
index 9c8ad85..3cca079 100644
--- a/arch/powerpc/kvm/booke_guest.c
+++ b/arch/powerpc/kvm/booke_guest.c
@@ -410,6 +410,21 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 	}
 
+	case BOOKE_INTERRUPT_DEBUG: {
+		u32 dbsr;
+
+		vcpu->arch.pc = mfspr(SPRN_CSRR0);
+
+		/* clear IAC events in DBSR register */
+		dbsr = mfspr(SPRN_DBSR);
+		dbsr &= DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4;
+		mtspr(SPRN_DBSR, dbsr);
+
+		run->exit_reason = KVM_EXIT_DEBUG;
+		r = RESUME_HOST;
+		break;
+	}
+
 	default:
 		printk(KERN_EMERG "exit_nr %d\n", exit_nr);
 		BUG();
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 3b653b5..8eaba26 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -42,7 +42,8 @@
 #define HOST_STACK_LR   (HOST_STACK_SIZE + 4) /* In caller stack frame. */
 
 #define NEED_INST_MASK ((1<<BOOKE_INTERRUPT_PROGRAM) | \
-                        (1<<BOOKE_INTERRUPT_DTLB_MISS))
+                        (1<<BOOKE_INTERRUPT_DTLB_MISS) | \
+                        (1<<BOOKE_INTERRUPT_DEBUG))
 
 #define NEED_DEAR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \
                         (1<<BOOKE_INTERRUPT_DTLB_MISS))
@@ -431,6 +432,14 @@ lightweight_exit:
 	oris	r3, r3, KVMPPC_MSR_MASK@h
 	ori	r3, r3, KVMPPC_MSR_MASK@l
 	mtsrr1	r3
+
+	/* Clear any debug events which occurred since we disabled MSR[DE].
+	 * XXX This gives us a 3-instruction window in which a breakpoint
+	 * intended for guest context could fire in the host instead. */
+	lis	r3, 0xffff
+	ori	r3, r3, 0xffff
+	mtspr	SPRN_DBSR, r3
+
 	lwz	r3, VCPU_GPR(r3)(r4)
 	lwz	r4, VCPU_GPR(r4)(r4)
 	rfi
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 53826a5..b756071 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -239,18 +239,100 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
 }
 
+/* Note: clearing MSR[DE] just means that the debug interrupt will not be
+ * delivered *immediately*. Instead, it simply sets the appropriate DBSR bits.
+ * If those DBSR bits are still set when MSR[DE] is re-enabled, the interrupt
+ * will be delivered as an "imprecise debug event" (which is indicated by
+ * DBSR[IDE].
+ */
+static void kvmppc_disable_debug_interrupts(void)
+{
+	mtmsr(mfmsr() & ~MSR_DE);
+}
+
+static void kvmppc_restore_host_debug_state(struct kvm_vcpu *vcpu)
+{
+	kvmppc_disable_debug_interrupts();
+
+	mtspr(SPRN_IAC1, vcpu->arch.host_iac[0]);
+	mtspr(SPRN_IAC2, vcpu->arch.host_iac[1]);
+	mtspr(SPRN_IAC3, vcpu->arch.host_iac[2]);
+	mtspr(SPRN_IAC4, vcpu->arch.host_iac[3]);
+	mtspr(SPRN_DBCR1, vcpu->arch.host_dbcr1);
+	mtspr(SPRN_DBCR2, vcpu->arch.host_dbcr2);
+	mtspr(SPRN_DBCR0, vcpu->arch.host_dbcr0);
+	mtmsr(vcpu->arch.host_msr);
+}
+
+static void kvmppc_load_guest_debug_registers(struct kvm_vcpu *vcpu)
+{
+	struct kvm_guest_debug *dbg = &vcpu->guest_debug;
+	u32 dbcr0 = 0;
+
+	vcpu->arch.host_msr = mfmsr();
+	kvmppc_disable_debug_interrupts();
+
+	/* Save host debug register state. */
+	vcpu->arch.host_iac[0] = mfspr(SPRN_IAC1);
+	vcpu->arch.host_iac[1] = mfspr(SPRN_IAC2);
+	vcpu->arch.host_iac[2] = mfspr(SPRN_IAC3);
+	vcpu->arch.host_iac[3] = mfspr(SPRN_IAC4);
+	vcpu->arch.host_dbcr0 = mfspr(SPRN_DBCR0);
+	vcpu->arch.host_dbcr1 = mfspr(SPRN_DBCR1);
+	vcpu->arch.host_dbcr2 = mfspr(SPRN_DBCR2);
+
+	/* set registers up for guest */
+
+	if (dbg->bp[0]) {
+		mtspr(SPRN_IAC1, dbg->bp[0]);
+		dbcr0 |= DBCR0_IAC1 | DBCR0_IDM;
+	}
+	if (dbg->bp[1]) {
+		mtspr(SPRN_IAC2, dbg->bp[1]);
+		dbcr0 |= DBCR0_IAC2 | DBCR0_IDM;
+	}
+	if (dbg->bp[2]) {
+		mtspr(SPRN_IAC3, dbg->bp[2]);
+		dbcr0 |= DBCR0_IAC3 | DBCR0_IDM;
+	}
+	if (dbg->bp[3]) {
+		mtspr(SPRN_IAC4, dbg->bp[3]);
+		dbcr0 |= DBCR0_IAC4 | DBCR0_IDM;
+	}
+
+	mtspr(SPRN_DBCR0, dbcr0);
+	mtspr(SPRN_DBCR1, 0);
+	mtspr(SPRN_DBCR2, 0);
+}
+
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+	if (vcpu->guest_debug.enabled)
+		kvmppc_load_guest_debug_registers(vcpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
+	if (vcpu->guest_debug.enabled)
+		kvmppc_restore_host_debug_state(vcpu);
 }
 
 int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
                                     struct kvm_debug_guest *dbg)
 {
-	return -ENOTSUPP;
+	int i;
+
+	vcpu->guest_debug.enabled = dbg->enabled;
+	if (vcpu->guest_debug.enabled) {
+		for (i=0; i < ARRAY_SIZE(vcpu->guest_debug.bp); i++) {
+			if (dbg->breakpoints[i].enabled)
+				vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
+			else
+				vcpu->guest_debug.bp[i] = 0;
+		}
+	}
+
+	return 0;
 }
 
 static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
-- 
cgit v0.10.2


From 20754c2495a791b5b429c0da63394c86ade978e7 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Fri, 25 Jul 2008 13:54:51 -0500
Subject: KVM: ppc: Stop saving host TLB state

We're saving the host TLB state to memory on every exit, but never using it.
Originally I had thought that we'd want to restore host TLB for heavyweight
exits, but that could actually hurt when context switching to an unrelated host
process (i.e. not qemu).

Since this decreases the performance penalty of all exits, this patch improves
guest boot time by about 15%.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 23bad40..dc3a756 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -81,8 +81,6 @@ struct kvm_vcpu_arch {
 	struct tlbe shadow_tlb[PPC44x_TLB_SIZE];
 	/* Pages which are referenced in the shadow TLB. */
 	struct page *shadow_pages[PPC44x_TLB_SIZE];
-	/* Copy of the host's TLB. */
-	struct tlbe host_tlb[PPC44x_TLB_SIZE];
 
 	u32 host_stack;
 	u32 host_pid;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 92768d3..5940649 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -356,7 +356,6 @@ int main(void)
 
 	DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
 	DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
-	DEFINE(VCPU_HOST_TLB, offsetof(struct kvm_vcpu, arch.host_tlb));
 	DEFINE(VCPU_SHADOW_TLB, offsetof(struct kvm_vcpu, arch.shadow_tlb));
 	DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
 	DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 8eaba26..3e88dfa 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -342,26 +342,15 @@ lightweight_exit:
 	andc	r6, r5, r6
 	mtmsr	r6
 
-	/* Save the host's non-pinned TLB mappings, and load the guest mappings
-	 * over them. Leave the host's "pinned" kernel mappings in place. */
-	/* XXX optimization: use generation count to avoid swapping unmodified
-	 * entries. */
+	/* Load the guest mappings, leaving the host's "pinned" kernel mappings
+	 * in place. */
+	/* XXX optimization: load only modified guest entries. */
 	mfspr	r10, SPRN_MMUCR			/* Save host MMUCR. */
 	lis	r8, tlb_44x_hwater@ha
 	lwz	r8, tlb_44x_hwater@l(r8)
-	addi	r3, r4, VCPU_HOST_TLB - 4
 	addi	r9, r4, VCPU_SHADOW_TLB - 4
 	li	r6, 0
 1:
-	/* Save host entry. */
-	tlbre	r7, r6, PPC44x_TLB_PAGEID
-	mfspr	r5, SPRN_MMUCR
-	stwu	r5, 4(r3)
-	stwu	r7, 4(r3)
-	tlbre	r7, r6, PPC44x_TLB_XLAT
-	stwu	r7, 4(r3)
-	tlbre	r7, r6, PPC44x_TLB_ATTRIB
-	stwu	r7, 4(r3)
 	/* Load guest entry. */
 	lwzu	r7, 4(r9)
 	mtspr	SPRN_MMUCR, r7
-- 
cgit v0.10.2


From 83aae4a8098eb8a40a2e9dab3714354182143b4f Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Fri, 25 Jul 2008 13:54:52 -0500
Subject: KVM: ppc: Write only modified shadow entries into the TLB on exit

Track which TLB entries need to be written, instead of overwriting everything
below the high water mark. Typically only a single guest TLB entry will be
modified in a single exit.

Guest boot time performance improvement: about 15%.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index dc3a756..4338b03 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -82,6 +82,9 @@ struct kvm_vcpu_arch {
 	/* Pages which are referenced in the shadow TLB. */
 	struct page *shadow_pages[PPC44x_TLB_SIZE];
 
+	/* Track which TLB entries we've modified in the current exit. */
+	u8 shadow_tlb_mod[PPC44x_TLB_SIZE];
+
 	u32 host_stack;
 	u32 host_pid;
 	u32 host_dbcr0;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index a8b0687..8e7e429 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -65,6 +65,9 @@ extern void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
                                   gva_t eend, u32 asid);
 extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode);
 
+/* XXX Book E specific */
+extern void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i);
+
 extern void kvmppc_check_and_deliver_interrupts(struct kvm_vcpu *vcpu);
 
 static inline void kvmppc_queue_exception(struct kvm_vcpu *vcpu, int exception)
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 5940649..1631d67 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -357,6 +357,7 @@ int main(void)
 	DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
 	DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
 	DEFINE(VCPU_SHADOW_TLB, offsetof(struct kvm_vcpu, arch.shadow_tlb));
+	DEFINE(VCPU_SHADOW_MOD, offsetof(struct kvm_vcpu, arch.shadow_tlb_mod));
 	DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
 	DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
 	DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index a207d16..06a5fcf 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -125,6 +125,11 @@ static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu,
 	}
 }
 
+void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i)
+{
+    vcpu->arch.shadow_tlb_mod[i] = 1;
+}
+
 /* Caller must ensure that the specified guest TLB entry is safe to insert into
  * the shadow TLB. */
 void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
@@ -172,10 +177,10 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
 	 * use host large pages in the future. */
 	stlbe->word0 = (gvaddr & PAGE_MASK) | PPC44x_TLB_VALID | PPC44x_TLB_TS
 	               | PPC44x_TLB_4K;
-
 	stlbe->word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf);
 	stlbe->word2 = kvmppc_44x_tlb_shadow_attrib(flags,
 	                                            vcpu->arch.msr & MSR_PR);
+	kvmppc_tlbe_set_modified(vcpu, victim);
 
 	KVMTRACE_5D(STLB_WRITE, vcpu, victim,
 			stlbe->tid, stlbe->word0, stlbe->word1, stlbe->word2,
@@ -209,6 +214,7 @@ void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
 
 		kvmppc_44x_shadow_release(vcpu, i);
 		stlbe->word0 = 0;
+		kvmppc_tlbe_set_modified(vcpu, i);
 		KVMTRACE_5D(STLB_INVAL, vcpu, i,
 				stlbe->tid, stlbe->word0, stlbe->word1,
 				stlbe->word2, handler);
@@ -229,6 +235,7 @@ void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
 
 		kvmppc_44x_shadow_release(vcpu, i);
 		stlbe->word0 = 0;
+		kvmppc_tlbe_set_modified(vcpu, i);
 		KVMTRACE_5D(STLB_INVAL, vcpu, i,
 				stlbe->tid, stlbe->word0, stlbe->word1,
 				stlbe->word2, handler);
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 3e88dfa..564ea32 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -335,7 +335,7 @@ lightweight_exit:
 	lwz	r3, VCPU_PID(r4)
 	mtspr	SPRN_PID, r3
 
-	/* Prevent all TLB updates. */
+	/* Prevent all asynchronous TLB updates. */
 	mfmsr	r5
 	lis	r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@h
 	ori	r6, r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@l
@@ -344,28 +344,45 @@ lightweight_exit:
 
 	/* Load the guest mappings, leaving the host's "pinned" kernel mappings
 	 * in place. */
-	/* XXX optimization: load only modified guest entries. */
 	mfspr	r10, SPRN_MMUCR			/* Save host MMUCR. */
-	lis	r8, tlb_44x_hwater@ha
-	lwz	r8, tlb_44x_hwater@l(r8)
-	addi	r9, r4, VCPU_SHADOW_TLB - 4
-	li	r6, 0
+	li	r5, PPC44x_TLB_SIZE
+	lis	r5, tlb_44x_hwater@ha
+	lwz	r5, tlb_44x_hwater@l(r5)
+	mtctr	r5
+	addi	r9, r4, VCPU_SHADOW_TLB
+	addi	r5, r4, VCPU_SHADOW_MOD
+	li	r3, 0
 1:
+	lbzx	r7, r3, r5
+	cmpwi	r7, 0
+	beq	3f
+
 	/* Load guest entry. */
-	lwzu	r7, 4(r9)
+	mulli	r11, r3, TLBE_BYTES
+	add	r11, r11, r9
+	lwz	r7, 0(r11)
 	mtspr	SPRN_MMUCR, r7
-	lwzu	r7, 4(r9)
-	tlbwe	r7, r6, PPC44x_TLB_PAGEID
-	lwzu	r7, 4(r9)
-	tlbwe	r7, r6, PPC44x_TLB_XLAT
-	lwzu	r7, 4(r9)
-	tlbwe	r7, r6, PPC44x_TLB_ATTRIB
-	/* Increment index. */
-	addi	r6, r6, 1
-	cmpw	r6, r8
-	blt	1b
+	lwz	r7, 4(r11)
+	tlbwe	r7, r3, PPC44x_TLB_PAGEID
+	lwz	r7, 8(r11)
+	tlbwe	r7, r3, PPC44x_TLB_XLAT
+	lwz	r7, 12(r11)
+	tlbwe	r7, r3, PPC44x_TLB_ATTRIB
+3:
+	addi	r3, r3, 1                       /* Increment index. */
+	bdnz	1b
+
 	mtspr	SPRN_MMUCR, r10			/* Restore host MMUCR. */
 
+	/* Clear bitmap of modified TLB entries */
+	li	r5, PPC44x_TLB_SIZE>>2
+	mtctr	r5
+	addi	r5, r4, VCPU_SHADOW_MOD - 4
+	li	r6, 0
+1:
+	stwu	r6, 4(r5)
+	bdnz	1b
+
 	iccci	0, 0 /* XXX hack */
 
 	/* Load some guest volatiles. */
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index b756071..90a6fc4 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -27,6 +27,7 @@
 #include <asm/cputable.h>
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
+#include <asm/tlbflush.h>
 
 
 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
@@ -307,14 +308,28 @@ static void kvmppc_load_guest_debug_registers(struct kvm_vcpu *vcpu)
 
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+	int i;
+
 	if (vcpu->guest_debug.enabled)
 		kvmppc_load_guest_debug_registers(vcpu);
+
+	/* Mark every guest entry in the shadow TLB entry modified, so that they
+	 * will all be reloaded on the next vcpu run (instead of being
+	 * demand-faulted). */
+	for (i = 0; i <= tlb_44x_hwater; i++)
+		kvmppc_tlbe_set_modified(vcpu, i);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	if (vcpu->guest_debug.enabled)
 		kvmppc_restore_host_debug_state(vcpu);
+
+	/* Don't leave guest TLB entries resident when being de-scheduled. */
+	/* XXX It would be nice to differentiate between heavyweight exit and
+	 * sched_out here, since we could avoid the TLB flush for heavyweight
+	 * exits. */
+	_tlbia();
 }
 
 int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
-- 
cgit v0.10.2


From 49dd2c492895828a90ecdf889e7fe9cfb40a82a7 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Fri, 25 Jul 2008 13:54:53 -0500
Subject: KVM: powerpc: Map guest userspace with TID=0 mappings

When we use TID=N userspace mappings, we must ensure that kernel mappings have
been destroyed when entering userspace. Using TID=1/TID=0 for kernel/user
mappings and running userspace with PID=0 means that userspace can't access the
kernel mappings, but the kernel can directly access userspace.

The net is that we don't need to flush the TLB on privilege switches, but we do
on guest context switches (which are far more infrequent). Guest boot time
performance improvement: about 30%.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 4338b03..34b52b7 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -129,7 +129,11 @@ struct kvm_vcpu_arch {
 	u32 ivor[16];
 	u32 ivpr;
 	u32 pir;
+
+	u32 shadow_pid;
 	u32 pid;
+	u32 swap_pid;
+
 	u32 pvr;
 	u32 ccr0;
 	u32 ccr1;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 8e7e429..8931ba7 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -64,6 +64,7 @@ extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn,
 extern void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
                                   gva_t eend, u32 asid);
 extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode);
+extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid);
 
 /* XXX Book E specific */
 extern void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i);
@@ -95,4 +96,12 @@ static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
 		kvm_vcpu_block(vcpu);
 }
 
+static inline void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid)
+{
+	if (vcpu->arch.pid != new_pid) {
+		vcpu->arch.pid = new_pid;
+		vcpu->arch.swap_pid = 1;
+	}
+}
+
 #endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 1631d67..52649da 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -369,7 +369,7 @@ int main(void)
 	DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
 	DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
 	DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7));
-	DEFINE(VCPU_PID, offsetof(struct kvm_vcpu, arch.pid));
+	DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid));
 
 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
 	DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 06a5fcf..3594bbd 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -170,7 +170,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
 
 	/* XXX what about AS? */
 
-	stlbe->tid = asid & 0xff;
+	stlbe->tid = !(asid & 0xff);
 
 	/* Force TS=1 for all guest mappings. */
 	/* For now we hardcode 4KB mappings, but it will be important to
@@ -190,7 +190,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
 void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
                            gva_t eend, u32 asid)
 {
-	unsigned int pid = asid & 0xff;
+	unsigned int pid = !(asid & 0xff);
 	int i;
 
 	/* XXX Replace loop with fancy data structures. */
@@ -222,23 +222,30 @@ void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	up_write(&current->mm->mmap_sem);
 }
 
-/* Invalidate all mappings, so that when they fault back in they will get the
- * proper permission bits. */
+/* Invalidate all mappings on the privilege switch after PID has been changed.
+ * The guest always runs with PID=1, so we must clear the entire TLB when
+ * switching address spaces. */
 void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
 {
 	int i;
 
-	/* XXX Replace loop with fancy data structures. */
-	down_write(&current->mm->mmap_sem);
-	for (i = 0; i <= tlb_44x_hwater; i++) {
-		struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
-
-		kvmppc_44x_shadow_release(vcpu, i);
-		stlbe->word0 = 0;
-		kvmppc_tlbe_set_modified(vcpu, i);
-		KVMTRACE_5D(STLB_INVAL, vcpu, i,
-				stlbe->tid, stlbe->word0, stlbe->word1,
-				stlbe->word2, handler);
+	if (vcpu->arch.swap_pid) {
+		/* XXX Replace loop with fancy data structures. */
+		down_write(&current->mm->mmap_sem);
+		for (i = 0; i <= tlb_44x_hwater; i++) {
+			struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
+
+			/* Future optimization: clear only userspace mappings. */
+			kvmppc_44x_shadow_release(vcpu, i);
+			stlbe->word0 = 0;
+			kvmppc_tlbe_set_modified(vcpu, i);
+			KVMTRACE_5D(STLB_INVAL, vcpu, i,
+			            stlbe->tid, stlbe->word0, stlbe->word1,
+			            stlbe->word2, handler);
+		}
+		up_write(&current->mm->mmap_sem);
+		vcpu->arch.swap_pid = 0;
 	}
-	up_write(&current->mm->mmap_sem);
+
+	vcpu->arch.shadow_pid = !usermode;
 }
diff --git a/arch/powerpc/kvm/booke_guest.c b/arch/powerpc/kvm/booke_guest.c
index 3cca079..7b2591e 100644
--- a/arch/powerpc/kvm/booke_guest.c
+++ b/arch/powerpc/kvm/booke_guest.c
@@ -486,6 +486,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	vcpu->arch.msr = 0;
 	vcpu->arch.gpr[1] = (16<<20) - 8; /* -8 for the callee-save LR slot */
 
+	vcpu->arch.shadow_pid = 1;
+
 	/* Eye-catching number so we know if the guest takes an interrupt
 	 * before it's programmed its own IVPR. */
 	vcpu->arch.ivpr = 0x55550000;
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 564ea32..95e165b 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -332,7 +332,7 @@ lightweight_exit:
 
 	mfspr	r3, SPRN_PID
 	stw	r3, VCPU_HOST_PID(r4)
-	lwz	r3, VCPU_PID(r4)
+	lwz	r3, VCPU_SHADOW_PID(r4)
 	mtspr	SPRN_PID, r3
 
 	/* Prevent all asynchronous TLB updates. */
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index c3ed63b..0fce4fb 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -508,7 +508,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			case SPRN_MMUCR:
 				vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break;
 			case SPRN_PID:
-				vcpu->arch.pid = vcpu->arch.gpr[rs]; break;
+				kvmppc_set_pid(vcpu, vcpu->arch.gpr[rs]); break;
 			case SPRN_CCR0:
 				vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break;
 			case SPRN_CCR1:
-- 
cgit v0.10.2


From 564f15378f04921d5749f27ec53d5e68a6d1d446 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Sat, 26 Jul 2008 17:00:59 -0300
Subject: KVM: Add irq ack notifier list

This can be used by kvm subsystems that are interested in when
interrupts are acked, for example time drift compensation.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 0d9e552..9091195 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -111,3 +111,25 @@ void kvm_set_irq(struct kvm *kvm, int irq, int level)
 	kvm_ioapic_set_irq(kvm->arch.vioapic, irq, level);
 	kvm_pic_set_irq(pic_irqchip(kvm), irq, level);
 }
+
+void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi)
+{
+	struct kvm_irq_ack_notifier *kian;
+	struct hlist_node *n;
+
+	hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link)
+		if (kian->gsi == gsi)
+			kian->irq_acked(kian);
+}
+
+void kvm_register_irq_ack_notifier(struct kvm *kvm,
+				   struct kvm_irq_ack_notifier *kian)
+{
+	hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list);
+}
+
+void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
+				     struct kvm_irq_ack_notifier *kian)
+{
+	hlist_del(&kian->link);
+}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 07ff2ae..95fe718 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -83,6 +83,11 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
 void kvm_pic_reset(struct kvm_kpic_state *s);
 
 void kvm_set_irq(struct kvm *kvm, int irq, int level);
+void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi);
+void kvm_register_irq_ack_notifier(struct kvm *kvm,
+				   struct kvm_irq_ack_notifier *kian);
+void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
+				     struct kvm_irq_ack_notifier *kian);
 
 void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 83afa10..d451928 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -321,6 +321,12 @@ struct kvm_mem_alias {
 	gfn_t target_gfn;
 };
 
+struct kvm_irq_ack_notifier {
+	struct hlist_node link;
+	unsigned gsi;
+	void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
+};
+
 struct kvm_arch{
 	int naliases;
 	struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
@@ -336,6 +342,7 @@ struct kvm_arch{
 	struct kvm_pic *vpic;
 	struct kvm_ioapic *vioapic;
 	struct kvm_pit *vpit;
+	struct hlist_head irq_ack_notifier_list;
 
 	int round_robin_prev_vcpu;
 	unsigned int tss_addr;
-- 
cgit v0.10.2


From f52447261bc8c21dfd4635196e32d2da1352f589 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Sat, 26 Jul 2008 17:01:00 -0300
Subject: KVM: irq ack notification

Based on a patch from: Ben-Ami Yassour <benami@il.ibm.com>
which was based on a patch from: Amit Shah <amit.shah@qumranet.com>

Notify IRQ acking on PIC/APIC emulation. The previous patch missed two things:

- Edge triggered interrupts on IOAPIC
- PIC reset with IRR/ISR set should be equivalent to ack (LAPIC probably
needs something similar).

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
CC: Amit Shah <amit.shah@qumranet.com>
CC: Ben-Ami Yassour <benami@il.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 55e179a..de70499 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -159,9 +159,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq)
 		s->irr &= ~(1 << irq);
 }
 
-int kvm_pic_read_irq(struct kvm_pic *s)
+int kvm_pic_read_irq(struct kvm *kvm)
 {
 	int irq, irq2, intno;
+	struct kvm_pic *s = pic_irqchip(kvm);
 
 	irq = pic_get_irq(&s->pics[0]);
 	if (irq >= 0) {
@@ -187,12 +188,21 @@ int kvm_pic_read_irq(struct kvm_pic *s)
 		intno = s->pics[0].irq_base + irq;
 	}
 	pic_update_irq(s);
+	kvm_notify_acked_irq(kvm, irq);
 
 	return intno;
 }
 
 void kvm_pic_reset(struct kvm_kpic_state *s)
 {
+	int irq;
+	struct kvm *kvm = s->pics_state->irq_request_opaque;
+
+	for (irq = 0; irq < PIC_NUM_PINS; irq++) {
+		if (!(s->imr & (1 << irq)) && (s->irr & (1 << irq) ||
+		    s->isr & (1 << irq)))
+			kvm_notify_acked_irq(kvm, irq);
+	}
 	s->last_irr = 0;
 	s->irr = 0;
 	s->imr = 0;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 9091195..3c508af 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 		if (kvm_apic_accept_pic_intr(v)) {
 			s = pic_irqchip(v->kvm);
 			s->output = 0;		/* PIC */
-			vector = kvm_pic_read_irq(s);
+			vector = kvm_pic_read_irq(v->kvm);
 		}
 	}
 	return vector;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 95fe718..479a3d2 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -63,11 +63,12 @@ struct kvm_pic {
 	void *irq_request_opaque;
 	int output;		/* intr from master PIC */
 	struct kvm_io_device dev;
+	void (*ack_notifier)(void *opaque, int irq);
 };
 
 struct kvm_pic *kvm_create_pic(struct kvm *kvm);
 void kvm_pic_set_irq(void *opaque, int irq, int level);
-int kvm_pic_read_irq(struct kvm_pic *s);
+int kvm_pic_read_irq(struct kvm *kvm);
 void kvm_pic_update_irq(struct kvm_pic *s);
 
 static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9fde0ac..be94f93 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -439,7 +439,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
 static void apic_set_eoi(struct kvm_lapic *apic)
 {
 	int vector = apic_find_highest_isr(apic);
-
+	int trigger_mode;
 	/*
 	 * Not every write EOI will has corresponding ISR,
 	 * one example is when Kernel check timer on setup_IO_APIC
@@ -451,7 +451,10 @@ static void apic_set_eoi(struct kvm_lapic *apic)
 	apic_update_ppr(apic);
 
 	if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
-		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
+		trigger_mode = IOAPIC_LEVEL_TRIG;
+	else
+		trigger_mode = IOAPIC_EDGE_TRIG;
+	kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
 }
 
 static void apic_send_ipi(struct kvm_lapic *apic)
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index c0d2287..515cd7c 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -39,6 +39,7 @@
 
 #include "ioapic.h"
 #include "lapic.h"
+#include "irq.h"
 
 #if 0
 #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
@@ -285,26 +286,31 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
 	}
 }
 
-static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi)
+static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi,
+				    int trigger_mode)
 {
 	union ioapic_redir_entry *ent;
 
 	ent = &ioapic->redirtbl[gsi];
-	ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
 
-	ent->fields.remote_irr = 0;
-	if (!ent->fields.mask && (ioapic->irr & (1 << gsi)))
-		ioapic_service(ioapic, gsi);
+	kvm_notify_acked_irq(ioapic->kvm, gsi);
+
+	if (trigger_mode == IOAPIC_LEVEL_TRIG) {
+		ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
+		ent->fields.remote_irr = 0;
+		if (!ent->fields.mask && (ioapic->irr & (1 << gsi)))
+			ioapic_service(ioapic, gsi);
+	}
 }
 
-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
 {
 	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
 	int i;
 
 	for (i = 0; i < IOAPIC_NUM_PINS; i++)
 		if (ioapic->redirtbl[i].fields.vector == vector)
-			__kvm_ioapic_update_eoi(ioapic, i);
+			__kvm_ioapic_update_eoi(ioapic, i, trigger_mode);
 }
 
 static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr,
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 7f16675..b52732f 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -58,6 +58,7 @@ struct kvm_ioapic {
 	} redirtbl[IOAPIC_NUM_PINS];
 	struct kvm_io_device dev;
 	struct kvm *kvm;
+	void (*ack_notifier)(void *opaque, int irq);
 };
 
 #ifdef DEBUG
@@ -87,7 +88,7 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
 
 struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
 				       unsigned long bitmap);
-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
 int kvm_ioapic_init(struct kvm *kvm);
 void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
-- 
cgit v0.10.2


From 3cf57fed216e2c1b6fdfeccb792650bab72a350a Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Sat, 26 Jul 2008 17:01:01 -0300
Subject: KVM: PIT: fix injection logic and count

The PIT injection logic is problematic under the following cases:

1) If there is a higher priority vector to be delivered by the time
kvm_pit_timer_intr_post is invoked ps->inject_pending won't be set.
This opens the possibility for missing many PIT event injections (say if
guest executes hlt at this point).

2) ps->inject_pending is racy with more than two vcpus. Since there's no locking
around read/dec of pt->pending, two vcpu's can inject two interrupts for a single
pt->pending count.

Fix 1 by using an irq ack notifier: only reinject when the previous irq
has been acked. Fix 2 with appropriate locking around manipulation of
pending count and irq_ack by the injection / ack paths.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c0f7872..7d04dd3 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -207,6 +207,8 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
 
 	pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
 	pt->scheduled = ktime_to_ns(pt->timer.expires);
+	if (pt->period)
+		ps->channels[0].count_load_time = pt->timer.expires;
 
 	return (pt->period == 0 ? 0 : 1);
 }
@@ -215,12 +217,22 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
 {
 	struct kvm_pit *pit = vcpu->kvm->arch.vpit;
 
-	if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending)
+	if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
 		return atomic_read(&pit->pit_state.pit_timer.pending);
-
 	return 0;
 }
 
+void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+	struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
+						 irq_ack_notifier);
+	spin_lock(&ps->inject_lock);
+	if (atomic_dec_return(&ps->pit_timer.pending) < 0)
+		WARN_ON(1);
+	ps->irq_ack = 1;
+	spin_unlock(&ps->inject_lock);
+}
+
 static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
 {
 	struct kvm_kpit_state *ps;
@@ -255,8 +267,9 @@ static void destroy_pit_timer(struct kvm_kpit_timer *pt)
 	hrtimer_cancel(&pt->timer);
 }
 
-static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
+static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
 {
+	struct kvm_kpit_timer *pt = &ps->pit_timer;
 	s64 interval;
 
 	interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -268,6 +281,7 @@ static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
 	pt->period = (is_period == 0) ? 0 : interval;
 	pt->timer.function = pit_timer_fn;
 	atomic_set(&pt->pending, 0);
+	ps->irq_ack = 1;
 
 	hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
 		      HRTIMER_MODE_ABS);
@@ -302,11 +316,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
 	case 1:
         /* FIXME: enhance mode 4 precision */
 	case 4:
-		create_pit_timer(&ps->pit_timer, val, 0);
+		create_pit_timer(ps, val, 0);
 		break;
 	case 2:
 	case 3:
-		create_pit_timer(&ps->pit_timer, val, 1);
+		create_pit_timer(ps, val, 1);
 		break;
 	default:
 		destroy_pit_timer(&ps->pit_timer);
@@ -520,7 +534,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
 	mutex_unlock(&pit->pit_state.lock);
 
 	atomic_set(&pit->pit_state.pit_timer.pending, 0);
-	pit->pit_state.inject_pending = 1;
+	pit->pit_state.irq_ack = 1;
 }
 
 struct kvm_pit *kvm_create_pit(struct kvm *kvm)
@@ -534,6 +548,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
 
 	mutex_init(&pit->pit_state.lock);
 	mutex_lock(&pit->pit_state.lock);
+	spin_lock_init(&pit->pit_state.inject_lock);
 
 	/* Initialize PIO device */
 	pit->dev.read = pit_ioport_read;
@@ -555,6 +570,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
 	pit_state->pit = pit;
 	hrtimer_init(&pit_state->pit_timer.timer,
 		     CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	pit_state->irq_ack_notifier.gsi = 0;
+	pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
+	kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
 	mutex_unlock(&pit->pit_state.lock);
 
 	kvm_pit_reset(pit);
@@ -592,37 +610,19 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
 	struct kvm_kpit_state *ps;
 
 	if (vcpu && pit) {
+		int inject = 0;
 		ps = &pit->pit_state;
 
-		/* Try to inject pending interrupts when:
-		 * 1. Pending exists
-		 * 2. Last interrupt was accepted or waited for too long time*/
-		if (atomic_read(&ps->pit_timer.pending) &&
-		    (ps->inject_pending ||
-		    (jiffies - ps->last_injected_time
-				>= KVM_MAX_PIT_INTR_INTERVAL))) {
-			ps->inject_pending = 0;
-			__inject_pit_timer_intr(kvm);
-			ps->last_injected_time = jiffies;
-		}
-	}
-}
-
-void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
-{
-	struct kvm_arch *arch = &vcpu->kvm->arch;
-	struct kvm_kpit_state *ps;
-
-	if (vcpu && arch->vpit) {
-		ps = &arch->vpit->pit_state;
-		if (atomic_read(&ps->pit_timer.pending) &&
-		(((arch->vpic->pics[0].imr & 1) == 0 &&
-		  arch->vpic->pics[0].irq_base == vec) ||
-		  (arch->vioapic->redirtbl[0].fields.vector == vec &&
-		  arch->vioapic->redirtbl[0].fields.mask != 1))) {
-			ps->inject_pending = 1;
-			atomic_dec(&ps->pit_timer.pending);
-			ps->channels[0].count_load_time = ktime_get();
+		/* Try to inject pending interrupts when
+		 * last one has been acked.
+		 */
+		spin_lock(&ps->inject_lock);
+		if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
+			ps->irq_ack = 0;
+			inject = 1;
 		}
+		spin_unlock(&ps->inject_lock);
+		if (inject)
+			__inject_pit_timer_intr(kvm);
 	}
 }
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index db25c2a..e436d49 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -8,7 +8,6 @@ struct kvm_kpit_timer {
 	int irq;
 	s64 period; /* unit: ns */
 	s64 scheduled;
-	ktime_t last_update;
 	atomic_t pending;
 };
 
@@ -34,8 +33,9 @@ struct kvm_kpit_state {
 	u32    speaker_data_on;
 	struct mutex lock;
 	struct kvm_pit *pit;
-	bool inject_pending; /* if inject pending interrupts */
-	unsigned long last_injected_time;
+	spinlock_t inject_lock;
+	unsigned long irq_ack;
+	struct kvm_irq_ack_notifier irq_ack_notifier;
 };
 
 struct kvm_pit {
@@ -54,7 +54,6 @@ struct kvm_pit {
 #define KVM_PIT_CHANNEL_MASK	    0x3
 
 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
 void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
 struct kvm_pit *kvm_create_pit(struct kvm *kvm);
 void kvm_free_pit(struct kvm *kvm);
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 3c508af..8c1b9c5 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
 void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
 {
 	kvm_apic_timer_intr_post(vcpu, vec);
-	kvm_pit_timer_intr_post(vcpu, vec);
 	/* TODO: PIT, RTC etc. */
 }
 EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
-- 
cgit v0.10.2


From 3807f345b2c610336c17c7624a0d496a38df75a0 Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Mon, 28 Jul 2008 11:47:52 -0300
Subject: x86: paravirt: factor out cpu_khz to common code

KVM intends to use paravirt code to calibrate khz. Xen
current code will do just fine. So as a first step, factor out
code to pvclock.c.

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 05fbe9a..1c54b5f 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
 	return dst->version;
 }
 
+unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
+{
+	u64 tsc_khz = 1000000ULL << 32;
+
+	do_div(tsc_khz, src->tsc_to_system_mul);
+	if (src->tsc_shift < 0)
+		tsc_khz <<= -src->tsc_shift;
+	else
+		tsc_khz >>= src->tsc_shift;
+	return tsc_khz;
+}
+
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 {
 	struct pvclock_shadow_time shadow;
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 004ba86..c9f7cda 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -198,17 +198,10 @@ unsigned long long xen_sched_clock(void)
 /* Get the TSC speed from Xen */
 unsigned long xen_tsc_khz(void)
 {
-	u64 xen_khz = 1000000ULL << 32;
-	const struct pvclock_vcpu_time_info *info =
+	struct pvclock_vcpu_time_info *info =
 		&HYPERVISOR_shared_info->vcpu_info[0].time;
 
-	do_div(xen_khz, info->tsc_to_system_mul);
-	if (info->tsc_shift < 0)
-		xen_khz <<= -info->tsc_shift;
-	else
-		xen_khz >>= info->tsc_shift;
-
-	return xen_khz;
+	return pvclock_tsc_khz(info);
 }
 
 cycle_t xen_clocksource_read(void)
diff --git a/include/asm-x86/pvclock.h b/include/asm-x86/pvclock.h
index 1a38f68..ad29e27 100644
--- a/include/asm-x86/pvclock.h
+++ b/include/asm-x86/pvclock.h
@@ -6,6 +6,7 @@
 
 /* some helper functions for xen and kvm pv clock sources */
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
 void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
 			    struct pvclock_vcpu_time_info *vcpu,
 			    struct timespec *ts);
-- 
cgit v0.10.2


From 0293615f3fb9886b6b23800c121be293bb7483e9 Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Mon, 28 Jul 2008 11:47:53 -0300
Subject: x86: KVM guest: use paravirt function to calculate cpu khz

We're currently facing timing problems in guests that do
calibration under heavy load, and then the load vanishes.
This means we'll have a much lower lpj than we actually should,
and delays end up taking less time than they should, which is a
nasty bug.

Solution is to pass on the lpj value from host to guest, and have it
preset.

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index d02def0..774ac49 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void)
 	return ret;
 }
 
+/*
+ * If we don't do that, there is the possibility that the guest
+ * will calibrate under heavy load - thus, getting a lower lpj -
+ * and execute the delays themselves without load. This is wrong,
+ * because no delay loop can finish beforehand.
+ * Any heuristics is subject to fail, because ultimately, a large
+ * poll of guests can be running and trouble each other. So we preset
+ * lpj here
+ */
+static unsigned long kvm_get_tsc_khz(void)
+{
+	return preset_lpj;
+}
+
+static void kvm_get_preset_lpj(void)
+{
+	struct pvclock_vcpu_time_info *src;
+	unsigned long khz;
+	u64 lpj;
+
+	src = &per_cpu(hv_clock, 0);
+	khz = pvclock_tsc_khz(src);
+
+	lpj = ((u64)khz * 1000);
+	do_div(lpj, HZ);
+	preset_lpj = lpj;
+}
+
 static struct clocksource kvm_clock = {
 	.name = "kvm-clock",
 	.read = kvm_clock_read,
@@ -153,6 +181,7 @@ void __init kvmclock_init(void)
 		pv_time_ops.get_wallclock = kvm_get_wallclock;
 		pv_time_ops.set_wallclock = kvm_set_wallclock;
 		pv_time_ops.sched_clock = kvm_clock_read;
+		pv_time_ops.get_tsc_khz = kvm_get_tsc_khz;
 #ifdef CONFIG_X86_LOCAL_APIC
 		pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
 #endif
@@ -163,6 +192,7 @@ void __init kvmclock_init(void)
 #ifdef CONFIG_KEXEC
 		machine_ops.crash_shutdown  = kvm_crash_shutdown;
 #endif
+		kvm_get_preset_lpj();
 		clocksource_register(&kvm_clock);
 	}
 }
-- 
cgit v0.10.2


From cbff90a7caa49507d399c9a55ba4a411e840bfb4 Mon Sep 17 00:00:00 2001
From: Ben-Ami Yassour <benami@il.ibm.com>
Date: Mon, 28 Jul 2008 19:26:24 +0300
Subject: KVM: direct mmio pfn check

Userspace may specify memory slots that are backed by mmio pages rather than
normal RAM.  In some cases it is not enough to identify these mmio pages
by pfn_valid().  This patch adds checking the PageReserved as well.

Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7dd9b0b..5eb96c7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -76,6 +76,14 @@ static inline int valid_vcpu(int n)
 	return likely(n >= 0 && n < KVM_MAX_VCPUS);
 }
 
+static inline int is_mmio_pfn(pfn_t pfn)
+{
+	if (pfn_valid(pfn))
+		return PageReserved(pfn_to_page(pfn));
+
+	return true;
+}
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put()
  */
@@ -740,7 +748,7 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 		}
 
 		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-		BUG_ON(pfn_valid(pfn));
+		BUG_ON(!is_mmio_pfn(pfn));
 	} else
 		pfn = page_to_pfn(page[0]);
 
@@ -754,10 +762,10 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 	pfn_t pfn;
 
 	pfn = gfn_to_pfn(kvm, gfn);
-	if (pfn_valid(pfn))
+	if (!is_mmio_pfn(pfn))
 		return pfn_to_page(pfn);
 
-	WARN_ON(!pfn_valid(pfn));
+	WARN_ON(is_mmio_pfn(pfn));
 
 	get_page(bad_page);
 	return bad_page;
@@ -773,7 +781,7 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean);
 
 void kvm_release_pfn_clean(pfn_t pfn)
 {
-	if (pfn_valid(pfn))
+	if (!is_mmio_pfn(pfn))
 		put_page(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
@@ -799,7 +807,7 @@ EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
 
 void kvm_set_pfn_dirty(pfn_t pfn)
 {
-	if (pfn_valid(pfn)) {
+	if (!is_mmio_pfn(pfn)) {
 		struct page *page = pfn_to_page(pfn);
 		if (!PageReserved(page))
 			SetPageDirty(page);
@@ -809,14 +817,14 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
 
 void kvm_set_pfn_accessed(pfn_t pfn)
 {
-	if (pfn_valid(pfn))
+	if (!is_mmio_pfn(pfn))
 		mark_page_accessed(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
 
 void kvm_get_pfn(pfn_t pfn)
 {
-	if (pfn_valid(pfn))
+	if (!is_mmio_pfn(pfn))
 		get_page(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_get_pfn);
-- 
cgit v0.10.2


From 4d5c5d0fe89c921336b95f5e7e4f529a9df92f53 Mon Sep 17 00:00:00 2001
From: Ben-Ami Yassour <benami@il.ibm.com>
Date: Mon, 28 Jul 2008 19:26:26 +0300
Subject: KVM: pci device assignment

Based on a patch from: Amit Shah <amit.shah@qumranet.com>

This patch adds support for handling PCI devices that are assigned to
the guest.

The device to be assigned to the guest is registered in the host kernel
and interrupt delivery is handled.  If a device is already assigned, or
the device driver for it is still loaded on the host, the device
assignment is failed by conveying a -EBUSY reply to the userspace.

Devices that share their interrupt line are not supported at the moment.

By itself, this patch will not make devices work within the guest.
The VT-d extension is required to enable the device to perform DMA.
Another alternative is PVDMA.

Signed-off-by: Amit Shah <amit.shah@qumranet.com>
Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
Signed-off-by: Weidong Han <weidong.han@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 94a2165..a97157c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4,10 +4,14 @@
  * derived from drivers/kvm/kvm_main.c
  *
  * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
  *
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
  *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Amit Shah    <amit.shah@qumranet.com>
+ *   Ben-Ami Yassour <benami@il.ibm.com>
  *
  * This work is licensed under the terms of the GNU GPL, version 2.  See
  * the COPYING file in the top-level directory.
@@ -23,8 +27,10 @@
 #include "x86.h"
 
 #include <linux/clocksource.h>
+#include <linux/interrupt.h>
 #include <linux/kvm.h>
 #include <linux/fs.h>
+#include <linux/pci.h>
 #include <linux/vmalloc.h>
 #include <linux/module.h>
 #include <linux/mman.h>
@@ -98,6 +104,219 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ NULL }
 };
 
+struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
+						      int assigned_dev_id)
+{
+	struct list_head *ptr;
+	struct kvm_assigned_dev_kernel *match;
+
+	list_for_each(ptr, head) {
+		match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
+		if (match->assigned_dev_id == assigned_dev_id)
+			return match;
+	}
+	return NULL;
+}
+
+static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
+{
+	struct kvm_assigned_dev_kernel *assigned_dev;
+
+	assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
+				    interrupt_work);
+
+	/* This is taken to safely inject irq inside the guest. When
+	 * the interrupt injection (or the ioapic code) uses a
+	 * finer-grained lock, update this
+	 */
+	mutex_lock(&assigned_dev->kvm->lock);
+	kvm_set_irq(assigned_dev->kvm,
+		    assigned_dev->guest_irq, 1);
+	mutex_unlock(&assigned_dev->kvm->lock);
+	kvm_put_kvm(assigned_dev->kvm);
+}
+
+/* FIXME: Implement the OR logic needed to make shared interrupts on
+ * this line behave properly
+ */
+static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
+{
+	struct kvm_assigned_dev_kernel *assigned_dev =
+		(struct kvm_assigned_dev_kernel *) dev_id;
+
+	kvm_get_kvm(assigned_dev->kvm);
+	schedule_work(&assigned_dev->interrupt_work);
+	disable_irq_nosync(irq);
+	return IRQ_HANDLED;
+}
+
+/* Ack the irq line for an assigned device */
+static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+	struct kvm_assigned_dev_kernel *dev;
+
+	if (kian->gsi == -1)
+		return;
+
+	dev = container_of(kian, struct kvm_assigned_dev_kernel,
+			   ack_notifier);
+	kvm_set_irq(dev->kvm, dev->guest_irq, 0);
+	enable_irq(dev->host_irq);
+}
+
+static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
+				   struct kvm_assigned_irq
+				   *assigned_irq)
+{
+	int r = 0;
+	struct kvm_assigned_dev_kernel *match;
+
+	mutex_lock(&kvm->lock);
+
+	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_irq->assigned_dev_id);
+	if (!match) {
+		mutex_unlock(&kvm->lock);
+		return -EINVAL;
+	}
+
+	if (match->irq_requested) {
+		match->guest_irq = assigned_irq->guest_irq;
+		match->ack_notifier.gsi = assigned_irq->guest_irq;
+		mutex_unlock(&kvm->lock);
+		return 0;
+	}
+
+	INIT_WORK(&match->interrupt_work,
+		  kvm_assigned_dev_interrupt_work_handler);
+
+	if (irqchip_in_kernel(kvm)) {
+		if (assigned_irq->host_irq)
+			match->host_irq = assigned_irq->host_irq;
+		else
+			match->host_irq = match->dev->irq;
+		match->guest_irq = assigned_irq->guest_irq;
+		match->ack_notifier.gsi = assigned_irq->guest_irq;
+		match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
+		kvm_register_irq_ack_notifier(kvm, &match->ack_notifier);
+
+		/* Even though this is PCI, we don't want to use shared
+		 * interrupts. Sharing host devices with guest-assigned devices
+		 * on the same interrupt line is not a happy situation: there
+		 * are going to be long delays in accepting, acking, etc.
+		 */
+		if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0,
+				"kvm_assigned_device", (void *)match)) {
+			printk(KERN_INFO "%s: couldn't allocate irq for pv "
+			       "device\n", __func__);
+			r = -EIO;
+			goto out;
+		}
+	}
+
+	match->irq_requested = true;
+out:
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
+static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
+				      struct kvm_assigned_pci_dev *assigned_dev)
+{
+	int r = 0;
+	struct kvm_assigned_dev_kernel *match;
+	struct pci_dev *dev;
+
+	mutex_lock(&kvm->lock);
+
+	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_dev->assigned_dev_id);
+	if (match) {
+		/* device already assigned */
+		r = -EINVAL;
+		goto out;
+	}
+
+	match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
+	if (match == NULL) {
+		printk(KERN_INFO "%s: Couldn't allocate memory\n",
+		       __func__);
+		r = -ENOMEM;
+		goto out;
+	}
+	dev = pci_get_bus_and_slot(assigned_dev->busnr,
+				   assigned_dev->devfn);
+	if (!dev) {
+		printk(KERN_INFO "%s: host device not found\n", __func__);
+		r = -EINVAL;
+		goto out_free;
+	}
+	if (pci_enable_device(dev)) {
+		printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
+		r = -EBUSY;
+		goto out_put;
+	}
+	r = pci_request_regions(dev, "kvm_assigned_device");
+	if (r) {
+		printk(KERN_INFO "%s: Could not get access to device regions\n",
+		       __func__);
+		goto out_disable;
+	}
+	match->assigned_dev_id = assigned_dev->assigned_dev_id;
+	match->host_busnr = assigned_dev->busnr;
+	match->host_devfn = assigned_dev->devfn;
+	match->dev = dev;
+
+	match->kvm = kvm;
+
+	list_add(&match->list, &kvm->arch.assigned_dev_head);
+
+out:
+	mutex_unlock(&kvm->lock);
+	return r;
+out_disable:
+	pci_disable_device(dev);
+out_put:
+	pci_dev_put(dev);
+out_free:
+	kfree(match);
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
+static void kvm_free_assigned_devices(struct kvm *kvm)
+{
+	struct list_head *ptr, *ptr2;
+	struct kvm_assigned_dev_kernel *assigned_dev;
+
+	list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
+		assigned_dev = list_entry(ptr,
+					  struct kvm_assigned_dev_kernel,
+					  list);
+
+		if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested) {
+			free_irq(assigned_dev->host_irq,
+				 (void *)assigned_dev);
+
+			kvm_unregister_irq_ack_notifier(kvm,
+							&assigned_dev->
+							ack_notifier);
+		}
+
+		if (cancel_work_sync(&assigned_dev->interrupt_work))
+			/* We had pending work. That means we will have to take
+			 * care of kvm_put_kvm.
+			 */
+			kvm_put_kvm(kvm);
+
+		pci_release_regions(assigned_dev->dev);
+		pci_disable_device(assigned_dev->dev);
+		pci_dev_put(assigned_dev->dev);
+
+		list_del(&assigned_dev->list);
+		kfree(assigned_dev);
+	}
+}
 
 unsigned long segment_base(u16 selector)
 {
@@ -1766,6 +1985,28 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_ASSIGN_PCI_DEVICE: {
+		struct kvm_assigned_pci_dev assigned_dev;
+
+		r = -EFAULT;
+		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+			goto out;
+		r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
+		if (r)
+			goto out;
+		break;
+	}
+	case KVM_ASSIGN_IRQ: {
+		struct kvm_assigned_irq assigned_irq;
+
+		r = -EFAULT;
+		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+			goto out;
+		r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
+		if (r)
+			goto out;
+		break;
+	}
 	case KVM_GET_PIT: {
 		struct kvm_pit_state ps;
 		r = -EFAULT;
@@ -3945,6 +4186,7 @@ struct  kvm *kvm_arch_create_vm(void)
 		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
 	return kvm;
 }
@@ -3977,6 +4219,7 @@ static void kvm_free_vcpus(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+	kvm_free_assigned_devices(kvm);
 	kvm_free_pit(kvm);
 	kfree(kvm->arch.vpic);
 	kfree(kvm->arch.vioapic);
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index d451928..99dddfc 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -327,6 +327,21 @@ struct kvm_irq_ack_notifier {
 	void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
 };
 
+struct kvm_assigned_dev_kernel {
+	struct kvm_irq_ack_notifier ack_notifier;
+	struct work_struct interrupt_work;
+	struct list_head list;
+	struct kvm_assigned_pci_dev assigned_dev;
+	int assigned_dev_id;
+	int host_busnr;
+	int host_devfn;
+	int host_irq;
+	int guest_irq;
+	int irq_requested;
+	struct pci_dev *dev;
+	struct kvm *kvm;
+};
+
 struct kvm_arch{
 	int naliases;
 	struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
@@ -339,6 +354,7 @@ struct kvm_arch{
 	 * Hash table of struct kvm_mmu_page.
 	 */
 	struct list_head active_mmu_pages;
+	struct list_head assigned_dev_head;
 	struct kvm_pic *vpic;
 	struct kvm_ioapic *vioapic;
 	struct kvm_pit *vpit;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index d29b648..ef4bc6f 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -383,6 +383,7 @@ struct kvm_trace_rec {
 #define KVM_CAP_MP_STATE 14
 #define KVM_CAP_COALESCED_MMIO 15
 #define KVM_CAP_SYNC_MMU 16  /* Changes to host mmap are reflected in guest */
+#define KVM_CAP_DEVICE_ASSIGNMENT 17
 
 /*
  * ioctls for VM fds
@@ -412,6 +413,10 @@ struct kvm_trace_rec {
 			_IOW(KVMIO,  0x67, struct kvm_coalesced_mmio_zone)
 #define KVM_UNREGISTER_COALESCED_MMIO \
 			_IOW(KVMIO,  0x68, struct kvm_coalesced_mmio_zone)
+#define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \
+				   struct kvm_assigned_pci_dev)
+#define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \
+			    struct kvm_assigned_irq)
 
 /*
  * ioctls for vcpu fds
@@ -476,4 +481,18 @@ struct kvm_trace_rec {
 #define KVM_TRC_STLB_INVAL       (KVM_TRC_HANDLER + 0x18)
 #define KVM_TRC_PPC_INSTR        (KVM_TRC_HANDLER + 0x19)
 
+struct kvm_assigned_pci_dev {
+	__u32 assigned_dev_id;
+	__u32 busnr;
+	__u32 devfn;
+	__u32 flags;
+};
+
+struct kvm_assigned_irq {
+	__u32 assigned_dev_id;
+	__u32 host_irq;
+	__u32 guest_irq;
+	__u32 flags;
+};
+
 #endif
-- 
cgit v0.10.2


From f0d662759a2465babdba1160749c446648c9d159 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Mon, 11 Aug 2008 10:01:45 -0700
Subject: KVM: Reduce kvm stack usage in kvm_arch_vm_ioctl()

On my machine with gcc 3.4, kvm uses ~2k of stack in a few
select functions.  This is mostly because gcc fails to
notice that the different case: statements could have their
stack usage combined.  It overflows very nicely if interrupts
happen during one of these large uses.

This patch uses two methods for reducing stack usage.
1. dynamically allocate large objects instead of putting
   on the stack.
2. Use a union{} member for all of the case variables. This
   tricks gcc into combining them all into a single stack
   allocation. (There's also a comment on this)

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a97157c..87d4342 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1869,6 +1869,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	struct kvm *kvm = filp->private_data;
 	void __user *argp = (void __user *)arg;
 	int r = -EINVAL;
+	/*
+	 * This union makes it completely explicit to gcc-3.x
+	 * that these two variables' stack usage should be
+	 * combined, not added together.
+	 */
+	union {
+		struct kvm_pit_state ps;
+		struct kvm_memory_alias alias;
+	} u;
 
 	switch (ioctl) {
 	case KVM_SET_TSS_ADDR:
@@ -1900,17 +1909,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	case KVM_GET_NR_MMU_PAGES:
 		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
 		break;
-	case KVM_SET_MEMORY_ALIAS: {
-		struct kvm_memory_alias alias;
-
+	case KVM_SET_MEMORY_ALIAS:
 		r = -EFAULT;
-		if (copy_from_user(&alias, argp, sizeof alias))
+		if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
 			goto out;
-		r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
+		r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
 		if (r)
 			goto out;
 		break;
-	}
 	case KVM_CREATE_IRQCHIP:
 		r = -ENOMEM;
 		kvm->arch.vpic = kvm_create_pic(kvm);
@@ -1952,37 +1958,51 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	}
 	case KVM_GET_IRQCHIP: {
 		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
-		struct kvm_irqchip chip;
+		struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
 
-		r = -EFAULT;
-		if (copy_from_user(&chip, argp, sizeof chip))
+		r = -ENOMEM;
+		if (!chip)
 			goto out;
+		r = -EFAULT;
+		if (copy_from_user(chip, argp, sizeof *chip))
+			goto get_irqchip_out;
 		r = -ENXIO;
 		if (!irqchip_in_kernel(kvm))
-			goto out;
-		r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
+			goto get_irqchip_out;
+		r = kvm_vm_ioctl_get_irqchip(kvm, chip);
 		if (r)
-			goto out;
+			goto get_irqchip_out;
 		r = -EFAULT;
-		if (copy_to_user(argp, &chip, sizeof chip))
-			goto out;
+		if (copy_to_user(argp, chip, sizeof *chip))
+			goto get_irqchip_out;
 		r = 0;
+	get_irqchip_out:
+		kfree(chip);
+		if (r)
+			goto out;
 		break;
 	}
 	case KVM_SET_IRQCHIP: {
 		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
-		struct kvm_irqchip chip;
+		struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
 
-		r = -EFAULT;
-		if (copy_from_user(&chip, argp, sizeof chip))
+		r = -ENOMEM;
+		if (!chip)
 			goto out;
+		r = -EFAULT;
+		if (copy_from_user(chip, argp, sizeof *chip))
+			goto set_irqchip_out;
 		r = -ENXIO;
 		if (!irqchip_in_kernel(kvm))
-			goto out;
-		r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
+			goto set_irqchip_out;
+		r = kvm_vm_ioctl_set_irqchip(kvm, chip);
 		if (r)
-			goto out;
+			goto set_irqchip_out;
 		r = 0;
+	set_irqchip_out:
+		kfree(chip);
+		if (r)
+			goto out;
 		break;
 	}
 	case KVM_ASSIGN_PCI_DEVICE: {
@@ -2008,31 +2028,29 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_GET_PIT: {
-		struct kvm_pit_state ps;
 		r = -EFAULT;
-		if (copy_from_user(&ps, argp, sizeof ps))
+		if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
 			goto out;
 		r = -ENXIO;
 		if (!kvm->arch.vpit)
 			goto out;
-		r = kvm_vm_ioctl_get_pit(kvm, &ps);
+		r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
 		if (r)
 			goto out;
 		r = -EFAULT;
-		if (copy_to_user(argp, &ps, sizeof ps))
+		if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
 			goto out;
 		r = 0;
 		break;
 	}
 	case KVM_SET_PIT: {
-		struct kvm_pit_state ps;
 		r = -EFAULT;
-		if (copy_from_user(&ps, argp, sizeof ps))
+		if (copy_from_user(&u.ps, argp, sizeof u.ps))
 			goto out;
 		r = -ENXIO;
 		if (!kvm->arch.vpit)
 			goto out;
-		r = kvm_vm_ioctl_set_pit(kvm, &ps);
+		r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
 		if (r)
 			goto out;
 		r = 0;
-- 
cgit v0.10.2


From fa3795a7308df099f0f2c9e5ca2c20a5ff65bdc4 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Mon, 11 Aug 2008 10:01:46 -0700
Subject: KVM: Reduce stack usage in kvm_vcpu_ioctl()

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5eb96c7..0309571 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1126,6 +1126,8 @@ static long kvm_vcpu_ioctl(struct file *filp,
 	struct kvm_vcpu *vcpu = filp->private_data;
 	void __user *argp = (void __user *)arg;
 	int r;
+	struct kvm_fpu *fpu = NULL;
+	struct kvm_sregs *kvm_sregs = NULL;
 
 	if (vcpu->kvm->mm != current->mm)
 		return -EIO;
@@ -1173,25 +1175,28 @@ out_free2:
 		break;
 	}
 	case KVM_GET_SREGS: {
-		struct kvm_sregs kvm_sregs;
-
-		memset(&kvm_sregs, 0, sizeof kvm_sregs);
-		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
+		kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!kvm_sregs)
+			goto out;
+		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
 		if (r)
 			goto out;
 		r = -EFAULT;
-		if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
+		if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
 			goto out;
 		r = 0;
 		break;
 	}
 	case KVM_SET_SREGS: {
-		struct kvm_sregs kvm_sregs;
-
+		kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!kvm_sregs)
+			goto out;
 		r = -EFAULT;
-		if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
+		if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
 			goto out;
-		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
+		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
 		if (r)
 			goto out;
 		r = 0;
@@ -1272,25 +1277,28 @@ out_free2:
 		break;
 	}
 	case KVM_GET_FPU: {
-		struct kvm_fpu fpu;
-
-		memset(&fpu, 0, sizeof fpu);
-		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu);
+		fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!fpu)
+			goto out;
+		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
 		if (r)
 			goto out;
 		r = -EFAULT;
-		if (copy_to_user(argp, &fpu, sizeof fpu))
+		if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
 			goto out;
 		r = 0;
 		break;
 	}
 	case KVM_SET_FPU: {
-		struct kvm_fpu fpu;
-
+		fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!fpu)
+			goto out;
 		r = -EFAULT;
-		if (copy_from_user(&fpu, argp, sizeof fpu))
+		if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
 			goto out;
-		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu);
+		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
 		if (r)
 			goto out;
 		r = 0;
@@ -1300,6 +1308,8 @@ out_free2:
 		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
 	}
 out:
+	kfree(fpu);
+	kfree(kvm_sregs);
 	return r;
 }
 
-- 
cgit v0.10.2


From b772ff362ec6b821c8a5227a3355e263f917bfad Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Mon, 11 Aug 2008 10:01:47 -0700
Subject: KVM: Reduce stack usage in kvm_arch_vcpu_ioctl()

[sheng: fix KVM_GET_LAPIC using wrong size]

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 87d4342..f1b0223 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1542,28 +1542,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	struct kvm_vcpu *vcpu = filp->private_data;
 	void __user *argp = (void __user *)arg;
 	int r;
+	struct kvm_lapic_state *lapic = NULL;
 
 	switch (ioctl) {
 	case KVM_GET_LAPIC: {
-		struct kvm_lapic_state lapic;
+		lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
 
-		memset(&lapic, 0, sizeof lapic);
-		r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
+		r = -ENOMEM;
+		if (!lapic)
+			goto out;
+		r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
 		if (r)
 			goto out;
 		r = -EFAULT;
-		if (copy_to_user(argp, &lapic, sizeof lapic))
+		if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
 			goto out;
 		r = 0;
 		break;
 	}
 	case KVM_SET_LAPIC: {
-		struct kvm_lapic_state lapic;
-
+		lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!lapic)
+			goto out;
 		r = -EFAULT;
-		if (copy_from_user(&lapic, argp, sizeof lapic))
+		if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
 			goto out;
-		r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);;
+		r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
 		if (r)
 			goto out;
 		r = 0;
@@ -1661,6 +1666,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 	}
 out:
+	if (lapic)
+		kfree(lapic);
 	return r;
 }
 
-- 
cgit v0.10.2


From 6ad18fba05228fb1d47cdbc0339fe8b3fca1ca26 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Mon, 11 Aug 2008 10:01:49 -0700
Subject: KVM: Reduce stack usage in kvm_pv_mmu_op()

We're in a hot path.  We can't use kmalloc() because
it might impact performance.  So, we just stick the buffer that
we need into the kvm_vcpu_arch structure.  This is used very
often, so it is not really a waste.

We also have to move the buffer structure's definition to the
arch-specific x86 kvm header.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c3afbfe..171bcea 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -135,13 +135,6 @@ module_param(dbg, bool, 0644);
 #define ACC_USER_MASK    PT_USER_MASK
 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 
-struct kvm_pv_mmu_op_buffer {
-	void *ptr;
-	unsigned len;
-	unsigned processed;
-	char buf[512] __aligned(sizeof(long));
-};
-
 struct kvm_rmap_desc {
 	u64 *shadow_ptes[RMAP_EXT];
 	struct kvm_rmap_desc *more;
@@ -2292,18 +2285,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
 		  gpa_t addr, unsigned long *ret)
 {
 	int r;
-	struct kvm_pv_mmu_op_buffer buffer;
+	struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
 
-	buffer.ptr = buffer.buf;
-	buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf);
-	buffer.processed = 0;
+	buffer->ptr = buffer->buf;
+	buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
+	buffer->processed = 0;
 
-	r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len);
+	r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
 	if (r)
 		goto out;
 
-	while (buffer.len) {
-		r = kvm_pv_mmu_op_one(vcpu, &buffer);
+	while (buffer->len) {
+		r = kvm_pv_mmu_op_one(vcpu, buffer);
 		if (r < 0)
 			goto out;
 		if (r == 0)
@@ -2312,7 +2305,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
 
 	r = 1;
 out:
-	*ret = buffer.processed;
+	*ret = buffer->processed;
 	return r;
 }
 
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 99dddfc..9cb4b4d 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -201,6 +201,13 @@ struct kvm_mmu_page {
 	};
 };
 
+struct kvm_pv_mmu_op_buffer {
+	void *ptr;
+	unsigned len;
+	unsigned processed;
+	char buf[512] __aligned(sizeof(long));
+};
+
 /*
  * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
  * 32-bit).  The kvm_mmu structure abstracts the details of the current mmu
@@ -248,6 +255,9 @@ struct kvm_vcpu_arch {
 	bool tpr_access_reporting;
 
 	struct kvm_mmu mmu;
+	/* only needed in kvm_pv_mmu_op() path, but it's hot so
+	 * put it here to avoid allocation */
+	struct kvm_pv_mmu_op_buffer mmu_op_buffer;
 
 	struct kvm_mmu_memory_cache mmu_pte_chain_cache;
 	struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
-- 
cgit v0.10.2


From 464d17c8b747deb77d1bf8c14cc4f28aab2a4952 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng.yang@intel.com>
Date: Wed, 13 Aug 2008 14:10:33 +0800
Subject: KVM: VMX: Clean up magic number 0x66 in init_rmode_tss

Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ddb49e3..229e2d0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1732,7 +1732,8 @@ static int init_rmode_tss(struct kvm *kvm)
 	if (r < 0)
 		goto out;
 	data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
-	r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
+	r = kvm_write_guest_page(kvm, fn++, &data,
+			TSS_IOPB_BASE_OFFSET, sizeof(u16));
 	if (r < 0)
 		goto out;
 	r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
-- 
cgit v0.10.2


From 8349b5cd816adf102c078553ec8ca63b3beb457c Mon Sep 17 00:00:00 2001
From: Ben-Ami Yassour <benami@il.ibm.com>
Date: Tue, 5 Aug 2008 15:30:13 +0300
Subject: KVM: remove unused field from the assigned dev struct

Remove unused field: struct kvm_assigned_pci_dev assigned_dev
from struct: struct kvm_assigned_dev_kernel

Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 9cb4b4d..225fdb8 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -341,7 +341,6 @@ struct kvm_assigned_dev_kernel {
 	struct kvm_irq_ack_notifier ack_notifier;
 	struct work_struct interrupt_work;
 	struct list_head list;
-	struct kvm_assigned_pci_dev assigned_dev;
 	int assigned_dev_id;
 	int host_busnr;
 	int host_devfn;
-- 
cgit v0.10.2


From 29415c37f043d1d54dcf356601d738ff6633b72b Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Fri, 1 Aug 2008 20:09:13 -0300
Subject: KVM: set debug registers after "schedulable" section

The vcpu thread can be preempted after the guest_debug_pre() callback,
resulting in invalid debug registers on the new vcpu.

Move it inside the non-preemptable section.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f1b0223..4a03375 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3113,10 +3113,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	down_read(&vcpu->kvm->slots_lock);
 	vapic_enter(vcpu);
 
-preempted:
-	if (vcpu->guest_debug.enabled)
-		kvm_x86_ops->guest_debug_pre(vcpu);
-
 again:
 	if (vcpu->requests)
 		if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -3170,6 +3166,9 @@ again:
 		goto out;
 	}
 
+	if (vcpu->guest_debug.enabled)
+		kvm_x86_ops->guest_debug_pre(vcpu);
+
 	vcpu->guest_mode = 1;
 	/*
 	 * Make sure that guest_mode assignment won't happen after
@@ -3244,7 +3243,7 @@ out:
 	if (r > 0) {
 		kvm_resched(vcpu);
 		down_read(&vcpu->kvm->slots_lock);
-		goto preempted;
+		goto again;
 	}
 
 	post_kvm_run_save(vcpu, kvm_run);
-- 
cgit v0.10.2


From ecfc79c700b02c5ad1ccae58718015caa84824be Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 14 Aug 2008 11:13:16 +0300
Subject: KVM: VMX: Use interrupt queue for !irqchip_in_kernel

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 229e2d0..81db7d4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2173,7 +2173,7 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
 	clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
 	if (!vcpu->arch.irq_pending[word_index])
 		clear_bit(word_index, &vcpu->arch.irq_summary);
-	vmx_inject_irq(vcpu, irq);
+	kvm_queue_interrupt(vcpu, irq);
 }
 
 
@@ -2187,13 +2187,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 		 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
 
 	if (vcpu->arch.interrupt_window_open &&
-	    vcpu->arch.irq_summary &&
-	    !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
-		/*
-		 * If interrupts enabled, and not blocked by sti or mov ss. Good.
-		 */
+	    vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
 		kvm_do_inject_irq(vcpu);
 
+	if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending)
+		vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	if (!vcpu->arch.interrupt_window_open &&
 	    (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
-- 
cgit v0.10.2


From 8ceed34744f81c4a33d68ab825fd9ad3dd5f5505 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 14 Aug 2008 21:25:47 +0300
Subject: KVM: Simplify exception entries by using __ASM_SIZE and _ASM_PTR

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 225fdb8..b6d26b8 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -734,15 +734,6 @@ enum {
 	TASK_SWITCH_GATE = 3,
 };
 
-
-#ifdef CONFIG_64BIT
-# define KVM_EX_ENTRY ".quad"
-# define KVM_EX_PUSH "pushq"
-#else
-# define KVM_EX_ENTRY ".long"
-# define KVM_EX_PUSH "pushl"
-#endif
-
 /*
  * Hardware virtualization extension instructions may fault if a
  * reboot turns off virtualization while processes are running.
@@ -754,11 +745,11 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
 	"666: " insn "\n\t" \
 	".pushsection .fixup, \"ax\" \n" \
 	"667: \n\t" \
-	KVM_EX_PUSH " $666b \n\t" \
+	__ASM_SIZE(push) " $666b \n\t"	      \
 	"jmp kvm_handle_fault_on_reboot \n\t" \
 	".popsection \n\t" \
 	".pushsection __ex_table, \"a\" \n\t" \
-	KVM_EX_ENTRY " 666b, 667b \n\t" \
+	_ASM_PTR " 666b, 667b \n\t" \
 	".popsection"
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
-- 
cgit v0.10.2


From 85428ac7c39ab5fff23b5d14ccb32941e9401285 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Thu, 14 Aug 2008 20:53:25 -0300
Subject: KVM: fix i8259 reset irq acking

The irq ack during pic reset has three problems:

- Ignores slave/master PIC, using gsi 0-8 for both.
- Generates an ACK even if the APIC is in control.
- Depends upon IMR being clear, which is broken if the irq was masked
at the time it was generated.

The last one causes the BIOS to hang after the first reboot of
Windows installation, since PIT interrupts stop.

[avi: fix check whether pic interrupts are seen by cpu]

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index de70499..71e3eee 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -195,13 +195,19 @@ int kvm_pic_read_irq(struct kvm *kvm)
 
 void kvm_pic_reset(struct kvm_kpic_state *s)
 {
-	int irq;
+	int irq, irqbase;
 	struct kvm *kvm = s->pics_state->irq_request_opaque;
+	struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
 
-	for (irq = 0; irq < PIC_NUM_PINS; irq++) {
-		if (!(s->imr & (1 << irq)) && (s->irr & (1 << irq) ||
-		    s->isr & (1 << irq)))
-			kvm_notify_acked_irq(kvm, irq);
+	if (s == &s->pics_state->pics[0])
+		irqbase = 0;
+	else
+		irqbase = 8;
+
+	for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
+		if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
+			if (s->irr & (1 << irq) || s->isr & (1 << irq))
+				kvm_notify_acked_irq(kvm, irq+irqbase);
 	}
 	s->last_irr = 0;
 	s->irr = 0;
-- 
cgit v0.10.2


From dc7404cea34ef997dfe89ca94d16358e9d29c8d8 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Sun, 17 Aug 2008 16:03:46 +0300
Subject: KVM: Handle spurious acks for PIT interrupts

Spurious acks can be generated, for example if the PIC is being reset.
Handle those acks gracefully rather than flooding the log with warnings.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 7d04dd3..c842060 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -228,7 +228,7 @@ void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
 						 irq_ack_notifier);
 	spin_lock(&ps->inject_lock);
 	if (atomic_dec_return(&ps->pit_timer.pending) < 0)
-		WARN_ON(1);
+		atomic_inc(&ps->pit_timer.pending);
 	ps->irq_ack = 1;
 	spin_unlock(&ps->inject_lock);
 }
-- 
cgit v0.10.2


From 6762b7299aa115e11815decd1fd982d015f09615 Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@qumranet.com>
Date: Wed, 13 Aug 2008 16:22:37 +0300
Subject: KVM: Device assignment: Check for privileges before assigning irq

Even though we don't share irqs at the moment, we should ensure
regular user processes don't try to allocate system resources.

We check for capability to access IO devices (CAP_SYS_RAWIO) before
we request_irq on behalf of the guest.

Noticed by Avi.

Signed-off-by: Amit Shah <amit.shah@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4a03375..fffdf4f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -191,6 +191,11 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 		  kvm_assigned_dev_interrupt_work_handler);
 
 	if (irqchip_in_kernel(kvm)) {
+		if (!capable(CAP_SYS_RAWIO)) {
+			return -EPERM;
+			goto out;
+		}
+
 		if (assigned_irq->host_irq)
 			match->host_irq = assigned_irq->host_irq;
 		else
-- 
cgit v0.10.2


From 648dfaa7df2d3692db4e63dcb18dccb275d9c5a7 Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Sun, 17 Aug 2008 16:38:32 +0300
Subject: KVM: VMX: Add Guest State Validity Checks

This patch adds functions to check whether guest state is VMX compliant.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 81db7d4..e889b76 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1721,6 +1721,186 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
 	vmcs_writel(GUEST_GDTR_BASE, dt->base);
 }
 
+static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
+{
+	struct kvm_segment var;
+	u32 ar;
+
+	vmx_get_segment(vcpu, &var, seg);
+	ar = vmx_segment_access_rights(&var);
+
+	if (var.base != (var.selector << 4))
+		return false;
+	if (var.limit != 0xffff)
+		return false;
+	if (ar != 0xf3)
+		return false;
+
+	return true;
+}
+
+static bool code_segment_valid(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment cs;
+	unsigned int cs_rpl;
+
+	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+	cs_rpl = cs.selector & SELECTOR_RPL_MASK;
+
+	if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
+		return false;
+	if (!cs.s)
+		return false;
+	if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) {
+		if (cs.dpl > cs_rpl)
+			return false;
+	} else if (cs.type & AR_TYPE_CODE_MASK) {
+		if (cs.dpl != cs_rpl)
+			return false;
+	}
+	if (!cs.present)
+		return false;
+
+	/* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
+	return true;
+}
+
+static bool stack_segment_valid(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment ss;
+	unsigned int ss_rpl;
+
+	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
+	ss_rpl = ss.selector & SELECTOR_RPL_MASK;
+
+	if ((ss.type != 3) || (ss.type != 7))
+		return false;
+	if (!ss.s)
+		return false;
+	if (ss.dpl != ss_rpl) /* DPL != RPL */
+		return false;
+	if (!ss.present)
+		return false;
+
+	return true;
+}
+
+static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
+{
+	struct kvm_segment var;
+	unsigned int rpl;
+
+	vmx_get_segment(vcpu, &var, seg);
+	rpl = var.selector & SELECTOR_RPL_MASK;
+
+	if (!var.s)
+		return false;
+	if (!var.present)
+		return false;
+	if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
+		if (var.dpl < rpl) /* DPL < RPL */
+			return false;
+	}
+
+	/* TODO: Add other members to kvm_segment_field to allow checking for other access
+	 * rights flags
+	 */
+	return true;
+}
+
+static bool tr_valid(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment tr;
+
+	vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
+
+	if (tr.selector & SELECTOR_TI_MASK)	/* TI = 1 */
+		return false;
+	if ((tr.type != 3) || (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */
+		return false;
+	if (!tr.present)
+		return false;
+
+	return true;
+}
+
+static bool ldtr_valid(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment ldtr;
+
+	vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
+
+	if (ldtr.selector & SELECTOR_TI_MASK)	/* TI = 1 */
+		return false;
+	if (ldtr.type != 2)
+		return false;
+	if (!ldtr.present)
+		return false;
+
+	return true;
+}
+
+static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment cs, ss;
+
+	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
+
+	return ((cs.selector & SELECTOR_RPL_MASK) ==
+		 (ss.selector & SELECTOR_RPL_MASK));
+}
+
+/*
+ * Check if guest state is valid. Returns true if valid, false if
+ * not.
+ * We assume that registers are always usable
+ */
+static bool guest_state_valid(struct kvm_vcpu *vcpu)
+{
+	/* real mode guest state checks */
+	if (!(vcpu->arch.cr0 & X86_CR0_PE)) {
+		if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
+			return false;
+		if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
+			return false;
+		if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
+			return false;
+		if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
+			return false;
+		if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
+			return false;
+		if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
+			return false;
+	} else {
+	/* protected mode guest state checks */
+		if (!cs_ss_rpl_check(vcpu))
+			return false;
+		if (!code_segment_valid(vcpu))
+			return false;
+		if (!stack_segment_valid(vcpu))
+			return false;
+		if (!data_segment_valid(vcpu, VCPU_SREG_DS))
+			return false;
+		if (!data_segment_valid(vcpu, VCPU_SREG_ES))
+			return false;
+		if (!data_segment_valid(vcpu, VCPU_SREG_FS))
+			return false;
+		if (!data_segment_valid(vcpu, VCPU_SREG_GS))
+			return false;
+		if (!tr_valid(vcpu))
+			return false;
+		if (!ldtr_valid(vcpu))
+			return false;
+	}
+	/* TODO:
+	 * - Add checks on RIP
+	 * - Add checks on RFLAGS
+	 */
+
+	return true;
+}
+
 static int init_rmode_tss(struct kvm *kvm)
 {
 	gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
-- 
cgit v0.10.2


From 04fa4d32117b1a7773290fd59a97cf90cfc2a0d4 Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Sun, 17 Aug 2008 16:39:48 +0300
Subject: KVM: VMX: Add module parameter and emulation flag.

The patch adds the module parameter required to enable emulating invalid
guest state, as well as the emulation_required flag used to drive
emulation whenever needed.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e889b76..7c5f611 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -49,6 +49,9 @@ module_param(flexpriority_enabled, bool, 0);
 static int enable_ept = 1;
 module_param(enable_ept, bool, 0);
 
+static int emulate_invalid_guest_state = 0;
+module_param(emulate_invalid_guest_state, bool, 0);
+
 struct vmcs {
 	u32 revision_id;
 	u32 abort;
@@ -86,6 +89,7 @@ struct vcpu_vmx {
 		} irq;
 	} rmode;
 	int vpid;
+	bool emulation_required;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
-- 
cgit v0.10.2


From ea953ef0ca84e778187905177e2a789a1974837b Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Sun, 17 Aug 2008 16:47:05 +0300
Subject: KVM: VMX: Add invalid guest state handler

This adds the invalid guest state handler function which invokes the x86
emulator until getting the guest to a VMX-friendly state.

[avi: leave atomic context if scheduling]
[guillaume: return to atomic context correctly]

Signed-off-by: Laurent Vivier <laurent.vivier@bull.net>
Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7c5f611..eae1f2c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2892,6 +2892,43 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	return 1;
 }
 
+static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
+				struct kvm_run *kvm_run)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int err;
+
+	preempt_enable();
+	local_irq_enable();
+
+	while (!guest_state_valid(vcpu)) {
+		err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+
+		switch (err) {
+			case EMULATE_DONE:
+				break;
+			case EMULATE_DO_MMIO:
+				kvm_report_emulation_failure(vcpu, "mmio");
+				/* TODO: Handle MMIO */
+				return;
+			default:
+				kvm_report_emulation_failure(vcpu, "emulation failure");
+				return;
+		}
+
+		if (signal_pending(current))
+			break;
+		if (need_resched())
+			schedule();
+	}
+
+	local_irq_disable();
+	preempt_disable();
+
+	/* Guest state should be valid now, no more emulation should be needed */
+	vmx->emulation_required = 0;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
-- 
cgit v0.10.2


From a89a8fb93ba63d4ad39db97c90997c409c87ccb9 Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Sun, 17 Aug 2008 16:42:16 +0300
Subject: KVM: VMX: Modify mode switching and vmentry functions

This patch modifies mode switching and vmentry function in order to
drive invalid guest state emulation.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index eae1f2c..9840f37 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1298,7 +1298,9 @@ static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
 static void enter_pmode(struct kvm_vcpu *vcpu)
 {
 	unsigned long flags;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	vmx->emulation_required = 1;
 	vcpu->arch.rmode.active = 0;
 
 	vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
@@ -1315,6 +1317,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
 
 	update_exception_bitmap(vcpu);
 
+	if (emulate_invalid_guest_state)
+		return;
+
 	fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
 	fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
 	fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
@@ -1355,7 +1360,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
 static void enter_rmode(struct kvm_vcpu *vcpu)
 {
 	unsigned long flags;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	vmx->emulation_required = 1;
 	vcpu->arch.rmode.active = 1;
 
 	vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
@@ -1377,6 +1384,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 	vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
 	update_exception_bitmap(vcpu);
 
+	if (emulate_invalid_guest_state)
+		goto continue_rmode;
+
 	vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
 	vmcs_write32(GUEST_SS_LIMIT, 0xffff);
 	vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
@@ -1392,6 +1402,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 	fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
 	fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
 
+continue_rmode:
 	kvm_mmu_reset_context(vcpu);
 	init_rmode(vcpu->kvm);
 }
@@ -2317,6 +2328,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	ret = 0;
 
+	/* HACK: Don't enable emulation on guest boot/reset */
+	vmx->emulation_required = 0;
+
 out:
 	up_read(&vcpu->kvm->slots_lock);
 	return ret;
@@ -3190,6 +3204,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 intr_info;
 
+	/* Handle invalid guest state instead of entering VMX */
+	if (vmx->emulation_required && emulate_invalid_guest_state) {
+		handle_invalid_guest_state(vcpu, kvm_run);
+		return;
+	}
+
 	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
 		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
 	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
-- 
cgit v0.10.2


From 94c935a1ee7a9c134f0ebed656384186619d05cb Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@qumranet.com>
Date: Mon, 18 Aug 2008 13:11:46 +0300
Subject: KVM: SVM: Fix typo

Fix typo in as-yet unused macro definition.

Signed-off-by: Amit Shah <amit.shah@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 179c2e0..be86c09 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -44,7 +44,7 @@ MODULE_LICENSE("GPL");
 
 #define SVM_FEATURE_NPT  (1 << 0)
 #define SVM_FEATURE_LBRV (1 << 1)
-#define SVM_DEATURE_SVML (1 << 2)
+#define SVM_FEATURE_SVML (1 << 2)
 
 #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
 
-- 
cgit v0.10.2


From 29c8fa32c5d1e2d26d53ad9467b3a13130014cdf Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@qumranet.com>
Date: Mon, 18 Aug 2008 15:07:05 +0300
Subject: KVM: Use kvm_set_irq to inject interrupts

... instead of using the pic and ioapic variants

Signed-off-by: Amit Shah <amit.shah@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c842060..fdaa0f0 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -596,10 +596,8 @@ void kvm_free_pit(struct kvm *kvm)
 static void __inject_pit_timer_intr(struct kvm *kvm)
 {
 	mutex_lock(&kvm->lock);
-	kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1);
-	kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0);
-	kvm_pic_set_irq(pic_irqchip(kvm), 0, 1);
-	kvm_pic_set_irq(pic_irqchip(kvm), 0, 0);
+	kvm_set_irq(kvm, 0, 1);
+	kvm_set_irq(kvm, 0, 0);
 	mutex_unlock(&kvm->lock);
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fffdf4f..5b3c882 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1956,13 +1956,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			goto out;
 		if (irqchip_in_kernel(kvm)) {
 			mutex_lock(&kvm->lock);
-			if (irq_event.irq < 16)
-				kvm_pic_set_irq(pic_irqchip(kvm),
-					irq_event.irq,
-					irq_event.level);
-			kvm_ioapic_set_irq(kvm->arch.vioapic,
-					irq_event.irq,
-					irq_event.level);
+			kvm_set_irq(kvm, irq_event.irq, irq_event.level);
 			mutex_unlock(&kvm->lock);
 			r = 0;
 		}
-- 
cgit v0.10.2


From ee032c993edd34e0bdf64dab06a55d0e08a4eeb9 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Mon, 11 Aug 2008 16:54:20 -0700
Subject: KVM: make irq ack notifier functions static

sparse says:

arch/x86/kvm/x86.c:107:32: warning: symbol 'kvm_find_assigned_dev' was not declared. Should it be static?
arch/x86/kvm/i8254.c:225:6: warning: symbol 'kvm_pit_ack_irq' was not declared. Should it be static?

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index fdaa0f0..4cb4430 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -222,7 +222,7 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
+static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
 {
 	struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
 						 irq_ack_notifier);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5b3c882..22edd95 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -104,7 +104,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ NULL }
 };
 
-struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
+static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
 						      int assigned_dev_id)
 {
 	struct list_head *ptr;
-- 
cgit v0.10.2


From 26815a648e1ec2b338a63a2bc301dcf449b93e5a Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Tue, 19 Aug 2008 20:48:03 +0800
Subject: KVM: ia64: add a dummy irq ack notification

Before enabling notify_acked_irq for ia64, leave the related APIs as
nop-op first.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/ia64/kvm/irq.h b/arch/ia64/kvm/irq.h
new file mode 100644
index 0000000..f2e6545
--- /dev/null
+++ b/arch/ia64/kvm/irq.h
@@ -0,0 +1,32 @@
+/*
+ * irq.h: In-kernel interrupt controller related definitions
+ * Copyright (c) 2008, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Authors:
+ *   Xiantao Zhang <xiantao.zhang@intel.com>
+ *
+ */
+
+#ifndef __IRQ_H
+#define __IRQ_H
+
+struct kvm;
+
+static inline void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi)
+{
+}
+
+#endif
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 515cd7c..53772bb 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -386,7 +386,7 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
 		break;
 #ifdef	CONFIG_IA64
 	case IOAPIC_REG_EOI:
-		kvm_ioapic_update_eoi(ioapic->kvm, data);
+		kvm_ioapic_update_eoi(ioapic->kvm, data, IOAPIC_LEVEL_TRIG);
 		break;
 #endif
 
-- 
cgit v0.10.2


From 5706be0dafd6f42852f85fbae292301dcad4ccec Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 20 Aug 2008 15:07:31 +0300
Subject: KVM: VMX: Change cs reset state to be a data segment

Real mode cs is a data segment, not a code segment.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9840f37..6aa305a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2239,6 +2239,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	fx_init(&vmx->vcpu);
 
+	seg_setup(VCPU_SREG_CS);
 	/*
 	 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
 	 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
@@ -2250,8 +2251,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
 		vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
 	}
-	vmcs_write32(GUEST_CS_LIMIT, 0xffff);
-	vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
 
 	seg_setup(VCPU_SREG_DS);
 	seg_setup(VCPU_SREG_ES);
-- 
cgit v0.10.2


From a16b20da879430fdf245ed45461ed40ffef8db3c Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 20 Aug 2008 15:48:27 +0300
Subject: KVM: VMX: Change segment dpl at reset to 3

This is more emulation friendly, if not 100% correct.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6aa305a..71e57ae 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1991,7 +1991,7 @@ static void seg_setup(int seg)
 	vmcs_write16(sf->selector, 0);
 	vmcs_writel(sf->base, 0);
 	vmcs_write32(sf->limit, 0xffff);
-	vmcs_write32(sf->ar_bytes, 0x93);
+	vmcs_write32(sf->ar_bytes, 0xf3);
 }
 
 static int alloc_apic_access_page(struct kvm *kvm)
-- 
cgit v0.10.2


From f4bbd9aaaae23007e4d79536d35a30cbbb11d407 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 20 Aug 2008 15:51:42 +0300
Subject: KVM: Load real mode segments correctly

Real mode segments to not reference the GDT or LDT; they simply compute
base = selector * 16.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 22edd95..bfc7c33 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3588,11 +3588,33 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
+{
+	struct kvm_segment segvar = {
+		.base = selector << 4,
+		.limit = 0xffff,
+		.selector = selector,
+		.type = 3,
+		.present = 1,
+		.dpl = 3,
+		.db = 0,
+		.s = 1,
+		.l = 0,
+		.g = 0,
+		.avl = 0,
+		.unusable = 0,
+	};
+	kvm_x86_ops->set_segment(vcpu, &segvar, seg);
+	return 0;
+}
+
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 				int type_bits, int seg)
 {
 	struct kvm_segment kvm_seg;
 
+	if (!(vcpu->arch.cr0 & X86_CR0_PE))
+		return kvm_load_realmode_segment(vcpu, selector, seg);
 	if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
 		return 1;
 	kvm_seg.type |= type_bits;
-- 
cgit v0.10.2


From 41afa025878bc31c9c4e18415fba2435fe035376 Mon Sep 17 00:00:00 2001
From: roel kluin <roel.kluin@gmail.com>
Date: Mon, 18 Aug 2008 21:25:01 -0400
Subject: KVM: x86 emulator: remove duplicate SrcImm

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index d5da7f1..5d6c144 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -269,7 +269,7 @@ static u16 group_table[] = {
 	ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
 	0, 0, 0, 0,
 	[Group3*8] =
-	DstMem | SrcImm | ModRM | SrcImm, 0,
+	DstMem | SrcImm | ModRM, 0,
 	DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
 	0, 0, 0, 0,
 	[Group4*8] =
-- 
cgit v0.10.2


From 6eb06cb2863a2ff5704b501f1699216180e790b5 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 21 Aug 2008 17:41:39 +0300
Subject: KVM: x86 emulator: remove bad ByteOp specifier from NEG descriptor

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 5d6c144..ae30435a 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -270,7 +270,7 @@ static u16 group_table[] = {
 	0, 0, 0, 0,
 	[Group3*8] =
 	DstMem | SrcImm | ModRM, 0,
-	DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
 	0, 0, 0, 0,
 	[Group4*8] =
 	ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
-- 
cgit v0.10.2


From 135f8c2b078533cc74e75f696e73d47304a61125 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 21 Aug 2008 17:49:56 +0300
Subject: KVM: MMU: Move SHADOW_PT_INDEX to mmu.c

It is not specific to the paging mode, so can be made global (and reusable).

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 171bcea..51d4cd7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -135,6 +135,8 @@ module_param(dbg, bool, 0644);
 #define ACC_USER_MASK    PT_USER_MASK
 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 
+#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+
 struct kvm_rmap_desc {
 	u64 *shadow_ptes[RMAP_EXT];
 	struct kvm_rmap_desc *more;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4a814bf..ebb26a0 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -29,7 +29,6 @@
 	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
 	#define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
 	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
-	#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 	#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
 	#define PT_LEVEL_BITS PT64_LEVEL_BITS
 	#ifdef CONFIG_X86_64
@@ -46,7 +45,6 @@
 	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
 	#define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
 	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
-	#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 	#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
 	#define PT_LEVEL_BITS PT32_LEVEL_BITS
 	#define PT_MAX_FULL_LEVELS 2
@@ -504,7 +502,6 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
 #undef FNAME
 #undef PT_BASE_ADDR_MASK
 #undef PT_INDEX
-#undef SHADOW_PT_INDEX
 #undef PT_LEVEL_MASK
 #undef PT_DIR_BASE_ADDR_MASK
 #undef PT_LEVEL_BITS
-- 
cgit v0.10.2


From 6e37d3dc3e358dbf907f8b96a51282966934124b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Fri, 22 Aug 2008 19:14:17 +0300
Subject: KVM: MMU: Unify direct map 4K and large page paths

The two paths are equivalent except for one argument, which is already
available.  Merge the two codepaths.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 51d4cd7..3ee856f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1240,15 +1240,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 		ASSERT(VALID_PAGE(table_addr));
 		table = __va(table_addr);
 
-		if (level == 1) {
+		if (level == 1 || (largepage && level == 2)) {
 			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, 0, gfn, pfn, false);
-			return pt_write;
-		}
-
-		if (largepage && level == 2) {
-			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, 1, gfn, pfn, false);
+				     0, write, 1, &pt_write, largepage,
+				     gfn, pfn, false);
 			return pt_write;
 		}
 
-- 
cgit v0.10.2


From 0be9e929e398d6da6406183a8732dbfd0937fafe Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Tue, 26 Aug 2008 08:58:53 +0800
Subject: KVM: ia64: Enable virtio driver for ia64 in Kconfig

kvm/ia64 uses the virtio drivers to optimize its I/O subsytem.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig
index 7914e48..8e99fed 100644
--- a/arch/ia64/kvm/Kconfig
+++ b/arch/ia64/kvm/Kconfig
@@ -46,4 +46,6 @@ config KVM_INTEL
 config KVM_TRACE
        bool
 
+source drivers/virtio/Kconfig
+
 endif # VIRTUALIZATION
-- 
cgit v0.10.2


From 6c41f428b72afe5a581b967590c12538db31d399 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Tue, 26 Aug 2008 16:16:08 +0300
Subject: KVM: MMU: Infer shadow root level in direct_map()

In all cases the shadow root level is available in mmu.shadow_root_level,
so there is no need to pass it as a parameter.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3ee856f..72f739a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1227,11 +1227,11 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 }
 
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
-			   int largepage, gfn_t gfn, pfn_t pfn,
-			   int level)
+			int largepage, gfn_t gfn, pfn_t pfn)
 {
 	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
 	int pt_write = 0;
+	int level = vcpu->arch.mmu.shadow_root_level;
 
 	for (; ; level--) {
 		u32 index = PT64_INDEX(v, level);
@@ -1299,8 +1299,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
-	r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
-			 PT32E_ROOT_LEVEL);
+	r = __direct_map(vcpu, v, write, largepage, gfn, pfn);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 
@@ -1455,7 +1454,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
 	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
-			 largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
+			 largepage, gfn, pfn);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return r;
-- 
cgit v0.10.2


From 3d000db5688c8beff6319fb9ff4b98dcac32f798 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Fri, 22 Aug 2008 19:24:38 +0300
Subject: KVM: MMU: Add generic shadow walker

We currently walk the shadow page tables in two places: direct map (for
real mode and two dimensional paging) and paging mode shadow.  Since we
anticipate requiring a third walk (for invlpg), it makes sense to have
a generic facility for shadow walk.

This patch adds such a shadow walker, walks the page tables and calls a
method for every spte encountered.  The method can examine the spte,
modify it, or even instantiate it.  The walk can be aborted by returning
nonzero from the method.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 72f739a..8b95cf7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -142,6 +142,11 @@ struct kvm_rmap_desc {
 	struct kvm_rmap_desc *more;
 };
 
+struct kvm_shadow_walk {
+	int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu,
+		     gva_t addr, u64 *spte, int level);
+};
+
 static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *rmap_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
@@ -935,6 +940,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	return sp;
 }
 
+static int walk_shadow(struct kvm_shadow_walk *walker,
+		       struct kvm_vcpu *vcpu, gva_t addr)
+{
+	hpa_t shadow_addr;
+	int level;
+	int r;
+	u64 *sptep;
+	unsigned index;
+
+	shadow_addr = vcpu->arch.mmu.root_hpa;
+	level = vcpu->arch.mmu.shadow_root_level;
+	if (level == PT32E_ROOT_LEVEL) {
+		shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+		shadow_addr &= PT64_BASE_ADDR_MASK;
+		--level;
+	}
+
+	while (level >= PT_PAGE_TABLE_LEVEL) {
+		index = SHADOW_PT_INDEX(addr, level);
+		sptep = ((u64 *)__va(shadow_addr)) + index;
+		r = walker->entry(walker, vcpu, addr, sptep, level);
+		if (r)
+			return r;
+		shadow_addr = *sptep & PT64_BASE_ADDR_MASK;
+		--level;
+	}
+	return 0;
+}
+
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 					 struct kvm_mmu_page *sp)
 {
-- 
cgit v0.10.2


From 140754bc80e1cdbf2d14cdb10d900da1f7718e7b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Fri, 22 Aug 2008 19:28:04 +0300
Subject: KVM: MMU: Convert direct maps to use the generic shadow walker

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8b95cf7..a1ca4ff 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1260,49 +1260,66 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 {
 }
 
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
-			int largepage, gfn_t gfn, pfn_t pfn)
-{
-	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
-	int pt_write = 0;
-	int level = vcpu->arch.mmu.shadow_root_level;
-
-	for (; ; level--) {
-		u32 index = PT64_INDEX(v, level);
-		u64 *table;
+struct direct_shadow_walk {
+	struct kvm_shadow_walk walker;
+	pfn_t pfn;
+	int write;
+	int largepage;
+	int pt_write;
+};
 
-		ASSERT(VALID_PAGE(table_addr));
-		table = __va(table_addr);
+static int direct_map_entry(struct kvm_shadow_walk *_walk,
+			    struct kvm_vcpu *vcpu,
+			    gva_t addr, u64 *sptep, int level)
+{
+	struct direct_shadow_walk *walk =
+		container_of(_walk, struct direct_shadow_walk, walker);
+	struct kvm_mmu_page *sp;
+	gfn_t pseudo_gfn;
+	gfn_t gfn = addr >> PAGE_SHIFT;
+
+	if (level == PT_PAGE_TABLE_LEVEL
+	    || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
+		mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
+			     0, walk->write, 1, &walk->pt_write,
+			     walk->largepage, gfn, walk->pfn, false);
+		return 1;
+	}
 
-		if (level == 1 || (largepage && level == 2)) {
-			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, largepage,
-				     gfn, pfn, false);
-			return pt_write;
+	if (*sptep == shadow_trap_nonpresent_pte) {
+		pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+		sp = kvm_mmu_get_page(vcpu, pseudo_gfn, addr, level - 1,
+				      1, ACC_ALL, sptep);
+		if (!sp) {
+			pgprintk("nonpaging_map: ENOMEM\n");
+			kvm_release_pfn_clean(walk->pfn);
+			return -ENOMEM;
 		}
 
-		if (table[index] == shadow_trap_nonpresent_pte) {
-			struct kvm_mmu_page *new_table;
-			gfn_t pseudo_gfn;
-
-			pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
-				>> PAGE_SHIFT;
-			new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
-						     v, level - 1,
-						     1, ACC_ALL, &table[index]);
-			if (!new_table) {
-				pgprintk("nonpaging_map: ENOMEM\n");
-				kvm_release_pfn_clean(pfn);
-				return -ENOMEM;
-			}
-
-			set_shadow_pte(&table[index],
-				       __pa(new_table->spt)
-				       | PT_PRESENT_MASK | PT_WRITABLE_MASK
-				       | shadow_user_mask | shadow_x_mask);
-		}
-		table_addr = table[index] & PT64_BASE_ADDR_MASK;
+		set_shadow_pte(sptep,
+			       __pa(sp->spt)
+			       | PT_PRESENT_MASK | PT_WRITABLE_MASK
+			       | shadow_user_mask | shadow_x_mask);
 	}
+	return 0;
+}
+
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
+			int largepage, gfn_t gfn, pfn_t pfn)
+{
+	int r;
+	struct direct_shadow_walk walker = {
+		.walker = { .entry = direct_map_entry, },
+		.pfn = pfn,
+		.largepage = largepage,
+		.write = write,
+		.pt_write = 0,
+	};
+
+	r = walk_shadow(&walker.walker, vcpu, (gva_t)gfn << PAGE_SHIFT);
+	if (r < 0)
+		return r;
+	return walker.pt_write;
 }
 
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
-- 
cgit v0.10.2


From abb9e0b8e33e58ac8e04e03b680c46435bc312fd Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Fri, 22 Aug 2008 19:11:39 +0300
Subject: KVM: MMU: Convert the paging mode shadow walk to use the generic
 walker

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ebb26a0..b7064e1 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -25,6 +25,7 @@
 #if PTTYPE == 64
 	#define pt_element_t u64
 	#define guest_walker guest_walker64
+	#define shadow_walker shadow_walker64
 	#define FNAME(name) paging##64_##name
 	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
 	#define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
@@ -41,6 +42,7 @@
 #elif PTTYPE == 32
 	#define pt_element_t u32
 	#define guest_walker guest_walker32
+	#define shadow_walker shadow_walker32
 	#define FNAME(name) paging##32_##name
 	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
 	#define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
@@ -71,6 +73,17 @@ struct guest_walker {
 	u32 error_code;
 };
 
+struct shadow_walker {
+	struct kvm_shadow_walk walker;
+	struct guest_walker *guest_walker;
+	int user_fault;
+	int write_fault;
+	int largepage;
+	int *ptwrite;
+	pfn_t pfn;
+	u64 *sptep;
+};
+
 static gfn_t gpte_to_gfn(pt_element_t gpte)
 {
 	return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -272,86 +285,86 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
  */
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
-			 struct guest_walker *walker,
-			 int user_fault, int write_fault, int largepage,
-			 int *ptwrite, pfn_t pfn)
+static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
+				    struct kvm_vcpu *vcpu, gva_t addr,
+				    u64 *sptep, int level)
 {
-	hpa_t shadow_addr;
-	int level;
-	u64 *shadow_ent;
-	unsigned access = walker->pt_access;
-
-	if (!is_present_pte(walker->ptes[walker->level - 1]))
-		return NULL;
-
-	shadow_addr = vcpu->arch.mmu.root_hpa;
-	level = vcpu->arch.mmu.shadow_root_level;
-	if (level == PT32E_ROOT_LEVEL) {
-		shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
-		shadow_addr &= PT64_BASE_ADDR_MASK;
-		--level;
+	struct shadow_walker *sw =
+		container_of(_sw, struct shadow_walker, walker);
+	struct guest_walker *gw = sw->guest_walker;
+	unsigned access = gw->pt_access;
+	struct kvm_mmu_page *shadow_page;
+	u64 spte;
+	int metaphysical;
+	gfn_t table_gfn;
+	int r;
+	pt_element_t curr_pte;
+
+	if (level == PT_PAGE_TABLE_LEVEL
+	    || (sw->largepage && level == PT_DIRECTORY_LEVEL)) {
+		mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
+			     sw->user_fault, sw->write_fault,
+			     gw->ptes[gw->level-1] & PT_DIRTY_MASK,
+			     sw->ptwrite, sw->largepage, gw->gfn, sw->pfn,
+			     false);
+		sw->sptep = sptep;
+		return 1;
 	}
 
-	for (; ; level--) {
-		u32 index = SHADOW_PT_INDEX(addr, level);
-		struct kvm_mmu_page *shadow_page;
-		u64 shadow_pte;
-		int metaphysical;
-		gfn_t table_gfn;
-
-		shadow_ent = ((u64 *)__va(shadow_addr)) + index;
-		if (level == PT_PAGE_TABLE_LEVEL)
-			break;
-
-		if (largepage && level == PT_DIRECTORY_LEVEL)
-			break;
-
-		if (is_shadow_present_pte(*shadow_ent)
-		    && !is_large_pte(*shadow_ent)) {
-			shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
-			continue;
-		}
+	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
+		return 0;
 
-		if (is_large_pte(*shadow_ent))
-			rmap_remove(vcpu->kvm, shadow_ent);
-
-		if (level - 1 == PT_PAGE_TABLE_LEVEL
-		    && walker->level == PT_DIRECTORY_LEVEL) {
-			metaphysical = 1;
-			if (!is_dirty_pte(walker->ptes[level - 1]))
-				access &= ~ACC_WRITE_MASK;
-			table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
-		} else {
-			metaphysical = 0;
-			table_gfn = walker->table_gfn[level - 2];
-		}
-		shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
-					       metaphysical, access,
-					       shadow_ent);
-		if (!metaphysical) {
-			int r;
-			pt_element_t curr_pte;
-			r = kvm_read_guest_atomic(vcpu->kvm,
-						  walker->pte_gpa[level - 2],
-						  &curr_pte, sizeof(curr_pte));
-			if (r || curr_pte != walker->ptes[level - 2]) {
-				kvm_release_pfn_clean(pfn);
-				return NULL;
-			}
+	if (is_large_pte(*sptep))
+		rmap_remove(vcpu->kvm, sptep);
+
+	if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) {
+		metaphysical = 1;
+		if (!is_dirty_pte(gw->ptes[level - 1]))
+			access &= ~ACC_WRITE_MASK;
+		table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
+	} else {
+		metaphysical = 0;
+		table_gfn = gw->table_gfn[level - 2];
+	}
+	shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
+				       metaphysical, access, sptep);
+	if (!metaphysical) {
+		r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2],
+					  &curr_pte, sizeof(curr_pte));
+		if (r || curr_pte != gw->ptes[level - 2]) {
+			kvm_release_pfn_clean(sw->pfn);
+			sw->sptep = NULL;
+			return 1;
 		}
-		shadow_addr = __pa(shadow_page->spt);
-		shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
-			| PT_WRITABLE_MASK | PT_USER_MASK;
-		set_shadow_pte(shadow_ent, shadow_pte);
 	}
 
-	mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
-		     user_fault, write_fault,
-		     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
-		     ptwrite, largepage, walker->gfn, pfn, false);
+	spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK
+		| PT_WRITABLE_MASK | PT_USER_MASK;
+	*sptep = spte;
+	return 0;
+}
+
+static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+			 struct guest_walker *guest_walker,
+			 int user_fault, int write_fault, int largepage,
+			 int *ptwrite, pfn_t pfn)
+{
+	struct shadow_walker walker = {
+		.walker = { .entry = FNAME(shadow_walk_entry), },
+		.guest_walker = guest_walker,
+		.user_fault = user_fault,
+		.write_fault = write_fault,
+		.largepage = largepage,
+		.ptwrite = ptwrite,
+		.pfn = pfn,
+	};
+
+	if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1]))
+		return NULL;
+
+	walk_shadow(&walker.walker, vcpu, addr);
 
-	return shadow_ent;
+	return walker.sptep;
 }
 
 /*
@@ -499,6 +512,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
 
 #undef pt_element_t
 #undef guest_walker
+#undef shadow_walker
 #undef FNAME
 #undef PT_BASE_ADDR_MASK
 #undef PT_INDEX
-- 
cgit v0.10.2


From acee3c04e8208c17aad1baff99baa68d71640a19 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Tue, 26 Aug 2008 17:22:47 +0300
Subject: KVM: Allocate guest memory as MAP_PRIVATE, not MAP_SHARED

There is no reason to share internal memory slots with fork()ed instances.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bfc7c33..675d010 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4296,7 +4296,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 			userspace_addr = do_mmap(NULL, 0,
 						 npages * PAGE_SIZE,
 						 PROT_READ | PROT_WRITE,
-						 MAP_SHARED | MAP_ANONYMOUS,
+						 MAP_PRIVATE | MAP_ANONYMOUS,
 						 0);
 			up_write(&current->mm->mmap_sem);
 
-- 
cgit v0.10.2


From d657c7335b97d746aa6123c56504b46c20e37df3 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Tue, 26 Aug 2008 17:31:31 +0300
Subject: KVM: Don't call get_user_pages(.force = 1)

This is esoteric and only needed to break COW on MAP_SHARED mappings.  Since
KVM no longer does these sorts of mappings, breaking COW on them is no longer
necessary.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0309571..de3b029 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -734,7 +734,7 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 		return page_to_pfn(bad_page);
 	}
 
-	npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page,
+	npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
 				NULL);
 
 	if (unlikely(npages != 1)) {
-- 
cgit v0.10.2


From a5e2e82b8b62acd24a44b851e6bb4fd0793ead01 Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Wed, 27 Aug 2008 05:02:56 +0300
Subject: KVM: x86 emulator: Add mov r, imm instructions (opcodes 0xb0-0xbf)

The emulator only supported one instance of mov r, imm instruction
(opcode 0xb8), this adds the rest of these instructions.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index ae30435a..66e0bd6 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -154,9 +154,16 @@ static u16 opcode_table[256] = {
 	0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
 	ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
 	ByteOp | ImplicitOps | String, ImplicitOps | String,
-	/* 0xB0 - 0xBF */
-	0, 0, 0, 0, 0, 0, 0, 0,
-	DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0,
+	/* 0xB0 - 0xB7 */
+	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+	/* 0xB8 - 0xBF */
+	DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+	DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+	DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+	DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
 	/* 0xC0 - 0xC7 */
 	ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
 	0, ImplicitOps | Stack, 0, 0,
@@ -1660,7 +1667,7 @@ special_insn:
 	case 0xae ... 0xaf:	/* scas */
 		DPRINTF("Urk! I don't handle SCAS.\n");
 		goto cannot_emulate;
-	case 0xb8: /* mov r, imm */
+	case 0xb0 ... 0xbf: /* mov r, imm */
 		goto mov;
 	case 0xc0 ... 0xc1:
 		emulate_grp2(ctxt);
-- 
cgit v0.10.2


From bc2d429979451d69d0985c5dbdf908cace2831cc Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 27 Aug 2008 16:30:56 +0300
Subject: KVM: MMU: Account for npt/ept/realmode page faults

Now that two-dimensional paging is becoming common, account for tdp page
faults.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a1ca4ff..a24da8f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1283,6 +1283,7 @@ static int direct_map_entry(struct kvm_shadow_walk *_walk,
 		mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
 			     0, walk->write, 1, &walk->pt_write,
 			     walk->largepage, gfn, walk->pfn, false);
+		++vcpu->stat.pf_fixed;
 		return 1;
 	}
 
-- 
cgit v0.10.2


From 2245a28fe2e6fdb1bdabc4dcde1ea3a5c37e2a9e Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 27 Aug 2008 16:32:24 +0300
Subject: KVM: MMU: Add locking around kvm_mmu_slot_remove_write_access()

It was generally safe due to slots_lock being held for write, but it wasn't
very nice.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a24da8f..5052acd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2097,6 +2097,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 {
 	struct kvm_mmu_page *sp;
 
+	spin_lock(&kvm->mmu_lock);
 	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
 		int i;
 		u64 *pt;
@@ -2110,6 +2111,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 			if (pt[i] & PT_WRITABLE_MASK)
 				pt[i] &= ~PT_WRITABLE_MASK;
 	}
+	spin_unlock(&kvm->mmu_lock);
 }
 
 void kvm_mmu_zap_all(struct kvm *kvm)
-- 
cgit v0.10.2


From 171d595d3b3254b9a952af8d1f6965d2e85dcbaa Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 27 Aug 2008 16:40:51 +0300
Subject: KVM: MMU: Flush tlbs after clearing write permission when accessing
 dirty log

Otherwise, the cpu may allow writes to the tracked pages, and we lose
some display bits or fail to migrate correctly.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5052acd..853a288 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2111,6 +2111,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 			if (pt[i] & PT_WRITABLE_MASK)
 				pt[i] &= ~PT_WRITABLE_MASK;
 	}
+	kvm_flush_remote_tlbs(kvm);
 	spin_unlock(&kvm->mmu_lock);
 }
 
-- 
cgit v0.10.2


From 3201b5d9f0f7ef392886cd76dcd2c69186d9d5cd Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 27 Aug 2008 20:01:04 +0300
Subject: KVM: MMU: Fix setting the accessed bit on non-speculative sptes

The accessed bit was accidentally turned on in a random flag word, rather
than, the spte itself, which was lucky, since it used the non-EPT compatible
PT_ACCESSED_MASK.

Fix by turning the bit on in the spte and changing it to use the portable
accessed mask.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 853a288..866d713 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1192,7 +1192,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 	 */
 	spte = shadow_base_present_pte | shadow_dirty_mask;
 	if (!speculative)
-		pte_access |= PT_ACCESSED_MASK;
+		spte |= shadow_accessed_mask;
 	if (!dirty)
 		pte_access &= ~ACC_WRITE_MASK;
 	if (pte_access & ACC_EXEC_MASK)
-- 
cgit v0.10.2


From 48d150394999c542b2e828f03da226842950d46a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 28 Aug 2008 18:27:15 +0300
Subject: KVM: SVM: No need to unprotect memory during event injection when
 using npt

No memory is protected anyway.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index be86c09..6022888 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1021,7 +1021,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	if (npt_enabled)
 		svm_flush_tlb(&svm->vcpu);
 
-	if (event_injection)
+	if (!npt_enabled && event_injection)
 		kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
 	return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
 }
-- 
cgit v0.10.2


From a89c1ad270ca7ad0eec2667bc754362ce7b142be Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 29 Aug 2008 11:52:07 +0200
Subject: KVM: add MC5_MISC msr read support

Currently KVM implements MC0-MC4_MISC read support. When booting Linux this
results in KVM warnings in the kernel log when the guest tries to read
MC5_MISC. Fix this warnings with this patch.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 675d010..e3b8966 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -991,6 +991,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_IA32_MC0_MISC+8:
 	case MSR_IA32_MC0_MISC+12:
 	case MSR_IA32_MC0_MISC+16:
+	case MSR_IA32_MC0_MISC+20:
 	case MSR_IA32_UCODE_REV:
 	case MSR_IA32_EBL_CR_POWERON:
 	case MSR_IA32_DEBUGCTLMSR:
-- 
cgit v0.10.2


From a0046b6db1c514149585e11895cd8434e0eafa79 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Fri, 29 Aug 2008 13:29:45 +0200
Subject: KVM: s390: Make facility bits future-proof

Heiko Carstens pointed out, that its safer to activate working facilities
instead of disabling problematic facilities. The new code uses the host
facility bits and masks it with known good ones.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index d1faf5c..cce40ff 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -157,8 +157,8 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
 	int rc;
 
 	vcpu->stat.instruction_stfl++;
-	facility_list &= ~(1UL<<24); /* no stfle */
-	facility_list &= ~(1UL<<23); /* no large pages */
+	/* only pass the facility bits, which we can handle */
+	facility_list &= 0xfe00fff3;
 
 	rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
 			   &facility_list, sizeof(facility_list));
-- 
cgit v0.10.2


From 20766c083e6ab3c33125f07c7ffe39914c106d98 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Fri, 29 Aug 2008 13:30:56 +0200
Subject: KVM: s390: change help text of guest Kconfig

The current help text for CONFIG_S390_GUEST is not very helpful.
Lets add more text.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 4c03049..bc581d8 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -565,13 +565,16 @@ config ZFCPDUMP
 	  Refer to <file:Documentation/s390/zfcpdump.txt> for more details on this.
 
 config S390_GUEST
-bool "s390 guest support (EXPERIMENTAL)"
+bool "s390 guest support for KVM (EXPERIMENTAL)"
 	depends on 64BIT && EXPERIMENTAL
 	select VIRTIO
 	select VIRTIO_RING
 	select VIRTIO_CONSOLE
 	help
-	  Select this option if you want to run the kernel under s390 linux
+	  Select this option if you want to run the kernel as a guest under
+	  the KVM hypervisor. This will add detection for KVM as well  as a
+	  virtio transport. If KVM is detected, the virtio console will be
+	  the default console.
 endmenu
 
 source "net/Kconfig"
-- 
cgit v0.10.2


From fb4616f43148c5b3f3e453a47657572d1bda39ee Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Mon, 1 Sep 2008 04:52:24 +0300
Subject: KVM: x86 emulator: Add std and cld instructions (opcodes 0xfc-0xfd)

This adds the std and cld instructions to the emulator.

Encountered while running the BIOS with invalid guest
state emulation enabled.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 66e0bd6..944f1f4 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -187,7 +187,7 @@ static u16 opcode_table[256] = {
 	ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
 	/* 0xF8 - 0xFF */
 	ImplicitOps, 0, ImplicitOps, ImplicitOps,
-	0, 0, Group | Group4, Group | Group5,
+	ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
 };
 
 static u16 twobyte_table[256] = {
@@ -1762,6 +1762,14 @@ special_insn:
 		ctxt->eflags |= X86_EFLAGS_IF;
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
+	case 0xfc: /* cld */
+		ctxt->eflags &= ~EFLG_DF;
+		c->dst.type = OP_NONE;	/* Disable writeback. */
+		break;
+	case 0xfd: /* std */
+		ctxt->eflags |= EFLG_DF;
+		c->dst.type = OP_NONE;	/* Disable writeback. */
+		break;
 	case 0xfe ... 0xff:	/* Grp4/Grp5 */
 		rc = emulate_grp45(ctxt, ops);
 		if (rc != 0)
-- 
cgit v0.10.2


From 8c4b537da7eceab1246695df21beea10f180d460 Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Thu, 28 Aug 2008 09:34:08 +0800
Subject: KVM: ia64: Implement kvm_arch_vcpu_ioctl_{set,get}_mpstate

Two ioctl arch functions are added to set vcpu's smp state.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index cd0d1a7..7ad759e 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -179,6 +179,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	switch (ext) {
 	case KVM_CAP_IRQCHIP:
 	case KVM_CAP_USER_MEMORY:
+	case KVM_CAP_MP_STATE:
 
 		r = 1;
 		break;
@@ -1789,11 +1790,43 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
-	return -EINVAL;
+	vcpu_load(vcpu);
+	mp_state->mp_state = vcpu->arch.mp_state;
+	vcpu_put(vcpu);
+	return 0;
+}
+
+static int vcpu_reset(struct kvm_vcpu *vcpu)
+{
+	int r;
+	long psr;
+	local_irq_save(psr);
+	r = kvm_insert_vmm_mapping(vcpu);
+	if (r)
+		goto fail;
+
+	vcpu->arch.launched = 0;
+	kvm_arch_vcpu_uninit(vcpu);
+	r = kvm_arch_vcpu_init(vcpu);
+	if (r)
+		goto fail;
+
+	kvm_purge_vmm_mapping(vcpu);
+	r = 0;
+fail:
+	local_irq_restore(psr);
+	return r;
 }
 
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
-	return -EINVAL;
+	int r = 0;
+
+	vcpu_load(vcpu);
+	vcpu->arch.mp_state = mp_state->mp_state;
+	if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)
+		r = vcpu_reset(vcpu);
+	vcpu_put(vcpu);
+	return r;
 }
-- 
cgit v0.10.2


From d40a1ee4859c673677c9811ae84475c4051baca5 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng.yang@intel.com>
Date: Mon, 1 Sep 2008 19:41:20 +0800
Subject: KVM: MMU: Modify kvm_shadow_walk.entry to accept u64 addr

EPT is 4 level by default in 32pae(48 bits), but the addr parameter
of kvm_shadow_walk->entry() only accept unsigned long as virtual
address, which is 32bit in 32pae. This result in SHADOW_PT_INDEX()
overflow when try to fetch level 4 index.

Fix it by extend kvm_shadow_walk->entry() to accept 64bit addr in
parameter.

Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 866d713..bce3e25 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -144,7 +144,7 @@ struct kvm_rmap_desc {
 
 struct kvm_shadow_walk {
 	int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu,
-		     gva_t addr, u64 *spte, int level);
+		     u64 addr, u64 *spte, int level);
 };
 
 static struct kmem_cache *pte_chain_cache;
@@ -941,7 +941,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 }
 
 static int walk_shadow(struct kvm_shadow_walk *walker,
-		       struct kvm_vcpu *vcpu, gva_t addr)
+		       struct kvm_vcpu *vcpu, u64 addr)
 {
 	hpa_t shadow_addr;
 	int level;
@@ -1270,7 +1270,7 @@ struct direct_shadow_walk {
 
 static int direct_map_entry(struct kvm_shadow_walk *_walk,
 			    struct kvm_vcpu *vcpu,
-			    gva_t addr, u64 *sptep, int level)
+			    u64 addr, u64 *sptep, int level)
 {
 	struct direct_shadow_walk *walk =
 		container_of(_walk, struct direct_shadow_walk, walker);
@@ -1289,7 +1289,7 @@ static int direct_map_entry(struct kvm_shadow_walk *_walk,
 
 	if (*sptep == shadow_trap_nonpresent_pte) {
 		pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
-		sp = kvm_mmu_get_page(vcpu, pseudo_gfn, addr, level - 1,
+		sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1,
 				      1, ACC_ALL, sptep);
 		if (!sp) {
 			pgprintk("nonpaging_map: ENOMEM\n");
@@ -1317,7 +1317,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 		.pt_write = 0,
 	};
 
-	r = walk_shadow(&walker.walker, vcpu, (gva_t)gfn << PAGE_SHIFT);
+	r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT);
 	if (r < 0)
 		return r;
 	return walker.pt_write;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index b7064e1..b671f61 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -286,7 +286,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
  * Fetch a shadow pte for a specific level in the paging hierarchy.
  */
 static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
-				    struct kvm_vcpu *vcpu, gva_t addr,
+				    struct kvm_vcpu *vcpu, u64 addr,
 				    u64 *sptep, int level)
 {
 	struct shadow_walker *sw =
@@ -326,7 +326,7 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
 		metaphysical = 0;
 		table_gfn = gw->table_gfn[level - 2];
 	}
-	shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
+	shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1,
 				       metaphysical, access, sptep);
 	if (!metaphysical) {
 		r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2],
-- 
cgit v0.10.2


From fa89a81766e33343fa8f0fe0e371819bf94a17a1 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Mon, 1 Sep 2008 15:57:51 +0300
Subject: KVM: Add statistics for guest irq injections

These can help show whether a guest is making progress or not.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6022888..9b54550 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1519,6 +1519,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
 
 	KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler);
 
+	++svm->vcpu.stat.irq_injections;
 	control = &svm->vmcb->control;
 	control->int_vector = irq;
 	control->int_ctl &= ~V_INTR_PRIO_MASK;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 71e57ae..e7e8c86 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2341,6 +2341,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 
 	KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
 
+	++vcpu->stat.irq_injections;
 	if (vcpu->arch.rmode.active) {
 		vmx->rmode.irq.pending = true;
 		vmx->rmode.irq.vector = irq;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e3b8966..3f3cb71 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -92,6 +92,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "fpu_reload", VCPU_STAT(fpu_reload) },
 	{ "insn_emulation", VCPU_STAT(insn_emulation) },
 	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+	{ "irq_injections", VCPU_STAT(irq_injections) },
 	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
 	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index b6d26b8..68a3ac1 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -413,6 +413,7 @@ struct kvm_vcpu_stat {
 	u32 insn_emulation;
 	u32 insn_emulation_fail;
 	u32 hypercalls;
+	u32 irq_injections;
 };
 
 struct descriptor_table {
-- 
cgit v0.10.2


From a6a3034cb979b1fa3948d8e1e91b2387fc66b89b Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Sat, 6 Sep 2008 17:22:29 +0300
Subject: KVM: x86 emulator: Add in/out instructions (opcodes 0xe4-0xe7,
 0xec-0xef)

The patch adds in/out instructions to the x86 emulator.

The instruction was encountered while running the BIOS while using
the invalid guest state emulation patch.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 944f1f4..3ac2f14 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -177,11 +177,14 @@ static u16 opcode_table[256] = {
 	/* 0xD8 - 0xDF */
 	0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0xE0 - 0xE7 */
-	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0,
+	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
 	/* 0xE8 - 0xEF */
 	ImplicitOps | Stack, SrcImm | ImplicitOps,
 	ImplicitOps, SrcImmByte | ImplicitOps,
-	0, 0, 0, 0,
+	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
 	/* 0xF0 - 0xF7 */
 	0, 0, 0, 0,
 	ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
@@ -1259,6 +1262,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	u64 msr_data;
 	unsigned long saved_eip = 0;
 	struct decode_cache *c = &ctxt->decode;
+	unsigned int port;
+	int io_dir_in;
 	int rc = 0;
 
 	/* Shadow copy of register state. Committed on successful emulation.
@@ -1687,6 +1692,16 @@ special_insn:
 		c->src.val = c->regs[VCPU_REGS_RCX];
 		emulate_grp2(ctxt);
 		break;
+	case 0xe4: 	/* inb */
+	case 0xe5: 	/* in */
+		port = insn_fetch(u8, 1, c->eip);
+		io_dir_in = 1;
+		goto do_io;
+	case 0xe6: /* outb */
+	case 0xe7: /* out */
+		port = insn_fetch(u8, 1, c->eip);
+		io_dir_in = 0;
+		goto do_io;
 	case 0xe8: /* call (near) */ {
 		long int rel;
 		switch (c->op_bytes) {
@@ -1737,6 +1752,22 @@ special_insn:
 		jmp_rel(c, c->src.val);
 		c->dst.type = OP_NONE; /* Disable writeback. */
 		break;
+	case 0xec: /* in al,dx */
+	case 0xed: /* in (e/r)ax,dx */
+		port = c->regs[VCPU_REGS_RDX];
+		io_dir_in = 1;
+		goto do_io;
+	case 0xee: /* out al,dx */
+	case 0xef: /* out (e/r)ax,dx */
+		port = c->regs[VCPU_REGS_RDX];
+		io_dir_in = 0;
+	do_io:	if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in,
+				   (c->d & ByteOp) ? 1 : c->op_bytes,
+				   port) != 0) {
+			c->eip = saved_eip;
+			goto cannot_emulate;
+		}
+		return 0;
 	case 0xf4:              /* hlt */
 		ctxt->vcpu->arch.halt_request = 1;
 		break;
-- 
cgit v0.10.2


From d76901750ab9f71091d33ef3d2b5909d8a9a4ad4 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 8 Sep 2008 15:23:48 -0300
Subject: KVM: x86: do not execute halted vcpus

Offline or uninitialized vcpu's can be executed if requested to perform
userspace work.

Follow Avi's suggestion to handle halted vcpu's in the main loop,
simplifying kvm_emulate_halt(). Introduce a new vcpu->requests bit to
indicate events that promote state from halted to running.

Also standardize vcpu wake sites.

Signed-off-by: Marcelo Tosatti <mtosatti <at> redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 4cb4430..634132a 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -200,10 +200,9 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
 
 	if (!atomic_inc_and_test(&pt->pending))
 		set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
-	if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
-		vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+	if (vcpu0 && waitqueue_active(&vcpu0->wq))
 		wake_up_interruptible(&vcpu0->wq);
-	}
 
 	pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
 	pt->scheduled = ktime_to_ns(pt->timer.expires);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index be94f93..fd00f69 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -339,13 +339,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		} else
 			apic_clear_vector(vector, apic->regs + APIC_TMR);
 
-		if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
-			kvm_vcpu_kick(vcpu);
-		else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
-			vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-			if (waitqueue_active(&vcpu->wq))
-				wake_up_interruptible(&vcpu->wq);
-		}
+		kvm_vcpu_kick(vcpu);
 
 		result = (orig_irr == 0);
 		break;
@@ -384,8 +378,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
 			vcpu->arch.sipi_vector = vector;
 			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
-			if (waitqueue_active(&vcpu->wq))
-				wake_up_interruptible(&vcpu->wq);
+			kvm_vcpu_kick(vcpu);
 		}
 		break;
 
@@ -950,10 +943,9 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
 
 	if(!atomic_inc_and_test(&apic->timer.pending))
 		set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
-	if (waitqueue_active(q)) {
-		apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+	if (waitqueue_active(q))
 		wake_up_interruptible(q);
-	}
+
 	if (apic_lvtt_period(apic)) {
 		result = 1;
 		apic->timer.dev.expires = ktime_add_ns(
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3f3cb71..bf98d40 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2798,11 +2798,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 	KVMTRACE_0D(HLT, vcpu, handler);
 	if (irqchip_in_kernel(vcpu->kvm)) {
 		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
-		up_read(&vcpu->kvm->slots_lock);
-		kvm_vcpu_block(vcpu);
-		down_read(&vcpu->kvm->slots_lock);
-		if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
-			return -EINTR;
 		return 1;
 	} else {
 		vcpu->run->exit_reason = KVM_EXIT_HLT;
@@ -3097,24 +3092,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
 	up_read(&vcpu->kvm->slots_lock);
 }
 
-static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	int r;
 
-	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
-		pr_debug("vcpu %d received sipi with vector # %x\n",
-		       vcpu->vcpu_id, vcpu->arch.sipi_vector);
-		kvm_lapic_reset(vcpu);
-		r = kvm_x86_ops->vcpu_reset(vcpu);
-		if (r)
-			return r;
-		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-	}
-
-	down_read(&vcpu->kvm->slots_lock);
-	vapic_enter(vcpu);
-
-again:
 	if (vcpu->requests)
 		if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
 			kvm_mmu_unload(vcpu);
@@ -3151,22 +3132,13 @@ again:
 
 	local_irq_disable();
 
-	if (vcpu->requests || need_resched()) {
+	if (vcpu->requests || need_resched() || signal_pending(current)) {
 		local_irq_enable();
 		preempt_enable();
 		r = 1;
 		goto out;
 	}
 
-	if (signal_pending(current)) {
-		local_irq_enable();
-		preempt_enable();
-		r = -EINTR;
-		kvm_run->exit_reason = KVM_EXIT_INTR;
-		++vcpu->stat.signal_exits;
-		goto out;
-	}
-
 	if (vcpu->guest_debug.enabled)
 		kvm_x86_ops->guest_debug_pre(vcpu);
 
@@ -3227,26 +3199,63 @@ again:
 	kvm_lapic_sync_from_vapic(vcpu);
 
 	r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
+out:
+	return r;
+}
 
-	if (r > 0) {
-		if (dm_request_for_irq_injection(vcpu, kvm_run)) {
-			r = -EINTR;
-			kvm_run->exit_reason = KVM_EXIT_INTR;
-			++vcpu->stat.request_irq_exits;
-			goto out;
-		}
-		if (!need_resched())
-			goto again;
+static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	int r;
+
+	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
+		printk("vcpu %d received sipi with vector # %x\n",
+		       vcpu->vcpu_id, vcpu->arch.sipi_vector);
+		kvm_lapic_reset(vcpu);
+		r = kvm_x86_ops->vcpu_reset(vcpu);
+		if (r)
+			return r;
+		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 	}
 
-out:
-	up_read(&vcpu->kvm->slots_lock);
-	if (r > 0) {
-		kvm_resched(vcpu);
-		down_read(&vcpu->kvm->slots_lock);
-		goto again;
+	down_read(&vcpu->kvm->slots_lock);
+	vapic_enter(vcpu);
+
+	r = 1;
+	while (r > 0) {
+		if (kvm_arch_vcpu_runnable(vcpu))
+			r = vcpu_enter_guest(vcpu, kvm_run);
+		else {
+			up_read(&vcpu->kvm->slots_lock);
+			kvm_vcpu_block(vcpu);
+			down_read(&vcpu->kvm->slots_lock);
+			if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
+				if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+					vcpu->arch.mp_state =
+							KVM_MP_STATE_RUNNABLE;
+			if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
+				r = -EINTR;
+		}
+
+		if (r > 0) {
+			if (dm_request_for_irq_injection(vcpu, kvm_run)) {
+				r = -EINTR;
+				kvm_run->exit_reason = KVM_EXIT_INTR;
+				++vcpu->stat.request_irq_exits;
+			}
+			if (signal_pending(current)) {
+				r = -EINTR;
+				kvm_run->exit_reason = KVM_EXIT_INTR;
+				++vcpu->stat.signal_exits;
+			}
+			if (need_resched()) {
+				up_read(&vcpu->kvm->slots_lock);
+				kvm_resched(vcpu);
+				down_read(&vcpu->kvm->slots_lock);
+			}
+		}
 	}
 
+	up_read(&vcpu->kvm->slots_lock);
 	post_kvm_run_save(vcpu, kvm_run);
 
 	vapic_exit(vcpu);
@@ -3266,6 +3275,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
 		kvm_vcpu_block(vcpu);
+		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
 		r = -EAGAIN;
 		goto out;
 	}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a18aaad..4b03643 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -34,6 +34,7 @@
 #define KVM_REQ_MMU_RELOAD         3
 #define KVM_REQ_TRIPLE_FAULT       4
 #define KVM_REQ_PENDING_TIMER      5
+#define KVM_REQ_UNHALT             6
 
 struct kvm_vcpu;
 extern struct kmem_cache *kvm_vcpu_cache;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index de3b029..63e661b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -980,12 +980,12 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 	for (;;) {
 		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 
-		if (kvm_cpu_has_interrupt(vcpu))
-			break;
-		if (kvm_cpu_has_pending_timer(vcpu))
-			break;
-		if (kvm_arch_vcpu_runnable(vcpu))
+		if (kvm_cpu_has_interrupt(vcpu) ||
+		    kvm_cpu_has_pending_timer(vcpu) ||
+		    kvm_arch_vcpu_runnable(vcpu)) {
+			set_bit(KVM_REQ_UNHALT, &vcpu->requests);
 			break;
+		}
 		if (signal_pending(current))
 			break;
 
-- 
cgit v0.10.2


From d19292e457a7c1b7f6c12bccbfdfd53630de1cee Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Mon, 8 Sep 2008 21:47:19 +0300
Subject: KVM: x86 emulator: Add call near absolute instruction (opcode 0xff/2)

Add call near absolute instruction.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 3ac2f14..0630d21 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -286,7 +286,8 @@ static u16 group_table[] = {
 	ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
 	0, 0, 0, 0, 0, 0,
 	[Group5*8] =
-	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0,
+	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+	SrcMem | ModRM | Stack, 0,
 	SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0,
 	[Group7*8] =
 	0, 0, ModRM | SrcMem, ModRM | SrcMem,
@@ -1162,6 +1163,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
 	case 1:	/* dec */
 		emulate_1op("dec", c->dst, ctxt->eflags);
 		break;
+	case 2: /* call near abs */ {
+		long int old_eip;
+		old_eip = c->eip;
+		c->eip = c->src.val;
+		c->src.val = old_eip;
+		emulate_push(ctxt);
+		break;
+	}
 	case 4: /* jmp abs */
 		c->eip = c->src.val;
 		break;
-- 
cgit v0.10.2


From 9c3e4aab5ae27bf8589d61c7874e213832b4b7d2 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Wed, 10 Sep 2008 16:40:55 -0300
Subject: KVM: x86: unhalt vcpu0 on reset

Since "KVM: x86: do not execute halted vcpus", HLT by vcpu0 before system
reset by the IO thread will hang the guest.

Mark vcpu as runnable in such case.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bf98d40..2134f3e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3959,6 +3959,12 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
 	kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
 
+	/* Older userspace won't unhalt the vcpu on reset. */
+	if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
+	    sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
+	    !(vcpu->arch.cr0 & X86_CR0_PE))
+		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
 	vcpu_put(vcpu);
 
 	return 0;
-- 
cgit v0.10.2


From 0bd595fc222583ca260f259698f72e9946c6e524 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Thu, 11 Sep 2008 10:04:29 -0500
Subject: KVM: ppc: kvmppc_44x_shadow_release() does not require mmap_sem to be
 locked

And it gets in the way of get_user_pages_fast().

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 3594bbd..7b11fd7 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -110,7 +110,6 @@ static int kvmppc_44x_tlbe_is_writable(struct tlbe *tlbe)
 	return tlbe->word2 & (PPC44x_TLB_SW|PPC44x_TLB_UW);
 }
 
-/* Must be called with mmap_sem locked for writing. */
 static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu,
                                       unsigned int index)
 {
@@ -150,17 +149,16 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
 	/* Get reference to new page. */
 	down_read(&current->mm->mmap_sem);
 	new_page = gfn_to_page(vcpu->kvm, gfn);
+	up_read(&current->mm->mmap_sem);
 	if (is_error_page(new_page)) {
 		printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn);
 		kvm_release_page_clean(new_page);
-		up_read(&current->mm->mmap_sem);
 		return;
 	}
 	hpaddr = page_to_phys(new_page);
 
 	/* Drop reference to old page. */
 	kvmppc_44x_shadow_release(vcpu, victim);
-	up_read(&current->mm->mmap_sem);
 
 	vcpu->arch.shadow_pages[victim] = new_page;
 
@@ -194,7 +192,6 @@ void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	int i;
 
 	/* XXX Replace loop with fancy data structures. */
-	down_write(&current->mm->mmap_sem);
 	for (i = 0; i <= tlb_44x_hwater; i++) {
 		struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
 		unsigned int tid;
@@ -219,7 +216,6 @@ void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
 				stlbe->tid, stlbe->word0, stlbe->word1,
 				stlbe->word2, handler);
 	}
-	up_write(&current->mm->mmap_sem);
 }
 
 /* Invalidate all mappings on the privilege switch after PID has been changed.
@@ -231,7 +227,6 @@ void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
 
 	if (vcpu->arch.swap_pid) {
 		/* XXX Replace loop with fancy data structures. */
-		down_write(&current->mm->mmap_sem);
 		for (i = 0; i <= tlb_44x_hwater; i++) {
 			struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
 
@@ -243,7 +238,6 @@ void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
 			            stlbe->tid, stlbe->word0, stlbe->word1,
 			            stlbe->word2, handler);
 		}
-		up_write(&current->mm->mmap_sem);
 		vcpu->arch.swap_pid = 0;
 	}
 
-- 
cgit v0.10.2


From 4b92fe0c9d0376eb105c88d49b77595e8e5b1548 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Thu, 11 Sep 2008 12:58:00 +0200
Subject: KVM: VMX: Cleanup stalled INTR_INFO read

Commit 1c0f4f5011829dac96347b5f84ba37c2252e1e08 left a useless access
of VM_ENTRY_INTR_INFO_FIELD in vmx_intr_assist behind. Clean this up.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e7e8c86..f8e615f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3135,11 +3135,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 
 static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 {
-	u32 intr_info_field;
-
 	update_tpr_threshold(vcpu);
 
-	intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
 	if (cpu_has_virtual_nmis()) {
 		if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
 			if (vmx_nmi_enabled(vcpu)) {
-- 
cgit v0.10.2


From ef46f18ea010359f7536afd3f56a92c9c83ac09a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 11 Sep 2008 19:47:13 +0300
Subject: KVM: x86 emulator: fix jmp r/m64 instruction

jmp r/m64 doesn't require the rex.w prefix to indicate the operand size
is 64 bits.  Set the Stack attribute (even though it doesn't involve the
stack, really) to indicate this.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 0630d21..0c120c4 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -288,7 +288,7 @@ static u16 group_table[] = {
 	[Group5*8] =
 	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
 	SrcMem | ModRM | Stack, 0,
-	SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0,
+	SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
 	[Group7*8] =
 	0, 0, ModRM | SrcMem, ModRM | SrcMem,
 	SrcNone | ModRM | DstMem | Mov, 0,
-- 
cgit v0.10.2


From ad8003d33efe856515a5c2e9b63637de85c44788 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 10 Sep 2008 20:01:07 +0200
Subject: MAINTAINERS: add entry for the KVM AMD module

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/MAINTAINERS b/MAINTAINERS
index 5d0b8a2..87ba4a6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2448,6 +2448,13 @@ L:	kvm@vger.kernel.org
 W:	http://kvm.qumranet.com
 S:	Supported
 
+KERNEL VIRTUAL MACHINE (KVM) FOR AMD-V
+P:	Joerg Roedel
+M:	joerg.roedel@amd.com
+L:	kvm@vger.kernel.org
+W:	http://kvm.qumranet.com
+S:	Supported
+
 KERNEL VIRTUAL MACHINE (KVM) FOR POWERPC
 P:	Hollis Blanchard
 M:	hollisb@us.ibm.com
-- 
cgit v0.10.2


From 9ea542facbd0fd12a53e169953a3fdd6d0364532 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng.yang@intel.com>
Date: Thu, 11 Sep 2008 15:27:49 +0800
Subject: KVM: VMX: Rename IA32_FEATURE_CONTROL bits

Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f8e615f..046a91b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1041,9 +1041,9 @@ static __init int vmx_disabled_by_bios(void)
 	u64 msr;
 
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
-	return (msr & (IA32_FEATURE_CONTROL_LOCKED_BIT |
-		       IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT))
-	    == IA32_FEATURE_CONTROL_LOCKED_BIT;
+	return (msr & (FEATURE_CONTROL_LOCKED |
+		       FEATURE_CONTROL_VMXON_ENABLED))
+	    == FEATURE_CONTROL_LOCKED;
 	/* locked but not enabled */
 }
 
@@ -1055,14 +1055,14 @@ static void hardware_enable(void *garbage)
 
 	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
-	if ((old & (IA32_FEATURE_CONTROL_LOCKED_BIT |
-		    IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT))
-	    != (IA32_FEATURE_CONTROL_LOCKED_BIT |
-		IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT))
+	if ((old & (FEATURE_CONTROL_LOCKED |
+		    FEATURE_CONTROL_VMXON_ENABLED))
+	    != (FEATURE_CONTROL_LOCKED |
+		FEATURE_CONTROL_VMXON_ENABLED))
 		/* enable and lock */
 		wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
-		       IA32_FEATURE_CONTROL_LOCKED_BIT |
-		       IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT);
+		       FEATURE_CONTROL_LOCKED |
+		       FEATURE_CONTROL_VMXON_ENABLED);
 	write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
 	asm volatile (ASM_VMX_VMXON_RAX
 		      : : "a"(&phys_addr), "m"(phys_addr)
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 86059f4..44cfab7 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -331,8 +331,8 @@ enum vmcs_field {
 
 #define AR_RESERVD_MASK 0xfffe0f00
 
-#define IA32_FEATURE_CONTROL_LOCKED_BIT		0x1
-#define IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT	0x4
+#define FEATURE_CONTROL_LOCKED		    (1<<0)
+#define FEATURE_CONTROL_VMXON_ENABLED	    (1<<2)
 
 #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT	9
 #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT	10
-- 
cgit v0.10.2


From defed7ed926eca9f178a632df05baa866abe754e Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng.yang@intel.com>
Date: Thu, 11 Sep 2008 15:27:50 +0800
Subject: x86: Move FEATURE_CONTROL bits to msr-index.h

For MSR_IA32_FEATURE_CONTROL is already there.

Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 44cfab7..3e010d2 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -331,9 +331,6 @@ enum vmcs_field {
 
 #define AR_RESERVD_MASK 0xfffe0f00
 
-#define FEATURE_CONTROL_LOCKED		    (1<<0)
-#define FEATURE_CONTROL_VMXON_ENABLED	    (1<<2)
-
 #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT	9
 #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT	10
 
diff --git a/include/asm-x86/msr-index.h b/include/asm-x86/msr-index.h
index 0bb4330..dabd10f 100644
--- a/include/asm-x86/msr-index.h
+++ b/include/asm-x86/msr-index.h
@@ -178,6 +178,9 @@
 #define MSR_IA32_EBL_CR_POWERON		0x0000002a
 #define MSR_IA32_FEATURE_CONTROL        0x0000003a
 
+#define FEATURE_CONTROL_LOCKED		(1<<0)
+#define FEATURE_CONTROL_VMXON_ENABLED	(1<<2)
+
 #define MSR_IA32_APICBASE		0x0000001b
 #define MSR_IA32_APICBASE_BSP		(1<<8)
 #define MSR_IA32_APICBASE_ENABLE	(1<<11)
-- 
cgit v0.10.2


From 9c9fddd0e784346a6d0ce82ed20d9ad21f3c8a2c Mon Sep 17 00:00:00 2001
From: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Date: Fri, 12 Sep 2008 13:50:25 +0200
Subject: KVM: x86 emulator: Add DstAcc operand type

Add DstAcc operand type. That means that there are 4 bits now for
DstMask.

"In the good old days cpus would have only one register that was able to
 fully participate in arithmetic operations, typically called A for
 Accumulator.  The x86 retains this tradition by having special, shorter
 encodings for the A register (like the cmp opcode), and even some
 instructions that only operate on A (like mul).

 SrcAcc and DstAcc would accommodate these instructions by decoding A
 into the corresponding 'struct operand'."
  -- Avi Kivity

Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 0c120c4..4390ec8 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -47,25 +47,26 @@
 #define ImplicitOps (1<<1)	/* Implicit in opcode. No generic decode. */
 #define DstReg      (2<<1)	/* Register operand. */
 #define DstMem      (3<<1)	/* Memory operand. */
-#define DstMask     (3<<1)
+#define DstAcc      (4<<1)      /* Destination Accumulator */
+#define DstMask     (7<<1)
 /* Source operand type. */
-#define SrcNone     (0<<3)	/* No source operand. */
-#define SrcImplicit (0<<3)	/* Source operand is implicit in the opcode. */
-#define SrcReg      (1<<3)	/* Register operand. */
-#define SrcMem      (2<<3)	/* Memory operand. */
-#define SrcMem16    (3<<3)	/* Memory operand (16-bit). */
-#define SrcMem32    (4<<3)	/* Memory operand (32-bit). */
-#define SrcImm      (5<<3)	/* Immediate operand. */
-#define SrcImmByte  (6<<3)	/* 8-bit sign-extended immediate operand. */
-#define SrcMask     (7<<3)
+#define SrcNone     (0<<4)	/* No source operand. */
+#define SrcImplicit (0<<4)	/* Source operand is implicit in the opcode. */
+#define SrcReg      (1<<4)	/* Register operand. */
+#define SrcMem      (2<<4)	/* Memory operand. */
+#define SrcMem16    (3<<4)	/* Memory operand (16-bit). */
+#define SrcMem32    (4<<4)	/* Memory operand (32-bit). */
+#define SrcImm      (5<<4)	/* Immediate operand. */
+#define SrcImmByte  (6<<4)	/* 8-bit sign-extended immediate operand. */
+#define SrcMask     (7<<4)
 /* Generic ModRM decode. */
-#define ModRM       (1<<6)
+#define ModRM       (1<<7)
 /* Destination is only written; never read. */
-#define Mov         (1<<7)
-#define BitOp       (1<<8)
-#define MemAbs      (1<<9)      /* Memory operand is absolute displacement */
-#define String      (1<<10)     /* String instruction (rep capable) */
-#define Stack       (1<<11)     /* Stack instruction (push/pop) */
+#define Mov         (1<<8)
+#define BitOp       (1<<9)
+#define MemAbs      (1<<10)      /* Memory operand is absolute displacement */
+#define String      (1<<12)     /* String instruction (rep capable) */
+#define Stack       (1<<13)     /* Stack instruction (push/pop) */
 #define Group       (1<<14)     /* Bits 3:5 of modrm byte extend opcode */
 #define GroupDual   (1<<15)     /* Alternate decoding of mod == 3 */
 #define GroupMask   0xff        /* Group number stored in bits 0:7 */
@@ -1060,6 +1061,23 @@ done_prefixes:
 		}
 		c->dst.type = OP_MEM;
 		break;
+	case DstAcc:
+		c->dst.type = OP_REG;
+		c->dst.bytes = c->op_bytes;
+		c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+		switch (c->op_bytes) {
+			case 1:
+				c->dst.val = *(u8 *)c->dst.ptr;
+				break;
+			case 2:
+				c->dst.val = *(u16 *)c->dst.ptr;
+				break;
+			case 4:
+				c->dst.val = *(u32 *)c->dst.ptr;
+				break;
+		}
+		c->dst.orig_val = c->dst.val;
+		break;
 	}
 
 	if (c->rip_relative)
-- 
cgit v0.10.2


From 8a9fee67fba585b4c4731a749367e1751ebf416c Mon Sep 17 00:00:00 2001
From: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Date: Fri, 12 Sep 2008 13:51:15 +0200
Subject: KVM: x86 emulator: Add cmp al, imm and cmp ax, imm instructions
 (ocodes 3c, 3d)

Add decode entries for these opcodes; execution is already implemented.

Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 4390ec8..2b43208 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -108,7 +108,8 @@ static u16 opcode_table[256] = {
 	/* 0x38 - 0x3F */
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	0, 0, 0, 0,
+	ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+	0, 0,
 	/* 0x40 - 0x47 */
 	DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
 	/* 0x48 - 0x4F */
-- 
cgit v0.10.2


From aa3a816b6d0bd59e1a9c548cc7d2dd829f26534f Mon Sep 17 00:00:00 2001
From: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Date: Fri, 12 Sep 2008 13:52:18 +0200
Subject: KVM: x86 emulator: Use DstAcc for 'and'

For instruction 'and al,imm' we use DstAcc instead of doing
the emulation directly into the instruction's opcode.

Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 2b43208..ea05117 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -96,7 +96,7 @@ static u16 opcode_table[256] = {
 	/* 0x20 - 0x27 */
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	SrcImmByte, SrcImm, 0, 0,
+	DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
 	/* 0x28 - 0x2F */
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -1392,27 +1392,10 @@ special_insn:
 	      sbb:		/* sbb */
 		emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
 		break;
-	case 0x20 ... 0x23:
+	case 0x20 ... 0x25:
 	      and:		/* and */
 		emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
 		break;
-	case 0x24:              /* and al imm8 */
-		c->dst.type = OP_REG;
-		c->dst.ptr = &c->regs[VCPU_REGS_RAX];
-		c->dst.val = *(u8 *)c->dst.ptr;
-		c->dst.bytes = 1;
-		c->dst.orig_val = c->dst.val;
-		goto and;
-	case 0x25:              /* and ax imm16, or eax imm32 */
-		c->dst.type = OP_REG;
-		c->dst.bytes = c->op_bytes;
-		c->dst.ptr = &c->regs[VCPU_REGS_RAX];
-		if (c->op_bytes == 2)
-			c->dst.val = *(u16 *)c->dst.ptr;
-		else
-			c->dst.val = *(u32 *)c->dst.ptr;
-		c->dst.orig_val = c->dst.val;
-		goto and;
 	case 0x28 ... 0x2d:
 	      sub:		/* sub */
 		emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
-- 
cgit v0.10.2


From 387179464257921eb9aa3d15cc3ff194f6945a7c Mon Sep 17 00:00:00 2001
From: "Kay, Allen M" <allen.m.kay@intel.com>
Date: Tue, 9 Sep 2008 18:37:29 +0300
Subject: VT-d: Changes to support KVM

This patch extends the VT-d driver to support KVM

[Ben: fixed memory pinning]
[avi: move dma_remapping.h as well]

Signed-off-by: Kay, Allen M <allen.m.kay@intel.com>
Signed-off-by: Weidong Han <weidong.han@intel.com>
Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
Signed-off-by: Amit Shah <amit.shah@qumranet.com>
Acked-by: Mark Gross <mgross@linux.intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/drivers/pci/dma_remapping.h b/drivers/pci/dma_remapping.h
deleted file mode 100644
index bff5c65..0000000
--- a/drivers/pci/dma_remapping.h
+++ /dev/null
@@ -1,157 +0,0 @@
-#ifndef _DMA_REMAPPING_H
-#define _DMA_REMAPPING_H
-
-/*
- * We need a fixed PAGE_SIZE of 4K irrespective of
- * arch PAGE_SIZE for IOMMU page tables.
- */
-#define PAGE_SHIFT_4K		(12)
-#define PAGE_SIZE_4K		(1UL << PAGE_SHIFT_4K)
-#define PAGE_MASK_4K		(((u64)-1) << PAGE_SHIFT_4K)
-#define PAGE_ALIGN_4K(addr)	(((addr) + PAGE_SIZE_4K - 1) & PAGE_MASK_4K)
-
-#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT_4K)
-#define DMA_32BIT_PFN		IOVA_PFN(DMA_32BIT_MASK)
-#define DMA_64BIT_PFN		IOVA_PFN(DMA_64BIT_MASK)
-
-
-/*
- * 0: Present
- * 1-11: Reserved
- * 12-63: Context Ptr (12 - (haw-1))
- * 64-127: Reserved
- */
-struct root_entry {
-	u64	val;
-	u64	rsvd1;
-};
-#define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry))
-static inline bool root_present(struct root_entry *root)
-{
-	return (root->val & 1);
-}
-static inline void set_root_present(struct root_entry *root)
-{
-	root->val |= 1;
-}
-static inline void set_root_value(struct root_entry *root, unsigned long value)
-{
-	root->val |= value & PAGE_MASK_4K;
-}
-
-struct context_entry;
-static inline struct context_entry *
-get_context_addr_from_root(struct root_entry *root)
-{
-	return (struct context_entry *)
-		(root_present(root)?phys_to_virt(
-		root->val & PAGE_MASK_4K):
-		NULL);
-}
-
-/*
- * low 64 bits:
- * 0: present
- * 1: fault processing disable
- * 2-3: translation type
- * 12-63: address space root
- * high 64 bits:
- * 0-2: address width
- * 3-6: aval
- * 8-23: domain id
- */
-struct context_entry {
-	u64 lo;
-	u64 hi;
-};
-#define context_present(c) ((c).lo & 1)
-#define context_fault_disable(c) (((c).lo >> 1) & 1)
-#define context_translation_type(c) (((c).lo >> 2) & 3)
-#define context_address_root(c) ((c).lo & PAGE_MASK_4K)
-#define context_address_width(c) ((c).hi &  7)
-#define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1))
-
-#define context_set_present(c) do {(c).lo |= 1;} while (0)
-#define context_set_fault_enable(c) \
-	do {(c).lo &= (((u64)-1) << 2) | 1;} while (0)
-#define context_set_translation_type(c, val) \
-	do { \
-		(c).lo &= (((u64)-1) << 4) | 3; \
-		(c).lo |= ((val) & 3) << 2; \
-	} while (0)
-#define CONTEXT_TT_MULTI_LEVEL 0
-#define context_set_address_root(c, val) \
-	do {(c).lo |= (val) & PAGE_MASK_4K;} while (0)
-#define context_set_address_width(c, val) do {(c).hi |= (val) & 7;} while (0)
-#define context_set_domain_id(c, val) \
-	do {(c).hi |= ((val) & ((1 << 16) - 1)) << 8;} while (0)
-#define context_clear_entry(c) do {(c).lo = 0; (c).hi = 0;} while (0)
-
-/*
- * 0: readable
- * 1: writable
- * 2-6: reserved
- * 7: super page
- * 8-11: available
- * 12-63: Host physcial address
- */
-struct dma_pte {
-	u64 val;
-};
-#define dma_clear_pte(p)	do {(p).val = 0;} while (0)
-
-#define DMA_PTE_READ (1)
-#define DMA_PTE_WRITE (2)
-
-#define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while (0)
-#define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while (0)
-#define dma_set_pte_prot(p, prot) \
-		do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0)
-#define dma_pte_addr(p) ((p).val & PAGE_MASK_4K)
-#define dma_set_pte_addr(p, addr) do {\
-		(p).val |= ((addr) & PAGE_MASK_4K); } while (0)
-#define dma_pte_present(p) (((p).val & 3) != 0)
-
-struct intel_iommu;
-
-struct dmar_domain {
-	int	id;			/* domain id */
-	struct intel_iommu *iommu;	/* back pointer to owning iommu */
-
-	struct list_head devices; 	/* all devices' list */
-	struct iova_domain iovad;	/* iova's that belong to this domain */
-
-	struct dma_pte	*pgd;		/* virtual address */
-	spinlock_t	mapping_lock;	/* page table lock */
-	int		gaw;		/* max guest address width */
-
-	/* adjusted guest address width, 0 is level 2 30-bit */
-	int		agaw;
-
-#define DOMAIN_FLAG_MULTIPLE_DEVICES 1
-	int		flags;
-};
-
-/* PCI domain-device relationship */
-struct device_domain_info {
-	struct list_head link;	/* link to domain siblings */
-	struct list_head global; /* link to global list */
-	u8 bus;			/* PCI bus numer */
-	u8 devfn;		/* PCI devfn number */
-	struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
-	struct dmar_domain *domain; /* pointer to domain */
-};
-
-extern int init_dmars(void);
-extern void free_dmar_iommu(struct intel_iommu *iommu);
-
-extern int dmar_disabled;
-
-#ifndef CONFIG_DMAR_GFX_WA
-static inline void iommu_prepare_gfx_mapping(void)
-{
-	return;
-}
-#endif /* !CONFIG_DMAR_GFX_WA */
-
-#endif
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index bd2c016..e842e75 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -28,9 +28,9 @@
 
 #include <linux/pci.h>
 #include <linux/dmar.h>
+#include <linux/iova.h>
+#include <linux/intel-iommu.h>
 #include <linux/timer.h>
-#include "iova.h"
-#include "intel-iommu.h"
 
 #undef PREFIX
 #define PREFIX "DMAR:"
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 389fdd6..fc5f2db 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -33,8 +33,8 @@
 #include <linux/dma-mapping.h>
 #include <linux/mempool.h>
 #include <linux/timer.h>
-#include "iova.h"
-#include "intel-iommu.h"
+#include <linux/iova.h>
+#include <linux/intel-iommu.h>
 #include <asm/proto.h> /* force_iommu in this header in x86-64*/
 #include <asm/cacheflush.h>
 #include <asm/iommu.h>
@@ -156,7 +156,7 @@ static inline void *alloc_domain_mem(void)
 	return iommu_kmem_cache_alloc(iommu_domain_cache);
 }
 
-static inline void free_domain_mem(void *vaddr)
+static void free_domain_mem(void *vaddr)
 {
 	kmem_cache_free(iommu_domain_cache, vaddr);
 }
@@ -1341,7 +1341,7 @@ static void domain_remove_dev_info(struct dmar_domain *domain)
  * find_domain
  * Note: we use struct pci_dev->dev.archdata.iommu stores the info
  */
-struct dmar_domain *
+static struct dmar_domain *
 find_domain(struct pci_dev *pdev)
 {
 	struct device_domain_info *info;
@@ -2318,3 +2318,111 @@ int __init intel_iommu_init(void)
 	return 0;
 }
 
+void intel_iommu_domain_exit(struct dmar_domain *domain)
+{
+	u64 end;
+
+	/* Domain 0 is reserved, so dont process it */
+	if (!domain)
+		return;
+
+	end = DOMAIN_MAX_ADDR(domain->gaw);
+	end = end & (~PAGE_MASK_4K);
+
+	/* clear ptes */
+	dma_pte_clear_range(domain, 0, end);
+
+	/* free page tables */
+	dma_pte_free_pagetable(domain, 0, end);
+
+	iommu_free_domain(domain);
+	free_domain_mem(domain);
+}
+EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
+
+struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
+{
+	struct dmar_drhd_unit *drhd;
+	struct dmar_domain *domain;
+	struct intel_iommu *iommu;
+
+	drhd = dmar_find_matched_drhd_unit(pdev);
+	if (!drhd) {
+		printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
+		return NULL;
+	}
+
+	iommu = drhd->iommu;
+	if (!iommu) {
+		printk(KERN_ERR
+			"intel_iommu_domain_alloc: iommu == NULL\n");
+		return NULL;
+	}
+	domain = iommu_alloc_domain(iommu);
+	if (!domain) {
+		printk(KERN_ERR
+			"intel_iommu_domain_alloc: domain == NULL\n");
+		return NULL;
+	}
+	if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
+		printk(KERN_ERR
+			"intel_iommu_domain_alloc: domain_init() failed\n");
+		intel_iommu_domain_exit(domain);
+		return NULL;
+	}
+	return domain;
+}
+EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
+
+int intel_iommu_context_mapping(
+	struct dmar_domain *domain, struct pci_dev *pdev)
+{
+	int rc;
+	rc = domain_context_mapping(domain, pdev);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
+
+int intel_iommu_page_mapping(
+	struct dmar_domain *domain, dma_addr_t iova,
+	u64 hpa, size_t size, int prot)
+{
+	int rc;
+	rc = domain_page_mapping(domain, iova, hpa, size, prot);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
+
+void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
+{
+	detach_domain_for_dev(domain, bus, devfn);
+}
+EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
+
+struct dmar_domain *
+intel_iommu_find_domain(struct pci_dev *pdev)
+{
+	return find_domain(pdev);
+}
+EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
+
+int intel_iommu_found(void)
+{
+	return g_num_of_iommus;
+}
+EXPORT_SYMBOL_GPL(intel_iommu_found);
+
+u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
+{
+	struct dma_pte *pte;
+	u64 pfn;
+
+	pfn = 0;
+	pte = addr_to_dma_pte(domain, iova);
+
+	if (pte)
+		pfn = dma_pte_addr(*pte);
+
+	return pfn >> PAGE_SHIFT_4K;
+}
+EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);
diff --git a/drivers/pci/intel-iommu.h b/drivers/pci/intel-iommu.h
deleted file mode 100644
index 2142c01..0000000
--- a/drivers/pci/intel-iommu.h
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * Copyright (c) 2006, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- * Copyright (C) 2006-2008 Intel Corporation
- * Author: Ashok Raj <ashok.raj@intel.com>
- * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
- */
-
-#ifndef _INTEL_IOMMU_H_
-#define _INTEL_IOMMU_H_
-
-#include <linux/types.h>
-#include <linux/msi.h>
-#include <linux/sysdev.h>
-#include "iova.h"
-#include <linux/io.h>
-#include <asm/cacheflush.h>
-#include "dma_remapping.h"
-
-/*
- * Intel IOMMU register specification per version 1.0 public spec.
- */
-
-#define	DMAR_VER_REG	0x0	/* Arch version supported by this IOMMU */
-#define	DMAR_CAP_REG	0x8	/* Hardware supported capabilities */
-#define	DMAR_ECAP_REG	0x10	/* Extended capabilities supported */
-#define	DMAR_GCMD_REG	0x18	/* Global command register */
-#define	DMAR_GSTS_REG	0x1c	/* Global status register */
-#define	DMAR_RTADDR_REG	0x20	/* Root entry table */
-#define	DMAR_CCMD_REG	0x28	/* Context command reg */
-#define	DMAR_FSTS_REG	0x34	/* Fault Status register */
-#define	DMAR_FECTL_REG	0x38	/* Fault control register */
-#define	DMAR_FEDATA_REG	0x3c	/* Fault event interrupt data register */
-#define	DMAR_FEADDR_REG	0x40	/* Fault event interrupt addr register */
-#define	DMAR_FEUADDR_REG 0x44	/* Upper address register */
-#define	DMAR_AFLOG_REG	0x58	/* Advanced Fault control */
-#define	DMAR_PMEN_REG	0x64	/* Enable Protected Memory Region */
-#define	DMAR_PLMBASE_REG 0x68	/* PMRR Low addr */
-#define	DMAR_PLMLIMIT_REG 0x6c	/* PMRR low limit */
-#define	DMAR_PHMBASE_REG 0x70	/* pmrr high base addr */
-#define	DMAR_PHMLIMIT_REG 0x78	/* pmrr high limit */
-#define DMAR_IQH_REG	0x80	/* Invalidation queue head register */
-#define DMAR_IQT_REG	0x88	/* Invalidation queue tail register */
-#define DMAR_IQA_REG	0x90	/* Invalidation queue addr register */
-#define DMAR_ICS_REG	0x98	/* Invalidation complete status register */
-#define DMAR_IRTA_REG	0xb8    /* Interrupt remapping table addr register */
-
-#define OFFSET_STRIDE		(9)
-/*
-#define dmar_readl(dmar, reg) readl(dmar + reg)
-#define dmar_readq(dmar, reg) ({ \
-		u32 lo, hi; \
-		lo = readl(dmar + reg); \
-		hi = readl(dmar + reg + 4); \
-		(((u64) hi) << 32) + lo; })
-*/
-static inline u64 dmar_readq(void __iomem *addr)
-{
-	u32 lo, hi;
-	lo = readl(addr);
-	hi = readl(addr + 4);
-	return (((u64) hi) << 32) + lo;
-}
-
-static inline void dmar_writeq(void __iomem *addr, u64 val)
-{
-	writel((u32)val, addr);
-	writel((u32)(val >> 32), addr + 4);
-}
-
-#define DMAR_VER_MAJOR(v)		(((v) & 0xf0) >> 4)
-#define DMAR_VER_MINOR(v)		((v) & 0x0f)
-
-/*
- * Decoding Capability Register
- */
-#define cap_read_drain(c)	(((c) >> 55) & 1)
-#define cap_write_drain(c)	(((c) >> 54) & 1)
-#define cap_max_amask_val(c)	(((c) >> 48) & 0x3f)
-#define cap_num_fault_regs(c)	((((c) >> 40) & 0xff) + 1)
-#define cap_pgsel_inv(c)	(((c) >> 39) & 1)
-
-#define cap_super_page_val(c)	(((c) >> 34) & 0xf)
-#define cap_super_offset(c)	(((find_first_bit(&cap_super_page_val(c), 4)) \
-					* OFFSET_STRIDE) + 21)
-
-#define cap_fault_reg_offset(c)	((((c) >> 24) & 0x3ff) * 16)
-#define cap_max_fault_reg_offset(c) \
-	(cap_fault_reg_offset(c) + cap_num_fault_regs(c) * 16)
-
-#define cap_zlr(c)		(((c) >> 22) & 1)
-#define cap_isoch(c)		(((c) >> 23) & 1)
-#define cap_mgaw(c)		((((c) >> 16) & 0x3f) + 1)
-#define cap_sagaw(c)		(((c) >> 8) & 0x1f)
-#define cap_caching_mode(c)	(((c) >> 7) & 1)
-#define cap_phmr(c)		(((c) >> 6) & 1)
-#define cap_plmr(c)		(((c) >> 5) & 1)
-#define cap_rwbf(c)		(((c) >> 4) & 1)
-#define cap_afl(c)		(((c) >> 3) & 1)
-#define cap_ndoms(c)		(((unsigned long)1) << (4 + 2 * ((c) & 0x7)))
-/*
- * Extended Capability Register
- */
-
-#define ecap_niotlb_iunits(e)	((((e) >> 24) & 0xff) + 1)
-#define ecap_iotlb_offset(e) 	((((e) >> 8) & 0x3ff) * 16)
-#define ecap_max_iotlb_offset(e) \
-	(ecap_iotlb_offset(e) + ecap_niotlb_iunits(e) * 16)
-#define ecap_coherent(e)	((e) & 0x1)
-#define ecap_qis(e)		((e) & 0x2)
-#define ecap_eim_support(e)	((e >> 4) & 0x1)
-#define ecap_ir_support(e)	((e >> 3) & 0x1)
-#define ecap_max_handle_mask(e) ((e >> 20) & 0xf)
-
-
-/* IOTLB_REG */
-#define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60)
-#define DMA_TLB_DSI_FLUSH (((u64)2) << 60)
-#define DMA_TLB_PSI_FLUSH (((u64)3) << 60)
-#define DMA_TLB_IIRG(type) ((type >> 60) & 7)
-#define DMA_TLB_IAIG(val) (((val) >> 57) & 7)
-#define DMA_TLB_READ_DRAIN (((u64)1) << 49)
-#define DMA_TLB_WRITE_DRAIN (((u64)1) << 48)
-#define DMA_TLB_DID(id)	(((u64)((id) & 0xffff)) << 32)
-#define DMA_TLB_IVT (((u64)1) << 63)
-#define DMA_TLB_IH_NONLEAF (((u64)1) << 6)
-#define DMA_TLB_MAX_SIZE (0x3f)
-
-/* INVALID_DESC */
-#define DMA_ID_TLB_GLOBAL_FLUSH	(((u64)1) << 3)
-#define DMA_ID_TLB_DSI_FLUSH	(((u64)2) << 3)
-#define DMA_ID_TLB_PSI_FLUSH	(((u64)3) << 3)
-#define DMA_ID_TLB_READ_DRAIN	(((u64)1) << 7)
-#define DMA_ID_TLB_WRITE_DRAIN	(((u64)1) << 6)
-#define DMA_ID_TLB_DID(id)	(((u64)((id & 0xffff) << 16)))
-#define DMA_ID_TLB_IH_NONLEAF	(((u64)1) << 6)
-#define DMA_ID_TLB_ADDR(addr)	(addr)
-#define DMA_ID_TLB_ADDR_MASK(mask)	(mask)
-
-/* PMEN_REG */
-#define DMA_PMEN_EPM (((u32)1)<<31)
-#define DMA_PMEN_PRS (((u32)1)<<0)
-
-/* GCMD_REG */
-#define DMA_GCMD_TE (((u32)1) << 31)
-#define DMA_GCMD_SRTP (((u32)1) << 30)
-#define DMA_GCMD_SFL (((u32)1) << 29)
-#define DMA_GCMD_EAFL (((u32)1) << 28)
-#define DMA_GCMD_WBF (((u32)1) << 27)
-#define DMA_GCMD_QIE (((u32)1) << 26)
-#define DMA_GCMD_SIRTP (((u32)1) << 24)
-#define DMA_GCMD_IRE (((u32) 1) << 25)
-
-/* GSTS_REG */
-#define DMA_GSTS_TES (((u32)1) << 31)
-#define DMA_GSTS_RTPS (((u32)1) << 30)
-#define DMA_GSTS_FLS (((u32)1) << 29)
-#define DMA_GSTS_AFLS (((u32)1) << 28)
-#define DMA_GSTS_WBFS (((u32)1) << 27)
-#define DMA_GSTS_QIES (((u32)1) << 26)
-#define DMA_GSTS_IRTPS (((u32)1) << 24)
-#define DMA_GSTS_IRES (((u32)1) << 25)
-
-/* CCMD_REG */
-#define DMA_CCMD_ICC (((u64)1) << 63)
-#define DMA_CCMD_GLOBAL_INVL (((u64)1) << 61)
-#define DMA_CCMD_DOMAIN_INVL (((u64)2) << 61)
-#define DMA_CCMD_DEVICE_INVL (((u64)3) << 61)
-#define DMA_CCMD_FM(m) (((u64)((m) & 0x3)) << 32)
-#define DMA_CCMD_MASK_NOBIT 0
-#define DMA_CCMD_MASK_1BIT 1
-#define DMA_CCMD_MASK_2BIT 2
-#define DMA_CCMD_MASK_3BIT 3
-#define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16)
-#define DMA_CCMD_DID(d) ((u64)((d) & 0xffff))
-
-/* FECTL_REG */
-#define DMA_FECTL_IM (((u32)1) << 31)
-
-/* FSTS_REG */
-#define DMA_FSTS_PPF ((u32)2)
-#define DMA_FSTS_PFO ((u32)1)
-#define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff)
-
-/* FRCD_REG, 32 bits access */
-#define DMA_FRCD_F (((u32)1) << 31)
-#define dma_frcd_type(d) ((d >> 30) & 1)
-#define dma_frcd_fault_reason(c) (c & 0xff)
-#define dma_frcd_source_id(c) (c & 0xffff)
-#define dma_frcd_page_addr(d) (d & (((u64)-1) << 12)) /* low 64 bit */
-
-#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) /* 10sec */
-
-#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
-{\
-	cycles_t start_time = get_cycles();\
-	while (1) {\
-		sts = op (iommu->reg + offset);\
-		if (cond)\
-			break;\
-		if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\
-			panic("DMAR hardware is malfunctioning\n");\
-		cpu_relax();\
-	}\
-}
-
-#define QI_LENGTH	256	/* queue length */
-
-enum {
-	QI_FREE,
-	QI_IN_USE,
-	QI_DONE
-};
-
-#define QI_CC_TYPE		0x1
-#define QI_IOTLB_TYPE		0x2
-#define QI_DIOTLB_TYPE		0x3
-#define QI_IEC_TYPE		0x4
-#define QI_IWD_TYPE		0x5
-
-#define QI_IEC_SELECTIVE	(((u64)1) << 4)
-#define QI_IEC_IIDEX(idx)	(((u64)(idx & 0xffff) << 32))
-#define QI_IEC_IM(m)		(((u64)(m & 0x1f) << 27))
-
-#define QI_IWD_STATUS_DATA(d)	(((u64)d) << 32)
-#define QI_IWD_STATUS_WRITE	(((u64)1) << 5)
-
-struct qi_desc {
-	u64 low, high;
-};
-
-struct q_inval {
-	spinlock_t      q_lock;
-	struct qi_desc  *desc;          /* invalidation queue */
-	int             *desc_status;   /* desc status */
-	int             free_head;      /* first free entry */
-	int             free_tail;      /* last free entry */
-	int             free_cnt;
-};
-
-#ifdef CONFIG_INTR_REMAP
-/* 1MB - maximum possible interrupt remapping table size */
-#define INTR_REMAP_PAGE_ORDER	8
-#define INTR_REMAP_TABLE_REG_SIZE	0xf
-
-#define INTR_REMAP_TABLE_ENTRIES	65536
-
-struct ir_table {
-	struct irte *base;
-};
-#endif
-
-struct intel_iommu {
-	void __iomem	*reg; /* Pointer to hardware regs, virtual addr */
-	u64		cap;
-	u64		ecap;
-	int		seg;
-	u32		gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */
-	spinlock_t	register_lock; /* protect register handling */
-	int		seq_id;	/* sequence id of the iommu */
-
-#ifdef CONFIG_DMAR
-	unsigned long 	*domain_ids; /* bitmap of domains */
-	struct dmar_domain **domains; /* ptr to domains */
-	spinlock_t	lock; /* protect context, domain ids */
-	struct root_entry *root_entry; /* virtual address */
-
-	unsigned int irq;
-	unsigned char name[7];    /* Device Name */
-	struct msi_msg saved_msg;
-	struct sys_device sysdev;
-#endif
-	struct q_inval  *qi;            /* Queued invalidation info */
-#ifdef CONFIG_INTR_REMAP
-	struct ir_table *ir_table;	/* Interrupt remapping info */
-#endif
-};
-
-static inline void __iommu_flush_cache(
-	struct intel_iommu *iommu, void *addr, int size)
-{
-	if (!ecap_coherent(iommu->ecap))
-		clflush_cache_range(addr, size);
-}
-
-extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
-
-extern int alloc_iommu(struct dmar_drhd_unit *drhd);
-extern void free_iommu(struct intel_iommu *iommu);
-extern int dmar_enable_qi(struct intel_iommu *iommu);
-extern void qi_global_iec(struct intel_iommu *iommu);
-
-extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
-#endif
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index bb642cc..738d4c8 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -4,7 +4,7 @@
 #include <linux/pci.h>
 #include <linux/irq.h>
 #include <asm/io_apic.h>
-#include "intel-iommu.h"
+#include <linux/intel-iommu.h>
 #include "intr_remapping.h"
 
 static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
diff --git a/drivers/pci/intr_remapping.h b/drivers/pci/intr_remapping.h
index 05f2635..ca48f0d 100644
--- a/drivers/pci/intr_remapping.h
+++ b/drivers/pci/intr_remapping.h
@@ -1,4 +1,4 @@
-#include "intel-iommu.h"
+#include <linux/intel-iommu.h>
 
 struct ioapic_scope {
 	struct intel_iommu *iommu;
diff --git a/drivers/pci/iova.c b/drivers/pci/iova.c
index 3ef4ac0..2287116 100644
--- a/drivers/pci/iova.c
+++ b/drivers/pci/iova.c
@@ -7,7 +7,7 @@
  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
  */
 
-#include "iova.h"
+#include <linux/iova.h>
 
 void
 init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit)
diff --git a/drivers/pci/iova.h b/drivers/pci/iova.h
deleted file mode 100644
index 228f6c9..0000000
--- a/drivers/pci/iova.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2006, Intel Corporation.
- *
- * This file is released under the GPLv2.
- *
- * Copyright (C) 2006-2008 Intel Corporation
- * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
- *
- */
-
-#ifndef _IOVA_H_
-#define _IOVA_H_
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/rbtree.h>
-#include <linux/dma-mapping.h>
-
-/* IO virtual address start page frame number */
-#define IOVA_START_PFN		(1)
-
-/* iova structure */
-struct iova {
-	struct rb_node	node;
-	unsigned long	pfn_hi; /* IOMMU dish out addr hi */
-	unsigned long	pfn_lo; /* IOMMU dish out addr lo */
-};
-
-/* holds all the iova translations for a domain */
-struct iova_domain {
-	spinlock_t	iova_alloc_lock;/* Lock to protect iova  allocation */
-	spinlock_t	iova_rbtree_lock; /* Lock to protect update of rbtree */
-	struct rb_root	rbroot;		/* iova domain rbtree root */
-	struct rb_node	*cached32_node; /* Save last alloced node */
-	unsigned long	dma_32bit_pfn;
-};
-
-struct iova *alloc_iova_mem(void);
-void free_iova_mem(struct iova *iova);
-void free_iova(struct iova_domain *iovad, unsigned long pfn);
-void __free_iova(struct iova_domain *iovad, struct iova *iova);
-struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
-	unsigned long limit_pfn,
-	bool size_aligned);
-struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
-	unsigned long pfn_hi);
-void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
-void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit);
-struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
-void put_iova_domain(struct iova_domain *iovad);
-
-#endif
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
new file mode 100644
index 0000000..bff5c65
--- /dev/null
+++ b/include/linux/dma_remapping.h
@@ -0,0 +1,157 @@
+#ifndef _DMA_REMAPPING_H
+#define _DMA_REMAPPING_H
+
+/*
+ * We need a fixed PAGE_SIZE of 4K irrespective of
+ * arch PAGE_SIZE for IOMMU page tables.
+ */
+#define PAGE_SHIFT_4K		(12)
+#define PAGE_SIZE_4K		(1UL << PAGE_SHIFT_4K)
+#define PAGE_MASK_4K		(((u64)-1) << PAGE_SHIFT_4K)
+#define PAGE_ALIGN_4K(addr)	(((addr) + PAGE_SIZE_4K - 1) & PAGE_MASK_4K)
+
+#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT_4K)
+#define DMA_32BIT_PFN		IOVA_PFN(DMA_32BIT_MASK)
+#define DMA_64BIT_PFN		IOVA_PFN(DMA_64BIT_MASK)
+
+
+/*
+ * 0: Present
+ * 1-11: Reserved
+ * 12-63: Context Ptr (12 - (haw-1))
+ * 64-127: Reserved
+ */
+struct root_entry {
+	u64	val;
+	u64	rsvd1;
+};
+#define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry))
+static inline bool root_present(struct root_entry *root)
+{
+	return (root->val & 1);
+}
+static inline void set_root_present(struct root_entry *root)
+{
+	root->val |= 1;
+}
+static inline void set_root_value(struct root_entry *root, unsigned long value)
+{
+	root->val |= value & PAGE_MASK_4K;
+}
+
+struct context_entry;
+static inline struct context_entry *
+get_context_addr_from_root(struct root_entry *root)
+{
+	return (struct context_entry *)
+		(root_present(root)?phys_to_virt(
+		root->val & PAGE_MASK_4K):
+		NULL);
+}
+
+/*
+ * low 64 bits:
+ * 0: present
+ * 1: fault processing disable
+ * 2-3: translation type
+ * 12-63: address space root
+ * high 64 bits:
+ * 0-2: address width
+ * 3-6: aval
+ * 8-23: domain id
+ */
+struct context_entry {
+	u64 lo;
+	u64 hi;
+};
+#define context_present(c) ((c).lo & 1)
+#define context_fault_disable(c) (((c).lo >> 1) & 1)
+#define context_translation_type(c) (((c).lo >> 2) & 3)
+#define context_address_root(c) ((c).lo & PAGE_MASK_4K)
+#define context_address_width(c) ((c).hi &  7)
+#define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1))
+
+#define context_set_present(c) do {(c).lo |= 1;} while (0)
+#define context_set_fault_enable(c) \
+	do {(c).lo &= (((u64)-1) << 2) | 1;} while (0)
+#define context_set_translation_type(c, val) \
+	do { \
+		(c).lo &= (((u64)-1) << 4) | 3; \
+		(c).lo |= ((val) & 3) << 2; \
+	} while (0)
+#define CONTEXT_TT_MULTI_LEVEL 0
+#define context_set_address_root(c, val) \
+	do {(c).lo |= (val) & PAGE_MASK_4K;} while (0)
+#define context_set_address_width(c, val) do {(c).hi |= (val) & 7;} while (0)
+#define context_set_domain_id(c, val) \
+	do {(c).hi |= ((val) & ((1 << 16) - 1)) << 8;} while (0)
+#define context_clear_entry(c) do {(c).lo = 0; (c).hi = 0;} while (0)
+
+/*
+ * 0: readable
+ * 1: writable
+ * 2-6: reserved
+ * 7: super page
+ * 8-11: available
+ * 12-63: Host physcial address
+ */
+struct dma_pte {
+	u64 val;
+};
+#define dma_clear_pte(p)	do {(p).val = 0;} while (0)
+
+#define DMA_PTE_READ (1)
+#define DMA_PTE_WRITE (2)
+
+#define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while (0)
+#define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while (0)
+#define dma_set_pte_prot(p, prot) \
+		do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0)
+#define dma_pte_addr(p) ((p).val & PAGE_MASK_4K)
+#define dma_set_pte_addr(p, addr) do {\
+		(p).val |= ((addr) & PAGE_MASK_4K); } while (0)
+#define dma_pte_present(p) (((p).val & 3) != 0)
+
+struct intel_iommu;
+
+struct dmar_domain {
+	int	id;			/* domain id */
+	struct intel_iommu *iommu;	/* back pointer to owning iommu */
+
+	struct list_head devices; 	/* all devices' list */
+	struct iova_domain iovad;	/* iova's that belong to this domain */
+
+	struct dma_pte	*pgd;		/* virtual address */
+	spinlock_t	mapping_lock;	/* page table lock */
+	int		gaw;		/* max guest address width */
+
+	/* adjusted guest address width, 0 is level 2 30-bit */
+	int		agaw;
+
+#define DOMAIN_FLAG_MULTIPLE_DEVICES 1
+	int		flags;
+};
+
+/* PCI domain-device relationship */
+struct device_domain_info {
+	struct list_head link;	/* link to domain siblings */
+	struct list_head global; /* link to global list */
+	u8 bus;			/* PCI bus numer */
+	u8 devfn;		/* PCI devfn number */
+	struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
+	struct dmar_domain *domain; /* pointer to domain */
+};
+
+extern int init_dmars(void);
+extern void free_dmar_iommu(struct intel_iommu *iommu);
+
+extern int dmar_disabled;
+
+#ifndef CONFIG_DMAR_GFX_WA
+static inline void iommu_prepare_gfx_mapping(void)
+{
+	return;
+}
+#endif /* !CONFIG_DMAR_GFX_WA */
+
+#endif
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
new file mode 100644
index 0000000..2e117f3
--- /dev/null
+++ b/include/linux/intel-iommu.h
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) 2006-2008 Intel Corporation
+ * Author: Ashok Raj <ashok.raj@intel.com>
+ * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ */
+
+#ifndef _INTEL_IOMMU_H_
+#define _INTEL_IOMMU_H_
+
+#include <linux/types.h>
+#include <linux/msi.h>
+#include <linux/sysdev.h>
+#include <linux/iova.h>
+#include <linux/io.h>
+#include <linux/dma_remapping.h>
+#include <asm/cacheflush.h>
+
+/*
+ * Intel IOMMU register specification per version 1.0 public spec.
+ */
+
+#define	DMAR_VER_REG	0x0	/* Arch version supported by this IOMMU */
+#define	DMAR_CAP_REG	0x8	/* Hardware supported capabilities */
+#define	DMAR_ECAP_REG	0x10	/* Extended capabilities supported */
+#define	DMAR_GCMD_REG	0x18	/* Global command register */
+#define	DMAR_GSTS_REG	0x1c	/* Global status register */
+#define	DMAR_RTADDR_REG	0x20	/* Root entry table */
+#define	DMAR_CCMD_REG	0x28	/* Context command reg */
+#define	DMAR_FSTS_REG	0x34	/* Fault Status register */
+#define	DMAR_FECTL_REG	0x38	/* Fault control register */
+#define	DMAR_FEDATA_REG	0x3c	/* Fault event interrupt data register */
+#define	DMAR_FEADDR_REG	0x40	/* Fault event interrupt addr register */
+#define	DMAR_FEUADDR_REG 0x44	/* Upper address register */
+#define	DMAR_AFLOG_REG	0x58	/* Advanced Fault control */
+#define	DMAR_PMEN_REG	0x64	/* Enable Protected Memory Region */
+#define	DMAR_PLMBASE_REG 0x68	/* PMRR Low addr */
+#define	DMAR_PLMLIMIT_REG 0x6c	/* PMRR low limit */
+#define	DMAR_PHMBASE_REG 0x70	/* pmrr high base addr */
+#define	DMAR_PHMLIMIT_REG 0x78	/* pmrr high limit */
+#define DMAR_IQH_REG	0x80	/* Invalidation queue head register */
+#define DMAR_IQT_REG	0x88	/* Invalidation queue tail register */
+#define DMAR_IQA_REG	0x90	/* Invalidation queue addr register */
+#define DMAR_ICS_REG	0x98	/* Invalidation complete status register */
+#define DMAR_IRTA_REG	0xb8    /* Interrupt remapping table addr register */
+
+#define OFFSET_STRIDE		(9)
+/*
+#define dmar_readl(dmar, reg) readl(dmar + reg)
+#define dmar_readq(dmar, reg) ({ \
+		u32 lo, hi; \
+		lo = readl(dmar + reg); \
+		hi = readl(dmar + reg + 4); \
+		(((u64) hi) << 32) + lo; })
+*/
+static inline u64 dmar_readq(void __iomem *addr)
+{
+	u32 lo, hi;
+	lo = readl(addr);
+	hi = readl(addr + 4);
+	return (((u64) hi) << 32) + lo;
+}
+
+static inline void dmar_writeq(void __iomem *addr, u64 val)
+{
+	writel((u32)val, addr);
+	writel((u32)(val >> 32), addr + 4);
+}
+
+#define DMAR_VER_MAJOR(v)		(((v) & 0xf0) >> 4)
+#define DMAR_VER_MINOR(v)		((v) & 0x0f)
+
+/*
+ * Decoding Capability Register
+ */
+#define cap_read_drain(c)	(((c) >> 55) & 1)
+#define cap_write_drain(c)	(((c) >> 54) & 1)
+#define cap_max_amask_val(c)	(((c) >> 48) & 0x3f)
+#define cap_num_fault_regs(c)	((((c) >> 40) & 0xff) + 1)
+#define cap_pgsel_inv(c)	(((c) >> 39) & 1)
+
+#define cap_super_page_val(c)	(((c) >> 34) & 0xf)
+#define cap_super_offset(c)	(((find_first_bit(&cap_super_page_val(c), 4)) \
+					* OFFSET_STRIDE) + 21)
+
+#define cap_fault_reg_offset(c)	((((c) >> 24) & 0x3ff) * 16)
+#define cap_max_fault_reg_offset(c) \
+	(cap_fault_reg_offset(c) + cap_num_fault_regs(c) * 16)
+
+#define cap_zlr(c)		(((c) >> 22) & 1)
+#define cap_isoch(c)		(((c) >> 23) & 1)
+#define cap_mgaw(c)		((((c) >> 16) & 0x3f) + 1)
+#define cap_sagaw(c)		(((c) >> 8) & 0x1f)
+#define cap_caching_mode(c)	(((c) >> 7) & 1)
+#define cap_phmr(c)		(((c) >> 6) & 1)
+#define cap_plmr(c)		(((c) >> 5) & 1)
+#define cap_rwbf(c)		(((c) >> 4) & 1)
+#define cap_afl(c)		(((c) >> 3) & 1)
+#define cap_ndoms(c)		(((unsigned long)1) << (4 + 2 * ((c) & 0x7)))
+/*
+ * Extended Capability Register
+ */
+
+#define ecap_niotlb_iunits(e)	((((e) >> 24) & 0xff) + 1)
+#define ecap_iotlb_offset(e) 	((((e) >> 8) & 0x3ff) * 16)
+#define ecap_max_iotlb_offset(e) \
+	(ecap_iotlb_offset(e) + ecap_niotlb_iunits(e) * 16)
+#define ecap_coherent(e)	((e) & 0x1)
+#define ecap_qis(e)		((e) & 0x2)
+#define ecap_eim_support(e)	((e >> 4) & 0x1)
+#define ecap_ir_support(e)	((e >> 3) & 0x1)
+#define ecap_max_handle_mask(e) ((e >> 20) & 0xf)
+
+
+/* IOTLB_REG */
+#define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60)
+#define DMA_TLB_DSI_FLUSH (((u64)2) << 60)
+#define DMA_TLB_PSI_FLUSH (((u64)3) << 60)
+#define DMA_TLB_IIRG(type) ((type >> 60) & 7)
+#define DMA_TLB_IAIG(val) (((val) >> 57) & 7)
+#define DMA_TLB_READ_DRAIN (((u64)1) << 49)
+#define DMA_TLB_WRITE_DRAIN (((u64)1) << 48)
+#define DMA_TLB_DID(id)	(((u64)((id) & 0xffff)) << 32)
+#define DMA_TLB_IVT (((u64)1) << 63)
+#define DMA_TLB_IH_NONLEAF (((u64)1) << 6)
+#define DMA_TLB_MAX_SIZE (0x3f)
+
+/* INVALID_DESC */
+#define DMA_ID_TLB_GLOBAL_FLUSH	(((u64)1) << 3)
+#define DMA_ID_TLB_DSI_FLUSH	(((u64)2) << 3)
+#define DMA_ID_TLB_PSI_FLUSH	(((u64)3) << 3)
+#define DMA_ID_TLB_READ_DRAIN	(((u64)1) << 7)
+#define DMA_ID_TLB_WRITE_DRAIN	(((u64)1) << 6)
+#define DMA_ID_TLB_DID(id)	(((u64)((id & 0xffff) << 16)))
+#define DMA_ID_TLB_IH_NONLEAF	(((u64)1) << 6)
+#define DMA_ID_TLB_ADDR(addr)	(addr)
+#define DMA_ID_TLB_ADDR_MASK(mask)	(mask)
+
+/* PMEN_REG */
+#define DMA_PMEN_EPM (((u32)1)<<31)
+#define DMA_PMEN_PRS (((u32)1)<<0)
+
+/* GCMD_REG */
+#define DMA_GCMD_TE (((u32)1) << 31)
+#define DMA_GCMD_SRTP (((u32)1) << 30)
+#define DMA_GCMD_SFL (((u32)1) << 29)
+#define DMA_GCMD_EAFL (((u32)1) << 28)
+#define DMA_GCMD_WBF (((u32)1) << 27)
+#define DMA_GCMD_QIE (((u32)1) << 26)
+#define DMA_GCMD_SIRTP (((u32)1) << 24)
+#define DMA_GCMD_IRE (((u32) 1) << 25)
+
+/* GSTS_REG */
+#define DMA_GSTS_TES (((u32)1) << 31)
+#define DMA_GSTS_RTPS (((u32)1) << 30)
+#define DMA_GSTS_FLS (((u32)1) << 29)
+#define DMA_GSTS_AFLS (((u32)1) << 28)
+#define DMA_GSTS_WBFS (((u32)1) << 27)
+#define DMA_GSTS_QIES (((u32)1) << 26)
+#define DMA_GSTS_IRTPS (((u32)1) << 24)
+#define DMA_GSTS_IRES (((u32)1) << 25)
+
+/* CCMD_REG */
+#define DMA_CCMD_ICC (((u64)1) << 63)
+#define DMA_CCMD_GLOBAL_INVL (((u64)1) << 61)
+#define DMA_CCMD_DOMAIN_INVL (((u64)2) << 61)
+#define DMA_CCMD_DEVICE_INVL (((u64)3) << 61)
+#define DMA_CCMD_FM(m) (((u64)((m) & 0x3)) << 32)
+#define DMA_CCMD_MASK_NOBIT 0
+#define DMA_CCMD_MASK_1BIT 1
+#define DMA_CCMD_MASK_2BIT 2
+#define DMA_CCMD_MASK_3BIT 3
+#define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16)
+#define DMA_CCMD_DID(d) ((u64)((d) & 0xffff))
+
+/* FECTL_REG */
+#define DMA_FECTL_IM (((u32)1) << 31)
+
+/* FSTS_REG */
+#define DMA_FSTS_PPF ((u32)2)
+#define DMA_FSTS_PFO ((u32)1)
+#define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff)
+
+/* FRCD_REG, 32 bits access */
+#define DMA_FRCD_F (((u32)1) << 31)
+#define dma_frcd_type(d) ((d >> 30) & 1)
+#define dma_frcd_fault_reason(c) (c & 0xff)
+#define dma_frcd_source_id(c) (c & 0xffff)
+#define dma_frcd_page_addr(d) (d & (((u64)-1) << 12)) /* low 64 bit */
+
+#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) /* 10sec */
+
+#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
+{\
+	cycles_t start_time = get_cycles();\
+	while (1) {\
+		sts = op (iommu->reg + offset);\
+		if (cond)\
+			break;\
+		if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\
+			panic("DMAR hardware is malfunctioning\n");\
+		cpu_relax();\
+	}\
+}
+
+#define QI_LENGTH	256	/* queue length */
+
+enum {
+	QI_FREE,
+	QI_IN_USE,
+	QI_DONE
+};
+
+#define QI_CC_TYPE		0x1
+#define QI_IOTLB_TYPE		0x2
+#define QI_DIOTLB_TYPE		0x3
+#define QI_IEC_TYPE		0x4
+#define QI_IWD_TYPE		0x5
+
+#define QI_IEC_SELECTIVE	(((u64)1) << 4)
+#define QI_IEC_IIDEX(idx)	(((u64)(idx & 0xffff) << 32))
+#define QI_IEC_IM(m)		(((u64)(m & 0x1f) << 27))
+
+#define QI_IWD_STATUS_DATA(d)	(((u64)d) << 32)
+#define QI_IWD_STATUS_WRITE	(((u64)1) << 5)
+
+struct qi_desc {
+	u64 low, high;
+};
+
+struct q_inval {
+	spinlock_t      q_lock;
+	struct qi_desc  *desc;          /* invalidation queue */
+	int             *desc_status;   /* desc status */
+	int             free_head;      /* first free entry */
+	int             free_tail;      /* last free entry */
+	int             free_cnt;
+};
+
+#ifdef CONFIG_INTR_REMAP
+/* 1MB - maximum possible interrupt remapping table size */
+#define INTR_REMAP_PAGE_ORDER	8
+#define INTR_REMAP_TABLE_REG_SIZE	0xf
+
+#define INTR_REMAP_TABLE_ENTRIES	65536
+
+struct ir_table {
+	struct irte *base;
+};
+#endif
+
+struct intel_iommu {
+	void __iomem	*reg; /* Pointer to hardware regs, virtual addr */
+	u64		cap;
+	u64		ecap;
+	int		seg;
+	u32		gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */
+	spinlock_t	register_lock; /* protect register handling */
+	int		seq_id;	/* sequence id of the iommu */
+
+#ifdef CONFIG_DMAR
+	unsigned long 	*domain_ids; /* bitmap of domains */
+	struct dmar_domain **domains; /* ptr to domains */
+	spinlock_t	lock; /* protect context, domain ids */
+	struct root_entry *root_entry; /* virtual address */
+
+	unsigned int irq;
+	unsigned char name[7];    /* Device Name */
+	struct msi_msg saved_msg;
+	struct sys_device sysdev;
+#endif
+	struct q_inval  *qi;            /* Queued invalidation info */
+#ifdef CONFIG_INTR_REMAP
+	struct ir_table *ir_table;	/* Interrupt remapping info */
+#endif
+};
+
+static inline void __iommu_flush_cache(
+	struct intel_iommu *iommu, void *addr, int size)
+{
+	if (!ecap_coherent(iommu->ecap))
+		clflush_cache_range(addr, size);
+}
+
+extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
+
+extern int alloc_iommu(struct dmar_drhd_unit *drhd);
+extern void free_iommu(struct intel_iommu *iommu);
+extern int dmar_enable_qi(struct intel_iommu *iommu);
+extern void qi_global_iec(struct intel_iommu *iommu);
+
+extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
+
+void intel_iommu_domain_exit(struct dmar_domain *domain);
+struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev);
+int intel_iommu_context_mapping(struct dmar_domain *domain,
+				struct pci_dev *pdev);
+int intel_iommu_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
+			     u64 hpa, size_t size, int prot);
+void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn);
+struct dmar_domain *intel_iommu_find_domain(struct pci_dev *pdev);
+u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova);
+
+#ifdef CONFIG_DMAR
+int intel_iommu_found(void);
+#else /* CONFIG_DMAR */
+static inline int intel_iommu_found(void)
+{
+	return 0;
+}
+#endif /* CONFIG_DMAR */
+
+#endif
diff --git a/include/linux/iova.h b/include/linux/iova.h
new file mode 100644
index 0000000..228f6c9
--- /dev/null
+++ b/include/linux/iova.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This file is released under the GPLv2.
+ *
+ * Copyright (C) 2006-2008 Intel Corporation
+ * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ *
+ */
+
+#ifndef _IOVA_H_
+#define _IOVA_H_
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/rbtree.h>
+#include <linux/dma-mapping.h>
+
+/* IO virtual address start page frame number */
+#define IOVA_START_PFN		(1)
+
+/* iova structure */
+struct iova {
+	struct rb_node	node;
+	unsigned long	pfn_hi; /* IOMMU dish out addr hi */
+	unsigned long	pfn_lo; /* IOMMU dish out addr lo */
+};
+
+/* holds all the iova translations for a domain */
+struct iova_domain {
+	spinlock_t	iova_alloc_lock;/* Lock to protect iova  allocation */
+	spinlock_t	iova_rbtree_lock; /* Lock to protect update of rbtree */
+	struct rb_root	rbroot;		/* iova domain rbtree root */
+	struct rb_node	*cached32_node; /* Save last alloced node */
+	unsigned long	dma_32bit_pfn;
+};
+
+struct iova *alloc_iova_mem(void);
+void free_iova_mem(struct iova *iova);
+void free_iova(struct iova_domain *iovad, unsigned long pfn);
+void __free_iova(struct iova_domain *iovad, struct iova *iova);
+struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
+	unsigned long limit_pfn,
+	bool size_aligned);
+struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
+	unsigned long pfn_hi);
+void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
+void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit);
+struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
+void put_iova_domain(struct iova_domain *iovad);
+
+#endif
-- 
cgit v0.10.2


From 62c476c7c7f25a5b245b9902a935636e6316e58c Mon Sep 17 00:00:00 2001
From: Ben-Ami Yassour <benami@il.ibm.com>
Date: Sun, 14 Sep 2008 03:48:28 +0300
Subject: KVM: Device Assignment with VT-d

Based on a patch by: Kay, Allen M <allen.m.kay@intel.com>

This patch enables PCI device assignment based on VT-d support.
When a device is assigned to the guest, the guest memory is pinned and
the mapping is updated in the VT-d IOMMU.

[Amit: Expose KVM_CAP_IOMMU so we can check if an IOMMU is present
and also control enable/disable from userspace]

Signed-off-by: Kay, Allen M <allen.m.kay@intel.com>
Signed-off-by: Weidong Han <weidong.han@intel.com>
Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
Signed-off-by: Amit Shah <amit.shah@qumranet.com>

Acked-by: Mark Gross <mgross@linux.intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d0e940b..3072b17 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,6 +12,9 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
 
 kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
 	i8254.o
+ifeq ($(CONFIG_DMAR),y)
+kvm-objs += vtd.o
+endif
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/vtd.c b/arch/x86/kvm/vtd.c
new file mode 100644
index 0000000..667bf3f
--- /dev/null
+++ b/arch/x86/kvm/vtd.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) 2006-2008 Intel Corporation
+ * Copyright IBM Corporation, 2008
+ * Author: Allen M. Kay <allen.m.kay@intel.com>
+ * Author: Weidong Han <weidong.han@intel.com>
+ * Author: Ben-Ami Yassour <benami@il.ibm.com>
+ */
+
+#include <linux/list.h>
+#include <linux/kvm_host.h>
+#include <linux/pci.h>
+#include <linux/dmar.h>
+#include <linux/intel-iommu.h>
+
+static int kvm_iommu_unmap_memslots(struct kvm *kvm);
+static void kvm_iommu_put_pages(struct kvm *kvm,
+				gfn_t base_gfn, unsigned long npages);
+
+int kvm_iommu_map_pages(struct kvm *kvm,
+			gfn_t base_gfn, unsigned long npages)
+{
+	gfn_t gfn = base_gfn;
+	pfn_t pfn;
+	int i, r;
+	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
+
+	/* check if iommu exists and in use */
+	if (!domain)
+		return 0;
+
+	r = -EINVAL;
+	for (i = 0; i < npages; i++) {
+		/* check if already mapped */
+		pfn = (pfn_t)intel_iommu_iova_to_pfn(domain,
+						     gfn_to_gpa(gfn));
+		if (pfn && !is_mmio_pfn(pfn))
+			continue;
+
+		pfn = gfn_to_pfn(kvm, gfn);
+		if (!is_mmio_pfn(pfn)) {
+			r = intel_iommu_page_mapping(domain,
+						     gfn_to_gpa(gfn),
+						     pfn_to_hpa(pfn),
+						     PAGE_SIZE,
+						     DMA_PTE_READ |
+						     DMA_PTE_WRITE);
+			if (r) {
+				printk(KERN_DEBUG "kvm_iommu_map_pages:"
+				       "iommu failed to map pfn=%lx\n", pfn);
+				goto unmap_pages;
+			}
+		} else {
+			printk(KERN_DEBUG "kvm_iommu_map_page:"
+			       "invalid pfn=%lx\n", pfn);
+			goto unmap_pages;
+		}
+		gfn++;
+	}
+	return 0;
+
+unmap_pages:
+	kvm_iommu_put_pages(kvm, base_gfn, i);
+	return r;
+}
+
+static int kvm_iommu_map_memslots(struct kvm *kvm)
+{
+	int i, r;
+
+	down_read(&kvm->slots_lock);
+	for (i = 0; i < kvm->nmemslots; i++) {
+		r = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn,
+					kvm->memslots[i].npages);
+		if (r)
+			break;
+	}
+	up_read(&kvm->slots_lock);
+	return r;
+}
+
+int kvm_iommu_map_guest(struct kvm *kvm,
+			struct kvm_assigned_dev_kernel *assigned_dev)
+{
+	struct pci_dev *pdev = NULL;
+	int r;
+
+	if (!intel_iommu_found()) {
+		printk(KERN_ERR "%s: intel iommu not found\n", __func__);
+		return -ENODEV;
+	}
+
+	printk(KERN_DEBUG "VT-d direct map: host bdf = %x:%x:%x\n",
+	       assigned_dev->host_busnr,
+	       PCI_SLOT(assigned_dev->host_devfn),
+	       PCI_FUNC(assigned_dev->host_devfn));
+
+	pdev = assigned_dev->dev;
+
+	if (pdev == NULL) {
+		if (kvm->arch.intel_iommu_domain) {
+			intel_iommu_domain_exit(kvm->arch.intel_iommu_domain);
+			kvm->arch.intel_iommu_domain = NULL;
+		}
+		return -ENODEV;
+	}
+
+	kvm->arch.intel_iommu_domain = intel_iommu_domain_alloc(pdev);
+	if (!kvm->arch.intel_iommu_domain)
+		return -ENODEV;
+
+	r = kvm_iommu_map_memslots(kvm);
+	if (r)
+		goto out_unmap;
+
+	intel_iommu_detach_dev(kvm->arch.intel_iommu_domain,
+			       pdev->bus->number, pdev->devfn);
+
+	r = intel_iommu_context_mapping(kvm->arch.intel_iommu_domain,
+					pdev);
+	if (r) {
+		printk(KERN_ERR "Domain context map for %s failed",
+		       pci_name(pdev));
+		goto out_unmap;
+	}
+	return 0;
+
+out_unmap:
+	kvm_iommu_unmap_memslots(kvm);
+	return r;
+}
+
+static void kvm_iommu_put_pages(struct kvm *kvm,
+			       gfn_t base_gfn, unsigned long npages)
+{
+	gfn_t gfn = base_gfn;
+	pfn_t pfn;
+	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
+	int i;
+
+	for (i = 0; i < npages; i++) {
+		pfn = (pfn_t)intel_iommu_iova_to_pfn(domain,
+						     gfn_to_gpa(gfn));
+		kvm_release_pfn_clean(pfn);
+		gfn++;
+	}
+}
+
+static int kvm_iommu_unmap_memslots(struct kvm *kvm)
+{
+	int i;
+	down_read(&kvm->slots_lock);
+	for (i = 0; i < kvm->nmemslots; i++) {
+		kvm_iommu_put_pages(kvm, kvm->memslots[i].base_gfn,
+				    kvm->memslots[i].npages);
+	}
+	up_read(&kvm->slots_lock);
+
+	return 0;
+}
+
+int kvm_iommu_unmap_guest(struct kvm *kvm)
+{
+	struct kvm_assigned_dev_kernel *entry;
+	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
+
+	/* check if iommu exists and in use */
+	if (!domain)
+		return 0;
+
+	list_for_each_entry(entry, &kvm->arch.assigned_dev_head, list) {
+		printk(KERN_DEBUG "VT-d unmap: host bdf = %x:%x:%x\n",
+		       entry->host_busnr,
+		       PCI_SLOT(entry->host_devfn),
+		       PCI_FUNC(entry->host_devfn));
+
+		/* detach kvm dmar domain */
+		intel_iommu_detach_dev(domain, entry->host_busnr,
+				       entry->host_devfn);
+	}
+	kvm_iommu_unmap_memslots(kvm);
+	intel_iommu_domain_exit(domain);
+	return 0;
+}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2134f3e..c8a2793 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -35,6 +35,7 @@
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/highmem.h>
+#include <linux/intel-iommu.h>
 
 #include <asm/uaccess.h>
 #include <asm/msr.h>
@@ -277,9 +278,18 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 
 	list_add(&match->list, &kvm->arch.assigned_dev_head);
 
+	if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
+		r = kvm_iommu_map_guest(kvm, match);
+		if (r)
+			goto out_list_del;
+	}
+
 out:
 	mutex_unlock(&kvm->lock);
 	return r;
+out_list_del:
+	list_del(&match->list);
+	pci_release_regions(dev);
 out_disable:
 	pci_disable_device(dev);
 out_put:
@@ -1147,6 +1157,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PV_MMU:
 		r = !tdp_enabled;
 		break;
+	case KVM_CAP_IOMMU:
+		r = intel_iommu_found();
+		break;
 	default:
 		r = 0;
 		break;
@@ -4282,6 +4295,7 @@ static void kvm_free_vcpus(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+	kvm_iommu_unmap_guest(kvm);
 	kvm_free_assigned_devices(kvm);
 	kvm_free_pit(kvm);
 	kfree(kvm->arch.vpic);
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 68a3ac1..805629c 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -331,26 +331,6 @@ struct kvm_mem_alias {
 	gfn_t target_gfn;
 };
 
-struct kvm_irq_ack_notifier {
-	struct hlist_node link;
-	unsigned gsi;
-	void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
-};
-
-struct kvm_assigned_dev_kernel {
-	struct kvm_irq_ack_notifier ack_notifier;
-	struct work_struct interrupt_work;
-	struct list_head list;
-	int assigned_dev_id;
-	int host_busnr;
-	int host_devfn;
-	int host_irq;
-	int guest_irq;
-	int irq_requested;
-	struct pci_dev *dev;
-	struct kvm *kvm;
-};
-
 struct kvm_arch{
 	int naliases;
 	struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
@@ -364,6 +344,7 @@ struct kvm_arch{
 	 */
 	struct list_head active_mmu_pages;
 	struct list_head assigned_dev_head;
+	struct dmar_domain *intel_iommu_domain;
 	struct kvm_pic *vpic;
 	struct kvm_ioapic *vioapic;
 	struct kvm_pit *vpit;
@@ -514,6 +495,8 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
 		  gpa_t addr, unsigned long *ret);
 
+int is_mmio_pfn(pfn_t pfn);
+
 extern bool tdp_enabled;
 
 enum emulation_result {
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ef4bc6f..4269be1 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -384,6 +384,7 @@ struct kvm_trace_rec {
 #define KVM_CAP_COALESCED_MMIO 15
 #define KVM_CAP_SYNC_MMU 16  /* Changes to host mmap are reflected in guest */
 #define KVM_CAP_DEVICE_ASSIGNMENT 17
+#define KVM_CAP_IOMMU 18
 
 /*
  * ioctls for VM fds
@@ -495,4 +496,6 @@ struct kvm_assigned_irq {
 	__u32 flags;
 };
 
+#define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
+
 #endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4b03643..6252802 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -286,6 +286,53 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
 
+struct kvm_irq_ack_notifier {
+	struct hlist_node link;
+	unsigned gsi;
+	void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
+};
+
+struct kvm_assigned_dev_kernel {
+	struct kvm_irq_ack_notifier ack_notifier;
+	struct work_struct interrupt_work;
+	struct list_head list;
+	int assigned_dev_id;
+	int host_busnr;
+	int host_devfn;
+	int host_irq;
+	int guest_irq;
+	int irq_requested;
+	struct pci_dev *dev;
+	struct kvm *kvm;
+};
+
+#ifdef CONFIG_DMAR
+int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn,
+			unsigned long npages);
+int kvm_iommu_map_guest(struct kvm *kvm,
+			struct kvm_assigned_dev_kernel *assigned_dev);
+int kvm_iommu_unmap_guest(struct kvm *kvm);
+#else /* CONFIG_DMAR */
+static inline int kvm_iommu_map_pages(struct kvm *kvm,
+				      gfn_t base_gfn,
+				      unsigned long npages)
+{
+	return 0;
+}
+
+static inline int kvm_iommu_map_guest(struct kvm *kvm,
+				      struct kvm_assigned_dev_kernel
+				      *assigned_dev)
+{
+	return -ENODEV;
+}
+
+static inline int kvm_iommu_unmap_guest(struct kvm *kvm)
+{
+	return 0;
+}
+#endif /* CONFIG_DMAR */
+
 static inline void kvm_guest_enter(void)
 {
 	account_system_vtime(current);
@@ -308,6 +355,11 @@ static inline gpa_t gfn_to_gpa(gfn_t gfn)
 	return (gpa_t)gfn << PAGE_SHIFT;
 }
 
+static inline hpa_t pfn_to_hpa(pfn_t pfn)
+{
+	return (hpa_t)pfn << PAGE_SHIFT;
+}
+
 static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu)
 {
 	set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 63e661b..f42d5c2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -41,6 +41,7 @@
 #include <linux/pagemap.h>
 #include <linux/mman.h>
 #include <linux/swap.h>
+#include <linux/intel-iommu.h>
 
 #include <asm/processor.h>
 #include <asm/io.h>
@@ -76,7 +77,7 @@ static inline int valid_vcpu(int n)
 	return likely(n >= 0 && n < KVM_MAX_VCPUS);
 }
 
-static inline int is_mmio_pfn(pfn_t pfn)
+inline int is_mmio_pfn(pfn_t pfn)
 {
 	if (pfn_valid(pfn))
 		return PageReserved(pfn_to_page(pfn));
@@ -578,6 +579,12 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	}
 
 	kvm_free_physmem_slot(&old, &new);
+
+	/* map the pages in iommu page table */
+	r = kvm_iommu_map_pages(kvm, base_gfn, npages);
+	if (r)
+		goto out;
+
 	return 0;
 
 out_free:
-- 
cgit v0.10.2


From bfadaded0dc323a1cf3f08b5068f12955b54cbaa Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@redhat.com>
Date: Tue, 16 Sep 2008 18:04:28 +0300
Subject: KVM: Device Assignment: Free device structures if IRQ allocation
 fails

When an IRQ allocation fails, we free up the device structures and
disable the device so that we can unregister the device in the
userspace and not expose it to the guest at all.

Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c8a2793..61eddbe 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -166,6 +166,43 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 	enable_irq(dev->host_irq);
 }
 
+static void kvm_free_assigned_device(struct kvm *kvm,
+				     struct kvm_assigned_dev_kernel
+				     *assigned_dev)
+{
+	if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested)
+		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+
+	kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
+
+	if (cancel_work_sync(&assigned_dev->interrupt_work))
+		/* We had pending work. That means we will have to take
+		 * care of kvm_put_kvm.
+		 */
+		kvm_put_kvm(kvm);
+
+	pci_release_regions(assigned_dev->dev);
+	pci_disable_device(assigned_dev->dev);
+	pci_dev_put(assigned_dev->dev);
+
+	list_del(&assigned_dev->list);
+	kfree(assigned_dev);
+}
+
+static void kvm_free_all_assigned_devices(struct kvm *kvm)
+{
+	struct list_head *ptr, *ptr2;
+	struct kvm_assigned_dev_kernel *assigned_dev;
+
+	list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
+		assigned_dev = list_entry(ptr,
+					  struct kvm_assigned_dev_kernel,
+					  list);
+
+		kvm_free_assigned_device(kvm, assigned_dev);
+	}
+}
+
 static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 				   struct kvm_assigned_irq
 				   *assigned_irq)
@@ -194,8 +231,8 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 
 	if (irqchip_in_kernel(kvm)) {
 		if (!capable(CAP_SYS_RAWIO)) {
-			return -EPERM;
-			goto out;
+			r = -EPERM;
+			goto out_release;
 		}
 
 		if (assigned_irq->host_irq)
@@ -214,17 +251,18 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 		 */
 		if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0,
 				"kvm_assigned_device", (void *)match)) {
-			printk(KERN_INFO "%s: couldn't allocate irq for pv "
-			       "device\n", __func__);
 			r = -EIO;
-			goto out;
+			goto out_release;
 		}
 	}
 
 	match->irq_requested = true;
-out:
 	mutex_unlock(&kvm->lock);
 	return r;
+out_release:
+	mutex_unlock(&kvm->lock);
+	kvm_free_assigned_device(kvm, match);
+	return r;
 }
 
 static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
@@ -300,40 +338,6 @@ out_free:
 	return r;
 }
 
-static void kvm_free_assigned_devices(struct kvm *kvm)
-{
-	struct list_head *ptr, *ptr2;
-	struct kvm_assigned_dev_kernel *assigned_dev;
-
-	list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
-		assigned_dev = list_entry(ptr,
-					  struct kvm_assigned_dev_kernel,
-					  list);
-
-		if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested) {
-			free_irq(assigned_dev->host_irq,
-				 (void *)assigned_dev);
-
-			kvm_unregister_irq_ack_notifier(kvm,
-							&assigned_dev->
-							ack_notifier);
-		}
-
-		if (cancel_work_sync(&assigned_dev->interrupt_work))
-			/* We had pending work. That means we will have to take
-			 * care of kvm_put_kvm.
-			 */
-			kvm_put_kvm(kvm);
-
-		pci_release_regions(assigned_dev->dev);
-		pci_disable_device(assigned_dev->dev);
-		pci_dev_put(assigned_dev->dev);
-
-		list_del(&assigned_dev->list);
-		kfree(assigned_dev);
-	}
-}
-
 unsigned long segment_base(u16 selector)
 {
 	struct descriptor_table gdt;
@@ -4296,7 +4300,7 @@ static void kvm_free_vcpus(struct kvm *kvm)
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
 	kvm_iommu_unmap_guest(kvm);
-	kvm_free_assigned_devices(kvm);
+	kvm_free_all_assigned_devices(kvm);
 	kvm_free_pit(kvm);
 	kfree(kvm->arch.vpic);
 	kfree(kvm->arch.vioapic);
-- 
cgit v0.10.2


From 777b3f49d297e387866604093b635e5bc9b9d2a6 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 16 Sep 2008 20:54:46 -0300
Subject: KVM: opencode gfn_to_page in kvm_vm_fault

kvm_vm_fault is invoked with mmap_sem held in read mode. Since gfn_to_page
will be converted to get_user_pages_fast, which requires this lock NOT
to be held, switch to opencoded get_user_pages.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f42d5c2..2907d05 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1394,17 +1394,22 @@ out:
 
 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page[1];
+	unsigned long addr;
+	int npages;
+	gfn_t gfn = vmf->pgoff;
 	struct kvm *kvm = vma->vm_file->private_data;
-	struct page *page;
 
-	if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
+	addr = gfn_to_hva(kvm, gfn);
+	if (kvm_is_error_hva(addr))
 		return VM_FAULT_SIGBUS;
-	page = gfn_to_page(kvm, vmf->pgoff);
-	if (is_error_page(page)) {
-		kvm_release_page_clean(page);
+
+	npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
+				NULL);
+	if (unlikely(npages != 1))
 		return VM_FAULT_SIGBUS;
-	}
-	vmf->page = page;
+
+	vmf->page = page[0];
 	return 0;
 }
 
-- 
cgit v0.10.2


From 4c2155ce81c193788082d4b8cdbc26d79edebc58 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 16 Sep 2008 20:54:47 -0300
Subject: KVM: switch to get_user_pages_fast

Convert gfn_to_pfn to use get_user_pages_fast, which can do lockless
pagetable lookups on x86. Kernel compilation on 4-way guest is 3.7%
faster on VMX.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 7b11fd7..2e227a4 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -147,9 +147,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
 	stlbe = &vcpu->arch.shadow_tlb[victim];
 
 	/* Get reference to new page. */
-	down_read(&current->mm->mmap_sem);
 	new_page = gfn_to_page(vcpu->kvm, gfn);
-	up_read(&current->mm->mmap_sem);
 	if (is_error_page(new_page)) {
 		printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn);
 		kvm_release_page_clean(new_page);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index bce3e25..5779a23 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -405,16 +405,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
 {
 	struct vm_area_struct *vma;
 	unsigned long addr;
+	int ret = 0;
 
 	addr = gfn_to_hva(kvm, gfn);
 	if (kvm_is_error_hva(addr))
-		return 0;
+		return ret;
 
+	down_read(&current->mm->mmap_sem);
 	vma = find_vma(current->mm, addr);
 	if (vma && is_vm_hugetlb_page(vma))
-		return 1;
+		ret = 1;
+	up_read(&current->mm->mmap_sem);
 
-	return 0;
+	return ret;
 }
 
 static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
@@ -1140,9 +1143,7 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 	if (gpa == UNMAPPED_GVA)
 		return NULL;
 
-	down_read(&current->mm->mmap_sem);
 	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-	up_read(&current->mm->mmap_sem);
 
 	return page;
 }
@@ -1330,16 +1331,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	pfn_t pfn;
 	unsigned long mmu_seq;
 
-	down_read(&current->mm->mmap_sem);
 	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
 		largepage = 1;
 	}
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
-	/* implicit mb(), we'll read before PT lock is unlocked */
+	smp_rmb();
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
-	up_read(&current->mm->mmap_sem);
 
 	/* mmio */
 	if (is_error_pfn(pfn)) {
@@ -1488,15 +1487,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 	if (r)
 		return r;
 
-	down_read(&current->mm->mmap_sem);
 	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
 		largepage = 1;
 	}
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
-	/* implicit mb(), we'll read before PT lock is unlocked */
+	smp_rmb();
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
-	up_read(&current->mm->mmap_sem);
 	if (is_error_pfn(pfn)) {
 		kvm_release_pfn_clean(pfn);
 		return 1;
@@ -1809,15 +1806,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		return;
 	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 
-	down_read(&current->mm->mmap_sem);
 	if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
 		vcpu->arch.update_pte.largepage = 1;
 	}
 	vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
-	/* implicit mb(), we'll read before PT lock is unlocked */
+	smp_rmb();
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
-	up_read(&current->mm->mmap_sem);
 
 	if (is_error_pfn(pfn)) {
 		kvm_release_pfn_clean(pfn);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index b671f61..6dd08e0 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -102,14 +102,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
 	pt_element_t *table;
 	struct page *page;
 
-	down_read(&current->mm->mmap_sem);
 	page = gfn_to_page(kvm, table_gfn);
-	up_read(&current->mm->mmap_sem);
 
 	table = kmap_atomic(page, KM_USER0);
-
 	ret = CMPXCHG(&table[index], orig_pte, new_pte);
-
 	kunmap_atomic(table, KM_USER0);
 
 	kvm_release_page_dirty(page);
@@ -418,7 +414,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 		return 0;
 	}
 
-	down_read(&current->mm->mmap_sem);
 	if (walker.level == PT_DIRECTORY_LEVEL) {
 		gfn_t large_gfn;
 		large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
@@ -428,9 +423,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 		}
 	}
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
-	/* implicit mb(), we'll read before PT lock is unlocked */
+	smp_rmb();
 	pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
-	up_read(&current->mm->mmap_sem);
 
 	/* mmio */
 	if (is_error_pfn(pfn)) {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 046a91b..025bf40 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2010,9 +2010,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
 	if (r)
 		goto out;
 
-	down_read(&current->mm->mmap_sem);
 	kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
-	up_read(&current->mm->mmap_sem);
 out:
 	up_write(&kvm->slots_lock);
 	return r;
@@ -2034,10 +2032,8 @@ static int alloc_identity_pagetable(struct kvm *kvm)
 	if (r)
 		goto out;
 
-	down_read(&current->mm->mmap_sem);
 	kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
 			VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
-	up_read(&current->mm->mmap_sem);
 out:
 	up_write(&kvm->slots_lock);
 	return r;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 61eddbe..108f072 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -946,10 +946,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		/* ...but clean it before doing the actual write */
 		vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
 
-		down_read(&current->mm->mmap_sem);
 		vcpu->arch.time_page =
 				gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
-		up_read(&current->mm->mmap_sem);
 
 		if (is_error_page(vcpu->arch.time_page)) {
 			kvm_release_page_clean(vcpu->arch.time_page);
@@ -2322,9 +2320,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 
 		val = *(u64 *)new;
 
-		down_read(&current->mm->mmap_sem);
 		page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-		up_read(&current->mm->mmap_sem);
 
 		kaddr = kmap_atomic(page, KM_USER0);
 		set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
@@ -3089,9 +3085,7 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
 	if (!apic || !apic->vapic_addr)
 		return;
 
-	down_read(&current->mm->mmap_sem);
 	page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
-	up_read(&current->mm->mmap_sem);
 
 	vcpu->arch.apic->vapic_page = page;
 }
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2907d05..cd34f73 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -723,9 +723,6 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_hva);
 
-/*
- * Requires current->mm->mmap_sem to be held
- */
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 {
 	struct page *page[1];
@@ -741,20 +738,23 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 		return page_to_pfn(bad_page);
 	}
 
-	npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
-				NULL);
+	npages = get_user_pages_fast(addr, 1, 1, page);
 
 	if (unlikely(npages != 1)) {
 		struct vm_area_struct *vma;
 
+		down_read(&current->mm->mmap_sem);
 		vma = find_vma(current->mm, addr);
+
 		if (vma == NULL || addr < vma->vm_start ||
 		    !(vma->vm_flags & VM_PFNMAP)) {
+			up_read(&current->mm->mmap_sem);
 			get_page(bad_page);
 			return page_to_pfn(bad_page);
 		}
 
 		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+		up_read(&current->mm->mmap_sem);
 		BUG_ON(!is_mmio_pfn(pfn));
 	} else
 		pfn = page_to_pfn(page[0]);
-- 
cgit v0.10.2


From 2259e3a7a6089007839cd4bbf7c9867190c67238 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Fri, 22 Aug 2008 13:29:17 -0700
Subject: KVM: x86.c make kvm_load_realmode_segment static

Noticed by sparse:
arch/x86/kvm/x86.c:3591:5: warning: symbol 'kvm_load_realmode_segment' was not declared. Should it be static?

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 108f072..1b738cb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3611,7 +3611,7 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
+static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
 {
 	struct kvm_segment segvar = {
 		.base = selector << 4,
-- 
cgit v0.10.2


From 9ea1de4ea4b04d5132eb74917ecea77dda13ce76 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Fri, 19 Sep 2008 19:25:30 -0700
Subject: MAINTAINERS: Update Avi Kivity's email address

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/MAINTAINERS b/MAINTAINERS
index 87ba4a6..24fd693 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2443,7 +2443,7 @@ S:	Supported
 
 KERNEL VIRTUAL MACHINE (KVM)
 P:	Avi Kivity
-M:	avi@qumranet.com
+M:	avi@redhat.com
 L:	kvm@vger.kernel.org
 W:	http://kvm.qumranet.com
 S:	Supported
-- 
cgit v0.10.2


From af2152f5457448bd90cb019c108e0a85e716fdbe Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@qumranet.com>
Date: Mon, 22 Sep 2008 14:28:53 +0300
Subject: KVM: don't enter guest after SIPI was received by a CPU

The vcpu should process pending SIPI message before entering guest mode again.
kvm_arch_vcpu_runnable() returns true if the vcpu is in SIPI state, so
we can't call it here.

Signed-off-by: Gleb Natapov <gleb@qumranet.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1b738cb..08edeab 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3233,7 +3233,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	r = 1;
 	while (r > 0) {
-		if (kvm_arch_vcpu_runnable(vcpu))
+		if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
 			r = vcpu_enter_guest(vcpu, kvm_run);
 		else {
 			up_read(&vcpu->kvm->slots_lock);
-- 
cgit v0.10.2


From 7d8fece678c1abc2ca3e1ceda2277c3538a9161c Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Wed, 17 Sep 2008 23:16:59 -0300
Subject: KVM: Don't destroy vcpu in case vcpu_setup fails

One of vcpu_setup responsibilities is to do mmu initialization.
However, in case we fail in kvm_arch_vcpu_reset, before we get the
chance to init mmu. OTOH, vcpu_destroy will attempt to destroy mmu,
triggering a bug. Keeping track of whether or not mmu is initialized
would unnecessarily complicate things. Rather, we just make return,
making sure any needed uninitialization is done before we return, in
case we fail.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index cd34f73..ef9a121 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1089,12 +1089,11 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
 
 	r = kvm_arch_vcpu_setup(vcpu);
 	if (r)
-		goto vcpu_destroy;
+		return r;
 
 	mutex_lock(&kvm->lock);
 	if (kvm->vcpus[n]) {
 		r = -EEXIST;
-		mutex_unlock(&kvm->lock);
 		goto vcpu_destroy;
 	}
 	kvm->vcpus[n] = vcpu;
@@ -1110,8 +1109,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
 unlink:
 	mutex_lock(&kvm->lock);
 	kvm->vcpus[n] = NULL;
-	mutex_unlock(&kvm->lock);
 vcpu_destroy:
+	mutex_unlock(&kvm->lock);
 	kvm_arch_vcpu_destroy(vcpu);
 	return r;
 }
-- 
cgit v0.10.2


From 271b05281f7f485a0be8764860687ebb98459b80 Mon Sep 17 00:00:00 2001
From: Weidong Han <weidong.han@intel.com>
Date: Wed, 24 Sep 2008 13:39:57 +0800
Subject: KVM: Remove useless intel-iommu.h header inclusion

Currently "#include <linux/intel-iommu.h>" is not needed in
virt/kvm/kvm_main.c.

Signed-off-by: Weidong Han <weidong.han@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ef9a121..6cf0427 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -41,7 +41,6 @@
 #include <linux/pagemap.h>
 #include <linux/mman.h>
 #include <linux/swap.h>
-#include <linux/intel-iommu.h>
 
 #include <asm/processor.h>
 #include <asm/io.h>
-- 
cgit v0.10.2


From 81aec5227eedf9035e8544d8021ca6b8fb7c357a Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Fri, 12 Sep 2008 20:23:11 +0800
Subject: KVM: ia64: Implement a uniform vps interface

An uniform entry kvm_vps_entry is added for
vps_sync_write/read, vps_resume_handler/guest,
and branches to differnt PAL service according to the offset.

Singed-off-by: Anthony Xu <anthony.xu@intel.com>
Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/kvm/kvm_minstate.h b/arch/ia64/kvm/kvm_minstate.h
index 13980d9..2cc41d1 100644
--- a/arch/ia64/kvm/kvm_minstate.h
+++ b/arch/ia64/kvm/kvm_minstate.h
@@ -50,27 +50,18 @@
 
 #define PAL_VSA_SYNC_READ						\
 	/* begin to call pal vps sync_read */				\
+{.mii;									\
 	add r25 = VMM_VPD_BASE_OFFSET, r21;				\
-	adds r20 = VMM_VCPU_VSA_BASE_OFFSET, r21;  /* entry point */	\
+	nop 0x0;							\
+	mov r24=ip;							\
 	;;								\
+}									\
+{.mmb									\
+	add r24=0x20, r24;						\
 	ld8 r25 = [r25];      /* read vpd base */			\
-	ld8 r20 = [r20];						\
-	;;								\
-	add r20 = PAL_VPS_SYNC_READ,r20;				\
-	;;								\
-{ .mii;									\
-	nop 0x0;							\
-	mov r24 = ip;							\
-	mov b0 = r20;							\
+	br.cond.sptk kvm_vps_sync_read;		/*call the service*/	\
 	;;								\
 };									\
-{ .mmb;									\
-	add r24 = 0x20, r24;						\
-	nop 0x0;							\
-	br.cond.sptk b0;        /*  call the service */			\
-	;;								\
-};
-
 
 
 #define KVM_MINSTATE_GET_CURRENT(reg)   mov reg=r21
diff --git a/arch/ia64/kvm/optvfault.S b/arch/ia64/kvm/optvfault.S
index e4f15d6..f0bf0a8 100644
--- a/arch/ia64/kvm/optvfault.S
+++ b/arch/ia64/kvm/optvfault.S
@@ -20,6 +20,75 @@
 #define ACCE_MOV_TO_PSR
 #define ACCE_THASH
 
+ENTRY(kvm_vps_entry)
+	adds r29 = VMM_VCPU_VSA_BASE_OFFSET,r21
+	;;
+	ld8 r29 = [r29]
+	;;
+	add r29 = r29, r30
+	;;
+	mov b0 = r29
+	br.sptk.many b0
+END(kvm_vps_entry)
+
+/*
+ *	Inputs:
+ *	r24 : return address
+ *  	r25 : vpd
+ *	r29 : scratch
+ *
+ */
+GLOBAL_ENTRY(kvm_vps_sync_read)
+	movl r30 = PAL_VPS_SYNC_READ
+	;;
+	br.sptk.many kvm_vps_entry
+END(kvm_vps_sync_read)
+
+/*
+ *	Inputs:
+ *	r24 : return address
+ *  	r25 : vpd
+ *	r29 : scratch
+ *
+ */
+GLOBAL_ENTRY(kvm_vps_sync_write)
+	movl r30 = PAL_VPS_SYNC_WRITE
+	;;
+	br.sptk.many kvm_vps_entry
+END(kvm_vps_sync_write)
+
+/*
+ *	Inputs:
+ *	r23 : pr
+ *	r24 : guest b0
+ *  	r25 : vpd
+ *
+ */
+GLOBAL_ENTRY(kvm_vps_resume_normal)
+	movl r30 = PAL_VPS_RESUME_NORMAL
+	;;
+	mov pr=r23,-2
+	br.sptk.many kvm_vps_entry
+END(kvm_vps_resume_normal)
+
+/*
+ *	Inputs:
+ *	r23 : pr
+ *	r24 : guest b0
+ *  	r25 : vpd
+ *	r17 : isr
+ */
+GLOBAL_ENTRY(kvm_vps_resume_handler)
+	movl r30 = PAL_VPS_RESUME_HANDLER
+	;;
+	ld8 r27=[r25]
+	shr r17=r17,IA64_ISR_IR_BIT
+	;;
+	dep r27=r17,r27,63,1   // bit 63 of r27 indicate whether enable CFLE
+	mov pr=r23,-2
+	br.sptk.many kvm_vps_entry
+END(kvm_vps_resume_handler)
+
 //mov r1=ar3
 GLOBAL_ENTRY(kvm_asm_mov_from_ar)
 #ifndef ACCE_MOV_FROM_AR
diff --git a/arch/ia64/kvm/process.c b/arch/ia64/kvm/process.c
index 5a33f7e..3417783 100644
--- a/arch/ia64/kvm/process.c
+++ b/arch/ia64/kvm/process.c
@@ -962,9 +962,9 @@ static void kvm_do_resume_op(struct kvm_vcpu *vcpu)
 void vmm_transition(struct kvm_vcpu *vcpu)
 {
 	ia64_call_vsa(PAL_VPS_SAVE, (unsigned long)vcpu->arch.vpd,
-			0, 0, 0, 0, 0, 0);
+			1, 0, 0, 0, 0, 0);
 	vmm_trampoline(&vcpu->arch.guest, &vcpu->arch.host);
 	ia64_call_vsa(PAL_VPS_RESTORE, (unsigned long)vcpu->arch.vpd,
-						0, 0, 0, 0, 0, 0);
+						1, 0, 0, 0, 0, 0);
 	kvm_do_resume_op(vcpu);
 }
diff --git a/arch/ia64/kvm/vmm_ivt.S b/arch/ia64/kvm/vmm_ivt.S
index 3ee5f48..c1d7251 100644
--- a/arch/ia64/kvm/vmm_ivt.S
+++ b/arch/ia64/kvm/vmm_ivt.S
@@ -1261,11 +1261,6 @@ kvm_rse_clear_invalid:
     adds r19=VMM_VPD_VPSR_OFFSET,r18
     ;;
     ld8 r19=[r19]        //vpsr
-    adds r20=VMM_VCPU_VSA_BASE_OFFSET,r21
-    ;;
-    ld8 r20=[r20]
-    ;;
-//vsa_sync_write_start
     mov r25=r18
     adds r16= VMM_VCPU_GP_OFFSET,r21
     ;;
@@ -1274,10 +1269,7 @@ kvm_rse_clear_invalid:
     ;;
     add  r24=r24,r16
     ;;
-    add r16=PAL_VPS_SYNC_WRITE,r20
-    ;;
-    mov b0=r16
-    br.cond.sptk b0         // call the service
+    br.sptk.many  kvm_vps_sync_write       // call the service
     ;;
 END(ia64_leave_hypervisor)
 // fall through
@@ -1288,28 +1280,15 @@ GLOBAL_ENTRY(ia64_vmm_entry)
  *  r17:cr.isr
  *  r18:vpd
  *  r19:vpsr
- *  r20:__vsa_base
  *  r22:b0
  *  r23:predicate
  */
     mov r24=r22
     mov r25=r18
     tbit.nz p1,p2 = r19,IA64_PSR_IC_BIT        // p1=vpsr.ic
+    (p1) br.cond.sptk.few kvm_vps_resume_normal
+    (p2) br.cond.sptk.many kvm_vps_resume_handler
     ;;
-    (p1) add r29=PAL_VPS_RESUME_NORMAL,r20
-    (p1) br.sptk.many ia64_vmm_entry_out
-    ;;
-    tbit.nz p1,p2 = r17,IA64_ISR_IR_BIT		//p1=cr.isr.ir
-    ;;
-    (p1) add r29=PAL_VPS_RESUME_NORMAL,r20
-    (p2) add r29=PAL_VPS_RESUME_HANDLER,r20
-    (p2) ld8 r26=[r25]
-    ;;
-ia64_vmm_entry_out:
-    mov pr=r23,-2
-    mov b0=r29
-    ;;
-    br.cond.sptk b0             // call pal service
 END(ia64_vmm_entry)
 
 
@@ -1376,6 +1355,9 @@ GLOBAL_ENTRY(vmm_reset_entry)
     //set up ipsr, iip, vpd.vpsr, dcr
     // For IPSR: it/dt/rt=1, i/ic=1, si=1, vm/bn=1
     // For DCR: all bits 0
+    bsw.0
+    ;;
+    mov r21 =r13
     adds r14=-VMM_PT_REGS_SIZE, r12
     ;;
     movl r6=0x501008826000      // IPSR dt/rt/it:1;i/ic:1, si:1, vm/bn:1
@@ -1387,12 +1369,6 @@ GLOBAL_ENTRY(vmm_reset_entry)
     ;;
     srlz.i
     ;;
-    bsw.0
-    ;;
-    mov r21 =r13
-    ;;
-    bsw.1
-    ;;
     mov ar.rsc = 0
     ;;
     flushrs
@@ -1406,12 +1382,9 @@ GLOBAL_ENTRY(vmm_reset_entry)
     ld8 r1 = [r20]
     ;;
     mov cr.iip=r4
-    ;;
     adds r16=VMM_VPD_BASE_OFFSET,r13
-    adds r20=VMM_VCPU_VSA_BASE_OFFSET,r13
     ;;
     ld8 r18=[r16]
-    ld8 r20=[r20]
     ;;
     adds r19=VMM_VPD_VPSR_OFFSET,r18
     ;;
-- 
cgit v0.10.2


From 1f095610aabb9d54617901aa734d2a6093f2000c Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Sat, 13 Sep 2008 06:21:22 +0800
Subject: KVM: ia64: add support for Tukwila processors

In Tukwila processor, VT-i has been enhanced in its
implementation, it is often called VT-i2 techonology.
With VTi-2 support, virtulization performance should be
improved. In this patch, we added the related stuff to
support kvm/ia64 for Tukwila processors.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/kvm/optvfault.S b/arch/ia64/kvm/optvfault.S
index f0bf0a8..634abad 100644
--- a/arch/ia64/kvm/optvfault.S
+++ b/arch/ia64/kvm/optvfault.S
@@ -1,9 +1,12 @@
 /*
- * arch/ia64/vmx/optvfault.S
+ * arch/ia64/kvm/optvfault.S
  * optimize virtualization fault handler
  *
  * Copyright (C) 2006 Intel Co
  *	Xuefei Xu (Anthony Xu) <anthony.xu@intel.com>
+ * Copyright (C) 2008 Intel Co
+ *      Add the support for Tukwila processors.
+ *	Xiantao Zhang <xiantao.zhang@intel.com>
  */
 
 #include <asm/asmmacro.h>
@@ -20,6 +23,29 @@
 #define ACCE_MOV_TO_PSR
 #define ACCE_THASH
 
+#define VMX_VPS_SYNC_READ			\
+	add r16=VMM_VPD_BASE_OFFSET,r21;	\
+	mov r17 = b0;				\
+	mov r18 = r24;				\
+	mov r19 = r25;				\
+	mov r20 = r31;				\
+	;;					\
+{.mii;						\
+	ld8 r16 = [r16];			\
+	nop 0x0;				\
+	mov r24 = ip;				\
+	;;					\
+};						\
+{.mmb;						\
+	add r24=0x20, r24;			\
+	mov r25 =r16;				\
+	br.sptk.many kvm_vps_sync_read;		\
+};						\
+	mov b0 = r17;				\
+	mov r24 = r18;				\
+	mov r25 = r19;				\
+	mov r31 = r20
+
 ENTRY(kvm_vps_entry)
 	adds r29 = VMM_VCPU_VSA_BASE_OFFSET,r21
 	;;
@@ -226,11 +252,11 @@ GLOBAL_ENTRY(kvm_asm_rsm)
 #ifndef ACCE_RSM
 	br.many kvm_virtualization_fault_back
 #endif
-	add r16=VMM_VPD_BASE_OFFSET,r21
+	VMX_VPS_SYNC_READ
+	;;
 	extr.u r26=r25,6,21
 	extr.u r27=r25,31,2
 	;;
-	ld8 r16=[r16]
 	extr.u r28=r25,36,1
 	dep r26=r27,r26,21,2
 	;;
@@ -265,7 +291,7 @@ GLOBAL_ENTRY(kvm_asm_rsm)
 	tbit.nz p6,p0=r23,0
 	;;
 	tbit.z.or p6,p0=r26,IA64_PSR_DT_BIT
-	(p6) br.dptk kvm_resume_to_guest
+	(p6) br.dptk kvm_resume_to_guest_with_sync
 	;;
 	add r26=VMM_VCPU_META_RR0_OFFSET,r21
 	add r27=VMM_VCPU_META_RR0_OFFSET+8,r21
@@ -281,7 +307,7 @@ GLOBAL_ENTRY(kvm_asm_rsm)
 	mov rr[r28]=r27
 	;;
 	srlz.d
-	br.many kvm_resume_to_guest
+	br.many kvm_resume_to_guest_with_sync
 END(kvm_asm_rsm)
 
 
@@ -290,11 +316,11 @@ GLOBAL_ENTRY(kvm_asm_ssm)
 #ifndef ACCE_SSM
 	br.many kvm_virtualization_fault_back
 #endif
-	add r16=VMM_VPD_BASE_OFFSET,r21
+	VMX_VPS_SYNC_READ
+	;;
 	extr.u r26=r25,6,21
 	extr.u r27=r25,31,2
 	;;
-	ld8 r16=[r16]
 	extr.u r28=r25,36,1
 	dep r26=r27,r26,21,2
 	;;  //r26 is imm24
@@ -340,7 +366,7 @@ kvm_asm_ssm_1:
 	tbit.nz p6,p0=r29,IA64_PSR_I_BIT
 	;;
 	tbit.z.or p6,p0=r19,IA64_PSR_I_BIT
-	(p6) br.dptk kvm_resume_to_guest
+	(p6) br.dptk kvm_resume_to_guest_with_sync
 	;;
 	add r29=VPD_VTPR_START_OFFSET,r16
 	add r30=VPD_VHPI_START_OFFSET,r16
@@ -355,7 +381,7 @@ kvm_asm_ssm_1:
 	;;
 	cmp.gt p6,p0=r30,r17
 	(p6) br.dpnt.few kvm_asm_dispatch_vexirq
-	br.many kvm_resume_to_guest
+	br.many kvm_resume_to_guest_with_sync
 END(kvm_asm_ssm)
 
 
@@ -364,10 +390,9 @@ GLOBAL_ENTRY(kvm_asm_mov_to_psr)
 #ifndef ACCE_MOV_TO_PSR
 	br.many kvm_virtualization_fault_back
 #endif
-	add r16=VMM_VPD_BASE_OFFSET,r21
-	extr.u r26=r25,13,7 //r2
+	VMX_VPS_SYNC_READ
 	;;
-	ld8 r16=[r16]
+	extr.u r26=r25,13,7 //r2
 	addl r20=@gprel(asm_mov_from_reg),gp
 	;;
 	adds r30=kvm_asm_mov_to_psr_back-asm_mov_from_reg,r20
@@ -443,7 +468,7 @@ kvm_asm_mov_to_psr_1:
 	;;
 	tbit.nz.or p6,p0=r17,IA64_PSR_I_BIT
 	tbit.z.or p6,p0=r30,IA64_PSR_I_BIT
-	(p6) br.dpnt.few kvm_resume_to_guest
+	(p6) br.dpnt.few kvm_resume_to_guest_with_sync
 	;;
 	add r29=VPD_VTPR_START_OFFSET,r16
 	add r30=VPD_VHPI_START_OFFSET,r16
@@ -458,13 +483,29 @@ kvm_asm_mov_to_psr_1:
 	;;
 	cmp.gt p6,p0=r30,r17
 	(p6) br.dpnt.few kvm_asm_dispatch_vexirq
-	br.many kvm_resume_to_guest
+	br.many kvm_resume_to_guest_with_sync
 END(kvm_asm_mov_to_psr)
 
 
 ENTRY(kvm_asm_dispatch_vexirq)
 //increment iip
+	mov r17 = b0
+	mov r18 = r31
+{.mii
+	add r25=VMM_VPD_BASE_OFFSET,r21
+	nop 0x0
+	mov r24 = ip
+	;;
+}
+{.mmb
+	add r24 = 0x20, r24
+	ld8 r25 = [r25]
+	br.sptk.many kvm_vps_sync_write
+}
+	mov b0 =r17
 	mov r16=cr.ipsr
+	mov r31 = r18
+	mov r19 = 37
 	;;
 	extr.u r17=r16,IA64_PSR_RI_BIT,2
 	tbit.nz p6,p7=r16,IA64_PSR_RI_BIT+1
@@ -504,25 +545,31 @@ GLOBAL_ENTRY(kvm_asm_thash)
 	;;
 kvm_asm_thash_back1:
 	shr.u r23=r19,61		// get RR number
-	adds r25=VMM_VCPU_VRR0_OFFSET,r21	// get vcpu->arch.vrr[0]'s addr
+	adds r28=VMM_VCPU_VRR0_OFFSET,r21	// get vcpu->arch.vrr[0]'s addr
 	adds r16=VMM_VPD_VPTA_OFFSET,r16	// get vpta
 	;;
-	shladd r27=r23,3,r25	// get vcpu->arch.vrr[r23]'s addr
+	shladd r27=r23,3,r28	// get vcpu->arch.vrr[r23]'s addr
 	ld8 r17=[r16]		// get PTA
 	mov r26=1
 	;;
-	extr.u r29=r17,2,6		// get pta.size
-	ld8 r25=[r27]		// get vcpu->arch.vrr[r23]'s value
+	extr.u r29=r17,2,6	// get pta.size
+	ld8 r28=[r27]		// get vcpu->arch.vrr[r23]'s value
 	;;
-	extr.u r25=r25,2,6		// get rr.ps
+	mov b0=r24
+	//Fallback to C if pta.vf is set
+	tbit.nz p6,p0=r17, 8
+	;;
+	(p6) mov r24=EVENT_THASH
+	(p6) br.cond.dpnt.many kvm_virtualization_fault_back
+	extr.u r28=r28,2,6	// get rr.ps
 	shl r22=r26,r29		// 1UL << pta.size
 	;;
-	shr.u r23=r19,r25		// vaddr >> rr.ps
+	shr.u r23=r19,r28	// vaddr >> rr.ps
 	adds r26=3,r29		// pta.size + 3
 	shl r27=r17,3		// pta << 3
 	;;
 	shl r23=r23,3		// (vaddr >> rr.ps) << 3
-	shr.u r27=r27,r26		// (pta << 3) >> (pta.size+3)
+	shr.u r27=r27,r26	// (pta << 3) >> (pta.size+3)
 	movl r16=7<<61
 	;;
 	adds r22=-1,r22		// (1UL << pta.size) - 1
@@ -793,6 +840,29 @@ END(asm_mov_from_reg)
  * r31: pr
  * r24: b0
  */
+ENTRY(kvm_resume_to_guest_with_sync)
+	adds r19=VMM_VPD_BASE_OFFSET,r21
+	mov r16 = r31
+	mov r17 = r24
+	;;
+{.mii
+	ld8 r25 =[r19]
+	nop 0x0
+	mov r24 = ip
+	;;
+}
+{.mmb
+	add r24 =0x20, r24
+	nop 0x0
+	br.sptk.many kvm_vps_sync_write
+}
+
+	mov r31 = r16
+	mov r24 =r17
+	;;
+	br.sptk.many kvm_resume_to_guest
+END(kvm_resume_to_guest_with_sync)
+
 ENTRY(kvm_resume_to_guest)
 	adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21
 	;;
-- 
cgit v0.10.2


From a08546001c2b0f584ffc81987340943a7d6d6acb Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 23 Sep 2008 11:01:45 -0700
Subject: x86: pvclock: fix shadowed variable warning

arch/x86/kernel/pvclock.c:102:6: warning: symbol 'tsc_khz' shadows an earlier one
include/asm/tsc.h:18:21: originally declared here

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 1c54b5f..4f9c55f 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -99,14 +99,14 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
 
 unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
 {
-	u64 tsc_khz = 1000000ULL << 32;
+	u64 pv_tsc_khz = 1000000ULL << 32;
 
-	do_div(tsc_khz, src->tsc_to_system_mul);
+	do_div(pv_tsc_khz, src->tsc_to_system_mul);
 	if (src->tsc_shift < 0)
-		tsc_khz <<= -src->tsc_shift;
+		pv_tsc_khz <<= -src->tsc_shift;
 	else
-		tsc_khz >>= src->tsc_shift;
-	return tsc_khz;
+		pv_tsc_khz >>= src->tsc_shift;
+	return pv_tsc_khz;
 }
 
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
-- 
cgit v0.10.2


From 93a423e7045cf3cf69f960ff307edda1afcd7b41 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:29 -0300
Subject: KVM: MMU: flush remote TLBs on large->normal entry overwrite

It is necessary to flush all TLB's when a large spte entry is
overwritten with a normal page directory pointer.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6dd08e0..e9fbaa4 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -310,8 +310,11 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
 	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
 		return 0;
 
-	if (is_large_pte(*sptep))
+	if (is_large_pte(*sptep)) {
+		set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+		kvm_flush_remote_tlbs(vcpu->kvm);
 		rmap_remove(vcpu->kvm, sptep);
+	}
 
 	if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) {
 		metaphysical = 1;
-- 
cgit v0.10.2


From 1e73f9dd885957bf0c7bb5e63b350d5aeb06b726 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:30 -0300
Subject: KVM: MMU: split mmu_set_spte

Split the spte entry creation code into a new set_spte function.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5779a23..9ad4cc55 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1148,44 +1148,13 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 	return page;
 }
 
-static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
-			 unsigned pt_access, unsigned pte_access,
-			 int user_fault, int write_fault, int dirty,
-			 int *ptwrite, int largepage, gfn_t gfn,
-			 pfn_t pfn, bool speculative)
+static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+		    unsigned pte_access, int user_fault,
+		    int write_fault, int dirty, int largepage,
+		    gfn_t gfn, pfn_t pfn, bool speculative)
 {
 	u64 spte;
-	int was_rmapped = 0;
-	int was_writeble = is_writeble_pte(*shadow_pte);
-
-	pgprintk("%s: spte %llx access %x write_fault %d"
-		 " user_fault %d gfn %lx\n",
-		 __func__, *shadow_pte, pt_access,
-		 write_fault, user_fault, gfn);
-
-	if (is_rmap_pte(*shadow_pte)) {
-		/*
-		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
-		 * the parent of the now unreachable PTE.
-		 */
-		if (largepage && !is_large_pte(*shadow_pte)) {
-			struct kvm_mmu_page *child;
-			u64 pte = *shadow_pte;
-
-			child = page_header(pte & PT64_BASE_ADDR_MASK);
-			mmu_page_remove_parent_pte(child, shadow_pte);
-		} else if (pfn != spte_to_pfn(*shadow_pte)) {
-			pgprintk("hfn old %lx new %lx\n",
-				 spte_to_pfn(*shadow_pte), pfn);
-			rmap_remove(vcpu->kvm, shadow_pte);
-		} else {
-			if (largepage)
-				was_rmapped = is_large_pte(*shadow_pte);
-			else
-				was_rmapped = 1;
-		}
-	}
-
+	int ret = 0;
 	/*
 	 * We don't set the accessed bit, since we sometimes want to see
 	 * whether the guest actually used the pte (in order to detect
@@ -1218,26 +1187,70 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		   (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
 			pgprintk("%s: found shadow page for %lx, marking ro\n",
 				 __func__, gfn);
+			ret = 1;
 			pte_access &= ~ACC_WRITE_MASK;
 			if (is_writeble_pte(spte)) {
 				spte &= ~PT_WRITABLE_MASK;
 				kvm_x86_ops->tlb_flush(vcpu);
 			}
-			if (write_fault)
-				*ptwrite = 1;
 		}
 	}
 
 	if (pte_access & ACC_WRITE_MASK)
 		mark_page_dirty(vcpu->kvm, gfn);
 
-	pgprintk("%s: setting spte %llx\n", __func__, spte);
-	pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
-		 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
-		 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
 	set_shadow_pte(shadow_pte, spte);
-	if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK)
-	    && (spte & PT_PRESENT_MASK))
+	return ret;
+}
+
+
+static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+			 unsigned pt_access, unsigned pte_access,
+			 int user_fault, int write_fault, int dirty,
+			 int *ptwrite, int largepage, gfn_t gfn,
+			 pfn_t pfn, bool speculative)
+{
+	int was_rmapped = 0;
+	int was_writeble = is_writeble_pte(*shadow_pte);
+
+	pgprintk("%s: spte %llx access %x write_fault %d"
+		 " user_fault %d gfn %lx\n",
+		 __func__, *shadow_pte, pt_access,
+		 write_fault, user_fault, gfn);
+
+	if (is_rmap_pte(*shadow_pte)) {
+		/*
+		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+		 * the parent of the now unreachable PTE.
+		 */
+		if (largepage && !is_large_pte(*shadow_pte)) {
+			struct kvm_mmu_page *child;
+			u64 pte = *shadow_pte;
+
+			child = page_header(pte & PT64_BASE_ADDR_MASK);
+			mmu_page_remove_parent_pte(child, shadow_pte);
+		} else if (pfn != spte_to_pfn(*shadow_pte)) {
+			pgprintk("hfn old %lx new %lx\n",
+				 spte_to_pfn(*shadow_pte), pfn);
+			rmap_remove(vcpu->kvm, shadow_pte);
+		} else {
+			if (largepage)
+				was_rmapped = is_large_pte(*shadow_pte);
+			else
+				was_rmapped = 1;
+		}
+	}
+	if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
+		      dirty, largepage, gfn, pfn, speculative))
+		if (write_fault)
+			*ptwrite = 1;
+
+	pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
+	pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
+		 is_large_pte(*shadow_pte)? "2MB" : "4kB",
+		 is_present_pte(*shadow_pte)?"RW":"R", gfn,
+		 *shadow_pte, shadow_pte);
+	if (!was_rmapped && is_large_pte(*shadow_pte))
 		++vcpu->kvm->stat.lpages;
 
 	page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
-- 
cgit v0.10.2


From a378b4e64c0fef2d9e53214db167878b7673a7a3 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:31 -0300
Subject: KVM: MMU: move local TLB flush to mmu_set_spte

Since the sync page path can collapse flushes.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9ad4cc55..23752ef 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1189,10 +1189,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 				 __func__, gfn);
 			ret = 1;
 			pte_access &= ~ACC_WRITE_MASK;
-			if (is_writeble_pte(spte)) {
+			if (is_writeble_pte(spte))
 				spte &= ~PT_WRITABLE_MASK;
-				kvm_x86_ops->tlb_flush(vcpu);
-			}
 		}
 	}
 
@@ -1241,9 +1239,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		}
 	}
 	if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
-		      dirty, largepage, gfn, pfn, speculative))
+		      dirty, largepage, gfn, pfn, speculative)) {
 		if (write_fault)
 			*ptwrite = 1;
+		kvm_x86_ops->tlb_flush(vcpu);
+	}
 
 	pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
 	pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
-- 
cgit v0.10.2


From 38187c830cab84daecb41169948467f1f19317e3 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:32 -0300
Subject: KVM: MMU: do not write-protect large mappings

There is not much point in write protecting large mappings. This
can only happen when a page is shadowed during the window between
is_largepage_backed and mmu_lock acquision. Zap the entry instead, so
the next pagefault will find a shadowed page via is_largepage_backed and
fallback to 4k translations.

Simplifies out of sync shadow.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 23752ef..731e6fe 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1180,11 +1180,16 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 	    || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
 		struct kvm_mmu_page *shadow;
 
+		if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
+			ret = 1;
+			spte = shadow_trap_nonpresent_pte;
+			goto set_pte;
+		}
+
 		spte |= PT_WRITABLE_MASK;
 
 		shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
-		if (shadow ||
-		   (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
+		if (shadow) {
 			pgprintk("%s: found shadow page for %lx, marking ro\n",
 				 __func__, gfn);
 			ret = 1;
@@ -1197,6 +1202,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 	if (pte_access & ACC_WRITE_MASK)
 		mark_page_dirty(vcpu->kvm, gfn);
 
+set_pte:
 	set_shadow_pte(shadow_pte, spte);
 	return ret;
 }
-- 
cgit v0.10.2


From e8bc217aef67d41d767ede6e7a7eb10f1d47c86c Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:33 -0300
Subject: KVM: MMU: mode specific sync_page

Examine guest pagetable and bring the shadow back in sync. Caller is responsible
for local TLB flush before re-entering guest mode.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 731e6fe..90f0116 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -871,6 +871,12 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
 		sp->spt[i] = shadow_trap_nonpresent_pte;
 }
 
+static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
+			       struct kvm_mmu_page *sp)
+{
+	return 1;
+}
+
 static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 {
 	unsigned index;
@@ -1547,6 +1553,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 	context->gva_to_gpa = nonpaging_gva_to_gpa;
 	context->free = nonpaging_free;
 	context->prefetch_page = nonpaging_prefetch_page;
+	context->sync_page = nonpaging_sync_page;
 	context->root_level = 0;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
@@ -1594,6 +1601,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 	context->page_fault = paging64_page_fault;
 	context->gva_to_gpa = paging64_gva_to_gpa;
 	context->prefetch_page = paging64_prefetch_page;
+	context->sync_page = paging64_sync_page;
 	context->free = paging_free;
 	context->root_level = level;
 	context->shadow_root_level = level;
@@ -1615,6 +1623,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 	context->gva_to_gpa = paging32_gva_to_gpa;
 	context->free = paging_free;
 	context->prefetch_page = paging32_prefetch_page;
+	context->sync_page = paging32_sync_page;
 	context->root_level = PT32_ROOT_LEVEL;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
@@ -1634,6 +1643,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->page_fault = tdp_page_fault;
 	context->free = nonpaging_free;
 	context->prefetch_page = nonpaging_prefetch_page;
+	context->sync_page = nonpaging_sync_page;
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
 	context->root_hpa = INVALID_PAGE;
 
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index e9fbaa4..776fb6d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -507,6 +507,60 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
 	}
 }
 
+/*
+ * Using the cached information from sp->gfns is safe because:
+ * - The spte has a reference to the struct page, so the pfn for a given gfn
+ *   can't change unless all sptes pointing to it are nuked first.
+ * - Alias changes zap the entire shadow cache.
+ */
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	int i, offset, nr_present;
+
+	offset = nr_present = 0;
+
+	if (PTTYPE == 32)
+		offset = sp->role.quadrant << PT64_LEVEL_BITS;
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+		unsigned pte_access;
+		pt_element_t gpte;
+		gpa_t pte_gpa;
+		gfn_t gfn = sp->gfns[i];
+
+		if (!is_shadow_present_pte(sp->spt[i]))
+			continue;
+
+		pte_gpa = gfn_to_gpa(sp->gfn);
+		pte_gpa += (i+offset) * sizeof(pt_element_t);
+
+		if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
+					  sizeof(pt_element_t)))
+			return -EINVAL;
+
+		if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) ||
+		    !(gpte & PT_ACCESSED_MASK)) {
+			u64 nonpresent;
+
+			rmap_remove(vcpu->kvm, &sp->spt[i]);
+			if (is_present_pte(gpte))
+				nonpresent = shadow_trap_nonpresent_pte;
+			else
+				nonpresent = shadow_notrap_nonpresent_pte;
+			set_shadow_pte(&sp->spt[i], nonpresent);
+			continue;
+		}
+
+		nr_present++;
+		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
+			 is_dirty_pte(gpte), 0, gfn,
+			 spte_to_pfn(sp->spt[i]), true);
+	}
+
+	return !nr_present;
+}
+
 #undef pt_element_t
 #undef guest_walker
 #undef shadow_walker
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 805629c..8bad9bd 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -220,6 +220,8 @@ struct kvm_mmu {
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
 	void (*prefetch_page)(struct kvm_vcpu *vcpu,
 			      struct kvm_mmu_page *page);
+	int (*sync_page)(struct kvm_vcpu *vcpu,
+			 struct kvm_mmu_page *sp);
 	hpa_t root_hpa;
 	int root_level;
 	int shadow_root_level;
-- 
cgit v0.10.2


From 0ba73cdadb8ac172f396df7e23c4a9cebd59b550 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:34 -0300
Subject: KVM: MMU: sync roots on mmu reload

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 90f0116..9d8c4bb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1471,6 +1471,41 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
 }
 
+static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+}
+
+static void mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+	int i;
+	struct kvm_mmu_page *sp;
+
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		return;
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+		hpa_t root = vcpu->arch.mmu.root_hpa;
+		sp = page_header(root);
+		mmu_sync_children(vcpu, sp);
+		return;
+	}
+	for (i = 0; i < 4; ++i) {
+		hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+		if (root) {
+			root &= PT64_BASE_ADDR_MASK;
+			sp = page_header(root);
+			mmu_sync_children(vcpu, sp);
+		}
+	}
+}
+
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+	spin_lock(&vcpu->kvm->mmu_lock);
+	mmu_sync_roots(vcpu);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+}
+
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
 {
 	return vaddr;
@@ -1715,6 +1750,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	mmu_alloc_roots(vcpu);
+	mmu_sync_roots(vcpu);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
 	kvm_mmu_flush_tlb(vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 08edeab..88e6d9a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -594,6 +594,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
 	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
+		kvm_mmu_sync_roots(vcpu);
 		kvm_mmu_flush_tlb(vcpu);
 		return;
 	}
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 8bad9bd..475d8ab 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -584,6 +584,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
 
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
-- 
cgit v0.10.2


From a7052897b3bcd568a9f5bfaa558957039e7e7ec0 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:35 -0300
Subject: KVM: x86: trap invlpg

With pages out of sync invlpg needs to be trapped. For now simply nuke
the entry.

Untested on AMD.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9d8c4bb..e89af1d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -877,6 +877,10 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
 	return 1;
 }
 
+static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+}
+
 static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 {
 	unsigned index;
@@ -1589,6 +1593,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 	context->free = nonpaging_free;
 	context->prefetch_page = nonpaging_prefetch_page;
 	context->sync_page = nonpaging_sync_page;
+	context->invlpg = nonpaging_invlpg;
 	context->root_level = 0;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
@@ -1637,6 +1642,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 	context->gva_to_gpa = paging64_gva_to_gpa;
 	context->prefetch_page = paging64_prefetch_page;
 	context->sync_page = paging64_sync_page;
+	context->invlpg = paging64_invlpg;
 	context->free = paging_free;
 	context->root_level = level;
 	context->shadow_root_level = level;
@@ -1659,6 +1665,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 	context->free = paging_free;
 	context->prefetch_page = paging32_prefetch_page;
 	context->sync_page = paging32_sync_page;
+	context->invlpg = paging32_invlpg;
 	context->root_level = PT32_ROOT_LEVEL;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
@@ -1679,6 +1686,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->free = nonpaging_free;
 	context->prefetch_page = nonpaging_prefetch_page;
 	context->sync_page = nonpaging_sync_page;
+	context->invlpg = nonpaging_invlpg;
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
 	context->root_hpa = INVALID_PAGE;
 
@@ -2071,6 +2079,16 @@ out:
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+	spin_lock(&vcpu->kvm->mmu_lock);
+	vcpu->arch.mmu.invlpg(vcpu, gva);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	kvm_mmu_flush_tlb(vcpu);
+	++vcpu->stat.invlpg;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
+
 void kvm_enable_tdp(void)
 {
 	tdp_enabled = true;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 776fb6d..dc169e8 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -461,6 +461,31 @@ out_unlock:
 	return 0;
 }
 
+static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
+				      struct kvm_vcpu *vcpu, u64 addr,
+				      u64 *sptep, int level)
+{
+
+	if (level == PT_PAGE_TABLE_LEVEL) {
+		if (is_shadow_present_pte(*sptep))
+			rmap_remove(vcpu->kvm, sptep);
+		set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+		return 1;
+	}
+	if (!is_shadow_present_pte(*sptep))
+		return 1;
+	return 0;
+}
+
+static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
+{
+	struct shadow_walker walker = {
+		.walker = { .entry = FNAME(shadow_invlpg_entry), },
+	};
+
+	walk_shadow(&walker.walker, vcpu, gva);
+}
+
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
 {
 	struct guest_walker walker;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9b54550..9c4ce65 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -525,6 +525,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 				(1ULL << INTERCEPT_CPUID) |
 				(1ULL << INTERCEPT_INVD) |
 				(1ULL << INTERCEPT_HLT) |
+				(1ULL << INTERCEPT_INVLPG) |
 				(1ULL << INTERCEPT_INVLPGA) |
 				(1ULL << INTERCEPT_IOIO_PROT) |
 				(1ULL << INTERCEPT_MSR_PROT) |
@@ -589,7 +590,8 @@ static void init_vmcb(struct vcpu_svm *svm)
 	if (npt_enabled) {
 		/* Setup VMCB for Nested Paging */
 		control->nested_ctl = 1;
-		control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH);
+		control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
+					(1ULL << INTERCEPT_INVLPG));
 		control->intercept_exceptions &= ~(1 << PF_VECTOR);
 		control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
 						INTERCEPT_CR3_MASK);
@@ -1164,6 +1166,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	return 1;
 }
 
+static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
+		pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
+	return 1;
+}
+
 static int emulate_on_interception(struct vcpu_svm *svm,
 				   struct kvm_run *kvm_run)
 {
@@ -1417,7 +1426,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
 	[SVM_EXIT_CPUID]			= cpuid_interception,
 	[SVM_EXIT_INVD]                         = emulate_on_interception,
 	[SVM_EXIT_HLT]				= halt_interception,
-	[SVM_EXIT_INVLPG]			= emulate_on_interception,
+	[SVM_EXIT_INVLPG]			= invlpg_interception,
 	[SVM_EXIT_INVLPGA]			= invalid_op_interception,
 	[SVM_EXIT_IOIO] 		  	= io_interception,
 	[SVM_EXIT_MSR]				= msr_interception,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 025bf40..4556cc3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1130,7 +1130,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	      CPU_BASED_CR3_STORE_EXITING |
 	      CPU_BASED_USE_IO_BITMAPS |
 	      CPU_BASED_MOV_DR_EXITING |
-	      CPU_BASED_USE_TSC_OFFSETING;
+	      CPU_BASED_USE_TSC_OFFSETING |
+	      CPU_BASED_INVLPG_EXITING;
 	opt = CPU_BASED_TPR_SHADOW |
 	      CPU_BASED_USE_MSR_BITMAPS |
 	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1159,9 +1160,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 		_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
 #endif
 	if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
-		/* CR3 accesses don't need to cause VM Exits when EPT enabled */
+		/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
+		   enabled */
 		min &= ~(CPU_BASED_CR3_LOAD_EXITING |
-			 CPU_BASED_CR3_STORE_EXITING);
+			 CPU_BASED_CR3_STORE_EXITING |
+			 CPU_BASED_INVLPG_EXITING);
 		if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
 					&_cpu_based_exec_control) < 0)
 			return -EIO;
@@ -2790,6 +2793,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	return 1;
 }
 
+static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+
+	kvm_mmu_invlpg(vcpu, exit_qualification);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	skip_emulated_instruction(vcpu);
@@ -2958,6 +2970,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
 	[EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
 	[EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
 	[EXIT_REASON_HLT]                     = handle_halt,
+	[EXIT_REASON_INVLPG]		      = handle_invlpg,
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
 	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 88e6d9a..efee85b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2341,6 +2341,7 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
 
 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
 {
+	kvm_mmu_invlpg(vcpu, address);
 	return X86EMUL_CONTINUE;
 }
 
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 475d8ab..8b935cc 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -222,6 +222,7 @@ struct kvm_mmu {
 			      struct kvm_mmu_page *page);
 	int (*sync_page)(struct kvm_vcpu *vcpu,
 			 struct kvm_mmu_page *sp);
+	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
 	hpa_t root_hpa;
 	int root_level;
 	int shadow_root_level;
@@ -591,6 +592,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
 
 void kvm_enable_tdp(void);
 void kvm_disable_tdp(void);
-- 
cgit v0.10.2


From ad8cfbe3fffdc09704f0808fde3934855620d545 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:36 -0300
Subject: KVM: MMU: mmu_parent_walk

Introduce a function to walk all parents of a given page, invoking a handler.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e89af1d..b82abee 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -147,6 +147,8 @@ struct kvm_shadow_walk {
 		     u64 addr, u64 *spte, int level);
 };
 
+typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
+
 static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *rmap_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
@@ -862,6 +864,31 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
 	BUG();
 }
 
+
+static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			    mmu_parent_walk_fn fn)
+{
+	struct kvm_pte_chain *pte_chain;
+	struct hlist_node *node;
+	struct kvm_mmu_page *parent_sp;
+	int i;
+
+	if (!sp->multimapped && sp->parent_pte) {
+		parent_sp = page_header(__pa(sp->parent_pte));
+		fn(vcpu, parent_sp);
+		mmu_parent_walk(vcpu, parent_sp, fn);
+		return;
+	}
+	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+			if (!pte_chain->parent_ptes[i])
+				break;
+			parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
+			fn(vcpu, parent_sp);
+			mmu_parent_walk(vcpu, parent_sp, fn);
+		}
+}
+
 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
 				    struct kvm_mmu_page *sp)
 {
-- 
cgit v0.10.2


From 0738541396be165995c7f2387746eb0b47024fec Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:37 -0300
Subject: KVM: MMU: awareness of new kvm_mmu_zap_page behaviour

kvm_mmu_zap_page will soon zap the unsynced children of a page. Restart
list walk in such case.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b82abee..c9b4b90 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1078,7 +1078,7 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 	}
 }
 
-static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	++kvm->stat.mmu_shadow_zapped;
 	kvm_mmu_page_unlink_children(kvm, sp);
@@ -1095,6 +1095,7 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 		kvm_reload_remote_mmus(kvm);
 	}
 	kvm_mmu_reset_last_pte_updated(kvm);
+	return 0;
 }
 
 /*
@@ -1147,8 +1148,9 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 		if (sp->gfn == gfn && !sp->role.metaphysical) {
 			pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
 				 sp->role.word);
-			kvm_mmu_zap_page(kvm, sp);
 			r = 1;
+			if (kvm_mmu_zap_page(kvm, sp))
+				n = bucket->first;
 		}
 	return r;
 }
@@ -1992,7 +1994,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			 */
 			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
 				 gpa, bytes, sp->role.word);
-			kvm_mmu_zap_page(vcpu->kvm, sp);
+			if (kvm_mmu_zap_page(vcpu->kvm, sp))
+				n = bucket->first;
 			++vcpu->kvm->stat.mmu_flooded;
 			continue;
 		}
@@ -2226,7 +2229,9 @@ void kvm_mmu_zap_all(struct kvm *kvm)
 
 	spin_lock(&kvm->mmu_lock);
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
-		kvm_mmu_zap_page(kvm, sp);
+		if (kvm_mmu_zap_page(kvm, sp))
+			node = container_of(kvm->arch.active_mmu_pages.next,
+					    struct kvm_mmu_page, link);
 	spin_unlock(&kvm->mmu_lock);
 
 	kvm_flush_remote_tlbs(kvm);
-- 
cgit v0.10.2


From 6844dec6948679d084f054235fee19ba4e3a3096 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:38 -0300
Subject: KVM: MMU: mmu_convert_notrap helper

Need to convert shadow_notrap_nonpresent -> shadow_trap_nonpresent when
unsyncing pages.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c9b4b90..57c7580 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1173,6 +1173,20 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
 	__set_bit(slot, &sp->slot_bitmap);
 }
 
+static void mmu_convert_notrap(struct kvm_mmu_page *sp)
+{
+	int i;
+	u64 *pt = sp->spt;
+
+	if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
+		return;
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		if (pt[i] == shadow_notrap_nonpresent_pte)
+			set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte);
+	}
+}
+
 struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 {
 	struct page *page;
-- 
cgit v0.10.2


From 4731d4c7a07769cf2926c327177b97bb8c68cafc Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:39 -0300
Subject: KVM: MMU: out of sync shadow core

Allow guest pagetables to go out of sync.  Instead of emulating write
accesses to guest pagetables, or unshadowing them, we un-write-protect
the page table and allow the guest to modify it at will.  We rely on
invlpg executions to synchronize individual ptes, and will synchronize
the entire pagetable on tlb flushes.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 57c7580..d88659a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -147,6 +147,10 @@ struct kvm_shadow_walk {
 		     u64 addr, u64 *spte, int level);
 };
 
+struct kvm_unsync_walk {
+	int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
+};
+
 typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
 
 static struct kmem_cache *pte_chain_cache;
@@ -654,8 +658,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 
 	if (write_protected)
 		kvm_flush_remote_tlbs(kvm);
-
-	account_shadowed(kvm, gfn);
 }
 
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
@@ -908,6 +910,41 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 {
 }
 
+static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+			   struct kvm_unsync_walk *walker)
+{
+	int i, ret;
+
+	if (!sp->unsync_children)
+		return 0;
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		u64 ent = sp->spt[i];
+
+		if (is_shadow_present_pte(ent)) {
+			struct kvm_mmu_page *child;
+			child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+			if (child->unsync_children) {
+				ret = mmu_unsync_walk(child, walker);
+				if (ret)
+					return ret;
+			}
+
+			if (child->unsync) {
+				ret = walker->entry(child, walker);
+				if (ret)
+					return ret;
+			}
+		}
+	}
+
+	if (i == PT64_ENT_PER_PAGE)
+		sp->unsync_children = 0;
+
+	return 0;
+}
+
 static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 {
 	unsigned index;
@@ -928,6 +965,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 	return NULL;
 }
 
+static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	WARN_ON(!sp->unsync);
+	sp->unsync = 0;
+	--kvm->stat.mmu_unsync;
+}
+
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	if (sp->role.glevels != vcpu->arch.mmu.root_level) {
+		kvm_mmu_zap_page(vcpu->kvm, sp);
+		return 1;
+	}
+
+	rmap_write_protect(vcpu->kvm, sp->gfn);
+	if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
+		kvm_mmu_zap_page(vcpu->kvm, sp);
+		return 1;
+	}
+
+	kvm_mmu_flush_tlb(vcpu);
+	kvm_unlink_unsync_page(vcpu->kvm, sp);
+	return 0;
+}
+
+struct sync_walker {
+	struct kvm_vcpu *vcpu;
+	struct kvm_unsync_walk walker;
+};
+
+static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+{
+	struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
+						     walker);
+	struct kvm_vcpu *vcpu = sync_walk->vcpu;
+
+	kvm_sync_page(vcpu, sp);
+	return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
+}
+
+static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	struct sync_walker walker = {
+		.walker = { .entry = mmu_sync_fn, },
+		.vcpu = vcpu,
+	};
+
+	while (mmu_unsync_walk(sp, &walker.walker))
+		cond_resched_lock(&vcpu->kvm->mmu_lock);
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 					     gfn_t gfn,
 					     gva_t gaddr,
@@ -941,7 +1031,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	unsigned quadrant;
 	struct hlist_head *bucket;
 	struct kvm_mmu_page *sp;
-	struct hlist_node *node;
+	struct hlist_node *node, *tmp;
 
 	role.word = 0;
 	role.glevels = vcpu->arch.mmu.root_level;
@@ -957,8 +1047,18 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		 gfn, role.word);
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-	hlist_for_each_entry(sp, node, bucket, hash_link)
-		if (sp->gfn == gfn && sp->role.word == role.word) {
+	hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
+		if (sp->gfn == gfn) {
+			if (sp->unsync)
+				if (kvm_sync_page(vcpu, sp))
+					continue;
+
+			if (sp->role.word != role.word)
+				continue;
+
+			if (sp->unsync_children)
+				set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
+
 			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
 			pgprintk("%s: found\n", __func__);
 			return sp;
@@ -971,8 +1071,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	sp->gfn = gfn;
 	sp->role = role;
 	hlist_add_head(&sp->hash_link, bucket);
-	if (!metaphysical)
+	if (!metaphysical) {
 		rmap_write_protect(vcpu->kvm, gfn);
+		account_shadowed(vcpu->kvm, gfn);
+	}
 	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
 		vcpu->arch.mmu.prefetch_page(vcpu, sp);
 	else
@@ -1078,14 +1180,47 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 	}
 }
 
+struct zap_walker {
+	struct kvm_unsync_walk walker;
+	struct kvm *kvm;
+	int zapped;
+};
+
+static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+{
+	struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
+						     walker);
+	kvm_mmu_zap_page(zap_walk->kvm, sp);
+	zap_walk->zapped = 1;
+	return 0;
+}
+
+static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	struct zap_walker walker = {
+		.walker = { .entry = mmu_zap_fn, },
+		.kvm = kvm,
+		.zapped = 0,
+	};
+
+	if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+		return 0;
+	mmu_unsync_walk(sp, &walker.walker);
+	return walker.zapped;
+}
+
 static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
+	int ret;
 	++kvm->stat.mmu_shadow_zapped;
+	ret = mmu_zap_unsync_children(kvm, sp);
 	kvm_mmu_page_unlink_children(kvm, sp);
 	kvm_mmu_unlink_parents(kvm, sp);
 	kvm_flush_remote_tlbs(kvm);
 	if (!sp->role.invalid && !sp->role.metaphysical)
 		unaccount_shadowed(kvm, sp->gfn);
+	if (sp->unsync)
+		kvm_unlink_unsync_page(kvm, sp);
 	if (!sp->root_count) {
 		hlist_del(&sp->hash_link);
 		kvm_mmu_free_page(kvm, sp);
@@ -1095,7 +1230,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 		kvm_reload_remote_mmus(kvm);
 	}
 	kvm_mmu_reset_last_pte_updated(kvm);
-	return 0;
+	return ret;
 }
 
 /*
@@ -1201,10 +1336,58 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 	return page;
 }
 
+static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	sp->unsync_children = 1;
+	return 1;
+}
+
+static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	unsigned index;
+	struct hlist_head *bucket;
+	struct kvm_mmu_page *s;
+	struct hlist_node *node, *n;
+
+	index = kvm_page_table_hashfn(sp->gfn);
+	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+	/* don't unsync if pagetable is shadowed with multiple roles */
+	hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
+		if (s->gfn != sp->gfn || s->role.metaphysical)
+			continue;
+		if (s->role.word != sp->role.word)
+			return 1;
+	}
+	mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+	++vcpu->kvm->stat.mmu_unsync;
+	sp->unsync = 1;
+	mmu_convert_notrap(sp);
+	return 0;
+}
+
+static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+				  bool can_unsync)
+{
+	struct kvm_mmu_page *shadow;
+
+	shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
+	if (shadow) {
+		if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
+			return 1;
+		if (shadow->unsync)
+			return 0;
+		if (can_unsync)
+			return kvm_unsync_page(vcpu, shadow);
+		return 1;
+	}
+	return 0;
+}
+
 static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		    unsigned pte_access, int user_fault,
 		    int write_fault, int dirty, int largepage,
-		    gfn_t gfn, pfn_t pfn, bool speculative)
+		    gfn_t gfn, pfn_t pfn, bool speculative,
+		    bool can_unsync)
 {
 	u64 spte;
 	int ret = 0;
@@ -1231,7 +1414,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 
 	if ((pte_access & ACC_WRITE_MASK)
 	    || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
-		struct kvm_mmu_page *shadow;
 
 		if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
 			ret = 1;
@@ -1241,8 +1423,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 
 		spte |= PT_WRITABLE_MASK;
 
-		shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
-		if (shadow) {
+		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
 			pgprintk("%s: found shadow page for %lx, marking ro\n",
 				 __func__, gfn);
 			ret = 1;
@@ -1260,7 +1441,6 @@ set_pte:
 	return ret;
 }
 
-
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 			 unsigned pt_access, unsigned pte_access,
 			 int user_fault, int write_fault, int dirty,
@@ -1298,7 +1478,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		}
 	}
 	if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
-		      dirty, largepage, gfn, pfn, speculative)) {
+		      dirty, largepage, gfn, pfn, speculative, true)) {
 		if (write_fault)
 			*ptwrite = 1;
 		kvm_x86_ops->tlb_flush(vcpu);
@@ -1518,10 +1698,6 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
 }
 
-static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
-{
-}
-
 static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 {
 	int i;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index dc169e8..613ec9a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -580,7 +580,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
 		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
 			 is_dirty_pte(gpte), 0, gfn,
-			 spte_to_pfn(sp->spt[i]), true);
+			 spte_to_pfn(sp->spt[i]), true, false);
 	}
 
 	return !nr_present;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index efee85b..1c5864a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -101,6 +101,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "mmu_flooded", VM_STAT(mmu_flooded) },
 	{ "mmu_recycled", VM_STAT(mmu_recycled) },
 	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
+	{ "mmu_unsync", VM_STAT(mmu_unsync) },
 	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 	{ "largepages", VM_STAT(lpages) },
 	{ NULL }
@@ -3120,6 +3121,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (vcpu->requests) {
 		if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
 			__kvm_migrate_timers(vcpu);
+		if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
+			kvm_mmu_sync_roots(vcpu);
 		if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
 			kvm_x86_ops->tlb_flush(vcpu);
 		if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 8b935cc..7d36fcc 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -195,6 +195,8 @@ struct kvm_mmu_page {
 				    */
 	int multimapped;         /* More than one parent_pte? */
 	int root_count;          /* Currently serving as active root */
+	bool unsync;
+	bool unsync_children;
 	union {
 		u64 *parent_pte;               /* !multimapped */
 		struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
@@ -371,6 +373,7 @@ struct kvm_vm_stat {
 	u32 mmu_flooded;
 	u32 mmu_recycled;
 	u32 mmu_cache_miss;
+	u32 mmu_unsync;
 	u32 remote_tlb_flush;
 	u32 lpages;
 };
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6252802..73b7c52 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -35,6 +35,7 @@
 #define KVM_REQ_TRIPLE_FAULT       4
 #define KVM_REQ_PENDING_TIMER      5
 #define KVM_REQ_UNHALT             6
+#define KVM_REQ_MMU_SYNC           7
 
 struct kvm_vcpu;
 extern struct kmem_cache *kvm_vcpu_cache;
-- 
cgit v0.10.2


From 0074ff63ebc195701062ca46e0d82fcea0fa3a0a Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:40 -0300
Subject: KVM: MMU: speed up mmu_unsync_walk

Cache the unsynced children information in a per-page bitmap.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d88659a..cb391d6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -891,6 +891,52 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 		}
 }
 
+static void kvm_mmu_update_unsync_bitmap(u64 *spte)
+{
+	unsigned int index;
+	struct kvm_mmu_page *sp = page_header(__pa(spte));
+
+	index = spte - sp->spt;
+	__set_bit(index, sp->unsync_child_bitmap);
+	sp->unsync_children = 1;
+}
+
+static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
+{
+	struct kvm_pte_chain *pte_chain;
+	struct hlist_node *node;
+	int i;
+
+	if (!sp->parent_pte)
+		return;
+
+	if (!sp->multimapped) {
+		kvm_mmu_update_unsync_bitmap(sp->parent_pte);
+		return;
+	}
+
+	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+			if (!pte_chain->parent_ptes[i])
+				break;
+			kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
+		}
+}
+
+static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	sp->unsync_children = 1;
+	kvm_mmu_update_parents_unsync(sp);
+	return 1;
+}
+
+static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
+					struct kvm_mmu_page *sp)
+{
+	mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+	kvm_mmu_update_parents_unsync(sp);
+}
+
 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
 				    struct kvm_mmu_page *sp)
 {
@@ -910,6 +956,11 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 {
 }
 
+#define for_each_unsync_children(bitmap, idx)		\
+	for (idx = find_first_bit(bitmap, 512);		\
+	     idx < 512;					\
+	     idx = find_next_bit(bitmap, 512, idx+1))
+
 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
 			   struct kvm_unsync_walk *walker)
 {
@@ -918,7 +969,7 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
 	if (!sp->unsync_children)
 		return 0;
 
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+	for_each_unsync_children(sp->unsync_child_bitmap, i) {
 		u64 ent = sp->spt[i];
 
 		if (is_shadow_present_pte(ent)) {
@@ -929,17 +980,19 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
 				ret = mmu_unsync_walk(child, walker);
 				if (ret)
 					return ret;
+				__clear_bit(i, sp->unsync_child_bitmap);
 			}
 
 			if (child->unsync) {
 				ret = walker->entry(child, walker);
+				__clear_bit(i, sp->unsync_child_bitmap);
 				if (ret)
 					return ret;
 			}
 		}
 	}
 
-	if (i == PT64_ENT_PER_PAGE)
+	if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
 		sp->unsync_children = 0;
 
 	return 0;
@@ -1056,10 +1109,11 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 			if (sp->role.word != role.word)
 				continue;
 
-			if (sp->unsync_children)
-				set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
-
 			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+			if (sp->unsync_children) {
+				set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
+				kvm_mmu_mark_parents_unsync(vcpu, sp);
+			}
 			pgprintk("%s: found\n", __func__);
 			return sp;
 		}
@@ -1336,12 +1390,6 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 	return page;
 }
 
-static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
-{
-	sp->unsync_children = 1;
-	return 1;
-}
-
 static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
 	unsigned index;
@@ -1358,7 +1406,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		if (s->role.word != sp->role.word)
 			return 1;
 	}
-	mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+	kvm_mmu_mark_parents_unsync(vcpu, sp);
 	++vcpu->kvm->stat.mmu_unsync;
 	sp->unsync = 1;
 	mmu_convert_notrap(sp);
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 7d36fcc..0992d72 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -201,6 +201,7 @@ struct kvm_mmu_page {
 		u64 *parent_pte;               /* !multimapped */
 		struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
 	};
+	DECLARE_BITMAP(unsync_child_bitmap, 512);
 };
 
 struct kvm_pv_mmu_op_buffer {
-- 
cgit v0.10.2


From 582801a95d2f2ceab841779e1dec0e11dfec44c0 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:41 -0300
Subject: KVM: MMU: add "oos_shadow" parameter to disable oos

Subject says it all.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cb391d6..99c239c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -70,6 +70,9 @@ static int dbg = 0;
 module_param(dbg, bool, 0644);
 #endif
 
+static int oos_shadow = 1;
+module_param(oos_shadow, bool, 0644);
+
 #ifndef MMU_DEBUG
 #define ASSERT(x) do { } while (0)
 #else
@@ -1424,7 +1427,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 			return 1;
 		if (shadow->unsync)
 			return 0;
-		if (can_unsync)
+		if (can_unsync && oos_shadow)
 			return kvm_unsync_page(vcpu, shadow);
 		return 1;
 	}
-- 
cgit v0.10.2


From e48258009d941891fca35348986b8d280caf31cd Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Wed, 24 Sep 2008 20:28:34 -0300
Subject: KVM: PIC: enhance IPI avoidance

The PIC code makes little effort to avoid kvm_vcpu_kick(), resulting in
unnecessary guest exits in some conditions.

For example, if the timer interrupt is routed through the IOAPIC, IRR
for IRQ 0 will get set but not cleared, since the APIC is handling the
acks.

This means that everytime an interrupt < 16 is triggered, the priority
logic will find IRQ0 pending and send an IPI to vcpu0 (in case IRQ0 is
not masked, which is Linux's case).

Introduce a new variable isr_ack to represent the IRQ's for which the
guest has been signalled / cleared the ISR. Use it to avoid more than
one IPI per trigger-ack cycle, in addition to the avoidance when ISR is
set in get_priority().

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 71e3eee..17e41e1 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -33,6 +33,14 @@
 static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
 {
 	s->isr &= ~(1 << irq);
+	s->isr_ack |= (1 << irq);
+}
+
+void kvm_pic_clear_isr_ack(struct kvm *kvm)
+{
+	struct kvm_pic *s = pic_irqchip(kvm);
+	s->pics[0].isr_ack = 0xff;
+	s->pics[1].isr_ack = 0xff;
 }
 
 /*
@@ -213,6 +221,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
 	s->irr = 0;
 	s->imr = 0;
 	s->isr = 0;
+	s->isr_ack = 0xff;
 	s->priority_add = 0;
 	s->irq_base = 0;
 	s->read_reg_select = 0;
@@ -444,10 +453,14 @@ static void pic_irq_request(void *opaque, int level)
 {
 	struct kvm *kvm = opaque;
 	struct kvm_vcpu *vcpu = kvm->vcpus[0];
+	struct kvm_pic *s = pic_irqchip(kvm);
+	int irq = pic_get_irq(&s->pics[0]);
 
-	pic_irqchip(kvm)->output = level;
-	if (vcpu)
+	s->output = level;
+	if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
+		s->pics[0].isr_ack &= ~(1 << irq);
 		kvm_vcpu_kick(vcpu);
+	}
 }
 
 struct kvm_pic *kvm_create_pic(struct kvm *kvm)
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 479a3d2..4748532 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -42,6 +42,7 @@ struct kvm_kpic_state {
 	u8 irr;		/* interrupt request register */
 	u8 imr;		/* interrupt mask register */
 	u8 isr;		/* interrupt service register */
+	u8 isr_ack;	/* interrupt ack detection */
 	u8 priority_add;	/* highest irq priority */
 	u8 irq_base;
 	u8 read_reg_select;
@@ -70,6 +71,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm);
 void kvm_pic_set_irq(void *opaque, int irq, int level);
 int kvm_pic_read_irq(struct kvm *kvm);
 void kvm_pic_update_irq(struct kvm_pic *s);
+void kvm_pic_clear_isr_ack(struct kvm *kvm);
 
 static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1c5864a..4cfdd1b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3963,6 +3963,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 			pr_debug("Set back pending irq %d\n",
 				 pending_vec);
 		}
+		kvm_pic_clear_isr_ack(vcpu->kvm);
 	}
 
 	kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
-- 
cgit v0.10.2


From e5fcfc821a467bd0827635db8fd39ab1213987e5 Mon Sep 17 00:00:00 2001
From: Weidong Han <weidong.han@intel.com>
Date: Thu, 25 Sep 2008 23:32:02 +0800
Subject: KVM: Device Assignment: Map mmio pages into VT-d page table

Assigned device could DMA to mmio pages, so also need to map mmio pages
into VT-d page table.

Signed-off-by: Weidong Han <weidong.han@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vtd.c b/arch/x86/kvm/vtd.c
index 667bf3f..a770874 100644
--- a/arch/x86/kvm/vtd.c
+++ b/arch/x86/kvm/vtd.c
@@ -36,37 +36,30 @@ int kvm_iommu_map_pages(struct kvm *kvm,
 {
 	gfn_t gfn = base_gfn;
 	pfn_t pfn;
-	int i, r;
+	int i, r = 0;
 	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
 
 	/* check if iommu exists and in use */
 	if (!domain)
 		return 0;
 
-	r = -EINVAL;
 	for (i = 0; i < npages; i++) {
 		/* check if already mapped */
 		pfn = (pfn_t)intel_iommu_iova_to_pfn(domain,
 						     gfn_to_gpa(gfn));
-		if (pfn && !is_mmio_pfn(pfn))
+		if (pfn)
 			continue;
 
 		pfn = gfn_to_pfn(kvm, gfn);
-		if (!is_mmio_pfn(pfn)) {
-			r = intel_iommu_page_mapping(domain,
-						     gfn_to_gpa(gfn),
-						     pfn_to_hpa(pfn),
-						     PAGE_SIZE,
-						     DMA_PTE_READ |
-						     DMA_PTE_WRITE);
-			if (r) {
-				printk(KERN_DEBUG "kvm_iommu_map_pages:"
-				       "iommu failed to map pfn=%lx\n", pfn);
-				goto unmap_pages;
-			}
-		} else {
-			printk(KERN_DEBUG "kvm_iommu_map_page:"
-			       "invalid pfn=%lx\n", pfn);
+		r = intel_iommu_page_mapping(domain,
+					     gfn_to_gpa(gfn),
+					     pfn_to_hpa(pfn),
+					     PAGE_SIZE,
+					     DMA_PTE_READ |
+					     DMA_PTE_WRITE);
+		if (r) {
+			printk(KERN_ERR "kvm_iommu_map_pages:"
+			       "iommu failed to map pfn=%lx\n", pfn);
 			goto unmap_pages;
 		}
 		gfn++;
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 0992d72..ca6bbc0 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -502,8 +502,6 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
 		  gpa_t addr, unsigned long *ret);
 
-int is_mmio_pfn(pfn_t pfn);
-
 extern bool tdp_enabled;
 
 enum emulation_result {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6cf0427..98cd916 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -76,7 +76,7 @@ static inline int valid_vcpu(int n)
 	return likely(n >= 0 && n < KVM_MAX_VCPUS);
 }
 
-inline int is_mmio_pfn(pfn_t pfn)
+static inline int is_mmio_pfn(pfn_t pfn)
 {
 	if (pfn_valid(pfn))
 		return PageReserved(pfn_to_page(pfn));
-- 
cgit v0.10.2


From 1b10bf31a5de5b76e2e9c2937878a45c5ae2be37 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 30 Sep 2008 10:41:06 +0200
Subject: KVM: x86: Silence various LAPIC-related host kernel messages

KVM-x86 dumps a lot of debug messages that have no meaning for normal
operation:
 - INIT de-assertion is ignored
 - SIPIs are sent and received
 - APIC writes are unaligned or < 4 byte long
   (Windows Server 2003 triggers this on SMP)

Degrade them to true debug messages, keeping the host kernel log clean
for real problems.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index fd00f69..6571926 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -365,16 +365,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
 			kvm_vcpu_kick(vcpu);
 		} else {
-			printk(KERN_DEBUG
-			       "Ignoring de-assert INIT to vcpu %d\n",
-			       vcpu->vcpu_id);
+			apic_debug("Ignoring de-assert INIT to vcpu %d\n",
+				   vcpu->vcpu_id);
 		}
-
 		break;
 
 	case APIC_DM_STARTUP:
-		printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
-		       vcpu->vcpu_id, vector);
+		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
+			   vcpu->vcpu_id, vector);
 		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
 			vcpu->arch.sipi_vector = vector;
 			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
@@ -679,9 +677,9 @@ static void apic_mmio_write(struct kvm_io_device *this,
 	 * Refer SDM 8.4.1
 	 */
 	if (len != 4 || alignment) {
-		if (printk_ratelimit())
-			printk(KERN_ERR "apic write: bad size=%d %lx\n",
-			       len, (long)address);
+		/* Don't shout loud, $infamous_os would cause only noise. */
+		apic_debug("apic write: bad size=%d %lx\n",
+			   len, (long)address);
 		return;
 	}
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4cfdd1b..d6d7123 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3224,8 +3224,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	int r;
 
 	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
-		printk("vcpu %d received sipi with vector # %x\n",
-		       vcpu->vcpu_id, vcpu->arch.sipi_vector);
+		pr_debug("vcpu %d received sipi with vector # %x\n",
+			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
 		kvm_lapic_reset(vcpu);
 		r = kvm_x86_ops->vcpu_reset(vcpu);
 		if (r)
-- 
cgit v0.10.2


From 83dbc83a0d7c88c919d769177bd1924a46c9c034 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 7 Oct 2008 17:01:27 -0300
Subject: KVM: VMX: enable invlpg exiting if EPT is disabled

Manually disabling EPT via module option fails to re-enable INVLPG
exiting.

Reported-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4556cc3..2643b43 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2118,7 +2118,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	}
 	if (!vm_need_ept())
 		exec_control |= CPU_BASED_CR3_STORE_EXITING |
-				CPU_BASED_CR3_LOAD_EXITING;
+				CPU_BASED_CR3_LOAD_EXITING  |
+				CPU_BASED_INVLPG_EXITING;
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
 
 	if (cpu_has_secondary_exec_ctrls()) {
-- 
cgit v0.10.2


From 371c01b28e4049d1fbf60a9631cdad98f7cae030 Mon Sep 17 00:00:00 2001
From: Zhang xiantao <xiantao.zhang@intel.com>
Date: Thu, 11 Sep 2008 13:19:32 +0800
Subject: KVM: Device Assignment: Move vtd.c from arch/x86/kvm/ to virt/kvm/

Preparation for kvm/ia64 VT-d support.

Signed-off-by: Zhang xiantao <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 3072b17..7dce593 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,14 +7,14 @@ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 ifeq ($(CONFIG_KVM_TRACE),y)
 common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
 endif
+ifeq ($(CONFIG_DMAR),y)
+common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
+endif
 
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
 
 kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
 	i8254.o
-ifeq ($(CONFIG_DMAR),y)
-kvm-objs += vtd.o
-endif
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/vtd.c b/arch/x86/kvm/vtd.c
deleted file mode 100644
index a770874..0000000
--- a/arch/x86/kvm/vtd.c
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (c) 2006, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- * Copyright (C) 2006-2008 Intel Corporation
- * Copyright IBM Corporation, 2008
- * Author: Allen M. Kay <allen.m.kay@intel.com>
- * Author: Weidong Han <weidong.han@intel.com>
- * Author: Ben-Ami Yassour <benami@il.ibm.com>
- */
-
-#include <linux/list.h>
-#include <linux/kvm_host.h>
-#include <linux/pci.h>
-#include <linux/dmar.h>
-#include <linux/intel-iommu.h>
-
-static int kvm_iommu_unmap_memslots(struct kvm *kvm);
-static void kvm_iommu_put_pages(struct kvm *kvm,
-				gfn_t base_gfn, unsigned long npages);
-
-int kvm_iommu_map_pages(struct kvm *kvm,
-			gfn_t base_gfn, unsigned long npages)
-{
-	gfn_t gfn = base_gfn;
-	pfn_t pfn;
-	int i, r = 0;
-	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
-
-	/* check if iommu exists and in use */
-	if (!domain)
-		return 0;
-
-	for (i = 0; i < npages; i++) {
-		/* check if already mapped */
-		pfn = (pfn_t)intel_iommu_iova_to_pfn(domain,
-						     gfn_to_gpa(gfn));
-		if (pfn)
-			continue;
-
-		pfn = gfn_to_pfn(kvm, gfn);
-		r = intel_iommu_page_mapping(domain,
-					     gfn_to_gpa(gfn),
-					     pfn_to_hpa(pfn),
-					     PAGE_SIZE,
-					     DMA_PTE_READ |
-					     DMA_PTE_WRITE);
-		if (r) {
-			printk(KERN_ERR "kvm_iommu_map_pages:"
-			       "iommu failed to map pfn=%lx\n", pfn);
-			goto unmap_pages;
-		}
-		gfn++;
-	}
-	return 0;
-
-unmap_pages:
-	kvm_iommu_put_pages(kvm, base_gfn, i);
-	return r;
-}
-
-static int kvm_iommu_map_memslots(struct kvm *kvm)
-{
-	int i, r;
-
-	down_read(&kvm->slots_lock);
-	for (i = 0; i < kvm->nmemslots; i++) {
-		r = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn,
-					kvm->memslots[i].npages);
-		if (r)
-			break;
-	}
-	up_read(&kvm->slots_lock);
-	return r;
-}
-
-int kvm_iommu_map_guest(struct kvm *kvm,
-			struct kvm_assigned_dev_kernel *assigned_dev)
-{
-	struct pci_dev *pdev = NULL;
-	int r;
-
-	if (!intel_iommu_found()) {
-		printk(KERN_ERR "%s: intel iommu not found\n", __func__);
-		return -ENODEV;
-	}
-
-	printk(KERN_DEBUG "VT-d direct map: host bdf = %x:%x:%x\n",
-	       assigned_dev->host_busnr,
-	       PCI_SLOT(assigned_dev->host_devfn),
-	       PCI_FUNC(assigned_dev->host_devfn));
-
-	pdev = assigned_dev->dev;
-
-	if (pdev == NULL) {
-		if (kvm->arch.intel_iommu_domain) {
-			intel_iommu_domain_exit(kvm->arch.intel_iommu_domain);
-			kvm->arch.intel_iommu_domain = NULL;
-		}
-		return -ENODEV;
-	}
-
-	kvm->arch.intel_iommu_domain = intel_iommu_domain_alloc(pdev);
-	if (!kvm->arch.intel_iommu_domain)
-		return -ENODEV;
-
-	r = kvm_iommu_map_memslots(kvm);
-	if (r)
-		goto out_unmap;
-
-	intel_iommu_detach_dev(kvm->arch.intel_iommu_domain,
-			       pdev->bus->number, pdev->devfn);
-
-	r = intel_iommu_context_mapping(kvm->arch.intel_iommu_domain,
-					pdev);
-	if (r) {
-		printk(KERN_ERR "Domain context map for %s failed",
-		       pci_name(pdev));
-		goto out_unmap;
-	}
-	return 0;
-
-out_unmap:
-	kvm_iommu_unmap_memslots(kvm);
-	return r;
-}
-
-static void kvm_iommu_put_pages(struct kvm *kvm,
-			       gfn_t base_gfn, unsigned long npages)
-{
-	gfn_t gfn = base_gfn;
-	pfn_t pfn;
-	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
-	int i;
-
-	for (i = 0; i < npages; i++) {
-		pfn = (pfn_t)intel_iommu_iova_to_pfn(domain,
-						     gfn_to_gpa(gfn));
-		kvm_release_pfn_clean(pfn);
-		gfn++;
-	}
-}
-
-static int kvm_iommu_unmap_memslots(struct kvm *kvm)
-{
-	int i;
-	down_read(&kvm->slots_lock);
-	for (i = 0; i < kvm->nmemslots; i++) {
-		kvm_iommu_put_pages(kvm, kvm->memslots[i].base_gfn,
-				    kvm->memslots[i].npages);
-	}
-	up_read(&kvm->slots_lock);
-
-	return 0;
-}
-
-int kvm_iommu_unmap_guest(struct kvm *kvm)
-{
-	struct kvm_assigned_dev_kernel *entry;
-	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
-
-	/* check if iommu exists and in use */
-	if (!domain)
-		return 0;
-
-	list_for_each_entry(entry, &kvm->arch.assigned_dev_head, list) {
-		printk(KERN_DEBUG "VT-d unmap: host bdf = %x:%x:%x\n",
-		       entry->host_busnr,
-		       PCI_SLOT(entry->host_devfn),
-		       PCI_FUNC(entry->host_devfn));
-
-		/* detach kvm dmar domain */
-		intel_iommu_detach_dev(domain, entry->host_busnr,
-				       entry->host_devfn);
-	}
-	kvm_iommu_unmap_memslots(kvm);
-	intel_iommu_domain_exit(domain);
-	return 0;
-}
diff --git a/virt/kvm/vtd.c b/virt/kvm/vtd.c
new file mode 100644
index 0000000..a770874
--- /dev/null
+++ b/virt/kvm/vtd.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) 2006-2008 Intel Corporation
+ * Copyright IBM Corporation, 2008
+ * Author: Allen M. Kay <allen.m.kay@intel.com>
+ * Author: Weidong Han <weidong.han@intel.com>
+ * Author: Ben-Ami Yassour <benami@il.ibm.com>
+ */
+
+#include <linux/list.h>
+#include <linux/kvm_host.h>
+#include <linux/pci.h>
+#include <linux/dmar.h>
+#include <linux/intel-iommu.h>
+
+static int kvm_iommu_unmap_memslots(struct kvm *kvm);
+static void kvm_iommu_put_pages(struct kvm *kvm,
+				gfn_t base_gfn, unsigned long npages);
+
+int kvm_iommu_map_pages(struct kvm *kvm,
+			gfn_t base_gfn, unsigned long npages)
+{
+	gfn_t gfn = base_gfn;
+	pfn_t pfn;
+	int i, r = 0;
+	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
+
+	/* check if iommu exists and in use */
+	if (!domain)
+		return 0;
+
+	for (i = 0; i < npages; i++) {
+		/* check if already mapped */
+		pfn = (pfn_t)intel_iommu_iova_to_pfn(domain,
+						     gfn_to_gpa(gfn));
+		if (pfn)
+			continue;
+
+		pfn = gfn_to_pfn(kvm, gfn);
+		r = intel_iommu_page_mapping(domain,
+					     gfn_to_gpa(gfn),
+					     pfn_to_hpa(pfn),
+					     PAGE_SIZE,
+					     DMA_PTE_READ |
+					     DMA_PTE_WRITE);
+		if (r) {
+			printk(KERN_ERR "kvm_iommu_map_pages:"
+			       "iommu failed to map pfn=%lx\n", pfn);
+			goto unmap_pages;
+		}
+		gfn++;
+	}
+	return 0;
+
+unmap_pages:
+	kvm_iommu_put_pages(kvm, base_gfn, i);
+	return r;
+}
+
+static int kvm_iommu_map_memslots(struct kvm *kvm)
+{
+	int i, r;
+
+	down_read(&kvm->slots_lock);
+	for (i = 0; i < kvm->nmemslots; i++) {
+		r = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn,
+					kvm->memslots[i].npages);
+		if (r)
+			break;
+	}
+	up_read(&kvm->slots_lock);
+	return r;
+}
+
+int kvm_iommu_map_guest(struct kvm *kvm,
+			struct kvm_assigned_dev_kernel *assigned_dev)
+{
+	struct pci_dev *pdev = NULL;
+	int r;
+
+	if (!intel_iommu_found()) {
+		printk(KERN_ERR "%s: intel iommu not found\n", __func__);
+		return -ENODEV;
+	}
+
+	printk(KERN_DEBUG "VT-d direct map: host bdf = %x:%x:%x\n",
+	       assigned_dev->host_busnr,
+	       PCI_SLOT(assigned_dev->host_devfn),
+	       PCI_FUNC(assigned_dev->host_devfn));
+
+	pdev = assigned_dev->dev;
+
+	if (pdev == NULL) {
+		if (kvm->arch.intel_iommu_domain) {
+			intel_iommu_domain_exit(kvm->arch.intel_iommu_domain);
+			kvm->arch.intel_iommu_domain = NULL;
+		}
+		return -ENODEV;
+	}
+
+	kvm->arch.intel_iommu_domain = intel_iommu_domain_alloc(pdev);
+	if (!kvm->arch.intel_iommu_domain)
+		return -ENODEV;
+
+	r = kvm_iommu_map_memslots(kvm);
+	if (r)
+		goto out_unmap;
+
+	intel_iommu_detach_dev(kvm->arch.intel_iommu_domain,
+			       pdev->bus->number, pdev->devfn);
+
+	r = intel_iommu_context_mapping(kvm->arch.intel_iommu_domain,
+					pdev);
+	if (r) {
+		printk(KERN_ERR "Domain context map for %s failed",
+		       pci_name(pdev));
+		goto out_unmap;
+	}
+	return 0;
+
+out_unmap:
+	kvm_iommu_unmap_memslots(kvm);
+	return r;
+}
+
+static void kvm_iommu_put_pages(struct kvm *kvm,
+			       gfn_t base_gfn, unsigned long npages)
+{
+	gfn_t gfn = base_gfn;
+	pfn_t pfn;
+	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
+	int i;
+
+	for (i = 0; i < npages; i++) {
+		pfn = (pfn_t)intel_iommu_iova_to_pfn(domain,
+						     gfn_to_gpa(gfn));
+		kvm_release_pfn_clean(pfn);
+		gfn++;
+	}
+}
+
+static int kvm_iommu_unmap_memslots(struct kvm *kvm)
+{
+	int i;
+	down_read(&kvm->slots_lock);
+	for (i = 0; i < kvm->nmemslots; i++) {
+		kvm_iommu_put_pages(kvm, kvm->memslots[i].base_gfn,
+				    kvm->memslots[i].npages);
+	}
+	up_read(&kvm->slots_lock);
+
+	return 0;
+}
+
+int kvm_iommu_unmap_guest(struct kvm *kvm)
+{
+	struct kvm_assigned_dev_kernel *entry;
+	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
+
+	/* check if iommu exists and in use */
+	if (!domain)
+		return 0;
+
+	list_for_each_entry(entry, &kvm->arch.assigned_dev_head, list) {
+		printk(KERN_DEBUG "VT-d unmap: host bdf = %x:%x:%x\n",
+		       entry->host_busnr,
+		       PCI_SLOT(entry->host_devfn),
+		       PCI_FUNC(entry->host_devfn));
+
+		/* detach kvm dmar domain */
+		intel_iommu_detach_dev(domain, entry->host_busnr,
+				       entry->host_devfn);
+	}
+	kvm_iommu_unmap_memslots(kvm);
+	intel_iommu_domain_exit(domain);
+	return 0;
+}
-- 
cgit v0.10.2


From 8a98f6648a2b0756d8f26d6c13332f5526355fec Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Mon, 6 Oct 2008 13:47:38 +0800
Subject: KVM: Move device assignment logic to common code

To share with other archs, this patch moves device assignment
logic to common parts.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d6d7123..f8bde01 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -30,7 +30,6 @@
 #include <linux/interrupt.h>
 #include <linux/kvm.h>
 #include <linux/fs.h>
-#include <linux/pci.h>
 #include <linux/vmalloc.h>
 #include <linux/module.h>
 #include <linux/mman.h>
@@ -107,238 +106,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ NULL }
 };
 
-static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
-						      int assigned_dev_id)
-{
-	struct list_head *ptr;
-	struct kvm_assigned_dev_kernel *match;
-
-	list_for_each(ptr, head) {
-		match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
-		if (match->assigned_dev_id == assigned_dev_id)
-			return match;
-	}
-	return NULL;
-}
-
-static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
-{
-	struct kvm_assigned_dev_kernel *assigned_dev;
-
-	assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
-				    interrupt_work);
-
-	/* This is taken to safely inject irq inside the guest. When
-	 * the interrupt injection (or the ioapic code) uses a
-	 * finer-grained lock, update this
-	 */
-	mutex_lock(&assigned_dev->kvm->lock);
-	kvm_set_irq(assigned_dev->kvm,
-		    assigned_dev->guest_irq, 1);
-	mutex_unlock(&assigned_dev->kvm->lock);
-	kvm_put_kvm(assigned_dev->kvm);
-}
-
-/* FIXME: Implement the OR logic needed to make shared interrupts on
- * this line behave properly
- */
-static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
-{
-	struct kvm_assigned_dev_kernel *assigned_dev =
-		(struct kvm_assigned_dev_kernel *) dev_id;
-
-	kvm_get_kvm(assigned_dev->kvm);
-	schedule_work(&assigned_dev->interrupt_work);
-	disable_irq_nosync(irq);
-	return IRQ_HANDLED;
-}
-
-/* Ack the irq line for an assigned device */
-static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
-{
-	struct kvm_assigned_dev_kernel *dev;
-
-	if (kian->gsi == -1)
-		return;
-
-	dev = container_of(kian, struct kvm_assigned_dev_kernel,
-			   ack_notifier);
-	kvm_set_irq(dev->kvm, dev->guest_irq, 0);
-	enable_irq(dev->host_irq);
-}
-
-static void kvm_free_assigned_device(struct kvm *kvm,
-				     struct kvm_assigned_dev_kernel
-				     *assigned_dev)
-{
-	if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested)
-		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
-
-	kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
-
-	if (cancel_work_sync(&assigned_dev->interrupt_work))
-		/* We had pending work. That means we will have to take
-		 * care of kvm_put_kvm.
-		 */
-		kvm_put_kvm(kvm);
-
-	pci_release_regions(assigned_dev->dev);
-	pci_disable_device(assigned_dev->dev);
-	pci_dev_put(assigned_dev->dev);
-
-	list_del(&assigned_dev->list);
-	kfree(assigned_dev);
-}
-
-static void kvm_free_all_assigned_devices(struct kvm *kvm)
-{
-	struct list_head *ptr, *ptr2;
-	struct kvm_assigned_dev_kernel *assigned_dev;
-
-	list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
-		assigned_dev = list_entry(ptr,
-					  struct kvm_assigned_dev_kernel,
-					  list);
-
-		kvm_free_assigned_device(kvm, assigned_dev);
-	}
-}
-
-static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
-				   struct kvm_assigned_irq
-				   *assigned_irq)
-{
-	int r = 0;
-	struct kvm_assigned_dev_kernel *match;
-
-	mutex_lock(&kvm->lock);
-
-	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      assigned_irq->assigned_dev_id);
-	if (!match) {
-		mutex_unlock(&kvm->lock);
-		return -EINVAL;
-	}
-
-	if (match->irq_requested) {
-		match->guest_irq = assigned_irq->guest_irq;
-		match->ack_notifier.gsi = assigned_irq->guest_irq;
-		mutex_unlock(&kvm->lock);
-		return 0;
-	}
-
-	INIT_WORK(&match->interrupt_work,
-		  kvm_assigned_dev_interrupt_work_handler);
-
-	if (irqchip_in_kernel(kvm)) {
-		if (!capable(CAP_SYS_RAWIO)) {
-			r = -EPERM;
-			goto out_release;
-		}
-
-		if (assigned_irq->host_irq)
-			match->host_irq = assigned_irq->host_irq;
-		else
-			match->host_irq = match->dev->irq;
-		match->guest_irq = assigned_irq->guest_irq;
-		match->ack_notifier.gsi = assigned_irq->guest_irq;
-		match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
-		kvm_register_irq_ack_notifier(kvm, &match->ack_notifier);
-
-		/* Even though this is PCI, we don't want to use shared
-		 * interrupts. Sharing host devices with guest-assigned devices
-		 * on the same interrupt line is not a happy situation: there
-		 * are going to be long delays in accepting, acking, etc.
-		 */
-		if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0,
-				"kvm_assigned_device", (void *)match)) {
-			r = -EIO;
-			goto out_release;
-		}
-	}
-
-	match->irq_requested = true;
-	mutex_unlock(&kvm->lock);
-	return r;
-out_release:
-	mutex_unlock(&kvm->lock);
-	kvm_free_assigned_device(kvm, match);
-	return r;
-}
-
-static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
-				      struct kvm_assigned_pci_dev *assigned_dev)
-{
-	int r = 0;
-	struct kvm_assigned_dev_kernel *match;
-	struct pci_dev *dev;
-
-	mutex_lock(&kvm->lock);
-
-	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      assigned_dev->assigned_dev_id);
-	if (match) {
-		/* device already assigned */
-		r = -EINVAL;
-		goto out;
-	}
-
-	match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
-	if (match == NULL) {
-		printk(KERN_INFO "%s: Couldn't allocate memory\n",
-		       __func__);
-		r = -ENOMEM;
-		goto out;
-	}
-	dev = pci_get_bus_and_slot(assigned_dev->busnr,
-				   assigned_dev->devfn);
-	if (!dev) {
-		printk(KERN_INFO "%s: host device not found\n", __func__);
-		r = -EINVAL;
-		goto out_free;
-	}
-	if (pci_enable_device(dev)) {
-		printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
-		r = -EBUSY;
-		goto out_put;
-	}
-	r = pci_request_regions(dev, "kvm_assigned_device");
-	if (r) {
-		printk(KERN_INFO "%s: Could not get access to device regions\n",
-		       __func__);
-		goto out_disable;
-	}
-	match->assigned_dev_id = assigned_dev->assigned_dev_id;
-	match->host_busnr = assigned_dev->busnr;
-	match->host_devfn = assigned_dev->devfn;
-	match->dev = dev;
-
-	match->kvm = kvm;
-
-	list_add(&match->list, &kvm->arch.assigned_dev_head);
-
-	if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
-		r = kvm_iommu_map_guest(kvm, match);
-		if (r)
-			goto out_list_del;
-	}
-
-out:
-	mutex_unlock(&kvm->lock);
-	return r;
-out_list_del:
-	list_del(&match->list);
-	pci_release_regions(dev);
-out_disable:
-	pci_disable_device(dev);
-out_put:
-	pci_dev_put(dev);
-out_free:
-	kfree(match);
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
 unsigned long segment_base(u16 selector)
 {
 	struct descriptor_table gdt;
@@ -2030,28 +1797,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			goto out;
 		break;
 	}
-	case KVM_ASSIGN_PCI_DEVICE: {
-		struct kvm_assigned_pci_dev assigned_dev;
-
-		r = -EFAULT;
-		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
-			goto out;
-		r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
-		if (r)
-			goto out;
-		break;
-	}
-	case KVM_ASSIGN_IRQ: {
-		struct kvm_assigned_irq assigned_irq;
-
-		r = -EFAULT;
-		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
-			goto out;
-		r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
-		if (r)
-			goto out;
-		break;
-	}
 	case KVM_GET_PIT: {
 		r = -EFAULT;
 		if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 4269be1..9acf34a 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -383,7 +383,9 @@ struct kvm_trace_rec {
 #define KVM_CAP_MP_STATE 14
 #define KVM_CAP_COALESCED_MMIO 15
 #define KVM_CAP_SYNC_MMU 16  /* Changes to host mmap are reflected in guest */
+#ifdef CONFIG_X86
 #define KVM_CAP_DEVICE_ASSIGNMENT 17
+#endif
 #define KVM_CAP_IOMMU 18
 
 /*
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 73b7c52..10c1146 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -281,6 +281,7 @@ void kvm_free_physmem(struct kvm *kvm);
 
 struct  kvm *kvm_arch_create_vm(void);
 void kvm_arch_destroy_vm(struct kvm *kvm);
+void kvm_free_all_assigned_devices(struct kvm *kvm);
 
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 98cd916..485bcdc 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -51,6 +51,12 @@
 #include "coalesced_mmio.h"
 #endif
 
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include "irq.h"
+#endif
+
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
@@ -71,6 +77,240 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
 
 bool kvm_rebooting;
 
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
+						      int assigned_dev_id)
+{
+	struct list_head *ptr;
+	struct kvm_assigned_dev_kernel *match;
+
+	list_for_each(ptr, head) {
+		match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
+		if (match->assigned_dev_id == assigned_dev_id)
+			return match;
+	}
+	return NULL;
+}
+
+static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
+{
+	struct kvm_assigned_dev_kernel *assigned_dev;
+
+	assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
+				    interrupt_work);
+
+	/* This is taken to safely inject irq inside the guest. When
+	 * the interrupt injection (or the ioapic code) uses a
+	 * finer-grained lock, update this
+	 */
+	mutex_lock(&assigned_dev->kvm->lock);
+	kvm_set_irq(assigned_dev->kvm,
+		    assigned_dev->guest_irq, 1);
+	mutex_unlock(&assigned_dev->kvm->lock);
+	kvm_put_kvm(assigned_dev->kvm);
+}
+
+/* FIXME: Implement the OR logic needed to make shared interrupts on
+ * this line behave properly
+ */
+static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
+{
+	struct kvm_assigned_dev_kernel *assigned_dev =
+		(struct kvm_assigned_dev_kernel *) dev_id;
+
+	kvm_get_kvm(assigned_dev->kvm);
+	schedule_work(&assigned_dev->interrupt_work);
+	disable_irq_nosync(irq);
+	return IRQ_HANDLED;
+}
+
+/* Ack the irq line for an assigned device */
+static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+	struct kvm_assigned_dev_kernel *dev;
+
+	if (kian->gsi == -1)
+		return;
+
+	dev = container_of(kian, struct kvm_assigned_dev_kernel,
+			   ack_notifier);
+	kvm_set_irq(dev->kvm, dev->guest_irq, 0);
+	enable_irq(dev->host_irq);
+}
+
+static void kvm_free_assigned_device(struct kvm *kvm,
+				     struct kvm_assigned_dev_kernel
+				     *assigned_dev)
+{
+	if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested)
+		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+
+	kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
+
+	if (cancel_work_sync(&assigned_dev->interrupt_work))
+		/* We had pending work. That means we will have to take
+		 * care of kvm_put_kvm.
+		 */
+		kvm_put_kvm(kvm);
+
+	pci_release_regions(assigned_dev->dev);
+	pci_disable_device(assigned_dev->dev);
+	pci_dev_put(assigned_dev->dev);
+
+	list_del(&assigned_dev->list);
+	kfree(assigned_dev);
+}
+
+void kvm_free_all_assigned_devices(struct kvm *kvm)
+{
+	struct list_head *ptr, *ptr2;
+	struct kvm_assigned_dev_kernel *assigned_dev;
+
+	list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
+		assigned_dev = list_entry(ptr,
+					  struct kvm_assigned_dev_kernel,
+					  list);
+
+		kvm_free_assigned_device(kvm, assigned_dev);
+	}
+}
+
+static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
+				   struct kvm_assigned_irq
+				   *assigned_irq)
+{
+	int r = 0;
+	struct kvm_assigned_dev_kernel *match;
+
+	mutex_lock(&kvm->lock);
+
+	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_irq->assigned_dev_id);
+	if (!match) {
+		mutex_unlock(&kvm->lock);
+		return -EINVAL;
+	}
+
+	if (match->irq_requested) {
+		match->guest_irq = assigned_irq->guest_irq;
+		match->ack_notifier.gsi = assigned_irq->guest_irq;
+		mutex_unlock(&kvm->lock);
+		return 0;
+	}
+
+	INIT_WORK(&match->interrupt_work,
+		  kvm_assigned_dev_interrupt_work_handler);
+
+	if (irqchip_in_kernel(kvm)) {
+		if (!capable(CAP_SYS_RAWIO)) {
+			r = -EPERM;
+			goto out_release;
+		}
+
+		if (assigned_irq->host_irq)
+			match->host_irq = assigned_irq->host_irq;
+		else
+			match->host_irq = match->dev->irq;
+		match->guest_irq = assigned_irq->guest_irq;
+		match->ack_notifier.gsi = assigned_irq->guest_irq;
+		match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
+		kvm_register_irq_ack_notifier(kvm, &match->ack_notifier);
+
+		/* Even though this is PCI, we don't want to use shared
+		 * interrupts. Sharing host devices with guest-assigned devices
+		 * on the same interrupt line is not a happy situation: there
+		 * are going to be long delays in accepting, acking, etc.
+		 */
+		if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0,
+				"kvm_assigned_device", (void *)match)) {
+			r = -EIO;
+			goto out_release;
+		}
+	}
+
+	match->irq_requested = true;
+	mutex_unlock(&kvm->lock);
+	return r;
+out_release:
+	mutex_unlock(&kvm->lock);
+	kvm_free_assigned_device(kvm, match);
+	return r;
+}
+
+static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
+				      struct kvm_assigned_pci_dev *assigned_dev)
+{
+	int r = 0;
+	struct kvm_assigned_dev_kernel *match;
+	struct pci_dev *dev;
+
+	mutex_lock(&kvm->lock);
+
+	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_dev->assigned_dev_id);
+	if (match) {
+		/* device already assigned */
+		r = -EINVAL;
+		goto out;
+	}
+
+	match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
+	if (match == NULL) {
+		printk(KERN_INFO "%s: Couldn't allocate memory\n",
+		       __func__);
+		r = -ENOMEM;
+		goto out;
+	}
+	dev = pci_get_bus_and_slot(assigned_dev->busnr,
+				   assigned_dev->devfn);
+	if (!dev) {
+		printk(KERN_INFO "%s: host device not found\n", __func__);
+		r = -EINVAL;
+		goto out_free;
+	}
+	if (pci_enable_device(dev)) {
+		printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
+		r = -EBUSY;
+		goto out_put;
+	}
+	r = pci_request_regions(dev, "kvm_assigned_device");
+	if (r) {
+		printk(KERN_INFO "%s: Could not get access to device regions\n",
+		       __func__);
+		goto out_disable;
+	}
+	match->assigned_dev_id = assigned_dev->assigned_dev_id;
+	match->host_busnr = assigned_dev->busnr;
+	match->host_devfn = assigned_dev->devfn;
+	match->dev = dev;
+
+	match->kvm = kvm;
+
+	list_add(&match->list, &kvm->arch.assigned_dev_head);
+
+	if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
+		r = kvm_iommu_map_guest(kvm, match);
+		if (r)
+			goto out_list_del;
+	}
+
+out:
+	mutex_unlock(&kvm->lock);
+	return r;
+out_list_del:
+	list_del(&match->list);
+	pci_release_regions(dev);
+out_disable:
+	pci_disable_device(dev);
+out_put:
+	pci_dev_put(dev);
+out_free:
+	kfree(match);
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+#endif
+
 static inline int valid_vcpu(int n)
 {
 	return likely(n >= 0 && n < KVM_MAX_VCPUS);
@@ -578,12 +818,12 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	}
 
 	kvm_free_physmem_slot(&old, &new);
-
+#ifdef CONFIG_DMAR
 	/* map the pages in iommu page table */
 	r = kvm_iommu_map_pages(kvm, base_gfn, npages);
 	if (r)
 		goto out;
-
+#endif
 	return 0;
 
 out_free:
@@ -1383,6 +1623,30 @@ static long kvm_vm_ioctl(struct file *filp,
 		break;
 	}
 #endif
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+	case KVM_ASSIGN_PCI_DEVICE: {
+		struct kvm_assigned_pci_dev assigned_dev;
+
+		r = -EFAULT;
+		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+			goto out;
+		r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
+		if (r)
+			goto out;
+		break;
+	}
+	case KVM_ASSIGN_IRQ: {
+		struct kvm_assigned_irq assigned_irq;
+
+		r = -EFAULT;
+		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+			goto out;
+		r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
+		if (r)
+			goto out;
+		break;
+	}
+#endif
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 	}
-- 
cgit v0.10.2


From c77fb9dc7a0383c86eabef30272a763a482403e1 Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Sat, 27 Sep 2008 10:55:40 +0800
Subject: KVM: Change is_mmio_pfn to kvm_is_mmio_pfn, and make it common for
 all archs

Add a kvm_ prefix to avoid polluting kernel's name space.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 10c1146..b3b7598 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -288,6 +288,8 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
 
+int kvm_is_mmio_pfn(pfn_t pfn);
+
 struct kvm_irq_ack_notifier {
 	struct hlist_node link;
 	unsigned gsi;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 485bcdc..cf0ab8e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -316,7 +316,7 @@ static inline int valid_vcpu(int n)
 	return likely(n >= 0 && n < KVM_MAX_VCPUS);
 }
 
-static inline int is_mmio_pfn(pfn_t pfn)
+inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
 	if (pfn_valid(pfn))
 		return PageReserved(pfn_to_page(pfn));
@@ -994,7 +994,7 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 
 		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 		up_read(&current->mm->mmap_sem);
-		BUG_ON(!is_mmio_pfn(pfn));
+		BUG_ON(!kvm_is_mmio_pfn(pfn));
 	} else
 		pfn = page_to_pfn(page[0]);
 
@@ -1008,10 +1008,10 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 	pfn_t pfn;
 
 	pfn = gfn_to_pfn(kvm, gfn);
-	if (!is_mmio_pfn(pfn))
+	if (!kvm_is_mmio_pfn(pfn))
 		return pfn_to_page(pfn);
 
-	WARN_ON(is_mmio_pfn(pfn));
+	WARN_ON(kvm_is_mmio_pfn(pfn));
 
 	get_page(bad_page);
 	return bad_page;
@@ -1027,7 +1027,7 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean);
 
 void kvm_release_pfn_clean(pfn_t pfn)
 {
-	if (!is_mmio_pfn(pfn))
+	if (!kvm_is_mmio_pfn(pfn))
 		put_page(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
@@ -1053,7 +1053,7 @@ EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
 
 void kvm_set_pfn_dirty(pfn_t pfn)
 {
-	if (!is_mmio_pfn(pfn)) {
+	if (!kvm_is_mmio_pfn(pfn)) {
 		struct page *page = pfn_to_page(pfn);
 		if (!PageReserved(page))
 			SetPageDirty(page);
@@ -1063,14 +1063,14 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
 
 void kvm_set_pfn_accessed(pfn_t pfn)
 {
-	if (!is_mmio_pfn(pfn))
+	if (!kvm_is_mmio_pfn(pfn))
 		mark_page_accessed(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
 
 void kvm_get_pfn(pfn_t pfn)
 {
-	if (!is_mmio_pfn(pfn))
+	if (!kvm_is_mmio_pfn(pfn))
 		get_page(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_get_pfn);
-- 
cgit v0.10.2


From 3de42dc094ecd313dc7d551e007a134b52f8663d Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Mon, 6 Oct 2008 13:48:45 +0800
Subject: KVM: Separate irq ack notification out of arch/x86/kvm/irq.c

Moving irq ack notification logic as common, and make
it shared with ia64 side.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 1efe513..da579a3 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -413,6 +413,10 @@ struct kvm_arch {
 	struct kvm_ioapic *vioapic;
 	struct kvm_vm_stat stat;
 	struct kvm_sal_data rdv_sal_data;
+
+	struct list_head assigned_dev_head;
+	struct dmar_domain *intel_iommu_domain;
+	struct hlist_head irq_ack_notifier_list;
 };
 
 union cpuid3_t {
diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile
index bf22fb9..3b1a1c1 100644
--- a/arch/ia64/kvm/Makefile
+++ b/arch/ia64/kvm/Makefile
@@ -44,7 +44,7 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
 EXTRA_AFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
 
 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-		coalesced_mmio.o)
+		coalesced_mmio.o irq_comm.o)
 
 kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o
 obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/ia64/kvm/irq.h b/arch/ia64/kvm/irq.h
index f2e6545..604329a 100644
--- a/arch/ia64/kvm/irq.h
+++ b/arch/ia64/kvm/irq.h
@@ -23,10 +23,5 @@
 #ifndef __IRQ_H
 #define __IRQ_H
 
-struct kvm;
-
-static inline void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi)
-{
-}
 
 #endif
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 7dce593..c023435 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -3,7 +3,7 @@
 #
 
 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-                coalesced_mmio.o)
+                coalesced_mmio.o irq_comm.o)
 ifeq ($(CONFIG_KVM_TRACE),y)
 common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
 endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 8c1b9c5..c019b8e 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -99,36 +99,3 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
 	__kvm_migrate_apic_timer(vcpu);
 	__kvm_migrate_pit_timer(vcpu);
 }
-
-/* This should be called with the kvm->lock mutex held */
-void kvm_set_irq(struct kvm *kvm, int irq, int level)
-{
-	/* Not possible to detect if the guest uses the PIC or the
-	 * IOAPIC.  So set the bit in both. The guest will ignore
-	 * writes to the unused one.
-	 */
-	kvm_ioapic_set_irq(kvm->arch.vioapic, irq, level);
-	kvm_pic_set_irq(pic_irqchip(kvm), irq, level);
-}
-
-void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi)
-{
-	struct kvm_irq_ack_notifier *kian;
-	struct hlist_node *n;
-
-	hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link)
-		if (kian->gsi == gsi)
-			kian->irq_acked(kian);
-}
-
-void kvm_register_irq_ack_notifier(struct kvm *kvm,
-				   struct kvm_irq_ack_notifier *kian)
-{
-	hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list);
-}
-
-void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
-				     struct kvm_irq_ack_notifier *kian)
-{
-	hlist_del(&kian->link);
-}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 4748532..f17c8f5 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -68,7 +68,6 @@ struct kvm_pic {
 };
 
 struct kvm_pic *kvm_create_pic(struct kvm *kvm);
-void kvm_pic_set_irq(void *opaque, int irq, int level);
 int kvm_pic_read_irq(struct kvm *kvm);
 void kvm_pic_update_irq(struct kvm_pic *s);
 void kvm_pic_clear_isr_ack(struct kvm *kvm);
@@ -85,13 +84,6 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
 
 void kvm_pic_reset(struct kvm_kpic_state *s);
 
-void kvm_set_irq(struct kvm *kvm, int irq, int level);
-void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi);
-void kvm_register_irq_ack_notifier(struct kvm *kvm,
-				   struct kvm_irq_ack_notifier *kian);
-void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
-				     struct kvm_irq_ack_notifier *kian);
-
 void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index ca6bbc0..411fb8c 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -565,6 +565,8 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
 			   u32 error_code);
 
+void kvm_pic_set_irq(void *opaque, int irq, int level);
+
 void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 
 void fx_init(struct kvm_vcpu *vcpu);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b3b7598..3833c48 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -309,6 +309,12 @@ struct kvm_assigned_dev_kernel {
 	struct pci_dev *dev;
 	struct kvm *kvm;
 };
+void kvm_set_irq(struct kvm *kvm, int irq, int level);
+void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi);
+void kvm_register_irq_ack_notifier(struct kvm *kvm,
+				   struct kvm_irq_ack_notifier *kian);
+void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
+				     struct kvm_irq_ack_notifier *kian);
 
 #ifdef CONFIG_DMAR
 int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn,
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
new file mode 100644
index 0000000..d0169f5
--- /dev/null
+++ b/virt/kvm/irq_comm.c
@@ -0,0 +1,60 @@
+/*
+ * irq_comm.c: Common API for in kernel interrupt controller
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ */
+
+#include <linux/kvm_host.h>
+#include "irq.h"
+
+#include "ioapic.h"
+
+/* This should be called with the kvm->lock mutex held */
+void kvm_set_irq(struct kvm *kvm, int irq, int level)
+{
+	/* Not possible to detect if the guest uses the PIC or the
+	 * IOAPIC.  So set the bit in both. The guest will ignore
+	 * writes to the unused one.
+	 */
+	kvm_ioapic_set_irq(kvm->arch.vioapic, irq, level);
+#ifdef CONFIG_X86
+	kvm_pic_set_irq(pic_irqchip(kvm), irq, level);
+#endif
+}
+
+void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi)
+{
+	struct kvm_irq_ack_notifier *kian;
+	struct hlist_node *n;
+
+	hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link)
+		if (kian->gsi == gsi)
+			kian->irq_acked(kian);
+}
+
+void kvm_register_irq_ack_notifier(struct kvm *kvm,
+				   struct kvm_irq_ack_notifier *kian)
+{
+	hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list);
+}
+
+void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
+				     struct kvm_irq_ack_notifier *kian)
+{
+	hlist_del(&kian->link);
+}
-- 
cgit v0.10.2


From 2f7497719179a9f3270b05434be989d21f9fdc09 Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Sat, 27 Sep 2008 11:46:36 +0800
Subject: KVM: Move irqchip_in_kernel() from ioapic.h to irq.h

Moving irqchip_in_kernel() from ioapic.h to irq.h.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/ia64/kvm/irq.h b/arch/ia64/kvm/irq.h
index 604329a..c6786e8 100644
--- a/arch/ia64/kvm/irq.h
+++ b/arch/ia64/kvm/irq.h
@@ -23,5 +23,9 @@
 #ifndef __IRQ_H
 #define __IRQ_H
 
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+	return 1;
+}
 
 #endif
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 7ad759e..a6cf719 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -45,6 +45,7 @@
 #include "iodev.h"
 #include "ioapic.h"
 #include "lapic.h"
+#include "irq.h"
 
 static unsigned long kvm_vmm_base;
 static unsigned long kvm_vsa_base;
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index b52732f..cd7ae76 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -79,13 +79,6 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
 	return kvm->arch.vioapic;
 }
 
-#ifdef CONFIG_IA64
-static inline int irqchip_in_kernel(struct kvm *kvm)
-{
-	return 1;
-}
-#endif
-
 struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
 				       unsigned long bitmap);
 void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
-- 
cgit v0.10.2


From 1cbea809c400661eecb538e0dd0bc4f3660f0a35 Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Fri, 3 Oct 2008 14:58:09 +0800
Subject: KVM: ia64: Make pmt table be able to hold physical mmio entries.

Don't try to do put_page once the entries are mmio.
Set the tag to indicate the mmio space for vmm setting
TLB's memory attribute.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index a6cf719..800a4f2 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1437,17 +1437,24 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 		int user_alloc)
 {
 	unsigned long i;
-	struct page *page;
+	unsigned long pfn;
 	int npages = mem->memory_size >> PAGE_SHIFT;
 	struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
 	unsigned long base_gfn = memslot->base_gfn;
 
 	for (i = 0; i < npages; i++) {
-		page = gfn_to_page(kvm, base_gfn + i);
-		kvm_set_pmt_entry(kvm, base_gfn + i,
-				page_to_pfn(page) << PAGE_SHIFT,
-				_PAGE_AR_RWX|_PAGE_MA_WB);
-		memslot->rmap[i] = (unsigned long)page;
+		pfn = gfn_to_pfn(kvm, base_gfn + i);
+		if (!kvm_is_mmio_pfn(pfn)) {
+			kvm_set_pmt_entry(kvm, base_gfn + i,
+					pfn << PAGE_SHIFT,
+					_PAGE_MA_WB);
+			memslot->rmap[i] = (unsigned long)pfn_to_page(pfn);
+		} else {
+			kvm_set_pmt_entry(kvm, base_gfn + i,
+					GPFN_LOW_MMIO | (pfn << PAGE_SHIFT),
+					_PAGE_MA_UC);
+			memslot->rmap[i] = 0;
+			}
 	}
 
 	return 0;
-- 
cgit v0.10.2


From b010eb5103cfbe12ae6f08a4cdb3a748bf78c410 Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Sun, 28 Sep 2008 01:39:46 -0700
Subject: KVM: ia64: add directed mmio range support for kvm guests

Using vt-d, kvm guests can be assigned physcial devices, so
this patch introduce a new mmio type (directed mmio)
to handle its mmio access.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index da579a3..85db124 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -132,7 +132,7 @@
 #define GPFN_IOSAPIC        (4UL << 60) /* IOSAPIC base */
 #define GPFN_LEGACY_IO      (5UL << 60) /* Legacy I/O base */
 #define GPFN_GFW        (6UL << 60) /* Guest Firmware */
-#define GPFN_HIGH_MMIO      (7UL << 60) /* High MMIO range */
+#define GPFN_PHYS_MMIO      (7UL << 60) /* Directed MMIO Range */
 
 #define GPFN_IO_MASK        (7UL << 60) /* Guest pfn is I/O type */
 #define GPFN_INV_MASK       (1UL << 63) /* Guest pfn is invalid */
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 800a4f2..3df82f3 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1447,11 +1447,11 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 		if (!kvm_is_mmio_pfn(pfn)) {
 			kvm_set_pmt_entry(kvm, base_gfn + i,
 					pfn << PAGE_SHIFT,
-					_PAGE_MA_WB);
+				_PAGE_AR_RWX | _PAGE_MA_WB);
 			memslot->rmap[i] = (unsigned long)pfn_to_page(pfn);
 		} else {
 			kvm_set_pmt_entry(kvm, base_gfn + i,
-					GPFN_LOW_MMIO | (pfn << PAGE_SHIFT),
+					GPFN_PHYS_MMIO | (pfn << PAGE_SHIFT),
 					_PAGE_MA_UC);
 			memslot->rmap[i] = 0;
 			}
diff --git a/arch/ia64/kvm/vcpu.h b/arch/ia64/kvm/vcpu.h
index b0fcfb6..341e3fe 100644
--- a/arch/ia64/kvm/vcpu.h
+++ b/arch/ia64/kvm/vcpu.h
@@ -313,21 +313,21 @@ static inline void vcpu_set_tr(struct thash_data *trp, u64 pte, u64 itir,
 	trp->rid = rid;
 }
 
-extern u64 kvm_lookup_mpa(u64 gpfn);
-extern u64 kvm_gpa_to_mpa(u64 gpa);
-
-/* Return I/O type if trye */
-#define __gpfn_is_io(gpfn)			\
-	({						\
-	 u64 pte, ret = 0;			\
-	 pte = kvm_lookup_mpa(gpfn);		\
-	 if (!(pte & GPFN_INV_MASK))		\
-	 ret = pte & GPFN_IO_MASK;	\
-	 ret;					\
-	 })
+extern u64 kvm_get_mpt_entry(u64 gpfn);
 
+/* Return I/ */
+static inline u64 __gpfn_is_io(u64 gpfn)
+{
+	u64  pte;
+	pte = kvm_get_mpt_entry(gpfn);
+	if (!(pte & GPFN_INV_MASK)) {
+		pte = pte & GPFN_IO_MASK;
+		if (pte != GPFN_PHYS_MMIO)
+			return pte;
+	}
+	return 0;
+}
 #endif
-
 #define IA64_NO_FAULT	0
 #define IA64_FAULT	1
 
diff --git a/arch/ia64/kvm/vtlb.c b/arch/ia64/kvm/vtlb.c
index def4576..e22b933 100644
--- a/arch/ia64/kvm/vtlb.c
+++ b/arch/ia64/kvm/vtlb.c
@@ -390,7 +390,7 @@ void thash_purge_entries_remote(struct kvm_vcpu *v, u64 va, u64 ps)
 
 u64 translate_phy_pte(u64 *pte, u64 itir, u64 va)
 {
-	u64 ps, ps_mask, paddr, maddr;
+	u64 ps, ps_mask, paddr, maddr, io_mask;
 	union pte_flags phy_pte;
 
 	ps = itir_ps(itir);
@@ -398,8 +398,9 @@ u64 translate_phy_pte(u64 *pte, u64 itir, u64 va)
 	phy_pte.val = *pte;
 	paddr = *pte;
 	paddr = ((paddr & _PAGE_PPN_MASK) & ps_mask) | (va & ~ps_mask);
-	maddr = kvm_lookup_mpa(paddr >> PAGE_SHIFT);
-	if (maddr & GPFN_IO_MASK) {
+	maddr = kvm_get_mpt_entry(paddr >> PAGE_SHIFT);
+	io_mask = maddr & GPFN_IO_MASK;
+	if (io_mask && (io_mask != GPFN_PHYS_MMIO)) {
 		*pte |= VTLB_PTE_IO;
 		return -1;
 	}
@@ -418,7 +419,7 @@ int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir,
 						u64 ifa, int type)
 {
 	u64 ps;
-	u64 phy_pte;
+	u64 phy_pte, io_mask, index;
 	union ia64_rr vrr, mrr;
 	int ret = 0;
 
@@ -426,13 +427,16 @@ int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir,
 	vrr.val = vcpu_get_rr(v, ifa);
 	mrr.val = ia64_get_rr(ifa);
 
+	index = (pte & _PAGE_PPN_MASK) >> PAGE_SHIFT;
+	io_mask = kvm_get_mpt_entry(index) & GPFN_IO_MASK;
 	phy_pte = translate_phy_pte(&pte, itir, ifa);
 
 	/* Ensure WB attribute if pte is related to a normal mem page,
 	 * which is required by vga acceleration since qemu maps shared
 	 * vram buffer with WB.
 	 */
-	if (!(pte & VTLB_PTE_IO) && ((pte & _PAGE_MA_MASK) != _PAGE_MA_NAT)) {
+	if (!(pte & VTLB_PTE_IO) && ((pte & _PAGE_MA_MASK) != _PAGE_MA_NAT) &&
+			io_mask != GPFN_PHYS_MMIO) {
 		pte &= ~_PAGE_MA_MASK;
 		phy_pte &= ~_PAGE_MA_MASK;
 	}
@@ -566,12 +570,19 @@ void thash_init(struct thash_cb *hcb, u64 sz)
 	}
 }
 
-u64 kvm_lookup_mpa(u64 gpfn)
+u64 kvm_get_mpt_entry(u64 gpfn)
 {
 	u64 *base = (u64 *) KVM_P2M_BASE;
 	return *(base + gpfn);
 }
 
+u64 kvm_lookup_mpa(u64 gpfn)
+{
+	u64 maddr;
+	maddr = kvm_get_mpt_entry(gpfn);
+	return maddr&_PAGE_PPN_MASK;
+}
+
 u64 kvm_gpa_to_mpa(u64 gpa)
 {
 	u64 pte = kvm_lookup_mpa(gpa >> PAGE_SHIFT);
-- 
cgit v0.10.2


From 2381ad241d0bea1253a37f314b270848067640bb Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Wed, 8 Oct 2008 08:29:33 +0800
Subject: KVM: ia64: Add intel iommu support for guests.

With intel iommu hardware, we can assign devices to kvm/ia64 guests.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile
index 3b1a1c1..cf37f8f 100644
--- a/arch/ia64/kvm/Makefile
+++ b/arch/ia64/kvm/Makefile
@@ -46,6 +46,10 @@ EXTRA_AFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 		coalesced_mmio.o irq_comm.o)
 
+ifeq ($(CONFIG_DMAR),y)
+common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
+endif
+
 kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o
 obj-$(CONFIG_KVM) += kvm.o
 
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 3df82f3..c0699f0 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -31,6 +31,7 @@
 #include <linux/bitops.h>
 #include <linux/hrtimer.h>
 #include <linux/uaccess.h>
+#include <linux/intel-iommu.h>
 
 #include <asm/pgtable.h>
 #include <asm/gcc_intrin.h>
@@ -187,6 +188,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_COALESCED_MMIO:
 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 		break;
+	case KVM_CAP_IOMMU:
+		r = intel_iommu_found();
+		break;
 	default:
 		r = 0;
 	}
@@ -773,6 +777,7 @@ static void kvm_init_vm(struct kvm *kvm)
 	 */
 	kvm_build_io_pmt(kvm);
 
+	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 }
 
 struct  kvm *kvm_arch_create_vm(void)
@@ -1336,6 +1341,10 @@ static void kvm_release_vm_pages(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+	kvm_iommu_unmap_guest(kvm);
+#ifdef  KVM_CAP_DEVICE_ASSIGNMENT
+	kvm_free_all_assigned_devices(kvm);
+#endif
 	kfree(kvm->arch.vioapic);
 	kvm_release_vm_pages(kvm);
 	kvm_free_physmem(kvm);
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 9acf34a..797fcd7 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -383,7 +383,7 @@ struct kvm_trace_rec {
 #define KVM_CAP_MP_STATE 14
 #define KVM_CAP_COALESCED_MMIO 15
 #define KVM_CAP_SYNC_MMU 16  /* Changes to host mmap are reflected in guest */
-#ifdef CONFIG_X86
+#if defined(CONFIG_X86)||defined(CONFIG_IA64)
 #define KVM_CAP_DEVICE_ASSIGNMENT 17
 #endif
 #define KVM_CAP_IOMMU 18
-- 
cgit v0.10.2