From 010c520e20413dfd567d568aba2b7238acd37e33 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 11 Oct 2010 14:23:39 +0200
Subject: KVM: Don't reset mmu context unnecessarily when updating EFER

The only bit of EFER that affects the mmu is NX, and this is already
accounted for (LME only takes effect when changing cr0).

Based on a patch by Hillf Danton.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b989e1f..c05d477 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -830,7 +830,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 	kvm_x86_ops->set_efer(vcpu, efer);
 
 	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
-	kvm_mmu_reset_context(vcpu);
 
 	/* Update reserved bits */
 	if ((efer ^ old_efer) & EFER_NX)
-- 
cgit v0.10.2


From af585b921e5d1e919947c4b1164b59507fe7cd7b Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 14 Oct 2010 11:22:46 +0200
Subject: KVM: Halt vcpu if page it tries to access is swapped out

If a guest accesses swapped out memory do not swap it in from vcpu thread
context. Schedule work to do swapping and put vcpu into halted state
instead.

Interrupts will still be delivered to the guest, and if an interrupt
causes a reschedule the guest will continue by running another task.

[avi: remove call to get_user_pages_noio(), nacked by Linus; this
      makes everything synchronous again]

Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f702f82..b5f4c1a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -83,11 +83,14 @@
 #define KVM_NR_FIXED_MTRR_REGION 88
 #define KVM_NR_VAR_MTRR 8
 
+#define ASYNC_PF_PER_VCPU 64
+
 extern spinlock_t kvm_lock;
 extern struct list_head vm_list;
 
 struct kvm_vcpu;
 struct kvm;
+struct kvm_async_pf;
 
 enum kvm_reg {
 	VCPU_REGS_RAX = 0,
@@ -412,6 +415,11 @@ struct kvm_vcpu_arch {
 	u64 hv_vapic;
 
 	cpumask_var_t wbinvd_dirty_mask;
+
+	struct {
+		bool halted;
+		gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
+	} apf;
 };
 
 struct kvm_arch {
@@ -585,6 +593,10 @@ struct kvm_x86_ops {
 	const struct trace_print_flags *exit_reasons_str;
 };
 
+struct kvm_arch_async_pf {
+	gfn_t gfn;
+};
+
 extern struct kvm_x86_ops *kvm_x86_ops;
 
 int kvm_mmu_module_init(void);
@@ -799,4 +811,10 @@ void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
 
 bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
 
+void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+				     struct kvm_async_pf *work);
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+				 struct kvm_async_pf *work);
+extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ddc131f..50f6364 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
 	select HAVE_KVM_IRQCHIP
 	select HAVE_KVM_EVENTFD
 	select KVM_APIC_ARCHITECTURE
+	select KVM_ASYNC_PF
 	select USER_RETURN_NOTIFIER
 	select KVM_MMIO
 	---help---
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 31a7035..c53bf19 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -9,6 +9,7 @@ kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 				coalesced_mmio.o irq_comm.o eventfd.o \
 				assigned-dev.o)
 kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
+kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
 			   i8254.o timer.o
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fbb04ae..4ab04de 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,9 +18,11 @@
  *
  */
 
+#include "irq.h"
 #include "mmu.h"
 #include "x86.h"
 #include "kvm_cache_regs.h"
+#include "x86.h"
 
 #include <linux/kvm_host.h>
 #include <linux/types.h>
@@ -2587,6 +2589,50 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 			     error_code & PFERR_WRITE_MASK, gfn);
 }
 
+int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
+{
+	struct kvm_arch_async_pf arch;
+	arch.gfn = gfn;
+
+	return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
+}
+
+static bool can_do_async_pf(struct kvm_vcpu *vcpu)
+{
+	if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
+		     kvm_event_needs_reinjection(vcpu)))
+		return false;
+
+	return kvm_x86_ops->interrupt_allowed(vcpu);
+}
+
+static bool try_async_pf(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
+			 pfn_t *pfn)
+{
+	bool async;
+
+	*pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async);
+
+	if (!async)
+		return false; /* *pfn has correct page already */
+
+	put_page(pfn_to_page(*pfn));
+
+	if (can_do_async_pf(vcpu)) {
+		trace_kvm_try_async_get_page(async, *pfn);
+		if (kvm_find_async_pf_gfn(vcpu, gfn)) {
+			trace_kvm_async_pf_doublefault(gva, gfn);
+			kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+			return true;
+		} else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
+			return true;
+	}
+
+	*pfn = gfn_to_pfn(vcpu->kvm, gfn);
+
+	return false;
+}
+
 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 				u32 error_code)
 {
@@ -2609,7 +2655,11 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
-	pfn = gfn_to_pfn(vcpu->kvm, gfn);
+
+	if (try_async_pf(vcpu, gfn, gpa, &pfn))
+		return 0;
+
+	/* mmio */
 	if (is_error_pfn(pfn))
 		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
 	spin_lock(&vcpu->kvm->mmu_lock);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index cd7a833..c45376d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -568,7 +568,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
-	pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
+
+	if (try_async_pf(vcpu, walker.gfn, addr, &pfn))
+		return 0;
 
 	/* mmio */
 	if (is_error_pfn(pfn))
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c05d477..3cd4d09 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -43,6 +43,7 @@
 #include <linux/slab.h>
 #include <linux/perf_event.h>
 #include <linux/uaccess.h>
+#include <linux/hash.h>
 #include <trace/events/kvm.h>
 
 #define CREATE_TRACE_POINTS
@@ -155,6 +156,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 
 u64 __read_mostly host_xcr0;
 
+static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
+{
+	int i;
+	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
+		vcpu->arch.apf.gfns[i] = ~0;
+}
+
 static void kvm_on_user_return(struct user_return_notifier *urn)
 {
 	unsigned slot;
@@ -5115,6 +5123,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			vcpu->fpu_active = 0;
 			kvm_x86_ops->fpu_deactivate(vcpu);
 		}
+		if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
+			/* Page is swapped out. Do synthetic halt */
+			vcpu->arch.apf.halted = true;
+			r = 1;
+			goto out;
+		}
 	}
 
 	r = kvm_mmu_reload(vcpu);
@@ -5243,7 +5257,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 
 	r = 1;
 	while (r > 0) {
-		if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
+		if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
+		    !vcpu->arch.apf.halted)
 			r = vcpu_enter_guest(vcpu);
 		else {
 			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
@@ -5256,6 +5271,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 					vcpu->arch.mp_state =
 						KVM_MP_STATE_RUNNABLE;
 				case KVM_MP_STATE_RUNNABLE:
+					vcpu->arch.apf.halted = false;
 					break;
 				case KVM_MP_STATE_SIPI_RECEIVED:
 				default:
@@ -5277,6 +5293,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 			vcpu->run->exit_reason = KVM_EXIT_INTR;
 			++vcpu->stat.request_irq_exits;
 		}
+
+		kvm_check_async_pf_completion(vcpu);
+
 		if (signal_pending(current)) {
 			r = -EINTR;
 			vcpu->run->exit_reason = KVM_EXIT_INTR;
@@ -5792,6 +5811,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
+	kvm_clear_async_pf_completion_queue(vcpu);
+	kvm_async_pf_hash_reset(vcpu);
+	vcpu->arch.apf.halted = false;
+
 	return kvm_x86_ops->vcpu_reset(vcpu);
 }
 
@@ -5880,6 +5903,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
 		goto fail_free_mce_banks;
 
+	kvm_async_pf_hash_reset(vcpu);
+
 	return 0;
 fail_free_mce_banks:
 	kfree(vcpu->arch.mce_banks);
@@ -5938,8 +5963,10 @@ static void kvm_free_vcpus(struct kvm *kvm)
 	/*
 	 * Unpin any mmu pages first.
 	 */
-	kvm_for_each_vcpu(i, vcpu, kvm)
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		kvm_clear_async_pf_completion_queue(vcpu);
 		kvm_unload_vcpu_mmu(vcpu);
+	}
 	kvm_for_each_vcpu(i, vcpu, kvm)
 		kvm_arch_vcpu_free(vcpu);
 
@@ -6050,7 +6077,9 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
-	return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
+	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
+		!vcpu->arch.apf.halted)
+		|| !list_empty_careful(&vcpu->async_pf.done)
 		|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
 		|| vcpu->arch.nmi_pending ||
 		(kvm_arch_interrupt_allowed(vcpu) &&
@@ -6109,6 +6138,83 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 }
 EXPORT_SYMBOL_GPL(kvm_set_rflags);
 
+static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
+{
+	return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
+}
+
+static inline u32 kvm_async_pf_next_probe(u32 key)
+{
+	return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
+}
+
+static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	u32 key = kvm_async_pf_hash_fn(gfn);
+
+	while (vcpu->arch.apf.gfns[key] != ~0)
+		key = kvm_async_pf_next_probe(key);
+
+	vcpu->arch.apf.gfns[key] = gfn;
+}
+
+static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	int i; /* probe count; caps the scan at one pass over the table */
+	u32 key = kvm_async_pf_hash_fn(gfn); /* home slot of gfn */
+
+	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
+		     (vcpu->arch.apf.gfns[key] != gfn &&
+		      vcpu->arch.apf.gfns[key] != ~0); i++) /* stop on hit or hole */
+		key = kvm_async_pf_next_probe(key);
+
+	return key; /* slot of gfn, or where probing stopped if gfn is absent */
+}
+
+bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
+}
+
+static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	u32 i, j, k; /* i: hole to fill, j: probe cursor, k: home slot of entry j */
+
+	i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
+	while (true) {
+		vcpu->arch.apf.gfns[i] = ~0; /* open a hole at i */
+		do {
+			j = kvm_async_pf_next_probe(j);
+			if (vcpu->arch.apf.gfns[j] == ~0)
+				return; /* empty slot reached: nothing left to move */
+			k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
+			/*
+			 * keep probing while k lies cyclically in ]i,j]
+			 * |    i.k.j |
+			 * |....j i.k.| or  |.k..j i...|
+			 */
+		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
+		vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j]; /* move entry j into the hole */
+		i = j; /* the vacated slot becomes the new hole */
+	}
+}
+
+void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+				     struct kvm_async_pf *work)
+{
+	trace_kvm_async_pf_not_present(work->gva);
+
+	kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+	kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
+}
+
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+				 struct kvm_async_pf *work)
+{
+	trace_kvm_async_pf_ready(work->gva);
+	kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
+}
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a055742..e56acc7 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -40,6 +40,7 @@
 #define KVM_REQ_KICK               9
 #define KVM_REQ_DEACTIVATE_FPU    10
 #define KVM_REQ_EVENT             11
+#define KVM_REQ_APF_HALT          12
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID	0
 
@@ -74,6 +75,26 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 			      struct kvm_io_device *dev);
 
+#ifdef CONFIG_KVM_ASYNC_PF
+struct kvm_async_pf {
+	struct work_struct work;
+	struct list_head link;
+	struct list_head queue;
+	struct kvm_vcpu *vcpu;
+	struct mm_struct *mm;
+	gva_t gva;
+	unsigned long addr;
+	struct kvm_arch_async_pf arch;
+	struct page *page;
+	bool done;
+};
+
+void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
+void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
+int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+		       struct kvm_arch_async_pf *arch);
+#endif
+
 struct kvm_vcpu {
 	struct kvm *kvm;
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -104,6 +125,15 @@ struct kvm_vcpu {
 	gpa_t mmio_phys_addr;
 #endif
 
+#ifdef CONFIG_KVM_ASYNC_PF
+	struct {
+		u32 queued;
+		struct list_head queue;
+		struct list_head done;
+		spinlock_t lock;
+	} async_pf;
+#endif
+
 	struct kvm_vcpu_arch arch;
 };
 
@@ -302,6 +332,7 @@ void kvm_set_page_accessed(struct page *page);
 
 pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
+pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async);
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
 pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
 			 struct kvm_memory_slot *slot, gfn_t gfn);
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 6dd3a51..a78a5e5 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -185,6 +185,96 @@ TRACE_EVENT(kvm_age_page,
 		  __entry->referenced ? "YOUNG" : "OLD")
 );
 
+#ifdef CONFIG_KVM_ASYNC_PF
+TRACE_EVENT(
+	kvm_try_async_get_page,
+	TP_PROTO(bool async, u64 pfn),
+	TP_ARGS(async, pfn),
+
+	TP_STRUCT__entry(
+		__field(__u64, pfn)
+		),
+
+	TP_fast_assign(
+		__entry->pfn = (!async) ? pfn : (u64)-1;
+		),
+
+	TP_printk("pfn %#llx", __entry->pfn)
+);
+
+TRACE_EVENT(
+	kvm_async_pf_not_present,
+	TP_PROTO(u64 gva),
+	TP_ARGS(gva),
+
+	TP_STRUCT__entry(
+		__field(__u64, gva)
+		),
+
+	TP_fast_assign(
+		__entry->gva = gva;
+		),
+
+	TP_printk("gva %#llx not present", __entry->gva)
+);
+
+TRACE_EVENT(
+	kvm_async_pf_ready,
+	TP_PROTO(u64 gva),
+	TP_ARGS(gva),
+
+	TP_STRUCT__entry(
+		__field(__u64, gva)
+		),
+
+	TP_fast_assign(
+		__entry->gva = gva;
+		),
+
+	TP_printk("gva %#llx ready", __entry->gva)
+);
+
+TRACE_EVENT(
+	kvm_async_pf_completed,
+	TP_PROTO(unsigned long address, struct page *page, u64 gva),
+	TP_ARGS(address, page, gva),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, address)
+		__field(pfn_t, pfn)
+		__field(u64, gva)
+		),
+
+	TP_fast_assign(
+		__entry->address = address;
+		__entry->pfn = page ? page_to_pfn(page) : 0;
+		__entry->gva = gva;
+		),
+
+	TP_printk("gva %#llx address %#lx pfn %#llx",  __entry->gva,
+		  __entry->address, __entry->pfn)
+);
+
+TRACE_EVENT(
+	kvm_async_pf_doublefault,
+	TP_PROTO(u64 gva, u64 gfn),
+	TP_ARGS(gva, gfn),
+
+	TP_STRUCT__entry(
+		__field(u64, gva)
+		__field(u64, gfn)
+		),
+
+	TP_fast_assign(
+		__entry->gva = gva;
+		__entry->gfn = gfn;
+		),
+
+	TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn)
+);
+
+#endif
+
 #endif /* _TRACE_KVM_MAIN_H */
 
 /* This part must be outside protection */
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 7f1178f..f63ccb0 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -15,3 +15,6 @@ config KVM_APIC_ARCHITECTURE
 
 config KVM_MMIO
        bool
+
+config KVM_ASYNC_PF
+       bool
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
new file mode 100644
index 0000000..857d634
--- /dev/null
+++ b/virt/kvm/async_pf.c
@@ -0,0 +1,190 @@
+/*
+ * kvm asynchronous fault support
+ *
+ * Copyright 2010 Red Hat, Inc.
+ *
+ * Author:
+ *      Gleb Natapov <gleb@redhat.com>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/mmu_context.h>
+
+#include "async_pf.h"
+#include <trace/events/kvm.h>
+
+static struct kmem_cache *async_pf_cache;
+
+int kvm_async_pf_init(void)
+{
+	async_pf_cache = KMEM_CACHE(kvm_async_pf, 0);
+
+	if (!async_pf_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void kvm_async_pf_deinit(void)
+{
+	if (async_pf_cache)
+		kmem_cache_destroy(async_pf_cache);
+	async_pf_cache = NULL;
+}
+
+void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	INIT_LIST_HEAD(&vcpu->async_pf.done);
+	INIT_LIST_HEAD(&vcpu->async_pf.queue);
+	spin_lock_init(&vcpu->async_pf.lock);
+}
+
+static void async_pf_execute(struct work_struct *work)
+{
+	struct page *page = NULL;
+	struct kvm_async_pf *apf =
+		container_of(work, struct kvm_async_pf, work);
+	struct mm_struct *mm = apf->mm; /* local copies: apf is not touched after unlock */
+	struct kvm_vcpu *vcpu = apf->vcpu;
+	unsigned long addr = apf->addr;
+	gva_t gva = apf->gva;
+
+	might_sleep();
+
+	use_mm(mm); /* adopt the mm recorded when the work was queued */
+	down_read(&mm->mmap_sem);
+	get_user_pages(current, mm, addr, 1, 1, 0, &page, NULL); /* result ignored; page presumably stays NULL on failure */
+	up_read(&mm->mmap_sem);
+	unuse_mm(mm);
+
+	spin_lock(&vcpu->async_pf.lock);
+	list_add_tail(&apf->link, &vcpu->async_pf.done); /* publish to the vcpu's done list */
+	apf->page = page;
+	apf->done = true;
+	spin_unlock(&vcpu->async_pf.lock);
+
+	/*
+	 * apf may be freed by kvm_check_async_pf_completion() after this
+	 * point; only the local copies taken above are used from here on.
+	 */
+
+	trace_kvm_async_pf_completed(addr, page, gva);
+
+	if (waitqueue_active(&vcpu->wq))
+		wake_up_interruptible(&vcpu->wq);
+
+	mmdrop(mm); /* pairs with the mm_count increment in kvm_setup_async_pf() */
+	kvm_put_kvm(vcpu->kvm); /* pairs with kvm_get_kvm() in kvm_setup_async_pf() */
+}
+
+void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
+{
+	/* cancel outstanding work; an item that never ran still owns its refs */
+	while (!list_empty(&vcpu->async_pf.queue)) {
+		struct kvm_async_pf *work = list_first_entry(
+			&vcpu->async_pf.queue, typeof(*work), queue);
+		list_del(&work->queue);
+		if (!cancel_work_sync(&work->work))
+			continue; /* it ran: freed via the done list below */
+		mmdrop(work->mm); /* mm_count taken in kvm_setup_async_pf() */
+		kvm_put_kvm(vcpu->kvm); /* kvm ref taken in kvm_setup_async_pf() */
+		kmem_cache_free(async_pf_cache, work);
+	}
+
+	spin_lock(&vcpu->async_pf.lock);
+	while (!list_empty(&vcpu->async_pf.done)) {
+		struct kvm_async_pf *work =
+			list_entry(vcpu->async_pf.done.next,
+				   typeof(*work), link);
+		list_del(&work->link);
+		if (work->page)
+			put_page(work->page); /* release the page pinned by the worker */
+		kmem_cache_free(async_pf_cache, work);
+	}
+	spin_unlock(&vcpu->async_pf.lock);
+
+	vcpu->async_pf.queued = 0;
+}
+
+void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
+{
+	struct kvm_async_pf *work;
+
+	if (list_empty_careful(&vcpu->async_pf.done))
+		return;
+
+	spin_lock(&vcpu->async_pf.lock);
+	work = list_first_entry(&vcpu->async_pf.done, typeof(*work), link);
+	list_del(&work->link);
+	spin_unlock(&vcpu->async_pf.lock);
+
+	kvm_arch_async_page_present(vcpu, work);
+
+	list_del(&work->queue);
+	vcpu->async_pf.queued--;
+	if (work->page)
+		put_page(work->page);
+	kmem_cache_free(async_pf_cache, work);
+}
+
+int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+		       struct kvm_arch_async_pf *arch)
+{
+	struct kvm_async_pf *work;
+
+	if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU)
+		return 0; /* per-vcpu limit reached: nothing is queued */
+
+	/* set up the deferred work item; returns 1 if queued, 0 otherwise */
+
+	/*
+	 * Allocate with GFP_NOWAIT: if we are going to sleep anyway we
+	 * may as well sleep faulting in the page.
+	 */
+	work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT);
+	if (!work)
+		return 0;
+
+	work->page = NULL;
+	work->done = false;
+	work->vcpu = vcpu;
+	work->gva = gva;
+	work->addr = gfn_to_hva(vcpu->kvm, gfn);
+	work->arch = *arch;
+	work->mm = current->mm;
+	atomic_inc(&work->mm->mm_count); /* dropped by mmdrop() in async_pf_execute() or below */
+	kvm_get_kvm(work->vcpu->kvm); /* dropped by kvm_put_kvm() in async_pf_execute() or below */
+
+	/* this can't really happen, otherwise gfn_to_pfn_async()
+	   would have succeeded */
+	if (unlikely(kvm_is_error_hva(work->addr)))
+		goto retry_sync;
+
+	INIT_WORK(&work->work, async_pf_execute);
+	if (!schedule_work(&work->work))
+		goto retry_sync;
+
+	list_add_tail(&work->queue, &vcpu->async_pf.queue);
+	vcpu->async_pf.queued++;
+	kvm_arch_async_page_not_present(vcpu, work);
+	return 1; /* work queued */
+retry_sync: /* undo the references and the allocation taken above */
+	kvm_put_kvm(work->vcpu->kvm);
+	mmdrop(work->mm);
+	kmem_cache_free(async_pf_cache, work);
+	return 0;
+}
diff --git a/virt/kvm/async_pf.h b/virt/kvm/async_pf.h
new file mode 100644
index 0000000..e7ef6447
--- /dev/null
+++ b/virt/kvm/async_pf.h
@@ -0,0 +1,36 @@
+/*
+ * kvm asynchronous fault support
+ *
+ * Copyright 2010 Red Hat, Inc.
+ *
+ * Author:
+ *      Gleb Natapov <gleb@redhat.com>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __KVM_ASYNC_PF_H__
+#define __KVM_ASYNC_PF_H__
+
+#ifdef CONFIG_KVM_ASYNC_PF
+int kvm_async_pf_init(void);
+void kvm_async_pf_deinit(void);
+void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu);
+#else
+#define kvm_async_pf_init() (0)
+#define kvm_async_pf_deinit() do{}while(0)
+#define kvm_async_pf_vcpu_init(C) do{}while(0)
+#endif
+
+#endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5225052..75fd590 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -55,6 +55,7 @@
 #include <asm-generic/bitops/le.h>
 
 #include "coalesced_mmio.h"
+#include "async_pf.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
@@ -186,6 +187,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	vcpu->kvm = kvm;
 	vcpu->vcpu_id = id;
 	init_waitqueue_head(&vcpu->wq);
+	kvm_async_pf_vcpu_init(vcpu);
 
 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	if (!page) {
@@ -946,15 +948,20 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_hva);
 
-static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
+static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
+			bool *async)
 {
 	struct page *page[1];
-	int npages;
+	int npages = 0;
 	pfn_t pfn;
 
-	if (atomic)
+	/* we can do it either atomically or asynchronously, not both */
+	BUG_ON(atomic && async);
+
+	if (atomic || async)
 		npages = __get_user_pages_fast(addr, 1, 1, page);
-	else {
+
+	if (unlikely(npages != 1) && !atomic) {
 		might_sleep();
 		npages = get_user_pages_fast(addr, 1, 1, page);
 	}
@@ -976,6 +983,9 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
 
 		if (vma == NULL || addr < vma->vm_start ||
 		    !(vma->vm_flags & VM_PFNMAP)) {
+			if (async && !(vma->vm_flags & VM_PFNMAP) &&
+			    (vma->vm_flags & VM_WRITE))
+				*async = true;
 			up_read(&current->mm->mmap_sem);
 return_fault_page:
 			get_page(fault_page);
@@ -993,32 +1003,41 @@ return_fault_page:
 
 pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
 {
-	return hva_to_pfn(kvm, addr, true);
+	return hva_to_pfn(kvm, addr, true, NULL);
 }
 EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
 
-static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic)
+static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async)
 {
 	unsigned long addr;
 
+	if (async)
+		*async = false;
+
 	addr = gfn_to_hva(kvm, gfn);
 	if (kvm_is_error_hva(addr)) {
 		get_page(bad_page);
 		return page_to_pfn(bad_page);
 	}
 
-	return hva_to_pfn(kvm, addr, atomic);
+	return hva_to_pfn(kvm, addr, atomic, async);
 }
 
 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
 {
-	return __gfn_to_pfn(kvm, gfn, true);
+	return __gfn_to_pfn(kvm, gfn, true, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
 
+pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async)
+{
+	return __gfn_to_pfn(kvm, gfn, false, async);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
+
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 {
-	return __gfn_to_pfn(kvm, gfn, false);
+	return __gfn_to_pfn(kvm, gfn, false, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn);
 
@@ -1026,7 +1045,7 @@ pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
 			 struct kvm_memory_slot *slot, gfn_t gfn)
 {
 	unsigned long addr = gfn_to_hva_memslot(slot, gfn);
-	return hva_to_pfn(kvm, addr, false);
+	return hva_to_pfn(kvm, addr, false, NULL);
 }
 
 int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
@@ -2336,6 +2355,10 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 		goto out_free_5;
 	}
 
+	r = kvm_async_pf_init();
+	if (r)
+		goto out_free;
+
 	kvm_chardev_ops.owner = module;
 	kvm_vm_fops.owner = module;
 	kvm_vcpu_fops.owner = module;
@@ -2343,7 +2366,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 	r = misc_register(&kvm_dev);
 	if (r) {
 		printk(KERN_ERR "kvm: misc device register failed\n");
-		goto out_free;
+		goto out_unreg;
 	}
 
 	kvm_preempt_ops.sched_in = kvm_sched_in;
@@ -2353,6 +2376,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 
 	return 0;
 
+out_unreg:
+	kvm_async_pf_deinit();
 out_free:
 	kmem_cache_destroy(kvm_vcpu_cache);
 out_free_5:
@@ -2385,6 +2410,7 @@ void kvm_exit(void)
 	kvm_exit_debug();
 	misc_deregister(&kvm_dev);
 	kmem_cache_destroy(kvm_vcpu_cache);
+	kvm_async_pf_deinit();
 	sysdev_unregister(&kvm_sysdev);
 	sysdev_class_unregister(&kvm_sysdev_class);
 	unregister_reboot_notifier(&kvm_reboot_notifier);
-- 
cgit v0.10.2


From 56028d0861e48f7cc9c573d79f2d8a0a933a2bba Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 17 Oct 2010 18:13:42 +0200
Subject: KVM: Retry fault before vmentry

When a page is swapped in, it is mapped into guest memory only after the
guest tries to access it again and generates another fault. To save this
fault we can map it immediately, since we know that the guest is going to
access the page. Do it only when tdp is enabled for now. The shadow paging
case is more complicated: the CR[034] and EFER registers should be switched
before doing the mapping and then switched back.

Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b5f4c1a..c3076bc 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -241,7 +241,7 @@ struct kvm_mmu {
 	void (*new_cr3)(struct kvm_vcpu *vcpu);
 	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
 	unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
-	int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
+	int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err, bool no_apf);
 	void (*inject_page_fault)(struct kvm_vcpu *vcpu);
 	void (*free)(struct kvm_vcpu *vcpu);
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
@@ -815,6 +815,8 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
 				     struct kvm_async_pf *work);
 void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 				 struct kvm_async_pf *work);
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
+			       struct kvm_async_pf *work);
 extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4ab04de..b2c6098 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2570,7 +2570,7 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
 }
 
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
-				u32 error_code)
+				u32 error_code, bool no_apf)
 {
 	gfn_t gfn;
 	int r;
@@ -2606,8 +2606,8 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu)
 	return kvm_x86_ops->interrupt_allowed(vcpu);
 }
 
-static bool try_async_pf(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
-			 pfn_t *pfn)
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn,
+			 gva_t gva, pfn_t *pfn)
 {
 	bool async;
 
@@ -2618,7 +2618,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
 
 	put_page(pfn_to_page(*pfn));
 
-	if (can_do_async_pf(vcpu)) {
+	if (!no_apf && can_do_async_pf(vcpu)) {
 		trace_kvm_try_async_get_page(async, *pfn);
 		if (kvm_find_async_pf_gfn(vcpu, gfn)) {
 			trace_kvm_async_pf_doublefault(gva, gfn);
@@ -2633,8 +2633,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
 	return false;
 }
 
-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
-				u32 error_code)
+static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
+			  bool no_apf)
 {
 	pfn_t pfn;
 	int r;
@@ -2656,7 +2656,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	if (try_async_pf(vcpu, gfn, gpa, &pfn))
+	if (try_async_pf(vcpu, no_apf, gfn, gpa, &pfn))
 		return 0;
 
 	/* mmio */
@@ -3319,7 +3319,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
 	int r;
 	enum emulation_result er;
 
-	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
+	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
 	if (r < 0)
 		goto out;
 
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index c45376d..d6b281e 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -527,8 +527,8 @@ out_gpte_changed:
  *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
  *           a negative value on error.
  */
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
-			       u32 error_code)
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
+			     bool no_apf)
 {
 	int write_fault = error_code & PFERR_WRITE_MASK;
 	int user_fault = error_code & PFERR_USER_MASK;
@@ -569,7 +569,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	if (try_async_pf(vcpu, walker.gfn, addr, &pfn))
+	if (try_async_pf(vcpu, no_apf, walker.gfn, addr, &pfn))
 		return 0;
 
 	/* mmio */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3cd4d09..71beb27 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6138,6 +6138,20 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 }
 EXPORT_SYMBOL_GPL(kvm_set_rflags);
 
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
+{
+	int r;
+
+	if (!vcpu->arch.mmu.direct_map || is_error_page(work->page))
+		return;
+
+	r = kvm_mmu_reload(vcpu);
+	if (unlikely(r))
+		return;
+
+	vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
+}
+
 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
 {
 	return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 857d634..e97eae9 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -132,6 +132,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
 	list_del(&work->link);
 	spin_unlock(&vcpu->async_pf.lock);
 
+	if (work->page)
+		kvm_arch_async_page_ready(vcpu, work);
 	kvm_arch_async_page_present(vcpu, work);
 
 	list_del(&work->queue);
-- 
cgit v0.10.2


From 49c7754ce57063b819b01eb8a4290841ad0886c4 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 18 Oct 2010 15:22:23 +0200
Subject: KVM: Add memory slot versioning and use it to provide fast guest
 write interface

Keep track of memslots changes by keeping a generation number in the
memslots structure. Provide a kvm_write_guest_cached() function that
skips the gfn_to_hva() translation if the memslots have not changed since
the previous invocation.

Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 71beb27..bd25477 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3190,6 +3190,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 		}
 		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
 		slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
+		slots->generation++;
 
 		old_slots = kvm->memslots;
 		rcu_assign_pointer(kvm->memslots, slots);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e56acc7..e674820 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -199,6 +199,7 @@ struct kvm_irq_routing_table {};
 
 struct kvm_memslots {
 	int nmemslots;
+	u64 generation;
 	struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
 					KVM_PRIVATE_MEM_SLOTS];
 };
@@ -352,12 +353,18 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
 			 int offset, int len);
 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
 		    unsigned long len);
+int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+			   void *data, unsigned long len);
+int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+			      gpa_t gpa);
 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn);
 void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
+void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			     gfn_t gfn);
 
 void kvm_vcpu_block(struct kvm_vcpu *vcpu);
 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 7ac0d4e..fa7cc72 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -67,4 +67,11 @@ struct kvm_lapic_irq {
 	u32 dest_id;
 };
 
+struct gfn_to_hva_cache {
+	u64 generation;
+	gpa_t gpa;
+	unsigned long hva;
+	struct kvm_memory_slot *memslot;
+};
+
 #endif /* __KVM_TYPES_H__ */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 75fd590..228f00f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -687,6 +687,7 @@ skip_lpage:
 		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
 		if (mem->slot >= slots->nmemslots)
 			slots->nmemslots = mem->slot + 1;
+		slots->generation++;
 		slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
 
 		old_memslots = kvm->memslots;
@@ -721,6 +722,7 @@ skip_lpage:
 	memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
 	if (mem->slot >= slots->nmemslots)
 		slots->nmemslots = mem->slot + 1;
+	slots->generation++;
 
 	/* actual memory is freed via old in kvm_free_physmem_slot below */
 	if (!npages) {
@@ -851,10 +853,10 @@ int kvm_is_error_hva(unsigned long addr)
 }
 EXPORT_SYMBOL_GPL(kvm_is_error_hva);
 
-struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
+						gfn_t gfn)
 {
 	int i;
-	struct kvm_memslots *slots = kvm_memslots(kvm);
 
 	for (i = 0; i < slots->nmemslots; ++i) {
 		struct kvm_memory_slot *memslot = &slots->memslots[i];
@@ -865,6 +867,11 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 	}
 	return NULL;
 }
+
+struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+{
+	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
+}
 EXPORT_SYMBOL_GPL(gfn_to_memslot);
 
 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
@@ -927,12 +934,9 @@ int memslot_id(struct kvm *kvm, gfn_t gfn)
 	return memslot - slots->memslots;
 }
 
-static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn,
+static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
 				     gfn_t *nr_pages)
 {
-	struct kvm_memory_slot *slot;
-
-	slot = gfn_to_memslot(kvm, gfn);
 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
 		return bad_hva();
 
@@ -944,7 +948,7 @@ static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn,
 
 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 {
-	return gfn_to_hva_many(kvm, gfn, NULL);
+	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_hva);
 
@@ -1054,7 +1058,7 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
 	unsigned long addr;
 	gfn_t entry;
 
-	addr = gfn_to_hva_many(kvm, gfn, &entry);
+	addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
 	if (kvm_is_error_hva(addr))
 		return -1;
 
@@ -1238,6 +1242,47 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
 	return 0;
 }
 
+int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+			      gpa_t gpa)
+{
+	struct kvm_memslots *slots = kvm_memslots(kvm);
+	int offset = offset_in_page(gpa);
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+
+	ghc->gpa = gpa;
+	ghc->generation = slots->generation;
+	ghc->memslot = __gfn_to_memslot(slots, gfn);
+	ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
+	if (!kvm_is_error_hva(ghc->hva))
+		ghc->hva += offset;
+	else
+		return -EFAULT;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
+
+int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+			   void *data, unsigned long len)
+{
+	struct kvm_memslots *slots = kvm_memslots(kvm);
+	int r;
+
+	if (slots->generation != ghc->generation)
+		kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
+
+	if (kvm_is_error_hva(ghc->hva))
+		return -EFAULT;
+
+	r = copy_to_user((void __user *)ghc->hva, data, len);
+	if (r)
+		return -EFAULT;
+	mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
+
 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
 {
 	return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
@@ -1263,11 +1308,9 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
 }
 EXPORT_SYMBOL_GPL(kvm_clear_guest);
 
-void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
+void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			     gfn_t gfn)
 {
-	struct kvm_memory_slot *memslot;
-
-	memslot = gfn_to_memslot(kvm, gfn);
 	if (memslot && memslot->dirty_bitmap) {
 		unsigned long rel_gfn = gfn - memslot->base_gfn;
 
@@ -1275,6 +1318,14 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
 	}
 }
 
+void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
+{
+	struct kvm_memory_slot *memslot;
+
+	memslot = gfn_to_memslot(kvm, gfn);
+	mark_page_dirty_in_slot(kvm, memslot, gfn);
+}
+
 /*
  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
  */
-- 
cgit v0.10.2


From ca3f10172eea9b95bbb66487656f3c3e93855702 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 14 Oct 2010 11:22:49 +0200
Subject: KVM paravirt: Move kvm_smp_prepare_boot_cpu() from kvmclock.c to
 kvm.c.

Async PF also needs to hook into smp_prepare_boot_cpu so move the hook
into generic code.

Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 7b562b6..e3faaaf 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -65,6 +65,7 @@ struct kvm_mmu_op_release_pt {
 #include <asm/processor.h>
 
 extern void kvmclock_init(void);
+extern int kvm_register_clock(char *txt);
 
 
 /* This instruction is vmcall.  On non-VT architectures, it will generate a
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 63b0ec8..e6db179 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -231,10 +231,21 @@ static void __init paravirt_ops_setup(void)
 #endif
 }
 
+#ifdef CONFIG_SMP
+static void __init kvm_smp_prepare_boot_cpu(void)
+{
+	WARN_ON(kvm_register_clock("primary cpu clock"));
+	native_smp_prepare_boot_cpu();
+}
+#endif
+
 void __init kvm_guest_init(void)
 {
 	if (!kvm_para_available())
 		return;
 
 	paravirt_ops_setup();
+#ifdef CONFIG_SMP
+	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+#endif
 }
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index ca43ce3..f98d3ea 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -125,7 +125,7 @@ static struct clocksource kvm_clock = {
 	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
-static int kvm_register_clock(char *txt)
+int kvm_register_clock(char *txt)
 {
 	int cpu = smp_processor_id();
 	int low, high, ret;
@@ -152,14 +152,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
 }
 #endif
 
-#ifdef CONFIG_SMP
-static void __init kvm_smp_prepare_boot_cpu(void)
-{
-	WARN_ON(kvm_register_clock("primary cpu clock"));
-	native_smp_prepare_boot_cpu();
-}
-#endif
-
 /*
  * After the clock is registered, the host will keep writing to the
  * registered memory location. If the guest happens to shutdown, this memory
@@ -206,9 +198,6 @@ void __init kvmclock_init(void)
 	x86_cpuinit.setup_percpu_clockev =
 		kvm_setup_secondary_clock;
 #endif
-#ifdef CONFIG_SMP
-	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
-#endif
 	machine_ops.shutdown  = kvm_shutdown;
 #ifdef CONFIG_KEXEC
 	machine_ops.crash_shutdown  = kvm_crash_shutdown;
-- 
cgit v0.10.2


From 344d9588a9df06182684168be4f1408b55c7da3e Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 14 Oct 2010 11:22:50 +0200
Subject: KVM: Add PV MSR to enable asynchronous page faults delivery.

Guest enables async PF vcpu functionality using this MSR.

Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/Documentation/kvm/cpuid.txt b/Documentation/kvm/cpuid.txt
index 14a12ea..8820685 100644
--- a/Documentation/kvm/cpuid.txt
+++ b/Documentation/kvm/cpuid.txt
@@ -36,6 +36,9 @@ KVM_FEATURE_MMU_OP                 ||     2 || deprecated.
 KVM_FEATURE_CLOCKSOURCE2           ||     3 || kvmclock available at msrs
                                    ||       || 0x4b564d00 and 0x4b564d01
 ------------------------------------------------------------------------------
+KVM_FEATURE_ASYNC_PF               ||     4 || async pf can be enabled by
+                                   ||       || writing to msr 0x4b564d02
+------------------------------------------------------------------------------
 KVM_FEATURE_CLOCKSOURCE_STABLE_BIT ||    24 || host will warn if no guest-side
                                    ||       || per-cpu warps are expected in
                                    ||       || kvmclock.
diff --git a/Documentation/kvm/msr.txt b/Documentation/kvm/msr.txt
index 8ddcfe8..e67b4a8 100644
--- a/Documentation/kvm/msr.txt
+++ b/Documentation/kvm/msr.txt
@@ -3,7 +3,6 @@ Glauber Costa <glommer@redhat.com>, Red Hat Inc, 2010
 =====================================================
 
 KVM makes use of some custom MSRs to service some requests.
-At present, this facility is only used by kvmclock.
 
 Custom MSRs have a range reserved for them, that goes from
 0x4b564d00 to 0x4b564dff. There are MSRs outside this area,
@@ -151,3 +150,37 @@ MSR_KVM_SYSTEM_TIME: 0x12
 			return PRESENT;
 		} else
 			return NON_PRESENT;
+
+MSR_KVM_ASYNC_PF_EN: 0x4b564d02
+	data: Bits 63-6 hold 64-byte aligned physical address of a
+	64 byte memory area which must be in guest RAM and must be
+	zeroed. Bits 5-1 are reserved and should be zero. Bit 0 is 1
+	when asynchronous page faults are enabled on the vcpu and 0
+	when disabled.
+
+	The first 4 bytes of the 64-byte memory location will be written
+	to by the hypervisor at the time of asynchronous page fault (APF)
+	injection to indicate the type of asynchronous page fault. A value
+	of 1 means that the page referred to by the page fault is not
+	present. A value of 2 means that the page is now available. Disabling
+	interrupts inhibits APFs. The guest must not enable interrupts
+	before the reason is read, or it may be overwritten by another
+	APF. Since an APF uses the same exception vector as a regular page
+	fault, the guest must reset the reason to 0 before it does
+	something that can generate a normal page fault. If the APF
+	reason is 0 during a page fault, it is a regular page
+	fault.
+
+	During delivery of a type 1 APF, cr2 contains a token that will
+	be used to notify the guest when the missing page becomes
+	available. When the page becomes available, a type 2 APF is sent
+	with cr2 set to the token associated with the page. There is a
+	special token, 0xffffffff, which tells the vcpu that it should wake
+	up all processes waiting for APFs, and no individual type 2 APFs
+	will be sent.
+
+	If APF is disabled while there are outstanding APFs, they will
+	not be delivered.
+
+	Currently a type 2 APF is always delivered on the same vcpu as
+	the type 1 APF was, but the guest should not rely on that.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c3076bc..0d70398 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -419,6 +419,8 @@ struct kvm_vcpu_arch {
 	struct {
 		bool halted;
 		gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
+		struct gfn_to_hva_cache data;
+		u64 msr_val;
 	} apf;
 };
 
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index e3faaaf..8662ae0 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -20,6 +20,7 @@
  * are available. The use of 0x11 and 0x12 is deprecated
  */
 #define KVM_FEATURE_CLOCKSOURCE2        3
+#define KVM_FEATURE_ASYNC_PF		4
 
 /* The last 8 bits are used to indicate how to interpret the flags field
  * in pvclock structure. If no bits are set, all flags are ignored.
@@ -32,9 +33,12 @@
 /* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
 #define MSR_KVM_WALL_CLOCK_NEW  0x4b564d00
 #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
+#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
 
 #define KVM_MAX_MMU_OP_BATCH           32
 
+#define KVM_ASYNC_PF_ENABLED			(1 << 0)
+
 /* Operations for KVM_HC_MMU_OP */
 #define KVM_MMU_OP_WRITE_PTE            1
 #define KVM_MMU_OP_FLUSH_TLB	        2
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bd25477..063c072 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -783,12 +783,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
  * kvm-specific. Those are put in the beginning of the list.
  */
 
-#define KVM_SAVE_MSRS_BEGIN	7
+#define KVM_SAVE_MSRS_BEGIN	8
 static u32 msrs_to_save[] = {
 	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
-	HV_X64_MSR_APIC_ASSIST_PAGE,
+	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
 	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 	MSR_STAR,
 #ifdef CONFIG_X86_64
@@ -1425,6 +1425,29 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	return 0;
 }
 
+static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
+{
+	gpa_t gpa = data & ~0x3f;
+
+	/* Bits 1:5 are reserved and should be zero */
+	if (data & 0x3e)
+		return 1;
+
+	vcpu->arch.apf.msr_val = data;
+
+	if (!(data & KVM_ASYNC_PF_ENABLED)) {
+		kvm_clear_async_pf_completion_queue(vcpu);
+		kvm_async_pf_hash_reset(vcpu);
+		return 0;
+	}
+
+	if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
+		return 1;
+
+	kvm_async_pf_wakeup_all(vcpu);
+	return 0;
+}
+
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
 	switch (msr) {
@@ -1506,6 +1529,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		}
 		break;
 	}
+	case MSR_KVM_ASYNC_PF_EN:
+		if (kvm_pv_enable_async_pf(vcpu, data))
+			return 1;
+		break;
 	case MSR_IA32_MCG_CTL:
 	case MSR_IA32_MCG_STATUS:
 	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1782,6 +1809,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_KVM_SYSTEM_TIME_NEW:
 		data = vcpu->arch.time;
 		break;
+	case MSR_KVM_ASYNC_PF_EN:
+		data = vcpu->arch.apf.msr_val;
+		break;
 	case MSR_IA32_P5_MC_ADDR:
 	case MSR_IA32_P5_MC_TYPE:
 	case MSR_IA32_MCG_CAP:
@@ -1929,6 +1959,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_DEBUGREGS:
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
 	case KVM_CAP_XSAVE:
+	case KVM_CAP_ASYNC_PF:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -5792,6 +5823,8 @@ free_vcpu:
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
+	vcpu->arch.apf.msr_val = 0;
+
 	vcpu_load(vcpu);
 	kvm_mmu_unload(vcpu);
 	vcpu_put(vcpu);
@@ -5811,6 +5844,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 	vcpu->arch.dr7 = DR7_FIXED_1;
 
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
+	vcpu->arch.apf.msr_val = 0;
 
 	kvm_clear_async_pf_completion_queue(vcpu);
 	kvm_async_pf_hash_reset(vcpu);
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 919ae53..ea2dc1a 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -540,6 +540,7 @@ struct kvm_ppc_pvinfo {
 #endif
 #define KVM_CAP_PPC_GET_PVINFO 57
 #define KVM_CAP_PPC_IRQ_LEVEL 58
+#define KVM_CAP_ASYNC_PF 59
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e674820..ee4314e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -93,6 +93,7 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
 void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
 int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 		       struct kvm_arch_async_pf *arch);
+int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
 struct kvm_vcpu {
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index e97eae9..1f59498 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -190,3 +190,23 @@ retry_sync:
 	kmem_cache_free(async_pf_cache, work);
 	return 0;
 }
+
+int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
+{
+	struct kvm_async_pf *work;
+
+	if (!list_empty(&vcpu->async_pf.done))
+		return 0;
+
+	work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC);
+	if (!work)
+		return -ENOMEM;
+
+	work->page = bad_page;
+	get_page(bad_page);
+	INIT_LIST_HEAD(&work->queue); /* for list_del to work */
+
+	list_add_tail(&work->link, &vcpu->async_pf.done);
+	vcpu->async_pf.queued++;
+	return 0;
+}
-- 
cgit v0.10.2


From fd10cde9294f73eeccbc16f3fec1ae6cde7b800c Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 14 Oct 2010 11:22:51 +0200
Subject: KVM paravirt: Add async PF initialization to PV guest.

Enable async PF in a guest if async PF capability is discovered.

Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 8b61c93..369580d 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1707,6 +1707,9 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	no-kvmclock	[X86,KVM] Disable paravirtualized KVM clock driver
 
+	no-kvmapf	[X86,KVM] Disable paravirtualized asynchronous page
+			fault handling.
+
 	nolapic		[X86-32,APIC] Do not enable or use the local APIC.
 
 	nolapic_timer	[X86-32,APIC] Do not use the local APIC timer.
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 8662ae0..2315398 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -65,6 +65,12 @@ struct kvm_mmu_op_release_pt {
 	__u64 pt_phys;
 };
 
+struct kvm_vcpu_pv_apf_data {
+	__u32 reason;
+	__u8 pad[60];
+	__u32 enabled;
+};
+
 #ifdef __KERNEL__
 #include <asm/processor.h>
 
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e6db179..032d03b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,16 +27,30 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/hardirq.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
 #include <asm/timer.h>
+#include <asm/cpu.h>
 
 #define MMU_QUEUE_SIZE 1024
 
+static int kvmapf = 1;
+
+static int parse_no_kvmapf(char *arg)
+{
+        kvmapf = 0;
+        return 0;
+}
+
+early_param("no-kvmapf", parse_no_kvmapf);
+
 struct kvm_para_state {
 	u8 mmu_queue[MMU_QUEUE_SIZE];
 	int mmu_queue_len;
 };
 
 static DEFINE_PER_CPU(struct kvm_para_state, para_state);
+static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
 
 static struct kvm_para_state *kvm_para_state(void)
 {
@@ -231,12 +245,86 @@ static void __init paravirt_ops_setup(void)
 #endif
 }
 
+void __cpuinit kvm_guest_cpu_init(void)
+{
+	if (!kvm_para_available())
+		return;
+
+	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
+		u64 pa = __pa(&__get_cpu_var(apf_reason));
+
+		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
+		__get_cpu_var(apf_reason).enabled = 1;
+		printk(KERN_INFO"KVM setup async PF for cpu %d\n",
+		       smp_processor_id());
+	}
+}
+
+static void kvm_pv_disable_apf(void *unused)
+{
+	if (!__get_cpu_var(apf_reason).enabled)
+		return;
+
+	wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
+	__get_cpu_var(apf_reason).enabled = 0;
+
+	printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
+	       smp_processor_id());
+}
+
+static int kvm_pv_reboot_notify(struct notifier_block *nb,
+				unsigned long code, void *unused)
+{
+	if (code == SYS_RESTART)
+		on_each_cpu(kvm_pv_disable_apf, NULL, 1);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block kvm_pv_reboot_nb = {
+	.notifier_call = kvm_pv_reboot_notify,
+};
+
 #ifdef CONFIG_SMP
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
 	WARN_ON(kvm_register_clock("primary cpu clock"));
+	kvm_guest_cpu_init();
 	native_smp_prepare_boot_cpu();
 }
+
+static void kvm_guest_cpu_online(void *dummy)
+{
+	kvm_guest_cpu_init();
+}
+
+static void kvm_guest_cpu_offline(void *dummy)
+{
+	kvm_pv_disable_apf(NULL);
+}
+
+static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
+				    unsigned long action, void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_DOWN_FAILED:
+	case CPU_ONLINE_FROZEN:
+		smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
+		break;
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
+        .notifier_call  = kvm_cpu_notify,
+};
 #endif
 
 void __init kvm_guest_init(void)
@@ -245,7 +333,11 @@ void __init kvm_guest_init(void)
 		return;
 
 	paravirt_ops_setup();
+	register_reboot_notifier(&kvm_pv_reboot_nb);
 #ifdef CONFIG_SMP
 	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+	register_cpu_notifier(&kvm_cpu_notifier);
+#else
+	kvm_guest_cpu_init();
 #endif
 }
-- 
cgit v0.10.2


From 631bc4878220932fe67fc46fc7cf7cccdb1ec597 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 14 Oct 2010 11:22:52 +0200
Subject: KVM: Handle async PF in a guest.

When async PF capability is detected, hook up a special page fault handler
that handles async page fault events and passes other page faults on to
the regular page fault handler. Also add async PF handling to nested SVM
emulation. An async PF always generates an exit to L1, where the vcpu thread
will be scheduled out until the page is available.

Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 2315398..fbfd367 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -65,6 +65,9 @@ struct kvm_mmu_op_release_pt {
 	__u64 pt_phys;
 };
 
+#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
+#define KVM_PV_REASON_PAGE_READY 2
+
 struct kvm_vcpu_pv_apf_data {
 	__u32 reason;
 	__u8 pad[60];
@@ -171,8 +174,17 @@ static inline unsigned int kvm_arch_para_features(void)
 
 #ifdef CONFIG_KVM_GUEST
 void __init kvm_guest_init(void);
+void kvm_async_pf_task_wait(u32 token);
+void kvm_async_pf_task_wake(u32 token);
+u32 kvm_read_and_reset_pf_reason(void);
 #else
 #define kvm_guest_init() do { } while (0)
+#define kvm_async_pf_task_wait(T) do {} while(0)
+#define kvm_async_pf_task_wake(T) do {} while(0)
+static inline u32 kvm_read_and_reset_pf_reason(void)
+{
+	return 0;	/* stub: no async PF reason without CONFIG_KVM_GUEST */
+}
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index f66cda5..0310da6 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -30,6 +30,7 @@ asmlinkage void segment_not_present(void);
 asmlinkage void stack_segment(void);
 asmlinkage void general_protection(void);
 asmlinkage void page_fault(void);
+asmlinkage void async_page_fault(void);
 asmlinkage void spurious_interrupt_bug(void);
 asmlinkage void coprocessor_error(void);
 asmlinkage void alignment_check(void);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 591e601..c8b4efa 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1406,6 +1406,16 @@ ENTRY(general_protection)
 	CFI_ENDPROC
 END(general_protection)
 
+#ifdef CONFIG_KVM_GUEST
+ENTRY(async_page_fault)
+	RING0_EC_FRAME
+	pushl $do_async_page_fault
+	CFI_ADJUST_CFA_OFFSET 4
+	jmp error_code
+	CFI_ENDPROC
+END(async_page_fault)
+#endif
+
 /*
  * End of kprobes section
  */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e3ba417..bb3f6e9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1319,6 +1319,9 @@ errorentry xen_stack_segment do_stack_segment
 #endif
 errorentry general_protection do_general_protection
 errorentry page_fault do_page_fault
+#ifdef CONFIG_KVM_GUEST
+errorentry async_page_fault do_async_page_fault
+#endif
 #ifdef CONFIG_X86_MCE
 paranoidzeroentry machine_check *machine_check_vector(%rip)
 #endif
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 032d03b..d564063 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -29,8 +29,14 @@
 #include <linux/hardirq.h>
 #include <linux/notifier.h>
 #include <linux/reboot.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/kprobes.h>
 #include <asm/timer.h>
 #include <asm/cpu.h>
+#include <asm/traps.h>
+#include <asm/desc.h>
 
 #define MMU_QUEUE_SIZE 1024
 
@@ -64,6 +70,168 @@ static void kvm_io_delay(void)
 {
 }
 
+#define KVM_TASK_SLEEP_HASHBITS 8
+#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
+
+struct kvm_task_sleep_node {
+	struct hlist_node link;
+	wait_queue_head_t wq;
+	u32 token;
+	int cpu;
+};
+
+static struct kvm_task_sleep_head {
+	spinlock_t lock;
+	struct hlist_head list;
+} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
+
+static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
+						  u32 token)
+{
+	struct hlist_node *p;
+
+	hlist_for_each(p, &b->list) {
+		struct kvm_task_sleep_node *n =
+			hlist_entry(p, typeof(*n), link);
+		if (n->token == token)
+			return n;
+	}
+
+	return NULL;
+}
+
+void kvm_async_pf_task_wait(u32 token)
+{
+	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+	struct kvm_task_sleep_node n, *e;
+	DEFINE_WAIT(wait);
+
+	spin_lock(&b->lock);
+	e = _find_apf_task(b, token);
+	if (e) {
+		/* dummy entry exist -> wake up was delivered ahead of PF */
+		hlist_del(&e->link);
+		kfree(e);
+		spin_unlock(&b->lock);
+		return;
+	}
+
+	n.token = token;
+	n.cpu = smp_processor_id();
+	init_waitqueue_head(&n.wq);
+	hlist_add_head(&n.link, &b->list);
+	spin_unlock(&b->lock);
+
+	for (;;) {
+		prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+		if (hlist_unhashed(&n.link))
+			break;
+		local_irq_enable();
+		schedule();
+		local_irq_disable();
+	}
+	finish_wait(&n.wq, &wait);
+
+	return;
+}
+EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
+
+static void apf_task_wake_one(struct kvm_task_sleep_node *n)
+{
+	hlist_del_init(&n->link);
+	if (waitqueue_active(&n->wq))
+		wake_up(&n->wq);
+}
+
+static void apf_task_wake_all(void)
+{
+	int i;
+
+	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
+		struct hlist_node *p, *next;
+		struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
+		spin_lock(&b->lock);
+		hlist_for_each_safe(p, next, &b->list) {
+			struct kvm_task_sleep_node *n =
+				hlist_entry(p, typeof(*n), link);
+			if (n->cpu == smp_processor_id())
+				apf_task_wake_one(n);
+		}
+		spin_unlock(&b->lock);
+	}
+}
+
+void kvm_async_pf_task_wake(u32 token)
+{
+	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+	struct kvm_task_sleep_node *n;
+
+	if (token == ~0) {
+		apf_task_wake_all();
+		return;
+	}
+
+again:
+	spin_lock(&b->lock);
+	n = _find_apf_task(b, token);
+	if (!n) {
+		/*
+		 * async PF was not yet handled.
+		 * Add dummy entry for the token.
+		 */
+		n = kmalloc(sizeof(*n), GFP_ATOMIC);
+		if (!n) {
+			/*
+			 * Allocation failed! Busy wait while other cpu
+			 * handles async PF.
+			 */
+			spin_unlock(&b->lock);
+			cpu_relax();
+			goto again;
+		}
+		n->token = token;
+		n->cpu = smp_processor_id();
+		init_waitqueue_head(&n->wq);
+		hlist_add_head(&n->link, &b->list);
+	} else
+		apf_task_wake_one(n);
+	spin_unlock(&b->lock);
+	return;
+}
+EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
+
+u32 kvm_read_and_reset_pf_reason(void)
+{
+	u32 reason = 0;
+
+	if (__get_cpu_var(apf_reason).enabled) {
+		reason = __get_cpu_var(apf_reason).reason;
+		__get_cpu_var(apf_reason).reason = 0;
+	}
+
+	return reason;
+}
+EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
+
+dotraplinkage void __kprobes
+do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+	switch (kvm_read_and_reset_pf_reason()) {
+	default:
+		do_page_fault(regs, error_code);
+		break;
+	case KVM_PV_REASON_PAGE_NOT_PRESENT:
+		/* page is swapped out by the host. */
+		kvm_async_pf_task_wait((u32)read_cr2());
+		break;
+	case KVM_PV_REASON_PAGE_READY:
+		kvm_async_pf_task_wake((u32)read_cr2());
+		break;
+	}
+}
+
 static void kvm_mmu_op(void *buffer, unsigned len)
 {
 	int r;
@@ -300,6 +468,7 @@ static void kvm_guest_cpu_online(void *dummy)
 static void kvm_guest_cpu_offline(void *dummy)
 {
 	kvm_pv_disable_apf(NULL);
+	apf_task_wake_all();
 }
 
 static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
@@ -327,13 +496,25 @@ static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
 };
 #endif
 
+static void __init kvm_apf_trap_init(void)
+{
+	set_intr_gate(14, &async_page_fault);
+}
+
 void __init kvm_guest_init(void)
 {
+	int i;
+
 	if (!kvm_para_available())
 		return;
 
 	paravirt_ops_setup();
 	register_reboot_notifier(&kvm_pv_reboot_nb);
+	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
+		spin_lock_init(&async_pf_sleepers[i].lock);
+	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
+		x86_init.irqs.trap_init = kvm_apf_trap_init;
+
 #ifdef CONFIG_SMP
 	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
 	register_cpu_notifier(&kvm_cpu_notifier);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b81a9b7..93e8120 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -31,6 +31,7 @@
 
 #include <asm/tlbflush.h>
 #include <asm/desc.h>
+#include <asm/kvm_para.h>
 
 #include <asm/virtext.h>
 #include "trace.h"
@@ -133,6 +134,7 @@ struct vcpu_svm {
 
 	unsigned int3_injected;
 	unsigned long int3_rip;
+	u32 apf_reason;
 };
 
 #define MSR_INVALID			0xffffffffU
@@ -1383,16 +1385,33 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
 
 static int pf_interception(struct vcpu_svm *svm)
 {
-	u64 fault_address;
+	u64 fault_address = svm->vmcb->control.exit_info_2;
 	u32 error_code;
+	int r = 1;
 
-	fault_address  = svm->vmcb->control.exit_info_2;
-	error_code = svm->vmcb->control.exit_info_1;
+	switch (svm->apf_reason) {
+	default:
+		error_code = svm->vmcb->control.exit_info_1;
 
-	trace_kvm_page_fault(fault_address, error_code);
-	if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
-		kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
-	return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
+		trace_kvm_page_fault(fault_address, error_code);
+		if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
+			kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
+		r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
+		break;
+	case KVM_PV_REASON_PAGE_NOT_PRESENT:
+		svm->apf_reason = 0;
+		local_irq_disable();
+		kvm_async_pf_task_wait(fault_address);
+		local_irq_enable();
+		break;
+	case KVM_PV_REASON_PAGE_READY:
+		svm->apf_reason = 0;
+		local_irq_disable();
+		kvm_async_pf_task_wake(fault_address);
+		local_irq_enable();
+		break;
+	}
+	return r;
 }
 
 static int db_interception(struct vcpu_svm *svm)
@@ -1836,8 +1855,8 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
 			return NESTED_EXIT_HOST;
 		break;
 	case SVM_EXIT_EXCP_BASE + PF_VECTOR:
-		/* When we're shadowing, trap PFs */
-		if (!npt_enabled)
+		/* When we're shadowing, trap PFs, but not async PF */
+		if (!npt_enabled && svm->apf_reason == 0)
 			return NESTED_EXIT_HOST;
 		break;
 	case SVM_EXIT_EXCP_BASE + NM_VECTOR:
@@ -1893,6 +1912,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
 		u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
 		if (svm->nested.intercept_exceptions & excp_bits)
 			vmexit = NESTED_EXIT_DONE;
+		/* async page fault always cause vmexit */
+		else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
+			 svm->apf_reason != 0)
+			vmexit = NESTED_EXIT_DONE;
 		break;
 	}
 	case SVM_EXIT_ERR: {
@@ -3414,6 +3437,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 
 	svm->next_rip = 0;
 
+	/* if exit due to PF check for async PF */
+	if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
+		svm->apf_reason = kvm_read_and_reset_pf_reason();
+
 	if (npt_enabled) {
 		vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
 		vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
-- 
cgit v0.10.2


From 7c90705bf2a373aa238661bdb6446f27299ef489 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 14 Oct 2010 11:22:53 +0200
Subject: KVM: Inject asynchronous page fault into a PV guest if page is
 swapped out.

Send async page fault to a PV guest if it accesses swapped out memory.
Guest will choose another task to run upon receiving the fault.

Allow async page fault injection only when guest is in user mode since
otherwise guest may be in non-sleepable context and will not be able
to reschedule.

Vcpu will be halted if guest will fault on the same page again or if
vcpu executes kernel code.

Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0d70398..167375c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -421,6 +421,7 @@ struct kvm_vcpu_arch {
 		gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
 		struct gfn_to_hva_cache data;
 		u64 msr_val;
+		u32 id;
 	} apf;
 };
 
@@ -596,6 +597,7 @@ struct kvm_x86_ops {
 };
 
 struct kvm_arch_async_pf {
+	u32 token;
 	gfn_t gfn;
 };
 
@@ -819,6 +821,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 				 struct kvm_async_pf *work);
 void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
 			       struct kvm_async_pf *work);
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
 extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b2c6098..64f90db 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2592,6 +2592,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
 {
 	struct kvm_arch_async_pf arch;
+	arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
 	arch.gfn = gfn;
 
 	return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 063c072..ac4c368 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6248,20 +6248,53 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
 	}
 }
 
+static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
+{
+
+	return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
+				      sizeof(val));
+}
+
 void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
 				     struct kvm_async_pf *work)
 {
-	trace_kvm_async_pf_not_present(work->gva);
-
-	kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+	trace_kvm_async_pf_not_present(work->arch.token, work->gva);
 	kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
+
+	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
+	    kvm_x86_ops->get_cpl(vcpu) == 0)
+		kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+	else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
+		vcpu->arch.fault.error_code = 0;
+		vcpu->arch.fault.address = work->arch.token;
+		kvm_inject_page_fault(vcpu);
+	}
 }
 
 void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 				 struct kvm_async_pf *work)
 {
-	trace_kvm_async_pf_ready(work->gva);
-	kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
+	trace_kvm_async_pf_ready(work->arch.token, work->gva);
+	if (is_error_page(work->page))
+		work->arch.token = ~0; /* broadcast wakeup */
+	else
+		kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
+
+	if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
+	    !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
+		vcpu->arch.fault.error_code = 0;
+		vcpu->arch.fault.address = work->arch.token;
+		kvm_inject_page_fault(vcpu);
+	}
+}
+
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
+{
+	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
+		return true;
+	else
+		return !kvm_event_needs_reinjection(vcpu) &&
+			kvm_x86_ops->interrupt_allowed(vcpu);
 }
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index a78a5e5..9c2cc6a 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -204,34 +204,39 @@ TRACE_EVENT(
 
 TRACE_EVENT(
 	kvm_async_pf_not_present,
-	TP_PROTO(u64 gva),
-	TP_ARGS(gva),
+	TP_PROTO(u64 token, u64 gva),
+	TP_ARGS(token, gva),
 
 	TP_STRUCT__entry(
+		__field(__u64, token)
 		__field(__u64, gva)
 		),
 
 	TP_fast_assign(
+		__entry->token = token;
 		__entry->gva = gva;
 		),
 
-	TP_printk("gva %#llx not present", __entry->gva)
+	TP_printk("token %#llx gva %#llx not present", __entry->token,
+		  __entry->gva)
 );
 
 TRACE_EVENT(
 	kvm_async_pf_ready,
-	TP_PROTO(u64 gva),
-	TP_ARGS(gva),
+	TP_PROTO(u64 token, u64 gva),
+	TP_ARGS(token, gva),
 
 	TP_STRUCT__entry(
+		__field(__u64, token)
 		__field(__u64, gva)
 		),
 
 	TP_fast_assign(
+		__entry->token = token;
 		__entry->gva = gva;
 		),
 
-	TP_printk("gva %#llx ready", __entry->gva)
+	TP_printk("token %#llx gva %#llx ready", __entry->token, __entry->gva)
 );
 
 TRACE_EVENT(
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 1f59498..60df9e0 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -124,7 +124,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
 {
 	struct kvm_async_pf *work;
 
-	if (list_empty_careful(&vcpu->async_pf.done))
+	if (list_empty_careful(&vcpu->async_pf.done) ||
+	    !kvm_arch_can_inject_async_page_present(vcpu))
 		return;
 
 	spin_lock(&vcpu->async_pf.lock);
-- 
cgit v0.10.2


From 6c047cd982f944fa63b2d96de2a06463d113f9fa Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 14 Oct 2010 11:22:54 +0200
Subject: KVM paravirt: Handle async PF in non preemptable context

If async page fault is received by idle task or when preempt_count is
not zero, guest cannot reschedule, so do sti; hlt and wait for page to be
ready. vcpu can still process interrupts while it waits for the page to
be ready.

Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index d564063..47ea93e 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -37,6 +37,7 @@
 #include <asm/cpu.h>
 #include <asm/traps.h>
 #include <asm/desc.h>
+#include <asm/tlbflush.h>
 
 #define MMU_QUEUE_SIZE 1024
 
@@ -78,6 +79,8 @@ struct kvm_task_sleep_node {
 	wait_queue_head_t wq;
 	u32 token;
 	int cpu;
+	bool halted;
+	struct mm_struct *mm;
 };
 
 static struct kvm_task_sleep_head {
@@ -106,6 +109,11 @@ void kvm_async_pf_task_wait(u32 token)
 	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
 	struct kvm_task_sleep_node n, *e;
 	DEFINE_WAIT(wait);
+	int cpu, idle;
+
+	cpu = get_cpu();
+	idle = idle_cpu(cpu);
+	put_cpu();
 
 	spin_lock(&b->lock);
 	e = _find_apf_task(b, token);
@@ -119,19 +127,33 @@ void kvm_async_pf_task_wait(u32 token)
 
 	n.token = token;
 	n.cpu = smp_processor_id();
+	n.mm = current->active_mm;
+	n.halted = idle || preempt_count() > 1;
+	atomic_inc(&n.mm->mm_count);
 	init_waitqueue_head(&n.wq);
 	hlist_add_head(&n.link, &b->list);
 	spin_unlock(&b->lock);
 
 	for (;;) {
-		prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+		if (!n.halted)
+			prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
 		if (hlist_unhashed(&n.link))
 			break;
-		local_irq_enable();
-		schedule();
-		local_irq_disable();
+
+		if (!n.halted) {
+			local_irq_enable();
+			schedule();
+			local_irq_disable();
+		} else {
+			/*
+			 * We cannot reschedule. So halt.
+			 */
+			native_safe_halt();
+			local_irq_disable();
+		}
 	}
-	finish_wait(&n.wq, &wait);
+	if (!n.halted)
+		finish_wait(&n.wq, &wait);
 
 	return;
 }
@@ -140,7 +162,12 @@ EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
 static void apf_task_wake_one(struct kvm_task_sleep_node *n)
 {
 	hlist_del_init(&n->link);
-	if (waitqueue_active(&n->wq))
+	if (!n->mm)
+		return;
+	mmdrop(n->mm);
+	if (n->halted)
+		smp_send_reschedule(n->cpu);
+	else if (waitqueue_active(&n->wq))
 		wake_up(&n->wq);
 }
 
@@ -193,6 +220,7 @@ again:
 		}
 		n->token = token;
 		n->cpu = smp_processor_id();
+		n->mm = NULL;
 		init_waitqueue_head(&n->wq);
 		hlist_add_head(&n->link, &b->list);
 	} else
-- 
cgit v0.10.2


From 6adba527420651b6cacaf392541c09fb108711a2 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 14 Oct 2010 11:22:55 +0200
Subject: KVM: Let host know whether the guest can handle async PF in
 non-userspace context.

If guest can detect that it runs in non-preemptable context it can
handle async PFs at any time, so let host know that it can send async
PF even if guest cpu is not in userspace.

Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/Documentation/kvm/msr.txt b/Documentation/kvm/msr.txt
index e67b4a8..d079aed 100644
--- a/Documentation/kvm/msr.txt
+++ b/Documentation/kvm/msr.txt
@@ -154,9 +154,10 @@ MSR_KVM_SYSTEM_TIME: 0x12
 MSR_KVM_ASYNC_PF_EN: 0x4b564d02
 	data: Bits 63-6 hold 64-byte aligned physical address of a
 	64 byte memory area which must be in guest RAM and must be
-	zeroed. Bits 5-1 are reserved and should be zero. Bit 0 is 1
+	zeroed. Bits 5-2 are reserved and should be zero. Bit 0 is 1
 	when asynchronous page faults are enabled on the vcpu 0 when
-	disabled.
+	disabled. Bit 1 is 1 if asynchronous page faults can be injected
+	when vcpu is in cpl == 0.
 
 	First 4 byte of 64 byte memory location will be written to by
 	the hypervisor at the time of asynchronous page fault (APF)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 167375c..b2ea428 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -422,6 +422,7 @@ struct kvm_vcpu_arch {
 		struct gfn_to_hva_cache data;
 		u64 msr_val;
 		u32 id;
+		bool send_user_only;
 	} apf;
 };
 
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index fbfd367..d3a1a48 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -38,6 +38,7 @@
 #define KVM_MAX_MMU_OP_BATCH           32
 
 #define KVM_ASYNC_PF_ENABLED			(1 << 0)
+#define KVM_ASYNC_PF_SEND_ALWAYS		(1 << 1)
 
 /* Operations for KVM_HC_MMU_OP */
 #define KVM_MMU_OP_WRITE_PTE            1
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 47ea93e..91b3d65 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -449,6 +449,9 @@ void __cpuinit kvm_guest_cpu_init(void)
 	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
 		u64 pa = __pa(&__get_cpu_var(apf_reason));
 
+#ifdef CONFIG_PREEMPT
+		pa |= KVM_ASYNC_PF_SEND_ALWAYS;
+#endif
 		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
 		__get_cpu_var(apf_reason).enabled = 1;
 		printk(KERN_INFO"KVM setup async PF for cpu %d\n",
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ac4c368..fff70b5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1429,8 +1429,8 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
 {
 	gpa_t gpa = data & ~0x3f;
 
-	/* Bits 1:5 are resrved, Should be zero */
-	if (data & 0x3e)
+	/* Bits 2:5 are resrved, Should be zero */
+	if (data & 0x3c)
 		return 1;
 
 	vcpu->arch.apf.msr_val = data;
@@ -1444,6 +1444,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
 	if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
 		return 1;
 
+	vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
 	kvm_async_pf_wakeup_all(vcpu);
 	return 0;
 }
-- 
cgit v0.10.2


From fc5f06fac6fb8ce469ea173370f2cd398f1d9f9a Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 14 Oct 2010 11:22:56 +0200
Subject: KVM: Send async PF when guest is not in userspace too.

If guest indicates that it can handle async pf in kernel mode too, send
it, but only if interrupts are enabled.

Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fff70b5..c0bd2a2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6263,7 +6263,8 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
 	kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
 
 	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
-	    kvm_x86_ops->get_cpl(vcpu) == 0)
+	    (vcpu->arch.apf.send_user_only &&
+	     kvm_x86_ops->get_cpl(vcpu) == 0))
 		kvm_make_request(KVM_REQ_APF_HALT, vcpu);
 	else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
 		vcpu->arch.fault.error_code = 0;
-- 
cgit v0.10.2


From 8030089f9e93ee5cefe74d258e35edc7ce9e4b73 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 19 Oct 2010 18:13:41 +0200
Subject: KVM: improve hva_to_pfn() readability

Improve vma handling code readability in hva_to_pfn() and fix
async pf handling code to properly check vma returned by find_vma().

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 228f00f..475a100 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -952,6 +952,12 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_hva);
 
+static pfn_t get_fault_pfn(void)
+{
+	get_page(fault_page);
+	return fault_pfn;
+}
+
 static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
 			bool *async)
 {
@@ -974,7 +980,7 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
 		struct vm_area_struct *vma;
 
 		if (atomic)
-			goto return_fault_page;
+			return get_fault_pfn();
 
 		down_read(&current->mm->mmap_sem);
 		if (is_hwpoison_address(addr)) {
@@ -983,22 +989,20 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
 			return page_to_pfn(hwpoison_page);
 		}
 
-		vma = find_vma(current->mm, addr);
+		vma = find_vma_intersection(current->mm, addr, addr+1);
 
-		if (vma == NULL || addr < vma->vm_start ||
-		    !(vma->vm_flags & VM_PFNMAP)) {
-			if (async && !(vma->vm_flags & VM_PFNMAP) &&
-			    (vma->vm_flags & VM_WRITE))
+		if (vma == NULL)
+			pfn = get_fault_pfn();
+		else if ((vma->vm_flags & VM_PFNMAP)) {
+			pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+				vma->vm_pgoff;
+			BUG_ON(!kvm_is_mmio_pfn(pfn));
+		} else {
+			if (async && (vma->vm_flags & VM_WRITE))
 				*async = true;
-			up_read(&current->mm->mmap_sem);
-return_fault_page:
-			get_page(fault_page);
-			return page_to_pfn(fault_page);
+			pfn = get_fault_pfn();
 		}
-
-		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 		up_read(&current->mm->mmap_sem);
-		BUG_ON(!kvm_is_mmio_pfn(pfn));
 	} else
 		pfn = page_to_pfn(page[0]);
 
-- 
cgit v0.10.2


From 7e1fbeac6f4b52e12ccaa20b151bd76783069dee Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Wed, 20 Oct 2010 15:18:02 +0200
Subject: KVM: x86: Mark kvm_arch_setup_async_pf static

It has no user outside mmu.c and also no prototype.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 64f90db..0aac41f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2589,7 +2589,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 			     error_code & PFERR_WRITE_MASK, gfn);
 }
 
-int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
+static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
 {
 	struct kvm_arch_async_pf arch;
 	arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
-- 
cgit v0.10.2


From f56f5369561ccafe0d4a2396179f814454a1571c Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Wed, 20 Oct 2010 17:56:17 +0200
Subject: KVM: Move KVM context switch into own function

gcc 4.5 with some special options is able to duplicate the VMX
context switch asm in vmx_vcpu_run(). This results in a compile error
because the inline asm sequence uses a non local label. The non local
label is needed because other code wants to set up the return address.

This patch moves the asm code into its own function and marks
that explicitly noinline to avoid this problem.

Better would be probably to just move it into an .S file.

The diff looks worse than the change really is, it's all just
code movement and no logic change.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 81fcbe9..46c8925 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3890,33 +3890,17 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
 #define Q "l"
 #endif
 
-static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
+/*
+ * We put this into a separate noinline function to prevent the compiler
+ * from duplicating the code. This is needed because this code
+ * uses non local labels that cannot be duplicated.
+ * Do not put any flow control into this function.
+ * Better would be to put this whole monstrosity into a .S file.
+ */
+static void noinline do_vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	/* Record the guest's net vcpu time for enforced NMI injections. */
-	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
-		vmx->entry_time = ktime_get();
-
-	/* Don't enter VMX if guest state is invalid, let the exit handler
-	   start emulation until we arrive back to a valid state */
-	if (vmx->emulation_required && emulate_invalid_guest_state)
-		return;
-
-	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
-		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
-	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
-		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
-
-	/* When single-stepping over STI and MOV SS, we must clear the
-	 * corresponding interruptibility bits in the guest state. Otherwise
-	 * vmentry fails as it then expects bit 14 (BS) in pending debug
-	 * exceptions being set, but that's not correct for the guest debugging
-	 * case. */
-	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-		vmx_set_interrupt_shadow(vcpu, 0);
-
-	asm(
+	asm volatile(
 		/* Store host registers */
 		"push %%"R"dx; push %%"R"bp;"
 		"push %%"R"cx \n\t"
@@ -4011,6 +3995,35 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
 #endif
 	      );
+}
+
+static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	/* Record the guest's net vcpu time for enforced NMI injections. */
+	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
+		vmx->entry_time = ktime_get();
+
+	/* Don't enter VMX if guest state is invalid, let the exit handler
+	   start emulation until we arrive back to a valid state */
+	if (vmx->emulation_required && emulate_invalid_guest_state)
+		return;
+
+	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+
+	/* When single-stepping over STI and MOV SS, we must clear the
+	 * corresponding interruptibility bits in the guest state. Otherwise
+	 * vmentry fails as it then expects bit 14 (BS) in pending debug
+	 * exceptions being set, but that's not correct for the guest debugging
+	 * case. */
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+		vmx_set_interrupt_shadow(vcpu, 0);
+
+	do_vmx_vcpu_run(vcpu);
 
 	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
 				  | (1 << VCPU_EXREG_PDPTR));
-- 
cgit v0.10.2


From d4c90b0043bdb40a6f340d34b2ac1861040de88c Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Wed, 20 Oct 2010 18:34:54 +0200
Subject: KVM: x86: Add missing inline tag to kvm_read_and_reset_pf_reason

May otherwise generate build warnings about unused
kvm_read_and_reset_pf_reason if included without CONFIG_KVM_GUEST
enabled.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index d3a1a48..a427bf7 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -182,7 +182,7 @@ u32 kvm_read_and_reset_pf_reason(void);
 #define kvm_guest_init() do { } while (0)
 #define kvm_async_pf_task_wait(T) do {} while(0)
 #define kvm_async_pf_task_wake(T) do {} while(0)
-static u32 kvm_read_and_reset_pf_reason(void)
+static inline u32 kvm_read_and_reset_pf_reason(void)
 {
 	return 0;
 }
-- 
cgit v0.10.2


From 13c34e073bec75aa4c8d1bb80c10fd7291f3423a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 21 Oct 2010 12:20:31 +0200
Subject: KVM: SVM: Move guest register save out of interrupts disabled section

Saving guest registers is just a memory copy, and does not need to be in the
critical section.  Move outside the critical section to improve latency a
bit.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 93e8120..8035496 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3412,11 +3412,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 #endif
 		);
 
-	vcpu->arch.cr2 = svm->vmcb->save.cr2;
-	vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
-	vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
-	vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
-
 	load_host_msrs(vcpu);
 	kvm_load_ldt(ldt_selector);
 	loadsegment(fs, fs_selector);
@@ -3433,6 +3428,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 
 	stgi();
 
+	vcpu->arch.cr2 = svm->vmcb->save.cr2;
+	vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+	vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+	vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
+
 	sync_cr8_to_lapic(vcpu);
 
 	svm->next_rip = 0;
-- 
cgit v0.10.2


From afe9e66f8233e33e16fcc5b855070e45978f919e Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 21 Oct 2010 12:20:32 +0200
Subject: KVM: SVM: Move svm->host_gs_base into a separate structure

More members will join it soon.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8035496..a217978 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -124,7 +124,9 @@ struct vcpu_svm {
 	u64 next_rip;
 
 	u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
-	u64 host_gs_base;
+	struct {
+		u64 gs_base;
+	} host;
 
 	u32 *msrpm;
 
@@ -1353,14 +1355,14 @@ static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
 static void load_host_msrs(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_X86_64
-	wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
+	wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
 #endif
 }
 
 static void save_host_msrs(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_X86_64
-	rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
+	rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
 #endif
 }
 
-- 
cgit v0.10.2


From dacccfdd6bb386ec0714e1fec250c4b7d0aaccc9 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 21 Oct 2010 12:20:33 +0200
Subject: KVM: SVM: Move fs/gs/ldt save/restore to heavyweight exit path

ldt is never used in the kernel context; same goes for fs (x86_64) and gs
(i386).  So save/restore them in the heavyweight exit path instead
of the lightweight path.

By itself, this doesn't buy us much, but it paves the way for moving vmload
and vmsave to the heavyweight exit path, since they modify the same registers.

[jan: fix copy/paste mistake on i386]

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a217978..8ea4a79 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -125,6 +125,9 @@ struct vcpu_svm {
 
 	u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
 	struct {
+		u16 fs;
+		u16 gs;
+		u16 ldt;
 		u64 gs_base;
 	} host;
 
@@ -184,6 +187,9 @@ static int nested_svm_vmexit(struct vcpu_svm *svm);
 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 				      bool has_error_code, u32 error_code);
 
+static void save_host_msrs(struct kvm_vcpu *vcpu);
+static void load_host_msrs(struct kvm_vcpu *vcpu);
+
 static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 {
 	return container_of(vcpu, struct vcpu_svm, vcpu);
@@ -996,6 +1002,11 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		svm->asid_generation = 0;
 	}
 
+	save_host_msrs(vcpu);
+	savesegment(fs, svm->host.fs);
+	savesegment(gs, svm->host.gs);
+	svm->host.ldt = kvm_read_ldt();
+
 	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
 		rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
 }
@@ -1006,6 +1017,14 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 	int i;
 
 	++vcpu->stat.host_state_reload;
+	kvm_load_ldt(svm->host.ldt);
+#ifdef CONFIG_X86_64
+	loadsegment(fs, svm->host.fs);
+	load_gs_index(svm->host.gs);
+	wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
+#else
+	loadsegment(gs, svm->host.gs);
+#endif
 	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
 		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
 }
@@ -3314,9 +3333,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
 static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
-	u16 fs_selector;
-	u16 gs_selector;
-	u16 ldt_selector;
 
 	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
 	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -3333,10 +3349,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 
 	sync_lapic_to_cr8(vcpu);
 
-	save_host_msrs(vcpu);
-	savesegment(fs, fs_selector);
-	savesegment(gs, gs_selector);
-	ldt_selector = kvm_read_ldt();
 	svm->vmcb->save.cr2 = vcpu->arch.cr2;
 
 	clgi();
@@ -3415,13 +3427,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 		);
 
 	load_host_msrs(vcpu);
-	kvm_load_ldt(ldt_selector);
-	loadsegment(fs, fs_selector);
-#ifdef CONFIG_X86_64
-	load_gs_index(gs_selector);
-	wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
-#else
-	loadsegment(gs, gs_selector);
+#ifndef CONFIG_X86_64
+	loadsegment(fs, svm->host.fs);
 #endif
 
 	reload_tss(vcpu);
-- 
cgit v0.10.2


From 82ca2d108b493f7d916975805668bef2096f1147 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 21 Oct 2010 12:20:34 +0200
Subject: KVM: SVM: Fold save_host_msrs() and load_host_msrs() into their
 callers

This abstraction only serves to obfuscate.  Remove.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8ea4a79..cf47a2f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -187,9 +187,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm);
 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 				      bool has_error_code, u32 error_code);
 
-static void save_host_msrs(struct kvm_vcpu *vcpu);
-static void load_host_msrs(struct kvm_vcpu *vcpu);
-
 static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 {
 	return container_of(vcpu, struct vcpu_svm, vcpu);
@@ -1002,7 +999,9 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		svm->asid_generation = 0;
 	}
 
-	save_host_msrs(vcpu);
+#ifdef CONFIG_X86_64
+	rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
+#endif
 	savesegment(fs, svm->host.fs);
 	savesegment(gs, svm->host.gs);
 	svm->host.ldt = kvm_read_ldt();
@@ -1371,20 +1370,6 @@ static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
 	update_db_intercept(vcpu);
 }
 
-static void load_host_msrs(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
-	wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
-#endif
-}
-
-static void save_host_msrs(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
-	rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
-#endif
-}
-
 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
 {
 	if (sd->next_asid > sd->max_asid) {
@@ -3426,8 +3411,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 #endif
 		);
 
-	load_host_msrs(vcpu);
-#ifndef CONFIG_X86_64
+#ifdef CONFIG_X86_64
+	wrmsrl(MSR_GS_BASE, svm->host.gs_base);
+#else
 	loadsegment(fs, svm->host.fs);
 #endif
 
-- 
cgit v0.10.2


From 83bcacb1a548d4d8df532376c981277761622bce Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 25 Oct 2010 15:23:55 +0200
Subject: KVM: Avoid double interrupt injection with vapic

After an interrupt injection, the PPR changes, and we have to reflect that
into the vapic.  This causes a KVM_REQ_EVENT to be set, which causes the
whole interrupt injection routine to be run again (harmlessly).

Optimize by only setting KVM_REQ_EVENT if the ppr was lowered; otherwise
there is no chance that a new injection is needed.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 413f897..93cf9d0 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -277,7 +277,8 @@ static void apic_update_ppr(struct kvm_lapic *apic)
 
 	if (old_ppr != ppr) {
 		apic_set_reg(apic, APIC_PROCPRI, ppr);
-		kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+		if (ppr < old_ppr)
+			kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
 	}
 }
 
-- 
cgit v0.10.2


From ff1fcb9ebd53ee3f21ae117e6952204e465f46d8 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Fri, 22 Oct 2010 14:18:15 -0200
Subject: KVM: VMX: remove setting of shadow_base_ptes for EPT

The EPT present/writable bits use the same position as normal
pagetable bits.

Since direct_map passes ACC_ALL to mmu_set_spte, thus always setting
the writable bit on sptes, use the generic PT_PRESENT shadow_base_pte.

Also pass present/writable error code information from EPT violation
to generic pagefault handler.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 46c8925..e42727b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3476,7 +3476,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 
 	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 	trace_kvm_page_fault(gpa, exit_qualification);
-	return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
+	return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3);
 }
 
 static u64 ept_rsvd_mask(u64 spte, int level)
@@ -4409,8 +4409,6 @@ static int __init vmx_init(void)
 
 	if (enable_ept) {
 		bypass_guest_pf = 0;
-		kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
-			VMX_EPT_WRITABLE_MASK);
 		kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
 				VMX_EPT_EXECUTABLE_MASK);
 		kvm_enable_tdp();
-- 
cgit v0.10.2


From 982c25658c066fbbeb42e44a6a8fcd59ae008837 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Fri, 22 Oct 2010 14:18:16 -0200
Subject: KVM: MMU: remove kvm_mmu_set_base_ptes

Unused.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0aac41f..11b9102 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -196,7 +196,6 @@ static struct percpu_counter kvm_total_used_mmu_pages;
 
 static u64 __read_mostly shadow_trap_nonpresent_pte;
 static u64 __read_mostly shadow_notrap_nonpresent_pte;
-static u64 __read_mostly shadow_base_present_pte;
 static u64 __read_mostly shadow_nx_mask;
 static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
 static u64 __read_mostly shadow_user_mask;
@@ -215,12 +214,6 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
 
-void kvm_mmu_set_base_ptes(u64 base_pte)
-{
-	shadow_base_present_pte = base_pte;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
-
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 		u64 dirty_mask, u64 nx_mask, u64 x_mask)
 {
@@ -1975,7 +1968,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	 * whether the guest actually used the pte (in order to detect
 	 * demand paging).
 	 */
-	spte = shadow_base_present_pte;
+	spte = PT_PRESENT_MASK;
 	if (!speculative)
 		spte |= shadow_accessed_mask;
 	if (!dirty)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c0bd2a2..a2a7854 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4693,7 +4693,6 @@ int kvm_arch_init(void *opaque)
 
 	kvm_x86_ops = ops;
 	kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
-	kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
 			PT_DIRTY_MASK, PT64_NX_MASK, 0);
 
-- 
cgit v0.10.2


From 7905d9a5ad7a83f1c1c00559839857ab90afbdfc Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Fri, 22 Oct 2010 14:18:17 -0200
Subject: KVM: MMU: flush TLBs on writable -> read-only spte overwrite

This can happen in the following scenario:

vcpu0			vcpu1
read fault
gup(.write=0)
			gup(.write=1)
			reuse swap cache, no COW
			set writable spte
			use writable spte
set read-only spte

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 11b9102..9943394 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2069,6 +2069,16 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 				 spte_to_pfn(*sptep), pfn);
 			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
 			kvm_flush_remote_tlbs(vcpu->kvm);
+		/*
+		 * If we overwrite a writable spte with a read-only one,
+		 * drop it and flush remote TLBs. Otherwise rmap_write_protect
+		 * will find a read-only spte, even though the writable spte
+		 * might be cached on a CPU's TLB.
+		 */
+		} else if (is_writable_pte(*sptep) &&
+			  (!(pte_access & ACC_WRITE_MASK) || !dirty)) {
+			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+			kvm_flush_remote_tlbs(vcpu->kvm);
 		} else
 			was_rmapped = 1;
 	}
-- 
cgit v0.10.2


From 612819c3c6e67bac8fceaa7cc402f13b1b63f7e4 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Fri, 22 Oct 2010 14:18:18 -0200
Subject: KVM: propagate fault r/w information to gup(), allow read-only memory

As suggested by Andrea, pass r/w error code to gup(), upgrading read fault
to writable if host pte allows it.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9943394..53509f5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2216,7 +2216,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 }
 
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
-			int level, gfn_t gfn, pfn_t pfn)
+			int map_writable, int level, gfn_t gfn, pfn_t pfn)
 {
 	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_mmu_page *sp;
@@ -2225,9 +2225,13 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 
 	for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
 		if (iterator.level == level) {
-			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
+			unsigned pte_access = ACC_ALL;
+
+			if (!map_writable)
+				pte_access &= ~ACC_WRITE_MASK;
+			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
 				     0, write, 1, &pt_write,
-				     level, gfn, pfn, false, true);
+				     level, gfn, pfn, false, map_writable);
 			direct_pte_prefetch(vcpu, iterator.sptep);
 			++vcpu->stat.pf_fixed;
 			break;
@@ -2288,6 +2292,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	int level;
 	pfn_t pfn;
 	unsigned long mmu_seq;
+	bool map_writable;
 
 	level = mapping_level(vcpu, gfn);
 
@@ -2302,7 +2307,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
-	pfn = gfn_to_pfn(vcpu->kvm, gfn);
+	pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, &map_writable);
 
 	/* mmio */
 	if (is_error_pfn(pfn))
@@ -2312,7 +2317,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
-	r = __direct_map(vcpu, v, write, level, gfn, pfn);
+	r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 
@@ -2611,11 +2616,11 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu)
 }
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn,
-			 gva_t gva, pfn_t *pfn)
+			 gva_t gva, pfn_t *pfn, bool write, bool *writable)
 {
 	bool async;
 
-	*pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async);
+	*pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
 
 	if (!async)
 		return false; /* *pfn has correct page already */
@@ -2632,7 +2637,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn,
 			return true;
 	}
 
-	*pfn = gfn_to_pfn(vcpu->kvm, gfn);
+	*pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
 
 	return false;
 }
@@ -2645,6 +2650,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	int level;
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	unsigned long mmu_seq;
+	int write = error_code & PFERR_WRITE_MASK;
+	bool map_writable;
 
 	ASSERT(vcpu);
 	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -2660,7 +2667,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	if (try_async_pf(vcpu, no_apf, gfn, gpa, &pfn))
+	if (try_async_pf(vcpu, no_apf, gfn, gpa, &pfn, write, &map_writable))
 		return 0;
 
 	/* mmio */
@@ -2670,7 +2677,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
-	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
+	r = __direct_map(vcpu, gpa, write, map_writable,
 			 level, gfn, pfn);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index d6b281e..ba00eef 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -427,7 +427,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 struct guest_walker *gw,
 			 int user_fault, int write_fault, int hlevel,
-			 int *ptwrite, pfn_t pfn)
+			 int *ptwrite, pfn_t pfn, bool map_writable)
 {
 	unsigned access = gw->pt_access;
 	struct kvm_mmu_page *sp = NULL;
@@ -501,7 +501,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 
 	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
 		     user_fault, write_fault, dirty, ptwrite, it.level,
-		     gw->gfn, pfn, false, true);
+		     gw->gfn, pfn, false, map_writable);
 	FNAME(pte_prefetch)(vcpu, gw, it.sptep);
 
 	return it.sptep;
@@ -539,6 +539,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	pfn_t pfn;
 	int level = PT_PAGE_TABLE_LEVEL;
 	unsigned long mmu_seq;
+	bool map_writable;
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 
@@ -569,13 +570,17 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	if (try_async_pf(vcpu, no_apf, walker.gfn, addr, &pfn))
+	if (try_async_pf(vcpu, no_apf, walker.gfn, addr, &pfn, write_fault,
+			 &map_writable))
 		return 0;
 
 	/* mmio */
 	if (is_error_pfn(pfn))
 		return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
 
+	if (!map_writable)
+		walker.pte_access &= ~ACC_WRITE_MASK;
+
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
@@ -583,7 +588,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
 	kvm_mmu_free_some_pages(vcpu);
 	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-			     level, &write_pt, pfn);
+			     level, &write_pt, pfn, map_writable);
 	(void)sptep;
 	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
 		 sptep, *sptep, write_pt);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ee4314e..462b982 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -334,8 +334,11 @@ void kvm_set_page_accessed(struct page *page);
 
 pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
-pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async);
+pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
+		       bool write_fault, bool *writable);
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
+pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
+		      bool *writable);
 pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
 			 struct kvm_memory_slot *slot, gfn_t gfn);
 int memslot_id(struct kvm *kvm, gfn_t gfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 475a100..2803b4d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -959,7 +959,7 @@ static pfn_t get_fault_pfn(void)
 }
 
 static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
-			bool *async)
+			bool *async, bool write_fault, bool *writable)
 {
 	struct page *page[1];
 	int npages = 0;
@@ -968,12 +968,34 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
 	/* we can do it either atomically or asynchronously, not both */
 	BUG_ON(atomic && async);
 
+	BUG_ON(!write_fault && !writable);
+
+	if (writable)
+		*writable = true;
+
 	if (atomic || async)
 		npages = __get_user_pages_fast(addr, 1, 1, page);
 
 	if (unlikely(npages != 1) && !atomic) {
 		might_sleep();
-		npages = get_user_pages_fast(addr, 1, 1, page);
+
+		if (writable)
+			*writable = write_fault;
+
+		npages = get_user_pages_fast(addr, 1, write_fault, page);
+
+		/* map read fault as writable if possible */
+		if (unlikely(!write_fault) && npages == 1) {
+			struct page *wpage[1];
+
+			npages = __get_user_pages_fast(addr, 1, 1, wpage);
+			if (npages == 1) {
+				*writable = true;
+				put_page(page[0]);
+				page[0] = wpage[0];
+			}
+			npages = 1;
+		}
 	}
 
 	if (unlikely(npages != 1)) {
@@ -1011,11 +1033,12 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
 
 pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
 {
-	return hva_to_pfn(kvm, addr, true, NULL);
+	return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
 }
 EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
 
-static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async)
+static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
+			  bool write_fault, bool *writable)
 {
 	unsigned long addr;
 
@@ -1028,32 +1051,40 @@ static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async)
 		return page_to_pfn(bad_page);
 	}
 
-	return hva_to_pfn(kvm, addr, atomic, async);
+	return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
 }
 
 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
 {
-	return __gfn_to_pfn(kvm, gfn, true, NULL);
+	return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
 
-pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async)
+pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
+		       bool write_fault, bool *writable)
 {
-	return __gfn_to_pfn(kvm, gfn, false, async);
+	return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
 
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 {
-	return __gfn_to_pfn(kvm, gfn, false, NULL);
+	return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn);
 
+pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
+		      bool *writable)
+{
+	return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
+
 pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
 			 struct kvm_memory_slot *slot, gfn_t gfn)
 {
 	unsigned long addr = gfn_to_hva_memslot(slot, gfn);
-	return hva_to_pfn(kvm, addr, false, NULL);
+	return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
 }
 
 int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
-- 
cgit v0.10.2


From 64be5007066173d11a4635eedd57d41a3b3a7027 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 24 Oct 2010 16:49:08 +0200
Subject: KVM: x86: trace "exit to userspace" event

Add tracepoint for userspace exit.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 9c2cc6a..c86f4e8 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -6,6 +6,36 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvm
 
+#define ERSN(x) { KVM_EXIT_##x, "KVM_EXIT_" #x }
+
+#define kvm_trace_exit_reason						\
+	ERSN(UNKNOWN), ERSN(EXCEPTION), ERSN(IO), ERSN(HYPERCALL),	\
+	ERSN(DEBUG), ERSN(HLT), ERSN(MMIO), ERSN(IRQ_WINDOW_OPEN),	\
+	ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR),	\
+	ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\
+	ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI)
+
+TRACE_EVENT(kvm_userspace_exit,
+	    TP_PROTO(__u32 reason, int errno),
+	    TP_ARGS(reason, errno),
+
+	TP_STRUCT__entry(
+		__field(	__u32,		reason		)
+		__field(	int,		errno		)
+	),
+
+	TP_fast_assign(
+		__entry->reason		= reason;
+		__entry->errno		= errno;
+	),
+
+	TP_printk("reason %s (%d)",
+		  __entry->errno < 0 ?
+		  (__entry->errno == -EINTR ? "restart" : "error") :
+		  __print_symbolic(__entry->reason, kvm_trace_exit_reason),
+		  __entry->errno < 0 ? -__entry->errno : __entry->reason)
+);
+
 #if defined(__KVM_HAVE_IOAPIC)
 TRACE_EVENT(kvm_set_irq,
 	TP_PROTO(unsigned int gsi, int level, int irq_source_id),
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2803b4d..880370c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1562,6 +1562,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		if (arg)
 			goto out;
 		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
+		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
 		break;
 	case KVM_GET_REGS: {
 		struct kvm_regs *kvm_regs;
-- 
cgit v0.10.2


From a36a57b1a19bce17b67f5c6f43460baf664ae5fa Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Wed, 27 Oct 2010 18:22:19 +0900
Subject: KVM: introduce wrapper functions for creating/destroying dirty
 bitmaps

This makes it easy to change the way of allocating/freeing dirty bitmaps.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Fernando Luis Vazquez Cao <fernando@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 880370c..0021c28 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -444,6 +444,15 @@ out_err_nodisable:
 	return ERR_PTR(r);
 }
 
+static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
+{
+	if (!memslot->dirty_bitmap)
+		return;
+
+	vfree(memslot->dirty_bitmap);
+	memslot->dirty_bitmap = NULL;
+}
+
 /*
  * Free any memory in @free but not in @dont.
  */
@@ -456,7 +465,7 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
 		vfree(free->rmap);
 
 	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
-		vfree(free->dirty_bitmap);
+		kvm_destroy_dirty_bitmap(free);
 
 
 	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
@@ -467,7 +476,6 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
 	}
 
 	free->npages = 0;
-	free->dirty_bitmap = NULL;
 	free->rmap = NULL;
 }
 
@@ -529,6 +537,18 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
+{
+	unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);
+
+	memslot->dirty_bitmap = vmalloc(dirty_bytes);
+	if (!memslot->dirty_bitmap)
+		return -ENOMEM;
+
+	memset(memslot->dirty_bitmap, 0, dirty_bytes);
+	return 0;
+}
+
 /*
  * Allocate some memory and give it an address in the guest physical address
  * space.
@@ -663,12 +683,8 @@ skip_lpage:
 
 	/* Allocate page dirty bitmap if needed */
 	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
-		unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(&new);
-
-		new.dirty_bitmap = vmalloc(dirty_bytes);
-		if (!new.dirty_bitmap)
+		if (kvm_create_dirty_bitmap(&new) < 0)
 			goto out_free;
-		memset(new.dirty_bitmap, 0, dirty_bytes);
 		/* destroy any largepage mappings for dirty tracking */
 		if (old.npages)
 			flush_shadow = 1;
-- 
cgit v0.10.2


From 515a01279a187415322a80736800a7d6325876ab Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Wed, 27 Oct 2010 18:23:54 +0900
Subject: KVM: pre-allocate one more dirty bitmap to avoid vmalloc()

Currently x86's kvm_vm_ioctl_get_dirty_log() has to vmalloc() the bitmap
that will be used for the next round of logging. This has been hurting
VGA updates and live migration: vmalloc() consumes extra system time,
triggers TLB flushes, etc.

This patch resolves this issue by pre-allocating one more bitmap and switching
between two bitmaps during dirty logging.

Performance improvement:
  I measured performance for the case of VGA update by trace-cmd.
  The result was 1.5 times faster than the original one.

  In the case of live migration, the improvement ratio depends on the workload
  and the guest memory size. In general, the larger the memory size, the
  greater the benefit.

Note:
  This does not change other architectures' logic, but the allocation size
  doubles. This will increase the actual memory consumption only when
  the new size changes the number of pages allocated by vmalloc().

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Fernando Luis Vazquez Cao <fernando@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a2a7854..35f82f2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3208,18 +3208,15 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 		struct kvm_memslots *slots, *old_slots;
 		unsigned long *dirty_bitmap;
 
-		r = -ENOMEM;
-		dirty_bitmap = vmalloc(n);
-		if (!dirty_bitmap)
-			goto out;
+		dirty_bitmap = memslot->dirty_bitmap_head;
+		if (memslot->dirty_bitmap == dirty_bitmap)
+			dirty_bitmap += n / sizeof(long);
 		memset(dirty_bitmap, 0, n);
 
 		r = -ENOMEM;
 		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
-		if (!slots) {
-			vfree(dirty_bitmap);
+		if (!slots)
 			goto out;
-		}
 		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
 		slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
 		slots->generation++;
@@ -3235,11 +3232,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 		spin_unlock(&kvm->mmu_lock);
 
 		r = -EFAULT;
-		if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) {
-			vfree(dirty_bitmap);
+		if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
 			goto out;
-		}
-		vfree(dirty_bitmap);
 	} else {
 		r = -EFAULT;
 		if (clear_user(log->dirty_bitmap, n))
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 462b982..bcf71c7 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -150,6 +150,7 @@ struct kvm_memory_slot {
 	unsigned long flags;
 	unsigned long *rmap;
 	unsigned long *dirty_bitmap;
+	unsigned long *dirty_bitmap_head;
 	struct {
 		unsigned long rmap_pde;
 		int write_count;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0021c28..27649fd 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -449,8 +449,9 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 	if (!memslot->dirty_bitmap)
 		return;
 
-	vfree(memslot->dirty_bitmap);
+	vfree(memslot->dirty_bitmap_head);
 	memslot->dirty_bitmap = NULL;
+	memslot->dirty_bitmap_head = NULL;
 }
 
 /*
@@ -537,15 +538,21 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+/*
+ * Allocation size is twice as large as the actual dirty bitmap size.
+ * This makes it possible to do double buffering: see x86's
+ * kvm_vm_ioctl_get_dirty_log().
+ */
 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
 {
-	unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);
+	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
 
 	memslot->dirty_bitmap = vmalloc(dirty_bytes);
 	if (!memslot->dirty_bitmap)
 		return -ENOMEM;
 
 	memset(memslot->dirty_bitmap, 0, dirty_bytes);
+	memslot->dirty_bitmap_head = memslot->dirty_bitmap;
 	return 0;
 }
 
-- 
cgit v0.10.2


From 6f9e5c1702319e048a90e06e31b957fbbcecbe07 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Mon, 1 Nov 2010 14:36:09 +0900
Subject: KVM: use kmalloc() for small dirty bitmaps

Currently we are using vmalloc() for all dirty bitmaps even if
they are small enough, say less than K bytes.

Use kmalloc() when the dirty bitmap allocation (twice the bitmap size,
because of double buffering) is less than or equal to PAGE_SIZE, so that
we can avoid vmalloc area usage for VGA.

This will also make the logging start/stop faster.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 27649fd..9ce1079 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -449,7 +449,11 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 	if (!memslot->dirty_bitmap)
 		return;
 
-	vfree(memslot->dirty_bitmap_head);
+	if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
+		vfree(memslot->dirty_bitmap_head);
+	else
+		kfree(memslot->dirty_bitmap_head);
+
 	memslot->dirty_bitmap = NULL;
 	memslot->dirty_bitmap_head = NULL;
 }
@@ -547,11 +551,14 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
 {
 	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
 
-	memslot->dirty_bitmap = vmalloc(dirty_bytes);
+	if (dirty_bytes > PAGE_SIZE)
+		memslot->dirty_bitmap = vzalloc(dirty_bytes);
+	else
+		memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL);
+
 	if (!memslot->dirty_bitmap)
 		return -ENOMEM;
 
-	memset(memslot->dirty_bitmap, 0, dirty_bytes);
 	memslot->dirty_bitmap_head = memslot->dirty_bitmap;
 	return 0;
 }
-- 
cgit v0.10.2


From 3bcc8a8c6c13f601dddd948d33d89d5ac5213e3c Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 27 Oct 2010 17:21:21 +0200
Subject: KVM: add cast within kvm_clear_guest_page to fix warning

Fixes this:

  CC      arch/s390/kvm/../../../virt/kvm/kvm_main.o
arch/s390/kvm/../../../virt/kvm/kvm_main.c: In function 'kvm_clear_guest_page':
arch/s390/kvm/../../../virt/kvm/kvm_main.c:1224:2: warning: passing argument 3 of 'kvm_write_guest_page' makes pointer from integer without a cast
arch/s390/kvm/../../../virt/kvm/kvm_main.c:1185:5: note: expected 'const void *' but argument is of type 'long unsigned int'

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9ce1079..3c99c2f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1350,7 +1350,8 @@ EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
 
 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
 {
-	return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
+	return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
+				    offset, len);
 }
 EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
 
-- 
cgit v0.10.2


From aac8763697c6b7aa133abe8092a25154960e9a0c Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 27 Oct 2010 17:22:10 +0200
Subject: KVM: get rid of warning within kvm_dev_ioctl_create_vm

Fixes this:

  CC      arch/s390/kvm/../../../virt/kvm/kvm_main.o
arch/s390/kvm/../../../virt/kvm/kvm_main.c: In function 'kvm_dev_ioctl_create_vm':
arch/s390/kvm/../../../virt/kvm/kvm_main.c:1828:10: warning: unused variable 'r'

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 3c99c2f..f2c2d84 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1961,7 +1961,7 @@ static struct file_operations kvm_vm_fops = {
 
 static int kvm_dev_ioctl_create_vm(void)
 {
-	int fd, r;
+	int r;
 	struct kvm *kvm;
 
 	kvm = kvm_create_vm();
@@ -1974,11 +1974,11 @@ static int kvm_dev_ioctl_create_vm(void)
 		return r;
 	}
 #endif
-	fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
-	if (fd < 0)
+	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
+	if (r < 0)
 		kvm_put_kvm(kvm);
 
-	return fd;
+	return r;
 }
 
 static long kvm_dev_ioctl_check_extension_generic(long arg)
-- 
cgit v0.10.2


From 2eec73437487aa690882cafddca6e4d93df46f26 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Mon, 1 Nov 2010 14:01:29 +0100
Subject: KVM: x86: Avoid issuing wbinvd twice

Micro optimization to avoid calling wbinvd twice on the CPU that has to
emulate it. As we might be preempted between smp_call_function_many and
the local wbinvd, the cache might be filled again so that real work
could be done uselessly.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 35f82f2..c10135b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4007,13 +4007,15 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
 		return X86EMUL_CONTINUE;
 
 	if (kvm_x86_ops->has_wbinvd_exit()) {
-		preempt_disable();
+		int cpu = get_cpu();
+
+		cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
 		smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
 				wbinvd_ipi, NULL, 1);
-		preempt_enable();
+		put_cpu();
 		cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
-	}
-	wbinvd();
+	} else
+		wbinvd();
 	return X86EMUL_CONTINUE;
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
-- 
cgit v0.10.2


From ec25d5e66ee152e371fd7046f3f8441859579aea Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 1 Nov 2010 15:35:01 +0200
Subject: KVM: handle exit due to INVD in VMX

Currently the exit is unhandled, so the guest halts with an error if it
tries to execute the INVD instruction. Call into the emulator when a guest
executes INVD instead. This instruction is not needed by ordinary
guests, but firmware (like OpenBIOS) uses it and fails.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9f0cbd9..42d95905 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -239,6 +239,7 @@ enum vmcs_field {
 #define EXIT_REASON_TASK_SWITCH         9
 #define EXIT_REASON_CPUID               10
 #define EXIT_REASON_HLT                 12
+#define EXIT_REASON_INVD                13
 #define EXIT_REASON_INVLPG              14
 #define EXIT_REASON_RDPMC               15
 #define EXIT_REASON_RDTSC               16
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e42727b..12c3073 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3349,6 +3349,11 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_invd(struct kvm_vcpu *vcpu)
+{
+	return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+}
+
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3649,6 +3654,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
 	[EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
 	[EXIT_REASON_HLT]                     = handle_halt,
+	[EXIT_REASON_INVD]		      = handle_invd,
 	[EXIT_REASON_INVLPG]		      = handle_invlpg,
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_VMCLEAR]	              = handle_vmx_insn,
-- 
cgit v0.10.2


From 2653503769ef9ae771509e1358a3da8de8d21709 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Tue, 2 Nov 2010 10:49:34 +0900
Subject: KVM: replace vmalloc and memset with vzalloc

Let's use newly introduced vzalloc().

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index e316847..badc983 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1307,12 +1307,10 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	int err = -ENOMEM;
 	unsigned long p;
 
-	vcpu_book3s = vmalloc(sizeof(struct kvmppc_vcpu_book3s));
+	vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s));
 	if (!vcpu_book3s)
 		goto out;
 
-	memset(vcpu_book3s, 0, sizeof(struct kvmppc_vcpu_book3s));
-
 	vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *)
 		kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL);
 	if (!vcpu_book3s->shadow_vcpu)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f2c2d84..13cefe2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -640,13 +640,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	/* Allocate if a slot is being created */
 #ifndef CONFIG_S390
 	if (npages && !new.rmap) {
-		new.rmap = vmalloc(npages * sizeof(*new.rmap));
+		new.rmap = vzalloc(npages * sizeof(*new.rmap));
 
 		if (!new.rmap)
 			goto out_free;
 
-		memset(new.rmap, 0, npages * sizeof(*new.rmap));
-
 		new.user_alloc = user_alloc;
 		new.userspace_addr = mem->userspace_addr;
 	}
@@ -669,14 +667,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
 			     >> KVM_HPAGE_GFN_SHIFT(level));
 		lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
 
-		new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i]));
+		new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));
 
 		if (!new.lpage_info[i])
 			goto out_free;
 
-		memset(new.lpage_info[i], 0,
-		       lpages * sizeof(*new.lpage_info[i]));
-
 		if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
 			new.lpage_info[i][0].write_count = 1;
 		if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
-- 
cgit v0.10.2


From c9b263d2be9c535b410f6617710534f798bf0ff0 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 1 Nov 2010 16:58:43 +0800
Subject: KVM: fix tracing kvm_try_async_get_page

Tracing 'async' and '*pfn' is useless, since 'async' is always true
and '*pfn' is always 'fault_pfn'.

We can trace 'gva' and 'gfn' instead, which helps us to see the
life cycle of an async_pf.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 53509f5..272e302 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2628,7 +2628,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn,
 	put_page(pfn_to_page(*pfn));
 
 	if (!no_apf && can_do_async_pf(vcpu)) {
-		trace_kvm_try_async_get_page(async, *pfn);
+		trace_kvm_try_async_get_page(gva, gfn);
 		if (kvm_find_async_pf_gfn(vcpu, gfn)) {
 			trace_kvm_async_pf_doublefault(gva, gfn);
 			kvm_make_request(KVM_REQ_APF_HALT, vcpu);
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index c86f4e8..d94d6c3 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -218,18 +218,20 @@ TRACE_EVENT(kvm_age_page,
 #ifdef CONFIG_KVM_ASYNC_PF
 TRACE_EVENT(
 	kvm_try_async_get_page,
-	TP_PROTO(bool async, u64 pfn),
-	TP_ARGS(async, pfn),
+	TP_PROTO(u64 gva, u64 gfn),
+	TP_ARGS(gva, gfn),
 
 	TP_STRUCT__entry(
-		__field(__u64, pfn)
+		__field(u64, gva)
+		__field(u64, gfn)
 		),
 
 	TP_fast_assign(
-		__entry->pfn = (!async) ? pfn : (u64)-1;
+		__entry->gva = gva;
+		__entry->gfn = gfn;
 		),
 
-	TP_printk("pfn %#llx", __entry->pfn)
+	TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn)
 );
 
 TRACE_EVENT(
-- 
cgit v0.10.2


From 0730388b97d20cc568c25b42b9a23b28959b481f Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 1 Nov 2010 16:59:39 +0800
Subject: KVM: cleanup async_pf tracepoints

Use 'DECLARE_EVENT_CLASS' to cleanup async_pf tracepoints

Acked-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index d94d6c3..46e3cd8 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -216,59 +216,71 @@ TRACE_EVENT(kvm_age_page,
 );
 
 #ifdef CONFIG_KVM_ASYNC_PF
-TRACE_EVENT(
-	kvm_try_async_get_page,
+DECLARE_EVENT_CLASS(kvm_async_get_page_class,
+
 	TP_PROTO(u64 gva, u64 gfn),
+
 	TP_ARGS(gva, gfn),
 
 	TP_STRUCT__entry(
-		__field(u64, gva)
+		__field(__u64, gva)
 		__field(u64, gfn)
-		),
+	),
 
 	TP_fast_assign(
 		__entry->gva = gva;
 		__entry->gfn = gfn;
-		),
+	),
 
 	TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn)
 );
 
-TRACE_EVENT(
-	kvm_async_pf_not_present,
+DEFINE_EVENT(kvm_async_get_page_class, kvm_try_async_get_page,
+
+	TP_PROTO(u64 gva, u64 gfn),
+
+	TP_ARGS(gva, gfn)
+);
+
+DEFINE_EVENT(kvm_async_get_page_class, kvm_async_pf_doublefault,
+
+	TP_PROTO(u64 gva, u64 gfn),
+
+	TP_ARGS(gva, gfn)
+);
+
+DECLARE_EVENT_CLASS(kvm_async_pf_nopresent_ready,
+
 	TP_PROTO(u64 token, u64 gva),
+
 	TP_ARGS(token, gva),
 
 	TP_STRUCT__entry(
 		__field(__u64, token)
 		__field(__u64, gva)
-		),
+	),
 
 	TP_fast_assign(
 		__entry->token = token;
 		__entry->gva = gva;
-		),
+	),
+
+	TP_printk("token %#llx gva %#llx", __entry->token, __entry->gva)
 
-	TP_printk("token %#llx gva %#llx not present", __entry->token,
-		  __entry->gva)
 );
 
-TRACE_EVENT(
-	kvm_async_pf_ready,
+DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_not_present,
+
 	TP_PROTO(u64 token, u64 gva),
-	TP_ARGS(token, gva),
 
-	TP_STRUCT__entry(
-		__field(__u64, token)
-		__field(__u64, gva)
-		),
+	TP_ARGS(token, gva)
+);
 
-	TP_fast_assign(
-		__entry->token = token;
-		__entry->gva = gva;
-		),
+DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_ready,
+
+	TP_PROTO(u64 token, u64 gva),
 
-	TP_printk("token %#llx gva %#llx ready", __entry->token, __entry->gva)
+	TP_ARGS(token, gva)
 );
 
 TRACE_EVENT(
@@ -292,24 +304,6 @@ TRACE_EVENT(
 		  __entry->address, __entry->pfn)
 );
 
-TRACE_EVENT(
-	kvm_async_pf_doublefault,
-	TP_PROTO(u64 gva, u64 gfn),
-	TP_ARGS(gva, gfn),
-
-	TP_STRUCT__entry(
-		__field(u64, gva)
-		__field(u64, gfn)
-		),
-
-	TP_fast_assign(
-		__entry->gva = gva;
-		__entry->gfn = gfn;
-		),
-
-	TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn)
-);
-
 #endif
 
 #endif /* _TRACE_KVM_MAIN_H */
-- 
cgit v0.10.2


From c7d28c24042f7969adc29147d6c0a190192a6808 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 1 Nov 2010 17:00:30 +0800
Subject: KVM: fix searching async gfn in kvm_async_pf_gfn_slot

Don't search later slots if the slot is empty

Acked-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c10135b..ab10a6c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6209,8 +6209,8 @@ static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
 	u32 key = kvm_async_pf_hash_fn(gfn);
 
 	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
-		     (vcpu->arch.apf.gfns[key] != gfn ||
-		      vcpu->arch.apf.gfns[key] == ~0); i++)
+		     (vcpu->arch.apf.gfns[key] != gfn &&
+		      vcpu->arch.apf.gfns[key] != ~0); i++)
 		key = kvm_async_pf_next_probe(key);
 
 	return key;
-- 
cgit v0.10.2


From e6d53e3b0db7ae3641f01a2b2af1470fda86d10c Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 1 Nov 2010 17:01:28 +0800
Subject: KVM: avoid unnecessary wait for a async pf

In current code, it checks async pf completion out of the wait context,
like this:

if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
		    !vcpu->arch.apf.halted)
			r = vcpu_enter_guest(vcpu);
		else {
			......
			kvm_vcpu_block(vcpu)
			 ^- waiting until 'async_pf.done' is not empty
}

kvm_check_async_pf_completion(vcpu)
 ^- delete list from async_pf.done

So, if we check async pf completion first, the vcpu can be blocked at
kvm_vcpu_block.

Fix this by marking the vcpu as unhalted in the
kvm_check_async_pf_completion() path.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Acked-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ab10a6c..5daead8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6283,6 +6283,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 		vcpu->arch.fault.address = work->arch.token;
 		kvm_inject_page_fault(vcpu);
 	}
+	vcpu->arch.apf.halted = false;
 }
 
 bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
-- 
cgit v0.10.2


From 15096ffceabb9693306982127348890886384aaa Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 2 Nov 2010 17:35:35 +0800
Subject: KVM: handle more completed apfs if possible

If there is no need to inject an async #PF into the PV guest, we can handle
more completed apfs at one time, so we can retry the guest #PF
as early as possible.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Acked-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 60df9e0..100c66e 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -124,24 +124,24 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
 {
 	struct kvm_async_pf *work;
 
-	if (list_empty_careful(&vcpu->async_pf.done) ||
-	    !kvm_arch_can_inject_async_page_present(vcpu))
-		return;
-
-	spin_lock(&vcpu->async_pf.lock);
-	work = list_first_entry(&vcpu->async_pf.done, typeof(*work), link);
-	list_del(&work->link);
-	spin_unlock(&vcpu->async_pf.lock);
+	while (!list_empty_careful(&vcpu->async_pf.done) &&
+	      kvm_arch_can_inject_async_page_present(vcpu)) {
+		spin_lock(&vcpu->async_pf.lock);
+		work = list_first_entry(&vcpu->async_pf.done, typeof(*work),
+					      link);
+		list_del(&work->link);
+		spin_unlock(&vcpu->async_pf.lock);
 
-	if (work->page)
-		kvm_arch_async_page_ready(vcpu, work);
-	kvm_arch_async_page_present(vcpu, work);
+		if (work->page)
+			kvm_arch_async_page_ready(vcpu, work);
+		kvm_arch_async_page_present(vcpu, work);
 
-	list_del(&work->queue);
-	vcpu->async_pf.queued--;
-	if (work->page)
-		put_page(work->page);
-	kmem_cache_free(async_pf_cache, work);
+		list_del(&work->queue);
+		vcpu->async_pf.queued--;
+		if (work->page)
+			put_page(work->page);
+		kmem_cache_free(async_pf_cache, work);
+	}
 }
 
 int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
-- 
cgit v0.10.2


From 64f638c7c44fa87e65f51eaf0f8302b9cba2d696 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 1 Nov 2010 17:03:44 +0800
Subject: KVM: fix the race while wakeup all pv guest

In kvm_async_pf_wakeup_all(), we add a dummy apf to vcpu->async_pf.done
without holding vcpu->async_pf.lock; this will break if we are handling apfs
at this time.

Also use 'list_empty_careful()' instead of 'list_empty()'

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Acked-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 100c66e..74268b4 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -196,7 +196,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
 {
 	struct kvm_async_pf *work;
 
-	if (!list_empty(&vcpu->async_pf.done))
+	if (!list_empty_careful(&vcpu->async_pf.done))
 		return 0;
 
 	work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC);
@@ -207,7 +207,10 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
 	get_page(bad_page);
 	INIT_LIST_HEAD(&work->queue); /* for list_del to work */
 
+	spin_lock(&vcpu->async_pf.lock);
 	list_add_tail(&work->link, &vcpu->async_pf.done);
+	spin_unlock(&vcpu->async_pf.lock);
+
 	vcpu->async_pf.queued++;
 	return 0;
 }
-- 
cgit v0.10.2


From 30bd0c4c6c5aecc338ebf32e3a6e01c98f0a0b43 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 1 Nov 2010 23:20:48 +0200
Subject: KVM: VMX: Disallow NMI while blocked by STI

While not mandated by the spec, Linux relies on NMI being blocked by an
IF-enabling STI.  VMX also refuses to enter a guest in this state, at
least on some implementations.

Disallow NMI while blocked by STI by checking for the condition, and
requesting an interrupt window exit if it occurs.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 12c3073..8087c4d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2787,6 +2787,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 		return;
 	}
 
+	if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+		enable_irq_window(vcpu);
+		return;
+	}
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@@ -2849,7 +2853,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 		return 0;
 
 	return	!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-			(GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI));
+		  (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
+		   | GUEST_INTR_STATE_NMI));
 }
 
 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
-- 
cgit v0.10.2


From 57e7fbee1dbd72949425b19d28415d2ddffe04ca Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 9 Nov 2010 12:42:12 +0100
Subject: KVM: Refactor srcu struct release on early errors

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 13cefe2..fce0578 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -401,23 +401,19 @@ static struct kvm *kvm_create_vm(void)
 	r = -ENOMEM;
 	kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
 	if (!kvm->memslots)
-		goto out_err;
+		goto out_err_nosrcu;
 	if (init_srcu_struct(&kvm->srcu))
-		goto out_err;
+		goto out_err_nosrcu;
 	for (i = 0; i < KVM_NR_BUSES; i++) {
 		kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
 					GFP_KERNEL);
-		if (!kvm->buses[i]) {
-			cleanup_srcu_struct(&kvm->srcu);
+		if (!kvm->buses[i])
 			goto out_err;
-		}
 	}
 
 	r = kvm_init_mmu_notifier(kvm);
-	if (r) {
-		cleanup_srcu_struct(&kvm->srcu);
+	if (r)
 		goto out_err;
-	}
 
 	kvm->mm = current->mm;
 	atomic_inc(&kvm->mm->mm_count);
@@ -435,6 +431,8 @@ out:
 	return kvm;
 
 out_err:
+	cleanup_srcu_struct(&kvm->srcu);
+out_err_nosrcu:
 	hardware_disable_all();
 out_err_nodisable:
 	for (i = 0; i < KVM_NR_BUSES; i++)
-- 
cgit v0.10.2


From 2a126faafb840e9a1e46514127cdb88ed998bd64 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 4 Nov 2010 18:29:42 +0800
Subject: KVM: remove unused function declaration

Remove the declaration of kvm_mmu_set_base_ptes()

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b2ea428..116dac5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -611,7 +611,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
 int kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
-void kvm_mmu_set_base_ptes(u64 base_pte);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 		u64 dirty_mask, u64 nx_mask, u64 x_mask);
 
-- 
cgit v0.10.2


From 9d893c6bc177b6ac5a1e937f4fdc359d272d68ff Mon Sep 17 00:00:00 2001
From: Tracey Dent <tdent48227@gmail.com>
Date: Sat, 6 Nov 2010 14:52:58 -0400
Subject: KVM: x86: Makefile clean up

Changed makefile to use the ccflags-y option instead of EXTRA_CFLAGS.

Signed-off-by: Tracey Dent <tdent48227@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index c53bf19..f15501f43 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,5 +1,5 @@
 
-EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
+ccflags-y += -Ivirt/kvm -Iarch/x86/kvm
 
 CFLAGS_x86.o := -I.
 CFLAGS_svm.o := -I.
-- 
cgit v0.10.2


From d89f5eff70a31237ffa1e21c51d23ca532110aea Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 9 Nov 2010 17:02:49 +0100
Subject: KVM: Clean up vm creation and release

IA64 support forces us to abstract the allocation of the kvm structure.
But instead of mixing this up with arch-specific initialization and
doing the same on destruction, split both steps. This allows moving
generic destruction calls into generic code.

It also fixes error clean-up on failures of kvm_create_vm for IA64.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 2f229e5..2689ee5 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -590,6 +590,10 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
 void kvm_sal_emul(struct kvm_vcpu *vcpu);
 
+#define __KVM_HAVE_ARCH_VM_ALLOC 1
+struct kvm *kvm_arch_alloc_vm(void);
+void kvm_arch_free_vm(struct kvm *kvm);
+
 #endif /* __ASSEMBLY__*/
 
 #endif
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index f56a631..48a48bd 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -749,7 +749,7 @@ out:
 	return r;
 }
 
-static struct kvm *kvm_alloc_kvm(void)
+struct kvm *kvm_arch_alloc_vm(void)
 {
 
 	struct kvm *kvm;
@@ -760,7 +760,7 @@ static struct kvm *kvm_alloc_kvm(void)
 	vm_base = __get_free_pages(GFP_KERNEL, get_order(KVM_VM_DATA_SIZE));
 
 	if (!vm_base)
-		return ERR_PTR(-ENOMEM);
+		return NULL;
 
 	memset((void *)vm_base, 0, KVM_VM_DATA_SIZE);
 	kvm = (struct kvm *)(vm_base +
@@ -806,10 +806,12 @@ static void kvm_build_io_pmt(struct kvm *kvm)
 #define GUEST_PHYSICAL_RR4	0x2739
 #define VMM_INIT_RR		0x1660
 
-static void kvm_init_vm(struct kvm *kvm)
+int kvm_arch_init_vm(struct kvm *kvm)
 {
 	BUG_ON(!kvm);
 
+	kvm->arch.is_sn2 = ia64_platform_is("sn2");
+
 	kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0;
 	kvm->arch.metaphysical_rr4 = GUEST_PHYSICAL_RR4;
 	kvm->arch.vmm_init_rr = VMM_INIT_RR;
@@ -823,21 +825,8 @@ static void kvm_init_vm(struct kvm *kvm)
 
 	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
 	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
-}
-
-struct  kvm *kvm_arch_create_vm(void)
-{
-	struct kvm *kvm = kvm_alloc_kvm();
-
-	if (IS_ERR(kvm))
-		return ERR_PTR(-ENOMEM);
-
-	kvm->arch.is_sn2 = ia64_platform_is("sn2");
-
-	kvm_init_vm(kvm);
-
-	return kvm;
 
+	return 0;
 }
 
 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm,
@@ -1357,7 +1346,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 	return -EINVAL;
 }
 
-static void free_kvm(struct kvm *kvm)
+void kvm_arch_free_vm(struct kvm *kvm)
 {
 	unsigned long vm_base = kvm->arch.vm_base;
 
@@ -1399,9 +1388,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 #endif
 	kfree(kvm->arch.vioapic);
 	kvm_release_vm_pages(kvm);
-	kvm_free_physmem(kvm);
-	cleanup_srcu_struct(&kvm->srcu);
-	free_kvm(kvm);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 38f756f..9975846 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -145,18 +145,12 @@ void kvm_arch_check_processor_compat(void *rtn)
 	*(int *)rtn = kvmppc_core_check_processor_compat();
 }
 
-struct kvm *kvm_arch_create_vm(void)
+int kvm_arch_init_vm(struct kvm *kvm)
 {
-	struct kvm *kvm;
-
-	kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
-	if (!kvm)
-		return ERR_PTR(-ENOMEM);
-
-	return kvm;
+	return 0;
 }
 
-static void kvmppc_free_vcpus(struct kvm *kvm)
+void kvm_arch_destroy_vm(struct kvm *kvm)
 {
 	unsigned int i;
 	struct kvm_vcpu *vcpu;
@@ -176,14 +170,6 @@ void kvm_arch_sync_events(struct kvm *kvm)
 {
 }
 
-void kvm_arch_destroy_vm(struct kvm *kvm)
-{
-	kvmppc_free_vcpus(kvm);
-	kvm_free_physmem(kvm);
-	cleanup_srcu_struct(&kvm->srcu);
-	kfree(kvm);
-}
-
 int kvm_dev_ioctl_check_extension(long ext)
 {
 	int r;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 985d825..bade533 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -164,24 +164,18 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	return r;
 }
 
-struct kvm *kvm_arch_create_vm(void)
+int kvm_arch_init_vm(struct kvm *kvm)
 {
-	struct kvm *kvm;
 	int rc;
 	char debug_name[16];
 
 	rc = s390_enable_sie();
 	if (rc)
-		goto out_nokvm;
-
-	rc = -ENOMEM;
-	kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
-	if (!kvm)
-		goto out_nokvm;
+		goto out_err;
 
 	kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL);
 	if (!kvm->arch.sca)
-		goto out_nosca;
+		goto out_err;
 
 	sprintf(debug_name, "kvm-%u", current->pid);
 
@@ -195,13 +189,11 @@ struct kvm *kvm_arch_create_vm(void)
 	debug_register_view(kvm->arch.dbf, &debug_sprintf_view);
 	VM_EVENT(kvm, 3, "%s", "vm created");
 
-	return kvm;
+	return 0;
 out_nodbf:
 	free_page((unsigned long)(kvm->arch.sca));
-out_nosca:
-	kfree(kvm);
-out_nokvm:
-	return ERR_PTR(rc);
+out_err:
+	return rc;
 }
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -240,11 +232,8 @@ void kvm_arch_sync_events(struct kvm *kvm)
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
 	kvm_free_vcpus(kvm);
-	kvm_free_physmem(kvm);
 	free_page((unsigned long)(kvm->arch.sca));
 	debug_unregister(kvm->arch.dbf);
-	cleanup_srcu_struct(&kvm->srcu);
-	kfree(kvm);
 }
 
 /* Section: vcpu related */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5daead8..b7ee61d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5961,13 +5961,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 	free_page((unsigned long)vcpu->arch.pio_data);
 }
 
-struct  kvm *kvm_arch_create_vm(void)
+int kvm_arch_init_vm(struct kvm *kvm)
 {
-	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
-
-	if (!kvm)
-		return ERR_PTR(-ENOMEM);
-
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
@@ -5976,7 +5971,7 @@ struct  kvm *kvm_arch_create_vm(void)
 
 	spin_lock_init(&kvm->arch.tsc_write_lock);
 
-	return kvm;
+	return 0;
 }
 
 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
@@ -6021,13 +6016,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	kfree(kvm->arch.vpic);
 	kfree(kvm->arch.vioapic);
 	kvm_free_vcpus(kvm);
-	kvm_free_physmem(kvm);
 	if (kvm->arch.apic_access_page)
 		put_page(kvm->arch.apic_access_page);
 	if (kvm->arch.ept_identity_pagetable)
 		put_page(kvm->arch.ept_identity_pagetable);
-	cleanup_srcu_struct(&kvm->srcu);
-	kfree(kvm);
 }
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bcf71c7..2d63f2c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -16,6 +16,7 @@
 #include <linux/mm.h>
 #include <linux/preempt.h>
 #include <linux/msi.h>
+#include <linux/slab.h>
 #include <asm/signal.h>
 
 #include <linux/kvm.h>
@@ -441,7 +442,19 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
 
 void kvm_free_physmem(struct kvm *kvm);
 
-struct  kvm *kvm_arch_create_vm(void);
+#ifndef __KVM_HAVE_ARCH_VM_ALLOC
+static inline struct kvm *kvm_arch_alloc_vm(void)
+{
+	return kzalloc(sizeof(struct kvm), GFP_KERNEL);
+}
+
+static inline void kvm_arch_free_vm(struct kvm *kvm)
+{
+	kfree(kvm);
+}
+#endif
+
+int kvm_arch_init_vm(struct kvm *kvm);
 void kvm_arch_destroy_vm(struct kvm *kvm);
 void kvm_free_all_assigned_devices(struct kvm *kvm);
 void kvm_arch_sync_events(struct kvm *kvm);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fce0578..4023264 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -383,11 +383,15 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
 
 static struct kvm *kvm_create_vm(void)
 {
-	int r = 0, i;
-	struct kvm *kvm = kvm_arch_create_vm();
+	int r, i;
+	struct kvm *kvm = kvm_arch_alloc_vm();
 
-	if (IS_ERR(kvm))
-		goto out;
+	if (!kvm)
+		return ERR_PTR(-ENOMEM);
+
+	r = kvm_arch_init_vm(kvm);
+	if (r)
+		goto out_err_nodisable;
 
 	r = hardware_enable_all();
 	if (r)
@@ -427,7 +431,7 @@ static struct kvm *kvm_create_vm(void)
 	spin_lock(&kvm_lock);
 	list_add(&kvm->vm_list, &vm_list);
 	spin_unlock(&kvm_lock);
-out:
+
 	return kvm;
 
 out_err:
@@ -438,7 +442,7 @@ out_err_nodisable:
 	for (i = 0; i < KVM_NR_BUSES; i++)
 		kfree(kvm->buses[i]);
 	kfree(kvm->memslots);
-	kfree(kvm);
+	kvm_arch_free_vm(kvm);
 	return ERR_PTR(r);
 }
 
@@ -512,6 +516,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	kvm_arch_flush_shadow(kvm);
 #endif
 	kvm_arch_destroy_vm(kvm);
+	kvm_free_physmem(kvm);
+	cleanup_srcu_struct(&kvm->srcu);
+	kvm_arch_free_vm(kvm);
 	hardware_disable_all();
 	mmdrop(mm);
 }
-- 
cgit v0.10.2


From 5054c0de664dbbb47f1e6b69d668b6018df832f6 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 12 Nov 2010 14:46:08 +0800
Subject: KVM: MMU: fix missing post sync audit

Add AUDIT_POST_SYNC audit for long mode shadow page

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 272e302..df465f6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2540,6 +2540,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 		sp = page_header(root);
 		mmu_sync_children(vcpu, sp);
+		trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
 		return;
 	}
 	for (i = 0; i < 4; ++i) {
-- 
cgit v0.10.2


From e5f3f027960128035a5fa176877b5e9160b35397 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 12 Nov 2010 14:47:01 +0800
Subject: KVM: MMU: clear apfs if page state is changed

If CR0.PG is changed, the page fault can't be avoided when the prefault address
is accessed later.

It also fixes a bug: a #PF taken with paging enabled can be retried in a
paging-disabled context if the mmu is using shadow pages.

This idea is from Gleb Natapov.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b7ee61d..9b875ff 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -520,6 +520,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 	kvm_x86_ops->set_cr0(vcpu, cr0);
 
+	if ((cr0 ^ old_cr0) & X86_CR0_PG)
+		kvm_clear_async_pf_completion_queue(vcpu);
+
 	if ((cr0 ^ old_cr0) & update_bits)
 		kvm_mmu_reset_context(vcpu);
 	return 0;
-- 
cgit v0.10.2


From 060c2abe6c5e0f90418baf0a43b48d8d9a5521fb Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 12 Nov 2010 14:49:11 +0800
Subject: KVM: MMU: support apf for nonpaing guest

Let's support apf for nonpaging guests.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index df465f6..db0fd55 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2286,7 +2286,11 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
 	return 1;
 }
 
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn,
+			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
+
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
+			 bool no_apf)
 {
 	int r;
 	int level;
@@ -2307,7 +2311,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
-	pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, &map_writable);
+
+	if (try_async_pf(vcpu, no_apf, gfn, v, &pfn, write, &map_writable))
+		return 0;
 
 	/* mmio */
 	if (is_error_pfn(pfn))
@@ -2595,7 +2601,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 	gfn = gva >> PAGE_SHIFT;
 
 	return nonpaging_map(vcpu, gva & PAGE_MASK,
-			     error_code & PFERR_WRITE_MASK, gfn);
+			     error_code & PFERR_WRITE_MASK, gfn, no_apf);
 }
 
 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
-- 
cgit v0.10.2


From c4806acdcec020fe5bbb054ce9dc75aaecaf29dd Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 12 Nov 2010 14:49:55 +0800
Subject: KVM: MMU: fix apf prefault if nested guest is enabled

If apf is generated in L2 guest and is completed in L1 guest, it will
prefault this apf in L1 guest's mmu context.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 116dac5..f1e8d5b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -600,6 +600,7 @@ struct kvm_x86_ops {
 struct kvm_arch_async_pf {
 	u32 token;
 	gfn_t gfn;
+	bool direct_map;
 };
 
 extern struct kvm_x86_ops *kvm_x86_ops;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index db0fd55..ab100a7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2609,6 +2609,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
 	struct kvm_arch_async_pf arch;
 	arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
 	arch.gfn = gfn;
+	arch.direct_map = vcpu->arch.mmu.direct_map;
 
 	return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9b875ff..c673e72 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6168,7 +6168,8 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 {
 	int r;
 
-	if (!vcpu->arch.mmu.direct_map || is_error_page(work->page))
+	if (!vcpu->arch.mmu.direct_map || !work->arch.direct_map ||
+	      is_error_page(work->page))
 		return;
 
 	r = kvm_mmu_reload(vcpu);
-- 
cgit v0.10.2


From 2a6b20b83d1d37db57a9c25d99dc78454ad0747b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 9 Nov 2010 16:15:42 +0200
Subject: KVM: SVM: Replace svm_has() by standard Linux cpuid accessors

Instead of querying cpuid directly, use the Linux accessors (boot_cpu_has,
etc.).  This allows things like the clearcpuid kernel command line to
work (when it's fixed wrt scattered cpuid bits).

Acked-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index cf47a2f..7c7f03b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -271,11 +271,6 @@ static u32 svm_msrpm_offset(u32 msr)
 
 #define MAX_INST_SIZE 15
 
-static inline u32 svm_has(u32 feat)
-{
-	return svm_features & feat;
-}
-
 static inline void clgi(void)
 {
 	asm volatile (__ex(SVM_CLGI));
@@ -381,7 +376,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	    nested_svm_check_exception(svm, nr, has_error_code, error_code))
 		return;
 
-	if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) {
+	if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
 		unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
 
 		/*
@@ -677,7 +672,7 @@ static __init int svm_hardware_setup(void)
 
 	svm_features = cpuid_edx(SVM_CPUID_FUNC);
 
-	if (!svm_has(SVM_FEATURE_NPT))
+	if (!boot_cpu_has(X86_FEATURE_NPT))
 		npt_enabled = false;
 
 	if (npt_enabled && !npt) {
@@ -876,7 +871,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 	svm->nested.vmcb = 0;
 	svm->vcpu.arch.hflags = 0;
 
-	if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
+	if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
 		control->pause_filter_count = 3000;
 		control->intercept |= (1ULL << INTERCEPT_PAUSE);
 	}
@@ -2743,7 +2738,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 		svm->vmcb->save.sysenter_esp = data;
 		break;
 	case MSR_IA32_DEBUGCTLMSR:
-		if (!svm_has(SVM_FEATURE_LBRV)) {
+		if (!boot_cpu_has(X86_FEATURE_LBRV)) {
 			pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
 					__func__, data);
 			break;
@@ -3533,7 +3528,7 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 				   additional features */
 
 		/* Support next_rip if host supports it */
-		if (svm_has(SVM_FEATURE_NRIP))
+		if (boot_cpu_has(X86_FEATURE_NRIPS))
 			entry->edx |= SVM_FEATURE_NRIP;
 
 		/* Support NPT for the guest if enabled */
-- 
cgit v0.10.2


From 945ee35e0764a834bc5bc5d572085d5b6409902a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 9 Nov 2010 16:15:43 +0200
Subject: KVM: Mask KVM_GET_SUPPORTED_CPUID data with Linux cpuid info

This allows Linux to mask cpuid bits if, for example, nx is enabled on only
some cpus.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c673e72..651cf90 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2227,6 +2227,11 @@ out:
 	return r;
 }
 
+static void cpuid_mask(u32 *word, int wordnum)
+{
+	*word &= boot_cpu_data.x86_capability[wordnum];
+}
+
 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			   u32 index)
 {
@@ -2301,7 +2306,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		break;
 	case 1:
 		entry->edx &= kvm_supported_word0_x86_features;
+		cpuid_mask(&entry->edx, 0);
 		entry->ecx &= kvm_supported_word4_x86_features;
+		cpuid_mask(&entry->ecx, 4);
 		/* we support x2apic emulation even if host does not support
 		 * it since we emulate x2apic in software */
 		entry->ecx |= F(X2APIC);
@@ -2392,7 +2399,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		break;
 	case 0x80000001:
 		entry->edx &= kvm_supported_word1_x86_features;
+		cpuid_mask(&entry->edx, 1);
 		entry->ecx &= kvm_supported_word6_x86_features;
+		cpuid_mask(&entry->ecx, 6);
 		break;
 	}
 
-- 
cgit v0.10.2


From 0c106b5aaa727c7f508828e94cff4a98874f930c Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 16 Nov 2010 22:30:02 +0100
Subject: KVM: Clear assigned guest IRQ on release

When we deassign a guest IRQ, clear the potentially asserted guest line.
There might be no chance for the guest to do this, specifically if we
switch from INTx to MSI mode.

Acked-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 7c98928..ecc4419 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -141,6 +141,9 @@ static void deassign_guest_irq(struct kvm *kvm,
 	kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
 	assigned_dev->ack_notifier.gsi = -1;
 
+	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+		    assigned_dev->guest_irq, 0);
+
 	if (assigned_dev->irq_source_id != -1)
 		kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
 	assigned_dev->irq_source_id = -1;
-- 
cgit v0.10.2


From 0645211c43df0b96c51e12980066b3227e10b164 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 16 Nov 2010 22:30:03 +0100
Subject: KVM: Switch assigned device IRQ forwarding to threaded handler

This improves the IRQ forwarding for assigned devices: By using the
kernel's threaded IRQ scheme, we can get rid of the latency-prone work
queue and simplify the code in the same run.

Moreover, we no longer have to hold assigned_dev_lock while raising the
guest IRQ, which can be a lengthy operation as we may have to iterate
over all VCPUs. The lock is now only used for synchronizing masking vs.
unmasking of INTx-type IRQs, and is therefore renamed to intx_lock.

Acked-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2d63f2c..9fe7fef 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -470,16 +470,8 @@ struct kvm_irq_ack_notifier {
 	void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
 };
 
-#define KVM_ASSIGNED_MSIX_PENDING		0x1
-struct kvm_guest_msix_entry {
-	u32 vector;
-	u16 entry;
-	u16 flags;
-};
-
 struct kvm_assigned_dev_kernel {
 	struct kvm_irq_ack_notifier ack_notifier;
-	struct work_struct interrupt_work;
 	struct list_head list;
 	int assigned_dev_id;
 	int host_segnr;
@@ -490,13 +482,13 @@ struct kvm_assigned_dev_kernel {
 	bool host_irq_disabled;
 	struct msix_entry *host_msix_entries;
 	int guest_irq;
-	struct kvm_guest_msix_entry *guest_msix_entries;
+	struct msix_entry *guest_msix_entries;
 	unsigned long irq_requested_type;
 	int irq_source_id;
 	int flags;
 	struct pci_dev *dev;
 	struct kvm *kvm;
-	spinlock_t assigned_dev_lock;
+	spinlock_t intx_lock;
 };
 
 struct kvm_irq_mask_notifier {
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index ecc4419..1d77ce1 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -55,58 +55,31 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
 	return index;
 }
 
-static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
+static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
 {
-	struct kvm_assigned_dev_kernel *assigned_dev;
-	int i;
+	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+	u32 vector;
+	int index;
 
-	assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
-				    interrupt_work);
+	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
+		spin_lock(&assigned_dev->intx_lock);
+		disable_irq_nosync(irq);
+		assigned_dev->host_irq_disabled = true;
+		spin_unlock(&assigned_dev->intx_lock);
+	}
 
-	spin_lock_irq(&assigned_dev->assigned_dev_lock);
 	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
-		struct kvm_guest_msix_entry *guest_entries =
-			assigned_dev->guest_msix_entries;
-		for (i = 0; i < assigned_dev->entries_nr; i++) {
-			if (!(guest_entries[i].flags &
-					KVM_ASSIGNED_MSIX_PENDING))
-				continue;
-			guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
+		index = find_index_from_host_irq(assigned_dev, irq);
+		if (index >= 0) {
+			vector = assigned_dev->
+					guest_msix_entries[index].vector;
 			kvm_set_irq(assigned_dev->kvm,
-				    assigned_dev->irq_source_id,
-				    guest_entries[i].vector, 1);
+				    assigned_dev->irq_source_id, vector, 1);
 		}
 	} else
 		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
 			    assigned_dev->guest_irq, 1);
 
-	spin_unlock_irq(&assigned_dev->assigned_dev_lock);
-}
-
-static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
-{
-	unsigned long flags;
-	struct kvm_assigned_dev_kernel *assigned_dev =
-		(struct kvm_assigned_dev_kernel *) dev_id;
-
-	spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
-	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
-		int index = find_index_from_host_irq(assigned_dev, irq);
-		if (index < 0)
-			goto out;
-		assigned_dev->guest_msix_entries[index].flags |=
-			KVM_ASSIGNED_MSIX_PENDING;
-	}
-
-	schedule_work(&assigned_dev->interrupt_work);
-
-	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
-		disable_irq_nosync(irq);
-		assigned_dev->host_irq_disabled = true;
-	}
-
-out:
-	spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
 	return IRQ_HANDLED;
 }
 
@@ -114,7 +87,6 @@ out:
 static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 {
 	struct kvm_assigned_dev_kernel *dev;
-	unsigned long flags;
 
 	if (kian->gsi == -1)
 		return;
@@ -127,12 +99,12 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 	/* The guest irq may be shared so this ack may be
 	 * from another device.
 	 */
-	spin_lock_irqsave(&dev->assigned_dev_lock, flags);
+	spin_lock(&dev->intx_lock);
 	if (dev->host_irq_disabled) {
 		enable_irq(dev->host_irq);
 		dev->host_irq_disabled = false;
 	}
-	spin_unlock_irqrestore(&dev->assigned_dev_lock, flags);
+	spin_unlock(&dev->intx_lock);
 }
 
 static void deassign_guest_irq(struct kvm *kvm,
@@ -155,28 +127,19 @@ static void deassign_host_irq(struct kvm *kvm,
 			      struct kvm_assigned_dev_kernel *assigned_dev)
 {
 	/*
-	 * In kvm_free_device_irq, cancel_work_sync return true if:
-	 * 1. work is scheduled, and then cancelled.
-	 * 2. work callback is executed.
-	 *
-	 * The first one ensured that the irq is disabled and no more events
-	 * would happen. But for the second one, the irq may be enabled (e.g.
-	 * for MSI). So we disable irq here to prevent further events.
+	 * We disable irq here to prevent further events.
 	 *
 	 * Notice this maybe result in nested disable if the interrupt type is
 	 * INTx, but it's OK for we are going to free it.
 	 *
 	 * If this function is a part of VM destroy, please ensure that till
 	 * now, the kvm state is still legal for probably we also have to wait
-	 * interrupt_work done.
+	 * on a currently running IRQ handler.
 	 */
 	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
 		int i;
 		for (i = 0; i < assigned_dev->entries_nr; i++)
-			disable_irq_nosync(assigned_dev->
-					   host_msix_entries[i].vector);
-
-		cancel_work_sync(&assigned_dev->interrupt_work);
+			disable_irq(assigned_dev->host_msix_entries[i].vector);
 
 		for (i = 0; i < assigned_dev->entries_nr; i++)
 			free_irq(assigned_dev->host_msix_entries[i].vector,
@@ -188,8 +151,7 @@ static void deassign_host_irq(struct kvm *kvm,
 		pci_disable_msix(assigned_dev->dev);
 	} else {
 		/* Deal with MSI and INTx */
-		disable_irq_nosync(assigned_dev->host_irq);
-		cancel_work_sync(&assigned_dev->interrupt_work);
+		disable_irq(assigned_dev->host_irq);
 
 		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
 
@@ -268,8 +230,9 @@ static int assigned_device_enable_host_intx(struct kvm *kvm,
 	 * on the same interrupt line is not a happy situation: there
 	 * are going to be long delays in accepting, acking, etc.
 	 */
-	if (request_irq(dev->host_irq, kvm_assigned_dev_intr,
-			0, "kvm_assigned_intx_device", (void *)dev))
+	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
+				 IRQF_ONESHOT, "kvm_assigned_intx_device",
+				 (void *)dev))
 		return -EIO;
 	return 0;
 }
@@ -287,8 +250,8 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
 	}
 
 	dev->host_irq = dev->dev->irq;
-	if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0,
-			"kvm_assigned_msi_device", (void *)dev)) {
+	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
+				 0, "kvm_assigned_msi_device", (void *)dev)) {
 		pci_disable_msi(dev->dev);
 		return -EIO;
 	}
@@ -313,10 +276,10 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
 		return r;
 
 	for (i = 0; i < dev->entries_nr; i++) {
-		r = request_irq(dev->host_msix_entries[i].vector,
-				kvm_assigned_dev_intr, 0,
-				"kvm_assigned_msix_device",
-				(void *)dev);
+		r = request_threaded_irq(dev->host_msix_entries[i].vector,
+					 NULL, kvm_assigned_dev_thread,
+					 0, "kvm_assigned_msix_device",
+					 (void *)dev);
 		if (r)
 			goto err;
 	}
@@ -557,12 +520,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 	match->host_devfn = assigned_dev->devfn;
 	match->flags = assigned_dev->flags;
 	match->dev = dev;
-	spin_lock_init(&match->assigned_dev_lock);
+	spin_lock_init(&match->intx_lock);
 	match->irq_source_id = -1;
 	match->kvm = kvm;
 	match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
-	INIT_WORK(&match->interrupt_work,
-		  kvm_assigned_dev_interrupt_work_handler);
 
 	list_add(&match->list, &kvm->arch.assigned_dev_head);
 
@@ -654,9 +615,9 @@ static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
 			r = -ENOMEM;
 			goto msix_nr_out;
 		}
-		adev->guest_msix_entries = kzalloc(
-				sizeof(struct kvm_guest_msix_entry) *
-				entry_nr->entry_nr, GFP_KERNEL);
+		adev->guest_msix_entries =
+			kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr,
+				GFP_KERNEL);
 		if (!adev->guest_msix_entries) {
 			kfree(adev->host_msix_entries);
 			r = -ENOMEM;
-- 
cgit v0.10.2


From 1e001d49f9f9a0e3eb72939ad49d9a2c7754e9c1 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 16 Nov 2010 22:30:04 +0100
Subject: KVM: Refactor IRQ names of assigned devices

Cosmetic change, but it helps to correlate IRQs with PCI devices.

Acked-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9fe7fef..4bd663d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -489,6 +489,7 @@ struct kvm_assigned_dev_kernel {
 	struct pci_dev *dev;
 	struct kvm *kvm;
 	spinlock_t intx_lock;
+	char irq_name[32];
 };
 
 struct kvm_irq_mask_notifier {
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 1d77ce1..7623408 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -231,8 +231,7 @@ static int assigned_device_enable_host_intx(struct kvm *kvm,
 	 * are going to be long delays in accepting, acking, etc.
 	 */
 	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
-				 IRQF_ONESHOT, "kvm_assigned_intx_device",
-				 (void *)dev))
+				 IRQF_ONESHOT, dev->irq_name, (void *)dev))
 		return -EIO;
 	return 0;
 }
@@ -251,7 +250,7 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
 
 	dev->host_irq = dev->dev->irq;
 	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
-				 0, "kvm_assigned_msi_device", (void *)dev)) {
+				 0, dev->irq_name, (void *)dev)) {
 		pci_disable_msi(dev->dev);
 		return -EIO;
 	}
@@ -278,8 +277,7 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
 	for (i = 0; i < dev->entries_nr; i++) {
 		r = request_threaded_irq(dev->host_msix_entries[i].vector,
 					 NULL, kvm_assigned_dev_thread,
-					 0, "kvm_assigned_msix_device",
-					 (void *)dev);
+					 0, dev->irq_name, (void *)dev);
 		if (r)
 			goto err;
 	}
@@ -336,6 +334,9 @@ static int assign_host_irq(struct kvm *kvm,
 	if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
 		return r;
 
+	snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s",
+		 pci_name(dev->dev));
+
 	switch (host_irq_type) {
 	case KVM_DEV_IRQ_HOST_INTX:
 		r = assigned_device_enable_host_intx(kvm, dev);
-- 
cgit v0.10.2


From ed78661f2614d3c9f69c23e280db3bafdabdf5bb Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 16 Nov 2010 22:30:05 +0100
Subject: KVM: Save/restore state of assigned PCI device

The guest may change states that pci_reset_function does not touch. So
we better save/restore the assigned device across guest usage.

Acked-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 7623408..d389207 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -197,7 +197,8 @@ static void kvm_free_assigned_device(struct kvm *kvm,
 {
 	kvm_free_assigned_irq(kvm, assigned_dev);
 
-	pci_reset_function(assigned_dev->dev);
+	__pci_reset_function(assigned_dev->dev);
+	pci_restore_state(assigned_dev->dev);
 
 	pci_release_regions(assigned_dev->dev);
 	pci_disable_device(assigned_dev->dev);
@@ -514,6 +515,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 	}
 
 	pci_reset_function(dev);
+	pci_save_state(dev);
 
 	match->assigned_dev_id = assigned_dev->assigned_dev_id;
 	match->host_segnr = assigned_dev->segnr;
@@ -544,6 +546,7 @@ out:
 	mutex_unlock(&kvm->lock);
 	return r;
 out_list_del:
+	pci_restore_state(dev);
 	list_del(&match->list);
 	pci_release_regions(dev);
 out_disable:
-- 
cgit v0.10.2


From 51de271d441c01e7a0cf39f128827e0b4dc56409 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 16 Nov 2010 22:30:06 +0100
Subject: KVM: Clean up kvm_vm_ioctl_assigned_device

Any arch not supporting device assignment will also not build
assigned-dev.c. So testing for KVM_CAP_DEVICE_DEASSIGNMENT is pointless.
KVM_CAP_ASSIGN_DEV_IRQ is unconditionally set. Moreover, add a default
case for dispatching the ioctl.

Acked-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index d389207..ae72ae60 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -674,7 +674,7 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
 				  unsigned long arg)
 {
 	void __user *argp = (void __user *)arg;
-	int r = -ENOTTY;
+	int r;
 
 	switch (ioctl) {
 	case KVM_ASSIGN_PCI_DEVICE: {
@@ -692,7 +692,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
 		r = -EOPNOTSUPP;
 		break;
 	}
-#ifdef KVM_CAP_ASSIGN_DEV_IRQ
 	case KVM_ASSIGN_DEV_IRQ: {
 		struct kvm_assigned_irq assigned_irq;
 
@@ -715,8 +714,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
 			goto out;
 		break;
 	}
-#endif
-#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
 	case KVM_DEASSIGN_PCI_DEVICE: {
 		struct kvm_assigned_pci_dev assigned_dev;
 
@@ -728,7 +725,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
 			goto out;
 		break;
 	}
-#endif
 #ifdef KVM_CAP_IRQ_ROUTING
 	case KVM_SET_GSI_ROUTING: {
 		struct kvm_irq_routing routing;
@@ -781,6 +777,9 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
 		break;
 	}
 #endif
+	default:
+		r = -ENOTTY;
+		break;
 	}
 out:
 	return r;
-- 
cgit v0.10.2


From 49f481720438bbd0138218b0bcb494c3512a454f Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 16 Nov 2010 22:30:07 +0100
Subject: KVM: Document device assignment API

Adds API documentation for KVM_[DE]ASSIGN_PCI_DEVICE,
KVM_[DE]ASSIGN_DEV_IRQ, KVM_SET_GSI_ROUTING, KVM_ASSIGN_SET_MSIX_NR, and
KVM_ASSIGN_SET_MSIX_ENTRY.

Acked-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index b336266..e1a9297 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -1085,6 +1085,184 @@ of 4 instructions that make up a hypercall.
 If any additional field gets added to this structure later on, a bit for that
 additional piece of information will be set in the flags bitmap.
 
+4.47 KVM_ASSIGN_PCI_DEVICE
+
+Capability: KVM_CAP_DEVICE_ASSIGNMENT
+Architectures: x86 ia64
+Type: vm ioctl
+Parameters: struct kvm_assigned_pci_dev (in)
+Returns: 0 on success, -1 on error
+
+Assigns a host PCI device to the VM.
+
+struct kvm_assigned_pci_dev {
+	__u32 assigned_dev_id;
+	__u32 busnr;
+	__u32 devfn;
+	__u32 flags;
+	__u32 segnr;
+	union {
+		__u32 reserved[11];
+	};
+};
+
+The PCI device is specified by the triple segnr, busnr, and devfn.
+Identification in succeeding service requests is done via assigned_dev_id. The
+following flags are specified:
+
+/* Depends on KVM_CAP_IOMMU */
+#define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
+
+4.48 KVM_DEASSIGN_PCI_DEVICE
+
+Capability: KVM_CAP_DEVICE_DEASSIGNMENT
+Architectures: x86 ia64
+Type: vm ioctl
+Parameters: struct kvm_assigned_pci_dev (in)
+Returns: 0 on success, -1 on error
+
+Ends PCI device assignment, releasing all associated resources.
+
+See KVM_ASSIGN_PCI_DEVICE for the data structure. Only assigned_dev_id is
+used in kvm_assigned_pci_dev to identify the device.
+
+4.49 KVM_ASSIGN_DEV_IRQ
+
+Capability: KVM_CAP_ASSIGN_DEV_IRQ
+Architectures: x86 ia64
+Type: vm ioctl
+Parameters: struct kvm_assigned_irq (in)
+Returns: 0 on success, -1 on error
+
+Assigns an IRQ to a passed-through device.
+
+struct kvm_assigned_irq {
+	__u32 assigned_dev_id;
+	__u32 host_irq;
+	__u32 guest_irq;
+	__u32 flags;
+	union {
+		struct {
+			__u32 addr_lo;
+			__u32 addr_hi;
+			__u32 data;
+		} guest_msi;
+		__u32 reserved[12];
+	};
+};
+
+The following flags are defined:
+
+#define KVM_DEV_IRQ_HOST_INTX    (1 << 0)
+#define KVM_DEV_IRQ_HOST_MSI     (1 << 1)
+#define KVM_DEV_IRQ_HOST_MSIX    (1 << 2)
+
+#define KVM_DEV_IRQ_GUEST_INTX   (1 << 8)
+#define KVM_DEV_IRQ_GUEST_MSI    (1 << 9)
+#define KVM_DEV_IRQ_GUEST_MSIX   (1 << 10)
+
+It is not valid to specify multiple types per host or guest IRQ. However, the
+IRQ type of host and guest can differ or can even be null.
+
+4.50 KVM_DEASSIGN_DEV_IRQ
+
+Capability: KVM_CAP_ASSIGN_DEV_IRQ
+Architectures: x86 ia64
+Type: vm ioctl
+Parameters: struct kvm_assigned_irq (in)
+Returns: 0 on success, -1 on error
+
+Ends an IRQ assignment to a passed-through device.
+
+See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified
+by assigned_dev_id, flags must correspond to the IRQ type specified on
+KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or guest IRQ is allowed.
+
+4.51 KVM_SET_GSI_ROUTING
+
+Capability: KVM_CAP_IRQ_ROUTING
+Architectures: x86 ia64
+Type: vm ioctl
+Parameters: struct kvm_irq_routing (in)
+Returns: 0 on success, -1 on error
+
+Sets the GSI routing table entries, overwriting any previously set entries.
+
+struct kvm_irq_routing {
+	__u32 nr;
+	__u32 flags;
+	struct kvm_irq_routing_entry entries[0];
+};
+
+No flags are specified so far, the corresponding field must be set to zero.
+
+struct kvm_irq_routing_entry {
+	__u32 gsi;
+	__u32 type;
+	__u32 flags;
+	__u32 pad;
+	union {
+		struct kvm_irq_routing_irqchip irqchip;
+		struct kvm_irq_routing_msi msi;
+		__u32 pad[8];
+	} u;
+};
+
+/* gsi routing entry types */
+#define KVM_IRQ_ROUTING_IRQCHIP 1
+#define KVM_IRQ_ROUTING_MSI 2
+
+No flags are specified so far, the corresponding field must be set to zero.
+
+struct kvm_irq_routing_irqchip {
+	__u32 irqchip;
+	__u32 pin;
+};
+
+struct kvm_irq_routing_msi {
+	__u32 address_lo;
+	__u32 address_hi;
+	__u32 data;
+	__u32 pad;
+};
+
+4.52 KVM_ASSIGN_SET_MSIX_NR
+
+Capability: KVM_CAP_DEVICE_MSIX
+Architectures: x86 ia64
+Type: vm ioctl
+Parameters: struct kvm_assigned_msix_nr (in)
+Returns: 0 on success, -1 on error
+
+Set the number of MSI-X interrupts for an assigned device. This service can
+only be called once in the lifetime of an assigned device.
+
+struct kvm_assigned_msix_nr {
+	__u32 assigned_dev_id;
+	__u16 entry_nr;
+	__u16 padding;
+};
+
+#define KVM_MAX_MSIX_PER_DEV		256
+
+4.53 KVM_ASSIGN_SET_MSIX_ENTRY
+
+Capability: KVM_CAP_DEVICE_MSIX
+Architectures: x86 ia64
+Type: vm ioctl
+Parameters: struct kvm_assigned_msix_entry (in)
+Returns: 0 on success, -1 on error
+
+Specifies the routing of an MSI-X assigned device interrupt to a GSI. Setting
+the GSI vector to zero means disabling the interrupt.
+
+struct kvm_assigned_msix_entry {
+	__u32 assigned_dev_id;
+	__u32 gsi;
+	__u16 entry; /* The index of entry in the MSI-X table */
+	__u16 padding[3];
+};
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
-- 
cgit v0.10.2


From e730b63cc083852551b092e1c93f0ef22c25f220 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Wed, 17 Nov 2010 12:11:41 +0800
Subject: KVM: MMU: don't mark spte notrap if reserved bit set

If a reserved bit is set, we need to inject the #PF with PFEC.RSVD=1,
but shadow_notrap_nonpresent_pte only injects the #PF with PFEC.RSVD=0.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ba00eef..590bf12 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -395,8 +395,10 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 
 		gpte = gptep[i];
 
-		if (!is_present_gpte(gpte) ||
-		      is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
+		if (is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL))
+			continue;
+
+		if (!is_present_gpte(gpte)) {
 			if (!sp->unsync)
 				__set_spte(spte, shadow_notrap_nonpresent_pte);
 			continue;
@@ -760,6 +762,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 		pt_element_t gpte;
 		gpa_t pte_gpa;
 		gfn_t gfn;
+		bool rsvd_bits_set;
 
 		if (!is_shadow_present_pte(sp->spt[i]))
 			continue;
@@ -771,12 +774,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			return -EINVAL;
 
 		gfn = gpte_to_gfn(gpte);
-		if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
-		      || gfn != sp->gfns[i] || !is_present_gpte(gpte)
-		      || !(gpte & PT_ACCESSED_MASK)) {
+		rsvd_bits_set = is_rsvd_bits_set(&vcpu->arch.mmu, gpte,
+						 PT_PAGE_TABLE_LEVEL);
+		if (rsvd_bits_set || gfn != sp->gfns[i] ||
+		      !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) {
 			u64 nonpresent;
 
-			if (is_present_gpte(gpte) || !clear_unsync)
+			if (rsvd_bits_set || is_present_gpte(gpte) ||
+			      !clear_unsync)
 				nonpresent = shadow_trap_nonpresent_pte;
 			else
 				nonpresent = shadow_notrap_nonpresent_pte;
-- 
cgit v0.10.2


From 97e91e28fa8fcbac30beab3de72060ada27d5671 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Tue, 16 Nov 2010 17:35:02 +0900
Subject: KVM: take kvm_lock for hardware_disable() during cpu hotplug

In kvm_cpu_hotplug(), only the CPU_STARTING case is protected by kvm_lock.
This patch adds the missing protection for the CPU_DYING case.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4023264..f69fca7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2140,7 +2140,9 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
 	case CPU_DYING:
 		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
 		       cpu);
+		spin_lock(&kvm_lock);
 		hardware_disable(NULL);
+		spin_unlock(&kvm_lock);
 		break;
 	case CPU_STARTING:
 		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
-- 
cgit v0.10.2


From 75b7127c3858261fc080dd52a022424a7e7f6ae5 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Tue, 16 Nov 2010 17:37:41 +0900
Subject: KVM: rename hardware_[dis|en]able() to *_nolock() and add locking
 wrappers

The naming convention of the hardware_[dis|en]able family is a little bit
confusing because only hardware_[dis|en]able_all are using the _nolock
suffix.

Renaming the current hardware_[dis|en]able() to *_nolock() and using
hardware_[dis|en]able() as wrapper functions which take kvm_lock for them
reduces this confusion.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f69fca7..5156d45 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2059,7 +2059,7 @@ static struct miscdevice kvm_dev = {
 	&kvm_chardev_ops,
 };
 
-static void hardware_enable(void *junk)
+static void hardware_enable_nolock(void *junk)
 {
 	int cpu = raw_smp_processor_id();
 	int r;
@@ -2079,7 +2079,14 @@ static void hardware_enable(void *junk)
 	}
 }
 
-static void hardware_disable(void *junk)
+static void hardware_enable(void *junk)
+{
+	spin_lock(&kvm_lock);
+	hardware_enable_nolock(junk);
+	spin_unlock(&kvm_lock);
+}
+
+static void hardware_disable_nolock(void *junk)
 {
 	int cpu = raw_smp_processor_id();
 
@@ -2089,13 +2096,20 @@ static void hardware_disable(void *junk)
 	kvm_arch_hardware_disable(NULL);
 }
 
+static void hardware_disable(void *junk)
+{
+	spin_lock(&kvm_lock);
+	hardware_disable_nolock(junk);
+	spin_unlock(&kvm_lock);
+}
+
 static void hardware_disable_all_nolock(void)
 {
 	BUG_ON(!kvm_usage_count);
 
 	kvm_usage_count--;
 	if (!kvm_usage_count)
-		on_each_cpu(hardware_disable, NULL, 1);
+		on_each_cpu(hardware_disable_nolock, NULL, 1);
 }
 
 static void hardware_disable_all(void)
@@ -2114,7 +2128,7 @@ static int hardware_enable_all(void)
 	kvm_usage_count++;
 	if (kvm_usage_count == 1) {
 		atomic_set(&hardware_enable_failed, 0);
-		on_each_cpu(hardware_enable, NULL, 1);
+		on_each_cpu(hardware_enable_nolock, NULL, 1);
 
 		if (atomic_read(&hardware_enable_failed)) {
 			hardware_disable_all_nolock();
@@ -2140,16 +2154,12 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
 	case CPU_DYING:
 		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
 		       cpu);
-		spin_lock(&kvm_lock);
 		hardware_disable(NULL);
-		spin_unlock(&kvm_lock);
 		break;
 	case CPU_STARTING:
 		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
 		       cpu);
-		spin_lock(&kvm_lock);
 		hardware_enable(NULL);
-		spin_unlock(&kvm_lock);
 		break;
 	}
 	return NOTIFY_OK;
@@ -2180,7 +2190,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
 	 */
 	printk(KERN_INFO "kvm: exiting hardware virtualization\n");
 	kvm_rebooting = true;
-	on_each_cpu(hardware_disable, NULL, 1);
+	on_each_cpu(hardware_disable_nolock, NULL, 1);
 	return NOTIFY_OK;
 }
 
@@ -2350,7 +2360,7 @@ static void kvm_exit_debug(void)
 static int kvm_suspend(struct sys_device *dev, pm_message_t state)
 {
 	if (kvm_usage_count)
-		hardware_disable(NULL);
+		hardware_disable_nolock(NULL);
 	return 0;
 }
 
@@ -2358,7 +2368,7 @@ static int kvm_resume(struct sys_device *dev)
 {
 	if (kvm_usage_count) {
 		WARN_ON(spin_is_locked(&kvm_lock));
-		hardware_enable(NULL);
+		hardware_enable_nolock(NULL);
 	}
 	return 0;
 }
@@ -2535,7 +2545,7 @@ void kvm_exit(void)
 	sysdev_class_unregister(&kvm_sysdev_class);
 	unregister_reboot_notifier(&kvm_reboot_notifier);
 	unregister_cpu_notifier(&kvm_cpu_notifier);
-	on_each_cpu(hardware_disable, NULL, 1);
+	on_each_cpu(hardware_disable_nolock, NULL, 1);
 	kvm_arch_hardware_unsetup();
 	kvm_arch_exit();
 	free_cpumask_var(cpus_hardware_enabled);
-- 
cgit v0.10.2


From f9335afea5d649693aee1ec8af2cc8ccf376f5a9 Mon Sep 17 00:00:00 2001
From: Shane Wang <shane.wang@intel.com>
Date: Wed, 17 Nov 2010 11:40:17 +0800
Subject: KVM: VMX: Inform user about INTEL_TXT dependency

Inform user to either disable TXT in the BIOS or do TXT launch
with tboot before enabling KVM since some BIOSes do not set
FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX bit when TXT is enabled.

Signed-off-by: Shane Wang <shane.wang@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8087c4d..92612fb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1305,8 +1305,11 @@ static __init int vmx_disabled_by_bios(void)
 			&& tboot_enabled())
 			return 1;
 		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
-			&& !tboot_enabled())
+			&& !tboot_enabled()) {
+			printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
+				" activate TXT before enabling KVM\n");
 			return 1;
+		}
 	}
 
 	return 0;
-- 
cgit v0.10.2


From 8a6bcaa6efeb7f970474890d4dcfe89774ff26a4 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 17 Nov 2010 13:40:50 +0200
Subject: KVM: x86 emulator: drop unused #ifndef __KERNEL__

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 38b6e8d..ffd6e01 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -20,16 +20,9 @@
  * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
  */
 
-#ifndef __KERNEL__
-#include <stdio.h>
-#include <stdint.h>
-#include <public/xen.h>
-#define DPRINTF(_f, _a ...) printf(_f , ## _a)
-#else
 #include <linux/kvm_host.h>
 #include "kvm_cache_regs.h"
 #define DPRINTF(x...) do {} while (0)
-#endif
 #include <linux/module.h>
 #include <asm/kvm_emulate.h>
 
-- 
cgit v0.10.2


From d53db5efc2f6026f7cb0871c91b887ed55e0f265 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 17 Nov 2010 13:40:51 +0200
Subject: KVM: x86 emulator: drop DPRINTF()

Failed emulation is reported via a tracepoint; the cmps printk is pointless.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index ffd6e01..3325b47 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -22,7 +22,6 @@
 
 #include <linux/kvm_host.h>
 #include "kvm_cache_regs.h"
-#define DPRINTF(x...) do {} while (0)
 #include <linux/module.h>
 #include <asm/kvm_emulate.h>
 
@@ -2796,10 +2795,8 @@ done_prefixes:
 	c->execute = opcode.u.execute;
 
 	/* Unrecognised? */
-	if (c->d == 0 || (c->d & Undefined)) {
-		DPRINTF("Cannot emulate %02x\n", c->b);
+	if (c->d == 0 || (c->d & Undefined))
 		return -1;
-	}
 
 	if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
 		c->op_bytes = 8;
@@ -3261,7 +3258,6 @@ special_insn:
 		break;
 	case 0xa6 ... 0xa7:	/* cmps */
 		c->dst.type = OP_NONE; /* Disable writeback. */
-		DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem);
 		goto cmp;
 	case 0xa8 ... 0xa9:	/* test ax, imm */
 		goto test;
@@ -3778,6 +3774,5 @@ twobyte_insn:
 	goto writeback;
 
 cannot_emulate:
-	DPRINTF("Cannot emulate %02x\n", c->b);
 	return -1;
 }
-- 
cgit v0.10.2


From 90de84f50b425805bf7ddc430143ed2e224ebd8e Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 17 Nov 2010 15:28:21 +0200
Subject: KVM: x86 emulator: preserve an operand's segment identity

Currently the x86 emulator converts the segment register associated with
an operand into a segment base which is added into the operand address.
This loss of information results in us not doing segment limit checks properly.

Replace struct operand's addr.mem field by a segmented_address structure
which holds both the effective address and segment.  This will allow us to
do the limit check at the point of access.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b36c6b3..b48c133 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -159,7 +159,10 @@ struct operand {
 	};
 	union {
 		unsigned long *reg;
-		unsigned long mem;
+		struct segmented_address {
+			ulong ea;
+			unsigned seg;
+		} mem;
 	} addr;
 	union {
 		unsigned long val;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 3325b47..e967055 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -410,9 +410,9 @@ address_mask(struct decode_cache *c, unsigned long reg)
 }
 
 static inline unsigned long
-register_address(struct decode_cache *c, unsigned long base, unsigned long reg)
+register_address(struct decode_cache *c, unsigned long reg)
 {
-	return base + address_mask(c, reg);
+	return address_mask(c, reg);
 }
 
 static inline void
@@ -444,26 +444,26 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
 	return ops->get_cached_segment_base(seg, ctxt->vcpu);
 }
 
-static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
-				       struct x86_emulate_ops *ops,
-				       struct decode_cache *c)
+static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
+			     struct x86_emulate_ops *ops,
+			     struct decode_cache *c)
 {
 	if (!c->has_seg_override)
 		return 0;
 
-	return seg_base(ctxt, ops, c->seg_override);
+	return c->seg_override;
 }
 
-static unsigned long es_base(struct x86_emulate_ctxt *ctxt,
-			     struct x86_emulate_ops *ops)
+static ulong linear(struct x86_emulate_ctxt *ctxt,
+		    struct segmented_address addr)
 {
-	return seg_base(ctxt, ops, VCPU_SREG_ES);
-}
+	struct decode_cache *c = &ctxt->decode;
+	ulong la;
 
-static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
-			     struct x86_emulate_ops *ops)
-{
-	return seg_base(ctxt, ops, VCPU_SREG_SS);
+	la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
+	if (c->ad_bytes != 8)
+		la &= (u32)-1;
+	return la;
 }
 
 static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
@@ -556,7 +556,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
 
 static int read_descriptor(struct x86_emulate_ctxt *ctxt,
 			   struct x86_emulate_ops *ops,
-			   ulong addr,
+			   struct segmented_address addr,
 			   u16 *size, unsigned long *address, int op_bytes)
 {
 	int rc;
@@ -564,10 +564,12 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
 	if (op_bytes == 2)
 		op_bytes = 3;
 	*address = 0;
-	rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL);
+	rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2,
+			   ctxt->vcpu, NULL);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
-	rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL);
+	rc = ops->read_std(linear(ctxt, addr) + 2, address, op_bytes,
+			   ctxt->vcpu, NULL);
 	return rc;
 }
 
@@ -760,7 +762,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 			break;
 		}
 	}
-	op->addr.mem = modrm_ea;
+	op->addr.mem.ea = modrm_ea;
 done:
 	return rc;
 }
@@ -775,13 +777,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
 	op->type = OP_MEM;
 	switch (c->ad_bytes) {
 	case 2:
-		op->addr.mem = insn_fetch(u16, 2, c->eip);
+		op->addr.mem.ea = insn_fetch(u16, 2, c->eip);
 		break;
 	case 4:
-		op->addr.mem = insn_fetch(u32, 4, c->eip);
+		op->addr.mem.ea = insn_fetch(u32, 4, c->eip);
 		break;
 	case 8:
-		op->addr.mem = insn_fetch(u64, 8, c->eip);
+		op->addr.mem.ea = insn_fetch(u64, 8, c->eip);
 		break;
 	}
 done:
@@ -800,7 +802,7 @@ static void fetch_bit_operand(struct decode_cache *c)
 		else if (c->src.bytes == 4)
 			sv = (s32)c->src.val & (s32)mask;
 
-		c->dst.addr.mem += (sv >> 3);
+		c->dst.addr.mem.ea += (sv >> 3);
 	}
 
 	/* only subword offset */
@@ -1093,7 +1095,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 	case OP_MEM:
 		if (c->lock_prefix)
 			rc = ops->cmpxchg_emulated(
-					c->dst.addr.mem,
+					linear(ctxt, c->dst.addr.mem),
 					&c->dst.orig_val,
 					&c->dst.val,
 					c->dst.bytes,
@@ -1101,7 +1103,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 					ctxt->vcpu);
 		else
 			rc = ops->write_emulated(
-					c->dst.addr.mem,
+					linear(ctxt, c->dst.addr.mem),
 					&c->dst.val,
 					c->dst.bytes,
 					&err,
@@ -1129,8 +1131,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
 	c->dst.bytes = c->op_bytes;
 	c->dst.val = c->src.val;
 	register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
-	c->dst.addr.mem = register_address(c, ss_base(ctxt, ops),
-					   c->regs[VCPU_REGS_RSP]);
+	c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+	c->dst.addr.mem.seg = VCPU_SREG_SS;
 }
 
 static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1139,10 +1141,11 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
 {
 	struct decode_cache *c = &ctxt->decode;
 	int rc;
+	struct segmented_address addr;
 
-	rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops),
-						       c->regs[VCPU_REGS_RSP]),
-			   dest, len);
+	addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+	addr.seg = VCPU_SREG_SS;
+	rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
@@ -2223,14 +2226,15 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 }
 
-static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
+static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
 			    int reg, struct operand *op)
 {
 	struct decode_cache *c = &ctxt->decode;
 	int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
 
 	register_address_increment(c, &c->regs[reg], df * op->bytes);
-	op->addr.mem = register_address(c,  base, c->regs[reg]);
+	op->addr.mem.ea = register_address(c, c->regs[reg]);
+	op->addr.mem.seg = seg;
 }
 
 static int em_push(struct x86_emulate_ctxt *ctxt)
@@ -2639,7 +2643,7 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
 
 	op->type = OP_IMM;
 	op->bytes = size;
-	op->addr.mem = c->eip;
+	op->addr.mem.ea = c->eip;
 	/* NB. Immediates are sign-extended as necessary. */
 	switch (op->bytes) {
 	case 1:
@@ -2821,14 +2825,13 @@ done_prefixes:
 	if (!c->has_seg_override)
 		set_seg_override(c, VCPU_SREG_DS);
 
-	if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d))
-		memop.addr.mem += seg_override_base(ctxt, ops, c);
+	memop.addr.mem.seg = seg_override(ctxt, ops, c);
 
 	if (memop.type == OP_MEM && c->ad_bytes != 8)
-		memop.addr.mem = (u32)memop.addr.mem;
+		memop.addr.mem.ea = (u32)memop.addr.mem.ea;
 
 	if (memop.type == OP_MEM && c->rip_relative)
-		memop.addr.mem += c->eip;
+		memop.addr.mem.ea += c->eip;
 
 	/*
 	 * Decode and fetch the source operand: register, memory
@@ -2880,14 +2883,14 @@ done_prefixes:
 	case SrcSI:
 		c->src.type = OP_MEM;
 		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->src.addr.mem =
-			register_address(c,  seg_override_base(ctxt, ops, c),
-					 c->regs[VCPU_REGS_RSI]);
+		c->src.addr.mem.ea =
+			register_address(c, c->regs[VCPU_REGS_RSI]);
+		c->src.addr.mem.seg = seg_override(ctxt, ops, c),
 		c->src.val = 0;
 		break;
 	case SrcImmFAddr:
 		c->src.type = OP_IMM;
-		c->src.addr.mem = c->eip;
+		c->src.addr.mem.ea = c->eip;
 		c->src.bytes = c->op_bytes + 2;
 		insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
 		break;
@@ -2934,7 +2937,7 @@ done_prefixes:
 		break;
 	case DstImmUByte:
 		c->dst.type = OP_IMM;
-		c->dst.addr.mem = c->eip;
+		c->dst.addr.mem.ea = c->eip;
 		c->dst.bytes = 1;
 		c->dst.val = insn_fetch(u8, 1, c->eip);
 		break;
@@ -2959,9 +2962,9 @@ done_prefixes:
 	case DstDI:
 		c->dst.type = OP_MEM;
 		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.addr.mem =
-			register_address(c, es_base(ctxt, ops),
-					 c->regs[VCPU_REGS_RDI]);
+		c->dst.addr.mem.ea =
+			register_address(c, c->regs[VCPU_REGS_RDI]);
+		c->dst.addr.mem.seg = VCPU_SREG_ES;
 		c->dst.val = 0;
 		break;
 	case ImplicitOps:
@@ -3040,7 +3043,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 	}
 
 	if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
-		rc = read_emulated(ctxt, ops, c->src.addr.mem,
+		rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem),
 					c->src.valptr, c->src.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
@@ -3048,7 +3051,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 	}
 
 	if (c->src2.type == OP_MEM) {
-		rc = read_emulated(ctxt, ops, c->src2.addr.mem,
+		rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem),
 					&c->src2.val, c->src2.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
@@ -3060,7 +3063,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 
 	if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
 		/* optimisation - avoid slow emulated read if Mov */
-		rc = read_emulated(ctxt, ops, c->dst.addr.mem,
+		rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem),
 				   &c->dst.val, c->dst.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
@@ -3211,7 +3214,7 @@ special_insn:
 		c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
 		break;
 	case 0x8d: /* lea r16/r32, m */
-		c->dst.val = c->src.addr.mem;
+		c->dst.val = c->src.addr.mem.ea;
 		break;
 	case 0x8e: { /* mov seg, r/m16 */
 		uint16_t sel;
@@ -3438,11 +3441,11 @@ writeback:
 	c->dst.type = saved_dst_type;
 
 	if ((c->d & SrcMask) == SrcSI)
-		string_addr_inc(ctxt, seg_override_base(ctxt, ops, c),
+		string_addr_inc(ctxt, seg_override(ctxt, ops, c),
 				VCPU_REGS_RSI, &c->src);
 
 	if ((c->d & DstMask) == DstDI)
-		string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI,
+		string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
 				&c->dst);
 
 	if (c->rep_prefix && (c->d & String)) {
@@ -3535,7 +3538,8 @@ twobyte_insn:
 			emulate_ud(ctxt);
 			goto done;
 		case 7: /* invlpg*/
-			emulate_invlpg(ctxt->vcpu, c->src.addr.mem);
+			emulate_invlpg(ctxt->vcpu,
+				       linear(ctxt, c->src.addr.mem));
 			/* Disable writeback. */
 			c->dst.type = OP_NONE;
 			break;
-- 
cgit v0.10.2


From 30b31ab6823988263c72a215fb875edec6161250 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 17 Nov 2010 15:28:22 +0200
Subject: KVM: x86 emulator: do not perform address calculations on linear
 addresses

Linear addresses are supposed to already have segment checks performed on them;
if we play with these addresses the checks become invalid.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e967055..bdbbb18 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -568,7 +568,8 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
 			   ctxt->vcpu, NULL);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
-	rc = ops->read_std(linear(ctxt, addr) + 2, address, op_bytes,
+	addr.ea += 2;
+	rc = ops->read_std(linear(ctxt, addr), address, op_bytes,
 			   ctxt->vcpu, NULL);
 	return rc;
 }
-- 
cgit v0.10.2


From 104f226bfd0a607ca0e804ae4907555374f72cd9 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 18 Nov 2010 13:12:52 +0200
Subject: KVM: VMX: Fold __vmx_vcpu_run() into vmx_vcpu_run()

cea15c2 ("KVM: Move KVM context switch into own function") split vmx_vcpu_run()
to prevent multiple copies of the context switch from being generated (causing
problems due to a label).  This patch folds them back together again and adds
the __noclone attribute to prevent the label from being duplicated.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 92612fb..6bf807a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3904,17 +3904,33 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
 #define Q "l"
 #endif
 
-/*
- * We put this into a separate noinline function to prevent the compiler
- * from duplicating the code. This is needed because this code
- * uses non local labels that cannot be duplicated.
- * Do not put any flow control into this function.
- * Better would be to put this whole monstrosity into a .S file.
- */
-static void noinline do_vmx_vcpu_run(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	asm volatile(
+
+	/* Record the guest's net vcpu time for enforced NMI injections. */
+	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
+		vmx->entry_time = ktime_get();
+
+	/* Don't enter VMX if guest state is invalid, let the exit handler
+	   start emulation until we arrive back to a valid state */
+	if (vmx->emulation_required && emulate_invalid_guest_state)
+		return;
+
+	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+
+	/* When single-stepping over STI and MOV SS, we must clear the
+	 * corresponding interruptibility bits in the guest state. Otherwise
+	 * vmentry fails as it then expects bit 14 (BS) in pending debug
+	 * exceptions being set, but that's not correct for the guest debugging
+	 * case. */
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+		vmx_set_interrupt_shadow(vcpu, 0);
+
+	asm(
 		/* Store host registers */
 		"push %%"R"dx; push %%"R"bp;"
 		"push %%"R"cx \n\t"
@@ -4009,35 +4025,6 @@ static void noinline do_vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
 #endif
 	      );
-}
-
-static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	/* Record the guest's net vcpu time for enforced NMI injections. */
-	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
-		vmx->entry_time = ktime_get();
-
-	/* Don't enter VMX if guest state is invalid, let the exit handler
-	   start emulation until we arrive back to a valid state */
-	if (vmx->emulation_required && emulate_invalid_guest_state)
-		return;
-
-	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
-		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
-	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
-		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
-
-	/* When single-stepping over STI and MOV SS, we must clear the
-	 * corresponding interruptibility bits in the guest state. Otherwise
-	 * vmentry fails as it then expects bit 14 (BS) in pending debug
-	 * exceptions being set, but that's not correct for the guest debugging
-	 * case. */
-	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-		vmx_set_interrupt_shadow(vcpu, 0);
-
-	do_vmx_vcpu_run(vcpu);
 
 	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
 				  | (1 << VCPU_EXREG_PDPTR));
-- 
cgit v0.10.2


From bd2b53b20fcd0d6c4c815b54e6d464e34429d3a4 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 18 Nov 2010 19:09:08 +0200
Subject: KVM: fast-path msi injection with irqfd

Store irq routing table pointer in the irqfd object,
and use that to inject MSI directly without bouncing out to
a kernel thread.

While we touch this structure, rearrange irqfd fields to make fastpath
better packed for better cache utilization.

This also adds some comments about locking rules and rcu usage in code.

Some notes on the design:
- Use pointer into the rt instead of copying an entry,
  to make it possible to use rcu, thus side-stepping
  locking complexities.  We also save some memory this way.
- Old workqueue code is still used for level irqs.
  I don't think we DTRT with level anyway, however,
  it seems easier to keep the code around as
  it has been thought through and debugged, and fix level later than
  rip out and re-instate it later.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Marcelo Tosatti <mtosatti@redhat.com>
Acked-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4bd663d..f17beae 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -17,6 +17,7 @@
 #include <linux/preempt.h>
 #include <linux/msi.h>
 #include <linux/slab.h>
+#include <linux/rcupdate.h>
 #include <asm/signal.h>
 
 #include <linux/kvm.h>
@@ -240,6 +241,10 @@ struct kvm {
 
 	struct mutex irq_lock;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
+	/*
+	 * Update side is protected by irq_lock and,
+	 * if configured, irqfds.lock.
+	 */
 	struct kvm_irq_routing_table __rcu *irq_routing;
 	struct hlist_head mask_notifier_list;
 	struct hlist_head irq_ack_notifier_list;
@@ -511,6 +516,8 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
 				   unsigned long *deliver_bitmask);
 #endif
 int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
+		int irq_source_id, int level);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
 				   struct kvm_irq_ack_notifier *kian);
@@ -652,17 +659,26 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
 void kvm_eventfd_init(struct kvm *kvm);
 int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags);
 void kvm_irqfd_release(struct kvm *kvm);
+void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *);
 int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
 
 #else
 
 static inline void kvm_eventfd_init(struct kvm *kvm) {}
+
 static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
 {
 	return -EINVAL;
 }
 
 static inline void kvm_irqfd_release(struct kvm *kvm) {}
+
+static inline void kvm_irq_routing_update(struct kvm *kvm,
+					  struct kvm_irq_routing_table *irq_rt)
+{
+	rcu_assign_pointer(kvm->irq_routing, irq_rt);
+}
+
 static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 {
 	return -ENOSYS;
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index c1f1e3c..2ca4535 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -44,14 +44,19 @@
  */
 
 struct _irqfd {
-	struct kvm               *kvm;
-	struct eventfd_ctx       *eventfd;
-	int                       gsi;
-	struct list_head          list;
-	poll_table                pt;
-	wait_queue_t              wait;
-	struct work_struct        inject;
-	struct work_struct        shutdown;
+	/* Used for MSI fast-path */
+	struct kvm *kvm;
+	wait_queue_t wait;
+	/* Update side is protected by irqfds.lock */
+	struct kvm_kernel_irq_routing_entry __rcu *irq_entry;
+	/* Used for level IRQ fast-path */
+	int gsi;
+	struct work_struct inject;
+	/* Used for setup/shutdown */
+	struct eventfd_ctx *eventfd;
+	struct list_head list;
+	poll_table pt;
+	struct work_struct shutdown;
 };
 
 static struct workqueue_struct *irqfd_cleanup_wq;
@@ -125,14 +130,22 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
 {
 	struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
 	unsigned long flags = (unsigned long)key;
+	struct kvm_kernel_irq_routing_entry *irq;
+	struct kvm *kvm = irqfd->kvm;
 
-	if (flags & POLLIN)
+	if (flags & POLLIN) {
+		rcu_read_lock();
+		irq = rcu_dereference(irqfd->irq_entry);
 		/* An event has been signaled, inject an interrupt */
-		schedule_work(&irqfd->inject);
+		if (irq)
+			kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1);
+		else
+			schedule_work(&irqfd->inject);
+		rcu_read_unlock();
+	}
 
 	if (flags & POLLHUP) {
 		/* The eventfd is closing, detach from KVM */
-		struct kvm *kvm = irqfd->kvm;
 		unsigned long flags;
 
 		spin_lock_irqsave(&kvm->irqfds.lock, flags);
@@ -163,9 +176,31 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
 	add_wait_queue(wqh, &irqfd->wait);
 }
 
+/* Must be called under irqfds.lock */
+static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd,
+			 struct kvm_irq_routing_table *irq_rt)
+{
+	struct kvm_kernel_irq_routing_entry *e;
+	struct hlist_node *n;
+
+	if (irqfd->gsi >= irq_rt->nr_rt_entries) {
+		rcu_assign_pointer(irqfd->irq_entry, NULL);
+		return;
+	}
+
+	hlist_for_each_entry(e, n, &irq_rt->map[irqfd->gsi], link) {
+		/* Only fast-path MSI. */
+		if (e->type == KVM_IRQ_ROUTING_MSI)
+			rcu_assign_pointer(irqfd->irq_entry, e);
+		else
+			rcu_assign_pointer(irqfd->irq_entry, NULL);
+	}
+}
+
 static int
 kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
 {
+	struct kvm_irq_routing_table *irq_rt;
 	struct _irqfd *irqfd, *tmp;
 	struct file *file = NULL;
 	struct eventfd_ctx *eventfd = NULL;
@@ -215,6 +250,10 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
 		goto fail;
 	}
 
+	irq_rt = rcu_dereference_protected(kvm->irq_routing,
+					   lockdep_is_held(&kvm->irqfds.lock));
+	irqfd_update(kvm, irqfd, irq_rt);
+
 	events = file->f_op->poll(file, &irqfd->pt);
 
 	list_add_tail(&irqfd->list, &kvm->irqfds.items);
@@ -271,8 +310,17 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
 	spin_lock_irq(&kvm->irqfds.lock);
 
 	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
-		if (irqfd->eventfd == eventfd && irqfd->gsi == gsi)
+		if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) {
+			/*
+			 * This rcu_assign_pointer is needed for when
+			 * another thread calls kvm_irq_routing_update before
+			 * we flush workqueue below.
+			 * It is paired with synchronize_rcu done by caller
+			 * of that function.
+			 */
+			rcu_assign_pointer(irqfd->irq_entry, NULL);
 			irqfd_deactivate(irqfd);
+		}
 	}
 
 	spin_unlock_irq(&kvm->irqfds.lock);
@@ -322,6 +370,25 @@ kvm_irqfd_release(struct kvm *kvm)
 }
 
 /*
+ * Change irq_routing and irqfd.
+ * Caller must invoke synchronize_rcu afterwards.
+ */
+void kvm_irq_routing_update(struct kvm *kvm,
+			    struct kvm_irq_routing_table *irq_rt)
+{
+	struct _irqfd *irqfd;
+
+	spin_lock_irq(&kvm->irqfds.lock);
+
+	rcu_assign_pointer(kvm->irq_routing, irq_rt);
+
+	list_for_each_entry(irqfd, &kvm->irqfds.items, list)
+		irqfd_update(kvm, irqfd, irq_rt);
+
+	spin_unlock_irq(&kvm->irqfds.lock);
+}
+
+/*
  * create a host-wide workqueue for issuing deferred shutdown requests
  * aggregated from all vm* instances. We need our own isolated single-thread
  * queue to prevent deadlock against flushing the normal work-queue.
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 8edca91..9f614b4 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -114,8 +114,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 	return r;
 }
 
-static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
-		       struct kvm *kvm, int irq_source_id, int level)
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+		struct kvm *kvm, int irq_source_id, int level)
 {
 	struct kvm_lapic_irq irq;
 
@@ -409,8 +409,9 @@ int kvm_set_irq_routing(struct kvm *kvm,
 
 	mutex_lock(&kvm->irq_lock);
 	old = kvm->irq_routing;
-	rcu_assign_pointer(kvm->irq_routing, new);
+	kvm_irq_routing_update(kvm, new);
 	mutex_unlock(&kvm->irq_lock);
+
 	synchronize_rcu();
 
 	new = old;
-- 
cgit v0.10.2


From aa17911e3c21b63e3bf94c580ed029d6dad816b4 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 17 Nov 2010 18:44:19 +0200
Subject: KVM: Record instruction set in kvm_exit tracepoint

exit_reason's meaning depends on the instruction set; record it so a trace
taken on one machine can be interpreted on another.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7c7f03b..78a2308 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2980,7 +2980,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 	struct kvm_run *kvm_run = vcpu->run;
 	u32 exit_code = svm->vmcb->control.exit_code;
 
-	trace_kvm_exit(exit_code, vcpu);
+	trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
 
 	if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
 		vcpu->arch.cr0 = svm->vmcb->save.cr0;
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index a6544b8..1061022 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -178,21 +178,26 @@ TRACE_EVENT(kvm_apic,
 #define trace_kvm_apic_read(reg, val)		trace_kvm_apic(0, reg, val)
 #define trace_kvm_apic_write(reg, val)		trace_kvm_apic(1, reg, val)
 
+#define KVM_ISA_VMX   1
+#define KVM_ISA_SVM   2
+
 /*
  * Tracepoint for kvm guest exit:
  */
 TRACE_EVENT(kvm_exit,
-	TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu),
-	TP_ARGS(exit_reason, vcpu),
+	TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
+	TP_ARGS(exit_reason, vcpu, isa),
 
 	TP_STRUCT__entry(
 		__field(	unsigned int,	exit_reason	)
 		__field(	unsigned long,	guest_rip	)
+		__field(	u32,	        isa             )
 	),
 
 	TP_fast_assign(
 		__entry->exit_reason	= exit_reason;
 		__entry->guest_rip	= kvm_rip_read(vcpu);
+		__entry->isa            = isa;
 	),
 
 	TP_printk("reason %s rip 0x%lx",
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6bf807a..2495910 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3700,7 +3700,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	u32 exit_reason = vmx->exit_reason;
 	u32 vectoring_info = vmx->idt_vectoring_info;
 
-	trace_kvm_exit(exit_reason, vcpu);
+	trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
 
 	/* If guest state is invalid, start emulating */
 	if (vmx->emulation_required && emulate_invalid_guest_state)
-- 
cgit v0.10.2


From 586f9607962cd982293759a4e95ff06e75be5225 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 18 Nov 2010 13:09:54 +0200
Subject: KVM: Add instruction-set-specific exit qualifications to kvm_exit
 trace

The exit reason alone is insufficient to understand exactly why an exit
occurred; add ISA-specific trace parameters for additional information.

Because fetching these parameters is expensive on vmx, and because these
parameters are fetched even if tracing is disabled, we fetch the
parameters via a callback instead of as traditional trace arguments.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f1e8d5b..3cc80c4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -594,6 +594,7 @@ struct kvm_x86_ops {
 
 	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
+	void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
 	const struct trace_print_flags *exit_reasons_str;
 };
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 78a2308..28274cf 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2974,6 +2974,14 @@ void dump_vmcb(struct kvm_vcpu *vcpu)
 
 }
 
+static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+{
+	struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+
+	*info1 = control->exit_info_1;
+	*info2 = control->exit_info_2;
+}
+
 static int handle_exit(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -3684,7 +3692,9 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.get_tdp_level = get_npt_level,
 	.get_mt_mask = svm_get_mt_mask,
 
+	.get_exit_info = svm_get_exit_info,
 	.exit_reasons_str = svm_exit_reasons_str,
+
 	.get_lpage_level = svm_get_lpage_level,
 
 	.cpuid_update = svm_cpuid_update,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 1061022..1357d7c 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -192,18 +192,22 @@ TRACE_EVENT(kvm_exit,
 		__field(	unsigned int,	exit_reason	)
 		__field(	unsigned long,	guest_rip	)
 		__field(	u32,	        isa             )
+		__field(	u64,	        info1           )
+		__field(	u64,	        info2           )
 	),
 
 	TP_fast_assign(
 		__entry->exit_reason	= exit_reason;
 		__entry->guest_rip	= kvm_rip_read(vcpu);
 		__entry->isa            = isa;
+		kvm_x86_ops->get_exit_info(vcpu, &__entry->info1,
+					   &__entry->info2);
 	),
 
-	TP_printk("reason %s rip 0x%lx",
+	TP_printk("reason %s rip 0x%lx info %llx %llx",
 		 ftrace_print_symbols_seq(p, __entry->exit_reason,
 					  kvm_x86_ops->exit_reasons_str),
-		 __entry->guest_rip)
+		 __entry->guest_rip, __entry->info1, __entry->info2)
 );
 
 /*
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2495910..ab05ff6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3690,6 +3690,12 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 static const int kvm_vmx_max_exit_handlers =
 	ARRAY_SIZE(kvm_vmx_exit_handlers);
 
+static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+{
+	*info1 = vmcs_readl(EXIT_QUALIFICATION);
+	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
+}
+
 /*
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
@@ -4334,7 +4340,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.get_tdp_level = get_ept_level,
 	.get_mt_mask = vmx_get_mt_mask,
 
+	.get_exit_info = vmx_get_exit_info,
 	.exit_reasons_str = vmx_exit_reasons_str,
+
 	.get_lpage_level = vmx_get_lpage_level,
 
 	.cpuid_update = vmx_cpuid_update,
-- 
cgit v0.10.2


From 27923eb19c5d1197bd9d1472abdc2e749f21387a Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 25 Nov 2010 10:25:44 +0100
Subject: KVM: PPC: Fix compile warning

KVM compilation fails with the following warning:

include/linux/kvm_host.h: In function 'kvm_irq_routing_update':
include/linux/kvm_host.h:679:2: error: 'struct kvm' has no member named 'irq_routing'

That function is only used and reasonable to have on systems that implement
an in-kernel interrupt chip. PPC doesn't.

Fix by #ifdef'ing it out when no irqchip is available.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f17beae..da0794f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -673,11 +673,13 @@ static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
 
 static inline void kvm_irqfd_release(struct kvm *kvm) {}
 
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
 static inline void kvm_irq_routing_update(struct kvm *kvm,
 					  struct kvm_irq_routing_table *irq_rt)
 {
 	rcu_assign_pointer(kvm->irq_routing, irq_rt);
 }
+#endif
 
 static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 {
-- 
cgit v0.10.2


From 30bfb3c4256422221cc763ff6e749ce8aca8e5c9 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 19 Nov 2010 17:01:40 +0800
Subject: KVM: MMU: fix forgot flush tlbs on sync_page path

We should flush all TLBs after dropping an spte on the sync_page path;
otherwise the following sequence is possible.

Quote from Avi:
| sync_page
| drop_spte
| kvm_mmu_notifier_invalidate_page
| kvm_unmap_rmapp
| spte doesn't exist -> no flush
| page is freed
| guest can write into freed page?

KVM-Stable-Tag.
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 590bf12..ca0e5e8 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -786,6 +786,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			else
 				nonpresent = shadow_notrap_nonpresent_pte;
 			drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
+			kvm_flush_remote_tlbs(vcpu->kvm);
 			continue;
 		}
 
-- 
cgit v0.10.2


From b330aa0c7df1ece45f45566c45ea44f01e379ab0 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 19 Nov 2010 17:02:35 +0800
Subject: KVM: MMU: don't drop spte if overwrite it from W to RO

We only need to flush the TLB when overwriting a writable spte with a
read-only one; there is no need to drop the spte.

Move this operation into set_spte() so that the sync_page path is covered too.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ab100a7..29b2ec4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1960,7 +1960,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		    gfn_t gfn, pfn_t pfn, bool speculative,
 		    bool can_unsync, bool reset_host_protection)
 {
-	u64 spte;
+	u64 spte, entry = *sptep;
 	int ret = 0;
 
 	/*
@@ -2031,6 +2031,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 set_pte:
 	update_spte(sptep, spte);
+	/*
+	 * If we overwrite a writable spte with a read-only one we
+	 * should flush remote TLBs. Otherwise rmap_write_protect
+	 * will find a read-only spte, even though the writable spte
+	 * might be cached on a CPU's TLB.
+	 */
+	if (is_writable_pte(entry) && !is_writable_pte(*sptep))
+		kvm_flush_remote_tlbs(vcpu->kvm);
 done:
 	return ret;
 }
@@ -2069,16 +2077,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 				 spte_to_pfn(*sptep), pfn);
 			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
 			kvm_flush_remote_tlbs(vcpu->kvm);
-		/*
-		 * If we overwrite a writable spte with a read-only one,
-		 * drop it and flush remote TLBs. Otherwise rmap_write_protect
-		 * will find a read-only spte, even though the writable spte
-		 * might be cached on a CPU's TLB.
-		 */
-		} else if (is_writable_pte(*sptep) &&
-			  (!(pte_access & ACC_WRITE_MASK) || !dirty)) {
-			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
-			kvm_flush_remote_tlbs(vcpu->kvm);
 		} else
 			was_rmapped = 1;
 	}
-- 
cgit v0.10.2


From 9bdbba13b8868815198e4fdbd484769ef76392d9 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 19 Nov 2010 17:03:22 +0800
Subject: KVM: MMU: rename 'reset_host_protection' to 'host_writable'

Rename it to better reflect its meaning.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 29b2ec4..5910492 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1958,7 +1958,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		    unsigned pte_access, int user_fault,
 		    int write_fault, int dirty, int level,
 		    gfn_t gfn, pfn_t pfn, bool speculative,
-		    bool can_unsync, bool reset_host_protection)
+		    bool can_unsync, bool host_writable)
 {
 	u64 spte, entry = *sptep;
 	int ret = 0;
@@ -1985,7 +1985,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
 			kvm_is_mmio_pfn(pfn));
 
-	if (reset_host_protection)
+	if (host_writable)
 		spte |= SPTE_HOST_WRITEABLE;
 
 	spte |= (u64)pfn << PAGE_SHIFT;
@@ -2048,7 +2048,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			 int user_fault, int write_fault, int dirty,
 			 int *ptwrite, int level, gfn_t gfn,
 			 pfn_t pfn, bool speculative,
-			 bool reset_host_protection)
+			 bool host_writable)
 {
 	int was_rmapped = 0;
 	int rmap_count;
@@ -2083,7 +2083,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 	if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
 		      dirty, level, gfn, pfn, speculative, true,
-		      reset_host_protection)) {
+		      host_writable)) {
 		if (write_fault)
 			*ptwrite = 1;
 		kvm_mmu_flush_tlb(vcpu);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ca0e5e8..57619ed 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -329,7 +329,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 		return;
 	kvm_get_pfn(pfn);
 	/*
-	 * we call mmu_set_spte() with reset_host_protection = true beacuse that
+	 * we call mmu_set_spte() with host_writable = true beacuse that
 	 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
 	 */
 	mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
@@ -744,7 +744,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			    bool clear_unsync)
 {
 	int i, offset, nr_present;
-	bool reset_host_protection;
+	bool host_writable;
 	gpa_t first_pte_gpa;
 
 	offset = nr_present = 0;
@@ -794,14 +794,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
 		if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
 			pte_access &= ~ACC_WRITE_MASK;
-			reset_host_protection = 0;
+			host_writable = 0;
 		} else {
-			reset_host_protection = 1;
+			host_writable = 1;
 		}
 		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
 			 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
 			 spte_to_pfn(sp->spt[i]), true, false,
-			 reset_host_protection);
+			 host_writable);
 	}
 
 	return !nr_present;
-- 
cgit v0.10.2


From a4a8e6f76ecf963fa7e4d74b3635655a2033a27b Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 19 Nov 2010 17:04:03 +0800
Subject: KVM: MMU: remove 'clear_unsync' parameter

Remove it, since the same information can be derived from sp->unsync.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3cc80c4..1452478 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -250,7 +250,7 @@ struct kvm_mmu {
 	void (*prefetch_page)(struct kvm_vcpu *vcpu,
 			      struct kvm_mmu_page *page);
 	int (*sync_page)(struct kvm_vcpu *vcpu,
-			 struct kvm_mmu_page *sp, bool clear_unsync);
+			 struct kvm_mmu_page *sp);
 	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
 	hpa_t root_hpa;
 	int root_level;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5910492..3db0cd4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1156,7 +1156,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
 }
 
 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
-			       struct kvm_mmu_page *sp, bool clear_unsync)
+			       struct kvm_mmu_page *sp)
 {
 	return 1;
 }
@@ -1286,7 +1286,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	if (clear_unsync)
 		kvm_unlink_unsync_page(vcpu->kvm, sp);
 
-	if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
+	if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
 		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
 		return 1;
 	}
@@ -1327,12 +1327,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
 			continue;
 
 		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+		kvm_unlink_unsync_page(vcpu->kvm, s);
 		if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
-			(vcpu->arch.mmu.sync_page(vcpu, s, true))) {
+			(vcpu->arch.mmu.sync_page(vcpu, s))) {
 			kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
 			continue;
 		}
-		kvm_unlink_unsync_page(vcpu->kvm, s);
 		flush = true;
 	}
 
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 57619ed..60f00db 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -740,8 +740,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
  * - The spte has a reference to the struct page, so the pfn for a given gfn
  *   can't change unless all sptes pointing to it are nuked first.
  */
-static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-			    bool clear_unsync)
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
 	int i, offset, nr_present;
 	bool host_writable;
@@ -781,7 +780,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			u64 nonpresent;
 
 			if (rsvd_bits_set || is_present_gpte(gpte) ||
-			      !clear_unsync)
+			      sp->unsync)
 				nonpresent = shadow_trap_nonpresent_pte;
 			else
 				nonpresent = shadow_notrap_nonpresent_pte;
-- 
cgit v0.10.2


From 407c61c6bd6a51b56d02f8bbad8aadf19db8c7b5 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 23 Nov 2010 11:08:42 +0800
Subject: KVM: MMU: abstract invalid guest pte mapping

Introduce a common function, prefetch_invalid_gpte(), to handle invalid
guest ptes.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3db0cd4..53ff31f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3075,9 +3075,6 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 		return;
         }
 
-	if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
-		return;
-
 	++vcpu->kvm->stat.mmu_pte_updated;
 	if (!sp->role.cr4_pae)
 		paging32_update_pte(vcpu, sp, spte, new);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 60f00db..a43f4cc 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -299,25 +299,42 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
 					addr, access);
 }
 
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+				    struct kvm_mmu_page *sp, u64 *spte,
+				    pt_element_t gpte)
+{
+	u64 nonpresent = shadow_trap_nonpresent_pte;
+
+	if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+		goto no_present;
+
+	if (!is_present_gpte(gpte)) {
+		if (!sp->unsync)
+			nonpresent = shadow_notrap_nonpresent_pte;
+		goto no_present;
+	}
+
+	if (!(gpte & PT_ACCESSED_MASK))
+		goto no_present;
+
+	return false;
+
+no_present:
+	drop_spte(vcpu->kvm, spte, nonpresent);
+	return true;
+}
+
 static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			      u64 *spte, const void *pte)
 {
 	pt_element_t gpte;
 	unsigned pte_access;
 	pfn_t pfn;
-	u64 new_spte;
 
 	gpte = *(const pt_element_t *)pte;
-	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
-		if (!is_present_gpte(gpte)) {
-			if (sp->unsync)
-				new_spte = shadow_trap_nonpresent_pte;
-			else
-				new_spte = shadow_notrap_nonpresent_pte;
-			__set_spte(spte, new_spte);
-		}
+	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
 		return;
-	}
+
 	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
 	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
 	if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
@@ -364,7 +381,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 				u64 *sptep)
 {
 	struct kvm_mmu_page *sp;
-	struct kvm_mmu *mmu = &vcpu->arch.mmu;
 	pt_element_t *gptep = gw->prefetch_ptes;
 	u64 *spte;
 	int i;
@@ -395,16 +411,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 
 		gpte = gptep[i];
 
-		if (is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL))
-			continue;
-
-		if (!is_present_gpte(gpte)) {
-			if (!sp->unsync)
-				__set_spte(spte, shadow_notrap_nonpresent_pte);
-			continue;
-		}
-
-		if (!(gpte & PT_ACCESSED_MASK))
+		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
 			continue;
 
 		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
@@ -761,7 +768,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		pt_element_t gpte;
 		gpa_t pte_gpa;
 		gfn_t gfn;
-		bool rsvd_bits_set;
 
 		if (!is_shadow_present_pte(sp->spt[i]))
 			continue;
@@ -773,18 +779,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 			return -EINVAL;
 
 		gfn = gpte_to_gfn(gpte);
-		rsvd_bits_set = is_rsvd_bits_set(&vcpu->arch.mmu, gpte,
-						 PT_PAGE_TABLE_LEVEL);
-		if (rsvd_bits_set || gfn != sp->gfns[i] ||
-		      !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) {
-			u64 nonpresent;
-
-			if (rsvd_bits_set || is_present_gpte(gpte) ||
-			      sp->unsync)
-				nonpresent = shadow_trap_nonpresent_pte;
-			else
-				nonpresent = shadow_notrap_nonpresent_pte;
-			drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
+
+		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
+			kvm_flush_remote_tlbs(vcpu->kvm);
+			continue;
+		}
+
+		if (gfn != sp->gfns[i]) {
+			drop_spte(vcpu->kvm, &sp->spt[i],
+				      shadow_trap_nonpresent_pte);
 			kvm_flush_remote_tlbs(vcpu->kvm);
 			continue;
 		}
-- 
cgit v0.10.2


From a4ee1ca4a36e7857d90ae8c2b85f1bde9a042c10 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 23 Nov 2010 11:13:00 +0800
Subject: KVM: MMU: delay flush all tlbs on sync_page path

Quote from Avi:
| I don't think we need to flush immediately; set a "tlb dirty" bit somewhere
| that is cleared when we flush the tlb.  kvm_mmu_notifier_invalidate_page()
| can consult the bit and force a flush if set.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a43f4cc..2b3d66c 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -746,6 +746,14 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
  * Using the cached information from sp->gfns is safe because:
  * - The spte has a reference to the struct page, so the pfn for a given gfn
  *   can't change unless all sptes pointing to it are nuked first.
+ *
+ * Note:
+ *   We should flush all tlbs if spte is dropped even though guest is
+ *   responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page
+ *   and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
+ *   used by guest then tlbs are not flushed, so guest is allowed to access the
+ *   freed pages.
+ *   And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
  */
 static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
@@ -781,14 +789,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		gfn = gpte_to_gfn(gpte);
 
 		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
-			kvm_flush_remote_tlbs(vcpu->kvm);
+			vcpu->kvm->tlbs_dirty++;
 			continue;
 		}
 
 		if (gfn != sp->gfns[i]) {
 			drop_spte(vcpu->kvm, &sp->spt[i],
 				      shadow_trap_nonpresent_pte);
-			kvm_flush_remote_tlbs(vcpu->kvm);
+			vcpu->kvm->tlbs_dirty++;
 			continue;
 		}
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index da0794f..ac4e83a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -254,6 +254,7 @@ struct kvm {
 	struct mmu_notifier mmu_notifier;
 	unsigned long mmu_notifier_seq;
 	long mmu_notifier_count;
+	long tlbs_dirty;
 #endif
 };
 
@@ -382,6 +383,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
 void kvm_resched(struct kvm_vcpu *vcpu);
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
+
 void kvm_flush_remote_tlbs(struct kvm *kvm);
 void kvm_reload_remote_mmus(struct kvm *kvm);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5156d45..ee99b77 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -168,8 +168,12 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 
 void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
+	int dirty_count = kvm->tlbs_dirty;
+
+	smp_mb();
 	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
 		++kvm->stat.remote_tlb_flush;
+	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
 }
 
 void kvm_reload_remote_mmus(struct kvm *kvm)
@@ -249,7 +253,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
 	idx = srcu_read_lock(&kvm->srcu);
 	spin_lock(&kvm->mmu_lock);
 	kvm->mmu_notifier_seq++;
-	need_tlb_flush = kvm_unmap_hva(kvm, address);
+	need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
 	spin_unlock(&kvm->mmu_lock);
 	srcu_read_unlock(&kvm->srcu, idx);
 
@@ -293,6 +297,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	kvm->mmu_notifier_count++;
 	for (; start < end; start += PAGE_SIZE)
 		need_tlb_flush |= kvm_unmap_hva(kvm, start);
+	need_tlb_flush |= kvm->tlbs_dirty;
 	spin_unlock(&kvm->mmu_lock);
 	srcu_read_unlock(&kvm->srcu, idx);
 
-- 
cgit v0.10.2


From da9cb575b1127f84984b8ad6d973dcc05ac036dd Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 22 Nov 2010 17:53:21 +0200
Subject: KVM: x86 emulator: introduce struct x86_exception to communicate
 faults

Introduce a structure that can contain an exception to be passed back
to main kvm code.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b48c133..b7c1127 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -15,6 +15,12 @@
 
 struct x86_emulate_ctxt;
 
+struct x86_exception {
+	u8 vector;
+	bool error_code_valid;
+	u16 error_code;
+};
+
 /*
  * x86_emulate_ops:
  *
@@ -229,9 +235,8 @@ struct x86_emulate_ctxt {
 
 	bool perm_ok; /* do not check permissions if true */
 
-	int exception; /* exception that happens during emulation or -1 */
-	u32 error_code; /* error code for exception */
-	bool error_code_valid;
+	bool have_exception;
+	struct x86_exception exception;
 
 	/* decode cache */
 	struct decode_cache decode;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index bdbbb18..18596e6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -469,9 +469,9 @@ static ulong linear(struct x86_emulate_ctxt *ctxt,
 static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
 				      u32 error, bool valid)
 {
-	ctxt->exception = vec;
-	ctxt->error_code = error;
-	ctxt->error_code_valid = valid;
+	ctxt->exception.vector = vec;
+	ctxt->exception.error_code = error;
+	ctxt->exception.error_code_valid = valid;
 }
 
 static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
@@ -3015,23 +3015,27 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 
 	if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
 		emulate_ud(ctxt);
+		rc = X86EMUL_PROPAGATE_FAULT;
 		goto done;
 	}
 
 	/* LOCK prefix is allowed only with some instructions */
 	if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
 		emulate_ud(ctxt);
+		rc = X86EMUL_PROPAGATE_FAULT;
 		goto done;
 	}
 
 	if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
 		emulate_ud(ctxt);
+		rc = X86EMUL_PROPAGATE_FAULT;
 		goto done;
 	}
 
 	/* Privileged instruction can be executed only in CPL=0 */
 	if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
 		emulate_gp(ctxt, 0);
+		rc = X86EMUL_PROPAGATE_FAULT;
 		goto done;
 	}
 
@@ -3210,6 +3214,7 @@ special_insn:
 	case 0x8c:  /* mov r/m, sreg */
 		if (c->modrm_reg > VCPU_SREG_GS) {
 			emulate_ud(ctxt);
+			rc = X86EMUL_PROPAGATE_FAULT;
 			goto done;
 		}
 		c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
@@ -3225,6 +3230,7 @@ special_insn:
 		if (c->modrm_reg == VCPU_SREG_CS ||
 		    c->modrm_reg > VCPU_SREG_GS) {
 			emulate_ud(ctxt);
+			rc = X86EMUL_PROPAGATE_FAULT;
 			goto done;
 		}
 
@@ -3357,6 +3363,7 @@ special_insn:
 		c->dst.bytes = min(c->dst.bytes, 4u);
 		if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
 			emulate_gp(ctxt, 0);
+			rc = X86EMUL_PROPAGATE_FAULT;
 			goto done;
 		}
 		if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
@@ -3371,6 +3378,7 @@ special_insn:
 		if (!emulator_io_permited(ctxt, ops, c->dst.val,
 					  c->src.bytes)) {
 			emulate_gp(ctxt, 0);
+			rc = X86EMUL_PROPAGATE_FAULT;
 			goto done;
 		}
 		ops->pio_out_emulated(c->src.bytes, c->dst.val,
@@ -3396,6 +3404,7 @@ special_insn:
 	case 0xfa: /* cli */
 		if (emulator_bad_iopl(ctxt, ops)) {
 			emulate_gp(ctxt, 0);
+			rc = X86EMUL_PROPAGATE_FAULT;
 			goto done;
 		} else
 			ctxt->eflags &= ~X86_EFLAGS_IF;
@@ -3403,6 +3412,7 @@ special_insn:
 	case 0xfb: /* sti */
 		if (emulator_bad_iopl(ctxt, ops)) {
 			emulate_gp(ctxt, 0);
+			rc = X86EMUL_PROPAGATE_FAULT;
 			goto done;
 		} else {
 			ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
@@ -3475,6 +3485,8 @@ writeback:
 	ctxt->eip = c->eip;
 
 done:
+	if (rc == X86EMUL_PROPAGATE_FAULT)
+		ctxt->have_exception = true;
 	return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
 
 twobyte_insn:
@@ -3537,6 +3549,7 @@ twobyte_insn:
 			break;
 		case 5: /* not defined */
 			emulate_ud(ctxt);
+			rc = X86EMUL_PROPAGATE_FAULT;
 			goto done;
 		case 7: /* invlpg*/
 			emulate_invlpg(ctxt->vcpu,
@@ -3567,6 +3580,7 @@ twobyte_insn:
 		case 5 ... 7:
 		case 9 ... 15:
 			emulate_ud(ctxt);
+			rc = X86EMUL_PROPAGATE_FAULT;
 			goto done;
 		}
 		c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
@@ -3575,6 +3589,7 @@ twobyte_insn:
 		if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
 		    (c->modrm_reg == 4 || c->modrm_reg == 5)) {
 			emulate_ud(ctxt);
+			rc = X86EMUL_PROPAGATE_FAULT;
 			goto done;
 		}
 		ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
@@ -3582,6 +3597,7 @@ twobyte_insn:
 	case 0x22: /* mov reg, cr */
 		if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
 			emulate_gp(ctxt, 0);
+			rc = X86EMUL_PROPAGATE_FAULT;
 			goto done;
 		}
 		c->dst.type = OP_NONE;
@@ -3590,6 +3606,7 @@ twobyte_insn:
 		if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
 		    (c->modrm_reg == 4 || c->modrm_reg == 5)) {
 			emulate_ud(ctxt);
+			rc = X86EMUL_PROPAGATE_FAULT;
 			goto done;
 		}
 
@@ -3598,6 +3615,7 @@ twobyte_insn:
 				 ~0ULL : ~0U), ctxt->vcpu) < 0) {
 			/* #UD condition is already handled by the code above */
 			emulate_gp(ctxt, 0);
+			rc = X86EMUL_PROPAGATE_FAULT;
 			goto done;
 		}
 
@@ -3609,6 +3627,7 @@ twobyte_insn:
 			| ((u64)c->regs[VCPU_REGS_RDX] << 32);
 		if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
 			emulate_gp(ctxt, 0);
+			rc = X86EMUL_PROPAGATE_FAULT;
 			goto done;
 		}
 		rc = X86EMUL_CONTINUE;
@@ -3617,6 +3636,7 @@ twobyte_insn:
 		/* rdmsr */
 		if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
 			emulate_gp(ctxt, 0);
+			rc = X86EMUL_PROPAGATE_FAULT;
 			goto done;
 		} else {
 			c->regs[VCPU_REGS_RAX] = (u32)msr_data;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 651cf90..0c90832 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4254,12 +4254,13 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
 static void inject_emulated_exception(struct kvm_vcpu *vcpu)
 {
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
-	if (ctxt->exception == PF_VECTOR)
+	if (ctxt->exception.vector == PF_VECTOR)
 		kvm_propagate_fault(vcpu);
-	else if (ctxt->error_code_valid)
-		kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
+	else if (ctxt->exception.error_code_valid)
+		kvm_queue_exception_e(vcpu, ctxt->exception.vector,
+				      ctxt->exception.error_code);
 	else
-		kvm_queue_exception(vcpu, ctxt->exception);
+		kvm_queue_exception(vcpu, ctxt->exception.vector);
 }
 
 static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
@@ -4371,7 +4372,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
 		init_emulate_ctxt(vcpu);
 		vcpu->arch.emulate_ctxt.interruptibility = 0;
-		vcpu->arch.emulate_ctxt.exception = -1;
+		vcpu->arch.emulate_ctxt.have_exception = false;
 		vcpu->arch.emulate_ctxt.perm_ok = false;
 
 		r = x86_decode_insn(&vcpu->arch.emulate_ctxt);
@@ -4437,7 +4438,7 @@ restart:
 	}
 
 done:
-	if (vcpu->arch.emulate_ctxt.exception >= 0) {
+	if (vcpu->arch.emulate_ctxt.have_exception) {
 		inject_emulated_exception(vcpu);
 		r = EMULATE_DONE;
 	} else if (vcpu->arch.pio.count) {
-- 
cgit v0.10.2


From bcc55cba9f1fcda68412c8c3d8579c56d90b16f2 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 22 Nov 2010 17:53:22 +0200
Subject: KVM: x86 emulator: make emulator memory callbacks return full
 exception

This way, they can return #GP, not just #PF.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b7c1127..87d017e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -70,7 +70,8 @@ struct x86_emulate_ops {
 	 *  @bytes: [IN ] Number of bytes to read from memory.
 	 */
 	int (*read_std)(unsigned long addr, void *val,
-			unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+			unsigned int bytes, struct kvm_vcpu *vcpu,
+			struct x86_exception *fault);
 
 	/*
 	 * write_std: Write bytes of standard (non-emulated/special) memory.
@@ -80,7 +81,8 @@ struct x86_emulate_ops {
 	 *  @bytes: [IN ] Number of bytes to write to memory.
 	 */
 	int (*write_std)(unsigned long addr, void *val,
-			 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+			 unsigned int bytes, struct kvm_vcpu *vcpu,
+			 struct x86_exception *fault);
 	/*
 	 * fetch: Read bytes of standard (non-emulated/special) memory.
 	 *        Used for instruction fetch.
@@ -89,7 +91,8 @@ struct x86_emulate_ops {
 	 *  @bytes: [IN ] Number of bytes to read from memory.
 	 */
 	int (*fetch)(unsigned long addr, void *val,
-			unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+		     unsigned int bytes, struct kvm_vcpu *vcpu,
+		     struct x86_exception *fault);
 
 	/*
 	 * read_emulated: Read bytes from emulated/special memory area.
@@ -100,7 +103,7 @@ struct x86_emulate_ops {
 	int (*read_emulated)(unsigned long addr,
 			     void *val,
 			     unsigned int bytes,
-			     unsigned int *error,
+			     struct x86_exception *fault,
 			     struct kvm_vcpu *vcpu);
 
 	/*
@@ -113,7 +116,7 @@ struct x86_emulate_ops {
 	int (*write_emulated)(unsigned long addr,
 			      const void *val,
 			      unsigned int bytes,
-			      unsigned int *error,
+			      struct x86_exception *fault,
 			      struct kvm_vcpu *vcpu);
 
 	/*
@@ -128,7 +131,7 @@ struct x86_emulate_ops {
 				const void *old,
 				const void *new,
 				unsigned int bytes,
-				unsigned int *error,
+				struct x86_exception *fault,
 				struct kvm_vcpu *vcpu);
 
 	int (*pio_in_emulated)(int size, unsigned short port, void *val,
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 18596e6..16ed6c1 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -512,7 +512,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
 		cur_size = fc->end - fc->start;
 		size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
 		rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
-				size, ctxt->vcpu, NULL);
+				size, ctxt->vcpu, &ctxt->exception);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		fc->end += size;
@@ -565,12 +565,12 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
 		op_bytes = 3;
 	*address = 0;
 	rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2,
-			   ctxt->vcpu, NULL);
+			   ctxt->vcpu, &ctxt->exception);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 	addr.ea += 2;
 	rc = ops->read_std(linear(ctxt, addr), address, op_bytes,
-			   ctxt->vcpu, NULL);
+			   ctxt->vcpu, &ctxt->exception);
 	return rc;
 }
 
@@ -816,7 +816,6 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
 {
 	int rc;
 	struct read_cache *mc = &ctxt->decode.mem_read;
-	u32 err;
 
 	while (size) {
 		int n = min(size, 8u);
@@ -824,10 +823,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
 		if (mc->pos < mc->end)
 			goto read_cached;
 
-		rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
-					ctxt->vcpu);
-		if (rc == X86EMUL_PROPAGATE_FAULT)
-			emulate_pf(ctxt);
+		rc = ops->read_emulated(addr, mc->data + mc->end, n,
+					&ctxt->exception, ctxt->vcpu);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		mc->end += n;
@@ -902,7 +899,6 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	struct desc_ptr dt;
 	u16 index = selector >> 3;
 	int ret;
-	u32 err;
 	ulong addr;
 
 	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
@@ -912,9 +908,8 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 		return X86EMUL_PROPAGATE_FAULT;
 	}
 	addr = dt.address + index * 8;
-	ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,  &err);
-	if (ret == X86EMUL_PROPAGATE_FAULT)
-		emulate_pf(ctxt);
+	ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,
+			    &ctxt->exception);
 
        return ret;
 }
@@ -926,7 +921,6 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 {
 	struct desc_ptr dt;
 	u16 index = selector >> 3;
-	u32 err;
 	ulong addr;
 	int ret;
 
@@ -938,9 +932,8 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	}
 
 	addr = dt.address + index * 8;
-	ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
-	if (ret == X86EMUL_PROPAGATE_FAULT)
-		emulate_pf(ctxt);
+	ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu,
+			     &ctxt->exception);
 
 	return ret;
 }
@@ -1087,7 +1080,6 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 {
 	int rc;
 	struct decode_cache *c = &ctxt->decode;
-	u32 err;
 
 	switch (c->dst.type) {
 	case OP_REG:
@@ -1100,17 +1092,15 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 					&c->dst.orig_val,
 					&c->dst.val,
 					c->dst.bytes,
-					&err,
+					&ctxt->exception,
 					ctxt->vcpu);
 		else
 			rc = ops->write_emulated(
 					linear(ctxt, c->dst.addr.mem),
 					&c->dst.val,
 					c->dst.bytes,
-					&err,
+					&ctxt->exception,
 					ctxt->vcpu);
-		if (rc == X86EMUL_PROPAGATE_FAULT)
-			emulate_pf(ctxt);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		break;
@@ -1283,7 +1273,6 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
 	gva_t cs_addr;
 	gva_t eip_addr;
 	u16 cs, eip;
-	u32 err;
 
 	/* TODO: Add limit checks */
 	c->src.val = ctxt->eflags;
@@ -1313,11 +1302,11 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
 	eip_addr = dt.address + (irq << 2);
 	cs_addr = dt.address + (irq << 2) + 2;
 
-	rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err);
+	rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err);
+	rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
@@ -1930,33 +1919,27 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 {
 	struct tss_segment_16 tss_seg;
 	int ret;
-	u32 err, new_tss_base = get_desc_base(new_desc);
+	u32 new_tss_base = get_desc_base(new_desc);
 
 	ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
-			    &err);
-	if (ret == X86EMUL_PROPAGATE_FAULT) {
+			    &ctxt->exception);
+	if (ret == X86EMUL_PROPAGATE_FAULT)
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt);
 		return ret;
-	}
 
 	save_state_to_tss16(ctxt, ops, &tss_seg);
 
 	ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
-			     &err);
-	if (ret == X86EMUL_PROPAGATE_FAULT) {
+			     &ctxt->exception);
+	if (ret == X86EMUL_PROPAGATE_FAULT)
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt);
 		return ret;
-	}
 
 	ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
-			    &err);
-	if (ret == X86EMUL_PROPAGATE_FAULT) {
+			    &ctxt->exception);
+	if (ret == X86EMUL_PROPAGATE_FAULT)
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt);
 		return ret;
-	}
 
 	if (old_tss_sel != 0xffff) {
 		tss_seg.prev_task_link = old_tss_sel;
@@ -1964,12 +1947,10 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 		ret = ops->write_std(new_tss_base,
 				     &tss_seg.prev_task_link,
 				     sizeof tss_seg.prev_task_link,
-				     ctxt->vcpu, &err);
-		if (ret == X86EMUL_PROPAGATE_FAULT) {
+				     ctxt->vcpu, &ctxt->exception);
+		if (ret == X86EMUL_PROPAGATE_FAULT)
 			/* FIXME: need to provide precise fault address */
-			emulate_pf(ctxt);
 			return ret;
-		}
 	}
 
 	return load_state_from_tss16(ctxt, ops, &tss_seg);
@@ -2072,33 +2053,27 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 {
 	struct tss_segment_32 tss_seg;
 	int ret;
-	u32 err, new_tss_base = get_desc_base(new_desc);
+	u32 new_tss_base = get_desc_base(new_desc);
 
 	ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
-			    &err);
-	if (ret == X86EMUL_PROPAGATE_FAULT) {
+			    &ctxt->exception);
+	if (ret == X86EMUL_PROPAGATE_FAULT)
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt);
 		return ret;
-	}
 
 	save_state_to_tss32(ctxt, ops, &tss_seg);
 
 	ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
-			     &err);
-	if (ret == X86EMUL_PROPAGATE_FAULT) {
+			     &ctxt->exception);
+	if (ret == X86EMUL_PROPAGATE_FAULT)
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt);
 		return ret;
-	}
 
 	ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
-			    &err);
-	if (ret == X86EMUL_PROPAGATE_FAULT) {
+			    &ctxt->exception);
+	if (ret == X86EMUL_PROPAGATE_FAULT)
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt);
 		return ret;
-	}
 
 	if (old_tss_sel != 0xffff) {
 		tss_seg.prev_task_link = old_tss_sel;
@@ -2106,12 +2081,10 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 		ret = ops->write_std(new_tss_base,
 				     &tss_seg.prev_task_link,
 				     sizeof tss_seg.prev_task_link,
-				     ctxt->vcpu, &err);
-		if (ret == X86EMUL_PROPAGATE_FAULT) {
+				     ctxt->vcpu, &ctxt->exception);
+		if (ret == X86EMUL_PROPAGATE_FAULT)
 			/* FIXME: need to provide precise fault address */
-			emulate_pf(ctxt);
 			return ret;
-		}
 	}
 
 	return load_state_from_tss32(ctxt, ops, &tss_seg);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0c90832..8311ed9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3642,24 +3642,31 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error);
 }
 
+static int make_page_fault(struct x86_exception *exception, u32 error)
+{
+	exception->vector = PF_VECTOR;
+	exception->error_code_valid = true;
+	exception->error_code = error;
+	return X86EMUL_PROPAGATE_FAULT;
+}
+
 static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
 				      struct kvm_vcpu *vcpu, u32 access,
-				      u32 *error)
+				      struct x86_exception *exception)
 {
 	void *data = val;
 	int r = X86EMUL_CONTINUE;
+	u32 error;
 
 	while (bytes) {
 		gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
-							    error);
+							    &error);
 		unsigned offset = addr & (PAGE_SIZE-1);
 		unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
 		int ret;
 
-		if (gpa == UNMAPPED_GVA) {
-			r = X86EMUL_PROPAGATE_FAULT;
-			goto out;
-		}
+		if (gpa == UNMAPPED_GVA)
+			return make_page_fault(exception, error);
 		ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
 		if (ret < 0) {
 			r = X86EMUL_IO_NEEDED;
@@ -3676,47 +3683,50 @@ out:
 
 /* used for instruction fetching */
 static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
-				struct kvm_vcpu *vcpu, u32 *error)
+				struct kvm_vcpu *vcpu,
+				struct x86_exception *exception)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
-					  access | PFERR_FETCH_MASK, error);
+					  access | PFERR_FETCH_MASK,
+					  exception);
 }
 
 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
-			       struct kvm_vcpu *vcpu, u32 *error)
+			       struct kvm_vcpu *vcpu,
+			       struct x86_exception *exception)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
-					  error);
+					  exception);
 }
 
 static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
-			       struct kvm_vcpu *vcpu, u32 *error)
+				      struct kvm_vcpu *vcpu,
+				      struct x86_exception *exception)
 {
-	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
+	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
 }
 
 static int kvm_write_guest_virt_system(gva_t addr, void *val,
 				       unsigned int bytes,
 				       struct kvm_vcpu *vcpu,
-				       u32 *error)
+				       struct x86_exception *exception)
 {
 	void *data = val;
 	int r = X86EMUL_CONTINUE;
+	u32 error;
 
 	while (bytes) {
 		gpa_t gpa =  vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
 							     PFERR_WRITE_MASK,
-							     error);
+							     &error);
 		unsigned offset = addr & (PAGE_SIZE-1);
 		unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
 		int ret;
 
-		if (gpa == UNMAPPED_GVA) {
-			r = X86EMUL_PROPAGATE_FAULT;
-			goto out;
-		}
+		if (gpa == UNMAPPED_GVA)
+			return make_page_fault(exception, error);
 		ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
 		if (ret < 0) {
 			r = X86EMUL_IO_NEEDED;
@@ -3734,10 +3744,11 @@ out:
 static int emulator_read_emulated(unsigned long addr,
 				  void *val,
 				  unsigned int bytes,
-				  unsigned int *error_code,
+				  struct x86_exception *exception,
 				  struct kvm_vcpu *vcpu)
 {
 	gpa_t                 gpa;
+	u32 error_code;
 
 	if (vcpu->mmio_read_completed) {
 		memcpy(val, vcpu->mmio_data, bytes);
@@ -3747,17 +3758,17 @@ static int emulator_read_emulated(unsigned long addr,
 		return X86EMUL_CONTINUE;
 	}
 
-	gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code);
+	gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
 
 	if (gpa == UNMAPPED_GVA)
-		return X86EMUL_PROPAGATE_FAULT;
+		return make_page_fault(exception, error_code);
 
 	/* For APIC access vmexit */
 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
 		goto mmio;
 
-	if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL)
-				== X86EMUL_CONTINUE)
+	if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception)
+	    == X86EMUL_CONTINUE)
 		return X86EMUL_CONTINUE;
 
 mmio:
@@ -3781,7 +3792,7 @@ mmio:
 }
 
 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
-			  const void *val, int bytes)
+			const void *val, int bytes)
 {
 	int ret;
 
@@ -3795,15 +3806,16 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 static int emulator_write_emulated_onepage(unsigned long addr,
 					   const void *val,
 					   unsigned int bytes,
-					   unsigned int *error_code,
+					   struct x86_exception *exception,
 					   struct kvm_vcpu *vcpu)
 {
 	gpa_t                 gpa;
+	u32 error_code;
 
-	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code);
+	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
 
 	if (gpa == UNMAPPED_GVA)
-		return X86EMUL_PROPAGATE_FAULT;
+		return make_page_fault(exception, error_code);
 
 	/* For APIC access vmexit */
 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3833,7 +3845,7 @@ mmio:
 int emulator_write_emulated(unsigned long addr,
 			    const void *val,
 			    unsigned int bytes,
-			    unsigned int *error_code,
+			    struct x86_exception *exception,
 			    struct kvm_vcpu *vcpu)
 {
 	/* Crossing a page boundary? */
@@ -3841,7 +3853,7 @@ int emulator_write_emulated(unsigned long addr,
 		int rc, now;
 
 		now = -addr & ~PAGE_MASK;
-		rc = emulator_write_emulated_onepage(addr, val, now, error_code,
+		rc = emulator_write_emulated_onepage(addr, val, now, exception,
 						     vcpu);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
@@ -3849,7 +3861,7 @@ int emulator_write_emulated(unsigned long addr,
 		val += now;
 		bytes -= now;
 	}
-	return emulator_write_emulated_onepage(addr, val, bytes, error_code,
+	return emulator_write_emulated_onepage(addr, val, bytes, exception,
 					       vcpu);
 }
 
@@ -3867,7 +3879,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 				     const void *old,
 				     const void *new,
 				     unsigned int bytes,
-				     unsigned int *error_code,
+				     struct x86_exception *exception,
 				     struct kvm_vcpu *vcpu)
 {
 	gpa_t gpa;
@@ -3925,7 +3937,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 emul_write:
 	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
 
-	return emulator_write_emulated(addr, new, bytes, error_code, vcpu);
+	return emulator_write_emulated(addr, new, bytes, exception, vcpu);
 }
 
 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
-- 
cgit v0.10.2


From 42438e364cbc1e95cacb4ca92688d54a50cf5c4d Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 22 Nov 2010 17:53:23 +0200
Subject: KVM: x86 emulator: drop dead pf injection in emulate_popf()

If rc == X86EMUL_PROPAGATE_FAULT, we would have returned earlier.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 16ed6c1..345aa4d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -479,11 +479,6 @@ static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
 	emulate_exception(ctxt, GP_VECTOR, err, true);
 }
 
-static void emulate_pf(struct x86_emulate_ctxt *ctxt)
-{
-	emulate_exception(ctxt, PF_VECTOR, 0, true);
-}
-
 static void emulate_ud(struct x86_emulate_ctxt *ctxt)
 {
 	emulate_exception(ctxt, UD_VECTOR, 0, false);
@@ -1184,9 +1179,6 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
 	*(unsigned long *)dest =
 		(ctxt->eflags & ~change_mask) | (val & change_mask);
 
-	if (rc == X86EMUL_PROPAGATE_FAULT)
-		emulate_pf(ctxt);
-
 	return rc;
 }
 
-- 
cgit v0.10.2


From db297e3d8ed8409b969512c3ecd9d13223f2981c Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 22 Nov 2010 17:53:24 +0200
Subject: KVM: x86 emulator: tighten up ->read_std() and ->write_std() error
 checks

Instead of checking for X86EMUL_PROPAGATE_FAULT, check for any error,
making the callers more reliable.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 345aa4d..223c536 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1915,7 +1915,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 
 	ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
 			    &ctxt->exception);
-	if (ret == X86EMUL_PROPAGATE_FAULT)
+	if (ret != X86EMUL_CONTINUE)
 		/* FIXME: need to provide precise fault address */
 		return ret;
 
@@ -1923,13 +1923,13 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 
 	ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
 			     &ctxt->exception);
-	if (ret == X86EMUL_PROPAGATE_FAULT)
+	if (ret != X86EMUL_CONTINUE)
 		/* FIXME: need to provide precise fault address */
 		return ret;
 
 	ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
 			    &ctxt->exception);
-	if (ret == X86EMUL_PROPAGATE_FAULT)
+	if (ret != X86EMUL_CONTINUE)
 		/* FIXME: need to provide precise fault address */
 		return ret;
 
@@ -1940,7 +1940,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 				     &tss_seg.prev_task_link,
 				     sizeof tss_seg.prev_task_link,
 				     ctxt->vcpu, &ctxt->exception);
-		if (ret == X86EMUL_PROPAGATE_FAULT)
+		if (ret != X86EMUL_CONTINUE)
 			/* FIXME: need to provide precise fault address */
 			return ret;
 	}
@@ -2049,7 +2049,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 
 	ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
 			    &ctxt->exception);
-	if (ret == X86EMUL_PROPAGATE_FAULT)
+	if (ret != X86EMUL_CONTINUE)
 		/* FIXME: need to provide precise fault address */
 		return ret;
 
@@ -2057,13 +2057,13 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 
 	ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
 			     &ctxt->exception);
-	if (ret == X86EMUL_PROPAGATE_FAULT)
+	if (ret != X86EMUL_CONTINUE)
 		/* FIXME: need to provide precise fault address */
 		return ret;
 
 	ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
 			    &ctxt->exception);
-	if (ret == X86EMUL_PROPAGATE_FAULT)
+	if (ret != X86EMUL_CONTINUE)
 		/* FIXME: need to provide precise fault address */
 		return ret;
 
@@ -2074,7 +2074,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 				     &tss_seg.prev_task_link,
 				     sizeof tss_seg.prev_task_link,
 				     ctxt->vcpu, &ctxt->exception);
-		if (ret == X86EMUL_PROPAGATE_FAULT)
+		if (ret != X86EMUL_CONTINUE)
 			/* FIXME: need to provide precise fault address */
 			return ret;
 	}
-- 
cgit v0.10.2


From 35d3d4a1dd2c1ffd6f2481f6d8ad6c358bb22f07 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 22 Nov 2010 17:53:25 +0200
Subject: KVM: x86 emulator: simplify exception generation

Immediately after we generate an exception, we want a X86EMUL_PROPAGATE_FAULT
constant, so return it from the generation functions.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 223c536..36534ec 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -466,33 +466,33 @@ static ulong linear(struct x86_emulate_ctxt *ctxt,
 	return la;
 }
 
-static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
-				      u32 error, bool valid)
+static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
+			     u32 error, bool valid)
 {
 	ctxt->exception.vector = vec;
 	ctxt->exception.error_code = error;
 	ctxt->exception.error_code_valid = valid;
+	return X86EMUL_PROPAGATE_FAULT;
 }
 
-static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
+static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
 {
-	emulate_exception(ctxt, GP_VECTOR, err, true);
+	return emulate_exception(ctxt, GP_VECTOR, err, true);
 }
 
-static void emulate_ud(struct x86_emulate_ctxt *ctxt)
+static int emulate_ud(struct x86_emulate_ctxt *ctxt)
 {
-	emulate_exception(ctxt, UD_VECTOR, 0, false);
+	return emulate_exception(ctxt, UD_VECTOR, 0, false);
 }
 
-static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
+static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
 {
-	emulate_exception(ctxt, TS_VECTOR, err, true);
+	return emulate_exception(ctxt, TS_VECTOR, err, true);
 }
 
 static int emulate_de(struct x86_emulate_ctxt *ctxt)
 {
-	emulate_exception(ctxt, DE_VECTOR, 0, false);
-	return X86EMUL_PROPAGATE_FAULT;
+	return emulate_exception(ctxt, DE_VECTOR, 0, false);
 }
 
 static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
@@ -898,10 +898,8 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 
 	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
 
-	if (dt.size < index * 8 + 7) {
-		emulate_gp(ctxt, selector & 0xfffc);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
+	if (dt.size < index * 8 + 7)
+		return emulate_gp(ctxt, selector & 0xfffc);
 	addr = dt.address + index * 8;
 	ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,
 			    &ctxt->exception);
@@ -921,10 +919,8 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 
 	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
 
-	if (dt.size < index * 8 + 7) {
-		emulate_gp(ctxt, selector & 0xfffc);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
+	if (dt.size < index * 8 + 7)
+		return emulate_gp(ctxt, selector & 0xfffc);
 
 	addr = dt.address + index * 8;
 	ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu,
@@ -1165,10 +1161,8 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
 			change_mask |= EFLG_IF;
 		break;
 	case X86EMUL_MODE_VM86:
-		if (iopl < 3) {
-			emulate_gp(ctxt, 0);
-			return X86EMUL_PROPAGATE_FAULT;
-		}
+		if (iopl < 3)
+			return emulate_gp(ctxt, 0);
 		change_mask |= EFLG_IF;
 		break;
 	default: /* real mode */
@@ -1347,10 +1341,8 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	if (temp_eip & ~0xffff) {
-		emulate_gp(ctxt, 0);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
+	if (temp_eip & ~0xffff)
+		return emulate_gp(ctxt, 0);
 
 	rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
 
@@ -1601,10 +1593,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 
 	/* syscall is not available in real mode */
 	if (ctxt->mode == X86EMUL_MODE_REAL ||
-	    ctxt->mode == X86EMUL_MODE_VM86) {
-		emulate_ud(ctxt);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
+	    ctxt->mode == X86EMUL_MODE_VM86)
+		return emulate_ud(ctxt);
 
 	setup_syscalls_segments(ctxt, ops, &cs, &ss);
 
@@ -1655,34 +1645,26 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	u16 cs_sel, ss_sel;
 
 	/* inject #GP if in real mode */
-	if (ctxt->mode == X86EMUL_MODE_REAL) {
-		emulate_gp(ctxt, 0);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
+	if (ctxt->mode == X86EMUL_MODE_REAL)
+		return emulate_gp(ctxt, 0);
 
 	/* XXX sysenter/sysexit have not been tested in 64bit mode.
 	* Therefore, we inject an #UD.
 	*/
-	if (ctxt->mode == X86EMUL_MODE_PROT64) {
-		emulate_ud(ctxt);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
+	if (ctxt->mode == X86EMUL_MODE_PROT64)
+		return emulate_ud(ctxt);
 
 	setup_syscalls_segments(ctxt, ops, &cs, &ss);
 
 	ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
 	switch (ctxt->mode) {
 	case X86EMUL_MODE_PROT32:
-		if ((msr_data & 0xfffc) == 0x0) {
-			emulate_gp(ctxt, 0);
-			return X86EMUL_PROPAGATE_FAULT;
-		}
+		if ((msr_data & 0xfffc) == 0x0)
+			return emulate_gp(ctxt, 0);
 		break;
 	case X86EMUL_MODE_PROT64:
-		if (msr_data == 0x0) {
-			emulate_gp(ctxt, 0);
-			return X86EMUL_PROPAGATE_FAULT;
-		}
+		if (msr_data == 0x0)
+			return emulate_gp(ctxt, 0);
 		break;
 	}
 
@@ -1722,10 +1704,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 
 	/* inject #GP if in real mode or Virtual 8086 mode */
 	if (ctxt->mode == X86EMUL_MODE_REAL ||
-	    ctxt->mode == X86EMUL_MODE_VM86) {
-		emulate_gp(ctxt, 0);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
+	    ctxt->mode == X86EMUL_MODE_VM86)
+		return emulate_gp(ctxt, 0);
 
 	setup_syscalls_segments(ctxt, ops, &cs, &ss);
 
@@ -1740,18 +1720,14 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	switch (usermode) {
 	case X86EMUL_MODE_PROT32:
 		cs_sel = (u16)(msr_data + 16);
-		if ((msr_data & 0xfffc) == 0x0) {
-			emulate_gp(ctxt, 0);
-			return X86EMUL_PROPAGATE_FAULT;
-		}
+		if ((msr_data & 0xfffc) == 0x0)
+			return emulate_gp(ctxt, 0);
 		ss_sel = (u16)(msr_data + 24);
 		break;
 	case X86EMUL_MODE_PROT64:
 		cs_sel = (u16)(msr_data + 32);
-		if (msr_data == 0x0) {
-			emulate_gp(ctxt, 0);
-			return X86EMUL_PROPAGATE_FAULT;
-		}
+		if (msr_data == 0x0)
+			return emulate_gp(ctxt, 0);
 		ss_sel = cs_sel + 8;
 		cs.d = 0;
 		cs.l = 1;
@@ -1982,10 +1958,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 	struct decode_cache *c = &ctxt->decode;
 	int ret;
 
-	if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) {
-		emulate_gp(ctxt, 0);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
+	if (ops->set_cr(3, tss->cr3, ctxt->vcpu))
+		return emulate_gp(ctxt, 0);
 	c->eip = tss->eip;
 	ctxt->eflags = tss->eflags | 2;
 	c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2107,10 +2081,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 
 	if (reason != TASK_SWITCH_IRET) {
 		if ((tss_selector & 3) > next_tss_desc.dpl ||
-		    ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
-			emulate_gp(ctxt, 0);
-			return X86EMUL_PROPAGATE_FAULT;
-		}
+		    ops->cpl(ctxt->vcpu) > next_tss_desc.dpl)
+			return emulate_gp(ctxt, 0);
 	}
 
 	desc_limit = desc_limit_scaled(&next_tss_desc);
@@ -2331,10 +2303,8 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
 	struct decode_cache *c = &ctxt->decode;
 	u64 tsc = 0;
 
-	if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) {
-		emulate_gp(ctxt, 0);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
+	if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD))
+		return emulate_gp(ctxt, 0);
 	ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
 	c->regs[VCPU_REGS_RAX] = (u32)tsc;
 	c->regs[VCPU_REGS_RDX] = tsc >> 32;
@@ -2979,28 +2949,24 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 	ctxt->decode.mem_read.pos = 0;
 
 	if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
-		emulate_ud(ctxt);
-		rc = X86EMUL_PROPAGATE_FAULT;
+		rc = emulate_ud(ctxt);
 		goto done;
 	}
 
 	/* LOCK prefix is allowed only with some instructions */
 	if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
-		emulate_ud(ctxt);
-		rc = X86EMUL_PROPAGATE_FAULT;
+		rc = emulate_ud(ctxt);
 		goto done;
 	}
 
 	if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
-		emulate_ud(ctxt);
-		rc = X86EMUL_PROPAGATE_FAULT;
+		rc = emulate_ud(ctxt);
 		goto done;
 	}
 
 	/* Privileged instruction can be executed only in CPL=0 */
 	if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
-		emulate_gp(ctxt, 0);
-		rc = X86EMUL_PROPAGATE_FAULT;
+		rc = emulate_gp(ctxt, 0);
 		goto done;
 	}
 
@@ -3178,8 +3144,7 @@ special_insn:
 		break;
 	case 0x8c:  /* mov r/m, sreg */
 		if (c->modrm_reg > VCPU_SREG_GS) {
-			emulate_ud(ctxt);
-			rc = X86EMUL_PROPAGATE_FAULT;
+			rc = emulate_ud(ctxt);
 			goto done;
 		}
 		c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
@@ -3194,8 +3159,7 @@ special_insn:
 
 		if (c->modrm_reg == VCPU_SREG_CS ||
 		    c->modrm_reg > VCPU_SREG_GS) {
-			emulate_ud(ctxt);
-			rc = X86EMUL_PROPAGATE_FAULT;
+			rc = emulate_ud(ctxt);
 			goto done;
 		}
 
@@ -3327,8 +3291,7 @@ special_insn:
 	do_io_in:
 		c->dst.bytes = min(c->dst.bytes, 4u);
 		if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
-			emulate_gp(ctxt, 0);
-			rc = X86EMUL_PROPAGATE_FAULT;
+			rc = emulate_gp(ctxt, 0);
 			goto done;
 		}
 		if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
@@ -3342,8 +3305,7 @@ special_insn:
 		c->src.bytes = min(c->src.bytes, 4u);
 		if (!emulator_io_permited(ctxt, ops, c->dst.val,
 					  c->src.bytes)) {
-			emulate_gp(ctxt, 0);
-			rc = X86EMUL_PROPAGATE_FAULT;
+			rc = emulate_gp(ctxt, 0);
 			goto done;
 		}
 		ops->pio_out_emulated(c->src.bytes, c->dst.val,
@@ -3368,16 +3330,14 @@ special_insn:
 		break;
 	case 0xfa: /* cli */
 		if (emulator_bad_iopl(ctxt, ops)) {
-			emulate_gp(ctxt, 0);
-			rc = X86EMUL_PROPAGATE_FAULT;
+			rc = emulate_gp(ctxt, 0);
 			goto done;
 		} else
 			ctxt->eflags &= ~X86_EFLAGS_IF;
 		break;
 	case 0xfb: /* sti */
 		if (emulator_bad_iopl(ctxt, ops)) {
-			emulate_gp(ctxt, 0);
-			rc = X86EMUL_PROPAGATE_FAULT;
+			rc = emulate_gp(ctxt, 0);
 			goto done;
 		} else {
 			ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
-- 
cgit v0.10.2


From ab9ae3138789afacd133a9c4b3d7a3f1578e25c7 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 22 Nov 2010 17:53:26 +0200
Subject: KVM: Push struct x86_exception into the various gva_to_gpa variants

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1452478..9980a24 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -245,7 +245,7 @@ struct kvm_mmu {
 	void (*inject_page_fault)(struct kvm_vcpu *vcpu);
 	void (*free)(struct kvm_vcpu *vcpu);
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
-			    u32 *error);
+			    struct x86_exception *exception);
 	gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
 	void (*prefetch_page)(struct kvm_vcpu *vcpu,
 			      struct kvm_mmu_page *page);
@@ -708,10 +708,14 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
-gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
-gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
-gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
+			      struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
+			       struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
+			       struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
+				struct x86_exception *exception);
 
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 53ff31f..9ce0414 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2567,18 +2567,19 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 }
 
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
-				  u32 access, u32 *error)
+				  u32 access, struct x86_exception *exception)
 {
-	if (error)
-		*error = 0;
+	if (exception)
+		exception->error_code = 0;
 	return vaddr;
 }
 
 static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
-					 u32 access, u32 *error)
+					 u32 access,
+					 struct x86_exception *exception)
 {
-	if (error)
-		*error = 0;
+	if (exception)
+		exception->error_code = 0;
 	return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
 }
 
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 2b3d66c..3ac39de 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -677,7 +677,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 }
 
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
-			       u32 *error)
+			       struct x86_exception *exception)
 {
 	struct guest_walker walker;
 	gpa_t gpa = UNMAPPED_GVA;
@@ -688,14 +688,18 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
 	if (r) {
 		gpa = gfn_to_gpa(walker.gfn);
 		gpa |= vaddr & ~PAGE_MASK;
-	} else if (error)
-		*error = walker.error_code;
+	} else if (exception) {
+		exception->vector = PF_VECTOR;
+		exception->error_code_valid = true;
+		exception->error_code = walker.error_code;
+	}
 
 	return gpa;
 }
 
 static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
-				      u32 access, u32 *error)
+				      u32 access,
+				      struct x86_exception *exception)
 {
 	struct guest_walker walker;
 	gpa_t gpa = UNMAPPED_GVA;
@@ -706,8 +710,11 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
 	if (r) {
 		gpa = gfn_to_gpa(walker.gfn);
 		gpa |= vaddr & ~PAGE_MASK;
-	} else if (error)
-		*error = walker.error_code;
+	} else if (exception) {
+		exception->vector = PF_VECTOR;
+		exception->error_code_valid = true;
+		exception->error_code = walker.error_code;
+	}
 
 	return gpa;
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8311ed9..a7a7dec 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3603,51 +3603,47 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
 static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
 {
 	gpa_t t_gpa;
-	u32 error;
+	struct x86_exception exception;
 
 	BUG_ON(!mmu_is_nested(vcpu));
 
 	/* NPT walks are always user-walks */
 	access |= PFERR_USER_MASK;
-	t_gpa  = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error);
+	t_gpa  = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
 	if (t_gpa == UNMAPPED_GVA)
 		vcpu->arch.fault.nested = true;
 
 	return t_gpa;
 }
 
-gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
+			      struct x86_exception *exception)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
-	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
+	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
 
- gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+ gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
+				struct x86_exception *exception)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 	access |= PFERR_FETCH_MASK;
-	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
+	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
 
-gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
+			       struct x86_exception *exception)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 	access |= PFERR_WRITE_MASK;
-	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
+	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
 
 /* uses this to access any guest's mapped memory without checking CPL */
-gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
-{
-	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error);
-}
-
-static int make_page_fault(struct x86_exception *exception, u32 error)
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
+				struct x86_exception *exception)
 {
-	exception->vector = PF_VECTOR;
-	exception->error_code_valid = true;
-	exception->error_code = error;
-	return X86EMUL_PROPAGATE_FAULT;
+	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
 }
 
 static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
@@ -3656,17 +3652,16 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
 {
 	void *data = val;
 	int r = X86EMUL_CONTINUE;
-	u32 error;
 
 	while (bytes) {
 		gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
-							    &error);
+							    exception);
 		unsigned offset = addr & (PAGE_SIZE-1);
 		unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
 		int ret;
 
 		if (gpa == UNMAPPED_GVA)
-			return make_page_fault(exception, error);
+			return X86EMUL_PROPAGATE_FAULT;
 		ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
 		if (ret < 0) {
 			r = X86EMUL_IO_NEEDED;
@@ -3715,18 +3710,17 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
 {
 	void *data = val;
 	int r = X86EMUL_CONTINUE;
-	u32 error;
 
 	while (bytes) {
 		gpa_t gpa =  vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
 							     PFERR_WRITE_MASK,
-							     &error);
+							     exception);
 		unsigned offset = addr & (PAGE_SIZE-1);
 		unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
 		int ret;
 
 		if (gpa == UNMAPPED_GVA)
-			return make_page_fault(exception, error);
+			return X86EMUL_PROPAGATE_FAULT;
 		ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
 		if (ret < 0) {
 			r = X86EMUL_IO_NEEDED;
@@ -3748,7 +3742,6 @@ static int emulator_read_emulated(unsigned long addr,
 				  struct kvm_vcpu *vcpu)
 {
 	gpa_t                 gpa;
-	u32 error_code;
 
 	if (vcpu->mmio_read_completed) {
 		memcpy(val, vcpu->mmio_data, bytes);
@@ -3758,10 +3751,10 @@ static int emulator_read_emulated(unsigned long addr,
 		return X86EMUL_CONTINUE;
 	}
 
-	gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
+	gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
 
 	if (gpa == UNMAPPED_GVA)
-		return make_page_fault(exception, error_code);
+		return X86EMUL_PROPAGATE_FAULT;
 
 	/* For APIC access vmexit */
 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3810,12 +3803,11 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 					   struct kvm_vcpu *vcpu)
 {
 	gpa_t                 gpa;
-	u32 error_code;
 
-	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
+	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
 
 	if (gpa == UNMAPPED_GVA)
-		return make_page_fault(exception, error_code);
+		return X86EMUL_PROPAGATE_FAULT;
 
 	/* For APIC access vmexit */
 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
-- 
cgit v0.10.2


From 8c28d031657adaf238634ac406834a839d7fa43d Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 22 Nov 2010 17:53:27 +0200
Subject: KVM: Push struct x86_exception into walk_addr()

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 3ac39de..ad5a5a2 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -72,7 +72,7 @@ struct guest_walker {
 	unsigned pt_access;
 	unsigned pte_access;
 	gfn_t gfn;
-	u32 error_code;
+	struct x86_exception fault;
 };
 
 static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
@@ -266,21 +266,23 @@ walk:
 	return 1;
 
 error:
-	walker->error_code = 0;
+	walker->fault.vector = PF_VECTOR;
+	walker->fault.error_code_valid = true;
+	walker->fault.error_code = 0;
 	if (present)
-		walker->error_code |= PFERR_PRESENT_MASK;
+		walker->fault.error_code |= PFERR_PRESENT_MASK;
 
-	walker->error_code |= write_fault | user_fault;
+	walker->fault.error_code |= write_fault | user_fault;
 
 	if (fetch_fault && mmu->nx)
-		walker->error_code |= PFERR_FETCH_MASK;
+		walker->fault.error_code |= PFERR_FETCH_MASK;
 	if (rsvd_fault)
-		walker->error_code |= PFERR_RSVD_MASK;
+		walker->fault.error_code |= PFERR_RSVD_MASK;
 
 	vcpu->arch.fault.address    = addr;
-	vcpu->arch.fault.error_code = walker->error_code;
+	vcpu->arch.fault.error_code = walker->fault.error_code;
 
-	trace_kvm_mmu_walker_error(walker->error_code);
+	trace_kvm_mmu_walker_error(walker->fault.error_code);
 	return 0;
 }
 
@@ -688,11 +690,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
 	if (r) {
 		gpa = gfn_to_gpa(walker.gfn);
 		gpa |= vaddr & ~PAGE_MASK;
-	} else if (exception) {
-		exception->vector = PF_VECTOR;
-		exception->error_code_valid = true;
-		exception->error_code = walker.error_code;
-	}
+	} else if (exception)
+		*exception = walker.fault;
 
 	return gpa;
 }
@@ -710,11 +709,8 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
 	if (r) {
 		gpa = gfn_to_gpa(walker.gfn);
 		gpa |= vaddr & ~PAGE_MASK;
-	} else if (exception) {
-		exception->vector = PF_VECTOR;
-		exception->error_code_valid = true;
-		exception->error_code = walker.error_code;
-	}
+	} else if (exception)
+		*exception = walker.fault;
 
 	return gpa;
 }
-- 
cgit v0.10.2


From 6389ee946303cb4313dba0a49865e495a53351ff Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 29 Nov 2010 16:12:30 +0200
Subject: KVM: Pull extra page fault information into struct x86_exception

Currently page fault cr2 and nesting information are carried outside
the fault data structure.  Instead they are placed in the vcpu struct,
which results in confusion as global variables are manipulated instead
of passing parameters.

Fix this issue by adding address and nested fields to struct x86_exception,
so this struct can carry all information associated with a fault.

Signed-off-by: Avi Kivity <avi@redhat.com>
Tested-by: Joerg Roedel <joerg.roedel@amd.com>
Tested-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 87d017e..bf70ece 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -19,6 +19,8 @@ struct x86_exception {
 	u8 vector;
 	bool error_code_valid;
 	u16 error_code;
+	bool nested_page_fault;
+	u64 address; /* cr2 or nested page fault gpa */
 };
 
 /*
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9980a24..0c0941d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -242,7 +242,8 @@ struct kvm_mmu {
 	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
 	unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
 	int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err, bool no_apf);
-	void (*inject_page_fault)(struct kvm_vcpu *vcpu);
+	void (*inject_page_fault)(struct kvm_vcpu *vcpu,
+				  struct x86_exception *fault);
 	void (*free)(struct kvm_vcpu *vcpu);
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
 			    struct x86_exception *exception);
@@ -318,16 +319,6 @@ struct kvm_vcpu_arch {
 	 */
 	struct kvm_mmu *walk_mmu;
 
-	/*
-	 * This struct is filled with the necessary information to propagate a
-	 * page fault into the guest
-	 */
-	struct {
-		u64      address;
-		unsigned error_code;
-		bool     nested;
-	} fault;
-
 	/* only needed in kvm_pv_mmu_op() path, but it's hot so
 	 * put it here to avoid allocation */
 	struct kvm_pv_mmu_op_buffer mmu_op_buffer;
@@ -686,11 +677,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 			    gfn_t gfn, void *data, int offset, int len,
 			    u32 access);
-void kvm_propagate_fault(struct kvm_vcpu *vcpu);
+void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
 
 int kvm_pic_set_irq(void *opaque, int irq, int level);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9ce0414..d359500 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2736,9 +2736,10 @@ static unsigned long get_cr3(struct kvm_vcpu *vcpu)
 	return vcpu->arch.cr3;
 }
 
-static void inject_page_fault(struct kvm_vcpu *vcpu)
+static void inject_page_fault(struct kvm_vcpu *vcpu,
+			      struct x86_exception *fault)
 {
-	vcpu->arch.mmu.inject_page_fault(vcpu);
+	vcpu->arch.mmu.inject_page_fault(vcpu, fault);
 }
 
 static void paging_free(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ad5a5a2..d5a0a11 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -279,8 +279,8 @@ error:
 	if (rsvd_fault)
 		walker->fault.error_code |= PFERR_RSVD_MASK;
 
-	vcpu->arch.fault.address    = addr;
-	vcpu->arch.fault.error_code = walker->fault.error_code;
+	walker->fault.address = addr;
+	walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
 
 	trace_kvm_mmu_walker_error(walker->fault.error_code);
 	return 0;
@@ -568,7 +568,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	 */
 	if (!r) {
 		pgprintk("%s: guest page fault\n", __func__);
-		inject_page_fault(vcpu);
+		inject_page_fault(vcpu, &walker.fault);
 		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
 		return 0;
 	}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 28274cf..b985cb8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1647,14 +1647,15 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
 	force_new_asid(vcpu);
 }
 
-static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu)
+static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
+				       struct x86_exception *fault)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	svm->vmcb->control.exit_code = SVM_EXIT_NPF;
 	svm->vmcb->control.exit_code_hi = 0;
-	svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code;
-	svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address;
+	svm->vmcb->control.exit_info_1 = fault->error_code;
+	svm->vmcb->control.exit_info_2 = fault->address;
 
 	nested_svm_vmexit(svm);
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a7a7dec..47e5a41 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -334,23 +334,19 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 }
 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
 
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu)
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 {
-	unsigned error_code = vcpu->arch.fault.error_code;
-
 	++vcpu->stat.pf_guest;
-	vcpu->arch.cr2 = vcpu->arch.fault.address;
-	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
+	vcpu->arch.cr2 = fault->address;
+	kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
 }
 
-void kvm_propagate_fault(struct kvm_vcpu *vcpu)
+void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 {
-	if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested)
-		vcpu->arch.nested_mmu.inject_page_fault(vcpu);
+	if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
+		vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
 	else
-		vcpu->arch.mmu.inject_page_fault(vcpu);
-
-	vcpu->arch.fault.nested = false;
+		vcpu->arch.mmu.inject_page_fault(vcpu, fault);
 }
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
@@ -3610,8 +3606,6 @@ static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
 	/* NPT walks are always user-walks */
 	access |= PFERR_USER_MASK;
 	t_gpa  = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
-	if (t_gpa == UNMAPPED_GVA)
-		vcpu->arch.fault.nested = true;
 
 	return t_gpa;
 }
@@ -4259,7 +4253,7 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
 {
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 	if (ctxt->exception.vector == PF_VECTOR)
-		kvm_propagate_fault(vcpu);
+		kvm_propagate_fault(vcpu, &ctxt->exception);
 	else if (ctxt->exception.error_code_valid)
 		kvm_queue_exception_e(vcpu, ctxt->exception.vector,
 				      ctxt->exception.error_code);
@@ -6264,6 +6258,8 @@ static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
 void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
 				     struct kvm_async_pf *work)
 {
+	struct x86_exception fault;
+
 	trace_kvm_async_pf_not_present(work->arch.token, work->gva);
 	kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
 
@@ -6272,15 +6268,20 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
 	     kvm_x86_ops->get_cpl(vcpu) == 0))
 		kvm_make_request(KVM_REQ_APF_HALT, vcpu);
 	else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
-		vcpu->arch.fault.error_code = 0;
-		vcpu->arch.fault.address = work->arch.token;
-		kvm_inject_page_fault(vcpu);
+		fault.vector = PF_VECTOR;
+		fault.error_code_valid = true;
+		fault.error_code = 0;
+		fault.nested_page_fault = false;
+		fault.address = work->arch.token;
+		kvm_inject_page_fault(vcpu, &fault);
 	}
 }
 
 void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 				 struct kvm_async_pf *work)
 {
+	struct x86_exception fault;
+
 	trace_kvm_async_pf_ready(work->arch.token, work->gva);
 	if (is_error_page(work->page))
 		work->arch.token = ~0; /* broadcast wakeup */
@@ -6289,9 +6290,12 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 
 	if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
 	    !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
-		vcpu->arch.fault.error_code = 0;
-		vcpu->arch.fault.address = work->arch.token;
-		kvm_inject_page_fault(vcpu);
+		fault.vector = PF_VECTOR;
+		fault.error_code_valid = true;
+		fault.error_code = 0;
+		fault.nested_page_fault = false;
+		fault.address = work->arch.token;
+		kvm_inject_page_fault(vcpu, &fault);
 	}
 	vcpu->arch.apf.halted = false;
 }
-- 
cgit v0.10.2


From ec9e60b21977007e3dfacc2b8fe3a8fbb9276b51 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 29 Nov 2010 17:51:47 +0100
Subject: KVM: X86: Introduce generic guest-mode representation

This patch introduces a generic representation of guest-mode
for a vcpu. This currently only exists in the SVM code.
Having this representation generic will help make the
non-svm code aware of nesting when this is necessary.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0c0941d..56e45a2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -779,6 +779,7 @@ enum {
 #define HF_VINTR_MASK		(1 << 2)
 #define HF_NMI_MASK		(1 << 3)
 #define HF_IRET_MASK		(1 << 4)
+#define HF_GUEST_MASK		(1 << 5) /* VCPU is in guest-mode */
 
 /*
  * Hardware virtualization extension instructions may fault if a
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 975bb45..95ac3af 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -84,4 +84,19 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
 		| ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
 }
 
+static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.hflags |= HF_GUEST_MASK;
+}
+
+static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.hflags &= ~HF_GUEST_MASK;
+}
+
+static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.hflags & HF_GUEST_MASK;
+}
+
 #endif
-- 
cgit v0.10.2


From 2030753de70a8aed39543ed09c2360665b3af481 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 29 Nov 2010 17:51:48 +0100
Subject: KVM: SVM: Make Use of the generic guest-mode functions

This patch replaces the is_nested logic in the SVM module
with the generic notion of guest-mode.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b985cb8..2ae94b5 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -192,11 +192,6 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_svm, vcpu);
 }
 
-static inline bool is_nested(struct vcpu_svm *svm)
-{
-	return svm->nested.vmcb;
-}
-
 static inline void enable_gif(struct vcpu_svm *svm)
 {
 	svm->vcpu.arch.hflags |= HF_GIF_MASK;
@@ -727,7 +722,7 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u64 g_tsc_offset = 0;
 
-	if (is_nested(svm)) {
+	if (is_guest_mode(vcpu)) {
 		g_tsc_offset = svm->vmcb->control.tsc_offset -
 			       svm->nested.hsave->control.tsc_offset;
 		svm->nested.hsave->control.tsc_offset = offset;
@@ -741,7 +736,7 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	svm->vmcb->control.tsc_offset += adjustment;
-	if (is_nested(svm))
+	if (is_guest_mode(vcpu))
 		svm->nested.hsave->control.tsc_offset += adjustment;
 }
 
@@ -1209,7 +1204,7 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
 	if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
 		vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
 		vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
-		if (is_nested(svm)) {
+		if (is_guest_mode(&svm->vcpu)) {
 			struct vmcb *hsave = svm->nested.hsave;
 
 			hsave->control.intercept_cr_read  &= ~INTERCEPT_CR0_MASK;
@@ -1220,7 +1215,7 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
 	} else {
 		svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
 		svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
-		if (is_nested(svm)) {
+		if (is_guest_mode(&svm->vcpu)) {
 			struct vmcb *hsave = svm->nested.hsave;
 
 			hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
@@ -1233,7 +1228,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	if (is_nested(svm)) {
+	if (is_guest_mode(vcpu)) {
 		/*
 		 * We are here because we run in nested mode, the host kvm
 		 * intercepts cr0 writes but the l1 hypervisor does not.
@@ -1471,7 +1466,7 @@ static void svm_fpu_activate(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u32 excp;
 
-	if (is_nested(svm)) {
+	if (is_guest_mode(vcpu)) {
 		u32 h_excp, n_excp;
 
 		h_excp  = svm->nested.hsave->control.intercept_exceptions;
@@ -1701,7 +1696,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 {
 	int vmexit;
 
-	if (!is_nested(svm))
+	if (!is_guest_mode(&svm->vcpu))
 		return 0;
 
 	svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
@@ -1719,7 +1714,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 /* This function returns true if it is save to enable the irq window */
 static inline bool nested_svm_intr(struct vcpu_svm *svm)
 {
-	if (!is_nested(svm))
+	if (!is_guest_mode(&svm->vcpu))
 		return true;
 
 	if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
@@ -1758,7 +1753,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
 /* This function returns true if it is save to enable the nmi window */
 static inline bool nested_svm_nmi(struct vcpu_svm *svm)
 {
-	if (!is_nested(svm))
+	if (!is_guest_mode(&svm->vcpu))
 		return true;
 
 	if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
@@ -1995,7 +1990,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	if (!nested_vmcb)
 		return 1;
 
-	/* Exit nested SVM mode */
+	/* Exit Guest-Mode */
+	leave_guest_mode(&svm->vcpu);
 	svm->nested.vmcb = 0;
 
 	/* Give the current vmcb to the guest */
@@ -2303,7 +2299,9 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 
 	nested_svm_unmap(page);
 
-	/* nested_vmcb is our indicator if nested SVM is activated */
+	/* Enter Guest-Mode */
+	enter_guest_mode(&svm->vcpu);
+
 	svm->nested.vmcb = vmcb_gpa;
 
 	enable_gif(svm);
@@ -2589,7 +2587,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 	case MSR_IA32_TSC: {
 		u64 tsc_offset;
 
-		if (is_nested(svm))
+		if (is_guest_mode(vcpu))
 			tsc_offset = svm->nested.hsave->control.tsc_offset;
 		else
 			tsc_offset = svm->vmcb->control.tsc_offset;
@@ -3003,7 +3001,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 		return 1;
 	}
 
-	if (is_nested(svm)) {
+	if (is_guest_mode(vcpu)) {
 		int vmexit;
 
 		trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
@@ -3110,7 +3108,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+	if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
 		return;
 
 	if (irr == -1)
@@ -3164,7 +3162,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
 
 	ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
 
-	if (is_nested(svm))
+	if (is_guest_mode(vcpu))
 		return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
 
 	return ret;
@@ -3221,7 +3219,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+	if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
 		return;
 
 	if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
@@ -3235,7 +3233,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u64 cr8;
 
-	if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+	if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
 		return;
 
 	cr8 = kvm_get_cr8(vcpu);
@@ -3621,7 +3619,7 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
-	if (is_nested(svm))
+	if (is_guest_mode(vcpu))
 		svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
 	update_cr0_intercept(svm);
 }
-- 
cgit v0.10.2


From fc3a9157d3148ab91039c75423da8ef97be3e105 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 29 Nov 2010 17:51:49 +0100
Subject: KVM: X86: Don't report L2 emulation failures to user-space

This patch prevents emulation failures which result from
emulating an instruction for an L2-Guest from being
reported to userspace.
Without this patch a malicious L2-Guest would be able to
kill the L1 by triggering a race-condition between a vmexit
and the instruction emulator.
With this patch the L2 will most likely only kill itself in
this situation.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 47e5a41..8b4d5fc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4314,13 +4314,19 @@ EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
 
 static int handle_emulation_failure(struct kvm_vcpu *vcpu)
 {
+	int r = EMULATE_DONE;
+
 	++vcpu->stat.insn_emulation_fail;
 	trace_kvm_emulate_insn_failed(vcpu);
-	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-	vcpu->run->internal.ndata = 0;
+	if (!is_guest_mode(vcpu)) {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+		vcpu->run->internal.ndata = 0;
+		r = EMULATE_FAIL;
+	}
 	kvm_queue_exception(vcpu, UD_VECTOR);
-	return EMULATE_FAIL;
+
+	return r;
 }
 
 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
-- 
cgit v0.10.2


From 384c636843971c8ebbffd1cc8881e3184cbd23e2 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 30 Nov 2010 18:03:56 +0100
Subject: KVM: SVM: Add function to recalculate intercept masks

This patch adds a function to recalculate the effective
intercept masks when the vcpu is in guest-mode and either
the host or the guest intercept masks change.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2ae94b5..74f89f0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -192,6 +192,26 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_svm, vcpu);
 }
 
+static void recalc_intercepts(struct vcpu_svm *svm)
+{
+	struct vmcb_control_area *c, *h;
+	struct nested_state *g;
+
+	if (!is_guest_mode(&svm->vcpu))
+		return;
+
+	c = &svm->vmcb->control;
+	h = &svm->nested.hsave->control;
+	g = &svm->nested;
+
+	c->intercept_cr_read = h->intercept_cr_read | g->intercept_cr_read;
+	c->intercept_cr_write = h->intercept_cr_write | g->intercept_cr_write;
+	c->intercept_dr_read = h->intercept_dr_read | g->intercept_dr_read;
+	c->intercept_dr_write = h->intercept_dr_write | g->intercept_dr_write;
+	c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
+	c->intercept = h->intercept | g->intercept;
+}
+
 static inline void enable_gif(struct vcpu_svm *svm)
 {
 	svm->vcpu.arch.hflags |= HF_GIF_MASK;
@@ -2273,23 +2293,6 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	/* We don't want to see VMMCALLs from a nested guest */
 	svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL);
 
-	/*
-	 * We don't want a nested guest to be more powerful than the guest, so
-	 * all intercepts are ORed
-	 */
-	svm->vmcb->control.intercept_cr_read |=
-		nested_vmcb->control.intercept_cr_read;
-	svm->vmcb->control.intercept_cr_write |=
-		nested_vmcb->control.intercept_cr_write;
-	svm->vmcb->control.intercept_dr_read |=
-		nested_vmcb->control.intercept_dr_read;
-	svm->vmcb->control.intercept_dr_write |=
-		nested_vmcb->control.intercept_dr_write;
-	svm->vmcb->control.intercept_exceptions |=
-		nested_vmcb->control.intercept_exceptions;
-
-	svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
-
 	svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
 	svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
 	svm->vmcb->control.int_state = nested_vmcb->control.int_state;
@@ -2302,6 +2305,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	/* Enter Guest-Mode */
 	enter_guest_mode(&svm->vcpu);
 
+	/*
+	 * Merge guest and host intercepts - must be called  with vcpu in
+	 * guest-mode to take affect here
+	 */
+	recalc_intercepts(svm);
+
 	svm->nested.vmcb = vmcb_gpa;
 
 	enable_gif(svm);
-- 
cgit v0.10.2


From 4ee546b434504a618eac40421e595c68e494da9f Mon Sep 17 00:00:00 2001
From: "Roedel, Joerg" <Joerg.Roedel@amd.com>
Date: Fri, 3 Dec 2010 10:50:51 +0100
Subject: KVM: SVM: Add manipulation functions for CRx intercepts

This patch wraps changes to the CRx intercepts of SVM into
separate functions to abstract nested-svm better and prepare
the implementation of the vmcb-clean-bits feature.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 0e83105..39f9ddf 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -51,8 +51,7 @@ enum {
 
 
 struct __attribute__ ((__packed__)) vmcb_control_area {
-	u16 intercept_cr_read;
-	u16 intercept_cr_write;
+	u32 intercept_cr;
 	u16 intercept_dr_read;
 	u16 intercept_dr_write;
 	u32 intercept_exceptions;
@@ -204,10 +203,14 @@ struct __attribute__ ((__packed__)) vmcb {
 #define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
 #define SVM_SELECTOR_CODE_MASK (1 << 3)
 
-#define INTERCEPT_CR0_MASK 1
-#define INTERCEPT_CR3_MASK (1 << 3)
-#define INTERCEPT_CR4_MASK (1 << 4)
-#define INTERCEPT_CR8_MASK (1 << 8)
+#define INTERCEPT_CR0_READ	0
+#define INTERCEPT_CR3_READ	3
+#define INTERCEPT_CR4_READ	4
+#define INTERCEPT_CR8_READ	8
+#define INTERCEPT_CR0_WRITE	(16 + 0)
+#define INTERCEPT_CR3_WRITE	(16 + 3)
+#define INTERCEPT_CR4_WRITE	(16 + 4)
+#define INTERCEPT_CR8_WRITE	(16 + 8)
 
 #define INTERCEPT_DR0_MASK 1
 #define INTERCEPT_DR1_MASK (1 << 1)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 74f89f0..1e7bb9c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -98,8 +98,7 @@ struct nested_state {
 	unsigned long vmexit_rax;
 
 	/* cache for intercepts of the guest */
-	u16 intercept_cr_read;
-	u16 intercept_cr_write;
+	u32 intercept_cr;
 	u16 intercept_dr_read;
 	u16 intercept_dr_write;
 	u32 intercept_exceptions;
@@ -204,14 +203,46 @@ static void recalc_intercepts(struct vcpu_svm *svm)
 	h = &svm->nested.hsave->control;
 	g = &svm->nested;
 
-	c->intercept_cr_read = h->intercept_cr_read | g->intercept_cr_read;
-	c->intercept_cr_write = h->intercept_cr_write | g->intercept_cr_write;
+	c->intercept_cr = h->intercept_cr | g->intercept_cr;
 	c->intercept_dr_read = h->intercept_dr_read | g->intercept_dr_read;
 	c->intercept_dr_write = h->intercept_dr_write | g->intercept_dr_write;
 	c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
 	c->intercept = h->intercept | g->intercept;
 }
 
+static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
+{
+	if (is_guest_mode(&svm->vcpu))
+		return svm->nested.hsave;
+	else
+		return svm->vmcb;
+}
+
+static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+	struct vmcb *vmcb = get_host_vmcb(svm);
+
+	vmcb->control.intercept_cr |= (1U << bit);
+
+	recalc_intercepts(svm);
+}
+
+static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+	struct vmcb *vmcb = get_host_vmcb(svm);
+
+	vmcb->control.intercept_cr &= ~(1U << bit);
+
+	recalc_intercepts(svm);
+}
+
+static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+	struct vmcb *vmcb = get_host_vmcb(svm);
+
+	return vmcb->control.intercept_cr & (1U << bit);
+}
+
 static inline void enable_gif(struct vcpu_svm *svm)
 {
 	svm->vcpu.arch.hflags |= HF_GIF_MASK;
@@ -766,15 +797,15 @@ static void init_vmcb(struct vcpu_svm *svm)
 	struct vmcb_save_area *save = &svm->vmcb->save;
 
 	svm->vcpu.fpu_active = 1;
+	svm->vcpu.arch.hflags = 0;
 
-	control->intercept_cr_read =	INTERCEPT_CR0_MASK |
-					INTERCEPT_CR3_MASK |
-					INTERCEPT_CR4_MASK;
-
-	control->intercept_cr_write =	INTERCEPT_CR0_MASK |
-					INTERCEPT_CR3_MASK |
-					INTERCEPT_CR4_MASK |
-					INTERCEPT_CR8_MASK;
+	set_cr_intercept(svm, INTERCEPT_CR0_READ);
+	set_cr_intercept(svm, INTERCEPT_CR3_READ);
+	set_cr_intercept(svm, INTERCEPT_CR4_READ);
+	set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+	set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
+	set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
+	set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 
 	control->intercept_dr_read =	INTERCEPT_DR0_MASK |
 					INTERCEPT_DR1_MASK |
@@ -875,8 +906,8 @@ static void init_vmcb(struct vcpu_svm *svm)
 		control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
 					(1ULL << INTERCEPT_INVLPG));
 		control->intercept_exceptions &= ~(1 << PF_VECTOR);
-		control->intercept_cr_read &= ~INTERCEPT_CR3_MASK;
-		control->intercept_cr_write &= ~INTERCEPT_CR3_MASK;
+		clr_cr_intercept(svm, INTERCEPT_CR3_READ);
+		clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
 		save->g_pat = 0x0007040600070406ULL;
 		save->cr3 = 0;
 		save->cr4 = 0;
@@ -1210,7 +1241,6 @@ static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
 
 static void update_cr0_intercept(struct vcpu_svm *svm)
 {
-	struct vmcb *vmcb = svm->vmcb;
 	ulong gcr0 = svm->vcpu.arch.cr0;
 	u64 *hcr0 = &svm->vmcb->save.cr0;
 
@@ -1222,25 +1252,11 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
 
 
 	if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
-		vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
-		vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
-		if (is_guest_mode(&svm->vcpu)) {
-			struct vmcb *hsave = svm->nested.hsave;
-
-			hsave->control.intercept_cr_read  &= ~INTERCEPT_CR0_MASK;
-			hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
-			vmcb->control.intercept_cr_read  |= svm->nested.intercept_cr_read;
-			vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
-		}
+		clr_cr_intercept(svm, INTERCEPT_CR0_READ);
+		clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
 	} else {
-		svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
-		svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
-		if (is_guest_mode(&svm->vcpu)) {
-			struct vmcb *hsave = svm->nested.hsave;
-
-			hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
-			hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
-		}
+		set_cr_intercept(svm, INTERCEPT_CR0_READ);
+		set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
 	}
 }
 
@@ -1901,15 +1917,9 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
 	case SVM_EXIT_IOIO:
 		vmexit = nested_svm_intercept_ioio(svm);
 		break;
-	case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
-		u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
-		if (svm->nested.intercept_cr_read & cr_bits)
-			vmexit = NESTED_EXIT_DONE;
-		break;
-	}
-	case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
-		u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0);
-		if (svm->nested.intercept_cr_write & cr_bits)
+	case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
+		u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
+		if (svm->nested.intercept_cr & bit)
 			vmexit = NESTED_EXIT_DONE;
 		break;
 	}
@@ -1966,8 +1976,7 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr
 	struct vmcb_control_area *dst  = &dst_vmcb->control;
 	struct vmcb_control_area *from = &from_vmcb->control;
 
-	dst->intercept_cr_read    = from->intercept_cr_read;
-	dst->intercept_cr_write   = from->intercept_cr_write;
+	dst->intercept_cr         = from->intercept_cr;
 	dst->intercept_dr_read    = from->intercept_dr_read;
 	dst->intercept_dr_write   = from->intercept_dr_write;
 	dst->intercept_exceptions = from->intercept_exceptions;
@@ -2189,8 +2198,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 			       nested_vmcb->control.event_inj,
 			       nested_vmcb->control.nested_ctl);
 
-	trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read,
-				    nested_vmcb->control.intercept_cr_write,
+	trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
+				    nested_vmcb->control.intercept_cr >> 16,
 				    nested_vmcb->control.intercept_exceptions,
 				    nested_vmcb->control.intercept);
 
@@ -2270,8 +2279,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	svm->nested.vmcb_iopm  = nested_vmcb->control.iopm_base_pa  & ~0x0fffULL;
 
 	/* cache intercepts */
-	svm->nested.intercept_cr_read    = nested_vmcb->control.intercept_cr_read;
-	svm->nested.intercept_cr_write   = nested_vmcb->control.intercept_cr_write;
+	svm->nested.intercept_cr         = nested_vmcb->control.intercept_cr;
 	svm->nested.intercept_dr_read    = nested_vmcb->control.intercept_dr_read;
 	svm->nested.intercept_dr_write   = nested_vmcb->control.intercept_dr_write;
 	svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
@@ -2286,8 +2294,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 
 	if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
 		/* We only want the cr8 intercept bits of the guest */
-		svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK;
-		svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
+		clr_cr_intercept(svm, INTERCEPT_CR8_READ);
+		clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 	}
 
 	/* We don't want to see VMMCALLs from a nested guest */
@@ -2579,7 +2587,7 @@ static int cr8_write_interception(struct vcpu_svm *svm)
 	/* instruction emulation calls kvm_set_cr8() */
 	emulate_instruction(&svm->vcpu, 0, 0, 0);
 	if (irqchip_in_kernel(svm->vcpu.kvm)) {
-		svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
+		clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 		return 1;
 	}
 	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
@@ -2896,8 +2904,8 @@ void dump_vmcb(struct kvm_vcpu *vcpu)
 	struct vmcb_save_area *save = &svm->vmcb->save;
 
 	pr_err("VMCB Control Area:\n");
-	pr_err("cr_read:            %04x\n", control->intercept_cr_read);
-	pr_err("cr_write:           %04x\n", control->intercept_cr_write);
+	pr_err("cr_read:            %04x\n", control->intercept_cr & 0xffff);
+	pr_err("cr_write:           %04x\n", control->intercept_cr >> 16);
 	pr_err("dr_read:            %04x\n", control->intercept_dr_read);
 	pr_err("dr_write:           %04x\n", control->intercept_dr_write);
 	pr_err("exceptions:         %08x\n", control->intercept_exceptions);
@@ -2998,7 +3006,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 
 	trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
 
-	if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
+	if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
 		vcpu->arch.cr0 = svm->vmcb->save.cr0;
 	if (npt_enabled)
 		vcpu->arch.cr3 = svm->vmcb->save.cr3;
@@ -3124,7 +3132,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 		return;
 
 	if (tpr >= irr)
-		svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
+		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 }
 
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -3231,7 +3239,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
 	if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
 		return;
 
-	if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
+	if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
 		int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
 		kvm_set_cr8(vcpu, cr8);
 	}
-- 
cgit v0.10.2


From 3aed041a4c1b78cac87db76cf264b081df64dd37 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 30 Nov 2010 18:03:58 +0100
Subject: KVM: SVM: Add manipulation functions for DRx intercepts

This patch wraps changes to the DRx intercepts of SVM into
separate functions to abstract nested-svm better and prepare
the implementation of the vmcb-clean-bits feature.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 39f9ddf..11dbca7 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -52,8 +52,7 @@ enum {
 
 struct __attribute__ ((__packed__)) vmcb_control_area {
 	u32 intercept_cr;
-	u16 intercept_dr_read;
-	u16 intercept_dr_write;
+	u32 intercept_dr;
 	u32 intercept_exceptions;
 	u64 intercept;
 	u8 reserved_1[42];
@@ -212,14 +211,22 @@ struct __attribute__ ((__packed__)) vmcb {
 #define INTERCEPT_CR4_WRITE	(16 + 4)
 #define INTERCEPT_CR8_WRITE	(16 + 8)
 
-#define INTERCEPT_DR0_MASK 1
-#define INTERCEPT_DR1_MASK (1 << 1)
-#define INTERCEPT_DR2_MASK (1 << 2)
-#define INTERCEPT_DR3_MASK (1 << 3)
-#define INTERCEPT_DR4_MASK (1 << 4)
-#define INTERCEPT_DR5_MASK (1 << 5)
-#define INTERCEPT_DR6_MASK (1 << 6)
-#define INTERCEPT_DR7_MASK (1 << 7)
+#define INTERCEPT_DR0_READ	0
+#define INTERCEPT_DR1_READ	1
+#define INTERCEPT_DR2_READ	2
+#define INTERCEPT_DR3_READ	3
+#define INTERCEPT_DR4_READ	4
+#define INTERCEPT_DR5_READ	5
+#define INTERCEPT_DR6_READ	6
+#define INTERCEPT_DR7_READ	7
+#define INTERCEPT_DR0_WRITE	(16 + 0)
+#define INTERCEPT_DR1_WRITE	(16 + 1)
+#define INTERCEPT_DR2_WRITE	(16 + 2)
+#define INTERCEPT_DR3_WRITE	(16 + 3)
+#define INTERCEPT_DR4_WRITE	(16 + 4)
+#define INTERCEPT_DR5_WRITE	(16 + 5)
+#define INTERCEPT_DR6_WRITE	(16 + 6)
+#define INTERCEPT_DR7_WRITE	(16 + 7)
 
 #define SVM_EVTINJ_VEC_MASK 0xff
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1e7bb9c..a7c38eb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -99,8 +99,7 @@ struct nested_state {
 
 	/* cache for intercepts of the guest */
 	u32 intercept_cr;
-	u16 intercept_dr_read;
-	u16 intercept_dr_write;
+	u32 intercept_dr;
 	u32 intercept_exceptions;
 	u64 intercept;
 
@@ -204,8 +203,7 @@ static void recalc_intercepts(struct vcpu_svm *svm)
 	g = &svm->nested;
 
 	c->intercept_cr = h->intercept_cr | g->intercept_cr;
-	c->intercept_dr_read = h->intercept_dr_read | g->intercept_dr_read;
-	c->intercept_dr_write = h->intercept_dr_write | g->intercept_dr_write;
+	c->intercept_dr = h->intercept_dr | g->intercept_dr;
 	c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
 	c->intercept = h->intercept | g->intercept;
 }
@@ -243,6 +241,24 @@ static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
 	return vmcb->control.intercept_cr & (1U << bit);
 }
 
+static inline void set_dr_intercept(struct vcpu_svm *svm, int bit)
+{
+	struct vmcb *vmcb = get_host_vmcb(svm);
+
+	vmcb->control.intercept_dr |= (1U << bit);
+
+	recalc_intercepts(svm);
+}
+
+static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
+{
+	struct vmcb *vmcb = get_host_vmcb(svm);
+
+	vmcb->control.intercept_dr &= ~(1U << bit);
+
+	recalc_intercepts(svm);
+}
+
 static inline void enable_gif(struct vcpu_svm *svm)
 {
 	svm->vcpu.arch.hflags |= HF_GIF_MASK;
@@ -807,23 +823,23 @@ static void init_vmcb(struct vcpu_svm *svm)
 	set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
 	set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 
-	control->intercept_dr_read =	INTERCEPT_DR0_MASK |
-					INTERCEPT_DR1_MASK |
-					INTERCEPT_DR2_MASK |
-					INTERCEPT_DR3_MASK |
-					INTERCEPT_DR4_MASK |
-					INTERCEPT_DR5_MASK |
-					INTERCEPT_DR6_MASK |
-					INTERCEPT_DR7_MASK;
-
-	control->intercept_dr_write =	INTERCEPT_DR0_MASK |
-					INTERCEPT_DR1_MASK |
-					INTERCEPT_DR2_MASK |
-					INTERCEPT_DR3_MASK |
-					INTERCEPT_DR4_MASK |
-					INTERCEPT_DR5_MASK |
-					INTERCEPT_DR6_MASK |
-					INTERCEPT_DR7_MASK;
+	set_dr_intercept(svm, INTERCEPT_DR0_READ);
+	set_dr_intercept(svm, INTERCEPT_DR1_READ);
+	set_dr_intercept(svm, INTERCEPT_DR2_READ);
+	set_dr_intercept(svm, INTERCEPT_DR3_READ);
+	set_dr_intercept(svm, INTERCEPT_DR4_READ);
+	set_dr_intercept(svm, INTERCEPT_DR5_READ);
+	set_dr_intercept(svm, INTERCEPT_DR6_READ);
+	set_dr_intercept(svm, INTERCEPT_DR7_READ);
+
+	set_dr_intercept(svm, INTERCEPT_DR0_WRITE);
+	set_dr_intercept(svm, INTERCEPT_DR1_WRITE);
+	set_dr_intercept(svm, INTERCEPT_DR2_WRITE);
+	set_dr_intercept(svm, INTERCEPT_DR3_WRITE);
+	set_dr_intercept(svm, INTERCEPT_DR4_WRITE);
+	set_dr_intercept(svm, INTERCEPT_DR5_WRITE);
+	set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
+	set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
 
 	control->intercept_exceptions = (1 << PF_VECTOR) |
 					(1 << UD_VECTOR) |
@@ -1923,15 +1939,9 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
 			vmexit = NESTED_EXIT_DONE;
 		break;
 	}
-	case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
-		u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
-		if (svm->nested.intercept_dr_read & dr_bits)
-			vmexit = NESTED_EXIT_DONE;
-		break;
-	}
-	case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
-		u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
-		if (svm->nested.intercept_dr_write & dr_bits)
+	case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
+		u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
+		if (svm->nested.intercept_dr & bit)
 			vmexit = NESTED_EXIT_DONE;
 		break;
 	}
@@ -1977,8 +1987,7 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr
 	struct vmcb_control_area *from = &from_vmcb->control;
 
 	dst->intercept_cr         = from->intercept_cr;
-	dst->intercept_dr_read    = from->intercept_dr_read;
-	dst->intercept_dr_write   = from->intercept_dr_write;
+	dst->intercept_dr         = from->intercept_dr;
 	dst->intercept_exceptions = from->intercept_exceptions;
 	dst->intercept            = from->intercept;
 	dst->iopm_base_pa         = from->iopm_base_pa;
@@ -2280,8 +2289,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 
 	/* cache intercepts */
 	svm->nested.intercept_cr         = nested_vmcb->control.intercept_cr;
-	svm->nested.intercept_dr_read    = nested_vmcb->control.intercept_dr_read;
-	svm->nested.intercept_dr_write   = nested_vmcb->control.intercept_dr_write;
+	svm->nested.intercept_dr         = nested_vmcb->control.intercept_dr;
 	svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
 	svm->nested.intercept            = nested_vmcb->control.intercept;
 
@@ -2906,8 +2914,8 @@ void dump_vmcb(struct kvm_vcpu *vcpu)
 	pr_err("VMCB Control Area:\n");
 	pr_err("cr_read:            %04x\n", control->intercept_cr & 0xffff);
 	pr_err("cr_write:           %04x\n", control->intercept_cr >> 16);
-	pr_err("dr_read:            %04x\n", control->intercept_dr_read);
-	pr_err("dr_write:           %04x\n", control->intercept_dr_write);
+	pr_err("dr_read:            %04x\n", control->intercept_dr & 0xffff);
+	pr_err("dr_write:           %04x\n", control->intercept_dr >> 16);
 	pr_err("exceptions:         %08x\n", control->intercept_exceptions);
 	pr_err("intercepts:         %016llx\n", control->intercept);
 	pr_err("pause filter count: %d\n", control->pause_filter_count);
-- 
cgit v0.10.2


From 18c918c5f59bc35f9c567689daef8c255b575fdc Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 30 Nov 2010 18:03:59 +0100
Subject: KVM: SVM: Add manipulation functions for exception intercepts

This patch wraps changes to the exception intercepts of SVM
into separate functions to abstract nested-svm better and
prepare the implementation of the vmcb-clean-bits feature.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a7c38eb..d69ec44 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -259,6 +259,24 @@ static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
 	recalc_intercepts(svm);
 }
 
+static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
+{
+	struct vmcb *vmcb = get_host_vmcb(svm);
+
+	vmcb->control.intercept_exceptions |= (1U << bit);
+
+	recalc_intercepts(svm);
+}
+
+static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
+{
+	struct vmcb *vmcb = get_host_vmcb(svm);
+
+	vmcb->control.intercept_exceptions &= ~(1U << bit);
+
+	recalc_intercepts(svm);
+}
+
 static inline void enable_gif(struct vcpu_svm *svm)
 {
 	svm->vcpu.arch.hflags |= HF_GIF_MASK;
@@ -841,10 +859,9 @@ static void init_vmcb(struct vcpu_svm *svm)
 	set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
 	set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
 
-	control->intercept_exceptions = (1 << PF_VECTOR) |
-					(1 << UD_VECTOR) |
-					(1 << MC_VECTOR);
-
+	set_exception_intercept(svm, PF_VECTOR);
+	set_exception_intercept(svm, UD_VECTOR);
+	set_exception_intercept(svm, MC_VECTOR);
 
 	control->intercept =	(1ULL << INTERCEPT_INTR) |
 				(1ULL << INTERCEPT_NMI) |
@@ -921,7 +938,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 		control->nested_ctl = 1;
 		control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
 					(1ULL << INTERCEPT_INVLPG));
-		control->intercept_exceptions &= ~(1 << PF_VECTOR);
+		clr_exception_intercept(svm, PF_VECTOR);
 		clr_cr_intercept(svm, INTERCEPT_CR3_READ);
 		clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
 		save->g_pat = 0x0007040600070406ULL;
@@ -1382,20 +1399,18 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	svm->vmcb->control.intercept_exceptions &=
-		~((1 << DB_VECTOR) | (1 << BP_VECTOR));
+	clr_exception_intercept(svm, DB_VECTOR);
+	clr_exception_intercept(svm, BP_VECTOR);
 
 	if (svm->nmi_singlestep)
-		svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
+		set_exception_intercept(svm, DB_VECTOR);
 
 	if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
 		if (vcpu->guest_debug &
 		    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
-			svm->vmcb->control.intercept_exceptions |=
-				1 << DB_VECTOR;
+			set_exception_intercept(svm, DB_VECTOR);
 		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
-			svm->vmcb->control.intercept_exceptions |=
-				1 << BP_VECTOR;
+			set_exception_intercept(svm, BP_VECTOR);
 	} else
 		vcpu->guest_debug = 0;
 }
@@ -1516,21 +1531,8 @@ static int ud_interception(struct vcpu_svm *svm)
 static void svm_fpu_activate(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
-	u32 excp;
-
-	if (is_guest_mode(vcpu)) {
-		u32 h_excp, n_excp;
 
-		h_excp  = svm->nested.hsave->control.intercept_exceptions;
-		n_excp  = svm->nested.intercept_exceptions;
-		h_excp &= ~(1 << NM_VECTOR);
-		excp    = h_excp | n_excp;
-	} else {
-		excp  = svm->vmcb->control.intercept_exceptions;
-		excp &= ~(1 << NM_VECTOR);
-	}
-
-	svm->vmcb->control.intercept_exceptions = excp;
+	clr_exception_intercept(svm, NM_VECTOR);
 
 	svm->vcpu.fpu_active = 1;
 	update_cr0_intercept(svm);
@@ -3643,9 +3645,7 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
-	if (is_guest_mode(vcpu))
-		svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
+	set_exception_intercept(svm, NM_VECTOR);
 	update_cr0_intercept(svm);
 }
 
-- 
cgit v0.10.2


From 8a05a1b8693371547bbb2d06f842595cebd16687 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 30 Nov 2010 18:04:00 +0100
Subject: KVM: SVM: Add manipulation functions for misc intercepts

This patch wraps changes to the misc intercepts of SVM
into separate functions to abstract nested-svm better and
prepare the implementation of the vmcb-clean-bits feature.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d69ec44..cde5392 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -277,6 +277,24 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
 	recalc_intercepts(svm);
 }
 
+static inline void set_intercept(struct vcpu_svm *svm, int bit)
+{
+	struct vmcb *vmcb = get_host_vmcb(svm);
+
+	vmcb->control.intercept |= (1ULL << bit);
+
+	recalc_intercepts(svm);
+}
+
+static inline void clr_intercept(struct vcpu_svm *svm, int bit)
+{
+	struct vmcb *vmcb = get_host_vmcb(svm);
+
+	vmcb->control.intercept &= ~(1ULL << bit);
+
+	recalc_intercepts(svm);
+}
+
 static inline void enable_gif(struct vcpu_svm *svm)
 {
 	svm->vcpu.arch.hflags |= HF_GIF_MASK;
@@ -863,29 +881,29 @@ static void init_vmcb(struct vcpu_svm *svm)
 	set_exception_intercept(svm, UD_VECTOR);
 	set_exception_intercept(svm, MC_VECTOR);
 
-	control->intercept =	(1ULL << INTERCEPT_INTR) |
-				(1ULL << INTERCEPT_NMI) |
-				(1ULL << INTERCEPT_SMI) |
-				(1ULL << INTERCEPT_SELECTIVE_CR0) |
-				(1ULL << INTERCEPT_CPUID) |
-				(1ULL << INTERCEPT_INVD) |
-				(1ULL << INTERCEPT_HLT) |
-				(1ULL << INTERCEPT_INVLPG) |
-				(1ULL << INTERCEPT_INVLPGA) |
-				(1ULL << INTERCEPT_IOIO_PROT) |
-				(1ULL << INTERCEPT_MSR_PROT) |
-				(1ULL << INTERCEPT_TASK_SWITCH) |
-				(1ULL << INTERCEPT_SHUTDOWN) |
-				(1ULL << INTERCEPT_VMRUN) |
-				(1ULL << INTERCEPT_VMMCALL) |
-				(1ULL << INTERCEPT_VMLOAD) |
-				(1ULL << INTERCEPT_VMSAVE) |
-				(1ULL << INTERCEPT_STGI) |
-				(1ULL << INTERCEPT_CLGI) |
-				(1ULL << INTERCEPT_SKINIT) |
-				(1ULL << INTERCEPT_WBINVD) |
-				(1ULL << INTERCEPT_MONITOR) |
-				(1ULL << INTERCEPT_MWAIT);
+	set_intercept(svm, INTERCEPT_INTR);
+	set_intercept(svm, INTERCEPT_NMI);
+	set_intercept(svm, INTERCEPT_SMI);
+	set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+	set_intercept(svm, INTERCEPT_CPUID);
+	set_intercept(svm, INTERCEPT_INVD);
+	set_intercept(svm, INTERCEPT_HLT);
+	set_intercept(svm, INTERCEPT_INVLPG);
+	set_intercept(svm, INTERCEPT_INVLPGA);
+	set_intercept(svm, INTERCEPT_IOIO_PROT);
+	set_intercept(svm, INTERCEPT_MSR_PROT);
+	set_intercept(svm, INTERCEPT_TASK_SWITCH);
+	set_intercept(svm, INTERCEPT_SHUTDOWN);
+	set_intercept(svm, INTERCEPT_VMRUN);
+	set_intercept(svm, INTERCEPT_VMMCALL);
+	set_intercept(svm, INTERCEPT_VMLOAD);
+	set_intercept(svm, INTERCEPT_VMSAVE);
+	set_intercept(svm, INTERCEPT_STGI);
+	set_intercept(svm, INTERCEPT_CLGI);
+	set_intercept(svm, INTERCEPT_SKINIT);
+	set_intercept(svm, INTERCEPT_WBINVD);
+	set_intercept(svm, INTERCEPT_MONITOR);
+	set_intercept(svm, INTERCEPT_MWAIT);
 
 	control->iopm_base_pa = iopm_base;
 	control->msrpm_base_pa = __pa(svm->msrpm);
@@ -936,8 +954,8 @@ static void init_vmcb(struct vcpu_svm *svm)
 	if (npt_enabled) {
 		/* Setup VMCB for Nested Paging */
 		control->nested_ctl = 1;
-		control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
-					(1ULL << INTERCEPT_INVLPG));
+		clr_intercept(svm, INTERCEPT_TASK_SWITCH);
+		clr_intercept(svm, INTERCEPT_INVLPG);
 		clr_exception_intercept(svm, PF_VECTOR);
 		clr_cr_intercept(svm, INTERCEPT_CR3_READ);
 		clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
@@ -952,7 +970,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 
 	if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
 		control->pause_filter_count = 3000;
-		control->intercept |= (1ULL << INTERCEPT_PAUSE);
+		set_intercept(svm, INTERCEPT_PAUSE);
 	}
 
 	enable_gif(svm);
@@ -1126,12 +1144,12 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 
 static void svm_set_vintr(struct vcpu_svm *svm)
 {
-	svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR;
+	set_intercept(svm, INTERCEPT_VINTR);
 }
 
 static void svm_clear_vintr(struct vcpu_svm *svm)
 {
-	svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
+	clr_intercept(svm, INTERCEPT_VINTR);
 }
 
 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
@@ -2309,7 +2327,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	}
 
 	/* We don't want to see VMMCALLs from a nested guest */
-	svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL);
+	clr_intercept(svm, INTERCEPT_VMMCALL);
 
 	svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
 	svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
@@ -2557,7 +2575,7 @@ static int cpuid_interception(struct vcpu_svm *svm)
 static int iret_interception(struct vcpu_svm *svm)
 {
 	++svm->vcpu.stat.nmi_window_exits;
-	svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
+	clr_intercept(svm, INTERCEPT_IRET);
 	svm->vcpu.arch.hflags |= HF_IRET_MASK;
 	return 1;
 }
@@ -3103,7 +3121,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
 
 	svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
 	vcpu->arch.hflags |= HF_NMI_MASK;
-	svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
+	set_intercept(svm, INTERCEPT_IRET);
 	++vcpu->stat.nmi_injections;
 }
 
@@ -3170,10 +3188,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 
 	if (masked) {
 		svm->vcpu.arch.hflags |= HF_NMI_MASK;
-		svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
+		set_intercept(svm, INTERCEPT_IRET);
 	} else {
 		svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
-		svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
+		clr_intercept(svm, INTERCEPT_IRET);
 	}
 }
 
-- 
cgit v0.10.2


From 4cc703100b1fd97c71804e2fa5c9c377c5b37dc5 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 30 Nov 2010 18:04:01 +0100
Subject: KVM: SVM: Use get_host_vmcb function in svm_get_msr for TSC

This patch replaces the open-coded vmcb-selection for the
TSC calculation with the new get_host_vmcb helper function
introduced in this patchset.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index cde5392..5038786 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2630,14 +2630,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 
 	switch (ecx) {
 	case MSR_IA32_TSC: {
-		u64 tsc_offset;
+		struct vmcb *vmcb = get_host_vmcb(svm);
 
-		if (is_guest_mode(vcpu))
-			tsc_offset = svm->nested.hsave->control.tsc_offset;
-		else
-			tsc_offset = svm->vmcb->control.tsc_offset;
-
-		*data = tsc_offset + native_read_tsc();
+		*data = vmcb->control.tsc_offset + native_read_tsc();
 		break;
 	}
 	case MSR_STAR:
-- 
cgit v0.10.2


From b7c4145ba2eb0717db0ddac1b5f7f48012189c53 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 2 Dec 2010 17:52:50 +0200
Subject: KVM: Don't spin on virt instruction faults during reboot

Since vmx blocks INIT signals, we disable virtualization extensions during
reboot.  This leads to virtualization instructions faulting; we trap these
faults and spin while the reboot continues.

Unfortunately spinning on a non-preemptible kernel may block a task that
reboot depends on; this causes the reboot to hang.

Fix by skipping over the instruction and hoping for the best.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 56e45a2..d968cc5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -786,14 +786,18 @@ enum {
  * reboot turns off virtualization while processes are running.
  * Trap the fault and ignore the instruction if that happens.
  */
-asmlinkage void kvm_handle_fault_on_reboot(void);
+asmlinkage void kvm_spurious_fault(void);
+extern bool kvm_rebooting;
 
 #define __kvm_handle_fault_on_reboot(insn) \
 	"666: " insn "\n\t" \
+	"668: \n\t"                           \
 	".pushsection .fixup, \"ax\" \n" \
 	"667: \n\t" \
+	"cmpb $0, kvm_rebooting \n\t"	      \
+	"jne 668b \n\t"      		      \
 	__ASM_SIZE(push) " $666b \n\t"	      \
-	"jmp kvm_handle_fault_on_reboot \n\t" \
+	"call kvm_spurious_fault \n\t"	      \
 	".popsection \n\t" \
 	".pushsection __ex_table, \"a\" \n\t" \
 	_ASM_PTR " 666b, 667b \n\t" \
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ee99b77..7f68625 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -90,7 +90,8 @@ static void hardware_disable_all(void);
 
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 
-static bool kvm_rebooting;
+bool kvm_rebooting;
+EXPORT_SYMBOL_GPL(kvm_rebooting);
 
 static bool largepages_enabled = true;
 
@@ -2171,18 +2172,12 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
 }
 
 
-asmlinkage void kvm_handle_fault_on_reboot(void)
+asmlinkage void kvm_spurious_fault(void)
 {
-	if (kvm_rebooting) {
-		/* spin while reset goes on */
-		local_irq_enable();
-		while (true)
-			cpu_relax();
-	}
 	/* Fault while not rebooting.  We want the trace. */
 	BUG();
 }
-EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);
+EXPORT_SYMBOL_GPL(kvm_spurious_fault);
 
 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
 		      void *v)
-- 
cgit v0.10.2


From a295673aba42895997a6c1be87f467a7cfc0f332 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 2 Dec 2010 17:55:23 +0200
Subject: KVM: VMX: Return 0 from a failed VMREAD

If we execute VMREAD during reboot we'll just skip over it.  Instead of
returning garbage, return 0, which has a much smaller chance of confusing
the code.  Otherwise we risk a flood of debug printk()s which block the
reboot process if a serial console or netconsole is enabled.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ab05ff6..f3693ca 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -565,10 +565,10 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
 
 static unsigned long vmcs_readl(unsigned long field)
 {
-	unsigned long value;
+	unsigned long value = 0;
 
 	asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
-		      : "=a"(value) : "d"(field) : "cc");
+		      : "+a"(value) : "d"(field) : "cc");
 	return value;
 }
 
-- 
cgit v0.10.2


From 700e1b12196c4b01524ca10d89f8731418d72b6e Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Mon, 6 Dec 2010 01:11:33 +0900
Subject: KVM: MMU: Avoid dropping accessed bit while removing write access

One more "KVM: MMU: Don't drop accessed bit while updating an spte."

Sptes are accessed by both kvm and hardware.
This patch uses update_spte() to fix the way of removing write access.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d359500..482a5c0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3447,7 +3447,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
 			/* avoid RMW */
 			if (is_writable_pte(pt[i]))
-				pt[i] &= ~PT_WRITABLE_MASK;
+				update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
 	}
 	kvm_flush_remote_tlbs(kvm);
 }
-- 
cgit v0.10.2


From 8d28fec406e4d5ce6c109fe12699976e72e9748e Mon Sep 17 00:00:00 2001
From: "Roedel, Joerg" <Joerg.Roedel@amd.com>
Date: Fri, 3 Dec 2010 13:15:21 +0100
Subject: KVM: SVM: Add clean-bits infrastructure code

This patch adds the infrastructure for the implementation of
the individual clean-bits.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 11dbca7..235dd73 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -79,7 +79,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 	u32 event_inj_err;
 	u64 nested_cr3;
 	u64 lbr_ctl;
-	u64 reserved_5;
+	u32 clean;
+	u32 reserved_5;
 	u64 next_rip;
 	u8 reserved_6[816];
 };
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5038786..e73cbc3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -185,6 +185,28 @@ static int nested_svm_vmexit(struct vcpu_svm *svm);
 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 				      bool has_error_code, u32 error_code);
 
+enum {
+	VMCB_DIRTY_MAX,
+};
+
+#define VMCB_ALWAYS_DIRTY_MASK	0U
+
+static inline void mark_all_dirty(struct vmcb *vmcb)
+{
+	vmcb->control.clean = 0;
+}
+
+static inline void mark_all_clean(struct vmcb *vmcb)
+{
+	vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
+			       & ~VMCB_ALWAYS_DIRTY_MASK;
+}
+
+static inline void mark_dirty(struct vmcb *vmcb, int bit)
+{
+	vmcb->control.clean &= ~(1 << bit);
+}
+
 static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 {
 	return container_of(vcpu, struct vcpu_svm, vcpu);
@@ -973,6 +995,8 @@ static void init_vmcb(struct vcpu_svm *svm)
 		set_intercept(svm, INTERCEPT_PAUSE);
 	}
 
+	mark_all_dirty(svm->vmcb);
+
 	enable_gif(svm);
 }
 
@@ -1089,6 +1113,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 	if (unlikely(cpu != vcpu->cpu)) {
 		svm->asid_generation = 0;
+		mark_all_dirty(svm->vmcb);
 	}
 
 #ifdef CONFIG_X86_64
@@ -2140,6 +2165,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	svm->vmcb->save.cpl = 0;
 	svm->vmcb->control.exit_int_info = 0;
 
+	mark_all_dirty(svm->vmcb);
+
 	nested_svm_unmap(page);
 
 	nested_svm_uninit_mmu_context(&svm->vcpu);
@@ -2351,6 +2378,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 
 	enable_gif(svm);
 
+	mark_all_dirty(svm->vmcb);
+
 	return true;
 }
 
@@ -3490,6 +3519,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	if (unlikely(svm->vmcb->control.exit_code ==
 		     SVM_EXIT_EXCP_BASE + MC_VECTOR))
 		svm_handle_mce(svm);
+
+	mark_all_clean(svm->vmcb);
 }
 
 #undef R
-- 
cgit v0.10.2


From 116a0a23676e7d29b7747a53f819baaaf4b88575 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 3 Dec 2010 11:45:49 +0100
Subject: KVM: SVM: Add clean-bit for intercepts, tsc-offset and pause filter
 count

This patch adds the clean-bit for intercepts-vectors, the
TSC offset and the pause-filter count to the appropriate
places. The IO and MSR permission bitmaps are not subject to
this bit.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e73cbc3..052561c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -186,6 +186,8 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 				      bool has_error_code, u32 error_code);
 
 enum {
+	VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
+			    pause filter count */
 	VMCB_DIRTY_MAX,
 };
 
@@ -217,6 +219,8 @@ static void recalc_intercepts(struct vcpu_svm *svm)
 	struct vmcb_control_area *c, *h;
 	struct nested_state *g;
 
+	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
 	if (!is_guest_mode(&svm->vcpu))
 		return;
 
@@ -854,6 +858,8 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 	}
 
 	svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
+
+	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 }
 
 static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
@@ -863,6 +869,7 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
 	svm->vmcb->control.tsc_offset += adjustment;
 	if (is_guest_mode(vcpu))
 		svm->nested.hsave->control.tsc_offset += adjustment;
+	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 }
 
 static void init_vmcb(struct vcpu_svm *svm)
-- 
cgit v0.10.2


From f56838e4c38cbbc428ae276783f5dbb225e58817 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 3 Dec 2010 11:45:50 +0100
Subject: KVM: SVM: Add clean-bit for IOPM_BASE and MSRPM_BASE

This patch adds the clean bit for the physical addresses of
the MSRPM and the IOPM. It does not need to be marked dirty
explicitly in the code because the only place where these
values are changed is the nested-svm vmrun and vmexit path.
These functions already mark the complete VMCB as dirty.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 052561c..284c34c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -188,6 +188,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 enum {
 	VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
 			    pause filter count */
+	VMCB_PERM_MAP,   /* IOPM Base and MSRPM Base */
 	VMCB_DIRTY_MAX,
 };
 
-- 
cgit v0.10.2


From d48086d1e316e0cefd69b6d2ce75a42856cfba57 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 3 Dec 2010 11:45:51 +0100
Subject: KVM: SVM: Add clean-bit for the ASID

This patch implements the clean-bit for the asid in the
vmcb.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 284c34c..e188d9e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -189,6 +189,7 @@ enum {
 	VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
 			    pause filter count */
 	VMCB_PERM_MAP,   /* IOPM Base and MSRPM Base */
+	VMCB_ASID,	 /* ASID */
 	VMCB_DIRTY_MAX,
 };
 
@@ -1488,6 +1489,8 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
 
 	svm->asid_generation = sd->asid_generation;
 	svm->vmcb->control.asid = sd->next_asid++;
+
+	mark_dirty(svm->vmcb, VMCB_ASID);
 }
 
 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
-- 
cgit v0.10.2


From decdbf6a4c4f5cf14343946031aff24ff815c909 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 3 Dec 2010 11:45:52 +0100
Subject: KVM: SVM: Add clean-bit for interrupt state

This patch implements the clean-bit for all interrupt
related state in the vmcb. This corresponds to vmcb offset
0x60-0x67.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e188d9e..748569e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -190,10 +190,12 @@ enum {
 			    pause filter count */
 	VMCB_PERM_MAP,   /* IOPM Base and MSRPM Base */
 	VMCB_ASID,	 /* ASID */
+	VMCB_INTR,	 /* int_ctl, int_vector */
 	VMCB_DIRTY_MAX,
 };
 
-#define VMCB_ALWAYS_DIRTY_MASK	0U
+/* TPR is always written before VMRUN */
+#define VMCB_ALWAYS_DIRTY_MASK	(1U << VMCB_INTR)
 
 static inline void mark_all_dirty(struct vmcb *vmcb)
 {
@@ -2508,6 +2510,8 @@ static int clgi_interception(struct vcpu_svm *svm)
 	svm_clear_vintr(svm);
 	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
 
+	mark_dirty(svm->vmcb, VMCB_INTR);
+
 	return 1;
 }
 
@@ -2878,6 +2882,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
 	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
 	svm_clear_vintr(svm);
 	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+	mark_dirty(svm->vmcb, VMCB_INTR);
 	/*
 	 * If the user space waits to inject interrupts, exit as soon as
 	 * possible
@@ -3169,6 +3174,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
 	control->int_ctl &= ~V_INTR_PRIO_MASK;
 	control->int_ctl |= V_IRQ_MASK |
 		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
+	mark_dirty(svm->vmcb, VMCB_INTR);
 }
 
 static void svm_set_irq(struct kvm_vcpu *vcpu)
-- 
cgit v0.10.2


From b2747166dc315b31281fb659a5b8938873d5f1d7 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 3 Dec 2010 11:45:53 +0100
Subject: KVM: SVM: Add clean-bit for NPT state

This patch implements the clean-bit for all nested paging
related state in the vmcb.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 748569e..5038201 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -191,6 +191,7 @@ enum {
 	VMCB_PERM_MAP,   /* IOPM Base and MSRPM Base */
 	VMCB_ASID,	 /* ASID */
 	VMCB_INTR,	 /* int_ctl, int_vector */
+	VMCB_NPT,        /* npt_en, nCR3, gPAT */
 	VMCB_DIRTY_MAX,
 };
 
@@ -1749,6 +1750,7 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	svm->vmcb->control.nested_cr3 = root;
+	mark_dirty(svm->vmcb, VMCB_NPT);
 	force_new_asid(vcpu);
 }
 
@@ -3555,6 +3557,7 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	svm->vmcb->control.nested_cr3 = root;
+	mark_dirty(svm->vmcb, VMCB_NPT);
 
 	/* Also sync guest cr3 here in case we live migrate */
 	svm->vmcb->save.cr3 = vcpu->arch.cr3;
-- 
cgit v0.10.2


From dcca1a6506123cd47af334b7ee2a4b0288196389 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 3 Dec 2010 11:45:54 +0100
Subject: KVM: SVM: Add clean-bit for control registers

This patch implements the CRx clean-bit for the vmcb. This
bit covers cr0, cr3, cr4, and efer.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5038201..0f55b8a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -192,6 +192,7 @@ enum {
 	VMCB_ASID,	 /* ASID */
 	VMCB_INTR,	 /* int_ctl, int_vector */
 	VMCB_NPT,        /* npt_en, nCR3, gPAT */
+	VMCB_CR,	 /* CR0, CR3, CR4, EFER */
 	VMCB_DIRTY_MAX,
 };
 
@@ -441,6 +442,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 		efer &= ~EFER_LME;
 
 	to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
+	mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
 }
 
 static int is_external_interrupt(u32 info)
@@ -1338,6 +1340,7 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
 		*hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
 			| (gcr0 & SVM_CR0_SELECTIVE_MASK);
 
+	mark_dirty(svm->vmcb, VMCB_CR);
 
 	if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
 		clr_cr_intercept(svm, INTERCEPT_CR0_READ);
@@ -1404,6 +1407,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	 */
 	cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
 	svm->vmcb->save.cr0 = cr0;
+	mark_dirty(svm->vmcb, VMCB_CR);
 	update_cr0_intercept(svm);
 }
 
@@ -1420,6 +1424,7 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		cr4 |= X86_CR4_PAE;
 	cr4 |= host_cr4_mce;
 	to_svm(vcpu)->vmcb->save.cr4 = cr4;
+	mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
 }
 
 static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -3549,6 +3554,7 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	svm->vmcb->save.cr3 = root;
+	mark_dirty(svm->vmcb, VMCB_CR);
 	force_new_asid(vcpu);
 }
 
@@ -3561,6 +3567,7 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 
 	/* Also sync guest cr3 here in case we live migrate */
 	svm->vmcb->save.cr3 = vcpu->arch.cr3;
+	mark_dirty(svm->vmcb, VMCB_CR);
 
 	force_new_asid(vcpu);
 }
-- 
cgit v0.10.2


From 72214b9601f2b6c8343ea57b0e405f9da7a92d29 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 3 Dec 2010 11:45:55 +0100
Subject: KVM: SVM: Add clean-bit for DR6 and DR7

This patch implements the clean-bit for the dr6 and dr7
debug registers in the vmcb.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 0f55b8a..e9224ca 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -193,6 +193,7 @@ enum {
 	VMCB_INTR,	 /* int_ctl, int_vector */
 	VMCB_NPT,        /* npt_en, nCR3, gPAT */
 	VMCB_CR,	 /* CR0, CR3, CR4, EFER */
+	VMCB_DR,         /* DR6, DR7 */
 	VMCB_DIRTY_MAX,
 };
 
@@ -1484,6 +1485,8 @@ static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
 	else
 		svm->vmcb->save.dr7 = vcpu->arch.dr7;
 
+	mark_dirty(svm->vmcb, VMCB_DR);
+
 	update_db_intercept(vcpu);
 }
 
@@ -1506,6 +1509,7 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	svm->vmcb->save.dr7 = value;
+	mark_dirty(svm->vmcb, VMCB_DR);
 }
 
 static int pf_interception(struct vcpu_svm *svm)
-- 
cgit v0.10.2


From 17a703cb14055c5dac7f34672d368c4d8f6ffe49 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 3 Dec 2010 11:45:56 +0100
Subject: KVM: SVM: Add clean-bit for GDT and IDT

This patch implements the clean-bit for the base and limit
of the gdt and idt in the vmcb.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e9224ca..cbfb2ac 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -194,6 +194,7 @@ enum {
 	VMCB_NPT,        /* npt_en, nCR3, gPAT */
 	VMCB_CR,	 /* CR0, CR3, CR4, EFER */
 	VMCB_DR,         /* DR6, DR7 */
+	VMCB_DT,         /* GDT, IDT */
 	VMCB_DIRTY_MAX,
 };
 
@@ -1304,6 +1305,7 @@ static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 
 	svm->vmcb->save.idtr.limit = dt->size;
 	svm->vmcb->save.idtr.base = dt->address ;
+	mark_dirty(svm->vmcb, VMCB_DT);
 }
 
 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
@@ -1320,6 +1322,7 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 
 	svm->vmcb->save.gdtr.limit = dt->size;
 	svm->vmcb->save.gdtr.base = dt->address ;
+	mark_dirty(svm->vmcb, VMCB_DT);
 }
 
 static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
-- 
cgit v0.10.2


From 060d0c9a2ee2b1d2cf10afc11d8a0b2d97d8f3e3 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 3 Dec 2010 11:45:57 +0100
Subject: KVM: SVM: Add clean-bit for Segments and CPL

This patch implements the clean-bit defined for the cs, ds,
ss, and es segments and the current cpl saved in the vmcb.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index cbfb2ac..4db7157 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -195,6 +195,7 @@ enum {
 	VMCB_CR,	 /* CR0, CR3, CR4, EFER */
 	VMCB_DR,         /* DR6, DR7 */
 	VMCB_DT,         /* GDT, IDT */
+	VMCB_SEG,        /* CS, DS, SS, ES, CPL */
 	VMCB_DIRTY_MAX,
 };
 
@@ -1457,6 +1458,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
 			= (svm->vmcb->save.cs.attrib
 			   >> SVM_SELECTOR_DPL_SHIFT) & 3;
 
+	mark_dirty(svm->vmcb, VMCB_SEG);
 }
 
 static void update_db_intercept(struct kvm_vcpu *vcpu)
-- 
cgit v0.10.2


From 0574dec0d73ab87a21a8965467ac8caeedac7fed Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 3 Dec 2010 11:45:58 +0100
Subject: KVM: SVM: Add clean-bit for CR2 register

This patch implements the clean-bit for the cr2 register in
the vmcb.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 4db7157..d2ddad9 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -196,11 +196,12 @@ enum {
 	VMCB_DR,         /* DR6, DR7 */
 	VMCB_DT,         /* GDT, IDT */
 	VMCB_SEG,        /* CS, DS, SS, ES, CPL */
+	VMCB_CR2,        /* CR2 only */
 	VMCB_DIRTY_MAX,
 };
 
-/* TPR is always written before VMRUN */
-#define VMCB_ALWAYS_DIRTY_MASK	(1U << VMCB_INTR)
+/* TPR and CR2 are always written before VMRUN */
+#define VMCB_ALWAYS_DIRTY_MASK	((1U << VMCB_INTR) | (1U << VMCB_CR2))
 
 static inline void mark_all_dirty(struct vmcb *vmcb)
 {
-- 
cgit v0.10.2


From b53ba3f9cc0b5ac21a86a95c702768f871b02610 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 3 Dec 2010 11:45:59 +0100
Subject: KVM: SVM: Add clean-bit for LBR state

This patch implements the clean-bit for all LBR related
state. This includes the debugctl, br_from, br_to,
last_excp_from, and last_excp_to msrs.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d2ddad9..58cabb55 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -197,6 +197,7 @@ enum {
 	VMCB_DT,         /* GDT, IDT */
 	VMCB_SEG,        /* CS, DS, SS, ES, CPL */
 	VMCB_CR2,        /* CR2 only */
+	VMCB_LBR,        /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
 	VMCB_DIRTY_MAX,
 };
 
@@ -2847,6 +2848,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 			return 1;
 
 		svm->vmcb->save.dbgctl = data;
+		mark_dirty(svm->vmcb, VMCB_LBR);
 		if (data & (1ULL<<0))
 			svm_enable_lbrv(svm);
 		else
-- 
cgit v0.10.2


From 78b2c54aa4a7e9e4257d2b8e3a4b96d2d0c6e636 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 7 Dec 2010 10:48:06 +0800
Subject: KVM: MMU: rename 'no_apf' to 'prefault'

The 'no_apf = 1' case is the speculative path, and a later patch will handle
this speculative path specially, so the name 'prefault' describes it better.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d968cc5..aa1518d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -241,7 +241,8 @@ struct kvm_mmu {
 	void (*new_cr3)(struct kvm_vcpu *vcpu);
 	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
 	unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
-	int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err, bool no_apf);
+	int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
+			  bool prefault);
 	void (*inject_page_fault)(struct kvm_vcpu *vcpu,
 				  struct x86_exception *fault);
 	void (*free)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 482a5c0..83d45cf 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2284,11 +2284,11 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
 	return 1;
 }
 
-static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn,
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
 
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
-			 bool no_apf)
+			 bool prefault)
 {
 	int r;
 	int level;
@@ -2310,7 +2310,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	if (try_async_pf(vcpu, no_apf, gfn, v, &pfn, write, &map_writable))
+	if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
 		return 0;
 
 	/* mmio */
@@ -2584,7 +2584,7 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
 }
 
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
-				u32 error_code, bool no_apf)
+				u32 error_code, bool prefault)
 {
 	gfn_t gfn;
 	int r;
@@ -2600,7 +2600,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 	gfn = gva >> PAGE_SHIFT;
 
 	return nonpaging_map(vcpu, gva & PAGE_MASK,
-			     error_code & PFERR_WRITE_MASK, gfn, no_apf);
+			     error_code & PFERR_WRITE_MASK, gfn, prefault);
 }
 
 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
@@ -2622,7 +2622,7 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu)
 	return kvm_x86_ops->interrupt_allowed(vcpu);
 }
 
-static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn,
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gva_t gva, pfn_t *pfn, bool write, bool *writable)
 {
 	bool async;
@@ -2634,7 +2634,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn,
 
 	put_page(pfn_to_page(*pfn));
 
-	if (!no_apf && can_do_async_pf(vcpu)) {
+	if (!prefault && can_do_async_pf(vcpu)) {
 		trace_kvm_try_async_get_page(gva, gfn);
 		if (kvm_find_async_pf_gfn(vcpu, gfn)) {
 			trace_kvm_async_pf_doublefault(gva, gfn);
@@ -2650,7 +2650,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn,
 }
 
 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
-			  bool no_apf)
+			  bool prefault)
 {
 	pfn_t pfn;
 	int r;
@@ -2674,7 +2674,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	if (try_async_pf(vcpu, no_apf, gfn, gpa, &pfn, write, &map_writable))
+	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
 		return 0;
 
 	/* mmio */
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index d5a0a11..52b3e91 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -539,7 +539,7 @@ out_gpte_changed:
  *           a negative value on error.
  */
 static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
-			     bool no_apf)
+			     bool prefault)
 {
 	int write_fault = error_code & PFERR_WRITE_MASK;
 	int user_fault = error_code & PFERR_USER_MASK;
@@ -581,7 +581,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	if (try_async_pf(vcpu, no_apf, walker.gfn, addr, &pfn, write_fault,
+	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
 			 &map_writable))
 		return 0;
 
-- 
cgit v0.10.2


From 2ec4739ddc889af11d09b3d5ca33687f1f3f1020 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 7 Dec 2010 10:34:42 +0800
Subject: KVM: MMU: fix accessed bit set on prefault path

Retrying a #PF is a speculative path, so don't set the accessed bit.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 83d45cf..d475b6b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2214,7 +2214,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 }
 
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
-			int map_writable, int level, gfn_t gfn, pfn_t pfn)
+			int map_writable, int level, gfn_t gfn, pfn_t pfn,
+			bool prefault)
 {
 	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_mmu_page *sp;
@@ -2229,7 +2230,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 				pte_access &= ~ACC_WRITE_MASK;
 			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
 				     0, write, 1, &pt_write,
-				     level, gfn, pfn, false, map_writable);
+				     level, gfn, pfn, prefault, map_writable);
 			direct_pte_prefetch(vcpu, iterator.sptep);
 			++vcpu->stat.pf_fixed;
 			break;
@@ -2321,7 +2322,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
-	r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn);
+	r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
+			 prefault);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 
@@ -2685,7 +2687,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
 	r = __direct_map(vcpu, gpa, write, map_writable,
-			 level, gfn, pfn);
+			 level, gfn, pfn, prefault);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return r;
-- 
cgit v0.10.2


From fb67e14fc90f18250259faf61a269320ea8e4d8f Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 7 Dec 2010 10:35:25 +0800
Subject: KVM: MMU: retry #PF for softmmu

Retry the #PF for softmmu only when the vcpu's current cr3 is the same as it
was when the #PF occurred.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index aa1518d..4461429 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -593,6 +593,7 @@ struct kvm_x86_ops {
 struct kvm_arch_async_pf {
 	u32 token;
 	gfn_t gfn;
+	unsigned long cr3;
 	bool direct_map;
 };
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d475b6b..abda57f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2608,9 +2608,11 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
 {
 	struct kvm_arch_async_pf arch;
+
 	arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
 	arch.gfn = gfn;
 	arch.direct_map = vcpu->arch.mmu.direct_map;
+	arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
 
 	return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
 }
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 52b3e91..146b681 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -438,7 +438,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 struct guest_walker *gw,
 			 int user_fault, int write_fault, int hlevel,
-			 int *ptwrite, pfn_t pfn, bool map_writable)
+			 int *ptwrite, pfn_t pfn, bool map_writable,
+			 bool prefault)
 {
 	unsigned access = gw->pt_access;
 	struct kvm_mmu_page *sp = NULL;
@@ -512,7 +513,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 
 	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
 		     user_fault, write_fault, dirty, ptwrite, it.level,
-		     gw->gfn, pfn, false, map_writable);
+		     gw->gfn, pfn, prefault, map_writable);
 	FNAME(pte_prefetch)(vcpu, gw, it.sptep);
 
 	return it.sptep;
@@ -568,8 +569,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	 */
 	if (!r) {
 		pgprintk("%s: guest page fault\n", __func__);
-		inject_page_fault(vcpu, &walker.fault);
-		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
+		if (!prefault) {
+			inject_page_fault(vcpu, &walker.fault);
+			/* reset fork detector */
+			vcpu->arch.last_pt_write_count = 0;
+		}
 		return 0;
 	}
 
@@ -599,7 +603,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
 	kvm_mmu_free_some_pages(vcpu);
 	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-			     level, &write_pt, pfn, map_writable);
+			     level, &write_pt, pfn, map_writable, prefault);
 	(void)sptep;
 	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
 		 sptep, *sptep, write_pt);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8b4d5fc..cd71d21 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6182,7 +6182,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 {
 	int r;
 
-	if (!vcpu->arch.mmu.direct_map || !work->arch.direct_map ||
+	if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
 	      is_error_page(work->page))
 		return;
 
@@ -6190,6 +6190,10 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 	if (unlikely(r))
 		return;
 
+	if (!vcpu->arch.mmu.direct_map &&
+	      work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
+		return;
+
 	vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
 }
 
-- 
cgit v0.10.2


From fa22a8d608a580264a99f0ddf4bb7fe65a9c2eb3 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 3 Dec 2010 15:25:14 +0100
Subject: KVM: SVM: Remove flush_guest_tlb function

This function is unused and there is svm_flush_tlb which
does the same. So this function can be removed.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 58cabb55..16a73c7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -426,11 +426,6 @@ static inline void force_new_asid(struct kvm_vcpu *vcpu)
 	to_svm(vcpu)->asid_generation--;
 }
 
-static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
-{
-	force_new_asid(vcpu);
-}
-
 static int get_npt_level(void)
 {
 #ifdef CONFIG_X86_64
-- 
cgit v0.10.2


From f40f6a459c2fefa77fd74eec28d6233c0a7f431a Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 3 Dec 2010 15:25:15 +0100
Subject: KVM: SVM: Use svm_flush_tlb instead of force_new_asid

This patch replaces all calls to force_new_asid which are
intended to flush the guest-tlb by the more appropriate
function svm_flush_tlb. As a side-effect the force_new_asid
function is removed.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 16a73c7..7272b36 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -421,11 +421,6 @@ static inline void invlpga(unsigned long addr, u32 asid)
 	asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
 }
 
-static inline void force_new_asid(struct kvm_vcpu *vcpu)
-{
-	to_svm(vcpu)->asid_generation--;
-}
-
 static int get_npt_level(void)
 {
 #ifdef CONFIG_X86_64
@@ -999,7 +994,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 		save->cr3 = 0;
 		save->cr4 = 0;
 	}
-	force_new_asid(&svm->vcpu);
+	svm->asid_generation = 0;
 
 	svm->nested.vmcb = 0;
 	svm->vcpu.arch.hflags = 0;
@@ -1419,7 +1414,7 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
 
 	if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
-		force_new_asid(vcpu);
+		svm_flush_tlb(vcpu);
 
 	vcpu->arch.cr4 = cr4;
 	if (!npt_enabled)
@@ -1762,7 +1757,7 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
 
 	svm->vmcb->control.nested_cr3 = root;
 	mark_dirty(svm->vmcb, VMCB_NPT);
-	force_new_asid(vcpu);
+	svm_flush_tlb(vcpu);
 }
 
 static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
@@ -2366,7 +2361,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
 	svm->nested.intercept            = nested_vmcb->control.intercept;
 
-	force_new_asid(&svm->vcpu);
+	svm_flush_tlb(&svm->vcpu);
 	svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
 	if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
 		svm->vcpu.arch.hflags |= HF_VINTR_MASK;
@@ -3308,7 +3303,7 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
 
 static void svm_flush_tlb(struct kvm_vcpu *vcpu)
 {
-	force_new_asid(vcpu);
+	to_svm(vcpu)->asid_generation--;
 }
 
 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
@@ -3562,7 +3557,7 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 
 	svm->vmcb->save.cr3 = root;
 	mark_dirty(svm->vmcb, VMCB_CR);
-	force_new_asid(vcpu);
+	svm_flush_tlb(vcpu);
 }
 
 static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
@@ -3576,7 +3571,7 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 	svm->vmcb->save.cr3 = vcpu->arch.cr3;
 	mark_dirty(svm->vmcb, VMCB_CR);
 
-	force_new_asid(vcpu);
+	svm_flush_tlb(vcpu);
 }
 
 static int is_disabled(void)
-- 
cgit v0.10.2


From 38e5e92fe8c02a8766459d505423b855caf9af1f Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 3 Dec 2010 15:25:16 +0100
Subject: KVM: SVM: Implement Flush-By-Asid feature

This patch adds support for the new flush-by-asid feature of
upcoming AMD processors to the KVM-AMD module.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 235dd73..82ecaa32 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -88,6 +88,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 
 #define TLB_CONTROL_DO_NOTHING 0
 #define TLB_CONTROL_FLUSH_ALL_ASID 1
+#define TLB_CONTROL_FLUSH_ASID 3
+#define TLB_CONTROL_FLUSH_ASID_LOCAL 7
 
 #define V_TPR_MASK 0x0f
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7272b36..4dbc372 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3158,7 +3158,6 @@ static void pre_svm_run(struct vcpu_svm *svm)
 
 	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
 
-	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
 	/* FIXME: handle wraparound of asid_generation */
 	if (svm->asid_generation != sd->asid_generation)
 		new_asid(svm, sd);
@@ -3303,7 +3302,12 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
 
 static void svm_flush_tlb(struct kvm_vcpu *vcpu)
 {
-	to_svm(vcpu)->asid_generation--;
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
+		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+	else
+		svm->asid_generation--;
 }
 
 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
@@ -3529,6 +3533,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 
 	svm->next_rip = 0;
 
+	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
 	/* if exit due to PF check for async PF */
 	if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
 		svm->apf_reason = kvm_read_and_reset_pf_reason();
-- 
cgit v0.10.2


From 443381a828910efa3d71ba4491d180f2d0bb4212 Mon Sep 17 00:00:00 2001
From: Anthony Liguori <aliguori@us.ibm.com>
Date: Mon, 6 Dec 2010 10:53:38 -0600
Subject: KVM: VMX: add module parameter to avoid trapping HLT instructions
 (v5)

In certain use-cases, we want to allocate guests fixed time slices where idle
guest cycles leave the machine idling.  There are many approaches to achieve
this but the most direct is to simply avoid trapping the HLT instruction which
lets the guest directly execute the instruction putting the processor to sleep.

Introduce this as a module-level option for kvm-vmx.ko since if you do this
for one guest, you probably want to do it for all.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 42d95905..9642c22 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -297,6 +297,12 @@ enum vmcs_field {
 #define GUEST_INTR_STATE_SMI		0x00000004
 #define GUEST_INTR_STATE_NMI		0x00000008
 
+/* GUEST_ACTIVITY_STATE flags */
+#define GUEST_ACTIVITY_ACTIVE		0
+#define GUEST_ACTIVITY_HLT		1
+#define GUEST_ACTIVITY_SHUTDOWN		2
+#define GUEST_ACTIVITY_WAIT_SIPI	3
+
 /*
  * Exit Qualifications for MOV for Control Register Access
  */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f3693ca..c195260 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 static int __read_mostly vmm_exclusive = 1;
 module_param(vmm_exclusive, bool, S_IRUGO);
 
+static int __read_mostly yield_on_hlt = 1;
+module_param(yield_on_hlt, bool, S_IRUGO);
+
 #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST				\
 	(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
 #define KVM_GUEST_CR0_MASK						\
@@ -1009,6 +1012,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	vmx_set_interrupt_shadow(vcpu, 0);
 }
 
+static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
+{
+	/* Ensure that we clear the HLT state in the VMCS.  We don't need to
+	 * explicitly skip the instruction because if the HLT state is set, then
+	 * the instruction is already executing and RIP has already been
+	 * advanced. */
+	if (!yield_on_hlt &&
+	    vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
+		vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+}
+
 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code,
 				bool reinject)
@@ -1035,6 +1049,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 		intr_info |= INTR_TYPE_HARD_EXCEPTION;
 
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
+	vmx_clear_hlt(vcpu);
 }
 
 static bool vmx_rdtscp_supported(void)
@@ -1419,7 +1434,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 				&_pin_based_exec_control) < 0)
 		return -EIO;
 
-	min = CPU_BASED_HLT_EXITING |
+	min =
 #ifdef CONFIG_X86_64
 	      CPU_BASED_CR8_LOAD_EXITING |
 	      CPU_BASED_CR8_STORE_EXITING |
@@ -1432,6 +1447,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	      CPU_BASED_MWAIT_EXITING |
 	      CPU_BASED_MONITOR_EXITING |
 	      CPU_BASED_INVLPG_EXITING;
+
+	if (yield_on_hlt)
+		min |= CPU_BASED_HLT_EXITING;
+
 	opt = CPU_BASED_TPR_SHADOW |
 	      CPU_BASED_USE_MSR_BITMAPS |
 	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -2728,7 +2747,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	vmcs_writel(GUEST_IDTR_BASE, 0);
 	vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
 
-	vmcs_write32(GUEST_ACTIVITY_STATE, 0);
+	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
 	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
 	vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
 
@@ -2821,6 +2840,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
 	} else
 		intr |= INTR_TYPE_EXT_INTR;
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
+	vmx_clear_hlt(vcpu);
 }
 
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2848,6 +2868,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 	}
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
+	vmx_clear_hlt(vcpu);
 }
 
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
-- 
cgit v0.10.2


From d4dbf470096c51cb4785167ea59fdbdea87ccbe4 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Tue, 7 Dec 2010 12:59:07 +0900
Subject: KVM: MMU: Make the way of accessing lpage_info more generic

Large page information has two elements but one of them, write_count, alone
is accessed by a helper function.

This patch replaces this helper function with a more generic one which returns
the newly named kvm_lpage_info structure, and uses it to access the other
element, rmap_pde.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index abda57f..475a122 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -477,46 +477,46 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
 }
 
 /*
- * Return the pointer to the largepage write count for a given
- * gfn, handling slots that are not large page aligned.
+ * Return the pointer to the large page information for a given gfn,
+ * handling slots that are not large page aligned.
  */
-static int *slot_largepage_idx(gfn_t gfn,
-			       struct kvm_memory_slot *slot,
-			       int level)
+static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
+					      struct kvm_memory_slot *slot,
+					      int level)
 {
 	unsigned long idx;
 
 	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
 	      (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
-	return &slot->lpage_info[level - 2][idx].write_count;
+	return &slot->lpage_info[level - 2][idx];
 }
 
 static void account_shadowed(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_memory_slot *slot;
-	int *write_count;
+	struct kvm_lpage_info *linfo;
 	int i;
 
 	slot = gfn_to_memslot(kvm, gfn);
 	for (i = PT_DIRECTORY_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-		write_count   = slot_largepage_idx(gfn, slot, i);
-		*write_count += 1;
+		linfo = lpage_info_slot(gfn, slot, i);
+		linfo->write_count += 1;
 	}
 }
 
 static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_memory_slot *slot;
-	int *write_count;
+	struct kvm_lpage_info *linfo;
 	int i;
 
 	slot = gfn_to_memslot(kvm, gfn);
 	for (i = PT_DIRECTORY_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-		write_count   = slot_largepage_idx(gfn, slot, i);
-		*write_count -= 1;
-		WARN_ON(*write_count < 0);
+		linfo = lpage_info_slot(gfn, slot, i);
+		linfo->write_count -= 1;
+		WARN_ON(linfo->write_count < 0);
 	}
 }
 
@@ -525,12 +525,12 @@ static int has_wrprotected_page(struct kvm *kvm,
 				int level)
 {
 	struct kvm_memory_slot *slot;
-	int *largepage_idx;
+	struct kvm_lpage_info *linfo;
 
 	slot = gfn_to_memslot(kvm, gfn);
 	if (slot) {
-		largepage_idx = slot_largepage_idx(gfn, slot, level);
-		return *largepage_idx;
+		linfo = lpage_info_slot(gfn, slot, level);
+		return linfo->write_count;
 	}
 
 	return 1;
@@ -585,16 +585,15 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 {
 	struct kvm_memory_slot *slot;
-	unsigned long idx;
+	struct kvm_lpage_info *linfo;
 
 	slot = gfn_to_memslot(kvm, gfn);
 	if (likely(level == PT_PAGE_TABLE_LEVEL))
 		return &slot->rmap[gfn - slot->base_gfn];
 
-	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
-		(slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+	linfo = lpage_info_slot(gfn, slot, level);
 
-	return &slot->lpage_info[level - 2][idx].rmap_pde;
+	return &linfo->rmap_pde;
 }
 
 /*
@@ -882,19 +881,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 		end = start + (memslot->npages << PAGE_SHIFT);
 		if (hva >= start && hva < end) {
 			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+			gfn_t gfn = memslot->base_gfn + gfn_offset;
 
 			ret = handler(kvm, &memslot->rmap[gfn_offset], data);
 
 			for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
-				unsigned long idx;
-				int sh;
-
-				sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
-				idx = ((memslot->base_gfn+gfn_offset) >> sh) -
-					(memslot->base_gfn >> sh);
-				ret |= handler(kvm,
-					&memslot->lpage_info[j][idx].rmap_pde,
-					data);
+				struct kvm_lpage_info *linfo;
+
+				linfo = lpage_info_slot(gfn, memslot,
+							PT_DIRECTORY_LEVEL + j);
+				ret |= handler(kvm, &linfo->rmap_pde, data);
 			}
 			trace_kvm_age_page(hva, memslot, ret);
 			retval |= ret;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ac4e83a..bd0da8f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -146,6 +146,11 @@ struct kvm_vcpu {
  */
 #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)
 
+struct kvm_lpage_info {
+	unsigned long rmap_pde;
+	int write_count;
+};
+
 struct kvm_memory_slot {
 	gfn_t base_gfn;
 	unsigned long npages;
@@ -153,10 +158,7 @@ struct kvm_memory_slot {
 	unsigned long *rmap;
 	unsigned long *dirty_bitmap;
 	unsigned long *dirty_bitmap_head;
-	struct {
-		unsigned long rmap_pde;
-		int write_count;
-	} *lpage_info[KVM_NR_PAGE_SIZES - 1];
+	struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
 	unsigned long userspace_addr;
 	int user_alloc;
 	int id;
-- 
cgit v0.10.2


From 81dd35d42c9aef3c1f7ae6ce4cf6a0d382661db5 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 7 Dec 2010 17:15:06 +0100
Subject: KVM: SVM: Add xsetbv intercept

This patch implements the xsetbv intercept to the AMD part
of KVM. This makes AVX usable in a safe way for the guest on
AVX capable AMD hardware.

The patch is tested by using AVX in the guest and host in
parallel and checking for data corruption. I also used the
KVM xsave unit-tests and they all pass.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 82ecaa32..f7087bf 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -47,6 +47,7 @@ enum {
 	INTERCEPT_MONITOR,
 	INTERCEPT_MWAIT,
 	INTERCEPT_MWAIT_COND,
+	INTERCEPT_XSETBV,
 };
 
 
@@ -329,6 +330,7 @@ struct __attribute__ ((__packed__)) vmcb {
 #define SVM_EXIT_MONITOR	0x08a
 #define SVM_EXIT_MWAIT		0x08b
 #define SVM_EXIT_MWAIT_COND	0x08c
+#define SVM_EXIT_XSETBV		0x08d
 #define SVM_EXIT_NPF  		0x400
 
 #define SVM_EXIT_ERR		-1
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 4dbc372..73461b1 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -935,6 +935,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 	set_intercept(svm, INTERCEPT_WBINVD);
 	set_intercept(svm, INTERCEPT_MONITOR);
 	set_intercept(svm, INTERCEPT_MWAIT);
+	set_intercept(svm, INTERCEPT_XSETBV);
 
 	control->iopm_base_pa = iopm_base;
 	control->msrpm_base_pa = __pa(svm->msrpm);
@@ -2546,6 +2547,19 @@ static int skinit_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
+static int xsetbv_interception(struct vcpu_svm *svm)
+{
+	u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
+	u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
+
+	if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
+		svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+		skip_emulated_instruction(&svm->vcpu);
+	}
+
+	return 1;
+}
+
 static int invalid_op_interception(struct vcpu_svm *svm)
 {
 	kvm_queue_exception(&svm->vcpu, UD_VECTOR);
@@ -2971,6 +2985,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_WBINVD]                       = emulate_on_interception,
 	[SVM_EXIT_MONITOR]			= invalid_op_interception,
 	[SVM_EXIT_MWAIT]			= invalid_op_interception,
+	[SVM_EXIT_XSETBV]			= xsetbv_interception,
 	[SVM_EXIT_NPF]				= pf_interception,
 };
 
@@ -3624,10 +3639,6 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
 static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 {
 	switch (func) {
-	case 0x00000001:
-		/* Mask out xsave bit as long as it is not supported by SVM */
-		entry->ecx &= ~(bit(X86_FEATURE_XSAVE));
-		break;
 	case 0x80000001:
 		if (nested)
 			entry->ecx |= (1 << 2); /* Set SVM bit */
@@ -3701,6 +3712,7 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {
 	{ SVM_EXIT_WBINVD,			"wbinvd" },
 	{ SVM_EXIT_MONITOR,			"monitor" },
 	{ SVM_EXIT_MWAIT,			"mwait" },
+	{ SVM_EXIT_XSETBV,			"xsetbv" },
 	{ SVM_EXIT_NPF,				"npf" },
 	{ -1, NULL }
 };
-- 
cgit v0.10.2


From 5c663a1534d27d817e17eed06a83d08f497f9f4f Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 8 Dec 2010 18:04:51 +0200
Subject: KVM: Fix build error on s390 due to missing tlbs_dirty

Make it available for all archs.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bd0da8f..b5021db 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -256,8 +256,8 @@ struct kvm {
 	struct mmu_notifier mmu_notifier;
 	unsigned long mmu_notifier_seq;
 	long mmu_notifier_count;
-	long tlbs_dirty;
 #endif
+	long tlbs_dirty;
 };
 
 /* The guest did something we don't support. */
-- 
cgit v0.10.2


From d3c422bd33388e6fe6777bde0e9bd20152133083 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 12 Dec 2010 19:30:55 +0200
Subject: KVM: MMU: Fix incorrect direct page write protection due to ro host
 page

If KVM sees a read-only host page, it will map it as read-only to prevent
breaking a COW.  However, if the page was part of a large guest page, KVM
incorrectly extends the write protection to the entire large page frame
instead of limiting it to the normal host page.

This results in the instantiation of a new shadow page with read-only access.

If this happens for a MOVS instruction that moves memory between two normal
pages, within a single large page frame, and mapped within the guest as a
large page, and if, in addition, the source operand is not writeable in the
host (perhaps due to KSM), then KVM will instantiate a read-only direct
shadow page, instantiate an spte for the source operand, then instantiate
a new read/write direct shadow page and instantiate an spte for the
destination operand.  Since these two sptes are in different shadow pages,
MOVS will never see them at the same time and the guest will not make
progress.

Fix by mapping the direct shadow page read/write, and only marking the
host page read-only.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 146b681..5ca9426 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -511,6 +511,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		link_shadow_page(it.sptep, sp);
 	}
 
+	if (!map_writable)
+		access &= ~ACC_WRITE_MASK;
+
 	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
 		     user_fault, write_fault, dirty, ptwrite, it.level,
 		     gw->gfn, pfn, prefault, map_writable);
@@ -593,9 +596,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	if (is_error_pfn(pfn))
 		return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
 
-	if (!map_writable)
-		walker.pte_access &= ~ACC_WRITE_MASK;
-
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
-- 
cgit v0.10.2


From 61cfab2e83263ea294a034b12079476a917299f4 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 13 Dec 2010 17:24:53 +0200
Subject: KVM: Correct kvm_pio tracepoint count field

Currently, we record '1' for count regardless of the real count.  Fix.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cd71d21..386cab9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3948,7 +3948,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
 	if (vcpu->arch.pio.count)
 		goto data_avail;
 
-	trace_kvm_pio(0, port, size, 1);
+	trace_kvm_pio(0, port, size, count);
 
 	vcpu->arch.pio.port = port;
 	vcpu->arch.pio.in = 1;
@@ -3976,7 +3976,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port,
 			      const void *val, unsigned int count,
 			      struct kvm_vcpu *vcpu)
 {
-	trace_kvm_pio(1, port, size, 1);
+	trace_kvm_pio(1, port, size, count);
 
 	vcpu->arch.pio.port = port;
 	vcpu->arch.pio.in = 0;
-- 
cgit v0.10.2


From a355c85c5f137d93c4e9274c50e26c20f1ebc1c9 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 14 Dec 2010 17:57:47 +0800
Subject: KVM: return true when user space query KVM_CAP_USER_NMI extension

Userspace may check this extension at runtime.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 386cab9..cbaea7d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1941,6 +1941,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_NOP_IO_DELAY:
 	case KVM_CAP_MP_STATE:
 	case KVM_CAP_SYNC_MMU:
+	case KVM_CAP_USER_NMI:
 	case KVM_CAP_REINJECT_CONTROL:
 	case KVM_CAP_IRQ_INJECT_STATUS:
 	case KVM_CAP_ASSIGN_DEV_IRQ:
-- 
cgit v0.10.2


From 175504cdbfef6a0fde3bafb6c38b4929049ac8ea Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Thu, 16 Dec 2010 01:41:37 +0900
Subject: KVM: Take missing slots_lock for kvm_io_bus_unregister_dev()

In KVM_CREATE_IRQCHIP, kvm_io_bus_unregister_dev() is called without taking
slots_lock in the error handling path.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 48a48bd..70d224d 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -951,7 +951,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			goto out;
 		r = kvm_setup_default_irq_routing(kvm);
 		if (r) {
+			mutex_lock(&kvm->slots_lock);
 			kvm_ioapic_destroy(kvm);
+			mutex_unlock(&kvm->slots_lock);
 			goto out;
 		}
 		break;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cbaea7d..f569da8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3309,8 +3309,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		if (vpic) {
 			r = kvm_ioapic_init(kvm);
 			if (r) {
+				mutex_lock(&kvm->slots_lock);
 				kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
 							  &vpic->dev);
+				mutex_unlock(&kvm->slots_lock);
 				kfree(vpic);
 				goto create_irqchip_unlock;
 			}
@@ -3321,10 +3323,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		smp_wmb();
 		r = kvm_setup_default_irq_routing(kvm);
 		if (r) {
+			mutex_lock(&kvm->slots_lock);
 			mutex_lock(&kvm->irq_lock);
 			kvm_ioapic_destroy(kvm);
 			kvm_destroy_pic(kvm);
 			mutex_unlock(&kvm->irq_lock);
+			mutex_unlock(&kvm->slots_lock);
 		}
 	create_irqchip_unlock:
 		mutex_unlock(&kvm->lock);
-- 
cgit v0.10.2


From a63512a4d711c9bd6a5d03847f45fcf88cdea0c6 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 16 Dec 2010 11:27:23 +0200
Subject: KVM guest: Fix kvm clock initialization when it's configured out

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 91b3d65..8dc4466 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -486,7 +486,9 @@ static struct notifier_block kvm_pv_reboot_nb = {
 #ifdef CONFIG_SMP
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
+#ifdef CONFIG_KVM_CLOCK
 	WARN_ON(kvm_register_clock("primary cpu clock"));
+#endif
 	kvm_guest_cpu_init();
 	native_smp_prepare_boot_cpu();
 }
-- 
cgit v0.10.2


From eea1cff9ab732ea56358ff5e1bd8b99db2e8402d Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@amd.com>
Date: Tue, 21 Dec 2010 11:12:00 +0100
Subject: KVM: x86: fix CR8 handling

The handling of CR8 writes in KVM is currently somewhat cumbersome.
This patch makes it look like the other CR register handlers
and fixes a possible issue in VMX, where the RIP would be incremented
despite an injected #GP.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4461429..cb5cad2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -661,7 +661,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 73461b1..3d4b88a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2676,16 +2676,17 @@ static int cr0_write_interception(struct vcpu_svm *svm)
 static int cr8_write_interception(struct vcpu_svm *svm)
 {
 	struct kvm_run *kvm_run = svm->vcpu.run;
+	int r;
 
 	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
 	/* instruction emulation calls kvm_set_cr8() */
-	emulate_instruction(&svm->vcpu, 0, 0, 0);
+	r = emulate_instruction(&svm->vcpu, 0, 0, 0);
 	if (irqchip_in_kernel(svm->vcpu.kvm)) {
 		clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
-		return 1;
+		return r == EMULATE_DONE;
 	}
 	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
-		return 1;
+		return r == EMULATE_DONE;
 	kvm_run->exit_reason = KVM_EXIT_SET_TPR;
 	return 0;
 }
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c195260..8e87bae 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3185,8 +3185,8 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 		case 8: {
 				u8 cr8_prev = kvm_get_cr8(vcpu);
 				u8 cr8 = kvm_register_read(vcpu, reg);
-				kvm_set_cr8(vcpu, cr8);
-				skip_emulated_instruction(vcpu);
+				err = kvm_set_cr8(vcpu, cr8);
+				complete_insn_gp(vcpu, err);
 				if (irqchip_in_kernel(vcpu->kvm))
 					return 1;
 				if (cr8_prev <= cr8)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f569da8..2dbf68c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -662,7 +662,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr3);
 
-int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
 	if (cr8 & CR8_RESERVED_BITS)
 		return 1;
@@ -672,12 +672,6 @@ int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 		vcpu->arch.cr8 = cr8;
 	return 0;
 }
-
-void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
-{
-	if (__kvm_set_cr8(vcpu, cr8))
-		kvm_inject_gp(vcpu, 0);
-}
 EXPORT_SYMBOL_GPL(kvm_set_cr8);
 
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
@@ -4104,7 +4098,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
 		res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
 		break;
 	case 8:
-		res = __kvm_set_cr8(vcpu, val & 0xfUL);
+		res = kvm_set_cr8(vcpu, val);
 		break;
 	default:
 		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
@@ -5381,8 +5375,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	}
 
 	/* re-sync apic's tpr */
-	if (!irqchip_in_kernel(vcpu->kvm))
-		kvm_set_cr8(vcpu, kvm_run->cr8);
+	if (!irqchip_in_kernel(vcpu->kvm)) {
+		if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
+			r = -EINVAL;
+			goto out;
+		}
+	}
 
 	if (vcpu->arch.pio.count || vcpu->mmio_needed) {
 		if (vcpu->mmio_needed) {
-- 
cgit v0.10.2


From db8fcefaa704ccb40b6dcd24e3b75bad3ce7dde3 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@amd.com>
Date: Tue, 21 Dec 2010 11:12:01 +0100
Subject: KVM: move complete_insn_gp() into x86.c

move the complete_insn_gp() helper function out of the VMX part
into the generic x86 part to make it usable by SVM.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index cb5cad2..cd4a990 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -828,4 +828,6 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
 bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
 extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 
+void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8e87bae..fd8ffde 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3147,14 +3147,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
 	hypercall[2] = 0xc1;
 }
 
-static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
-{
-	if (err)
-		kvm_inject_gp(vcpu, 0);
-	else
-		skip_emulated_instruction(vcpu);
-}
-
 static int handle_cr(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification, val;
@@ -3172,21 +3164,21 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 		switch (cr) {
 		case 0:
 			err = kvm_set_cr0(vcpu, val);
-			complete_insn_gp(vcpu, err);
+			kvm_complete_insn_gp(vcpu, err);
 			return 1;
 		case 3:
 			err = kvm_set_cr3(vcpu, val);
-			complete_insn_gp(vcpu, err);
+			kvm_complete_insn_gp(vcpu, err);
 			return 1;
 		case 4:
 			err = kvm_set_cr4(vcpu, val);
-			complete_insn_gp(vcpu, err);
+			kvm_complete_insn_gp(vcpu, err);
 			return 1;
 		case 8: {
 				u8 cr8_prev = kvm_get_cr8(vcpu);
 				u8 cr8 = kvm_register_read(vcpu, reg);
 				err = kvm_set_cr8(vcpu, cr8);
-				complete_insn_gp(vcpu, err);
+				kvm_complete_insn_gp(vcpu, err);
 				if (irqchip_in_kernel(vcpu->kvm))
 					return 1;
 				if (cr8_prev <= cr8)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2dbf68c..1d54cb7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -334,6 +334,15 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 }
 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
 
+void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
+{
+	if (err)
+		kvm_inject_gp(vcpu, 0);
+	else
+		kvm_x86_ops->skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
+
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 {
 	++vcpu->stat.pf_guest;
-- 
cgit v0.10.2


From 51d8b66199e94284e7725a79eae4a38de4b80d54 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@amd.com>
Date: Tue, 21 Dec 2010 11:12:02 +0100
Subject: KVM: cleanup emulate_instruction

emulate_instruction had many callers, but only one used all
parameters. One parameter was unused, another one is now
hidden by a wrapper function (required for a future addition
anyway), so most callers now use a shorter parameter list.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index cd4a990..de00b60 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -634,8 +634,15 @@ enum emulation_result {
 #define EMULTYPE_NO_DECODE	    (1 << 0)
 #define EMULTYPE_TRAP_UD	    (1 << 1)
 #define EMULTYPE_SKIP		    (1 << 2)
-int emulate_instruction(struct kvm_vcpu *vcpu,
-			unsigned long cr2, u16 error_code, int emulation_type);
+int x86_emulate_instruction(struct kvm_vcpu *vcpu,
+			unsigned long cr2, int emulation_type);
+
+static inline int emulate_instruction(struct kvm_vcpu *vcpu,
+			int emulation_type)
+{
+	return x86_emulate_instruction(vcpu, 0, emulation_type);
+}
+
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
 void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 475a122..01c5a10 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3348,7 +3348,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
 	if (r)
 		goto out;
 
-	er = emulate_instruction(vcpu, cr2, error_code, 0);
+	er = x86_emulate_instruction(vcpu, cr2, 0);
 
 	switch (er) {
 	case EMULATE_DONE:
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 3d4b88a..90d0658 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -475,7 +475,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 		svm->next_rip = svm->vmcb->control.next_rip;
 
 	if (!svm->next_rip) {
-		if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
+		if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
 				EMULATE_DONE)
 			printk(KERN_DEBUG "%s: NOP\n", __func__);
 		return;
@@ -1586,7 +1586,7 @@ static int ud_interception(struct vcpu_svm *svm)
 {
 	int er;
 
-	er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD);
+	er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
 	if (er != EMULATE_DONE)
 		kvm_queue_exception(&svm->vcpu, UD_VECTOR);
 	return 1;
@@ -1703,7 +1703,7 @@ static int io_interception(struct vcpu_svm *svm)
 	string = (io_info & SVM_IOIO_STR_MASK) != 0;
 	in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
 	if (string || in)
-		return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+		return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 
 	port = io_info >> 16;
 	size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -2648,12 +2648,12 @@ static int iret_interception(struct vcpu_svm *svm)
 
 static int invlpg_interception(struct vcpu_svm *svm)
 {
-	return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
+	return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
 }
 
 static int emulate_on_interception(struct vcpu_svm *svm)
 {
-	return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
+	return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
 }
 
 static int cr0_write_interception(struct vcpu_svm *svm)
@@ -2661,7 +2661,7 @@ static int cr0_write_interception(struct vcpu_svm *svm)
 	struct kvm_vcpu *vcpu = &svm->vcpu;
 	int r;
 
-	r = emulate_instruction(&svm->vcpu, 0, 0, 0);
+	r = emulate_instruction(&svm->vcpu, 0);
 
 	if (svm->nested.vmexit_rip) {
 		kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
@@ -2680,7 +2680,7 @@ static int cr8_write_interception(struct vcpu_svm *svm)
 
 	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
 	/* instruction emulation calls kvm_set_cr8() */
-	r = emulate_instruction(&svm->vcpu, 0, 0, 0);
+	r = emulate_instruction(&svm->vcpu, 0);
 	if (irqchip_in_kernel(svm->vcpu.kvm)) {
 		clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 		return r == EMULATE_DONE;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fd8ffde..f3c60fb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2939,7 +2939,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
 	 * Cause the #SS fault with 0 error code in VM86 mode.
 	 */
 	if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
-		if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
+		if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
 			return 1;
 	/*
 	 * Forward all other exceptions that are valid in real mode.
@@ -3036,7 +3036,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 	}
 
 	if (is_invalid_opcode(intr_info)) {
-		er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD);
+		er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
 		if (er != EMULATE_DONE)
 			kvm_queue_exception(vcpu, UD_VECTOR);
 		return 1;
@@ -3127,7 +3127,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
 	++vcpu->stat.io_exits;
 
 	if (string || in)
-		return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+		return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 
 	port = exit_qualification >> 16;
 	size = (exit_qualification & 7) + 1;
@@ -3372,7 +3372,7 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
 
 static int handle_invd(struct kvm_vcpu *vcpu)
 {
-	return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 }
 
 static int handle_invlpg(struct kvm_vcpu *vcpu)
@@ -3403,7 +3403,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
 
 static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
-	return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 }
 
 static int handle_task_switch(struct kvm_vcpu *vcpu)
@@ -3618,7 +3618,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 		    && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
 			return handle_interrupt_window(&vmx->vcpu);
 
-		err = emulate_instruction(vcpu, 0, 0, 0);
+		err = emulate_instruction(vcpu, 0);
 
 		if (err == EMULATE_DO_MMIO) {
 			ret = 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1d54cb7..a6fcb76 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4363,10 +4363,9 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
 	return false;
 }
 
-int emulate_instruction(struct kvm_vcpu *vcpu,
-			unsigned long cr2,
-			u16 error_code,
-			int emulation_type)
+int x86_emulate_instruction(struct kvm_vcpu *vcpu,
+			    unsigned long cr2,
+			    int emulation_type)
 {
 	int r;
 	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
@@ -4474,7 +4473,7 @@ done:
 
 	return r;
 }
-EXPORT_SYMBOL_GPL(emulate_instruction);
+EXPORT_SYMBOL_GPL(x86_emulate_instruction);
 
 int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
 {
@@ -5398,7 +5397,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 			vcpu->mmio_needed = 0;
 		}
 		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-		r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
+		r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
 		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 		if (r != EMULATE_DONE) {
 			r = 0;
-- 
cgit v0.10.2


From ddce97aac5405e0b2b8b2191cb65b5a48eb14145 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@amd.com>
Date: Tue, 21 Dec 2010 11:12:03 +0100
Subject: KVM: SVM: add new SVM feature bit names

The recent APM Vol.2 and the recent AMD CPUID specification describe
new CPUID feature bits for SVM. Name them here for later usage.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 90d0658..2830a73 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -51,6 +51,10 @@ MODULE_LICENSE("GPL");
 #define SVM_FEATURE_LBRV           (1 <<  1)
 #define SVM_FEATURE_SVML           (1 <<  2)
 #define SVM_FEATURE_NRIP           (1 <<  3)
+#define SVM_FEATURE_TSC_RATE       (1 <<  4)
+#define SVM_FEATURE_VMCB_CLEAN     (1 <<  5)
+#define SVM_FEATURE_FLUSH_ASID     (1 <<  6)
+#define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)
 #define SVM_FEATURE_PAUSE_FILTER   (1 << 10)
 
 #define NESTED_EXIT_HOST	0	/* Exit handled on host level */
-- 
cgit v0.10.2


From 7ff76d58a9dc03a38b86d283abcaae2ac3c74fe3 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@amd.com>
Date: Tue, 21 Dec 2010 11:12:04 +0100
Subject: KVM: SVM: enhance MOV CR intercept handler

Newer SVM implementations provide the GPR number in the VMCB, so
that the emulation path is no longer necessary to handle CR
register access intercepts. Implement the handling in svm.c and
use it when the info is provided.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index f7087bf..f0ffb81 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -260,6 +260,8 @@ struct __attribute__ ((__packed__)) vmcb {
 #define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
 #define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
 
+#define SVM_EXITINFO_REG_MASK 0x0F
+
 #define	SVM_EXIT_READ_CR0 	0x000
 #define	SVM_EXIT_READ_CR3 	0x003
 #define	SVM_EXIT_READ_CR4 	0x004
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2830a73..5abaa5b2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2660,12 +2660,80 @@ static int emulate_on_interception(struct vcpu_svm *svm)
 	return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
 }
 
+#define CR_VALID (1ULL << 63)
+
+static int cr_interception(struct vcpu_svm *svm)
+{
+	int reg, cr;
+	unsigned long val;
+	int err;
+
+	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+		return emulate_on_interception(svm);
+
+	if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
+		return emulate_on_interception(svm);
+
+	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+	cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
+
+	err = 0;
+	if (cr >= 16) { /* mov to cr */
+		cr -= 16;
+		val = kvm_register_read(&svm->vcpu, reg);
+		switch (cr) {
+		case 0:
+			err = kvm_set_cr0(&svm->vcpu, val);
+			break;
+		case 3:
+			err = kvm_set_cr3(&svm->vcpu, val);
+			break;
+		case 4:
+			err = kvm_set_cr4(&svm->vcpu, val);
+			break;
+		case 8:
+			err = kvm_set_cr8(&svm->vcpu, val);
+			break;
+		default:
+			WARN(1, "unhandled write to CR%d", cr);
+			kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+			return 1;
+		}
+	} else { /* mov from cr */
+		switch (cr) {
+		case 0:
+			val = kvm_read_cr0(&svm->vcpu);
+			break;
+		case 2:
+			val = svm->vcpu.arch.cr2;
+			break;
+		case 3:
+			val = svm->vcpu.arch.cr3;
+			break;
+		case 4:
+			val = kvm_read_cr4(&svm->vcpu);
+			break;
+		case 8:
+			val = kvm_get_cr8(&svm->vcpu);
+			break;
+		default:
+			WARN(1, "unhandled read from CR%d", cr);
+			kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+			return 1;
+		}
+		kvm_register_write(&svm->vcpu, reg, val);
+	}
+	kvm_complete_insn_gp(&svm->vcpu, err);
+
+	return 1;
+}
+
 static int cr0_write_interception(struct vcpu_svm *svm)
 {
 	struct kvm_vcpu *vcpu = &svm->vcpu;
 	int r;
 
-	r = emulate_instruction(&svm->vcpu, 0);
+	r = cr_interception(svm);
 
 	if (svm->nested.vmexit_rip) {
 		kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
@@ -2674,7 +2742,7 @@ static int cr0_write_interception(struct vcpu_svm *svm)
 		svm->nested.vmexit_rip = 0;
 	}
 
-	return r == EMULATE_DONE;
+	return r;
 }
 
 static int cr8_write_interception(struct vcpu_svm *svm)
@@ -2684,13 +2752,13 @@ static int cr8_write_interception(struct vcpu_svm *svm)
 
 	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
 	/* instruction emulation calls kvm_set_cr8() */
-	r = emulate_instruction(&svm->vcpu, 0);
+	r = cr_interception(svm);
 	if (irqchip_in_kernel(svm->vcpu.kvm)) {
 		clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
-		return r == EMULATE_DONE;
+		return r;
 	}
 	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
-		return r == EMULATE_DONE;
+		return r;
 	kvm_run->exit_reason = KVM_EXIT_SET_TPR;
 	return 0;
 }
@@ -2933,14 +3001,14 @@ static int pause_interception(struct vcpu_svm *svm)
 }
 
 static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
-	[SVM_EXIT_READ_CR0]			= emulate_on_interception,
-	[SVM_EXIT_READ_CR3]			= emulate_on_interception,
-	[SVM_EXIT_READ_CR4]			= emulate_on_interception,
-	[SVM_EXIT_READ_CR8]			= emulate_on_interception,
+	[SVM_EXIT_READ_CR0]			= cr_interception,
+	[SVM_EXIT_READ_CR3]			= cr_interception,
+	[SVM_EXIT_READ_CR4]			= cr_interception,
+	[SVM_EXIT_READ_CR8]			= cr_interception,
 	[SVM_EXIT_CR0_SEL_WRITE]		= emulate_on_interception,
 	[SVM_EXIT_WRITE_CR0]			= cr0_write_interception,
-	[SVM_EXIT_WRITE_CR3]			= emulate_on_interception,
-	[SVM_EXIT_WRITE_CR4]			= emulate_on_interception,
+	[SVM_EXIT_WRITE_CR3]			= cr_interception,
+	[SVM_EXIT_WRITE_CR4]			= cr_interception,
 	[SVM_EXIT_WRITE_CR8]			= cr8_write_interception,
 	[SVM_EXIT_READ_DR0]			= emulate_on_interception,
 	[SVM_EXIT_READ_DR1]			= emulate_on_interception,
-- 
cgit v0.10.2


From cae3797a4639898f339fad57c7e12d270ec91a6c Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@amd.com>
Date: Tue, 21 Dec 2010 11:12:05 +0100
Subject: KVM: SVM: enhance mov DR intercept handler

Newer SVM implementations provide the GPR number in the VMCB, so
that the emulation path is no longer necessary to handle debug
register access intercepts. Implement the handling in svm.c and
use it when the info is provided.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5abaa5b2..d84f6a7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2745,6 +2745,30 @@ static int cr0_write_interception(struct vcpu_svm *svm)
 	return r;
 }
 
+static int dr_interception(struct vcpu_svm *svm)
+{
+	int reg, dr;
+	unsigned long val;
+	int err;
+
+	if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
+		return emulate_on_interception(svm);
+
+	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+	dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
+
+	if (dr >= 16) { /* mov to DRn */
+		val = kvm_register_read(&svm->vcpu, reg);
+		kvm_set_dr(&svm->vcpu, dr - 16, val);
+	} else {
+		err = kvm_get_dr(&svm->vcpu, dr, &val);
+		if (!err)
+			kvm_register_write(&svm->vcpu, reg, val);
+	}
+
+	return 1;
+}
+
 static int cr8_write_interception(struct vcpu_svm *svm)
 {
 	struct kvm_run *kvm_run = svm->vcpu.run;
@@ -3010,22 +3034,22 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_WRITE_CR3]			= cr_interception,
 	[SVM_EXIT_WRITE_CR4]			= cr_interception,
 	[SVM_EXIT_WRITE_CR8]			= cr8_write_interception,
-	[SVM_EXIT_READ_DR0]			= emulate_on_interception,
-	[SVM_EXIT_READ_DR1]			= emulate_on_interception,
-	[SVM_EXIT_READ_DR2]			= emulate_on_interception,
-	[SVM_EXIT_READ_DR3]			= emulate_on_interception,
-	[SVM_EXIT_READ_DR4]			= emulate_on_interception,
-	[SVM_EXIT_READ_DR5]			= emulate_on_interception,
-	[SVM_EXIT_READ_DR6]			= emulate_on_interception,
-	[SVM_EXIT_READ_DR7]			= emulate_on_interception,
-	[SVM_EXIT_WRITE_DR0]			= emulate_on_interception,
-	[SVM_EXIT_WRITE_DR1]			= emulate_on_interception,
-	[SVM_EXIT_WRITE_DR2]			= emulate_on_interception,
-	[SVM_EXIT_WRITE_DR3]			= emulate_on_interception,
-	[SVM_EXIT_WRITE_DR4]			= emulate_on_interception,
-	[SVM_EXIT_WRITE_DR5]			= emulate_on_interception,
-	[SVM_EXIT_WRITE_DR6]			= emulate_on_interception,
-	[SVM_EXIT_WRITE_DR7]			= emulate_on_interception,
+	[SVM_EXIT_READ_DR0]			= dr_interception,
+	[SVM_EXIT_READ_DR1]			= dr_interception,
+	[SVM_EXIT_READ_DR2]			= dr_interception,
+	[SVM_EXIT_READ_DR3]			= dr_interception,
+	[SVM_EXIT_READ_DR4]			= dr_interception,
+	[SVM_EXIT_READ_DR5]			= dr_interception,
+	[SVM_EXIT_READ_DR6]			= dr_interception,
+	[SVM_EXIT_READ_DR7]			= dr_interception,
+	[SVM_EXIT_WRITE_DR0]			= dr_interception,
+	[SVM_EXIT_WRITE_DR1]			= dr_interception,
+	[SVM_EXIT_WRITE_DR2]			= dr_interception,
+	[SVM_EXIT_WRITE_DR3]			= dr_interception,
+	[SVM_EXIT_WRITE_DR4]			= dr_interception,
+	[SVM_EXIT_WRITE_DR5]			= dr_interception,
+	[SVM_EXIT_WRITE_DR6]			= dr_interception,
+	[SVM_EXIT_WRITE_DR7]			= dr_interception,
 	[SVM_EXIT_EXCP_BASE + DB_VECTOR]	= db_interception,
 	[SVM_EXIT_EXCP_BASE + BP_VECTOR]	= bp_interception,
 	[SVM_EXIT_EXCP_BASE + UD_VECTOR]	= ud_interception,
-- 
cgit v0.10.2


From df4f3108562dc6f6ae6648f2698df7f4c9acf52d Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@amd.com>
Date: Tue, 21 Dec 2010 11:12:06 +0100
Subject: KVM: SVM: implement enhanced INVLPG intercept

When the DecodeAssist feature is available, the linear address
is provided in the VMCB on INVLPG intercepts. Use it directly to
avoid any decoding and emulation.
This is only useful for shadow paging, though.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d84f6a7..a04c01e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2652,7 +2652,12 @@ static int iret_interception(struct vcpu_svm *svm)
 
 static int invlpg_interception(struct vcpu_svm *svm)
 {
-	return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+		return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+
+	kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
+	skip_emulated_instruction(&svm->vcpu);
+	return 1;
 }
 
 static int emulate_on_interception(struct vcpu_svm *svm)
-- 
cgit v0.10.2


From dc25e89e07d5ef31c476117d2c76b34dbb22196c Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@amd.com>
Date: Tue, 21 Dec 2010 11:12:07 +0100
Subject: KVM: SVM: copy instruction bytes from VMCB

In case of a nested page fault or an intercepted #PF newer SVM
implementations provide a copy of the faulting instruction bytes
in the VMCB.
Use these bytes to feed the instruction emulator and avoid the costly
guest instruction fetch in this case.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index bf70ece..8e37deb 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -265,7 +265,7 @@ struct x86_emulate_ctxt {
 #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
 #endif
 
-int x86_decode_insn(struct x86_emulate_ctxt *ctxt);
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
 #define EMULATION_FAILED -1
 #define EMULATION_OK 0
 #define EMULATION_RESTART 1
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index de00b60..6268f6c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -634,13 +634,13 @@ enum emulation_result {
 #define EMULTYPE_NO_DECODE	    (1 << 0)
 #define EMULTYPE_TRAP_UD	    (1 << 1)
 #define EMULTYPE_SKIP		    (1 << 2)
-int x86_emulate_instruction(struct kvm_vcpu *vcpu,
-			unsigned long cr2, int emulation_type);
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
+			    int emulation_type, void *insn, int insn_len);
 
 static inline int emulate_instruction(struct kvm_vcpu *vcpu,
 			int emulation_type)
 {
-	return x86_emulate_instruction(vcpu, 0, emulation_type);
+	return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
 }
 
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@@ -721,7 +721,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
 int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
 
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
+		       void *insn, int insn_len);
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
 
 void kvm_enable_tdp(void);
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index f0ffb81..f2b83bc 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -83,7 +83,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 	u32 clean;
 	u32 reserved_5;
 	u64 next_rip;
-	u8 reserved_6[816];
+	u8 insn_len;
+	u8 insn_bytes[15];
+	u8 reserved_6[800];
 };
 
 
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 36534ec..caf9667 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2610,7 +2610,7 @@ done:
 }
 
 int
-x86_decode_insn(struct x86_emulate_ctxt *ctxt)
+x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
 {
 	struct x86_emulate_ops *ops = ctxt->ops;
 	struct decode_cache *c = &ctxt->decode;
@@ -2621,7 +2621,10 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt)
 	struct operand memop = { .type = OP_NONE };
 
 	c->eip = ctxt->eip;
-	c->fetch.start = c->fetch.end = c->eip;
+	c->fetch.start = c->eip;
+	c->fetch.end = c->fetch.start + insn_len;
+	if (insn_len > 0)
+		memcpy(c->fetch.data, insn, insn_len);
 	ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
 
 	switch (mode) {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 01c5a10..ea6063d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3330,7 +3330,8 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 	}
 }
 
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
+		       void *insn, int insn_len)
 {
 	int r;
 	enum emulation_result er;
@@ -3348,7 +3349,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
 	if (r)
 		goto out;
 
-	er = x86_emulate_instruction(vcpu, cr2, 0);
+	er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
 
 	switch (er) {
 	case EMULATE_DONE:
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a04c01e..af4b911 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1527,7 +1527,9 @@ static int pf_interception(struct vcpu_svm *svm)
 		trace_kvm_page_fault(fault_address, error_code);
 		if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
 			kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
-		r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
+		r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+			svm->vmcb->control.insn_bytes,
+			svm->vmcb->control.insn_len);
 		break;
 	case KVM_PV_REASON_PAGE_NOT_PRESENT:
 		svm->apf_reason = 0;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f3c60fb..736f839 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3055,7 +3055,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 
 		if (kvm_event_needs_reinjection(vcpu))
 			kvm_mmu_unprotect_page_virt(vcpu, cr2);
-		return kvm_mmu_page_fault(vcpu, cr2, error_code);
+		return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
 	}
 
 	if (vmx->rmode.vm86_active &&
@@ -3502,7 +3502,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 
 	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 	trace_kvm_page_fault(gpa, exit_qualification);
-	return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3);
+	return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
 }
 
 static u64 ept_rsvd_mask(u64 spte, int level)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a6fcb76..7ad9cda 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4365,7 +4365,9 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
 
 int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 			    unsigned long cr2,
-			    int emulation_type)
+			    int emulation_type,
+			    void *insn,
+			    int insn_len)
 {
 	int r;
 	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
@@ -4386,7 +4388,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 		vcpu->arch.emulate_ctxt.have_exception = false;
 		vcpu->arch.emulate_ctxt.perm_ok = false;
 
-		r = x86_decode_insn(&vcpu->arch.emulate_ctxt);
+		r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
 		if (r == X86EMUL_PROPAGATE_FAULT)
 			goto done;
 
-- 
cgit v0.10.2


From 07c116d2f53d721377fe428d067e2cecf80c906e Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 21 Dec 2010 12:54:19 +0200
Subject: KVM: VMX: Add definitions for more vm entry/exit control bits

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9642c22..84471b8 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -66,15 +66,23 @@
 #define PIN_BASED_NMI_EXITING                   0x00000008
 #define PIN_BASED_VIRTUAL_NMIS                  0x00000020
 
+#define VM_EXIT_SAVE_DEBUG_CONTROLS             0x00000002
 #define VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200
+#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL      0x00001000
 #define VM_EXIT_ACK_INTR_ON_EXIT                0x00008000
 #define VM_EXIT_SAVE_IA32_PAT			0x00040000
 #define VM_EXIT_LOAD_IA32_PAT			0x00080000
+#define VM_EXIT_SAVE_IA32_EFER                  0x00100000
+#define VM_EXIT_LOAD_IA32_EFER                  0x00200000
+#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER       0x00400000
 
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS            0x00000002
 #define VM_ENTRY_IA32E_MODE                     0x00000200
 #define VM_ENTRY_SMM                            0x00000400
 #define VM_ENTRY_DEACT_DUAL_MONITOR             0x00000800
+#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL     0x00002000
 #define VM_ENTRY_LOAD_IA32_PAT			0x00004000
+#define VM_ENTRY_LOAD_IA32_EFER                 0x00008000
 
 /* VMCS Encodings */
 enum vmcs_field {
-- 
cgit v0.10.2


From 110312c84b5fbd4daf5de2417fa8ab5ec883858d Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 21 Dec 2010 12:54:20 +0200
Subject: KVM: VMX: Optimize atomic EFER load

When NX is enabled on the host but not on the guest, we use the entry/exit
msr load facility, which is slow.  Optimize it to use entry/exit efer load,
which is ~1200 cycles faster.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 736f839..a713c69 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -191,6 +191,8 @@ static unsigned long *vmx_io_bitmap_b;
 static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
 
+static bool cpu_has_load_ia32_efer;
+
 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
 static DEFINE_SPINLOCK(vmx_vpid_lock);
 
@@ -664,6 +666,12 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
 	unsigned i;
 	struct msr_autoload *m = &vmx->msr_autoload;
 
+	if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
+		vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
+		vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
+		return;
+	}
+
 	for (i = 0; i < m->nr; ++i)
 		if (m->guest[i].index == msr)
 			break;
@@ -683,6 +691,14 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
 	unsigned i;
 	struct msr_autoload *m = &vmx->msr_autoload;
 
+	if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
+		vmcs_write64(GUEST_IA32_EFER, guest_val);
+		vmcs_write64(HOST_IA32_EFER, host_val);
+		vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
+		vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
+		return;
+	}
+
 	for (i = 0; i < m->nr; ++i)
 		if (m->guest[i].index == msr)
 			break;
@@ -1418,6 +1434,14 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
 	return 0;
 }
 
+static __init bool allow_1_setting(u32 msr, u32 ctl)
+{
+	u32 vmx_msr_low, vmx_msr_high;
+
+	rdmsr(msr, vmx_msr_low, vmx_msr_high);
+	return vmx_msr_high & ctl;
+}
+
 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 {
 	u32 vmx_msr_low, vmx_msr_high;
@@ -1532,6 +1556,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	vmcs_conf->vmexit_ctrl         = _vmexit_control;
 	vmcs_conf->vmentry_ctrl        = _vmentry_control;
 
+	cpu_has_load_ia32_efer =
+		allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
+				VM_ENTRY_LOAD_IA32_EFER)
+		&& allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
+				   VM_EXIT_LOAD_IA32_EFER);
+
 	return 0;
 }
 
-- 
cgit v0.10.2


From c445f8ef43bd9d2fb28213920d9b21f2465d22df Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 21 Dec 2010 16:26:01 +0200
Subject: KVM: MMU: Initialize base_role for tdp mmus

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ea6063d..b77a5ba 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2885,6 +2885,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *context = vcpu->arch.walk_mmu;
 
+	context->base_role.word = 0;
 	context->new_cr3 = nonpaging_new_cr3;
 	context->page_fault = tdp_page_fault;
 	context->free = nonpaging_free;
-- 
cgit v0.10.2


From 16d8f72f70a85ce333788204a69318827130ca75 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 21 Dec 2010 16:51:50 +0200
Subject: KVM: VMX: Correct asm constraint in vmcs_load()/vmcs_clear()

'error' is byte sized, so use a byte register constraint.

Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a713c69..67c0852 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -477,7 +477,7 @@ static void vmcs_clear(struct vmcs *vmcs)
 	u8 error;
 
 	asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
-		      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+		      : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
 		      : "cc", "memory");
 	if (error)
 		printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
@@ -490,7 +490,7 @@ static void vmcs_load(struct vmcs *vmcs)
 	u8 error;
 
 	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
-			: "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+			: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
 			: "cc", "memory");
 	if (error)
 		printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
-- 
cgit v0.10.2


From e49146dce8c3dc6f4485c1904b6587855f393e71 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Wed, 22 Dec 2010 09:01:57 -0200
Subject: KVM: MMU: only write protect mappings at pagetable level

If a pagetable contains a writeable large spte, all of its sptes will be
write protected, including non-leaf ones, leading to endless pagefaults.

Do not write protect pages above PT_PAGE_TABLE_LEVEL, as the spte fault
paths assume non-leaf sptes are writable.

Tested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b77a5ba..a2127f8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3445,6 +3445,9 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 		if (!test_bit(slot, sp->slot_bitmap))
 			continue;
 
+		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+			continue;
+
 		pt = sp->spt;
 		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
 			/* avoid RMW */
-- 
cgit v0.10.2


From 9f8fe5043fd26627c2fa2e9a41896885e675000b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 5 Dec 2010 17:30:00 +0200
Subject: KVM: Replace reads of vcpu->arch.cr3 by an accessor

This allows us to keep cr3 in the VMCS, later on.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 95ac3af..a6bf8db 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -73,6 +73,11 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
 	return vcpu->arch.cr4 & mask;
 }
 
+static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.cr3;
+}
+
 static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
 {
 	return kvm_read_cr4_bits(vcpu, ~0UL);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a2127f8..e558795 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2727,13 +2727,13 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 
 static void paging_new_cr3(struct kvm_vcpu *vcpu)
 {
-	pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
+	pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
 	mmu_free_roots(vcpu);
 }
 
 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
 {
-	return vcpu->arch.cr3;
+	return kvm_read_cr3(vcpu);
 }
 
 static void inject_page_fault(struct kvm_vcpu *vcpu,
@@ -3637,7 +3637,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
 
 static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 {
-	(void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
+	(void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
 	return 1;
 }
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index af4b911..a7b04c0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1174,7 +1174,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 	switch (reg) {
 	case VCPU_EXREG_PDPTR:
 		BUG_ON(!npt_enabled);
-		load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
+		load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
 		break;
 	default:
 		BUG();
@@ -2116,7 +2116,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	nested_vmcb->save.idtr   = vmcb->save.idtr;
 	nested_vmcb->save.efer   = svm->vcpu.arch.efer;
 	nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
-	nested_vmcb->save.cr3    = svm->vcpu.arch.cr3;
+	nested_vmcb->save.cr3    = kvm_read_cr3(&svm->vcpu);
 	nested_vmcb->save.cr2    = vmcb->save.cr2;
 	nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
 	nested_vmcb->save.rflags = vmcb->save.rflags;
@@ -2311,7 +2311,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	if (npt_enabled)
 		hsave->save.cr3    = vmcb->save.cr3;
 	else
-		hsave->save.cr3    = svm->vcpu.arch.cr3;
+		hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);
 
 	copy_vmcb_control_area(hsave, vmcb);
 
@@ -2715,7 +2715,7 @@ static int cr_interception(struct vcpu_svm *svm)
 			val = svm->vcpu.arch.cr2;
 			break;
 		case 3:
-			val = svm->vcpu.arch.cr3;
+			val = kvm_read_cr3(&svm->vcpu);
 			break;
 		case 4:
 			val = kvm_read_cr4(&svm->vcpu);
@@ -3693,7 +3693,7 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 	mark_dirty(svm->vmcb, VMCB_NPT);
 
 	/* Also sync guest cr3 here in case we live migrate */
-	svm->vmcb->save.cr3 = vcpu->arch.cr3;
+	svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
 	mark_dirty(svm->vmcb, VMCB_CR);
 
 	svm_flush_tlb(vcpu);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 67c0852..141956e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1989,7 +1989,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	if (enable_ept) {
 		eptp = construct_eptp(cr3);
 		vmcs_write64(EPT_POINTER, eptp);
-		guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
+		guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
 			vcpu->kvm->arch.ept_identity_map_addr;
 		ept_load_pdptrs(vcpu);
 	}
@@ -3227,8 +3227,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 	case 1: /*mov from cr*/
 		switch (cr) {
 		case 3:
-			kvm_register_write(vcpu, reg, vcpu->arch.cr3);
-			trace_kvm_cr_read(cr, vcpu->arch.cr3);
+			val = kvm_read_cr3(vcpu);
+			kvm_register_write(vcpu, reg, val);
+			trace_kvm_cr_read(cr, val);
 			skip_emulated_instruction(vcpu);
 			return 1;
 		case 8:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7ad9cda..6e50314 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -473,8 +473,8 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 		      (unsigned long *)&vcpu->arch.regs_avail))
 		return true;
 
-	gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT;
-	offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1);
+	gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
+	offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
 	r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
 				       PFERR_USER_MASK | PFERR_WRITE_MASK);
 	if (r < 0)
@@ -519,7 +519,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 		} else
 #endif
 		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
-						 vcpu->arch.cr3))
+						 kvm_read_cr3(vcpu)))
 			return 1;
 	}
 
@@ -611,7 +611,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 			return 1;
 	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
 		   && ((cr4 ^ old_cr4) & pdptr_bits)
-		   && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))
+		   && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
+				   kvm_read_cr3(vcpu)))
 		return 1;
 
 	if (cr4 & X86_CR4_VMXE)
@@ -631,7 +632,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
 
 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
-	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
+	if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
 		kvm_mmu_sync_roots(vcpu);
 		kvm_mmu_flush_tlb(vcpu);
 		return 0;
@@ -4073,7 +4074,7 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
 		value = vcpu->arch.cr2;
 		break;
 	case 3:
-		value = vcpu->arch.cr3;
+		value = kvm_read_cr3(vcpu);
 		break;
 	case 4:
 		value = kvm_read_cr4(vcpu);
@@ -5512,7 +5513,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 
 	sregs->cr0 = kvm_read_cr0(vcpu);
 	sregs->cr2 = vcpu->arch.cr2;
-	sregs->cr3 = vcpu->arch.cr3;
+	sregs->cr3 = kvm_read_cr3(vcpu);
 	sregs->cr4 = kvm_read_cr4(vcpu);
 	sregs->cr8 = kvm_get_cr8(vcpu);
 	sregs->efer = vcpu->arch.efer;
@@ -5580,7 +5581,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	kvm_x86_ops->set_gdt(vcpu, &dt);
 
 	vcpu->arch.cr2 = sregs->cr2;
-	mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
+	mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
 	vcpu->arch.cr3 = sregs->cr3;
 
 	kvm_set_cr8(vcpu, sregs->cr8);
@@ -5598,7 +5599,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	if (sregs->cr4 & X86_CR4_OSXSAVE)
 		update_cpuid(vcpu);
 	if (!is_long_mode(vcpu) && is_pae(vcpu)) {
-		load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
+		load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
 		mmu_reset_needed = 1;
 	}
 
-- 
cgit v0.10.2


From aff48baa34c033318ad322ecbf2e4bcd891b29ca Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 5 Dec 2010 18:56:11 +0200
Subject: KVM: Fetch guest cr3 from hardware on demand

Instead of syncing the guest cr3 every exit, which is expensive on vmx
with ept enabled, sync it only on demand.

[sheng: fix incorrect cr3 seen by Windows XP]

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6268f6c..95f026b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -117,6 +117,7 @@ enum kvm_reg {
 
 enum kvm_reg_ex {
 	VCPU_EXREG_PDPTR = NR_VCPU_REGS,
+	VCPU_EXREG_CR3,
 };
 
 enum {
@@ -533,6 +534,7 @@ struct kvm_x86_ops {
 			    struct kvm_segment *var, int seg);
 	void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
 	void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
+	void (*decache_cr3)(struct kvm_vcpu *vcpu);
 	void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
 	void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
 	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index a6bf8db..3377d53 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -75,6 +75,8 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
 
 static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
 {
+	if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+		kvm_x86_ops->decache_cr3(vcpu);
 	return vcpu->arch.cr3;
 }
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a7b04c0..25bd1bc 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1327,6 +1327,10 @@ static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
 {
 }
 
+static void svm_decache_cr3(struct kvm_vcpu *vcpu)
+{
+}
+
 static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
 {
 }
@@ -3871,6 +3875,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.get_cpl = svm_get_cpl,
 	.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
 	.decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
+	.decache_cr3 = svm_decache_cr3,
 	.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
 	.set_cr0 = svm_set_cr0,
 	.set_cr3 = svm_set_cr3,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 141956e..1896cad 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -180,6 +180,7 @@ static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
+static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -1866,6 +1867,13 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
 	vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
 }
 
+static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
+{
+	if (enable_ept && is_paging(vcpu))
+		vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+}
+
 static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
 {
 	ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
@@ -1909,6 +1917,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
 					unsigned long cr0,
 					struct kvm_vcpu *vcpu)
 {
+	vmx_decache_cr3(vcpu);
 	if (!(cr0 & X86_CR0_PG)) {
 		/* From paging/starting to nonpaging */
 		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -3756,11 +3765,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	if (vmx->emulation_required && emulate_invalid_guest_state)
 		return handle_invalid_guest_state(vcpu);
 
-	/* Access CR3 don't cause VMExit in paging mode, so we need
-	 * to sync with guest real CR3. */
-	if (enable_ept && is_paging(vcpu))
-		vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
-
 	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
 		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -4077,7 +4081,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	      );
 
 	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
-				  | (1 << VCPU_EXREG_PDPTR));
+				  | (1 << VCPU_EXREG_PDPTR)
+				  | (1 << VCPU_EXREG_CR3));
 	vcpu->arch.regs_dirty = 0;
 
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
@@ -4344,6 +4349,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.get_cpl = vmx_get_cpl,
 	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
 	.decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
+	.decache_cr3 = vmx_decache_cr3,
 	.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
 	.set_cr0 = vmx_set_cr0,
 	.set_cr3 = vmx_set_cr3,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6e50314..fa708c9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -667,6 +667,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
 		return 1;
 	vcpu->arch.cr3 = cr3;
+	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
 	vcpu->arch.mmu.new_cr3(vcpu);
 	return 0;
 }
@@ -5583,6 +5584,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	vcpu->arch.cr2 = sregs->cr2;
 	mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
 	vcpu->arch.cr3 = sregs->cr3;
+	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
 
 	kvm_set_cr8(vcpu, sregs->cr8);
 
-- 
cgit v0.10.2


From b034cf0105235e65ee1b0161dbe8fef0338d06e7 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 23 Dec 2010 16:08:35 +0800
Subject: KVM: MMU: audit: allow audit more guests at the same time

Currently only one guest in the system can be audited, since:
- 'audit_point' is a global variable
- mmu_audit_disable() is called in kvm_mmu_destroy(), so audit is disabled
  after a guest exits

This patch fixes those issues, allowing more guests to be audited at the same time.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 95f026b..aa75f21 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -461,6 +461,10 @@ struct kvm_arch {
 	/* fields used by HYPER-V emulation */
 	u64 hv_guest_os_id;
 	u64 hv_hypercall;
+
+	#ifdef CONFIG_KVM_MMU_AUDIT
+	int audit_point;
+	#endif
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e558795..892ffc6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3534,13 +3534,6 @@ static void mmu_destroy_caches(void)
 		kmem_cache_destroy(mmu_page_header_cache);
 }
 
-void kvm_mmu_module_exit(void)
-{
-	mmu_destroy_caches();
-	percpu_counter_destroy(&kvm_total_used_mmu_pages);
-	unregister_shrinker(&mmu_shrinker);
-}
-
 int kvm_mmu_module_init(void)
 {
 	pte_chain_cache = kmem_cache_create("kvm_pte_chain",
@@ -3733,12 +3726,6 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
 
-#ifdef CONFIG_KVM_MMU_AUDIT
-#include "mmu_audit.c"
-#else
-static void mmu_audit_disable(void) { }
-#endif
-
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 {
 	ASSERT(vcpu);
@@ -3746,5 +3733,18 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 	destroy_kvm_mmu(vcpu);
 	free_mmu_pages(vcpu);
 	mmu_free_memory_caches(vcpu);
+}
+
+#ifdef CONFIG_KVM_MMU_AUDIT
+#include "mmu_audit.c"
+#else
+static void mmu_audit_disable(void) { }
+#endif
+
+void kvm_mmu_module_exit(void)
+{
+	mmu_destroy_caches();
+	percpu_counter_destroy(&kvm_total_used_mmu_pages);
+	unregister_shrinker(&mmu_shrinker);
 	mmu_audit_disable();
 }
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index ba2bcdd..5f6223b 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,11 +19,9 @@
 
 #include <linux/ratelimit.h>
 
-static int audit_point;
-
-#define audit_printk(fmt, args...)		\
+#define audit_printk(kvm, fmt, args...)		\
 	printk(KERN_ERR "audit: (%s) error: "	\
-		fmt, audit_point_name[audit_point], ##args)
+		fmt, audit_point_name[kvm->arch.audit_point], ##args)
 
 typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
 
@@ -97,18 +95,21 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 
 	if (sp->unsync) {
 		if (level != PT_PAGE_TABLE_LEVEL) {
-			audit_printk("unsync sp: %p level = %d\n", sp, level);
+			audit_printk(vcpu->kvm, "unsync sp: %p "
+				     "level = %d\n", sp, level);
 			return;
 		}
 
 		if (*sptep == shadow_notrap_nonpresent_pte) {
-			audit_printk("notrap spte in unsync sp: %p\n", sp);
+			audit_printk(vcpu->kvm, "notrap spte in unsync "
+				     "sp: %p\n", sp);
 			return;
 		}
 	}
 
 	if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
-		audit_printk("notrap spte in direct sp: %p\n", sp);
+		audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
+			     sp);
 		return;
 	}
 
@@ -125,8 +126,9 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 
 	hpa =  pfn << PAGE_SHIFT;
 	if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
-		audit_printk("levels %d pfn %llx hpa %llx ent %llxn",
-				   vcpu->arch.mmu.root_level, pfn, hpa, *sptep);
+		audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
+			     "ent %llxn", vcpu->arch.mmu.root_level, pfn,
+			     hpa, *sptep);
 }
 
 static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
@@ -142,8 +144,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 	if (!gfn_to_memslot(kvm, gfn)) {
 		if (!printk_ratelimit())
 			return;
-		audit_printk("no memslot for gfn %llx\n", gfn);
-		audit_printk("index %ld of sp (gfn=%llx)\n",
+		audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
+		audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
 		       (long int)(sptep - rev_sp->spt), rev_sp->gfn);
 		dump_stack();
 		return;
@@ -153,7 +155,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 	if (!*rmapp) {
 		if (!printk_ratelimit())
 			return;
-		audit_printk("no rmap for writable spte %llx\n", *sptep);
+		audit_printk(kvm, "no rmap for writable spte %llx\n",
+			     *sptep);
 		dump_stack();
 	}
 }
@@ -168,8 +171,9 @@ static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 {
 	struct kvm_mmu_page *sp = page_header(__pa(sptep));
 
-	if (audit_point == AUDIT_POST_SYNC && sp->unsync)
-		audit_printk("meet unsync sp(%p) after sync root.\n", sp);
+	if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
+		audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
+			     "root.\n", sp);
 }
 
 static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -202,8 +206,9 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 	spte = rmap_next(kvm, rmapp, NULL);
 	while (spte) {
 		if (is_writable_pte(*spte))
-			audit_printk("shadow page has writable mappings: gfn "
-				     "%llx role %x\n", sp->gfn, sp->role.word);
+			audit_printk(kvm, "shadow page has writable "
+				     "mappings: gfn %llx role %x\n",
+				     sp->gfn, sp->role.word);
 		spte = rmap_next(kvm, rmapp, spte);
 	}
 }
@@ -238,7 +243,7 @@ static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
 	if (!__ratelimit(&ratelimit_state))
 		return;
 
-	audit_point = point;
+	vcpu->kvm->arch.audit_point = point;
 	audit_all_active_sps(vcpu->kvm);
 	audit_vcpu_spte(vcpu);
 }
-- 
cgit v0.10.2


From f8e453b00cf1d90a047eb0ef8eefe6200e0f4512 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 23 Dec 2010 16:09:29 +0800
Subject: KVM: MMU: handle 'map_writable' in set_spte() function

Move the operation of 'writable' to set_spte() to clean up code

[avi: remove unneeded booleanification]

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 892ffc6..9cafbb4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1983,6 +1983,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 	if (host_writable)
 		spte |= SPTE_HOST_WRITEABLE;
+	else
+		pte_access &= ~ACC_WRITE_MASK;
 
 	spte |= (u64)pfn << PAGE_SHIFT;
 
@@ -2222,8 +2224,6 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 		if (iterator.level == level) {
 			unsigned pte_access = ACC_ALL;
 
-			if (!map_writable)
-				pte_access &= ~ACC_WRITE_MASK;
 			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
 				     0, write, 1, &pt_write,
 				     level, gfn, pfn, prefault, map_writable);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 5ca9426..53210f1 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -511,9 +511,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		link_shadow_page(it.sptep, sp);
 	}
 
-	if (!map_writable)
-		access &= ~ACC_WRITE_MASK;
-
 	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
 		     user_fault, write_fault, dirty, ptwrite, it.level,
 		     gw->gfn, pfn, prefault, map_writable);
@@ -809,12 +806,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
 		nr_present++;
 		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
-		if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
-			pte_access &= ~ACC_WRITE_MASK;
-			host_writable = 0;
-		} else {
-			host_writable = 1;
-		}
+		host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
+
 		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
 			 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
 			 spte_to_pfn(sp->spt[i]), true, false,
-- 
cgit v0.10.2


From 444e863d13373b958ec6b133dcecf140d6c6c2ab Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 27 Dec 2010 17:25:04 +0200
Subject: KVM: VMX: when entering real mode align segment base to 16 bytes

VMX checks that the base is equal to the selector shifted 4 bits left. Otherwise
guest entry fails.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1896cad..bf89ec2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1736,9 +1736,13 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
 	save->limit = vmcs_read32(sf->limit);
 	save->ar = vmcs_read32(sf->ar_bytes);
 	vmcs_write16(sf->selector, save->base >> 4);
-	vmcs_write32(sf->base, save->base & 0xfffff);
+	vmcs_write32(sf->base, save->base & 0xffff0);
 	vmcs_write32(sf->limit, 0xffff);
 	vmcs_write32(sf->ar_bytes, 0xf3);
+	if (save->base & 0xf)
+		printk_once(KERN_WARNING "kvm: segment base is not paragraph"
+			    " aligned when entering protected mode (seg=%d)",
+			    seg);
 }
 
 static void enter_rmode(struct kvm_vcpu *vcpu)
-- 
cgit v0.10.2


From e5c301428294cb8925667c9ee39f817c4ab1c2c9 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 11 Jan 2011 12:15:54 +0200
Subject: KVM: Initialize fpu state in preemptible context

init_fpu() (which is indirectly called by the fpu switching code) assumes
it is in process context.  Rather than making init_fpu() use an atomic
allocation, which can cause a task to be killed, make sure the fpu is
already initialized when we enter the run loop.

KVM-Stable-Tag.
Reported-and-tested-by: Kirill A. Shutemov <kas@openvz.org>
Acked-by: Pekka Enberg <penberg@kernel.org>
Reviewed-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 58bb239..e60c38c 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -169,6 +169,7 @@ int init_fpu(struct task_struct *tsk)
 	set_stopped_child_used_math(tsk);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(init_fpu);
 
 /*
  * The xstateregs_active() routine is the same as the fpregs_active() routine,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fa708c9..9dda70d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5376,6 +5376,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	int r;
 	sigset_t sigsaved;
 
+	if (!tsk_used_math(current) && init_fpu(current))
+		return -ENOMEM;
+
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
-- 
cgit v0.10.2