From e48ba1cbce12eb4546771d45c09dd94c3404efe8 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 10 Aug 2016 11:13:09 +1000
Subject: KVM: PPC: Book3S: Don't crash if irqfd used with no in-kernel XICS
 emulation

It turns out that if userspace creates a pseries-type VM without
in-kernel XICS (interrupt controller) emulation, and then connects
an eventfd to the VM as an irqfd, and the eventfd gets signalled,
that the code will try to deliver an interrupt via the non-existent
XICS object and crash the host kernel with a NULL pointer dereference.

To fix this, we check for the presence of the XICS object before
trying to deliver the interrupt, and return with an error if not.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 05aa113..686147e 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -1252,6 +1252,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
 {
 	struct kvmppc_xics *xics = kvm->arch.xics;
 
+	if (!xics)
+		return -ENODEV;
 	return ics_deliver_irq(xics, irq, level);
 }
 
-- 
cgit v0.10.2


From 34a75b0f63356097ae9f706d64a793934891002f Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 10 Aug 2016 11:27:27 +1000
Subject: KVM: PPC: Implement kvm_arch_intc_initialized() for PPC

It doesn't make sense to create irqfds for a VM that doesn't have
in-kernel interrupt controller emulation.  There is an existing
interface for architecture code to tell the irqfd code whether or
not any interrupt controller has been initialized, called
kvm_arch_intc_initialized(), so let's implement that for powerpc.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index ec35af3..e36ce0c 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -43,6 +43,8 @@
 #include <asm/cputhreads.h>
 #define KVM_MAX_VCPU_ID                (threads_per_subcore * KVM_MAX_VCORES)
 
+#define __KVM_HAVE_ARCH_INTC_INITIALIZED
+
 #ifdef CONFIG_KVM_MMIO
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #endif
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 6ce40dd..ab2b3d5 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -1167,6 +1167,19 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 	return r;
 }
 
+bool kvm_arch_intc_initialized(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_MPIC
+	if (kvm->arch.mpic)
+		return true;
+#endif
+#ifdef CONFIG_KVM_XICS
+	if (kvm->arch.xics)
+		return true;
+#endif
+	return false;
+}
+
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                     struct kvm_mp_state *mp_state)
 {
-- 
cgit v0.10.2


From 4b3d173d0440d37534906b6d93c02dfb577c68ce Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 18 Aug 2016 16:04:41 +1000
Subject: KVM: PPC: Always select KVM_VFIO, plus Makefile cleanup

As discussed recently on the kvm mailing list, David Gibson's
intention in commit 178a78750212 ("vfio: Enable VFIO device for
powerpc", 2016-02-01) was to have the KVM VFIO device built in
on all powerpc platforms.  This patch adds the "select KVM_VFIO"
statement that makes this happen.

Currently, arch/powerpc/kvm/Makefile doesn't include vfio.o for
the 64-bit kvm module, because the list of objects doesn't use
the $(common-objs-y) list.  The reason it doesn't is because we
don't necessarily want coalesced_mmio.o or emulate.o (for example
if HV KVM is the only target), and common-objs-y includes both.

Since this is confusing, this patch adjusts the definitions so that
we now use $(common-objs-y) in the list for the 64-bit kvm.ko
module, emulate.o is removed from common-objs-y and added in the
places that need it, and the inclusion of coalesced_mmio.o now
depends on CONFIG_KVM_MMIO.

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index c2024ac..c0a2478 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -22,6 +22,7 @@ config KVM
 	select ANON_INODES
 	select HAVE_KVM_EVENTFD
 	select SRCU
+	select KVM_VFIO
 
 config KVM_BOOK3S_HANDLER
 	bool
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 1f9e552..bd9b82f 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -7,16 +7,16 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
 KVM := ../../../virt/kvm
 
-common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
-		$(KVM)/eventfd.o
+common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o
 common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
+common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
 
 CFLAGS_e500_mmu.o := -I.
 CFLAGS_e500_mmu_host.o := -I.
 CFLAGS_emulate.o  := -I.
 CFLAGS_emulate_loadstore.o  := -I.
 
-common-objs-y += powerpc.o emulate.o emulate_loadstore.o
+common-objs-y += powerpc.o emulate_loadstore.o
 obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
 obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o
 
@@ -24,6 +24,7 @@ AFLAGS_booke_interrupts.o := -I$(objtree)/$(obj)
 
 kvm-e500-objs := \
 	$(common-objs-y) \
+	emulate.o \
 	booke.o \
 	booke_emulate.o \
 	booke_interrupts.o \
@@ -35,6 +36,7 @@ kvm-objs-$(CONFIG_KVM_E500V2) := $(kvm-e500-objs)
 
 kvm-e500mc-objs := \
 	$(common-objs-y) \
+	emulate.o \
 	booke.o \
 	booke_emulate.o \
 	bookehv_interrupts.o \
@@ -61,9 +63,6 @@ kvm-pr-y := \
 	book3s_32_mmu.o
 
 ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
-kvm-book3s_64-module-objs := \
-	$(KVM)/coalesced_mmio.o
-
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
 	book3s_rmhandlers.o
 endif
@@ -88,11 +87,8 @@ endif
 kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
 	book3s_xics.o
 
-kvm-book3s_64-module-objs += \
-	$(KVM)/kvm_main.o \
-	$(KVM)/eventfd.o \
-	powerpc.o \
-	emulate_loadstore.o \
+kvm-book3s_64-module-objs := \
+	$(common-objs-y) \
 	book3s.o \
 	book3s_64_vio.o \
 	book3s_rtas.o \
@@ -102,6 +98,7 @@ kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
 
 kvm-book3s_32-objs := \
 	$(common-objs-y) \
+	emulate.o \
 	fpu.o \
 	book3s_paired_singles.o \
 	book3s.o \
-- 
cgit v0.10.2


From e64fb7e272885c1ea3cd2f68f267ae12fa04c8b1 Mon Sep 17 00:00:00 2001
From: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Date: Tue, 2 Aug 2016 14:03:19 +1000
Subject: KVM: PPC: Book3S HV: Move struct kvmppc_vcore from kvm_host.h to
 kvm_book3s.h

The next commit will introduce a member to the kvmppc_vcore struct which
references MAX_SMT_THREADS which is defined in kvm_book3s_asm.h, however
this file isn't included in kvm_host.h directly. Thus compiling for
certain platforms such as pmac32_defconfig and ppc64e_defconfig with KVM
fails due to MAX_SMT_THREADS not being defined.

Move the struct kvmppc_vcore definition to kvm_book3s.h which explicitly
includes kvm_book3s_asm.h.

Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 8f39796..a50c5fe 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -69,6 +69,41 @@ struct hpte_cache {
 	int pagesize;
 };
 
+/*
+ * Struct for a virtual core.
+ * Note: entry_exit_map combines a bitmap of threads that have entered
+ * in the bottom 8 bits and a bitmap of threads that have exited in the
+ * next 8 bits.  This is so that we can atomically set the entry bit
+ * iff the exit map is 0 without taking a lock.
+ */
+struct kvmppc_vcore {
+	int n_runnable;
+	int num_threads;
+	int entry_exit_map;
+	int napping_threads;
+	int first_vcpuid;
+	u16 pcpu;
+	u16 last_cpu;
+	u8 vcore_state;
+	u8 in_guest;
+	struct kvmppc_vcore *master_vcore;
+	struct list_head runnable_threads;
+	struct list_head preempt_list;
+	spinlock_t lock;
+	struct swait_queue_head wq;
+	spinlock_t stoltb_lock;	/* protects stolen_tb and preempt_tb */
+	u64 stolen_tb;
+	u64 preempt_tb;
+	struct kvm_vcpu *runner;
+	struct kvm *kvm;
+	u64 tb_offset;		/* guest timebase - host timebase */
+	ulong lpcr;
+	u32 arch_compat;
+	ulong pcr;
+	ulong dpdes;		/* doorbell state (POWER8) */
+	ulong conferring_threads;
+};
+
 struct kvmppc_vcpu_book3s {
 	struct kvmppc_sid_map sid_map[SID_MAP_NUM];
 	struct {
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e36ce0c..7ff9919 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -277,41 +277,6 @@ struct kvm_arch {
 #endif
 };
 
-/*
- * Struct for a virtual core.
- * Note: entry_exit_map combines a bitmap of threads that have entered
- * in the bottom 8 bits and a bitmap of threads that have exited in the
- * next 8 bits.  This is so that we can atomically set the entry bit
- * iff the exit map is 0 without taking a lock.
- */
-struct kvmppc_vcore {
-	int n_runnable;
-	int num_threads;
-	int entry_exit_map;
-	int napping_threads;
-	int first_vcpuid;
-	u16 pcpu;
-	u16 last_cpu;
-	u8 vcore_state;
-	u8 in_guest;
-	struct kvmppc_vcore *master_vcore;
-	struct list_head runnable_threads;
-	struct list_head preempt_list;
-	spinlock_t lock;
-	struct swait_queue_head wq;
-	spinlock_t stoltb_lock;	/* protects stolen_tb and preempt_tb */
-	u64 stolen_tb;
-	u64 preempt_tb;
-	struct kvm_vcpu *runner;
-	struct kvm *kvm;
-	u64 tb_offset;		/* guest timebase - host timebase */
-	ulong lpcr;
-	u32 arch_compat;
-	ulong pcr;
-	ulong dpdes;		/* doorbell state (POWER8) */
-	ulong conferring_threads;
-};
-
 #define VCORE_ENTRY_MAP(vc)	((vc)->entry_exit_map & 0xff)
 #define VCORE_EXIT_MAP(vc)	((vc)->entry_exit_map >> 8)
 #define VCORE_IS_EXITING(vc)	(VCORE_EXIT_MAP(vc) != 0)
-- 
cgit v0.10.2


From 7b5f8272c792d49da73d98e9ca32f4cbb6d53808 Mon Sep 17 00:00:00 2001
From: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Date: Tue, 2 Aug 2016 14:03:20 +1000
Subject: KVM: PPC: Book3S HV: Change vcore element runnable_threads from
 linked-list to array

The struct kvmppc_vcore is a structure used to store various information
about a virtual core for a kvm guest. The runnable_threads element of the
struct provides a list of all of the currently runnable vcpus on the core
(those in the KVMPPC_VCPU_RUNNABLE state). The previous implementation of
this list was a linked_list. The next patch requires that the list be able
to be iterated over without holding the vcore lock.

Reimplement the runnable_threads list in the kvmppc_vcore struct as an
array. Implement function to iterate over valid entries in the array and
update access sites accordingly.

Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index a50c5fe..151f817 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -87,7 +87,7 @@ struct kvmppc_vcore {
 	u8 vcore_state;
 	u8 in_guest;
 	struct kvmppc_vcore *master_vcore;
-	struct list_head runnable_threads;
+	struct kvm_vcpu *runnable_threads[MAX_SMT_THREADS];
 	struct list_head preempt_list;
 	spinlock_t lock;
 	struct swait_queue_head wq;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 7ff9919..851575a 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -635,7 +635,6 @@ struct kvm_vcpu_arch {
 	long pgfault_index;
 	unsigned long pgfault_hpte[2];
 
-	struct list_head run_list;
 	struct task_struct *run_task;
 	struct kvm_run *kvm_run;
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 2fd5580..ebbab1b 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -58,6 +58,7 @@
 #include <linux/highmem.h>
 #include <linux/hugetlb.h>
 #include <linux/module.h>
+#include <linux/compiler.h>
 
 #include "book3s.h"
 
@@ -97,6 +98,26 @@ MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
+static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
+		int *ip)
+{
+	int i = *ip;
+	struct kvm_vcpu *vcpu;
+
+	while (++i < MAX_SMT_THREADS) {
+		vcpu = READ_ONCE(vc->runnable_threads[i]);
+		if (vcpu) {
+			*ip = i;
+			return vcpu;
+		}
+	}
+	return NULL;
+}
+
+/* Used to traverse the list of runnable threads for a given vcore */
+#define for_each_runnable_thread(i, vcpu, vc) \
+	for (i = -1; (vcpu = next_runnable_thread(vc, &i)); )
+
 static bool kvmppc_ipi_thread(int cpu)
 {
 	/* On POWER8 for IPIs to threads in the same core, use msgsnd */
@@ -1493,7 +1514,6 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
 	if (vcore == NULL)
 		return NULL;
 
-	INIT_LIST_HEAD(&vcore->runnable_threads);
 	spin_lock_init(&vcore->lock);
 	spin_lock_init(&vcore->stoltb_lock);
 	init_swait_queue_head(&vcore->wq);
@@ -1802,7 +1822,7 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
 	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
 	spin_unlock_irq(&vcpu->arch.tbacct_lock);
 	--vc->n_runnable;
-	list_del(&vcpu->arch.run_list);
+	WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL);
 }
 
 static int kvmppc_grab_hwthread(int cpu)
@@ -2209,10 +2229,10 @@ static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip,
 
 static void prepare_threads(struct kvmppc_vcore *vc)
 {
-	struct kvm_vcpu *vcpu, *vnext;
+	int i;
+	struct kvm_vcpu *vcpu;
 
-	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
-				 arch.run_list) {
+	for_each_runnable_thread(i, vcpu, vc) {
 		if (signal_pending(vcpu->arch.run_task))
 			vcpu->arch.ret = -EINTR;
 		else if (vcpu->arch.vpa.update_pending ||
@@ -2259,15 +2279,14 @@ static void collect_piggybacks(struct core_info *cip, int target_threads)
 
 static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
 {
-	int still_running = 0;
+	int still_running = 0, i;
 	u64 now;
 	long ret;
-	struct kvm_vcpu *vcpu, *vnext;
+	struct kvm_vcpu *vcpu;
 
 	spin_lock(&vc->lock);
 	now = get_tb();
-	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
-				 arch.run_list) {
+	for_each_runnable_thread(i, vcpu, vc) {
 		/* cancel pending dec exception if dec is positive */
 		if (now < vcpu->arch.dec_expires &&
 		    kvmppc_core_pending_dec(vcpu))
@@ -2307,8 +2326,8 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
 		}
 		if (vc->n_runnable > 0 && vc->runner == NULL) {
 			/* make sure there's a candidate runner awake */
-			vcpu = list_first_entry(&vc->runnable_threads,
-						struct kvm_vcpu, arch.run_list);
+			i = -1;
+			vcpu = next_runnable_thread(vc, &i);
 			wake_up(&vcpu->arch.cpu_run);
 		}
 	}
@@ -2361,7 +2380,7 @@ static inline void kvmppc_set_host_core(int cpu)
  */
 static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 {
-	struct kvm_vcpu *vcpu, *vnext;
+	struct kvm_vcpu *vcpu;
 	int i;
 	int srcu_idx;
 	struct core_info core_info;
@@ -2397,8 +2416,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	 */
 	if ((threads_per_core > 1) &&
 	    ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
-		list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
-					 arch.run_list) {
+		for_each_runnable_thread(i, vcpu, vc) {
 			vcpu->arch.ret = -EBUSY;
 			kvmppc_remove_runnable(vc, vcpu);
 			wake_up(&vcpu->arch.cpu_run);
@@ -2477,8 +2495,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 		active |= 1 << thr;
 		list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) {
 			pvc->pcpu = pcpu + thr;
-			list_for_each_entry(vcpu, &pvc->runnable_threads,
-					    arch.run_list) {
+			for_each_runnable_thread(i, vcpu, pvc) {
 				kvmppc_start_thread(vcpu, pvc);
 				kvmppc_create_dtl_entry(vcpu, pvc);
 				trace_kvm_guest_enter(vcpu);
@@ -2611,7 +2628,7 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
 static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 {
 	struct kvm_vcpu *vcpu;
-	int do_sleep = 1;
+	int do_sleep = 1, i;
 	DECLARE_SWAITQUEUE(wait);
 
 	prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
@@ -2620,7 +2637,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 	 * Check one last time for pending exceptions and ceded state after
 	 * we put ourselves on the wait queue
 	 */
-	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
+	for_each_runnable_thread(i, vcpu, vc) {
 		if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) {
 			do_sleep = 0;
 			break;
@@ -2644,9 +2661,9 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 
 static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
-	int n_ceded;
+	int n_ceded, i;
 	struct kvmppc_vcore *vc;
-	struct kvm_vcpu *v, *vn;
+	struct kvm_vcpu *v;
 
 	trace_kvmppc_run_vcpu_enter(vcpu);
 
@@ -2666,7 +2683,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
 	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
 	vcpu->arch.busy_preempt = TB_NIL;
-	list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
+	WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], vcpu);
 	++vc->n_runnable;
 
 	/*
@@ -2706,8 +2723,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 			kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE);
 			continue;
 		}
-		list_for_each_entry_safe(v, vn, &vc->runnable_threads,
-					 arch.run_list) {
+		for_each_runnable_thread(i, v, vc) {
 			kvmppc_core_prepare_to_enter(v);
 			if (signal_pending(v->arch.run_task)) {
 				kvmppc_remove_runnable(vc, v);
@@ -2720,7 +2736,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
 			break;
 		n_ceded = 0;
-		list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
+		for_each_runnable_thread(i, v, vc) {
 			if (!v->arch.pending_exceptions)
 				n_ceded += v->arch.ceded;
 			else
@@ -2759,8 +2775,8 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 	if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
 		/* Wake up some vcpu to run the core */
-		v = list_first_entry(&vc->runnable_threads,
-				     struct kvm_vcpu, arch.run_list);
+		i = -1;
+		v = next_runnable_thread(vc, &i);
 		wake_up(&v->arch.cpu_run);
 	}
 
-- 
cgit v0.10.2


From 0cda69dd7cd64fdd54bdf584b5d6ba53767ba422 Mon Sep 17 00:00:00 2001
From: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Date: Tue, 2 Aug 2016 14:03:21 +1000
Subject: KVM: PPC: Book3S HV: Implement halt polling

This patch introduces new halt polling functionality into the kvm_hv kernel
module. When a vcore is idle it will poll for some period of time before
scheduling itself out.

When all of the runnable vcpus on a vcore have ceded (and thus the vcore is
idle) we schedule ourselves out to allow something else to run. In the
event that we need to wake up very quickly (for example an interrupt
arrives), we are required to wait until we get scheduled again.

Implement halt polling so that when a vcore is idle, and before scheduling
ourselves, we poll for vcpus in the runnable_threads list which have
pending exceptions or which leave the ceded state. If we poll successfully
then we can get back into the guest very quickly without ever scheduling
ourselves, otherwise we schedule ourselves out as before.

There exists generic halt_polling code in virt/kvm_main.c, however on
powerpc the polling conditions are different to the generic case. It would
be nice if we could just implement an arch specific kvm_check_block()
function, but there is still other arch specific things which need to be
done for kvm_hv (for example manipulating vcore states) which means that a
separate implementation is the best option.

Testing of this patch with a TCP round robin test between two guests with
virtio network interfaces has found a decrease in round trip time of ~15us
on average. A performance gain is only seen when going out of and
back into the guest often and quickly, otherwise there is no net benefit
from the polling. The polling interval is adjusted such that when we are
often scheduled out for long periods of time it is reduced, and when we
often poll successfully it is increased. The rate at which the polling
interval increases or decreases, and the maximum polling interval, can
be set through module parameters.

Based on the implementation in the generic kvm module by Wanpeng Li and
Paolo Bonzini, and on direction from Paul Mackerras.

Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 151f817..c261f52 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -102,6 +102,7 @@ struct kvmppc_vcore {
 	ulong pcr;
 	ulong dpdes;		/* doorbell state (POWER8) */
 	ulong conferring_threads;
+	unsigned int halt_poll_ns;
 };
 
 struct kvmppc_vcpu_book3s {
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 851575a..6ece4a8 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -296,6 +296,7 @@ struct kvm_arch {
 #define VCORE_SLEEPING	3
 #define VCORE_RUNNING	4
 #define VCORE_EXITING	5
+#define VCORE_POLLING	6
 
 /*
  * Struct used to manage memory for a virtual processor area
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index ebbab1b..3c85c3b 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -95,6 +95,23 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
 MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
 #endif
 
+/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */
+static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT;
+module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns");
+
+/* Factor by which the vcore halt poll interval is grown, default is to double
+ */
+static unsigned int halt_poll_ns_grow = 2;
+module_param(halt_poll_ns_grow, int, S_IRUGO);
+MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by");
+
+/* Factor by which the vcore halt poll interval is shrunk, default is to reset
+ */
+static unsigned int halt_poll_ns_shrink;
+module_param(halt_poll_ns_shrink, int, S_IRUGO);
+MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by");
+
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
@@ -2621,32 +2638,82 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
 	finish_wait(&vcpu->arch.cpu_run, &wait);
 }
 
+static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
+{
+	/* 10us base */
+	if (vc->halt_poll_ns == 0 && halt_poll_ns_grow)
+		vc->halt_poll_ns = 10000;
+	else
+		vc->halt_poll_ns *= halt_poll_ns_grow;
+
+	if (vc->halt_poll_ns > halt_poll_max_ns)
+		vc->halt_poll_ns = halt_poll_max_ns;
+}
+
+static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
+{
+	if (halt_poll_ns_shrink == 0)
+		vc->halt_poll_ns = 0;
+	else
+		vc->halt_poll_ns /= halt_poll_ns_shrink;
+}
+
+/* Check to see if any of the runnable vcpus on the vcore have pending
+ * exceptions or are no longer ceded
+ */
+static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
+{
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	for_each_runnable_thread(i, vcpu, vc) {
+		if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded)
+			return 1;
+	}
+
+	return 0;
+}
+
 /*
  * All the vcpus in this vcore are idle, so wait for a decrementer
  * or external interrupt to one of the vcpus.  vc->lock is held.
  */
 static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 {
-	struct kvm_vcpu *vcpu;
-	int do_sleep = 1, i;
+	int do_sleep = 1;
+	ktime_t cur, start;
+	u64 block_ns;
 	DECLARE_SWAITQUEUE(wait);
 
-	prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+	/* Poll for pending exceptions and ceded state */
+	cur = start = ktime_get();
+	if (vc->halt_poll_ns) {
+		ktime_t stop = ktime_add_ns(start, vc->halt_poll_ns);
 
-	/*
-	 * Check one last time for pending exceptions and ceded state after
-	 * we put ourselves on the wait queue
-	 */
-	for_each_runnable_thread(i, vcpu, vc) {
-		if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) {
-			do_sleep = 0;
-			break;
-		}
+		vc->vcore_state = VCORE_POLLING;
+		spin_unlock(&vc->lock);
+
+		do {
+			if (kvmppc_vcore_check_block(vc)) {
+				do_sleep = 0;
+				break;
+			}
+			cur = ktime_get();
+		} while (single_task_running() && ktime_before(cur, stop));
+
+		spin_lock(&vc->lock);
+		vc->vcore_state = VCORE_INACTIVE;
+
+		if (!do_sleep)
+			goto out;
 	}
 
-	if (!do_sleep) {
+	prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+
+	if (kvmppc_vcore_check_block(vc)) {
 		finish_swait(&vc->wq, &wait);
-		return;
+		do_sleep = 0;
+		goto out;
 	}
 
 	vc->vcore_state = VCORE_SLEEPING;
@@ -2657,6 +2724,27 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 	spin_lock(&vc->lock);
 	vc->vcore_state = VCORE_INACTIVE;
 	trace_kvmppc_vcore_blocked(vc, 1);
+
+	cur = ktime_get();
+
+out:
+	block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
+
+	/* Adjust poll time */
+	if (halt_poll_max_ns) {
+		if (block_ns <= vc->halt_poll_ns)
+			;
+		/* We slept and blocked for longer than the max halt time */
+		else if (vc->halt_poll_ns && block_ns > halt_poll_max_ns)
+			shrink_halt_poll_ns(vc);
+		/* We slept and our poll time is too small */
+		else if (vc->halt_poll_ns < halt_poll_max_ns &&
+				block_ns < halt_poll_max_ns)
+			grow_halt_poll_ns(vc);
+	} else
+		vc->halt_poll_ns = 0;
+
+	trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
 }
 
 static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h
index 33d9daf..fb21990 100644
--- a/arch/powerpc/kvm/trace_hv.h
+++ b/arch/powerpc/kvm/trace_hv.h
@@ -432,6 +432,28 @@ TRACE_EVENT(kvmppc_vcore_blocked,
 		   __entry->runner_vcpu, __entry->n_runnable, __entry->tgid)
 );
 
+TRACE_EVENT(kvmppc_vcore_wakeup,
+	TP_PROTO(int do_sleep, __u64 ns),
+
+	TP_ARGS(do_sleep, ns),
+
+	TP_STRUCT__entry(
+		__field(__u64,  ns)
+		__field(int,    waited)
+		__field(pid_t,  tgid)
+	),
+
+	TP_fast_assign(
+		__entry->ns     = ns;
+		__entry->waited = do_sleep;
+		__entry->tgid   = current->tgid;
+	),
+
+	TP_printk("%s time %lld ns, tgid=%d",
+		__entry->waited ? "wait" : "poll",
+		__entry->ns, __entry->tgid)
+);
+
 TRACE_EVENT(kvmppc_run_vcpu_enter,
 	TP_PROTO(struct kvm_vcpu *vcpu),
 
-- 
cgit v0.10.2


From 8a7e75d47b68193339f8727cf4503271d0a0b1d0 Mon Sep 17 00:00:00 2001
From: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Date: Tue, 2 Aug 2016 14:03:22 +1000
Subject: KVM: Add provisioning for ulong vm stats and u64 vcpu stats

vms and vcpus have statistics associated with them which can be viewed
within the debugfs. Currently it is assumed within the vcpu_stat_get() and
vm_stat_get() functions that all of these statistics are represented as
u32s, however the next patch adds some u64 vcpu statistics.

Change all vcpu statistics to u64 and modify vcpu_stat_get() accordingly.
Since vcpu statistics are per vcpu, they will only be updated by a single
vcpu at a time so this shouldn't present a problem on 32-bit machines
which can't atomically increment 64-bit numbers. However vm statistics
could potentially be updated by multiple vcpus from that vm at a time.
To avoid the overhead of atomics make all vm statistics ulong such that
they are 64-bit on 64-bit systems where they can be atomically incremented
and are 32-bit on 32-bit systems which may not be able to atomically
increment 64-bit numbers. Modify vm_stat_get() to expect ulongs.

Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Reviewed-by: David Matlack <dmatlack@google.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index de338d9..6ad21f04 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -183,15 +183,15 @@ struct kvm_vcpu_arch {
 };
 
 struct kvm_vm_stat {
-	u32 remote_tlb_flush;
+	ulong remote_tlb_flush;
 };
 
 struct kvm_vcpu_stat {
-	u32 halt_successful_poll;
-	u32 halt_attempted_poll;
-	u32 halt_poll_invalid;
-	u32 halt_wakeup;
-	u32 hvc_exit_stat;
+	u64 halt_successful_poll;
+	u64 halt_attempted_poll;
+	u64 halt_poll_invalid;
+	u64 halt_wakeup;
+	u64 hvc_exit_stat;
 	u64 wfe_exit_stat;
 	u64 wfi_exit_stat;
 	u64 mmio_exit_user;
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 3eda975..bd94e67 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -290,15 +290,15 @@ struct kvm_vcpu_arch {
 #endif
 
 struct kvm_vm_stat {
-	u32 remote_tlb_flush;
+	ulong remote_tlb_flush;
 };
 
 struct kvm_vcpu_stat {
-	u32 halt_successful_poll;
-	u32 halt_attempted_poll;
-	u32 halt_poll_invalid;
-	u32 halt_wakeup;
-	u32 hvc_exit_stat;
+	u64 halt_successful_poll;
+	u64 halt_attempted_poll;
+	u64 halt_poll_invalid;
+	u64 halt_wakeup;
+	u64 hvc_exit_stat;
 	u64 wfe_exit_stat;
 	u64 wfi_exit_stat;
 	u64 mmio_exit_user;
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index b54bcad..5f488dc 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -110,32 +110,32 @@
 extern atomic_t kvm_mips_instance;
 
 struct kvm_vm_stat {
-	u32 remote_tlb_flush;
+	ulong remote_tlb_flush;
 };
 
 struct kvm_vcpu_stat {
-	u32 wait_exits;
-	u32 cache_exits;
-	u32 signal_exits;
-	u32 int_exits;
-	u32 cop_unusable_exits;
-	u32 tlbmod_exits;
-	u32 tlbmiss_ld_exits;
-	u32 tlbmiss_st_exits;
-	u32 addrerr_st_exits;
-	u32 addrerr_ld_exits;
-	u32 syscall_exits;
-	u32 resvd_inst_exits;
-	u32 break_inst_exits;
-	u32 trap_inst_exits;
-	u32 msa_fpe_exits;
-	u32 fpe_exits;
-	u32 msa_disabled_exits;
-	u32 flush_dcache_exits;
-	u32 halt_successful_poll;
-	u32 halt_attempted_poll;
-	u32 halt_poll_invalid;
-	u32 halt_wakeup;
+	u64 wait_exits;
+	u64 cache_exits;
+	u64 signal_exits;
+	u64 int_exits;
+	u64 cop_unusable_exits;
+	u64 tlbmod_exits;
+	u64 tlbmiss_ld_exits;
+	u64 tlbmiss_st_exits;
+	u64 addrerr_st_exits;
+	u64 addrerr_ld_exits;
+	u64 syscall_exits;
+	u64 resvd_inst_exits;
+	u64 break_inst_exits;
+	u64 trap_inst_exits;
+	u64 msa_fpe_exits;
+	u64 fpe_exits;
+	u64 msa_disabled_exits;
+	u64 flush_dcache_exits;
+	u64 halt_successful_poll;
+	u64 halt_attempted_poll;
+	u64 halt_poll_invalid;
+	u64 halt_wakeup;
 };
 
 struct kvm_arch_memory_slot {
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 6ece4a8..5ec1dcf 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -97,41 +97,41 @@ struct kvmppc_vcpu_book3s;
 struct kvmppc_book3s_shadow_vcpu;
 
 struct kvm_vm_stat {
-	u32 remote_tlb_flush;
+	ulong remote_tlb_flush;
 };
 
 struct kvm_vcpu_stat {
-	u32 sum_exits;
-	u32 mmio_exits;
-	u32 signal_exits;
-	u32 light_exits;
+	u64 sum_exits;
+	u64 mmio_exits;
+	u64 signal_exits;
+	u64 light_exits;
 	/* Account for special types of light exits: */
-	u32 itlb_real_miss_exits;
-	u32 itlb_virt_miss_exits;
-	u32 dtlb_real_miss_exits;
-	u32 dtlb_virt_miss_exits;
-	u32 syscall_exits;
-	u32 isi_exits;
-	u32 dsi_exits;
-	u32 emulated_inst_exits;
-	u32 dec_exits;
-	u32 ext_intr_exits;
-	u32 halt_successful_poll;
-	u32 halt_attempted_poll;
-	u32 halt_poll_invalid;
-	u32 halt_wakeup;
-	u32 dbell_exits;
-	u32 gdbell_exits;
-	u32 ld;
-	u32 st;
+	u64 itlb_real_miss_exits;
+	u64 itlb_virt_miss_exits;
+	u64 dtlb_real_miss_exits;
+	u64 dtlb_virt_miss_exits;
+	u64 syscall_exits;
+	u64 isi_exits;
+	u64 dsi_exits;
+	u64 emulated_inst_exits;
+	u64 dec_exits;
+	u64 ext_intr_exits;
+	u64 halt_successful_poll;
+	u64 halt_attempted_poll;
+	u64 halt_poll_invalid;
+	u64 halt_wakeup;
+	u64 dbell_exits;
+	u64 gdbell_exits;
+	u64 ld;
+	u64 st;
 #ifdef CONFIG_PPC_BOOK3S
-	u32 pf_storage;
-	u32 pf_instruc;
-	u32 sp_storage;
-	u32 sp_instruc;
-	u32 queue_intr;
-	u32 ld_slow;
-	u32 st_slow;
+	u64 pf_storage;
+	u64 pf_instruc;
+	u64 sp_storage;
+	u64 sp_instruc;
+	u64 queue_intr;
+	u64 ld_slow;
+	u64 st_slow;
 #endif
 };
 
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 8e5daf7..2bbe675 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -245,72 +245,72 @@ struct sie_page {
 } __packed;
 
 struct kvm_vcpu_stat {
-	u32 exit_userspace;
-	u32 exit_null;
-	u32 exit_external_request;
-	u32 exit_external_interrupt;
-	u32 exit_stop_request;
-	u32 exit_validity;
-	u32 exit_instruction;
-	u32 exit_pei;
-	u32 halt_successful_poll;
-	u32 halt_attempted_poll;
-	u32 halt_poll_invalid;
-	u32 halt_wakeup;
-	u32 instruction_lctl;
-	u32 instruction_lctlg;
-	u32 instruction_stctl;
-	u32 instruction_stctg;
-	u32 exit_program_interruption;
-	u32 exit_instr_and_program;
-	u32 exit_operation_exception;
-	u32 deliver_external_call;
-	u32 deliver_emergency_signal;
-	u32 deliver_service_signal;
-	u32 deliver_virtio_interrupt;
-	u32 deliver_stop_signal;
-	u32 deliver_prefix_signal;
-	u32 deliver_restart_signal;
-	u32 deliver_program_int;
-	u32 deliver_io_int;
-	u32 exit_wait_state;
-	u32 instruction_pfmf;
-	u32 instruction_stidp;
-	u32 instruction_spx;
-	u32 instruction_stpx;
-	u32 instruction_stap;
-	u32 instruction_storage_key;
-	u32 instruction_ipte_interlock;
-	u32 instruction_stsch;
-	u32 instruction_chsc;
-	u32 instruction_stsi;
-	u32 instruction_stfl;
-	u32 instruction_tprot;
-	u32 instruction_sie;
-	u32 instruction_essa;
-	u32 instruction_sthyi;
-	u32 instruction_sigp_sense;
-	u32 instruction_sigp_sense_running;
-	u32 instruction_sigp_external_call;
-	u32 instruction_sigp_emergency;
-	u32 instruction_sigp_cond_emergency;
-	u32 instruction_sigp_start;
-	u32 instruction_sigp_stop;
-	u32 instruction_sigp_stop_store_status;
-	u32 instruction_sigp_store_status;
-	u32 instruction_sigp_store_adtl_status;
-	u32 instruction_sigp_arch;
-	u32 instruction_sigp_prefix;
-	u32 instruction_sigp_restart;
-	u32 instruction_sigp_init_cpu_reset;
-	u32 instruction_sigp_cpu_reset;
-	u32 instruction_sigp_unknown;
-	u32 diagnose_10;
-	u32 diagnose_44;
-	u32 diagnose_9c;
-	u32 diagnose_258;
-	u32 diagnose_308;
-	u32 diagnose_500;
+	u64 exit_userspace;
+	u64 exit_null;
+	u64 exit_external_request;
+	u64 exit_external_interrupt;
+	u64 exit_stop_request;
+	u64 exit_validity;
+	u64 exit_instruction;
+	u64 exit_pei;
+	u64 halt_successful_poll;
+	u64 halt_attempted_poll;
+	u64 halt_poll_invalid;
+	u64 halt_wakeup;
+	u64 instruction_lctl;
+	u64 instruction_lctlg;
+	u64 instruction_stctl;
+	u64 instruction_stctg;
+	u64 exit_program_interruption;
+	u64 exit_instr_and_program;
+	u64 exit_operation_exception;
+	u64 deliver_external_call;
+	u64 deliver_emergency_signal;
+	u64 deliver_service_signal;
+	u64 deliver_virtio_interrupt;
+	u64 deliver_stop_signal;
+	u64 deliver_prefix_signal;
+	u64 deliver_restart_signal;
+	u64 deliver_program_int;
+	u64 deliver_io_int;
+	u64 exit_wait_state;
+	u64 instruction_pfmf;
+	u64 instruction_stidp;
+	u64 instruction_spx;
+	u64 instruction_stpx;
+	u64 instruction_stap;
+	u64 instruction_storage_key;
+	u64 instruction_ipte_interlock;
+	u64 instruction_stsch;
+	u64 instruction_chsc;
+	u64 instruction_stsi;
+	u64 instruction_stfl;
+	u64 instruction_tprot;
+	u64 instruction_sie;
+	u64 instruction_essa;
+	u64 instruction_sthyi;
+	u64 instruction_sigp_sense;
+	u64 instruction_sigp_sense_running;
+	u64 instruction_sigp_external_call;
+	u64 instruction_sigp_emergency;
+	u64 instruction_sigp_cond_emergency;
+	u64 instruction_sigp_start;
+	u64 instruction_sigp_stop;
+	u64 instruction_sigp_stop_store_status;
+	u64 instruction_sigp_store_status;
+	u64 instruction_sigp_store_adtl_status;
+	u64 instruction_sigp_arch;
+	u64 instruction_sigp_prefix;
+	u64 instruction_sigp_restart;
+	u64 instruction_sigp_init_cpu_reset;
+	u64 instruction_sigp_cpu_reset;
+	u64 instruction_sigp_unknown;
+	u64 diagnose_10;
+	u64 diagnose_44;
+	u64 diagnose_9c;
+	u64 diagnose_258;
+	u64 diagnose_308;
+	u64 diagnose_500;
 };
 
 #define PGM_OPERATION			0x01
@@ -577,7 +577,7 @@ struct kvm_vcpu_arch {
 };
 
 struct kvm_vm_stat {
-	u32 remote_tlb_flush;
+	ulong remote_tlb_flush;
 };
 
 struct kvm_arch_memory_slot {
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 33ae3a4..67c8f52 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -790,45 +790,45 @@ struct kvm_arch {
 };
 
 struct kvm_vm_stat {
-	u32 mmu_shadow_zapped;
-	u32 mmu_pte_write;
-	u32 mmu_pte_updated;
-	u32 mmu_pde_zapped;
-	u32 mmu_flooded;
-	u32 mmu_recycled;
-	u32 mmu_cache_miss;
-	u32 mmu_unsync;
-	u32 remote_tlb_flush;
-	u32 lpages;
+	ulong mmu_shadow_zapped;
+	ulong mmu_pte_write;
+	ulong mmu_pte_updated;
+	ulong mmu_pde_zapped;
+	ulong mmu_flooded;
+	ulong mmu_recycled;
+	ulong mmu_cache_miss;
+	ulong mmu_unsync;
+	ulong remote_tlb_flush;
+	ulong lpages;
 };
 
 struct kvm_vcpu_stat {
-	u32 pf_fixed;
-	u32 pf_guest;
-	u32 tlb_flush;
-	u32 invlpg;
-
-	u32 exits;
-	u32 io_exits;
-	u32 mmio_exits;
-	u32 signal_exits;
-	u32 irq_window_exits;
-	u32 nmi_window_exits;
-	u32 halt_exits;
-	u32 halt_successful_poll;
-	u32 halt_attempted_poll;
-	u32 halt_poll_invalid;
-	u32 halt_wakeup;
-	u32 request_irq_exits;
-	u32 irq_exits;
-	u32 host_state_reload;
-	u32 efer_reload;
-	u32 fpu_reload;
-	u32 insn_emulation;
-	u32 insn_emulation_fail;
-	u32 hypercalls;
-	u32 irq_injections;
-	u32 nmi_injections;
+	u64 pf_fixed;
+	u64 pf_guest;
+	u64 tlb_flush;
+	u64 invlpg;
+
+	u64 exits;
+	u64 io_exits;
+	u64 mmio_exits;
+	u64 signal_exits;
+	u64 irq_window_exits;
+	u64 nmi_window_exits;
+	u64 halt_exits;
+	u64 halt_successful_poll;
+	u64 halt_attempted_poll;
+	u64 halt_poll_invalid;
+	u64 halt_wakeup;
+	u64 request_irq_exits;
+	u64 irq_exits;
+	u64 host_state_reload;
+	u64 efer_reload;
+	u64 fpu_reload;
+	u64 insn_emulation;
+	u64 insn_emulation_fail;
+	u64 hypercalls;
+	u64 irq_injections;
+	u64 nmi_injections;
 };
 
 struct x86_instruction_info;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1950782..4171ef3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3619,7 +3619,7 @@ static int vm_stat_get_per_vm(void *data, u64 *val)
 {
 	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
 
-	*val = *(u32 *)((void *)stat_data->kvm + stat_data->offset);
+	*val = *(ulong *)((void *)stat_data->kvm + stat_data->offset);
 
 	return 0;
 }
@@ -3649,7 +3649,7 @@ static int vcpu_stat_get_per_vm(void *data, u64 *val)
 	*val = 0;
 
 	kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
-		*val += *(u32 *)((void *)vcpu + stat_data->offset);
+		*val += *(u64 *)((void *)vcpu + stat_data->offset);
 
 	return 0;
 }
-- 
cgit v0.10.2


From 2a27f514a47d39c50aaa5c07831ab35178955d47 Mon Sep 17 00:00:00 2001
From: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Date: Tue, 2 Aug 2016 14:03:23 +1000
Subject: KVM: PPC: Implement existing and add new halt polling vcpu stats

vcpu stats are used to collect information about a vcpu which can be viewed
in the debugfs. For example halt_attempted_poll and halt_successful_poll
are used to keep track of the number of times the vcpu attempts to and
successfully polls. These stats are currently not used on powerpc.

Implement incrementation of the halt_attempted_poll and
halt_successful_poll vcpu stats for powerpc. Since these stats are summed
over all the vcpus for all running guests it doesn't matter which vcpu
they are attributed to, thus we choose the current runner vcpu of the
vcore.

Also add new vcpu stats: halt_poll_success_ns, halt_poll_fail_ns and
halt_wait_ns to be used to accumulate the total time spend polling
successfully, polling unsuccessfully and waiting respectively, and
halt_successful_wait to accumulate the number of times the vcpu waits.
Given that halt_poll_success_ns, halt_poll_fail_ns and halt_wait_ns are
expressed in nanoseconds it is necessary to represent these as 64-bit
quantities, otherwise they would overflow after only about 4 seconds.

Given that the total time spend either polling or waiting will be known and
the number of times that each was done, it will be possible to determine
the average poll and wait times. This will give the ability to tune the kvm
module parameters based on the calculated average wait and poll times.

Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Reviewed-by: David Matlack <dmatlack@google.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 5ec1dcf..373003f 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -116,8 +116,12 @@ struct kvm_vcpu_stat {
 	u64 emulated_inst_exits;
 	u64 dec_exits;
 	u64 ext_intr_exits;
+	u64 halt_poll_success_ns;
+	u64 halt_poll_fail_ns;
+	u64 halt_wait_ns;
 	u64 halt_successful_poll;
 	u64 halt_attempted_poll;
+	u64 halt_successful_wait;
 	u64 halt_poll_invalid;
 	u64 halt_wakeup;
 	u64 dbell_exits;
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 47018fc..71eb8f3 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -52,8 +52,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "dec",         VCPU_STAT(dec_exits) },
 	{ "ext_intr",    VCPU_STAT(ext_intr_exits) },
 	{ "queue_intr",  VCPU_STAT(queue_intr) },
+	{ "halt_poll_success_ns",	VCPU_STAT(halt_poll_success_ns) },
+	{ "halt_poll_fail_ns",		VCPU_STAT(halt_poll_fail_ns) },
+	{ "halt_wait_ns",		VCPU_STAT(halt_wait_ns) },
 	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll), },
 	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), },
+	{ "halt_successful_wait",	VCPU_STAT(halt_successful_wait) },
 	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
 	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
 	{ "pf_storage",  VCPU_STAT(pf_storage) },
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 3c85c3b..30ff8ab 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2680,15 +2680,16 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
  */
 static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 {
+	ktime_t cur, start_poll, start_wait;
 	int do_sleep = 1;
-	ktime_t cur, start;
 	u64 block_ns;
 	DECLARE_SWAITQUEUE(wait);
 
 	/* Poll for pending exceptions and ceded state */
-	cur = start = ktime_get();
+	cur = start_poll = ktime_get();
 	if (vc->halt_poll_ns) {
-		ktime_t stop = ktime_add_ns(start, vc->halt_poll_ns);
+		ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
+		++vc->runner->stat.halt_attempted_poll;
 
 		vc->vcore_state = VCORE_POLLING;
 		spin_unlock(&vc->lock);
@@ -2704,8 +2705,10 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 		spin_lock(&vc->lock);
 		vc->vcore_state = VCORE_INACTIVE;
 
-		if (!do_sleep)
+		if (!do_sleep) {
+			++vc->runner->stat.halt_successful_poll;
 			goto out;
+		}
 	}
 
 	prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
@@ -2713,9 +2716,14 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 	if (kvmppc_vcore_check_block(vc)) {
 		finish_swait(&vc->wq, &wait);
 		do_sleep = 0;
+		/* If we polled, count this as a successful poll */
+		if (vc->halt_poll_ns)
+			++vc->runner->stat.halt_successful_poll;
 		goto out;
 	}
 
+	start_wait = ktime_get();
+
 	vc->vcore_state = VCORE_SLEEPING;
 	trace_kvmppc_vcore_blocked(vc, 0);
 	spin_unlock(&vc->lock);
@@ -2724,11 +2732,29 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 	spin_lock(&vc->lock);
 	vc->vcore_state = VCORE_INACTIVE;
 	trace_kvmppc_vcore_blocked(vc, 1);
+	++vc->runner->stat.halt_successful_wait;
 
 	cur = ktime_get();
 
 out:
-	block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
+	block_ns = ktime_to_ns(cur) - ktime_to_ns(start_poll);
+
+	/* Attribute wait time */
+	if (do_sleep) {
+		vc->runner->stat.halt_wait_ns +=
+			ktime_to_ns(cur) - ktime_to_ns(start_wait);
+		/* Attribute failed poll time */
+		if (vc->halt_poll_ns)
+			vc->runner->stat.halt_poll_fail_ns +=
+				ktime_to_ns(start_wait) -
+				ktime_to_ns(start_poll);
+	} else {
+		/* Attribute successful poll time */
+		if (vc->halt_poll_ns)
+			vc->runner->stat.halt_poll_success_ns +=
+				ktime_to_ns(cur) -
+				ktime_to_ns(start_poll);
+	}
 
 	/* Adjust poll time */
 	if (halt_poll_max_ns) {
-- 
cgit v0.10.2


From 0eeede0c63305a33de31bd90b53b023c1d452c17 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 2 Sep 2016 17:20:43 +1000
Subject: powerpc/mm: Speed up computation of base and actual page size for a
 HPTE

This replaces a 2-D search through an array with a simple 8-bit table
lookup for determining the actual and/or base page size for a HPT entry.

The encoding in the second doubleword of the HPTE is designed to encode
the actual and base page sizes without using any more bits than would be
needed for a 4k page number, by using between 1 and 8 low-order bits of
the RPN (real page number) field to encode the page sizes.  A single
"large page" bit in the first doubleword indicates that these low-order
bits are to be interpreted like this.

We can determine the page sizes by using the low-order 8 bits of the RPN
to look up a 256-entry table.  For actual page sizes less than 1MB, some
of the upper bits of these 8 bits are going to be real address bits, but
we can cope with that by replicating the entries for those smaller page
sizes.

While we're at it, let's move the hpte_page_size() and hpte_base_page_size()
functions from a KVM-specific header to a header for 64-bit HPT systems,
since this computation doesn't have anything specifically to do with KVM.

Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 287a656..e407af2 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -245,6 +245,43 @@ static inline int segment_shift(int ssize)
 }
 
 /*
+ * This array is indexed by the LP field of the HPTE second dword.
+ * Since this field may contain some RPN bits, some entries are
+ * replicated so that we get the same value irrespective of RPN.
+ * The top 4 bits are the page size index (MMU_PAGE_*) for the
+ * actual page size, the bottom 4 bits are the base page size.
+ */
+extern u8 hpte_page_sizes[1 << LP_BITS];
+
+static inline unsigned long __hpte_page_size(unsigned long h, unsigned long l,
+					     bool is_base_size)
+{
+	unsigned int i, lp;
+
+	if (!(h & HPTE_V_LARGE))
+		return 1ul << 12;
+
+	/* Look at the 8 bit LP value */
+	lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+	i = hpte_page_sizes[lp];
+	if (!i)
+		return 0;
+	if (!is_base_size)
+		i >>= 4;
+	return 1ul << mmu_psize_defs[i & 0xf].shift;
+}
+
+static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
+{
+	return __hpte_page_size(h, l, 0);
+}
+
+static inline unsigned long hpte_base_page_size(unsigned long h, unsigned long l)
+{
+	return __hpte_page_size(h, l, 1);
+}
+
+/*
  * The current system page and segment sizes
  */
 extern int mmu_kernel_ssize;
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 88d17b4..4ffd5a1 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -20,6 +20,8 @@
 #ifndef __ASM_KVM_BOOK3S_64_H__
 #define __ASM_KVM_BOOK3S_64_H__
 
+#include <asm/book3s/64/mmu-hash.h>
+
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu)
 {
@@ -97,56 +99,20 @@ static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v)
 	hpte[0] = cpu_to_be64(hpte_v);
 }
 
-static inline int __hpte_actual_psize(unsigned int lp, int psize)
-{
-	int i, shift;
-	unsigned int mask;
-
-	/* start from 1 ignoring MMU_PAGE_4K */
-	for (i = 1; i < MMU_PAGE_COUNT; i++) {
-
-		/* invalid penc */
-		if (mmu_psize_defs[psize].penc[i] == -1)
-			continue;
-		/*
-		 * encoding bits per actual page size
-		 *        PTE LP     actual page size
-		 *    rrrr rrrz		>=8KB
-		 *    rrrr rrzz		>=16KB
-		 *    rrrr rzzz		>=32KB
-		 *    rrrr zzzz		>=64KB
-		 * .......
-		 */
-		shift = mmu_psize_defs[i].shift - LP_SHIFT;
-		if (shift > LP_BITS)
-			shift = LP_BITS;
-		mask = (1 << shift) - 1;
-		if ((lp & mask) == mmu_psize_defs[psize].penc[i])
-			return i;
-	}
-	return -1;
-}
-
 static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 					     unsigned long pte_index)
 {
-	int b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K;
+	int i, b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K;
 	unsigned int penc;
 	unsigned long rb = 0, va_low, sllp;
 	unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
 
 	if (v & HPTE_V_LARGE) {
-		for (b_psize = 0; b_psize < MMU_PAGE_COUNT; b_psize++) {
-
-			/* valid entries have a shift value */
-			if (!mmu_psize_defs[b_psize].shift)
-				continue;
-
-			a_psize = __hpte_actual_psize(lp, b_psize);
-			if (a_psize != -1)
-				break;
-		}
+		i = hpte_page_sizes[lp];
+		b_psize = i & 0xf;
+		a_psize = i >> 4;
 	}
+
 	/*
 	 * Ignore the top 14 bits of va
 	 * v have top two bits covering segment size, hence move
@@ -215,45 +181,6 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 	return rb;
 }
 
-static inline unsigned long __hpte_page_size(unsigned long h, unsigned long l,
-					     bool is_base_size)
-{
-
-	int size, a_psize;
-	/* Look at the 8 bit LP value */
-	unsigned int lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
-
-	/* only handle 4k, 64k and 16M pages for now */
-	if (!(h & HPTE_V_LARGE))
-		return 1ul << 12;
-	else {
-		for (size = 0; size < MMU_PAGE_COUNT; size++) {
-			/* valid entries have a shift value */
-			if (!mmu_psize_defs[size].shift)
-				continue;
-
-			a_psize = __hpte_actual_psize(lp, size);
-			if (a_psize != -1) {
-				if (is_base_size)
-					return 1ul << mmu_psize_defs[size].shift;
-				return 1ul << mmu_psize_defs[a_psize].shift;
-			}
-		}
-
-	}
-	return 0;
-}
-
-static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
-{
-	return __hpte_page_size(h, l, 0);
-}
-
-static inline unsigned long hpte_base_page_size(unsigned long h, unsigned long l)
-{
-	return __hpte_page_size(h, l, 1);
-}
-
 static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
 {
 	return ((ptel & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index e2fb408..b78e8d3 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -271,6 +271,7 @@ static inline bool early_radix_enabled(void)
 #define MMU_PAGE_16G	13
 #define MMU_PAGE_64G	14
 
+/* N.B. we need to change the type of hpte_page_sizes if this gets to be > 16 */
 #define MMU_PAGE_COUNT	15
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 0e4e965..83ddc0e 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -493,36 +493,6 @@ static void native_hugepage_invalidate(unsigned long vsid,
 }
 #endif
 
-static inline int __hpte_actual_psize(unsigned int lp, int psize)
-{
-	int i, shift;
-	unsigned int mask;
-
-	/* start from 1 ignoring MMU_PAGE_4K */
-	for (i = 1; i < MMU_PAGE_COUNT; i++) {
-
-		/* invalid penc */
-		if (mmu_psize_defs[psize].penc[i] == -1)
-			continue;
-		/*
-		 * encoding bits per actual page size
-		 *        PTE LP     actual page size
-		 *    rrrr rrrz		>=8KB
-		 *    rrrr rrzz		>=16KB
-		 *    rrrr rzzz		>=32KB
-		 *    rrrr zzzz		>=64KB
-		 * .......
-		 */
-		shift = mmu_psize_defs[i].shift - LP_SHIFT;
-		if (shift > LP_BITS)
-			shift = LP_BITS;
-		mask = (1 << shift) - 1;
-		if ((lp & mask) == mmu_psize_defs[psize].penc[i])
-			return i;
-	}
-	return -1;
-}
-
 static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
 			int *psize, int *apsize, int *ssize, unsigned long *vpn)
 {
@@ -538,16 +508,8 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
 		size   = MMU_PAGE_4K;
 		a_size = MMU_PAGE_4K;
 	} else {
-		for (size = 0; size < MMU_PAGE_COUNT; size++) {
-
-			/* valid entries have a shift value */
-			if (!mmu_psize_defs[size].shift)
-				continue;
-
-			a_size = __hpte_actual_psize(lp, size);
-			if (a_size != -1)
-				break;
-		}
+		size = hpte_page_sizes[lp] & 0xf;
+		a_size = hpte_page_sizes[lp] >> 4;
 	}
 	/* This works for all page sizes, and for 256M and 1T segments */
 	if (cpu_has_feature(CPU_FTR_ARCH_300))
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 0821556..ef3ae89 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -93,6 +93,9 @@ static unsigned long _SDR1;
 struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
 EXPORT_SYMBOL_GPL(mmu_psize_defs);
 
+u8 hpte_page_sizes[1 << LP_BITS];
+EXPORT_SYMBOL_GPL(hpte_page_sizes);
+
 struct hash_pte *htab_address;
 unsigned long htab_size_bytes;
 unsigned long htab_hash_mask;
@@ -564,8 +567,60 @@ static void __init htab_scan_page_sizes(void)
 #endif /* CONFIG_HUGETLB_PAGE */
 }
 
+/*
+ * Fill in the hpte_page_sizes[] array.
+ * We go through the mmu_psize_defs[] array looking for all the
+ * supported base/actual page size combinations.  Each combination
+ * has a unique pagesize encoding (penc) value in the low bits of
+ * the LP field of the HPTE.  For actual page sizes less than 1MB,
+ * some of the upper LP bits are used for RPN bits, meaning that
+ * we need to fill in several entries in hpte_page_sizes[].
+ *
+ * In diagrammatic form, with r = RPN bits and z = page size bits:
+ *        PTE LP     actual page size
+ *    rrrr rrrz		>=8KB
+ *    rrrr rrzz		>=16KB
+ *    rrrr rzzz		>=32KB
+ *    rrrr zzzz		>=64KB
+ *    ...
+ *
+ * The zzzz bits are implementation-specific but are chosen so that
+ * no encoding for a larger page size uses the same value in its
+ * low-order N bits as the encoding for the 2^(12+N) byte page size
+ * (if it exists).
+ */
+static void init_hpte_page_sizes(void)
+{
+	long int ap, bp;
+	long int shift, penc;
+
+	for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) {
+		if (!mmu_psize_defs[bp].shift)
+			continue;	/* not a supported page size */
+		for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) {
+			penc = mmu_psize_defs[bp].penc[ap];
+			if (penc == -1)
+				continue;
+			shift = mmu_psize_defs[ap].shift - LP_SHIFT;
+			if (shift <= 0)
+				continue;	/* should never happen */
+			/*
+			 * For page sizes less than 1MB, this loop
+			 * replicates the entry for all possible values
+			 * of the rrrr bits.
+			 */
+			while (penc < (1 << LP_BITS)) {
+				hpte_page_sizes[penc] = (ap << 4) | bp;
+				penc += 1 << shift;
+			}
+		}
+	}
+}
+
 static void __init htab_init_page_sizes(void)
 {
+	init_hpte_page_sizes();
+
 	if (!debug_pagealloc_enabled()) {
 		/*
 		 * Pick a size for the linear mapping. Currently, we only
-- 
cgit v0.10.2


From 07b1fdf5bd135d94eff2b3a6849b90c358963066 Mon Sep 17 00:00:00 2001
From: Suresh Warrier <warrier@linux.vnet.ibm.com>
Date: Fri, 19 Aug 2016 15:35:45 +1000
Subject: powerpc: Add simple cache inhibited MMIO accessors

Add simple cache inhibited accessors for memory mapped I/O.
Unlike the accessors built from the DEF_MMIO_* macros, these
don't include any hardware memory barriers, callers need to
manage memory barriers on their own. These can only be called
in hypervisor real mode.

Signed-off-by: Suresh Warrier <warrier@linux.vnet.ibm.com>
[paulus@ozlabs.org - added line to comment]
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 2fd1690..f6fda84 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -241,6 +241,35 @@ static inline void out_be64(volatile u64 __iomem *addr, u64 val)
 #endif
 #endif /* __powerpc64__ */
 
+
+/*
+ * Simple Cache inhibited accessors
+ * Unlike the DEF_MMIO_* macros, these don't include any h/w memory
+ * barriers, callers need to manage memory barriers on their own.
+ * These can only be used in hypervisor real mode.
+ */
+
+static inline u32 _lwzcix(unsigned long addr)
+{
+	u32 ret;
+
+	__asm__ __volatile__("lwzcix %0,0, %1"
+			     : "=r" (ret) : "r" (addr) : "memory");
+	return ret;
+}
+
+static inline void _stbcix(u64 addr, u8 val)
+{
+	__asm__ __volatile__("stbcix %0,0,%1"
+		: : "r" (val), "r" (addr) : "memory");
+}
+
+static inline void _stwcix(u64 addr, u32 val)
+{
+	__asm__ __volatile__("stwcix %0,0,%1"
+		: : "r" (val), "r" (addr) : "memory");
+}
+
 /*
  * Low level IO stream instructions are defined out of line for now
  */
-- 
cgit v0.10.2


From 4ee11c1a9f7cc20026bb66ac624533310a605312 Mon Sep 17 00:00:00 2001
From: Suresh Warrier <warrier@linux.vnet.ibm.com>
Date: Fri, 19 Aug 2016 15:35:49 +1000
Subject: powerpc/powernv: Provide facilities for EOI, usable from real mode

This adds a new function pnv_opal_pci_msi_eoi() which does the part of
end-of-interrupt (EOI) handling of an MSI which involves doing an
OPAL call.  This function can be called in real mode.  This doesn't
just export pnv_ioda2_msi_eoi() because that does a call to
icp_native_eoi(), which does not work in real mode.

This also adds a function, is_pnv_opal_msi(), which KVM can call to
check whether an interrupt is one for which we should be calling
pnv_opal_pci_msi_eoi() when we need to do an EOI.

[paulus@ozlabs.org - split out the addition of pnv_opal_pci_msi_eoi()
 from Suresh's patch "KVM: PPC: Book3S HV: Handle passthrough
 interrupts in guest"; added is_pnv_opal_msi(); wrote description.]

Signed-off-by: Suresh Warrier <warrier@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/include/asm/pnv-pci.h b/arch/powerpc/include/asm/pnv-pci.h
index 0cbd813..1b46b52 100644
--- a/arch/powerpc/include/asm/pnv-pci.h
+++ b/arch/powerpc/include/asm/pnv-pci.h
@@ -12,6 +12,7 @@
 
 #include <linux/pci.h>
 #include <linux/pci_hotplug.h>
+#include <linux/irq.h>
 #include <misc/cxl-base.h>
 #include <asm/opal-api.h>
 
@@ -33,6 +34,8 @@ int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num);
 void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num);
 int pnv_cxl_get_irq_count(struct pci_dev *dev);
 struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev);
+int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq);
+bool is_pnv_opal_msi(struct irq_chip *chip);
 
 #ifdef CONFIG_CXL_BASE
 int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs,
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index fd9444f..9ce48ae 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2710,15 +2710,21 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 }
 
 #ifdef CONFIG_PCI_MSI
-static void pnv_ioda2_msi_eoi(struct irq_data *d)
+int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
 {
-	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
-	struct irq_chip *chip = irq_data_get_irq_chip(d);
 	struct pnv_phb *phb = container_of(chip, struct pnv_phb,
 					   ioda.irq_chip);
+
+	return opal_pci_msi_eoi(phb->opal_id, hw_irq);
+}
+
+static void pnv_ioda2_msi_eoi(struct irq_data *d)
+{
 	int64_t rc;
+	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+	struct irq_chip *chip = irq_data_get_irq_chip(d);
 
-	rc = opal_pci_msi_eoi(phb->opal_id, hw_irq);
+	rc = pnv_opal_pci_msi_eoi(chip, hw_irq);
 	WARN_ON_ONCE(rc);
 
 	icp_native_eoi(d);
@@ -2748,6 +2754,16 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
 	irq_set_chip(virq, &phb->ioda.irq_chip);
 }
 
+/*
+ * Returns true iff chip is something that we could call
+ * pnv_opal_pci_msi_eoi for.
+ */
+bool is_pnv_opal_msi(struct irq_chip *chip)
+{
+	return chip->irq_eoi == pnv_ioda2_msi_eoi;
+}
+EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
+
 static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
 				  unsigned int hwirq, unsigned int virq,
 				  unsigned int is_64, struct msi_msg *msg)
-- 
cgit v0.10.2


From 3f2577749948803491874c6895fec559f3473eab Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 11 Aug 2016 15:07:43 +0200
Subject: powerpc: move hmi.c to arch/powerpc/kvm/

hmi.c functions are unused unless sibling_subcore_state is nonzero, and
that in turn happens only if KVM is in use.  So move the code to
arch/powerpc/kvm/, putting it under CONFIG_KVM_BOOK3S_HV_POSSIBLE
rather than CONFIG_PPC_BOOK3S_64.  The sibling_subcore_state is also
included in struct paca_struct only if KVM is supported by the kernel.

Cc: Daniel Axtens <dja@axtens.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: linuxppc-dev@lists.ozlabs.org
Cc: kvm-ppc@vger.kernel.org
Cc: kvm@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/include/asm/hmi.h b/arch/powerpc/include/asm/hmi.h
index 88b4901..85b7a1a 100644
--- a/arch/powerpc/include/asm/hmi.h
+++ b/arch/powerpc/include/asm/hmi.h
@@ -21,7 +21,7 @@
 #ifndef __ASM_PPC64_HMI_H__
 #define __ASM_PPC64_HMI_H__
 
-#ifdef CONFIG_PPC_BOOK3S_64
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 
 #define	CORE_TB_RESYNC_REQ_BIT		63
 #define MAX_SUBCORE_PER_CORE		4
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 148303e7..6a6792b 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -183,11 +183,6 @@ struct paca_struct {
 	 */
 	u16 in_mce;
 	u8 hmi_event_available;		 /* HMI event is available */
-	/*
-	 * Bitmap for sibling subcore status. See kvm/book3s_hv_ras.c for
-	 * more details
-	 */
-	struct sibling_subcore_state *sibling_subcore_state;
 #endif
 
 	/* Stuff for accurate time accounting */
@@ -202,6 +197,13 @@ struct paca_struct {
 	struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
 #endif
 	struct kvmppc_host_state kvm_hstate;
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	/*
+	 * Bitmap for sibling subcore status. See kvm/book3s_hv_ras.c for
+	 * more details
+	 */
+	struct sibling_subcore_state *sibling_subcore_state;
+#endif
 #endif
 };
 
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index b2027a5..fe4c075 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -41,7 +41,7 @@ obj-$(CONFIG_VDSO32)		+= vdso32/
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
 obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_ppc970.o cpu_setup_pa6t.o
 obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_power.o
-obj-$(CONFIG_PPC_BOOK3S_64)	+= mce.o mce_power.o hmi.o
+obj-$(CONFIG_PPC_BOOK3S_64)	+= mce.o mce_power.o
 obj-$(CONFIG_PPC_BOOK3E_64)	+= exceptions-64e.o idle_book3e.o
 obj-$(CONFIG_PPC64)		+= vdso64/
 obj-$(CONFIG_ALTIVEC)		+= vecemu.o
diff --git a/arch/powerpc/kernel/hmi.c b/arch/powerpc/kernel/hmi.c
deleted file mode 100644
index e3f738e..0000000
--- a/arch/powerpc/kernel/hmi.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Hypervisor Maintenance Interrupt (HMI) handling.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.
- *
- * Copyright 2015 IBM Corporation
- * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
- */
-
-#undef DEBUG
-
-#include <linux/types.h>
-#include <linux/compiler.h>
-#include <asm/paca.h>
-#include <asm/hmi.h>
-
-void wait_for_subcore_guest_exit(void)
-{
-	int i;
-
-	/*
-	 * NULL bitmap pointer indicates that KVM module hasn't
-	 * been loaded yet and hence no guests are running.
-	 * If no KVM is in use, no need to co-ordinate among threads
-	 * as all of them will always be in host and no one is going
-	 * to modify TB other than the opal hmi handler.
-	 * Hence, just return from here.
-	 */
-	if (!local_paca->sibling_subcore_state)
-		return;
-
-	for (i = 0; i < MAX_SUBCORE_PER_CORE; i++)
-		while (local_paca->sibling_subcore_state->in_guest[i])
-			cpu_relax();
-}
-
-void wait_for_tb_resync(void)
-{
-	if (!local_paca->sibling_subcore_state)
-		return;
-
-	while (test_bit(CORE_TB_RESYNC_REQ_BIT,
-				&local_paca->sibling_subcore_state->flags))
-		cpu_relax();
-}
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 1f9e552..855d4b9 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -78,6 +78,7 @@ kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
 
 ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
+	book3s_hv_hmi.o \
 	book3s_hv_rmhandlers.o \
 	book3s_hv_rm_mmu.o \
 	book3s_hv_ras.o \
diff --git a/arch/powerpc/kvm/book3s_hv_hmi.c b/arch/powerpc/kvm/book3s_hv_hmi.c
new file mode 100644
index 0000000..e3f738e
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_hmi.c
@@ -0,0 +1,56 @@
+/*
+ * Hypervisor Maintenance Interrupt (HMI) handling.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.
+ *
+ * Copyright 2015 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <asm/paca.h>
+#include <asm/hmi.h>
+
+void wait_for_subcore_guest_exit(void)
+{
+	int i;
+
+	/*
+	 * NULL bitmap pointer indicates that KVM module hasn't
+	 * been loaded yet and hence no guests are running.
+	 * If no KVM is in use, no need to co-ordinate among threads
+	 * as all of them will always be in host and no one is going
+	 * to modify TB other than the opal hmi handler.
+	 * Hence, just return from here.
+	 */
+	if (!local_paca->sibling_subcore_state)
+		return;
+
+	for (i = 0; i < MAX_SUBCORE_PER_CORE; i++)
+		while (local_paca->sibling_subcore_state->in_guest[i])
+			cpu_relax();
+}
+
+void wait_for_tb_resync(void)
+{
+	if (!local_paca->sibling_subcore_state)
+		return;
+
+	while (test_bit(CORE_TB_RESYNC_REQ_BIT,
+				&local_paca->sibling_subcore_state->flags))
+		cpu_relax();
+}
-- 
cgit v0.10.2


From 37f55d30df2eef89b97d627e5830beb6983c4101 Mon Sep 17 00:00:00 2001
From: Suresh Warrier <warrier@linux.vnet.ibm.com>
Date: Fri, 19 Aug 2016 15:35:46 +1000
Subject: KVM: PPC: Book3S HV: Convert kvmppc_read_intr to a C function

Modify kvmppc_read_intr to make it a C function.  Because it is called
from kvmppc_check_wake_reason, any of the assembler code that calls
either kvmppc_read_intr or kvmppc_check_wake_reason now has to assume
that the volatile registers might have been modified.

This also adds in the optimization of clearing saved_xirr in the case
where we completely handle and EOI an IPI.  Without this, the next
device interrupt will require two trips through the host interrupt
handling code.

[paulus@ozlabs.org - made kvmppc_check_wake_reason create a stack frame
 when it is calling kvmppc_read_intr, which means we can set r12 to
 the trap number (0x500) after the call to kvmppc_read_intr, instead
 of using r31.  Also moved the deliver_guest_interrupt label so as to
 restore XER and CTR, plus other minor tweaks.]

Signed-off-by: Suresh Warrier <warrier@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 5f0380d..b476a6a 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -25,6 +25,7 @@
 #include <asm/xics.h>
 #include <asm/dbell.h>
 #include <asm/cputhreads.h>
+#include <asm/io.h>
 
 #define KVM_CMA_CHUNK_ORDER	18
 
@@ -286,3 +287,86 @@ void kvmhv_commence_exit(int trap)
 
 struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
 EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv);
+
+/*
+ * Determine what sort of external interrupt is pending (if any).
+ * Returns:
+ *	0 if no interrupt is pending
+ *	1 if an interrupt is pending that needs to be handled by the host
+ *	-1 if there was a guest wakeup IPI (which has now been cleared)
+ */
+
+long kvmppc_read_intr(void)
+{
+	unsigned long xics_phys;
+	u32 h_xirr;
+	__be32 xirr;
+	u32 xisr;
+	u8 host_ipi;
+
+	/* see if a host IPI is pending */
+	host_ipi = local_paca->kvm_hstate.host_ipi;
+	if (host_ipi)
+		return 1;
+
+	/* Now read the interrupt from the ICP */
+	xics_phys = local_paca->kvm_hstate.xics_phys;
+	if (unlikely(!xics_phys))
+		return 1;
+
+	/*
+	 * Save XIRR for later. Since we get control in reverse endian
+	 * on LE systems, save it byte reversed and fetch it back in
+	 * host endian. Note that xirr is the value read from the
+	 * XIRR register, while h_xirr is the host endian version.
+	 */
+	xirr = _lwzcix(xics_phys + XICS_XIRR);
+	h_xirr = be32_to_cpu(xirr);
+	local_paca->kvm_hstate.saved_xirr = h_xirr;
+	xisr = h_xirr & 0xffffff;
+	/*
+	 * Ensure that the store/load complete to guarantee all side
+	 * effects of loading from XIRR has completed
+	 */
+	smp_mb();
+
+	/* if nothing pending in the ICP */
+	if (!xisr)
+		return 0;
+
+	/* We found something in the ICP...
+	 *
+	 * If it is an IPI, clear the MFRR and EOI it.
+	 */
+	if (xisr == XICS_IPI) {
+		_stbcix(xics_phys + XICS_MFRR, 0xff);
+		_stwcix(xics_phys + XICS_XIRR, xirr);
+		/*
+		 * Need to ensure side effects of above stores
+		 * complete before proceeding.
+		 */
+		smp_mb();
+
+		/*
+		 * We need to re-check host IPI now in case it got set in the
+		 * meantime. If it's clear, we bounce the interrupt to the
+		 * guest
+		 */
+		host_ipi = local_paca->kvm_hstate.host_ipi;
+		if (unlikely(host_ipi != 0)) {
+			/* We raced with the host,
+			 * we need to resend that IPI, bummer
+			 */
+			_stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY);
+			/* Let side effects complete */
+			smp_mb();
+			return 1;
+		}
+
+		/* OK, it's an IPI for us */
+		local_paca->kvm_hstate.saved_xirr = 0;
+		return -1;
+	}
+
+	return 1;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 9756555..dccfa85 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -221,6 +221,13 @@ kvmppc_primary_no_guest:
 	li	r3, 0		/* Don't wake on privileged (OS) doorbell */
 	b	kvm_do_nap
 
+/*
+ * kvm_novcpu_wakeup
+ *	Entered from kvm_start_guest if kvm_hstate.napping is set
+ *	to NAPPING_NOVCPU
+ *		r2 = kernel TOC
+ *		r13 = paca
+ */
 kvm_novcpu_wakeup:
 	ld	r1, HSTATE_HOST_R1(r13)
 	ld	r5, HSTATE_KVM_VCORE(r13)
@@ -230,6 +237,13 @@ kvm_novcpu_wakeup:
 	/* check the wake reason */
 	bl	kvmppc_check_wake_reason
 
+	/*
+	 * Restore volatile registers since we could have called
+	 * a C routine in kvmppc_check_wake_reason.
+	 *	r5 = VCORE
+	 */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+
 	/* see if any other thread is already exiting */
 	lwz	r0, VCORE_ENTRY_EXIT(r5)
 	cmpwi	r0, 0x100
@@ -322,6 +336,11 @@ kvm_start_guest:
 
 	/* Check the wake reason in SRR1 to see why we got here */
 	bl	kvmppc_check_wake_reason
+	/*
+	 * kvmppc_check_wake_reason could invoke a C routine, but we
+	 * have no volatile registers to restore when we return.
+	 */
+
 	cmpdi	r3, 0
 	bge	kvm_no_guest
 
@@ -881,6 +900,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	cmpwi	r3, 512		/* 1 microsecond */
 	blt	hdec_soon
 
+deliver_guest_interrupt:
 	ld	r6, VCPU_CTR(r4)
 	ld	r7, VCPU_XER(r4)
 
@@ -895,7 +915,6 @@ kvmppc_cede_reentry:		/* r4 = vcpu, r13 = paca */
 	mtspr	SPRN_SRR0, r6
 	mtspr	SPRN_SRR1, r7
 
-deliver_guest_interrupt:
 	/* r11 = vcpu->arch.msr & ~MSR_HV */
 	rldicl	r11, r11, 63 - MSR_HV_LG, 1
 	rotldi	r11, r11, 1 + MSR_HV_LG
@@ -1155,10 +1174,36 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	 * set, we know the host wants us out so let's do it now
 	 */
 	bl	kvmppc_read_intr
+
+	/*
+	 * Restore the active volatile registers after returning from
+	 * a C function.
+	 */
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	li	r12, BOOK3S_INTERRUPT_EXTERNAL
+
+	/*
+	 * kvmppc_read_intr return codes:
+	 *
+	 * Exit to host (r3 > 0)
+	 *   1 An interrupt is pending that needs to be handled by the host
+	 *     Exit guest and return to host by branching to guest_exit_cont
+	 *
+	 * Before returning to guest, we check if any CPU is heading out
+	 * to the host and if so, we head out also. If no CPUs are heading
+	 * check return values <= 0.
+	 *
+	 * Return to guest (r3 <= 0)
+	 *  0 No external interrupt is pending
+	 * -1 A guest wakeup IPI (which has now been cleared)
+	 *    In either case, we return to guest to deliver any pending
+	 *    guest interrupts.
+	 */
+
 	cmpdi	r3, 0
 	bgt	guest_exit_cont
 
-	/* Check if any CPU is heading out to the host, if so head out too */
+	/* Return code <= 0 */
 4:	ld	r5, HSTATE_KVM_VCORE(r13)
 	lwz	r0, VCORE_ENTRY_EXIT(r5)
 	cmpwi	r0, 0x100
@@ -2213,10 +2258,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
 	ld	r29, VCPU_GPR(R29)(r4)
 	ld	r30, VCPU_GPR(R30)(r4)
 	ld	r31, VCPU_GPR(R31)(r4)
- 
+
 	/* Check the wake reason in SRR1 to see why we got here */
 	bl	kvmppc_check_wake_reason
 
+	/*
+	 * Restore volatile registers since we could have called a
+	 * C routine in kvmppc_check_wake_reason
+	 *	r4 = VCPU
+	 * r3 tells us whether we need to return to host or not
+	 * WARNING: it gets checked further down:
+	 * should not modify r3 until this check is done.
+	 */
+	ld	r4, HSTATE_KVM_VCPU(r13)
+
 	/* clear our bit in vcore->napping_threads */
 34:	ld	r5,HSTATE_KVM_VCORE(r13)
 	lbz	r7,HSTATE_PTID(r13)
@@ -2230,7 +2285,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
 	li	r0,0
 	stb	r0,HSTATE_NAPPING(r13)
 
-	/* See if the wake reason means we need to exit */
+	/* See if the wake reason saved in r3 means we need to exit */
 	stw	r12, VCPU_TRAP(r4)
 	mr	r9, r4
 	cmpdi	r3, 0
@@ -2300,7 +2355,9 @@ machine_check_realmode:
  *
  * Also sets r12 to the interrupt vector for any interrupt that needs
  * to be handled now by the host (0x500 for external interrupt), or zero.
- * Modifies r0, r6, r7, r8.
+ * Modifies all volatile registers (since it may call a C function).
+ * This routine calls kvmppc_read_intr, a C function, if an external
+ * interrupt is pending.
  */
 kvmppc_check_wake_reason:
 	mfspr	r6, SPRN_SRR1
@@ -2310,8 +2367,7 @@ FTR_SECTION_ELSE
 	rlwinm	r6, r6, 45-31, 0xe	/* P7 wake reason field is 3 bits */
 ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S)
 	cmpwi	r6, 8			/* was it an external interrupt? */
-	li	r12, BOOK3S_INTERRUPT_EXTERNAL
-	beq	kvmppc_read_intr	/* if so, see what it was */
+	beq	7f			/* if so, see what it was */
 	li	r3, 0
 	li	r12, 0
 	cmpwi	r6, 6			/* was it the decrementer? */
@@ -2350,83 +2406,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	li	r3, 1
 	blr
 
-/*
- * Determine what sort of external interrupt is pending (if any).
- * Returns:
- *	0 if no interrupt is pending
- *	1 if an interrupt is pending that needs to be handled by the host
- *	-1 if there was a guest wakeup IPI (which has now been cleared)
- * Modifies r0, r6, r7, r8, returns value in r3.
- */
-kvmppc_read_intr:
-	/* see if a host IPI is pending */
-	li	r3, 1
-	lbz	r0, HSTATE_HOST_IPI(r13)
-	cmpwi	r0, 0
-	bne	1f
-
-	/* Now read the interrupt from the ICP */
-	ld	r6, HSTATE_XICS_PHYS(r13)
-	li	r7, XICS_XIRR
-	cmpdi	r6, 0
-	beq-	1f
-	lwzcix	r0, r6, r7
-	/*
-	 * Save XIRR for later. Since we get in in reverse endian on LE
-	 * systems, save it byte reversed and fetch it back in host endian.
-	 */
-	li	r3, HSTATE_SAVED_XIRR
-	STWX_BE	r0, r3, r13
-#ifdef __LITTLE_ENDIAN__
-	lwz	r3, HSTATE_SAVED_XIRR(r13)
-#else
-	mr	r3, r0
-#endif
-	rlwinm.	r3, r3, 0, 0xffffff
-	sync
-	beq	1f			/* if nothing pending in the ICP */
-
-	/* We found something in the ICP...
-	 *
-	 * If it's not an IPI, stash it in the PACA and return to
-	 * the host, we don't (yet) handle directing real external
-	 * interrupts directly to the guest
-	 */
-	cmpwi	r3, XICS_IPI		/* if there is, is it an IPI? */
-	bne	42f
-
-	/* It's an IPI, clear the MFRR and EOI it */
-	li	r3, 0xff
-	li	r8, XICS_MFRR
-	stbcix	r3, r6, r8		/* clear the IPI */
-	stwcix	r0, r6, r7		/* EOI it */
-	sync
-
-	/* We need to re-check host IPI now in case it got set in the
-	 * meantime. If it's clear, we bounce the interrupt to the
-	 * guest
-	 */
-	lbz	r0, HSTATE_HOST_IPI(r13)
-	cmpwi	r0, 0
-	bne-	43f
-
-	/* OK, it's an IPI for us */
-	li	r12, 0
-	li	r3, -1
-1:	blr
-
-42:	/* It's not an IPI and it's for the host. We saved a copy of XIRR in
-	 * the PACA earlier, it will be picked up by the host ICP driver
-	 */
-	li	r3, 1
-	b	1b
-
-43:	/* We raced with the host, we need to resend that IPI, bummer */
-	li	r0, IPI_PRIORITY
-	stbcix	r0, r6, r8		/* set the IPI */
-	sync
-	li	r3, 1
-	b	1b
+	/* external interrupt - create a stack frame so we can call C */
+7:	mflr	r0
+	std	r0, PPC_LR_STKOFF(r1)
+	stdu	r1, -PPC_MIN_STKFRM(r1)
+	bl	kvmppc_read_intr
+	nop
+	li	r12, BOOK3S_INTERRUPT_EXTERNAL
+	ld	r0, PPC_MIN_STKFRM+PPC_LR_STKOFF(r1)
+	addi	r1, r1, PPC_MIN_STKFRM
+	mtlr	r0
+	blr
 
 /*
  * Save away FP, VMX and VSX registers.
-- 
cgit v0.10.2


From 9576730d0e6e301343c5aead5418ad53fcecfd14 Mon Sep 17 00:00:00 2001
From: Suresh Warrier <warrier@linux.vnet.ibm.com>
Date: Fri, 19 Aug 2016 15:35:47 +1000
Subject: KVM: PPC: select IRQ_BYPASS_MANAGER

Select IRQ_BYPASS_MANAGER for PPC when CONFIG_KVM is set.
Add the PPC producer functions for add and del producer.

[paulus@ozlabs.org - Moved new functions from book3s.c to powerpc.c
 so booke compiles; added kvm_arch_has_irq_bypass implementation.]

Signed-off-by: Suresh Warrier <warrier@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 2544eda..94715e2 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -287,6 +287,10 @@ struct kvmppc_ops {
 	long (*arch_vm_ioctl)(struct file *filp, unsigned int ioctl,
 			      unsigned long arg);
 	int (*hcall_implemented)(unsigned long hcall);
+	int (*irq_bypass_add_producer)(struct irq_bypass_consumer *,
+				       struct irq_bypass_producer *);
+	void (*irq_bypass_del_producer)(struct irq_bypass_consumer *,
+					struct irq_bypass_producer *);
 };
 
 extern struct kvmppc_ops *kvmppc_hv_ops;
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index c0a2478..029be26 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -23,6 +23,8 @@ config KVM
 	select HAVE_KVM_EVENTFD
 	select SRCU
 	select KVM_VFIO
+	select IRQ_BYPASS_MANAGER
+	select HAVE_KVM_IRQ_BYPASS
 
 config KVM_BOOK3S_HANDLER
 	bool
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index ab2b3d5..0b7d664 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -27,6 +27,8 @@
 #include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/module.h>
+#include <linux/irqbypass.h>
+#include <linux/kvm_irqfd.h>
 #include <asm/cputable.h>
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
@@ -739,6 +741,42 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 #endif
 }
 
+/*
+ * irq_bypass_add_producer and irq_bypass_del_producer are only
+ * useful if the architecture supports PCI passthrough.
+ * irq_bypass_stop and irq_bypass_start are not needed and so
+ * kvm_ops are not defined for them.
+ */
+bool kvm_arch_has_irq_bypass(void)
+{
+	return ((kvmppc_hv_ops && kvmppc_hv_ops->irq_bypass_add_producer) ||
+		(kvmppc_pr_ops && kvmppc_pr_ops->irq_bypass_add_producer));
+}
+
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+				     struct irq_bypass_producer *prod)
+{
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(cons, struct kvm_kernel_irqfd, consumer);
+	struct kvm *kvm = irqfd->kvm;
+
+	if (kvm->arch.kvm_ops->irq_bypass_add_producer)
+		return kvm->arch.kvm_ops->irq_bypass_add_producer(cons, prod);
+
+	return 0;
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+				      struct irq_bypass_producer *prod)
+{
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(cons, struct kvm_kernel_irqfd, consumer);
+	struct kvm *kvm = irqfd->kvm;
+
+	if (kvm->arch.kvm_ops->irq_bypass_del_producer)
+		kvm->arch.kvm_ops->irq_bypass_del_producer(cons, prod);
+}
+
 static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
                                       struct kvm_run *run)
 {
-- 
cgit v0.10.2


From 8daaafc88b46fb3af952e92d7c2816a8950e1363 Mon Sep 17 00:00:00 2001
From: Suresh Warrier <warrier@linux.vnet.ibm.com>
Date: Fri, 19 Aug 2016 15:35:48 +1000
Subject: KVM: PPC: Book3S HV: Introduce kvmppc_passthru_irqmap

This patch introduces an IRQ mapping structure, the
kvmppc_passthru_irqmap structure that is to be used
to map the real hardware IRQ in the host with the virtual
hardware IRQ (gsi) that is injected into a guest by KVM for
passthrough adapters.

Currently, we assume a separate IRQ mapping structure for
each guest. Each kvmppc_passthru_irqmap has a mapping arrays,
containing all defined real<->virtual IRQs.

[paulus@ozlabs.org - removed irq_chip field from struct
 kvmppc_passthru_irqmap; changed parameter for
 kvmppc_get_passthru_irqmap from struct kvm_vcpu * to struct
 kvm *, removed small cached array.]

Signed-off-by: Suresh Warrier <warrier@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 373003f..89ac1f6 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -203,6 +203,8 @@ struct kvmppc_spapr_tce_table {
 struct kvmppc_xics;
 struct kvmppc_icp;
 
+struct kvmppc_passthru_irqmap;
+
 /*
  * The reverse mapping array has one entry for each HPTE,
  * which stores the guest's view of the second word of the HPTE
@@ -273,6 +275,7 @@ struct kvm_arch {
 #endif
 #ifdef CONFIG_KVM_XICS
 	struct kvmppc_xics *xics;
+	struct kvmppc_passthru_irqmap *pimap;
 #endif
 	struct kvmppc_ops *kvm_ops;
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
@@ -369,6 +372,20 @@ struct kvmhv_tb_accumulator {
 	u64	tb_max;		/* max time */
 };
 
+#ifdef CONFIG_PPC_BOOK3S_64
+struct kvmppc_irq_map {
+	u32	r_hwirq;
+	u32	v_hwirq;
+	struct irq_desc *desc;
+};
+
+#define	KVMPPC_PIRQ_MAPPED	1024
+struct kvmppc_passthru_irqmap {
+	int n_mapped;
+	struct kvmppc_irq_map mapped[KVMPPC_PIRQ_MAPPED];
+};
+#endif
+
 # ifdef CONFIG_PPC_FSL_BOOK3E
 #define KVMPPC_BOOKE_IAC_NUM	2
 #define KVMPPC_BOOKE_DAC_NUM	2
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 94715e2..4ca2ba3 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -457,8 +457,18 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.irq_type == KVMPPC_IRQ_XICS;
 }
+
+static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
+				struct kvm *kvm)
+{
+	if (kvm)
+		return kvm->arch.pimap;
+	return NULL;
+}
+
 extern void kvmppc_alloc_host_rm_ops(void);
 extern void kvmppc_free_host_rm_ops(void);
+extern void kvmppc_free_pimap(struct kvm *kvm);
 extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
 extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
 extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
@@ -470,8 +480,12 @@ extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
 extern void kvmppc_xics_ipi_action(void);
 extern int h_ipi_redirect;
 #else
+static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
+				struct kvm *kvm)
+	{ return NULL; }
 static inline void kvmppc_alloc_host_rm_ops(void) {};
 static inline void kvmppc_free_host_rm_ops(void) {};
+static inline void kvmppc_free_pimap(struct kvm *kvm) {};
 static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
 	{ return 0; }
 static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 30ff8ab..d5c7061 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3412,6 +3412,19 @@ static int kvmppc_core_check_processor_compat_hv(void)
 	return 0;
 }
 
+#ifdef CONFIG_KVM_XICS
+
+void kvmppc_free_pimap(struct kvm *kvm)
+{
+	kfree(kvm->arch.pimap);
+}
+
+struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void)
+{
+	return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL);
+}
+#endif
+
 static long kvm_arch_vm_ioctl_hv(struct file *filp,
 				 unsigned int ioctl, unsigned long arg)
 {
-- 
cgit v0.10.2


From c57875f5f9be2cea1f1cc83f815a3aadabedcdd3 Mon Sep 17 00:00:00 2001
From: Suresh Warrier <warrier@linux.vnet.ibm.com>
Date: Fri, 19 Aug 2016 15:35:50 +1000
Subject: KVM: PPC: Book3S HV: Enable IRQ bypass

Add the irq_bypass_add_producer and irq_bypass_del_producer
functions. These functions get called whenever a GSI is being
defined for a guest. They create/remove the mapping between
host real IRQ numbers and the guest GSI.

Add the following helper functions to manage the
passthrough IRQ map.

kvmppc_set_passthru_irq()
  Creates a mapping in the passthrough IRQ map that maps a host
  IRQ to a guest GSI. It allocates the structure (one per guest VM)
  the first time it is called.

kvmppc_clr_passthru_irq()
  Removes the passthrough IRQ map entry given a guest GSI.
  The passthrough IRQ map structure is not freed even when the
  number of mapped entries goes to zero. It is only freed when
  the VM is destroyed.

[paulus@ozlabs.org - modified to use is_pnv_opal_msi() rather than
 requiring all passed-through interrupts to use the same irq_chip;
 changed deletion so it zeroes out the r_hwirq field rather than
 copying the last entry down and decrementing the number of entries.]

Signed-off-by: Suresh Warrier <warrier@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index d5c7061..a7fe178 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -53,10 +53,13 @@
 #include <asm/smp.h>
 #include <asm/dbell.h>
 #include <asm/hmi.h>
+#include <asm/pnv-pci.h>
 #include <linux/gfp.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 #include <linux/hugetlb.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
 #include <linux/module.h>
 #include <linux/compiler.h>
 
@@ -3377,6 +3380,8 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 	kvmppc_free_vcores(kvm);
 
 	kvmppc_free_hpt(kvm);
+
+	kvmppc_free_pimap(kvm);
 }
 
 /* We don't need to emulate any privileged instructions or dcbz */
@@ -3419,10 +3424,159 @@ void kvmppc_free_pimap(struct kvm *kvm)
 	kfree(kvm->arch.pimap);
 }
 
-struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void)
+static struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void)
 {
 	return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL);
 }
+
+static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
+{
+	struct irq_desc *desc;
+	struct kvmppc_irq_map *irq_map;
+	struct kvmppc_passthru_irqmap *pimap;
+	struct irq_chip *chip;
+	int i;
+
+	desc = irq_to_desc(host_irq);
+	if (!desc)
+		return -EIO;
+
+	mutex_lock(&kvm->lock);
+
+	pimap = kvm->arch.pimap;
+	if (pimap == NULL) {
+		/* First call, allocate structure to hold IRQ map */
+		pimap = kvmppc_alloc_pimap();
+		if (pimap == NULL) {
+			mutex_unlock(&kvm->lock);
+			return -ENOMEM;
+		}
+		kvm->arch.pimap = pimap;
+	}
+
+	/*
+	 * For now, we only support interrupts for which the EOI operation
+	 * is an OPAL call followed by a write to XIRR, since that's
+	 * what our real-mode EOI code does.
+	 */
+	chip = irq_data_get_irq_chip(&desc->irq_data);
+	if (!chip || !is_pnv_opal_msi(chip)) {
+		pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n",
+			host_irq, guest_gsi);
+		mutex_unlock(&kvm->lock);
+		return -ENOENT;
+	}
+
+	/*
+	 * See if we already have an entry for this guest IRQ number.
+	 * If it's mapped to a hardware IRQ number, that's an error,
+	 * otherwise re-use this entry.
+	 */
+	for (i = 0; i < pimap->n_mapped; i++) {
+		if (guest_gsi == pimap->mapped[i].v_hwirq) {
+			if (pimap->mapped[i].r_hwirq) {
+				mutex_unlock(&kvm->lock);
+				return -EINVAL;
+			}
+			break;
+		}
+	}
+
+	if (i == KVMPPC_PIRQ_MAPPED) {
+		mutex_unlock(&kvm->lock);
+		return -EAGAIN;		/* table is full */
+	}
+
+	irq_map = &pimap->mapped[i];
+
+	irq_map->v_hwirq = guest_gsi;
+	irq_map->r_hwirq = desc->irq_data.hwirq;
+	irq_map->desc = desc;
+
+	if (i == pimap->n_mapped)
+		pimap->n_mapped++;
+
+	mutex_unlock(&kvm->lock);
+
+	return 0;
+}
+
+static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
+{
+	struct irq_desc *desc;
+	struct kvmppc_passthru_irqmap *pimap;
+	int i;
+
+	desc = irq_to_desc(host_irq);
+	if (!desc)
+		return -EIO;
+
+	mutex_lock(&kvm->lock);
+
+	if (kvm->arch.pimap == NULL) {
+		mutex_unlock(&kvm->lock);
+		return 0;
+	}
+	pimap = kvm->arch.pimap;
+
+	for (i = 0; i < pimap->n_mapped; i++) {
+		if (guest_gsi == pimap->mapped[i].v_hwirq)
+			break;
+	}
+
+	if (i == pimap->n_mapped) {
+		mutex_unlock(&kvm->lock);
+		return -ENODEV;
+	}
+
+	/* invalidate the entry */
+	pimap->mapped[i].r_hwirq = 0;
+
+	/*
+	 * We don't free this structure even when the count goes to
+	 * zero. The structure is freed when we destroy the VM.
+	 */
+
+	mutex_unlock(&kvm->lock);
+	return 0;
+}
+
+static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons,
+					     struct irq_bypass_producer *prod)
+{
+	int ret = 0;
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+	irqfd->producer = prod;
+
+	ret = kvmppc_set_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
+	if (ret)
+		pr_info("kvmppc_set_passthru_irq (irq %d, gsi %d) fails: %d\n",
+			prod->irq, irqfd->gsi, ret);
+
+	return ret;
+}
+
+static void kvmppc_irq_bypass_del_producer_hv(struct irq_bypass_consumer *cons,
+					      struct irq_bypass_producer *prod)
+{
+	int ret;
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+	irqfd->producer = NULL;
+
+	/*
+	 * When producer of consumer is unregistered, we change back to
+	 * default external interrupt handling mode - KVM real mode
+	 * will switch back to host.
+	 */
+	ret = kvmppc_clr_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
+	if (ret)
+		pr_warn("kvmppc_clr_passthru_irq (irq %d, gsi %d) fails: %d\n",
+			prod->irq, irqfd->gsi, ret);
+}
 #endif
 
 static long kvm_arch_vm_ioctl_hv(struct file *filp,
@@ -3543,6 +3697,10 @@ static struct kvmppc_ops kvm_ops_hv = {
 	.fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
 	.arch_vm_ioctl  = kvm_arch_vm_ioctl_hv,
 	.hcall_implemented = kvmppc_hcall_impl_hv,
+#ifdef CONFIG_KVM_XICS
+	.irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv,
+	.irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv,
+#endif
 };
 
 static int kvm_init_subcore_bitmap(void)
-- 
cgit v0.10.2


From e3c13e56a4717ee334837a20c596e527eb6355e1 Mon Sep 17 00:00:00 2001
From: Suresh Warrier <warrier@linux.vnet.ibm.com>
Date: Fri, 19 Aug 2016 15:35:51 +1000
Subject: KVM: PPC: Book3S HV: Handle passthrough interrupts in guest

Currently, KVM switches back to the host to handle any external
interrupt (when the interrupt is received while running in the
guest). This patch updates real-mode KVM to check if an interrupt
is generated by a passthrough adapter that is owned by this guest.
If so, the real mode KVM will directly inject the corresponding
virtual interrupt to the guest VCPU's ICS and also EOI the interrupt
in hardware. In short, the interrupt is handled entirely in real
mode in the guest context without switching back to the host.

In some rare cases, the interrupt cannot be completely handled in
real mode, for instance, a VCPU that is sleeping needs to be woken
up. In this case, KVM simply switches back to the host with trap
reason set to 0x500. This works, but it is clearly not very efficient.
A following patch will distinguish this case and handle it
correctly in the host. Note that we can use the existing
check_too_hard() routine even though we are not in a hypercall to
determine if there is unfinished business that needs to be
completed in host virtual mode.

The patch assumes that the mapping between hardware interrupt IRQ
and virtual IRQ to be injected to the guest already exists for the
PCI passthrough interrupts that need to be handled in real mode.
If the mapping does not exist, KVM falls back to the default
existing behavior.

The KVM real mode code reads mappings from the mapped array in the
passthrough IRQ map without taking any lock.  We carefully order the
loads and stores of the fields in the kvmppc_irq_map data structure
using memory barriers to avoid an inconsistent mapping being seen by
the reader. Thus, although it is possible to miss a map entry, it is
not possible to read a stale value.

[paulus@ozlabs.org - get irq_chip from irq_map rather than pimap,
 pulled out powernv eoi change into a separate patch, made
 kvmppc_read_intr get the vcpu from the paca rather than being
 passed in, rewrote the logic at the end of kvmppc_read_intr to
 avoid deep indentation, simplified logic in book3s_hv_rmhandlers.S
 since we were always restoring SRR0/1 anyway, get rid of the cached
 array (just use the mapped array), removed the kick_all_cpus_sync()
 call, clear saved_xirr PACA field when we handle the interrupt in
 real mode, fix compilation with CONFIG_KVM_XICS=n.]

Signed-off-by: Suresh Warrier <warrier@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 4ca2ba3..4299a1f 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -478,6 +478,9 @@ extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
 extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
 			struct kvm_vcpu *vcpu, u32 cpu);
 extern void kvmppc_xics_ipi_action(void);
+extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, u32 xirr,
+				 struct kvmppc_irq_map *irq_map,
+				 struct kvmppc_passthru_irqmap *pimap);
 extern int h_ipi_redirect;
 #else
 static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index a7fe178..a393c2b 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3490,9 +3490,15 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
 	irq_map = &pimap->mapped[i];
 
 	irq_map->v_hwirq = guest_gsi;
-	irq_map->r_hwirq = desc->irq_data.hwirq;
 	irq_map->desc = desc;
 
+	/*
+	 * Order the above two stores before the next to serialize with
+	 * the KVM real mode handler.
+	 */
+	smp_wmb();
+	irq_map->r_hwirq = desc->irq_data.hwirq;
+
 	if (i == pimap->n_mapped)
 		pimap->n_mapped++;
 
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index b476a6a..7531e29 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -288,12 +288,83 @@ void kvmhv_commence_exit(int trap)
 struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
 EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv);
 
+#ifdef CONFIG_KVM_XICS
+static struct kvmppc_irq_map *get_irqmap(struct kvmppc_passthru_irqmap *pimap,
+					 u32 xisr)
+{
+	int i;
+
+	/*
+	 * We access the mapped array here without a lock.  That
+	 * is safe because we never reduce the number of entries
+	 * in the array and we never change the v_hwirq field of
+	 * an entry once it is set.
+	 *
+	 * We have also carefully ordered the stores in the writer
+	 * and the loads here in the reader, so that if we find a matching
+	 * hwirq here, the associated GSI and irq_desc fields are valid.
+	 */
+	for (i = 0; i < pimap->n_mapped; i++)  {
+		if (xisr == pimap->mapped[i].r_hwirq) {
+			/*
+			 * Order subsequent reads in the caller to serialize
+			 * with the writer.
+			 */
+			smp_rmb();
+			return &pimap->mapped[i];
+		}
+	}
+	return NULL;
+}
+
+/*
+ * If we have an interrupt that's not an IPI, check if we have a
+ * passthrough adapter and if so, check if this external interrupt
+ * is for the adapter.
+ * We will attempt to deliver the IRQ directly to the target VCPU's
+ * ICP, the virtual ICP (based on affinity - the xive value in ICS).
+ *
+ * If the delivery fails or if this is not for a passthrough adapter,
+ * return to the host to handle this interrupt. We earlier
+ * saved a copy of the XIRR in the PACA, it will be picked up by
+ * the host ICP driver.
+ */
+static int kvmppc_check_passthru(u32 xisr, __be32 xirr)
+{
+	struct kvmppc_passthru_irqmap *pimap;
+	struct kvmppc_irq_map *irq_map;
+	struct kvm_vcpu *vcpu;
+
+	vcpu = local_paca->kvm_hstate.kvm_vcpu;
+	if (!vcpu)
+		return 1;
+	pimap = kvmppc_get_passthru_irqmap(vcpu->kvm);
+	if (!pimap)
+		return 1;
+	irq_map = get_irqmap(pimap, xisr);
+	if (!irq_map)
+		return 1;
+
+	/* We're handling this interrupt, generic code doesn't need to */
+	local_paca->kvm_hstate.saved_xirr = 0;
+
+	return kvmppc_deliver_irq_passthru(vcpu, xirr, irq_map, pimap);
+}
+
+#else
+static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr)
+{
+	return 1;
+}
+#endif
+
 /*
  * Determine what sort of external interrupt is pending (if any).
  * Returns:
  *	0 if no interrupt is pending
  *	1 if an interrupt is pending that needs to be handled by the host
  *	-1 if there was a guest wakeup IPI (which has now been cleared)
+ *	-2 if there is PCI passthrough external interrupt that was handled
  */
 
 long kvmppc_read_intr(void)
@@ -368,5 +439,5 @@ long kvmppc_read_intr(void)
 		return -1;
 	}
 
-	return 1;
+	return kvmppc_check_passthru(xisr, xirr);
 }
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 980d8a6..17f5b85 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -19,6 +19,7 @@
 #include <asm/synch.h>
 #include <asm/cputhreads.h>
 #include <asm/ppc-opcode.h>
+#include <asm/pnv-pci.h>
 
 #include "book3s_xics.h"
 
@@ -712,6 +713,49 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 	return check_too_hard(xics, icp);
 }
 
+unsigned long eoi_rc;
+
+static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr)
+{
+	unsigned long xics_phys;
+	int64_t rc;
+
+	rc = pnv_opal_pci_msi_eoi(c, hwirq);
+
+	if (rc)
+		eoi_rc = rc;
+
+	iosync();
+
+	/* EOI it */
+	xics_phys = local_paca->kvm_hstate.xics_phys;
+	_stwcix(xics_phys + XICS_XIRR, xirr);
+}
+
+long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
+				 u32 xirr,
+				 struct kvmppc_irq_map *irq_map,
+				 struct kvmppc_passthru_irqmap *pimap)
+{
+	struct kvmppc_xics *xics;
+	struct kvmppc_icp *icp;
+	u32 irq;
+
+	irq = irq_map->v_hwirq;
+	xics = vcpu->kvm->arch.xics;
+	icp = vcpu->arch.icp;
+
+	icp_rm_deliver_irq(xics, icp, irq);
+
+	/* EOI the interrupt */
+	icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr);
+
+	if (check_too_hard(xics, icp) == H_TOO_HARD)
+		return 1;
+	else
+		return -2;
+}
+
 /*  --- Non-real mode XICS-related built-in routines ---  */
 
 /**
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index dccfa85..12fb2af 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1198,6 +1198,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	 * -1 A guest wakeup IPI (which has now been cleared)
 	 *    In either case, we return to guest to deliver any pending
 	 *    guest interrupts.
+	 *
+	 * -2 A PCI passthrough external interrupt was handled
+	 *    (interrupt was delivered directly to guest)
+	 *    Return to guest to deliver any pending guest interrupts.
 	 */
 
 	cmpdi	r3, 0
@@ -2352,6 +2356,8 @@ machine_check_realmode:
  *	0 if nothing needs to be done
  *	1 if something happened that needs to be handled by the host
  *	-1 if there was a guest wakeup (IPI or msgsnd)
+ *	-2 if we handled a PCI passthrough interrupt (returned by
+ *		kvmppc_read_intr only)
  *
  * Also sets r12 to the interrupt vector for any interrupt that needs
  * to be handled now by the host (0x500 for external interrupt), or zero.
-- 
cgit v0.10.2


From f7af5209b87c592aad81da65bd104241aa43d36a Mon Sep 17 00:00:00 2001
From: Suresh Warrier <warrier@linux.vnet.ibm.com>
Date: Fri, 19 Aug 2016 15:35:52 +1000
Subject: KVM: PPC: Book3S HV: Complete passthrough interrupt in host

In existing real mode ICP code, when updating the virtual ICP
state, if there is a required action that cannot be completely
handled in real mode, as for instance, a VCPU needs to be woken
up, flags are set in the ICP to indicate the required action.
This is checked when returning from hypercalls to decide whether
the call needs switch back to the host where the action can be
performed in virtual mode. Note that if h_ipi_redirect is enabled,
real mode code will first try to message a free host CPU to
complete this job instead of returning the host to do it ourselves.

Currently, the real mode PCI passthrough interrupt handling code
checks if any of these flags are set and simply returns to the host.
This is not good enough as the trap value (0x500) is treated as an
external interrupt by the host code. It is only when the trap value
is a hypercall that the host code searches for and acts on unfinished
work by calling kvmppc_xics_rm_complete.

This patch introduces a special trap BOOK3S_INTERRUPT_HV_RM_HARD
which is returned by KVM if there is unfinished business to be
completed in host virtual mode after handling a PCI passthrough
interrupt. The host checks for this special interrupt condition
and calls into the kvmppc_xics_rm_complete, which is made an
exported function for this reason.

[paulus@ozlabs.org - moved logic to set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
 in book3s_hv_rmhandlers.S into the end of kvmppc_check_wake_reason.]

Signed-off-by: Suresh Warrier <warrier@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 5bca220..05cabed 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -105,6 +105,15 @@
 #define BOOK3S_INTERRUPT_FAC_UNAVAIL	0xf60
 #define BOOK3S_INTERRUPT_H_FAC_UNAVAIL	0xf80
 
+/* book3s_hv */
+
+/*
+ * Special trap used to indicate to host that this is a
+ * passthrough interrupt that could not be handled
+ * completely in the guest.
+ */
+#define BOOK3S_INTERRUPT_HV_RM_HARD	0x5555
+
 #define BOOK3S_IRQPRIO_SYSTEM_RESET		0
 #define BOOK3S_IRQPRIO_DATA_SEGMENT		1
 #define BOOK3S_IRQPRIO_INST_SEGMENT		2
@@ -136,6 +145,7 @@
 #define RESUME_FLAG_NV          (1<<0)  /* Reload guest nonvolatile state? */
 #define RESUME_FLAG_HOST        (1<<1)  /* Resume host? */
 #define RESUME_FLAG_ARCH1	(1<<2)
+#define RESUME_FLAG_ARCH2	(1<<3)
 
 #define RESUME_GUEST            0
 #define RESUME_GUEST_NV         RESUME_FLAG_NV
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 4299a1f..e0ada31 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -469,6 +469,7 @@ static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
 extern void kvmppc_alloc_host_rm_ops(void);
 extern void kvmppc_free_host_rm_ops(void);
 extern void kvmppc_free_pimap(struct kvm *kvm);
+extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall);
 extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
 extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
 extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
@@ -489,6 +490,8 @@ static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
 static inline void kvmppc_alloc_host_rm_ops(void) {};
 static inline void kvmppc_free_host_rm_ops(void) {};
 static inline void kvmppc_free_pimap(struct kvm *kvm) {};
+static inline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
+	{ return 0; }
 static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
 	{ return 0; }
 static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index a393c2b..90beb96 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -74,6 +74,8 @@
 
 /* Used to indicate that a guest page fault needs to be handled */
 #define RESUME_PAGE_FAULT	(RESUME_GUEST | RESUME_FLAG_ARCH1)
+/* Used to indicate that a guest passthrough interrupt needs to be handled */
+#define RESUME_PASSTHROUGH	(RESUME_GUEST | RESUME_FLAG_ARCH2)
 
 /* Used as a "null" value for timebase values */
 #define TB_NIL	(~(u64)0)
@@ -1032,6 +1034,9 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
 		r = RESUME_GUEST;
 		break;
+	case BOOK3S_INTERRUPT_HV_RM_HARD:
+		r = RESUME_PASSTHROUGH;
+		break;
 	default:
 		kvmppc_dump_regs(vcpu);
 		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
@@ -2951,7 +2956,8 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			r = kvmppc_book3s_hv_page_fault(run, vcpu,
 				vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
 			srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
-		}
+		} else if (r == RESUME_PASSTHROUGH)
+			r = kvmppc_xics_rm_complete(vcpu, 0);
 	} while (is_kvmppc_resume_guest(r));
 
  out:
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 7531e29..0c84d6b 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -363,6 +363,7 @@ static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr)
  * Returns:
  *	0 if no interrupt is pending
  *	1 if an interrupt is pending that needs to be handled by the host
+ *	2 Passthrough that needs completion in the host
  *	-1 if there was a guest wakeup IPI (which has now been cleared)
  *	-2 if there is PCI passthrough external interrupt that was handled
  */
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 17f5b85..3b8d7ac 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -751,7 +751,7 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
 	icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr);
 
 	if (check_too_hard(xics, icp) == H_TOO_HARD)
-		return 1;
+		return 2;
 	else
 		return -2;
 }
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 12fb2af..7cc924b 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1189,6 +1189,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	 *   1 An interrupt is pending that needs to be handled by the host
 	 *     Exit guest and return to host by branching to guest_exit_cont
 	 *
+	 *   2 Passthrough that needs completion in the host
+	 *     Exit guest and return to host by branching to guest_exit_cont
+	 *     However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
+	 *     to indicate to the host to complete handling the interrupt
+	 *
 	 * Before returning to guest, we check if any CPU is heading out
 	 * to the host and if so, we head out also. If no CPUs are heading
 	 * check return values <= 0.
@@ -1204,6 +1209,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	 *    Return to guest to deliver any pending guest interrupts.
 	 */
 
+	cmpdi	r3, 1
+	ble	1f
+
+	/* Return code = 2 */
+	li	r12, BOOK3S_INTERRUPT_HV_RM_HARD
+	stw	r12, VCPU_TRAP(r9)
+	b	guest_exit_cont
+
+1:	/* Return code <= 1 */
 	cmpdi	r3, 0
 	bgt	guest_exit_cont
 
@@ -2419,6 +2433,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	bl	kvmppc_read_intr
 	nop
 	li	r12, BOOK3S_INTERRUPT_EXTERNAL
+	cmpdi	r3, 1
+	ble	1f
+
+	/*
+	 * Return code of 2 means PCI passthrough interrupt, but
+	 * we need to return back to host to complete handling the
+	 * interrupt. Trap reason is expected in r12 by guest
+	 * exit code.
+	 */
+	li	r12, BOOK3S_INTERRUPT_HV_RM_HARD
+1:
 	ld	r0, PPC_MIN_STKFRM+PPC_LR_STKOFF(r1)
 	addi	r1, r1, PPC_MIN_STKFRM
 	mtlr	r0
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 686147e..4edbe7b 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -812,7 +812,7 @@ static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 	return H_SUCCESS;
 }
 
-static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
+int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
 {
 	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
 	struct kvmppc_icp *icp = vcpu->arch.icp;
@@ -841,6 +841,7 @@ static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
 
 	return H_SUCCESS;
 }
+EXPORT_SYMBOL_GPL(kvmppc_xics_rm_complete);
 
 int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
 {
-- 
cgit v0.10.2


From af893c7dc941f2510d4f23e76e312c77c434b2f0 Mon Sep 17 00:00:00 2001
From: Suresh Warrier <warrier@linux.vnet.ibm.com>
Date: Fri, 19 Aug 2016 15:35:53 +1000
Subject: KVM: PPC: Book3S HV: Dump irqmap in debugfs

Dump the passthrough irqmap structure associated with a
guest as part of /sys/kernel/debug/powerpc/kvm-xics-*.

Signed-off-by: Suresh Warrier <warrier@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 4edbe7b..be5179c 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -893,6 +893,21 @@ EXPORT_SYMBOL_GPL(kvmppc_xics_hcall);
 
 /* -- Initialisation code etc. -- */
 
+static void xics_debugfs_irqmap(struct seq_file *m,
+				struct kvmppc_passthru_irqmap *pimap)
+{
+	int i;
+
+	if (!pimap)
+		return;
+	seq_printf(m, "========\nPIRQ mappings: %d maps\n===========\n",
+				pimap->n_mapped);
+	for (i = 0; i < pimap->n_mapped; i++)  {
+		seq_printf(m, "r_hwirq=%x, v_hwirq=%x\n",
+			pimap->mapped[i].r_hwirq, pimap->mapped[i].v_hwirq);
+	}
+}
+
 static int xics_debug_show(struct seq_file *m, void *private)
 {
 	struct kvmppc_xics *xics = m->private;
@@ -914,6 +929,8 @@ static int xics_debug_show(struct seq_file *m, void *private)
 	t_check_resend = 0;
 	t_reject = 0;
 
+	xics_debugfs_irqmap(m, kvm->arch.pimap);
+
 	seq_printf(m, "=========\nICP state\n=========\n");
 
 	kvm_for_each_vcpu(i, vcpu, kvm) {
-- 
cgit v0.10.2


From 644abbb254b1ab171f777431b23e6fb5879599d0 Mon Sep 17 00:00:00 2001
From: Suresh Warrier <warrier@linux.vnet.ibm.com>
Date: Fri, 19 Aug 2016 15:35:54 +1000
Subject: KVM: PPC: Book3S HV: Tunable to disable KVM IRQ bypass

Add a  module parameter kvm_irq_bypass for kvm_hv.ko to
disable IRQ bypass for passthrough interrupts. The default
value of this tunable is 1 - that is enable the feature.

Since the tunable is used by built-in kernel code, we use
the module_param_cb macro to achieve this.

Signed-off-by: Suresh Warrier <warrier@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index c261f52..cef2b89 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -227,6 +227,7 @@ extern void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu,
 				 struct kvm_vcpu *vcpu);
 extern void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu,
 				   struct kvmppc_book3s_shadow_vcpu *svcpu);
+extern int kvm_irq_bypass;
 
 static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index e0ada31..97b9bad 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -461,7 +461,7 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
 static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
 				struct kvm *kvm)
 {
-	if (kvm)
+	if (kvm && kvm_irq_bypass)
 		return kvm->arch.pimap;
 	return NULL;
 }
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 90beb96..03721ed 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -95,6 +95,10 @@ static struct kernel_param_ops module_param_ops = {
 	.get = param_get_int,
 };
 
+module_param_cb(kvm_irq_bypass, &module_param_ops, &kvm_irq_bypass,
+							S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(kvm_irq_bypass, "Bypass passthrough interrupt optimization");
+
 module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
 							S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
@@ -3443,6 +3447,9 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
 	struct irq_chip *chip;
 	int i;
 
+	if (!kvm_irq_bypass)
+		return 1;
+
 	desc = irq_to_desc(host_irq);
 	if (!desc)
 		return -EIO;
@@ -3519,6 +3526,9 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
 	struct kvmppc_passthru_irqmap *pimap;
 	int i;
 
+	if (!kvm_irq_bypass)
+		return 0;
+
 	desc = irq_to_desc(host_irq);
 	if (!desc)
 		return -EIO;
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 3b8d7ac..00b9dfd 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -27,6 +27,8 @@
 
 int h_ipi_redirect = 1;
 EXPORT_SYMBOL(h_ipi_redirect);
+int kvm_irq_bypass = 1;
+EXPORT_SYMBOL(kvm_irq_bypass);
 
 static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 			    u32 new_irq);
-- 
cgit v0.10.2


From 366274f59c4de018f72ab44bb41ccaf3d657eb52 Mon Sep 17 00:00:00 2001
From: Suresh Warrier <warrier@linux.vnet.ibm.com>
Date: Fri, 19 Aug 2016 15:35:55 +1000
Subject: KVM: PPC: Book3S HV: Update irq stats for IRQs handled in real mode

When a passthrough IRQ is handled completely within KVM real
mode code, it has to also update the IRQ stats since this
does not go through the generic IRQ handling code.

However, the per CPU kstat_irqs field is an allocated (not static)
field and so cannot be directly accessed in real mode safely.

The function this_cpu_inc_rm() is introduced to safely increment
per CPU fields (currently coded for unsigned integers only) that
are allocated and could thus be vmalloced also.

Signed-off-by: Suresh Warrier <warrier@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 00b9dfd..554cdfa 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/kvm_host.h>
 #include <linux/err.h>
+#include <linux/kernel_stat.h>
 
 #include <asm/kvm_book3s.h>
 #include <asm/kvm_ppc.h>
@@ -18,6 +19,7 @@
 #include <asm/debug.h>
 #include <asm/synch.h>
 #include <asm/cputhreads.h>
+#include <asm/pgtable.h>
 #include <asm/ppc-opcode.h>
 #include <asm/pnv-pci.h>
 
@@ -734,6 +736,53 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr)
 	_stwcix(xics_phys + XICS_XIRR, xirr);
 }
 
+/*
+ * Increment a per-CPU 32-bit unsigned integer variable.
+ * Safe to call in real-mode. Handles vmalloc'ed addresses
+ *
+ * ToDo: Make this work for any integral type
+ */
+
+static inline void this_cpu_inc_rm(unsigned int __percpu *addr)
+{
+	unsigned long l;
+	unsigned int *raddr;
+	int cpu = smp_processor_id();
+
+	raddr = per_cpu_ptr(addr, cpu);
+	l = (unsigned long)raddr;
+
+	if (REGION_ID(l) == VMALLOC_REGION_ID) {
+		l = vmalloc_to_phys(raddr);
+		raddr = (unsigned int *)l;
+	}
+	++*raddr;
+}
+
+/*
+ * We don't try to update the flags in the irq_desc 'istate' field in
+ * here as would happen in the normal IRQ handling path for several reasons:
+ *  - state flags represent internal IRQ state and are not expected to be
+ *    updated outside the IRQ subsystem
+ *  - more importantly, these are useful for edge triggered interrupts,
+ *    IRQ probing, etc., but we are only handling MSI/MSIx interrupts here
+ *    and these states shouldn't apply to us.
+ *
+ * However, we do update irq_stats - we somewhat duplicate the code in
+ * kstat_incr_irqs_this_cpu() for this since this function is defined
+ * in irq/internal.h which we don't want to include here.
+ * The only difference is that desc->kstat_irqs is an allocated per CPU
+ * variable and could have been vmalloc'ed, so we can't directly
+ * call __this_cpu_inc() on it. The kstat structure is a static
+ * per CPU variable and it should be accessible by real-mode KVM.
+ *
+ */
+static void kvmppc_rm_handle_irq_desc(struct irq_desc *desc)
+{
+	this_cpu_inc_rm(desc->kstat_irqs);
+	__this_cpu_inc(kstat.irqs_sum);
+}
+
 long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
 				 u32 xirr,
 				 struct kvmppc_irq_map *irq_map,
@@ -747,6 +796,7 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
 	xics = vcpu->kvm->arch.xics;
 	icp = vcpu->arch.icp;
 
+	kvmppc_rm_handle_irq_desc(irq_map->desc);
 	icp_rm_deliver_irq(xics, icp, irq);
 
 	/* EOI the interrupt */
-- 
cgit v0.10.2


From 5d375199ea963fa2a972eae9c7d83db36ed37082 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 19 Aug 2016 15:35:56 +1000
Subject: KVM: PPC: Book3S HV: Set server for passed-through interrupts

When a guest has a PCI pass-through device with an interrupt, it
will direct the interrupt to a particular guest VCPU.  In fact the
physical interrupt might arrive on any CPU, and then get
delivered to the target VCPU in the emulated XICS (guest interrupt
controller), and eventually delivered to the target VCPU.

Now that we have code to handle device interrupts in real mode
without exiting to the host kernel, there is an advantage to having
the device interrupt arrive on the same sub(core) as the target
VCPU is running on.  In this situation, the interrupt can be
delivered to the target VCPU without any exit to the host kernel
(using a hypervisor doorbell interrupt between threads if
necessary).

This patch aims to get passed-through device interrupts arriving
on the correct core by setting the interrupt server in the real
hardware XICS for the interrupt to the first thread in the (sub)core
where its target VCPU is running.  We do this in the real-mode H_EOI
code because the H_EOI handler already needs to look at the
emulated ICS state for the interrupt (whereas the H_XIRR handler
doesn't), and we know we are running in the target VCPU context
at that point.

We set the server CPU in hardware using an OPAL call, regardless of
what the IRQ affinity mask for the interrupt says, and without
updating the affinity mask.  This amounts to saying that when an
interrupt is passed through to a guest, as a matter of policy we
allow the guest's affinity for the interrupt to override the host's.

This is inspired by an earlier patch from Suresh Warrier, although
none of this code came from that earlier patch.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 97b9bad..f6e4964 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -479,6 +479,10 @@ extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
 extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
 			struct kvm_vcpu *vcpu, u32 cpu);
 extern void kvmppc_xics_ipi_action(void);
+extern void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long guest_irq,
+				   unsigned long host_irq);
+extern void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
+				   unsigned long host_irq);
 extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, u32 xirr,
 				 struct kvmppc_irq_map *irq_map,
 				 struct kvmppc_passthru_irqmap *pimap);
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index ee05bd2..e958b70 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -67,6 +67,7 @@ int64_t opal_pci_config_write_half_word(uint64_t phb_id, uint64_t bus_dev_func,
 int64_t opal_pci_config_write_word(uint64_t phb_id, uint64_t bus_dev_func,
 				   uint64_t offset, uint32_t data);
 int64_t opal_set_xive(uint32_t isn, uint16_t server, uint8_t priority);
+int64_t opal_rm_set_xive(uint32_t isn, uint16_t server, uint8_t priority);
 int64_t opal_get_xive(uint32_t isn, __be16 *server, uint8_t *priority);
 int64_t opal_register_exception_handler(uint64_t opal_exception,
 					uint64_t handler_address,
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 03721ed..9b3bba6 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3515,6 +3515,8 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
 	if (i == pimap->n_mapped)
 		pimap->n_mapped++;
 
+	kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
+
 	mutex_unlock(&kvm->lock);
 
 	return 0;
@@ -3551,6 +3553,8 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
 		return -ENODEV;
 	}
 
+	kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
+
 	/* invalidate the entry */
 	pimap->mapped[i].r_hwirq = 0;
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 554cdfa..5f7527e 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -22,6 +22,7 @@
 #include <asm/pgtable.h>
 #include <asm/ppc-opcode.h>
 #include <asm/pnv-pci.h>
+#include <asm/opal.h>
 
 #include "book3s_xics.h"
 
@@ -34,6 +35,7 @@ EXPORT_SYMBOL(kvm_irq_bypass);
 
 static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 			    u32 new_irq);
+static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu);
 
 /* -- ICS routines -- */
 static void ics_rm_check_resend(struct kvmppc_xics *xics,
@@ -713,6 +715,13 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 		icp->rm_action |= XICS_RM_NOTIFY_EOI;
 		icp->rm_eoied_irq = irq;
 	}
+
+	if (state->host_irq && state->intr_cpu != -1) {
+		int pcpu = cpu_first_thread_sibling(raw_smp_processor_id());
+		if (state->intr_cpu != pcpu)
+			xics_opal_rm_set_server(state->host_irq, pcpu);
+		state->intr_cpu = -1;
+	}
  bail:
 	return check_too_hard(xics, icp);
 }
@@ -736,6 +745,13 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr)
 	_stwcix(xics_phys + XICS_XIRR, xirr);
 }
 
+static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu)
+{
+	unsigned int mangle_cpu = get_hard_smp_processor_id(server_cpu) << 2;
+
+	return opal_rm_set_xive(hw_irq, mangle_cpu, DEFAULT_PRIORITY);
+}
+
 /*
  * Increment a per-CPU 32-bit unsigned integer variable.
  * Safe to call in real-mode. Handles vmalloc'ed addresses
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index be5179c..3bdc639 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -99,6 +99,10 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
 		return 0;
 	}
 
+	/* Record which CPU this arrived on for passed-through interrupts */
+	if (state->host_irq)
+		state->intr_cpu = raw_smp_processor_id();
+
 	/* Attempt delivery */
 	icp_deliver_irq(xics, NULL, irq);
 
@@ -1438,3 +1442,34 @@ int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
 {
 	return pin;
 }
+
+void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long irq,
+			    unsigned long host_irq)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_ics *ics;
+	u16 idx;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &idx);
+	if (!ics)
+		return;
+
+	ics->irq_state[idx].host_irq = host_irq;
+	ics->irq_state[idx].intr_cpu = -1;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xics_set_mapped);
+
+void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long irq,
+			    unsigned long host_irq)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_ics *ics;
+	u16 idx;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &idx);
+	if (!ics)
+		return;
+
+	ics->irq_state[idx].host_irq = 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xics_clr_mapped);
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
index a46b954..2a50320 100644
--- a/arch/powerpc/kvm/book3s_xics.h
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -42,6 +42,8 @@ struct ics_irq_state {
 	u8  lsi;		/* level-sensitive interrupt */
 	u8  asserted; /* Only for LSI */
 	u8  exists;
+	int intr_cpu;
+	u32 host_irq;
 };
 
 /* Atomic ICP state, updated with a single compare & swap */
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 3d29d40..44d2d84 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -208,6 +208,7 @@ OPAL_CALL(opal_pci_config_write_byte,		OPAL_PCI_CONFIG_WRITE_BYTE);
 OPAL_CALL(opal_pci_config_write_half_word,	OPAL_PCI_CONFIG_WRITE_HALF_WORD);
 OPAL_CALL(opal_pci_config_write_word,		OPAL_PCI_CONFIG_WRITE_WORD);
 OPAL_CALL(opal_set_xive,			OPAL_SET_XIVE);
+OPAL_CALL_REAL(opal_rm_set_xive,		OPAL_SET_XIVE);
 OPAL_CALL(opal_get_xive,			OPAL_GET_XIVE);
 OPAL_CALL(opal_register_exception_handler,	OPAL_REGISTER_OPAL_EXCEPTION_HANDLER);
 OPAL_CALL(opal_pci_eeh_freeze_status,		OPAL_PCI_EEH_FREEZE_STATUS);
-- 
cgit v0.10.2


From 65e7026a6c90484fbaa076d2c51e61baf7241960 Mon Sep 17 00:00:00 2001
From: Suresh Warrier <warrier@linux.vnet.ibm.com>
Date: Fri, 19 Aug 2016 15:35:57 +1000
Subject: KVM: PPC: Book3S HV: Counters for passthrough IRQ stats

Add VCPU stat counters to track affinity for passthrough
interrupts.

pthru_all: Counts all passthrough interrupts whose IRQ mappings are
           in the kvmppc_passthru_irq_map structure.
pthru_host: Counts all cached passthrough interrupts that were injected
	    from the host through kvm_set_irq (i.e. not handled in
	    real mode).
pthru_bad_aff: Counts how many cached passthrough interrupts have
               bad affinity (receiving CPU is not running VCPU that is
	       the target of the virtual interrupt in the guest).

Signed-off-by: Suresh Warrier <warrier@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 89ac1f6..ed30d2e 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -137,6 +137,9 @@ struct kvm_vcpu_stat {
 	u64 ld_slow;
 	u64 st_slow;
 #endif
+	u64 pthru_all;
+	u64 pthru_host;
+	u64 pthru_bad_aff;
 };
 
 enum kvm_exit_types {
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 71eb8f3..ba231a1 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -68,6 +68,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "ld_slow",     VCPU_STAT(ld_slow) },
 	{ "st",          VCPU_STAT(st) },
 	{ "st_slow",     VCPU_STAT(st_slow) },
+	{ "pthru_all",       VCPU_STAT(pthru_all) },
+	{ "pthru_host",      VCPU_STAT(pthru_host) },
+	{ "pthru_bad_aff",   VCPU_STAT(pthru_bad_aff) },
 	{ NULL }
 };
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 5f7527e..82ff5de 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -716,11 +716,19 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 		icp->rm_eoied_irq = irq;
 	}
 
-	if (state->host_irq && state->intr_cpu != -1) {
-		int pcpu = cpu_first_thread_sibling(raw_smp_processor_id());
-		if (state->intr_cpu != pcpu)
-			xics_opal_rm_set_server(state->host_irq, pcpu);
-		state->intr_cpu = -1;
+	if (state->host_irq) {
+		++vcpu->stat.pthru_all;
+		if (state->intr_cpu != -1) {
+			int pcpu = raw_smp_processor_id();
+
+			pcpu = cpu_first_thread_sibling(pcpu);
+			++vcpu->stat.pthru_host;
+			if (state->intr_cpu != pcpu) {
+				++vcpu->stat.pthru_bad_aff;
+				xics_opal_rm_set_server(state->host_irq, pcpu);
+			}
+			state->intr_cpu = -1;
+		}
 	}
  bail:
 	return check_too_hard(xics, icp);
-- 
cgit v0.10.2


From f3c0ce86a8da6a43c3551a2223abd42040316dd8 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Sun, 28 Aug 2016 16:30:07 +0200
Subject: KVM: PPC: e500: Use kmalloc_array() in kvm_vcpu_ioctl_config_tlb()

* A multiplication for the size determination of a memory allocation
  indicated that an array data structure should be processed.
  Thus use the corresponding function "kmalloc_array".

  This issue was detected by using the Coccinelle software.

* Replace the specification of a data type by a pointer dereference
  to make the corresponding size determination a bit safer according to
  the Linux coding style convention.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c
index 29911a0..26f3737 100644
--- a/arch/powerpc/kvm/e500_mmu.c
+++ b/arch/powerpc/kvm/e500_mmu.c
@@ -779,7 +779,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 
 	num_pages = DIV_ROUND_UP(cfg->array + array_len - 1, PAGE_SIZE) -
 		    cfg->array / PAGE_SIZE;
-	pages = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL);
+	pages = kmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL);
 	if (!pages)
 		return -ENOMEM;
 
-- 
cgit v0.10.2


From 46d4e7479252d3fd82a0ea121daef75164a91939 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Sun, 28 Aug 2016 17:34:46 +0200
Subject: KVM: PPC: e500: Less function calls in kvm_vcpu_ioctl_config_tlb()
 after error detection

The kfree() function was called in two cases by the
kvm_vcpu_ioctl_config_tlb() function during error handling
even if the passed data structure element contained a null pointer.

* Split a condition check for memory allocation failures.

* Adjust jump targets according to the Linux coding style convention.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c
index 26f3737..b65a894 100644
--- a/arch/powerpc/kvm/e500_mmu.c
+++ b/arch/powerpc/kvm/e500_mmu.c
@@ -785,35 +785,39 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 
 	ret = get_user_pages_fast(cfg->array, num_pages, 1, pages);
 	if (ret < 0)
-		goto err_pages;
+		goto free_pages;
 
 	if (ret != num_pages) {
 		num_pages = ret;
 		ret = -EFAULT;
-		goto err_put_page;
+		goto put_pages;
 	}
 
 	virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL);
 	if (!virt) {
 		ret = -ENOMEM;
-		goto err_put_page;
+		goto put_pages;
 	}
 
 	privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0],
 			   GFP_KERNEL);
+	if (!privs[0]) {
+		ret = -ENOMEM;
+		goto put_pages;
+	}
+
 	privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1],
 			   GFP_KERNEL);
-
-	if (!privs[0] || !privs[1]) {
+	if (!privs[1]) {
 		ret = -ENOMEM;
-		goto err_privs;
+		goto free_privs_first;
 	}
 
 	g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1],
 	                     GFP_KERNEL);
 	if (!g2h_bitmap) {
 		ret = -ENOMEM;
-		goto err_privs;
+		goto free_privs_second;
 	}
 
 	free_gtlb(vcpu_e500);
@@ -845,16 +849,14 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 
 	kvmppc_recalc_tlb1map_range(vcpu_e500);
 	return 0;
-
-err_privs:
-	kfree(privs[0]);
+ free_privs_second:
 	kfree(privs[1]);
-
-err_put_page:
+ free_privs_first:
+	kfree(privs[0]);
+ put_pages:
 	for (i = 0; i < num_pages; i++)
 		put_page(pages[i]);
-
-err_pages:
+ free_pages:
 	kfree(pages);
 	return ret;
 }
-- 
cgit v0.10.2


From cfb60813fb8363a1681da2c10947e5f5b4165c47 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Sun, 28 Aug 2016 17:37:10 +0200
Subject: KVM: PPC: e500: Delete an unnecessary initialisation in
 kvm_vcpu_ioctl_config_tlb()

The local variable "g2h_bitmap" will be set to an appropriate value
a bit later. Thus omit the explicit initialisation at the beginning.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c
index b65a894..e9c19e9 100644
--- a/arch/powerpc/kvm/e500_mmu.c
+++ b/arch/powerpc/kvm/e500_mmu.c
@@ -743,7 +743,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 	char *virt;
 	struct page **pages;
 	struct tlbe_priv *privs[2] = {};
-	u64 *g2h_bitmap = NULL;
+	u64 *g2h_bitmap;
 	size_t array_len;
 	u32 sets;
 	int num_pages, ret, i;
-- 
cgit v0.10.2


From b0ac477bc4efd6b9a3eccc106e597edf27546c11 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Sun, 28 Aug 2016 18:30:38 +0200
Subject: KVM: PPC: e500: Replace kzalloc() calls by kcalloc() in two functions

* A multiplication for the size determination of a memory allocation
  indicated that an array data structure should be processed.
  Thus use the corresponding function "kcalloc".

  Suggested-by: Paolo Bonzini <pbonzini@redhat.com>

  This issue was detected also by using the Coccinelle software.

* Replace the specification of data structures by pointer dereferences
  to make the corresponding size determination a bit safer according to
  the Linux coding style convention.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c
index e9c19e9..2be2afc4 100644
--- a/arch/powerpc/kvm/e500_mmu.c
+++ b/arch/powerpc/kvm/e500_mmu.c
@@ -799,22 +799,21 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 		goto put_pages;
 	}
 
-	privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0],
-			   GFP_KERNEL);
+	privs[0] = kcalloc(params.tlb_sizes[0], sizeof(*privs[0]), GFP_KERNEL);
 	if (!privs[0]) {
 		ret = -ENOMEM;
 		goto put_pages;
 	}
 
-	privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1],
-			   GFP_KERNEL);
+	privs[1] = kcalloc(params.tlb_sizes[1], sizeof(*privs[1]), GFP_KERNEL);
 	if (!privs[1]) {
 		ret = -ENOMEM;
 		goto free_privs_first;
 	}
 
-	g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1],
-	                     GFP_KERNEL);
+	g2h_bitmap = kcalloc(params.tlb_sizes[1],
+			     sizeof(*g2h_bitmap),
+			     GFP_KERNEL);
 	if (!g2h_bitmap) {
 		ret = -ENOMEM;
 		goto free_privs_second;
@@ -929,20 +928,20 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 	vcpu_e500->gtlb_offset[0] = 0;
 	vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE;
 
-	vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) *
-					  vcpu_e500->gtlb_params[0].entries,
+	vcpu_e500->gtlb_priv[0] = kcalloc(vcpu_e500->gtlb_params[0].entries,
+					  sizeof(struct tlbe_ref),
 					  GFP_KERNEL);
 	if (!vcpu_e500->gtlb_priv[0])
 		goto err;
 
-	vcpu_e500->gtlb_priv[1] = kzalloc(sizeof(struct tlbe_ref) *
-					  vcpu_e500->gtlb_params[1].entries,
+	vcpu_e500->gtlb_priv[1] = kcalloc(vcpu_e500->gtlb_params[1].entries,
+					  sizeof(struct tlbe_ref),
 					  GFP_KERNEL);
 	if (!vcpu_e500->gtlb_priv[1])
 		goto err;
 
-	vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(u64) *
-					  vcpu_e500->gtlb_params[1].entries,
+	vcpu_e500->g2h_tlb1_map = kcalloc(vcpu_e500->gtlb_params[1].entries,
+					  sizeof(*vcpu_e500->g2h_tlb1_map),
 					  GFP_KERNEL);
 	if (!vcpu_e500->g2h_tlb1_map)
 		goto err;
-- 
cgit v0.10.2


From 90235dc19e5ccd85ad17ebbf51041770e87540a0 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Sun, 28 Aug 2016 18:40:08 +0200
Subject: KVM: PPC: e500: Use kmalloc_array() in kvmppc_e500_tlb_init()

* A multiplication for the size determination of a memory allocation
  indicated that an array data structure should be processed.
  Thus use the corresponding function "kmalloc_array".

* Replace the specification of a data structure by a pointer dereference
  to make the corresponding size determination a bit safer according to
  the Linux coding style convention.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c
index 2be2afc4..0a2eeb1 100644
--- a/arch/powerpc/kvm/e500_mmu.c
+++ b/arch/powerpc/kvm/e500_mmu.c
@@ -905,8 +905,6 @@ static int vcpu_mmu_init(struct kvm_vcpu *vcpu,
 int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
 	struct kvm_vcpu *vcpu = &vcpu_e500->vcpu;
-	int entry_size = sizeof(struct kvm_book3e_206_tlb_entry);
-	int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE;
 
 	if (e500_mmu_host_init(vcpu_e500))
 		goto err;
@@ -921,7 +919,10 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 	vcpu_e500->gtlb_params[1].ways = KVM_E500_TLB1_SIZE;
 	vcpu_e500->gtlb_params[1].sets = 1;
 
-	vcpu_e500->gtlb_arch = kmalloc(entries * entry_size, GFP_KERNEL);
+	vcpu_e500->gtlb_arch = kmalloc_array(KVM_E500_TLB0_SIZE +
+					     KVM_E500_TLB1_SIZE,
+					     sizeof(*vcpu_e500->gtlb_arch),
+					     GFP_KERNEL);
 	if (!vcpu_e500->gtlb_arch)
 		return -ENOMEM;
 
-- 
cgit v0.10.2


From aad9e5ba243336d2b133e7fa9aace1a51d6fdae6 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Mon, 12 Sep 2016 22:33:53 +0200
Subject: KVM: PPC: e500: Rename jump labels in kvmppc_e500_tlb_init()

Adjust jump labels according to the current Linux coding style convention.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c
index 0a2eeb1..ddbf8f0 100644
--- a/arch/powerpc/kvm/e500_mmu.c
+++ b/arch/powerpc/kvm/e500_mmu.c
@@ -907,7 +907,7 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 	struct kvm_vcpu *vcpu = &vcpu_e500->vcpu;
 
 	if (e500_mmu_host_init(vcpu_e500))
-		goto err;
+		goto free_vcpu;
 
 	vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE;
 	vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE;
@@ -933,26 +933,25 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 					  sizeof(struct tlbe_ref),
 					  GFP_KERNEL);
 	if (!vcpu_e500->gtlb_priv[0])
-		goto err;
+		goto free_vcpu;
 
 	vcpu_e500->gtlb_priv[1] = kcalloc(vcpu_e500->gtlb_params[1].entries,
 					  sizeof(struct tlbe_ref),
 					  GFP_KERNEL);
 	if (!vcpu_e500->gtlb_priv[1])
-		goto err;
+		goto free_vcpu;
 
 	vcpu_e500->g2h_tlb1_map = kcalloc(vcpu_e500->gtlb_params[1].entries,
 					  sizeof(*vcpu_e500->g2h_tlb1_map),
 					  GFP_KERNEL);
 	if (!vcpu_e500->g2h_tlb1_map)
-		goto err;
+		goto free_vcpu;
 
 	vcpu_mmu_init(vcpu, vcpu_e500->gtlb_params);
 
 	kvmppc_recalc_tlb1map_range(vcpu_e500);
 	return 0;
-
-err:
+ free_vcpu:
 	free_gtlb(vcpu_e500);
 	return -1;
 }
-- 
cgit v0.10.2