From 1f5580986a3667e9d67b65d916bb4249fd86a400 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Tue, 8 Oct 2013 18:38:13 +0100
Subject: ARM: KVM: Yield CPU when vcpu executes a WFE

On an (even slightly) oversubscribed system, spinlocks quickly become
a bottleneck, as some vcpus are spinning, waiting for a lock to be
released, while the vcpu holding the lock may not be running at all.

This creates contention, and the observed slowdown is 40x for
hackbench. No, this isn't a typo.

The solution is to trap blocking WFEs and tell KVM that we're now
spinning. This ensures that other vcpus will get a scheduling boost,
allowing the lock to be released more quickly. Also, using
CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT slightly improves the performance
when the VM is severely overcommitted.

Quick test to estimate the performance: hackbench 1 process 1000

2xA15 host (baseline):   1.843s

2xA15 guest w/o patch:   2.083s
4xA15 guest w/o patch:  80.212s
8xA15 guest w/o patch:  Could not be bothered to find out

2xA15 guest w/ patch:    2.102s
4xA15 guest w/ patch:    3.205s
8xA15 guest w/ patch:    6.887s

So we go from a 40x degradation to 1.5x in the 2x overcommit case,
which is vaguely more acceptable.

Signed-off-by: Marc Zyngier
Signed-off-by: Christoffer Dall

diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index d556f03..fe395b7 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -67,7 +67,7 @@
  */
 #define HCR_GUEST_MASK (HCR_TSC | HCR_TSW | HCR_TWI | HCR_VM | HCR_BSU_IS | \
 			HCR_FB | HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | \
-			HCR_SWIO | HCR_TIDCP)
+			HCR_TWE | HCR_SWIO | HCR_TIDCP)
 #define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF)
 
 /* System Control Register (SCTLR) bits */
@@ -208,6 +208,8 @@
 #define HSR_EC_DABT	(0x24)
 #define HSR_EC_DABT_HYP	(0x25)
 
+#define HSR_WFI_IS_WFE	(1U << 0)
+
 #define HSR_HVC_IMM_MASK	((1UL << 16) - 1)
 
 #define HSR_DABT_S1PTW		(1U << 7)
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index ebf5015..466bd29 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -20,6 +20,7 @@ config KVM
 	bool "Kernel-based Virtual Machine (KVM) support"
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
+	select HAVE_KVM_CPU_RELAX_INTERCEPT
 	select KVM_MMIO
 	select KVM_ARM_HOST
 	depends on ARM_VIRT_EXT && ARM_LPAE
diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c
index df4c82d..c4c496f 100644
--- a/arch/arm/kvm/handle_exit.c
+++ b/arch/arm/kvm/handle_exit.c
@@ -84,7 +84,11 @@ static int handle_dabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run)
 static int kvm_handle_wfi(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
 	trace_kvm_wfi(*vcpu_pc(vcpu));
-	kvm_vcpu_block(vcpu);
+	if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE)
+		kvm_vcpu_on_spin(vcpu);
+	else
+		kvm_vcpu_block(vcpu);
+
 	return 1;
 }
--
cgit v0.10.2
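For context on why trapping WFE helps: guest spinlock implementations
(such as the arch/arm ticket lock) execute WFE while waiting for the
holder, and the unlock path issues SEV to wake the waiters. A minimal
sketch of that idiom, with made-up helper names -- illustrative only,
not code from this series:

/*
 * Illustrative sketch (not from the patch): how a guest ends up
 * executing a blocking WFE inside a spinlock's wait loop.
 */
static inline void demo_spin_lock(volatile int *lock)
{
	while (__sync_lock_test_and_set(lock, 1)) {
		/* Sleep until another CPU sends an event (SEV). */
		asm volatile("wfe" ::: "memory");
	}
}

static inline void demo_spin_unlock(volatile int *lock)
{
	__sync_lock_release(lock);		/* store 0 with release semantics */
	asm volatile("dsb\n\tsev" ::: "memory");	/* wake up WFE waiters */
}

With HCR.TWE set by the first hunk above, that waiting WFE traps to hyp
mode under the shared WFI/WFE exception class, with bit 0 of the HSR
(HSR_WFI_IS_WFE) distinguishing the two -- exactly what the handler now
checks before calling kvm_vcpu_on_spin().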
From 82ea046c95a3c3ddcfa058c8a270b9afb6e93700 Mon Sep 17 00:00:00 2001
From: Christoffer Dall
Date: Tue, 15 Oct 2013 18:10:42 -0700
Subject: KVM: ARM: Update comments for kvm_handle_wfi

Update comments to reflect what is really going on and add the TWE bit
to the comments in kvm_arm.h.

Also renames the function to kvm_handle_wfx, as is done on arm64, for
consistency and uber-correctness.

Signed-off-by: Christoffer Dall

diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index fe395b7..1d3153c 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -57,6 +57,7 @@
  * TSC:		Trap SMC
  * TSW:		Trap cache operations by set/way
  * TWI:		Trap WFI
+ * TWE:		Trap WFE
  * TIDCP:	Trap L2CTLR/L2ECTLR
  * BSU_IS:	Upgrade barriers to the inner shareable domain
  * FB:		Force broadcast of all maintainance operations
diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c
index c4c496f..a920790 100644
--- a/arch/arm/kvm/handle_exit.c
+++ b/arch/arm/kvm/handle_exit.c
@@ -73,15 +73,17 @@ static int handle_dabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run)
 }
 
 /**
- * kvm_handle_wfi - handle a wait-for-interrupts instruction executed by a guest
+ * kvm_handle_wfx - handle a WFI or WFE instructions trapped in guests
  * @vcpu:	the vcpu pointer
  * @run:	the kvm_run structure pointer
  *
- * Simply sets the wait_for_interrupts flag on the vcpu structure, which will
- * halt execution of world-switches and schedule other host processes until
- * there is an incoming IRQ or FIQ to the VM.
+ * WFE: Yield the CPU and come back to this vcpu when the scheduler
+ * decides to.
+ * WFI: Simply call kvm_vcpu_block(), which will halt execution of
+ * world-switches and schedule other host processes until there is an
+ * incoming IRQ or FIQ to the VM.
  */
-static int kvm_handle_wfi(struct kvm_vcpu *vcpu, struct kvm_run *run)
+static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
 	trace_kvm_wfi(*vcpu_pc(vcpu));
 	if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE)
@@ -93,7 +95,7 @@ static int kvm_handle_wfi(struct kvm_vcpu *vcpu, struct kvm_run *run)
 }
 
 static exit_handle_fn arm_exit_handlers[] = {
-	[HSR_EC_WFI]		= kvm_handle_wfi,
+	[HSR_EC_WFI]		= kvm_handle_wfx,
 	[HSR_EC_CP15_32]	= kvm_handle_cp15_32,
 	[HSR_EC_CP15_64]	= kvm_handle_cp15_64,
 	[HSR_EC_CP14_MR]	= kvm_handle_cp14_access,
--
cgit v0.10.2
From 2d1d841bd44e24b58a3d3cc4fa793670aaa38fbf Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Fri, 18 Oct 2013 18:19:04 +0100
Subject: ARM: KVM: Fix MPIDR computing to support virtual clusters

In order to be able to support more than 4 A7 or A15 CPUs, we need to
fix the MPIDR computing to reflect the fact that both A15 and A7 can
only exist in clusters of at most 4 CPUs.

Fix the MPIDR computing to allow virtual clusters to be exposed to
the guest.

Signed-off-by: Marc Zyngier
Signed-off-by: Christoffer Dall

diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
index a629f2c..631e6bd 100644
--- a/arch/arm/kvm/coproc.c
+++ b/arch/arm/kvm/coproc.c
@@ -74,11 +74,13 @@ int kvm_handle_cp14_access(struct kvm_vcpu *vcpu, struct kvm_run *run)
 static void reset_mpidr(struct kvm_vcpu *vcpu, const struct coproc_reg *r)
 {
 	/*
-	 * Compute guest MPIDR. No need to mess around with different clusters
-	 * but we read the 'U' bit from the underlying hardware directly.
+	 * Compute guest MPIDR. We build a virtual cluster out of the
+	 * vcpu_id, but we read the 'U' bit from the underlying
+	 * hardware directly.
 	 */
-	vcpu->arch.cp15[c0_MPIDR] = (read_cpuid_mpidr() & MPIDR_SMP_BITMASK)
-					| vcpu->vcpu_id;
+	vcpu->arch.cp15[c0_MPIDR] = ((read_cpuid_mpidr() & MPIDR_SMP_BITMASK) |
+				     ((vcpu->vcpu_id >> 2) << MPIDR_LEVEL_BITS) |
+				     (vcpu->vcpu_id & 3));
 }
 
 /* TRM entries A7:4.3.31 A15:4.3.28 - RO WI */
--
cgit v0.10.2
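To see what the new encoding produces, here is a small standalone check
(a hypothetical test harness, not part of the series; it assumes
MPIDR_LEVEL_BITS is 8, as defined for 32-bit ARM in asm/cputype.h):

#include <assert.h>

#define MPIDR_LEVEL_BITS 8	/* assumption: matches asm/cputype.h */

/* Affinity part of the guest MPIDR, as computed by reset_mpidr(). */
static unsigned long virt_mpidr_affinity(int vcpu_id)
{
	return ((vcpu_id >> 2) << MPIDR_LEVEL_BITS) | (vcpu_id & 3);
}

int main(void)
{
	/* vcpu_ids split into 4-CPU virtual clusters: Aff1 = cluster, Aff0 = cpu. */
	assert(virt_mpidr_affinity(0) == 0x000);	/* cluster 0, cpu 0 */
	assert(virt_mpidr_affinity(3) == 0x003);	/* cluster 0, cpu 3 */
	assert(virt_mpidr_affinity(4) == 0x100);	/* cluster 1, cpu 0 */
	assert(virt_mpidr_affinity(5) == 0x101);	/* cluster 1, cpu 1 */
	return 0;
}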
From 9cbb6d969cb6561de45d917b8bb9281cb374bb35 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Fri, 18 Oct 2013 18:19:05 +0100
Subject: ARM: KVM: fix L2CTLR to be per-cluster

The L2CTLR register contains the number of CPUs in this cluster.

Make sure the register content is actually relevant to the vcpu that
is being configured by computing the number of cores that are part of
its cluster.

Signed-off-by: Marc Zyngier
Signed-off-by: Christoffer Dall

diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
index 631e6bd..78c0885 100644
--- a/arch/arm/kvm/coproc.c
+++ b/arch/arm/kvm/coproc.c
@@ -124,6 +124,10 @@ static void reset_l2ctlr(struct kvm_vcpu *vcpu, const struct coproc_reg *r)
 	asm volatile("mrc p15, 1, %0, c9, c0, 2\n" : "=r" (l2ctlr));
 	l2ctlr &= ~(3 << 24);
 	ncores = atomic_read(&vcpu->kvm->online_vcpus) - 1;
+	/* How many cores in the current cluster and the next ones */
+	ncores -= (vcpu->vcpu_id & ~3);
+	/* Cap it to the maximum number of cores in a single cluster */
+	ncores = min(ncores, 3U);
 	l2ctlr |= (ncores & 3) << 24;
 
 	vcpu->arch.cp15[c9_L2CTLR] = l2ctlr;
--
cgit v0.10.2
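Worked example of the per-cluster core count (a hypothetical standalone
rewrite of the arithmetic above, not code from the series; L2CTLR bits
[25:24] encode "number of cores minus one"):

#include <stdio.h>

static unsigned int l2ctlr_ncores_field(unsigned int online_vcpus,
					unsigned int vcpu_id)
{
	unsigned int ncores = online_vcpus - 1;

	ncores -= (vcpu_id & ~3);	/* drop cores in earlier clusters */
	if (ncores > 3)			/* cap to a 4-core cluster */
		ncores = 3;
	return ncores & 3;
}

int main(void)
{
	/* 7 vcpus: cluster 0 holds cpus 0-3, cluster 1 holds cpus 4-6. */
	printf("%u\n", l2ctlr_ncores_field(7, 1));	/* 3 -> 4-core cluster 0 */
	printf("%u\n", l2ctlr_ncores_field(7, 5));	/* 2 -> 3-core cluster 1 */
	return 0;
}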
From 7999b4d18211bcfb40e3574cf75e94518e9fa2c6 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Fri, 18 Oct 2013 18:19:06 +0100
Subject: ARM: KVM: drop limitation to 4 CPU VMs

Now that the KVM/arm code knows about affinity, remove the hard limit
of 4 vcpus per VM.

Signed-off-by: Marc Zyngier
Signed-off-by: Christoffer Dall

diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c
index d9bbd83..2c5add0 100644
--- a/arch/arm/kvm/reset.c
+++ b/arch/arm/kvm/reset.c
@@ -33,8 +33,6 @@
  * Cortex-A15 and Cortex-A7 Reset Values
  */
 
-static const int cortexa_max_cpu_idx = 3;
-
 static struct kvm_regs cortexa_regs_reset = {
 	.usr_regs.ARM_cpsr = SVC_MODE | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT,
 };
@@ -64,8 +62,6 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 	switch (vcpu->arch.target) {
 	case KVM_ARM_TARGET_CORTEX_A7:
 	case KVM_ARM_TARGET_CORTEX_A15:
-		if (vcpu->vcpu_id > cortexa_max_cpu_idx)
-			return -EINVAL;
 		cpu_reset = &cortexa_regs_reset;
 		vcpu->arch.midr = read_cpuid_id();
 		cpu_vtimer_irq = &cortexa_vtimer_irq;
--
cgit v0.10.2
From 79c648806f9034abf54332b78043bb242189d953 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Fri, 18 Oct 2013 18:19:03 +0100
Subject: arm/arm64: KVM: PSCI: use MPIDR to identify a target CPU

The KVM PSCI code blindly assumes that vcpu_id and MPIDR are the same
thing. This is true when vcpus are organized as a flat topology, but
is wrong when trying to emulate any other topology (such as A15
clusters).

Change the KVM PSCI CPU_ON code to look at the MPIDR instead of the
vcpu_id to pick a target CPU.

Signed-off-by: Marc Zyngier
Signed-off-by: Christoffer Dall

diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index a464e8d..708e4d8 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -157,4 +157,9 @@ static inline u32 kvm_vcpu_hvc_get_imm(struct kvm_vcpu *vcpu)
 	return kvm_vcpu_get_hsr(vcpu) & HSR_HVC_IMM_MASK;
 }
 
+static inline unsigned long kvm_vcpu_get_mpidr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.cp15[c0_MPIDR];
+}
+
 #endif /* __ARM_KVM_EMULATE_H__ */
diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
index 86a693a..3112631 100644
--- a/arch/arm/kvm/psci.c
+++ b/arch/arm/kvm/psci.c
@@ -18,6 +18,7 @@
 #include <linux/kvm_host.h>
 #include <linux/wait.h>
 
+#include <asm/cputype.h>
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_psci.h>
 
@@ -34,22 +35,30 @@ static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
 static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
 {
 	struct kvm *kvm = source_vcpu->kvm;
-	struct kvm_vcpu *vcpu;
+	struct kvm_vcpu *vcpu = NULL, *tmp;
 	wait_queue_head_t *wq;
 	unsigned long cpu_id;
+	unsigned long mpidr;
 	phys_addr_t target_pc;
+	int i;
 
 	cpu_id = *vcpu_reg(source_vcpu, 1);
 	if (vcpu_mode_is_32bit(source_vcpu))
 		cpu_id &= ~((u32) 0);
 
-	if (cpu_id >= atomic_read(&kvm->online_vcpus))
+	kvm_for_each_vcpu(i, tmp, kvm) {
+		mpidr = kvm_vcpu_get_mpidr(tmp);
+		if ((mpidr & MPIDR_HWID_BITMASK) == (cpu_id & MPIDR_HWID_BITMASK)) {
+			vcpu = tmp;
+			break;
+		}
+	}
+
+	if (!vcpu)
 		return KVM_PSCI_RET_INVAL;
 
 	target_pc = *vcpu_reg(source_vcpu, 2);
 
-	vcpu = kvm_get_vcpu(kvm, cpu_id);
-
 	wq = kvm_arch_vcpu_wq(vcpu);
 	if (!waitqueue_active(wq))
 		return KVM_PSCI_RET_INVAL;
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index eec0738..6df93cd 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -177,4 +177,9 @@ static inline u8 kvm_vcpu_trap_get_fault(const struct kvm_vcpu *vcpu)
 	return kvm_vcpu_get_hsr(vcpu) & ESR_EL2_FSC_TYPE;
 }
 
+static inline unsigned long kvm_vcpu_get_mpidr(struct kvm_vcpu *vcpu)
+{
+	return vcpu_sys_reg(vcpu, MPIDR_EL1);
+}
+
 #endif /* __ARM64_KVM_EMULATE_H__ */
--
cgit v0.10.2
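Guest-side illustration of the last patch (a sketch, not code from the
series): with this change, the value a guest passes in r1 to PSCI
CPU_ON is matched against each vcpu's MPIDR affinity rather than used
as a flat index. The function ID below assumes the KVM PSCI v0.1
convention (KVM_PSCI_FN(2) from the uapi headers); psci_cpu_on() and
secondary_entry are made-up names.

#define KVM_PSCI_FN_CPU_ON	0x95c1ba60	/* assumption: KVM_PSCI_FN(2) */

static long psci_cpu_on(unsigned long target_mpidr, unsigned long entry_pc)
{
	register unsigned long r0 asm("r0") = KVM_PSCI_FN_CPU_ON;
	register unsigned long r1 asm("r1") = target_mpidr;
	register unsigned long r2 asm("r2") = entry_pc;

	asm volatile("hvc #0"		/* trap to KVM's PSCI emulation */
		     : "+r" (r0)
		     : "r" (r1), "r" (r2)
		     : "memory");
	return r0;			/* KVM_PSCI_RET_* status */
}

/* e.g. power on cluster 1, cpu 1 (Aff1 = 1, Aff0 = 1):
 *	psci_cpu_on(0x101, (unsigned long)secondary_entry);
 */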