From 91269b8f94eedce1767b2f208d656e5a5683326a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 25 Jul 2010 14:51:16 +0300
Subject: KVM: x86 emulator: fix handling for unemulated instructions

If an instruction is present in the decode tables but not in the execution
switch, it will be emulated as a NOP.  An example is IRET (0xcf).

Fix by adding default: labels to the execution switches.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 66ca98a..70e47d3 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3028,6 +3028,8 @@ special_insn:
 		if (c->modrm_reg == 5)
 			goto jump_far;
 		goto grp45;
+	default:
+		goto cannot_emulate;
 	}
 
 writeback:
@@ -3353,6 +3355,8 @@ twobyte_insn:
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
+	default:
+		goto cannot_emulate;
 	}
 	goto writeback;
 
-- 
cgit v0.10.2


From 83babbca4617ab086621fe65a71a2168420f1d88 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 26 Jul 2010 14:37:39 +0300
Subject: KVM: x86 emulator: add macros for repetitive instructions

Some instructions are repetitive in the opcode space, add macros for
consolidating them.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 70e47d3..c5c42e0 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -94,6 +94,15 @@
 #define Src2One     (3<<29)
 #define Src2Mask    (7<<29)
 
+#define X2(x) (x), (x)
+#define X3(x) X2(x), (x)
+#define X4(x) X2(x), X2(x)
+#define X5(x) X4(x), (x)
+#define X6(x) X4(x), X2(x)
+#define X7(x) X4(x), X3(x)
+#define X8(x) X4(x), X4(x)
+#define X16(x) X8(x), X8(x)
+
 enum {
 	Group1_80, Group1_81, Group1_82, Group1_83,
 	Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
-- 
cgit v0.10.2


From 749358a6b4691bfd2abfa9e4be2142af4697de3a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 26 Jul 2010 14:37:40 +0300
Subject: KVM: x86 emulator: consolidate inc/dec reg decoding

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c5c42e0..65d8960 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -147,10 +147,8 @@ static u32 opcode_table[256] = {
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
 	ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
 	0, 0,
-	/* 0x40 - 0x47 */
-	DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
-	/* 0x48 - 0x4F */
-	DstReg, DstReg, DstReg, DstReg,	DstReg, DstReg, DstReg, DstReg,
+	/* 0x40 - 0x4F */
+	X16(DstReg),
 	/* 0x50 - 0x57 */
 	SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
 	SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
-- 
cgit v0.10.2


From 3849186c381e2e6291828579c382662520b44696 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 26 Jul 2010 14:37:41 +0300
Subject: KVM: x86 emulator: consolidate push/pop reg decoding

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 65d8960..68e5b73 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -150,11 +150,9 @@ static u32 opcode_table[256] = {
 	/* 0x40 - 0x4F */
 	X16(DstReg),
 	/* 0x50 - 0x57 */
-	SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
-	SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
+	X8(SrcReg | Stack),
 	/* 0x58 - 0x5F */
-	DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
-	DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
+	X8(DstReg | Stack),
 	/* 0x60 - 0x67 */
 	ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
 	0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
-- 
cgit v0.10.2


From b3ab3405fe3d40ae9c5350ee014c7c086fcf3d97 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 26 Jul 2010 14:37:42 +0300
Subject: KVM: x86 emulator: consolidate Jcc rel8 decoding

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 68e5b73..7870821 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -161,12 +161,8 @@ static u32 opcode_table[256] = {
 	SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
 	DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */
 	SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */
-	/* 0x70 - 0x77 */
-	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
-	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
-	/* 0x78 - 0x7F */
-	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
-	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
+	/* 0x70 - 0x7F */
+	X16(SrcImmByte),
 	/* 0x80 - 0x87 */
 	Group | Group1_80, Group | Group1_81,
 	Group | Group1_82, Group | Group1_83,
-- 
cgit v0.10.2


From b6e6153885d6463896d9b465e59b361eac60efa0 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 26 Jul 2010 14:37:43 +0300
Subject: KVM: x86 emulator: consolidate MOV reg, imm decoding

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7870821..a6ce7f1 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -188,15 +188,9 @@ static u32 opcode_table[256] = {
 	ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
 	ByteOp | DstDI | String, DstDI | String,
 	/* 0xB0 - 0xB7 */
-	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
-	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
-	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
-	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+	X8(ByteOp | DstReg | SrcImm | Mov),
 	/* 0xB8 - 0xBF */
-	DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
-	DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
-	DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
-	DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+	X8(DstReg | SrcImm | Mov),
 	/* 0xC0 - 0xC7 */
 	ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
 	0, ImplicitOps | Stack, 0, 0,
-- 
cgit v0.10.2


From be8eacddbd8ee60506a6f940b3efb93cb61d7861 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 26 Jul 2010 14:37:44 +0300
Subject: KVM: x86 emulator: consolidate CMOVcc decoding

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a6ce7f1..0526be1 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -238,16 +238,8 @@ static u32 twobyte_table[256] = {
 	ImplicitOps | Priv, 0, ImplicitOps | Priv, 0,
 	ImplicitOps, ImplicitOps | Priv, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0,
-	/* 0x40 - 0x47 */
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	/* 0x48 - 0x4F */
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+	/* 0x40 - 0x4F */
+	X16(DstReg | SrcMem | ModRM | Mov),
 	/* 0x50 - 0x5F */
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0x60 - 0x6F */
-- 
cgit v0.10.2


From 880a1883785d37287e13e4faf3fe92b294404de0 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 26 Jul 2010 14:37:45 +0300
Subject: KVM: x86 emulator: consolidate Jcc rel32 decoding

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0526be1..fd40735 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -247,8 +247,7 @@ static u32 twobyte_table[256] = {
 	/* 0x70 - 0x7F */
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0x80 - 0x8F */
-	SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
-	SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
+	X16(SrcImm),
 	/* 0x90 - 0x9F */
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0xA0 - 0xA7 */
-- 
cgit v0.10.2


From 2ce495365f6cdd5792c4db0ddb8ac8544950b671 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 26 Jul 2010 14:37:46 +0300
Subject: KVM: x86 emulator: Make group storage bits separate from operand bits

Currently group bits are stored in bits 0:7, where operand bits are stored.

Make group bits be 0:3, and move the existing bits 0:3 to 16:19, so we can
mix group and operand bits.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index fd40735..61139e2 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -46,15 +46,15 @@
  */
 
 /* Operand sizes: 8-bit operands or specified/overridden size. */
-#define ByteOp      (1<<0)	/* 8-bit operands. */
+#define ByteOp      (1<<16)	/* 8-bit operands. */
 /* Destination operand type. */
-#define ImplicitOps (1<<1)	/* Implicit in opcode. No generic decode. */
-#define DstReg      (2<<1)	/* Register operand. */
-#define DstMem      (3<<1)	/* Memory operand. */
-#define DstAcc      (4<<1)      /* Destination Accumulator */
-#define DstDI       (5<<1)	/* Destination is in ES:(E)DI */
-#define DstMem64    (6<<1)	/* 64bit memory operand */
-#define DstMask     (7<<1)
+#define ImplicitOps (1<<17)	/* Implicit in opcode. No generic decode. */
+#define DstReg      (2<<17)	/* Register operand. */
+#define DstMem      (3<<17)	/* Memory operand. */
+#define DstAcc      (4<<17)	/* Destination Accumulator */
+#define DstDI       (5<<17)	/* Destination is in ES:(E)DI */
+#define DstMem64    (6<<17)	/* 64bit memory operand */
+#define DstMask     (7<<17)
 /* Source operand type. */
 #define SrcNone     (0<<4)	/* No source operand. */
 #define SrcImplicit (0<<4)	/* Source operand is implicit in the opcode. */
@@ -82,7 +82,7 @@
 #define Stack       (1<<13)     /* Stack instruction (push/pop) */
 #define Group       (1<<14)     /* Bits 3:5 of modrm byte extend opcode */
 #define GroupDual   (1<<15)     /* Alternate decoding of mod == 3 */
-#define GroupMask   0xff        /* Group number stored in bits 0:7 */
+#define GroupMask   0x0f        /* Group number stored in bits 0:3 */
 /* Misc flags */
 #define Lock        (1<<26) /* lock prefix is allowed for the instruction */
 #define Priv        (1<<27) /* instruction generates #GP if current CPL != 0 */
-- 
cgit v0.10.2


From 047a4818094217a1323d8f31f9318ea2e142f745 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 26 Jul 2010 14:37:47 +0300
Subject: KVM: x86 emulator: add Undefined decode flag

Add a decode flag to indicate the instruction is invalid.  Will come in useful
later, when we mix decode bits from the opcode and group table.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 61139e2..b1e3e8c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -84,6 +84,7 @@
 #define GroupDual   (1<<15)     /* Alternate decoding of mod == 3 */
 #define GroupMask   0x0f        /* Group number stored in bits 0:3 */
 /* Misc flags */
+#define Undefined   (1<<25) /* No Such Instruction */
 #define Lock        (1<<26) /* lock prefix is allowed for the instruction */
 #define Priv        (1<<27) /* instruction generates #GP if current CPL != 0 */
 #define No64	    (1<<28)
@@ -1065,7 +1066,7 @@ done_prefixes:
 	}
 
 	/* Unrecognised? */
-	if (c->d == 0) {
+	if (c->d == 0 || (c->d & Undefined)) {
 		DPRINTF("Cannot emulate %02x\n", c->b);
 		return -1;
 	}
-- 
cgit v0.10.2


From 52811d7de565b2db988257591fbf2a6be31c1459 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 26 Jul 2010 14:37:48 +0300
Subject: KVM: x86 emulator: mix decode bits from opcode and group decode
 tables

Allow bits that are common to all members of a group to be specified in the
opcode table instead of the group table.  This allows some simplification
of the decode tables.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b1e3e8c..ef2b5af 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -955,7 +955,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 	int mode = ctxt->mode;
-	int def_op_bytes, def_ad_bytes, group;
+	int def_op_bytes, def_ad_bytes, group, dual;
 
 
 	/* we cannot decode insn before we complete previous rep insn */
@@ -1055,14 +1055,16 @@ done_prefixes:
 
 	if (c->d & Group) {
 		group = c->d & GroupMask;
+		dual = c->d & GroupDual;
 		c->modrm = insn_fetch(u8, 1, c->eip);
 		--c->eip;
 
 		group = (group << 3) + ((c->modrm >> 3) & 7);
-		if ((c->d & GroupDual) && (c->modrm >> 6) == 3)
-			c->d = group2_table[group];
+		c->d &= ~(Group | GroupDual | GroupMask);
+		if (dual && (c->modrm >> 6) == 3)
+			c->d |= group2_table[group];
 		else
-			c->d = group_table[group];
+			c->d |= group_table[group];
 	}
 
 	/* Unrecognised? */
-- 
cgit v0.10.2


From 4968ec4e26007770d8759fbface4d4712a27b5d4 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 26 Jul 2010 14:37:49 +0300
Subject: KVM: x86 emulator: simplify Group 1 decoding

Move operand decoding to the opcode table, keep lock decoding in the group
table.  This allows us to get consolidate the four variants of Group 1 into one
group.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index ef2b5af..1ce9c6d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -105,8 +105,7 @@
 #define X16(x) X8(x), X8(x)
 
 enum {
-	Group1_80, Group1_81, Group1_82, Group1_83,
-	Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
+	Group1, Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
 	Group8, Group9,
 };
 
@@ -165,8 +164,10 @@ static u32 opcode_table[256] = {
 	/* 0x70 - 0x7F */
 	X16(SrcImmByte),
 	/* 0x80 - 0x87 */
-	Group | Group1_80, Group | Group1_81,
-	Group | Group1_82, Group | Group1_83,
+	ByteOp | DstMem | SrcImm | ModRM | Group | Group1,
+	DstMem | SrcImm | ModRM | Group | Group1,
+	ByteOp | DstMem | SrcImm | ModRM | No64 | Group | Group1,
+	DstMem | SrcImmByte | ModRM | Group | Group1,
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
 	/* 0x88 - 0x8F */
@@ -285,42 +286,8 @@ static u32 twobyte_table[256] = {
 };
 
 static u32 group_table[] = {
-	[Group1_80*8] =
-	ByteOp | DstMem | SrcImm | ModRM | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | Lock,
-	ByteOp | DstMem | SrcImm | ModRM,
-	[Group1_81*8] =
-	DstMem | SrcImm | ModRM | Lock,
-	DstMem | SrcImm | ModRM | Lock,
-	DstMem | SrcImm | ModRM | Lock,
-	DstMem | SrcImm | ModRM | Lock,
-	DstMem | SrcImm | ModRM | Lock,
-	DstMem | SrcImm | ModRM | Lock,
-	DstMem | SrcImm | ModRM | Lock,
-	DstMem | SrcImm | ModRM,
-	[Group1_82*8] =
-	ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
-	ByteOp | DstMem | SrcImm | ModRM | No64,
-	[Group1_83*8] =
-	DstMem | SrcImmByte | ModRM | Lock,
-	DstMem | SrcImmByte | ModRM | Lock,
-	DstMem | SrcImmByte | ModRM | Lock,
-	DstMem | SrcImmByte | ModRM | Lock,
-	DstMem | SrcImmByte | ModRM | Lock,
-	DstMem | SrcImmByte | ModRM | Lock,
-	DstMem | SrcImmByte | ModRM | Lock,
-	DstMem | SrcImmByte | ModRM,
+	[Group1*8] =
+	X7(Lock), 0,
 	[Group1A*8] =
 	DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
 	[Group3_Byte*8] =
-- 
cgit v0.10.2


From dfe11481d8f1b6a7354c34cb252ff1a8af233cfe Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 26 Jul 2010 14:37:50 +0300
Subject: KVM: x86 emulator: Allow LOCK prefix for NEG and NOT

Opcodes F6/2, F6/3, F7/2, F7/3.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 1ce9c6d..bbe2d09 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -292,11 +292,11 @@ static u32 group_table[] = {
 	DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
 	[Group3_Byte*8] =
 	ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM,
-	ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+	ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
 	0, 0, 0, 0,
 	[Group3*8] =
 	DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
-	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+	DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,
 	0, 0, 0, 0,
 	[Group4*8] =
 	ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
-- 
cgit v0.10.2


From e071edd5ba8dd7a493eef229d495cf6232b09534 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 26 Jul 2010 14:37:51 +0300
Subject: KVM: x86 emulator: unify the two Group 3 variants

Use just one group table for byte (F6) and word (F7) opcodes.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index bbe2d09..7f615c5 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -105,8 +105,7 @@
 #define X16(x) X8(x), X8(x)
 
 enum {
-	Group1, Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
-	Group8, Group9,
+	Group1, Group1A, Group3, Group4, Group5, Group7, Group8, Group9,
 };
 
 static u32 opcode_table[256] = {
@@ -217,7 +216,7 @@ static u32 opcode_table[256] = {
 	SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
 	/* 0xF0 - 0xF7 */
 	0, 0, 0, 0,
-	ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
+	ImplicitOps | Priv, ImplicitOps, ByteOp | Group | Group3, Group | Group3,
 	/* 0xF8 - 0xFF */
 	ImplicitOps, 0, ImplicitOps, ImplicitOps,
 	ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
@@ -290,14 +289,10 @@ static u32 group_table[] = {
 	X7(Lock), 0,
 	[Group1A*8] =
 	DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
-	[Group3_Byte*8] =
-	ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM,
-	ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
-	0, 0, 0, 0,
 	[Group3*8] =
 	DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
 	DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,
-	0, 0, 0, 0,
+	X4(Undefined),
 	[Group4*8] =
 	ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
 	0, 0, 0, 0, 0, 0,
-- 
cgit v0.10.2


From d359192feaf02861327339a9dda6b2b2d765c2bc Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 26 Jul 2010 18:32:39 +0300
Subject: KVM: VMX: Use host_gdt variable wherever we need the host gdt

Now that we have the host gdt conveniently stored in a variable, make use
of it instead of querying the cpu.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7bddfab..751a2d2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -706,11 +706,10 @@ static void reload_tss(void)
 	/*
 	 * VT restores TR but not its size.  Useless.
 	 */
-	struct desc_ptr gdt;
+	struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
 	struct desc_struct *descs;
 
-	native_store_gdt(&gdt);
-	descs = (void *)gdt.address;
+	descs = (void *)gdt->address;
 	descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
 	load_TR_desc();
 }
@@ -753,7 +752,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
 
 static unsigned long segment_base(u16 selector)
 {
-	struct desc_ptr gdt;
+	struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
 	struct desc_struct *d;
 	unsigned long table_base;
 	unsigned long v;
@@ -761,8 +760,7 @@ static unsigned long segment_base(u16 selector)
 	if (!(selector & ~3))
 		return 0;
 
-	native_store_gdt(&gdt);
-	table_base = gdt.address;
+	table_base = gdt->address;
 
 	if (selector & 4) {           /* from ldt */
 		u16 ldt_selector = kvm_read_ldt();
@@ -897,7 +895,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	if (vcpu->cpu != cpu) {
-		struct desc_ptr dt;
+		struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
 		unsigned long sysenter_esp;
 
 		kvm_migrate_timers(vcpu);
@@ -913,8 +911,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		 * processors.
 		 */
 		vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
-		native_store_gdt(&dt);
-		vmcs_writel(HOST_GDTR_BASE, dt.address);   /* 22.2.4 */
+		vmcs_writel(HOST_GDTR_BASE, gdt->address);   /* 22.2.4 */
 
 		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
 		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
-- 
cgit v0.10.2


From 19ada5c4b6170bbc7ac4f2f38dba0068fdc7755a Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 27 Jul 2010 11:21:18 +0800
Subject: KVM: MMU: remove valueless output message

After commit 53383eaad08d, the '*spte' has updated before call
rmap_remove()(in most case it's 'shadow_trap_nonpresent_pte'), so
remove this information from error message

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 311f6da..82f7622 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -645,18 +645,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
 	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
 	if (!*rmapp) {
-		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
+		printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
 		BUG();
 	} else if (!(*rmapp & 1)) {
-		rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
+		rmap_printk("rmap_remove:  %p 1->0\n", spte);
 		if ((u64 *)*rmapp != spte) {
-			printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
-			       spte, *spte);
+			printk(KERN_ERR "rmap_remove:  %p 1->BUG\n", spte);
 			BUG();
 		}
 		*rmapp = 0;
 	} else {
-		rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
+		rmap_printk("rmap_remove:  %p many->many\n", spte);
 		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
 		prev_desc = NULL;
 		while (desc) {
@@ -670,7 +669,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 			prev_desc = desc;
 			desc = desc->more;
 		}
-		pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
+		pr_err("rmap_remove: %p many->many\n", spte);
 		BUG();
 	}
 }
-- 
cgit v0.10.2


From 3f6a9d1693deaeef28d98109bc92c98dd94a8523 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 27 Jul 2010 18:14:20 +0200
Subject: KVM: SVM: Sync efer back into nested vmcb

This patch fixes a bug in a nested hypervisor that heavily
switches between real-mode and long-mode. The problem is
fixed by syncing back efer into the guest vmcb on emulated
vmexit.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8a3f9f6..09704a0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1896,6 +1896,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	nested_vmcb->save.ds     = vmcb->save.ds;
 	nested_vmcb->save.gdtr   = vmcb->save.gdtr;
 	nested_vmcb->save.idtr   = vmcb->save.idtr;
+	nested_vmcb->save.efer   = svm->vcpu.arch.efer;
 	nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
 	nested_vmcb->save.cr3    = svm->vcpu.arch.cr3;
 	nested_vmcb->save.cr2    = vmcb->save.cr2;
-- 
cgit v0.10.2


From 7a190667bb316653cbb782fff95cfdfcf51ded45 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 27 Jul 2010 18:14:21 +0200
Subject: KVM: SVM: Emulate next_rip svm feature

This patch implements the emulations of the svm next_rip
feature in the nested svm implementation in kvm.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 09704a0..116e034 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1918,6 +1918,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	nested_vmcb->control.exit_info_2       = vmcb->control.exit_info_2;
 	nested_vmcb->control.exit_int_info     = vmcb->control.exit_int_info;
 	nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
+	nested_vmcb->control.next_rip          = vmcb->control.next_rip;
 
 	/*
 	 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
@@ -3360,7 +3361,12 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 		entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
 				   ASID emulation to nested SVM */
 		entry->ecx = 0; /* Reserved */
-		entry->edx = 0; /* Do not support any additional features */
+		entry->edx = 0; /* Per default do not support any
+				   additional features */
+
+		/* Support next_rip if host supports it */
+		if (svm_has(SVM_FEATURE_NRIP))
+			entry->edx |= SVM_FEATURE_NRIP;
 
 		break;
 	}
-- 
cgit v0.10.2


From 62bd430e6d41ac84ff2fb719f5783c3692718f47 Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Wed, 28 Jul 2010 12:38:40 +0300
Subject: KVM: x86 emulator: Add IRET instruction

Ths patch adds IRET instruction (opcode 0xcf).
Currently, only IRET in real mode is emulated. Protected mode support is to be added later if needed.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Reviewed-by: Avi Kivity <avi@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7f615c5..b0f45bc 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -341,6 +341,9 @@ static u32 group2_table[] = {
 #define EFLG_PF (1<<2)
 #define EFLG_CF (1<<0)
 
+#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
+#define EFLG_RESERVED_ONE_MASK 2
+
 /*
  * Instruction emulation:
  * Most instructions are emulated directly via a fragment of inline assembly
@@ -1729,6 +1732,78 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
 	return rc;
 }
 
+static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
+			     struct x86_emulate_ops *ops)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int rc = X86EMUL_CONTINUE;
+	unsigned long temp_eip = 0;
+	unsigned long temp_eflags = 0;
+	unsigned long cs = 0;
+	unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF |
+			     EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF |
+			     EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */
+	unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP;
+
+	/* TODO: Add stack limit check */
+
+	rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes);
+
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	if (temp_eip & ~0xffff) {
+		emulate_gp(ctxt, 0);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
+
+	rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
+
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes);
+
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
+
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	c->eip = temp_eip;
+
+
+	if (c->op_bytes == 4)
+		ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
+	else if (c->op_bytes == 2) {
+		ctxt->eflags &= ~0xffff;
+		ctxt->eflags |= temp_eflags;
+	}
+
+	ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
+	ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
+
+	return rc;
+}
+
+static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
+				    struct x86_emulate_ops* ops)
+{
+	switch(ctxt->mode) {
+	case X86EMUL_MODE_REAL:
+		return emulate_iret_real(ctxt, ops);
+	case X86EMUL_MODE_VM86:
+	case X86EMUL_MODE_PROT16:
+	case X86EMUL_MODE_PROT32:
+	case X86EMUL_MODE_PROT64:
+	default:
+		/* iret from protected mode unimplemented yet */
+		return X86EMUL_UNHANDLEABLE;
+	}
+}
+
 static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
 				struct x86_emulate_ops *ops)
 {
@@ -2860,6 +2935,12 @@ special_insn:
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
+	case 0xcf:		/* iret */
+		rc = emulate_iret(ctxt, ops);
+
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
+		break;
 	case 0xd0 ... 0xd1:	/* Grp2 */
 		c->src.val = 1;
 		emulate_grp2(ctxt);
-- 
cgit v0.10.2


From ea9ef04e19c7c441b1ce9fe28ff6d9522c848baa Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:34 +0300
Subject: KVM: x86 emulator: drop parentheses in repreat macros

The parenthese make is impossible to use the macros with initializers that
require braces.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b0f45bc..3bfba94 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -95,10 +95,10 @@
 #define Src2One     (3<<29)
 #define Src2Mask    (7<<29)
 
-#define X2(x) (x), (x)
-#define X3(x) X2(x), (x)
+#define X2(x) x, x
+#define X3(x) X2(x), x
 #define X4(x) X2(x), X2(x)
-#define X5(x) X4(x), (x)
+#define X5(x) X4(x), x
 #define X6(x) X4(x), X2(x)
 #define X7(x) X4(x), X3(x)
 #define X8(x) X4(x), X4(x)
-- 
cgit v0.10.2


From d65b1dee408243daa45110ee494d204508d31657 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:35 +0300
Subject: KVM: x86 emulator: introduce 'struct opcode'

This will hold all the information known about the opcode.  Currently, this
is just the decode flags.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 3bfba94..da7df34 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -108,7 +108,11 @@ enum {
 	Group1, Group1A, Group3, Group4, Group5, Group7, Group8, Group9,
 };
 
-static u32 opcode_table[256] = {
+struct opcode {
+	u32 flags;
+};
+
+static struct opcode opcode_table[256] = {
 	/* 0x00 - 0x07 */
 	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -222,7 +226,7 @@ static u32 opcode_table[256] = {
 	ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
 };
 
-static u32 twobyte_table[256] = {
+static struct opcode twobyte_table[256] = {
 	/* 0x00 - 0x0F */
 	0, Group | GroupDual | Group7, 0, 0,
 	0, ImplicitOps, ImplicitOps | Priv, 0,
@@ -284,7 +288,7 @@ static u32 twobyte_table[256] = {
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
-static u32 group_table[] = {
+static struct opcode group_table[] = {
 	[Group1*8] =
 	X7(Lock), 0,
 	[Group1A*8] =
@@ -313,7 +317,7 @@ static u32 group_table[] = {
 	0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0,
 };
 
-static u32 group2_table[] = {
+static struct opcode group2_table[] = {
 	[Group7*8] =
 	SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv,
 	SrcNone | ModRM | DstMem | Mov, 0,
@@ -1008,13 +1012,13 @@ done_prefixes:
 			c->op_bytes = 8;	/* REX.W */
 
 	/* Opcode byte(s). */
-	c->d = opcode_table[c->b];
+	c->d = opcode_table[c->b].flags;
 	if (c->d == 0) {
 		/* Two-byte opcode? */
 		if (c->b == 0x0f) {
 			c->twobyte = 1;
 			c->b = insn_fetch(u8, 1, c->eip);
-			c->d = twobyte_table[c->b];
+			c->d = twobyte_table[c->b].flags;
 		}
 	}
 
@@ -1027,9 +1031,9 @@ done_prefixes:
 		group = (group << 3) + ((c->modrm >> 3) & 7);
 		c->d &= ~(Group | GroupDual | GroupMask);
 		if (dual && (c->modrm >> 6) == 3)
-			c->d |= group2_table[group];
+			c->d |= group2_table[group].flags;
 		else
-			c->d |= group_table[group];
+			c->d |= group_table[group].flags;
 	}
 
 	/* Unrecognised? */
-- 
cgit v0.10.2


From fd853310a1ebaef257956208165873494bb805dc Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:36 +0300
Subject: KVM: x86 emulator: Add wrappers for easily defining opcodes

Once 'struct opcode' grows, its initializer will become more complicated.
Wrap the simple initializers in a D() macro, and replace the empty initializers
with an even simpler N macro.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index da7df34..7059b16 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -112,220 +112,226 @@ struct opcode {
 	u32 flags;
 };
 
+#define D(_y) { .flags = (_y) }
+#define N    D(0)
+
 static struct opcode opcode_table[256] = {
 	/* 0x00 - 0x07 */
-	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
-	ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
+	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
+	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+	D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm),
+	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
 	/* 0x08 - 0x0F */
-	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
-	ImplicitOps | Stack | No64, 0,
+	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
+	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+	D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm),
+	D(ImplicitOps | Stack | No64), N,
 	/* 0x10 - 0x17 */
-	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
-	ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
+	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
+	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+	D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm),
+	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
 	/* 0x18 - 0x1F */
-	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
-	ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
+	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
+	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+	D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm),
+	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
 	/* 0x20 - 0x27 */
-	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
+	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
+	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+	D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N,
 	/* 0x28 - 0x2F */
-	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
+	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
+	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+	D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N,
 	/* 0x30 - 0x37 */
-	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
+	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
+	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+	D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N,
 	/* 0x38 - 0x3F */
-	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
-	0, 0,
+	D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM),
+	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+	D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm),
+	N, N,
 	/* 0x40 - 0x4F */
-	X16(DstReg),
+	X16(D(DstReg)),
 	/* 0x50 - 0x57 */
-	X8(SrcReg | Stack),
+	X8(D(SrcReg | Stack)),
 	/* 0x58 - 0x5F */
-	X8(DstReg | Stack),
+	X8(D(DstReg | Stack)),
 	/* 0x60 - 0x67 */
-	ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
-	0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
-	0, 0, 0, 0,
+	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+	N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
+	N, N, N, N,
 	/* 0x68 - 0x6F */
-	SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
-	DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */
-	SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */
+	D(SrcImm | Mov | Stack), N, D(SrcImmByte | Mov | Stack), N,
+	D(DstDI | ByteOp | Mov | String), D(DstDI | Mov | String), /* insb, insw/insd */
+	D(SrcSI | ByteOp | ImplicitOps | String), D(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */
 	/* 0x70 - 0x7F */
-	X16(SrcImmByte),
+	X16(D(SrcImmByte)),
 	/* 0x80 - 0x87 */
-	ByteOp | DstMem | SrcImm | ModRM | Group | Group1,
-	DstMem | SrcImm | ModRM | Group | Group1,
-	ByteOp | DstMem | SrcImm | ModRM | No64 | Group | Group1,
-	DstMem | SrcImmByte | ModRM | Group | Group1,
-	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+	D(ByteOp | DstMem | SrcImm | ModRM | Group | Group1),
+	D(DstMem | SrcImm | ModRM | Group | Group1),
+	D(ByteOp | DstMem | SrcImm | ModRM | No64 | Group | Group1),
+	D(DstMem | SrcImmByte | ModRM | Group | Group1),
+	D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM),
+	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
 	/* 0x88 - 0x8F */
-	ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
-	ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstMem | SrcNone | ModRM | Mov, ModRM | DstReg,
-	ImplicitOps | SrcMem16 | ModRM, Group | Group1A,
+	D(ByteOp | DstMem | SrcReg | ModRM | Mov), D(DstMem | SrcReg | ModRM | Mov),
+	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem | ModRM | Mov),
+	D(DstMem | SrcNone | ModRM | Mov), D(ModRM | DstReg),
+	D(ImplicitOps | SrcMem16 | ModRM), D(Group | Group1A),
 	/* 0x90 - 0x97 */
-	DstReg, DstReg, DstReg, DstReg,	DstReg, DstReg, DstReg, DstReg,
+	D(DstReg), D(DstReg), D(DstReg), D(DstReg),	D(DstReg), D(DstReg), D(DstReg), D(DstReg),
 	/* 0x98 - 0x9F */
-	0, 0, SrcImmFAddr | No64, 0,
-	ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
+	N, N, D(SrcImmFAddr | No64), N,
+	D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N,
 	/* 0xA0 - 0xA7 */
-	ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs,
-	ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs,
-	ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
-	ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
+	D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs),
+	D(ByteOp | DstMem | SrcAcc | Mov | MemAbs), D(DstMem | SrcAcc | Mov | MemAbs),
+	D(ByteOp | SrcSI | DstDI | Mov | String), D(SrcSI | DstDI | Mov | String),
+	D(ByteOp | SrcSI | DstDI | String), D(SrcSI | DstDI | String),
 	/* 0xA8 - 0xAF */
-	DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
-	ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
-	ByteOp | DstDI | String, DstDI | String,
+	D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm), D(ByteOp | DstDI | Mov | String), D(DstDI | Mov | String),
+	D(ByteOp | SrcSI | DstAcc | Mov | String), D(SrcSI | DstAcc | Mov | String),
+	D(ByteOp | DstDI | String), D(DstDI | String),
 	/* 0xB0 - 0xB7 */
-	X8(ByteOp | DstReg | SrcImm | Mov),
+	X8(D(ByteOp | DstReg | SrcImm | Mov)),
 	/* 0xB8 - 0xBF */
-	X8(DstReg | SrcImm | Mov),
+	X8(D(DstReg | SrcImm | Mov)),
 	/* 0xC0 - 0xC7 */
-	ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
-	0, ImplicitOps | Stack, 0, 0,
-	ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
+	D(ByteOp | DstMem | SrcImm | ModRM), D(DstMem | SrcImmByte | ModRM),
+	N, D(ImplicitOps | Stack), N, N,
+	D(ByteOp | DstMem | SrcImm | ModRM | Mov), D(DstMem | SrcImm | ModRM | Mov),
 	/* 0xC8 - 0xCF */
-	0, 0, 0, ImplicitOps | Stack,
-	ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
+	N, N, N, D(ImplicitOps | Stack),
+	D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps),
 	/* 0xD0 - 0xD7 */
-	ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
-	ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
-	0, 0, 0, 0,
+	D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM),
+	D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM),
+	N, N, N, N,
 	/* 0xD8 - 0xDF */
-	0, 0, 0, 0, 0, 0, 0, 0,
+	N, N, N, N, N, N, N, N,
 	/* 0xE0 - 0xE7 */
-	0, 0, 0, 0,
-	ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
-	ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
+	N, N, N, N,
+	D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc),
+	D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc),
 	/* 0xE8 - 0xEF */
-	SrcImm | Stack, SrcImm | ImplicitOps,
-	SrcImmFAddr | No64, SrcImmByte | ImplicitOps,
-	SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
-	SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
+	D(SrcImm | Stack), D(SrcImm | ImplicitOps),
+	D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
+	D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc),
+	D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc),
 	/* 0xF0 - 0xF7 */
-	0, 0, 0, 0,
-	ImplicitOps | Priv, ImplicitOps, ByteOp | Group | Group3, Group | Group3,
+	N, N, N, N,
+	D(ImplicitOps | Priv), D(ImplicitOps), D(ByteOp | Group | Group3), D(Group | Group3),
 	/* 0xF8 - 0xFF */
-	ImplicitOps, 0, ImplicitOps, ImplicitOps,
-	ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
+	D(ImplicitOps), N, D(ImplicitOps), D(ImplicitOps),
+	D(ImplicitOps), D(ImplicitOps), D(Group | Group4), D(Group | Group5),
 };
 
 static struct opcode twobyte_table[256] = {
 	/* 0x00 - 0x0F */
-	0, Group | GroupDual | Group7, 0, 0,
-	0, ImplicitOps, ImplicitOps | Priv, 0,
-	ImplicitOps | Priv, ImplicitOps | Priv, 0, 0,
-	0, ImplicitOps | ModRM, 0, 0,
+	N, D(Group | GroupDual | Group7), N, N,
+	N, D(ImplicitOps), D(ImplicitOps | Priv), N,
+	D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
+	N, D(ImplicitOps | ModRM), N, N,
 	/* 0x10 - 0x1F */
-	0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
+	N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N,
 	/* 0x20 - 0x2F */
-	ModRM | ImplicitOps | Priv, ModRM | Priv,
-	ModRM | ImplicitOps | Priv, ModRM | Priv,
-	0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0,
+	D(ModRM | ImplicitOps | Priv), D(ModRM | Priv),
+	D(ModRM | ImplicitOps | Priv), D(ModRM | Priv),
+	N, N, N, N,
+	N, N, N, N, N, N, N, N,
 	/* 0x30 - 0x3F */
-	ImplicitOps | Priv, 0, ImplicitOps | Priv, 0,
-	ImplicitOps, ImplicitOps | Priv, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0,
+	D(ImplicitOps | Priv), N, D(ImplicitOps | Priv), N,
+	D(ImplicitOps), D(ImplicitOps | Priv), N, N,
+	N, N, N, N, N, N, N, N,
 	/* 0x40 - 0x4F */
-	X16(DstReg | SrcMem | ModRM | Mov),
+	X16(D(DstReg | SrcMem | ModRM | Mov)),
 	/* 0x50 - 0x5F */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
 	/* 0x60 - 0x6F */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
 	/* 0x70 - 0x7F */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
 	/* 0x80 - 0x8F */
-	X16(SrcImm),
+	X16(D(SrcImm)),
 	/* 0x90 - 0x9F */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
 	/* 0xA0 - 0xA7 */
-	ImplicitOps | Stack, ImplicitOps | Stack,
-	0, DstMem | SrcReg | ModRM | BitOp,
-	DstMem | SrcReg | Src2ImmByte | ModRM,
-	DstMem | SrcReg | Src2CL | ModRM, 0, 0,
+	D(ImplicitOps | Stack), D(ImplicitOps | Stack),
+	N, D(DstMem | SrcReg | ModRM | BitOp),
+	D(DstMem | SrcReg | Src2ImmByte | ModRM),
+	D(DstMem | SrcReg | Src2CL | ModRM), N, N,
 	/* 0xA8 - 0xAF */
-	ImplicitOps | Stack, ImplicitOps | Stack,
-	0, DstMem | SrcReg | ModRM | BitOp | Lock,
-	DstMem | SrcReg | Src2ImmByte | ModRM,
-	DstMem | SrcReg | Src2CL | ModRM,
-	ModRM, 0,
+	D(ImplicitOps | Stack), D(ImplicitOps | Stack),
+	N, D(DstMem | SrcReg | ModRM | BitOp | Lock),
+	D(DstMem | SrcReg | Src2ImmByte | ModRM),
+	D(DstMem | SrcReg | Src2CL | ModRM),
+	D(ModRM), N,
 	/* 0xB0 - 0xB7 */
-	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
-	0, DstMem | SrcReg | ModRM | BitOp | Lock,
-	0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
-	    DstReg | SrcMem16 | ModRM | Mov,
+	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
+	N, D(DstMem | SrcReg | ModRM | BitOp | Lock),
+	N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov),
+	    D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xB8 - 0xBF */
-	0, 0,
-	Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock,
-	0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
-	    DstReg | SrcMem16 | ModRM | Mov,
+	N, N,
+	D(Group | Group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
+	N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov),
+	    D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xC0 - 0xCF */
-	0, 0, 0, DstMem | SrcReg | ModRM | Mov,
-	0, 0, 0, Group | GroupDual | Group9,
-	0, 0, 0, 0, 0, 0, 0, 0,
+	N, N, N, D(DstMem | SrcReg | ModRM | Mov),
+	N, N, N, D(Group | GroupDual | Group9),
+	N, N, N, N, N, N, N, N,
 	/* 0xD0 - 0xDF */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
 	/* 0xE0 - 0xEF */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
 	/* 0xF0 - 0xFF */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
 };
 
 static struct opcode group_table[] = {
 	[Group1*8] =
-	X7(Lock), 0,
+	X7(D(Lock)), N,
 	[Group1A*8] =
-	DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
+	D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N,
 	[Group3*8] =
-	DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
-	DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,
-	X4(Undefined),
+	D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM),
+	D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
+	X4(D(Undefined)),
 	[Group4*8] =
-	ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
-	0, 0, 0, 0, 0, 0,
+	D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock),
+	N, N, N, N, N, N,
 	[Group5*8] =
-	DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,
-	SrcMem | ModRM | Stack, 0,
-	SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps,
-	SrcMem | ModRM | Stack, 0,
+	D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
+	D(SrcMem | ModRM | Stack), N,
+	D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps),
+	D(SrcMem | ModRM | Stack), N,
 	[Group7*8] =
-	0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
-	SrcNone | ModRM | DstMem | Mov, 0,
-	SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv,
+	N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv),
+	D(SrcNone | ModRM | DstMem | Mov), N,
+	D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv),
 	[Group8*8] =
-	0, 0, 0, 0,
-	DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
-	DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
+	N, N, N, N,
+	D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock),
+	D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock),
 	[Group9*8] =
-	0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0,
+	N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N,
 };
 
 static struct opcode group2_table[] = {
 	[Group7*8] =
-	SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv,
-	SrcNone | ModRM | DstMem | Mov, 0,
-	SrcMem16 | ModRM | Mov | Priv, 0,
+	D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv),
+	D(SrcNone | ModRM | DstMem | Mov), N,
+	D(SrcMem16 | ModRM | Mov | Priv), N,
 	[Group9*8] =
-	0, 0, 0, 0, 0, 0, 0, 0,
+	N, N, N, N, N, N, N, N,
 };
 
+#undef D
+#undef N
+
 /* EFLAGS bit definitions. */
 #define EFLG_ID (1<<21)
 #define EFLG_VIP (1<<20)
-- 
cgit v0.10.2


From 42a1c5209570ead6d89abecd99ab12947a41d20a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:37 +0300
Subject: KVM: x86 emulator: move group tables to top

No code changes.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7059b16..edf0938 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -115,6 +115,44 @@ struct opcode {
 #define D(_y) { .flags = (_y) }
 #define N    D(0)
 
+static struct opcode group_table[] = {
+	[Group1*8] =
+	X7(D(Lock)), N,
+	[Group1A*8] =
+	D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N,
+	[Group3*8] =
+	D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM),
+	D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
+	X4(D(Undefined)),
+	[Group4*8] =
+	D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock),
+	N, N, N, N, N, N,
+	[Group5*8] =
+	D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
+	D(SrcMem | ModRM | Stack), N,
+	D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps),
+	D(SrcMem | ModRM | Stack), N,
+	[Group7*8] =
+	N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv),
+	D(SrcNone | ModRM | DstMem | Mov), N,
+	D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv),
+	[Group8*8] =
+	N, N, N, N,
+	D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock),
+	D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock),
+	[Group9*8] =
+	N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N,
+};
+
+static struct opcode group2_table[] = {
+	[Group7*8] =
+	D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv),
+	D(SrcNone | ModRM | DstMem | Mov), N,
+	D(SrcMem16 | ModRM | Mov | Priv), N,
+	[Group9*8] =
+	N, N, N, N, N, N, N, N,
+};
+
 static struct opcode opcode_table[256] = {
 	/* 0x00 - 0x07 */
 	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
@@ -291,44 +329,6 @@ static struct opcode twobyte_table[256] = {
 	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
 };
 
-static struct opcode group_table[] = {
-	[Group1*8] =
-	X7(D(Lock)), N,
-	[Group1A*8] =
-	D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N,
-	[Group3*8] =
-	D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM),
-	D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
-	X4(D(Undefined)),
-	[Group4*8] =
-	D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock),
-	N, N, N, N, N, N,
-	[Group5*8] =
-	D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
-	D(SrcMem | ModRM | Stack), N,
-	D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps),
-	D(SrcMem | ModRM | Stack), N,
-	[Group7*8] =
-	N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv),
-	D(SrcNone | ModRM | DstMem | Mov), N,
-	D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv),
-	[Group8*8] =
-	N, N, N, N,
-	D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock),
-	D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock),
-	[Group9*8] =
-	N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N,
-};
-
-static struct opcode group2_table[] = {
-	[Group7*8] =
-	D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv),
-	D(SrcNone | ModRM | DstMem | Mov), N,
-	D(SrcMem16 | ModRM | Mov | Priv), N,
-	[Group9*8] =
-	N, N, N, N, N, N, N, N,
-};
-
 #undef D
 #undef N
 
-- 
cgit v0.10.2


From 793d5a8d6baad9062b0a03e034944b31e50dfe5c Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:38 +0300
Subject: KVM: x86 emulator: reserve group code 0

We'll be using that to distinguish between new-style and old-style groups.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index edf0938..5e49612 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -105,7 +105,7 @@
 #define X16(x) X8(x), X8(x)
 
 enum {
-	Group1, Group1A, Group3, Group4, Group5, Group7, Group8, Group9,
+	NoGrp, Group1, Group1A, Group3, Group4, Group5, Group7, Group8, Group9,
 };
 
 struct opcode {
-- 
cgit v0.10.2


From 120df8902dbe91cc1b3b7886481e350fae7334fe Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:39 +0300
Subject: KVM: x86 emulator: allow specifying group directly in opcode

Instead of having a group number, store the group table pointer directly in
the opcode.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5e49612..f3b9844 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -110,10 +110,21 @@ enum {
 
 struct opcode {
 	u32 flags;
+	union {
+		struct opcode *group;
+		struct group_dual *gdual;
+	} u;
+};
+
+struct group_dual {
+	struct opcode mod012[8];
+	struct opcode mod3[8];
 };
 
 #define D(_y) { .flags = (_y) }
 #define N    D(0)
+#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
+#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) }
 
 static struct opcode group_table[] = {
 	[Group1*8] =
@@ -331,6 +342,8 @@ static struct opcode twobyte_table[256] = {
 
 #undef D
 #undef N
+#undef G
+#undef GD
 
 /* EFLAGS bit definitions. */
 #define EFLG_ID (1<<21)
@@ -930,8 +943,8 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 	int mode = ctxt->mode;
-	int def_op_bytes, def_ad_bytes, group, dual;
-
+	int def_op_bytes, def_ad_bytes, group, dual, goffset;
+	struct opcode opcode, *g_mod012, *g_mod3;
 
 	/* we cannot decode insn before we complete previous rep insn */
 	WARN_ON(ctxt->restart);
@@ -1018,15 +1031,16 @@ done_prefixes:
 			c->op_bytes = 8;	/* REX.W */
 
 	/* Opcode byte(s). */
-	c->d = opcode_table[c->b].flags;
-	if (c->d == 0) {
+	opcode = opcode_table[c->b];
+	if (opcode.flags == 0) {
 		/* Two-byte opcode? */
 		if (c->b == 0x0f) {
 			c->twobyte = 1;
 			c->b = insn_fetch(u8, 1, c->eip);
-			c->d = twobyte_table[c->b].flags;
+			opcode = twobyte_table[c->b];
 		}
 	}
+	c->d = opcode.flags;
 
 	if (c->d & Group) {
 		group = c->d & GroupMask;
@@ -1034,12 +1048,27 @@ done_prefixes:
 		c->modrm = insn_fetch(u8, 1, c->eip);
 		--c->eip;
 
-		group = (group << 3) + ((c->modrm >> 3) & 7);
+		if (group) {
+			g_mod012 = g_mod3 = &group_table[group * 8];
+			if (c->d & GroupDual)
+				g_mod3 = &group2_table[group * 8];
+		} else {
+			if (c->d & GroupDual) {
+				g_mod012 = opcode.u.gdual->mod012;
+				g_mod3 = opcode.u.gdual->mod3;
+			} else
+				g_mod012 = g_mod3 = opcode.u.group;
+		}
+
 		c->d &= ~(Group | GroupDual | GroupMask);
-		if (dual && (c->modrm >> 6) == 3)
-			c->d |= group2_table[group].flags;
+
+		goffset = (c->modrm >> 3) & 7;
+
+		if ((c->modrm >> 6) == 3)
+			opcode = g_mod3[goffset];
 		else
-			c->d |= group_table[group].flags;
+			opcode = g_mod012[goffset];
+		c->d |= opcode.flags;
 	}
 
 	/* Unrecognised? */
-- 
cgit v0.10.2


From 5b92b5faff8ec66c75f3716ae7c4bf1e2b99d7e6 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:40 +0300
Subject: KVM: x86 emulator: convert group 1 to new style

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f3b9844..6cc4af1 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -105,7 +105,7 @@
 #define X16(x) X8(x), X8(x)
 
 enum {
-	NoGrp, Group1, Group1A, Group3, Group4, Group5, Group7, Group8, Group9,
+	NoGrp, Group1A, Group3, Group4, Group5, Group7, Group8, Group9,
 };
 
 struct opcode {
@@ -126,9 +126,11 @@ struct group_dual {
 #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
 #define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) }
 
+static struct opcode group1[] = {
+	X7(D(Lock)), N
+};
+
 static struct opcode group_table[] = {
-	[Group1*8] =
-	X7(D(Lock)), N,
 	[Group1A*8] =
 	D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N,
 	[Group3*8] =
@@ -219,10 +221,10 @@ static struct opcode opcode_table[256] = {
 	/* 0x70 - 0x7F */
 	X16(D(SrcImmByte)),
 	/* 0x80 - 0x87 */
-	D(ByteOp | DstMem | SrcImm | ModRM | Group | Group1),
-	D(DstMem | SrcImm | ModRM | Group | Group1),
-	D(ByteOp | DstMem | SrcImm | ModRM | No64 | Group | Group1),
-	D(DstMem | SrcImmByte | ModRM | Group | Group1),
+	G(ByteOp | DstMem | SrcImm | ModRM | Group, group1),
+	G(DstMem | SrcImm | ModRM | Group, group1),
+	G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
+	G(DstMem | SrcImmByte | ModRM | Group, group1),
 	D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM),
 	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
 	/* 0x88 - 0x8F */
-- 
cgit v0.10.2


From 99880c5cd54b28a26fd6ed949f545cc0075e4393 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:41 +0300
Subject: KVM: x86 emulator: convert group 1A to new style

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 6cc4af1..618fdc8 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -105,7 +105,7 @@
 #define X16(x) X8(x), X8(x)
 
 enum {
-	NoGrp, Group1A, Group3, Group4, Group5, Group7, Group8, Group9,
+	NoGrp, Group3, Group4, Group5, Group7, Group8, Group9,
 };
 
 struct opcode {
@@ -130,9 +130,11 @@ static struct opcode group1[] = {
 	X7(D(Lock)), N
 };
 
-static struct opcode group_table[] = {
-	[Group1A*8] =
+static struct opcode group1A[] = {
 	D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N,
+};
+
+static struct opcode group_table[] = {
 	[Group3*8] =
 	D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM),
 	D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
@@ -231,7 +233,7 @@ static struct opcode opcode_table[256] = {
 	D(ByteOp | DstMem | SrcReg | ModRM | Mov), D(DstMem | SrcReg | ModRM | Mov),
 	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem | ModRM | Mov),
 	D(DstMem | SrcNone | ModRM | Mov), D(ModRM | DstReg),
-	D(ImplicitOps | SrcMem16 | ModRM), D(Group | Group1A),
+	D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
 	/* 0x90 - 0x97 */
 	D(DstReg), D(DstReg), D(DstReg), D(DstReg),	D(DstReg), D(DstReg), D(DstReg), D(DstReg),
 	/* 0x98 - 0x9F */
-- 
cgit v0.10.2


From ee70ea30ee81dda2cf5fbc2e143ce3cb303187ce Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:42 +0300
Subject: KVM: x86 emulator: convert group 3 to new style

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 618fdc8..a0606a4 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -105,7 +105,7 @@
 #define X16(x) X8(x), X8(x)
 
 enum {
-	NoGrp, Group3, Group4, Group5, Group7, Group8, Group9,
+	NoGrp, Group4, Group5, Group7, Group8, Group9,
 };
 
 struct opcode {
@@ -134,11 +134,13 @@ static struct opcode group1A[] = {
 	D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N,
 };
 
-static struct opcode group_table[] = {
-	[Group3*8] =
+static struct opcode group3[] = {
 	D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM),
 	D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
 	X4(D(Undefined)),
+};
+
+static struct opcode group_table[] = {
 	[Group4*8] =
 	D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock),
 	N, N, N, N, N, N,
@@ -276,7 +278,7 @@ static struct opcode opcode_table[256] = {
 	D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc),
 	/* 0xF0 - 0xF7 */
 	N, N, N, N,
-	D(ImplicitOps | Priv), D(ImplicitOps), D(ByteOp | Group | Group3), D(Group | Group3),
+	D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3),
 	/* 0xF8 - 0xFF */
 	D(ImplicitOps), N, D(ImplicitOps), D(ImplicitOps),
 	D(ImplicitOps), D(ImplicitOps), D(Group | Group4), D(Group | Group5),
-- 
cgit v0.10.2


From 591c9d20a37db54c7234742bff925cb2e6fdca4b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:43 +0300
Subject: KVM: x86 emulator: convert group 4 to new style

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a0606a4..8bb74ea 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -105,7 +105,7 @@
 #define X16(x) X8(x), X8(x)
 
 enum {
-	NoGrp, Group4, Group5, Group7, Group8, Group9,
+	NoGrp, Group5, Group7, Group8, Group9,
 };
 
 struct opcode {
@@ -140,10 +140,12 @@ static struct opcode group3[] = {
 	X4(D(Undefined)),
 };
 
-static struct opcode group_table[] = {
-	[Group4*8] =
+static struct opcode group4[] = {
 	D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock),
 	N, N, N, N, N, N,
+};
+
+static struct opcode group_table[] = {
 	[Group5*8] =
 	D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
 	D(SrcMem | ModRM | Stack), N,
@@ -281,7 +283,7 @@ static struct opcode opcode_table[256] = {
 	D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3),
 	/* 0xF8 - 0xFF */
 	D(ImplicitOps), N, D(ImplicitOps), D(ImplicitOps),
-	D(ImplicitOps), D(ImplicitOps), D(Group | Group4), D(Group | Group5),
+	D(ImplicitOps), D(ImplicitOps), G(0, group4), D(Group | Group5),
 };
 
 static struct opcode twobyte_table[256] = {
-- 
cgit v0.10.2


From b67f9f0741e288c97f73cdc9e39e2c4943004332 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:44 +0300
Subject: KVM: x86 emulator: convert group 5 to new style

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8bb74ea..9674d97 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -105,7 +105,7 @@
 #define X16(x) X8(x), X8(x)
 
 enum {
-	NoGrp, Group5, Group7, Group8, Group9,
+	NoGrp, Group7, Group8, Group9,
 };
 
 struct opcode {
@@ -145,12 +145,14 @@ static struct opcode group4[] = {
 	N, N, N, N, N, N,
 };
 
-static struct opcode group_table[] = {
-	[Group5*8] =
+static struct opcode group5[] = {
 	D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
 	D(SrcMem | ModRM | Stack), N,
 	D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps),
 	D(SrcMem | ModRM | Stack), N,
+};
+
+static struct opcode group_table[] = {
 	[Group7*8] =
 	N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv),
 	D(SrcNone | ModRM | DstMem | Mov), N,
@@ -283,7 +285,7 @@ static struct opcode opcode_table[256] = {
 	D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3),
 	/* 0xF8 - 0xFF */
 	D(ImplicitOps), N, D(ImplicitOps), D(ImplicitOps),
-	D(ImplicitOps), D(ImplicitOps), G(0, group4), D(Group | Group5),
+	D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
 };
 
 static struct opcode twobyte_table[256] = {
-- 
cgit v0.10.2


From 2f3a9bc9ebd42e00929f370e1a56e40028a8d651 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:45 +0300
Subject: KVM: x86 emulator: convert group 7 to new style

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 9674d97..5e7a02d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -105,7 +105,7 @@
 #define X16(x) X8(x), X8(x)
 
 enum {
-	NoGrp, Group7, Group8, Group9,
+	NoGrp, Group8, Group9,
 };
 
 struct opcode {
@@ -152,11 +152,17 @@ static struct opcode group5[] = {
 	D(SrcMem | ModRM | Stack), N,
 };
 
-static struct opcode group_table[] = {
-	[Group7*8] =
+static struct group_dual group7 = { {
 	N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv),
 	D(SrcNone | ModRM | DstMem | Mov), N,
 	D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv),
+}, {
+	D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv),
+	D(SrcNone | ModRM | DstMem | Mov), N,
+	D(SrcMem16 | ModRM | Mov | Priv), N,
+} };
+
+static struct opcode group_table[] = {
 	[Group8*8] =
 	N, N, N, N,
 	D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock),
@@ -166,10 +172,6 @@ static struct opcode group_table[] = {
 };
 
 static struct opcode group2_table[] = {
-	[Group7*8] =
-	D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv),
-	D(SrcNone | ModRM | DstMem | Mov), N,
-	D(SrcMem16 | ModRM | Mov | Priv), N,
 	[Group9*8] =
 	N, N, N, N, N, N, N, N,
 };
@@ -290,7 +292,7 @@ static struct opcode opcode_table[256] = {
 
 static struct opcode twobyte_table[256] = {
 	/* 0x00 - 0x0F */
-	N, D(Group | GroupDual | Group7), N, N,
+	N, GD(0, &group7), N, N,
 	N, D(ImplicitOps), D(ImplicitOps | Priv), N,
 	D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
 	N, D(ImplicitOps | ModRM), N, N,
-- 
cgit v0.10.2


From 2cb20bc8af313b400e5c2c94886e0d87e2ec4e4d Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:46 +0300
Subject: KVM: x86 emulator: convert group 8 to new style

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5e7a02d..b5599b5 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -105,7 +105,7 @@
 #define X16(x) X8(x), X8(x)
 
 enum {
-	NoGrp, Group8, Group9,
+	NoGrp, Group9,
 };
 
 struct opcode {
@@ -162,11 +162,13 @@ static struct group_dual group7 = { {
 	D(SrcMem16 | ModRM | Mov | Priv), N,
 } };
 
-static struct opcode group_table[] = {
-	[Group8*8] =
+static struct opcode group8[] = {
 	N, N, N, N,
 	D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock),
 	D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock),
+};
+
+static struct opcode group_table[] = {
 	[Group9*8] =
 	N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N,
 };
@@ -337,7 +339,7 @@ static struct opcode twobyte_table[256] = {
 	    D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xB8 - 0xBF */
 	N, N,
-	D(Group | Group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
+	G(0, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
 	N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov),
 	    D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xC0 - 0xCF */
-- 
cgit v0.10.2


From 9f5d3220e3047536f702ed67309f6a581c0bed8b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:47 +0300
Subject: KVM: x86 emulator: convert group 9 to new style

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b5599b5..2fe731c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -105,7 +105,7 @@
 #define X16(x) X8(x), X8(x)
 
 enum {
-	NoGrp, Group9,
+	NoGrp,
 };
 
 struct opcode {
@@ -168,14 +168,16 @@ static struct opcode group8[] = {
 	D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock),
 };
 
-static struct opcode group_table[] = {
-	[Group9*8] =
+static struct group_dual group9 = { {
 	N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N,
+}, {
+	N, N, N, N, N, N, N, N,
+} };
+
+static struct opcode group_table[] = {
 };
 
 static struct opcode group2_table[] = {
-	[Group9*8] =
-	N, N, N, N, N, N, N, N,
 };
 
 static struct opcode opcode_table[256] = {
@@ -344,7 +346,7 @@ static struct opcode twobyte_table[256] = {
 	    D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xC0 - 0xCF */
 	N, N, N, D(DstMem | SrcReg | ModRM | Mov),
-	N, N, N, D(Group | GroupDual | Group9),
+	N, N, N, GD(0, &group9),
 	N, N, N, N, N, N, N, N,
 	/* 0xD0 - 0xDF */
 	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
-- 
cgit v0.10.2


From 3885d530b0eb26c82b6f085c181442b0aa6f8fed Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:48 +0300
Subject: KVM: x86 emulator: drop support for old-style groups

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 2fe731c..20a7a16 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -82,7 +82,6 @@
 #define Stack       (1<<13)     /* Stack instruction (push/pop) */
 #define Group       (1<<14)     /* Bits 3:5 of modrm byte extend opcode */
 #define GroupDual   (1<<15)     /* Alternate decoding of mod == 3 */
-#define GroupMask   0x0f        /* Group number stored in bits 0:3 */
 /* Misc flags */
 #define Undefined   (1<<25) /* No Such Instruction */
 #define Lock        (1<<26) /* lock prefix is allowed for the instruction */
@@ -104,10 +103,6 @@
 #define X8(x) X4(x), X4(x)
 #define X16(x) X8(x), X8(x)
 
-enum {
-	NoGrp,
-};
-
 struct opcode {
 	u32 flags;
 	union {
@@ -174,12 +169,6 @@ static struct group_dual group9 = { {
 	N, N, N, N, N, N, N, N,
 } };
 
-static struct opcode group_table[] = {
-};
-
-static struct opcode group2_table[] = {
-};
-
 static struct opcode opcode_table[256] = {
 	/* 0x00 - 0x07 */
 	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
@@ -959,7 +948,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 	int mode = ctxt->mode;
-	int def_op_bytes, def_ad_bytes, group, dual, goffset;
+	int def_op_bytes, def_ad_bytes, dual, goffset;
 	struct opcode opcode, *g_mod012, *g_mod3;
 
 	/* we cannot decode insn before we complete previous rep insn */
@@ -1059,24 +1048,17 @@ done_prefixes:
 	c->d = opcode.flags;
 
 	if (c->d & Group) {
-		group = c->d & GroupMask;
 		dual = c->d & GroupDual;
 		c->modrm = insn_fetch(u8, 1, c->eip);
 		--c->eip;
 
-		if (group) {
-			g_mod012 = g_mod3 = &group_table[group * 8];
-			if (c->d & GroupDual)
-				g_mod3 = &group2_table[group * 8];
-		} else {
-			if (c->d & GroupDual) {
-				g_mod012 = opcode.u.gdual->mod012;
-				g_mod3 = opcode.u.gdual->mod3;
-			} else
-				g_mod012 = g_mod3 = opcode.u.group;
-		}
+		if (c->d & GroupDual) {
+			g_mod012 = opcode.u.gdual->mod012;
+			g_mod3 = opcode.u.gdual->mod3;
+		} else
+			g_mod012 = g_mod3 = opcode.u.group;
 
-		c->d &= ~(Group | GroupDual | GroupMask);
+		c->d &= ~(Group | GroupDual);
 
 		goffset = (c->modrm >> 3) & 7;
 
-- 
cgit v0.10.2


From ab85b12b1a7fd125588f9447653a71ec8e1b5024 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:49 +0300
Subject: KVM: x86 emulator: move ByteOp and Dst back to bits 0:3

Now that the group index no longer exists, the space is free.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 20a7a16..d7e3ea479 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -46,15 +46,15 @@
  */
 
 /* Operand sizes: 8-bit operands or specified/overridden size. */
-#define ByteOp      (1<<16)	/* 8-bit operands. */
+#define ByteOp      (1<<0)	/* 8-bit operands. */
 /* Destination operand type. */
-#define ImplicitOps (1<<17)	/* Implicit in opcode. No generic decode. */
-#define DstReg      (2<<17)	/* Register operand. */
-#define DstMem      (3<<17)	/* Memory operand. */
-#define DstAcc      (4<<17)	/* Destination Accumulator */
-#define DstDI       (5<<17)	/* Destination is in ES:(E)DI */
-#define DstMem64    (6<<17)	/* 64bit memory operand */
-#define DstMask     (7<<17)
+#define ImplicitOps (1<<1)	/* Implicit in opcode. No generic decode. */
+#define DstReg      (2<<1)	/* Register operand. */
+#define DstMem      (3<<1)	/* Memory operand. */
+#define DstAcc      (4<<1)	/* Destination Accumulator */
+#define DstDI       (5<<1)	/* Destination is in ES:(E)DI */
+#define DstMem64    (6<<1)	/* 64bit memory operand */
+#define DstMask     (7<<1)
 /* Source operand type. */
 #define SrcNone     (0<<4)	/* No source operand. */
 #define SrcImplicit (0<<4)	/* Source operand is implicit in the opcode. */
-- 
cgit v0.10.2


From 9aabc88fc8687ba3a520e2ec459821d05f72474e Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:50 +0300
Subject: KVM: x86 emulator: store x86_emulate_ops in emulation context

It doesn't ever change, so we don't need to pass it around everywhere.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 1f99ecf..9ddfa5e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -208,6 +208,8 @@ struct decode_cache {
 };
 
 struct x86_emulate_ctxt {
+	struct x86_emulate_ops *ops;
+
 	/* Register state before/after emulation. */
 	struct kvm_vcpu *vcpu;
 
@@ -249,12 +251,9 @@ struct x86_emulate_ctxt {
 #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
 #endif
 
-int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
-		    struct x86_emulate_ops *ops);
-int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
-		     struct x86_emulate_ops *ops);
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt);
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
-			 struct x86_emulate_ops *ops,
 			 u16 tss_selector, int reason,
 			 bool has_error_code, u32 error_code);
 
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d7e3ea479..3689f34 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -943,8 +943,9 @@ done:
 }
 
 int
-x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+x86_decode_insn(struct x86_emulate_ctxt *ctxt)
 {
+	struct x86_emulate_ops *ops = ctxt->ops;
 	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 	int mode = ctxt->mode;
@@ -2586,10 +2587,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 }
 
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
-			 struct x86_emulate_ops *ops,
 			 u16 tss_selector, int reason,
 			 bool has_error_code, u32 error_code)
 {
+	struct x86_emulate_ops *ops = ctxt->ops;
 	struct decode_cache *c = &ctxt->decode;
 	int rc;
 
@@ -2619,8 +2620,9 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
 }
 
 int
-x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 {
+	struct x86_emulate_ops *ops = ctxt->ops;
 	u64 msr_data;
 	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3a09c62..33deb75 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3998,7 +3998,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		vcpu->arch.emulate_ctxt.interruptibility = 0;
 		vcpu->arch.emulate_ctxt.exception = -1;
 
-		r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+		r = x86_decode_insn(&vcpu->arch.emulate_ctxt);
 		trace_kvm_emulate_insn_start(vcpu);
 
 		/* Only allow emulation of specific instructions on #UD
@@ -4048,7 +4048,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
 
 restart:
-	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
 
 	if (r) { /* emulation failed */
 		if (reexecute_instruction(vcpu, cr2))
@@ -5067,7 +5067,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 	memset(c, 0, sizeof(struct decode_cache));
 	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
 
-	ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
+	ret = emulator_task_switch(&vcpu->arch.emulate_ctxt,
 				   tss_selector, reason, has_error_code,
 				   error_code);
 
@@ -5424,6 +5424,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	BUG_ON(vcpu->kvm == NULL);
 	kvm = vcpu->kvm;
 
+	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-- 
cgit v0.10.2


From ef65c88912cafe56de2737c440aefc764fd8f202 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:51 +0300
Subject: KVM: x86 emulator: allow storing emulator execution function in
 decode tables

Instead of looking up the opcode twice (once for decode flags, once for
the big execution switch) look up both flags and function in the decode tables.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 9ddfa5e..0f901c1 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -190,6 +190,7 @@ struct decode_cache {
 	bool has_seg_override;
 	u8 seg_override;
 	unsigned int d;
+	int (*execute)(struct x86_emulate_ctxt *ctxt);
 	unsigned long regs[NR_VCPU_REGS];
 	unsigned long eip;
 	/* modrm */
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 3689f34..799e895 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -106,6 +106,7 @@
 struct opcode {
 	u32 flags;
 	union {
+		int (*execute)(struct x86_emulate_ctxt *ctxt);
 		struct opcode *group;
 		struct group_dual *gdual;
 	} u;
@@ -120,6 +121,7 @@ struct group_dual {
 #define N    D(0)
 #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
 #define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) }
+#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
 
 static struct opcode group1[] = {
 	X7(D(Lock)), N
@@ -349,6 +351,7 @@ static struct opcode twobyte_table[256] = {
 #undef N
 #undef G
 #undef GD
+#undef I
 
 /* EFLAGS bit definitions. */
 #define EFLG_ID (1<<21)
@@ -1070,6 +1073,8 @@ done_prefixes:
 		c->d |= opcode.flags;
 	}
 
+	c->execute = opcode.u.execute;
+
 	/* Unrecognised? */
 	if (c->d == 0 || (c->d & Undefined)) {
 		DPRINTF("Cannot emulate %02x\n", c->b);
@@ -2705,6 +2710,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 
 special_insn:
 
+	if (c->execute) {
+		rc = c->execute(ctxt);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
+		goto writeback;
+	}
+
 	if (c->twobyte)
 		goto twobyte_insn;
 
-- 
cgit v0.10.2


From dde7e6d12a9ef9f727d05ce824f4fe75ca2a5b3a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:52 +0300
Subject: KVM: x86 emulator: move x86_decode_insn() downwards

No code changes.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 799e895..c6f4359 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -945,917 +945,545 @@ done:
 	return rc;
 }
 
-int
-x86_decode_insn(struct x86_emulate_ctxt *ctxt)
+static int read_emulated(struct x86_emulate_ctxt *ctxt,
+			 struct x86_emulate_ops *ops,
+			 unsigned long addr, void *dest, unsigned size)
 {
-	struct x86_emulate_ops *ops = ctxt->ops;
-	struct decode_cache *c = &ctxt->decode;
-	int rc = X86EMUL_CONTINUE;
-	int mode = ctxt->mode;
-	int def_op_bytes, def_ad_bytes, dual, goffset;
-	struct opcode opcode, *g_mod012, *g_mod3;
+	int rc;
+	struct read_cache *mc = &ctxt->decode.mem_read;
+	u32 err;
 
-	/* we cannot decode insn before we complete previous rep insn */
-	WARN_ON(ctxt->restart);
+	while (size) {
+		int n = min(size, 8u);
+		size -= n;
+		if (mc->pos < mc->end)
+			goto read_cached;
 
-	c->eip = ctxt->eip;
-	c->fetch.start = c->fetch.end = c->eip;
-	ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
+		rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
+					ctxt->vcpu);
+		if (rc == X86EMUL_PROPAGATE_FAULT)
+			emulate_pf(ctxt, addr, err);
+		if (rc != X86EMUL_CONTINUE)
+			return rc;
+		mc->end += n;
 
-	switch (mode) {
-	case X86EMUL_MODE_REAL:
-	case X86EMUL_MODE_VM86:
-	case X86EMUL_MODE_PROT16:
-		def_op_bytes = def_ad_bytes = 2;
-		break;
-	case X86EMUL_MODE_PROT32:
-		def_op_bytes = def_ad_bytes = 4;
-		break;
-#ifdef CONFIG_X86_64
-	case X86EMUL_MODE_PROT64:
-		def_op_bytes = 4;
-		def_ad_bytes = 8;
-		break;
-#endif
-	default:
-		return -1;
+	read_cached:
+		memcpy(dest, mc->data + mc->pos, n);
+		mc->pos += n;
+		dest += n;
+		addr += n;
 	}
+	return X86EMUL_CONTINUE;
+}
 
-	c->op_bytes = def_op_bytes;
-	c->ad_bytes = def_ad_bytes;
-
-	/* Legacy prefixes. */
-	for (;;) {
-		switch (c->b = insn_fetch(u8, 1, c->eip)) {
-		case 0x66:	/* operand-size override */
-			/* switch between 2/4 bytes */
-			c->op_bytes = def_op_bytes ^ 6;
-			break;
-		case 0x67:	/* address-size override */
-			if (mode == X86EMUL_MODE_PROT64)
-				/* switch between 4/8 bytes */
-				c->ad_bytes = def_ad_bytes ^ 12;
-			else
-				/* switch between 2/4 bytes */
-				c->ad_bytes = def_ad_bytes ^ 6;
-			break;
-		case 0x26:	/* ES override */
-		case 0x2e:	/* CS override */
-		case 0x36:	/* SS override */
-		case 0x3e:	/* DS override */
-			set_seg_override(c, (c->b >> 3) & 3);
-			break;
-		case 0x64:	/* FS override */
-		case 0x65:	/* GS override */
-			set_seg_override(c, c->b & 7);
-			break;
-		case 0x40 ... 0x4f: /* REX */
-			if (mode != X86EMUL_MODE_PROT64)
-				goto done_prefixes;
-			c->rex_prefix = c->b;
-			continue;
-		case 0xf0:	/* LOCK */
-			c->lock_prefix = 1;
-			break;
-		case 0xf2:	/* REPNE/REPNZ */
-			c->rep_prefix = REPNE_PREFIX;
-			break;
-		case 0xf3:	/* REP/REPE/REPZ */
-			c->rep_prefix = REPE_PREFIX;
-			break;
-		default:
-			goto done_prefixes;
-		}
-
-		/* Any legacy prefix after a REX prefix nullifies its effect. */
+static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+			   struct x86_emulate_ops *ops,
+			   unsigned int size, unsigned short port,
+			   void *dest)
+{
+	struct read_cache *rc = &ctxt->decode.io_read;
 
-		c->rex_prefix = 0;
+	if (rc->pos == rc->end) { /* refill pio read ahead */
+		struct decode_cache *c = &ctxt->decode;
+		unsigned int in_page, n;
+		unsigned int count = c->rep_prefix ?
+			address_mask(c, c->regs[VCPU_REGS_RCX]) : 1;
+		in_page = (ctxt->eflags & EFLG_DF) ?
+			offset_in_page(c->regs[VCPU_REGS_RDI]) :
+			PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]);
+		n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
+			count);
+		if (n == 0)
+			n = 1;
+		rc->pos = rc->end = 0;
+		if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu))
+			return 0;
+		rc->end = n * size;
 	}
 
-done_prefixes:
+	memcpy(dest, rc->data + rc->pos, size);
+	rc->pos += size;
+	return 1;
+}
 
-	/* REX prefix. */
-	if (c->rex_prefix)
-		if (c->rex_prefix & 8)
-			c->op_bytes = 8;	/* REX.W */
+static u32 desc_limit_scaled(struct desc_struct *desc)
+{
+	u32 limit = get_desc_limit(desc);
 
-	/* Opcode byte(s). */
-	opcode = opcode_table[c->b];
-	if (opcode.flags == 0) {
-		/* Two-byte opcode? */
-		if (c->b == 0x0f) {
-			c->twobyte = 1;
-			c->b = insn_fetch(u8, 1, c->eip);
-			opcode = twobyte_table[c->b];
-		}
-	}
-	c->d = opcode.flags;
+	return desc->g ? (limit << 12) | 0xfff : limit;
+}
 
-	if (c->d & Group) {
-		dual = c->d & GroupDual;
-		c->modrm = insn_fetch(u8, 1, c->eip);
-		--c->eip;
+static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
+				     struct x86_emulate_ops *ops,
+				     u16 selector, struct desc_ptr *dt)
+{
+	if (selector & 1 << 2) {
+		struct desc_struct desc;
+		memset (dt, 0, sizeof *dt);
+		if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu))
+			return;
 
-		if (c->d & GroupDual) {
-			g_mod012 = opcode.u.gdual->mod012;
-			g_mod3 = opcode.u.gdual->mod3;
-		} else
-			g_mod012 = g_mod3 = opcode.u.group;
+		dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
+		dt->address = get_desc_base(&desc);
+	} else
+		ops->get_gdt(dt, ctxt->vcpu);
+}
 
-		c->d &= ~(Group | GroupDual);
+/* allowed just for 8 bytes segments */
+static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+				   struct x86_emulate_ops *ops,
+				   u16 selector, struct desc_struct *desc)
+{
+	struct desc_ptr dt;
+	u16 index = selector >> 3;
+	int ret;
+	u32 err;
+	ulong addr;
 
-		goffset = (c->modrm >> 3) & 7;
+	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
 
-		if ((c->modrm >> 6) == 3)
-			opcode = g_mod3[goffset];
-		else
-			opcode = g_mod012[goffset];
-		c->d |= opcode.flags;
+	if (dt.size < index * 8 + 7) {
+		emulate_gp(ctxt, selector & 0xfffc);
+		return X86EMUL_PROPAGATE_FAULT;
 	}
+	addr = dt.address + index * 8;
+	ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,  &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT)
+		emulate_pf(ctxt, addr, err);
 
-	c->execute = opcode.u.execute;
+       return ret;
+}
 
-	/* Unrecognised? */
-	if (c->d == 0 || (c->d & Undefined)) {
-		DPRINTF("Cannot emulate %02x\n", c->b);
-		return -1;
-	}
+/* allowed just for 8 bytes segments */
+static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+				    struct x86_emulate_ops *ops,
+				    u16 selector, struct desc_struct *desc)
+{
+	struct desc_ptr dt;
+	u16 index = selector >> 3;
+	u32 err;
+	ulong addr;
+	int ret;
 
-	if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
-		c->op_bytes = 8;
+	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
 
-	/* ModRM and SIB bytes. */
-	if (c->d & ModRM)
-		rc = decode_modrm(ctxt, ops);
-	else if (c->d & MemAbs)
-		rc = decode_abs(ctxt, ops);
-	if (rc != X86EMUL_CONTINUE)
-		goto done;
+	if (dt.size < index * 8 + 7) {
+		emulate_gp(ctxt, selector & 0xfffc);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
 
-	if (!c->has_seg_override)
-		set_seg_override(c, VCPU_SREG_DS);
+	addr = dt.address + index * 8;
+	ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT)
+		emulate_pf(ctxt, addr, err);
 
-	if (!(!c->twobyte && c->b == 0x8d))
-		c->modrm_ea += seg_override_base(ctxt, ops, c);
+	return ret;
+}
 
-	if (c->ad_bytes != 8)
-		c->modrm_ea = (u32)c->modrm_ea;
+static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+				   struct x86_emulate_ops *ops,
+				   u16 selector, int seg)
+{
+	struct desc_struct seg_desc;
+	u8 dpl, rpl, cpl;
+	unsigned err_vec = GP_VECTOR;
+	u32 err_code = 0;
+	bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+	int ret;
 
-	if (c->rip_relative)
-		c->modrm_ea += c->eip;
+	memset(&seg_desc, 0, sizeof seg_desc);
 
-	/*
-	 * Decode and fetch the source operand: register, memory
-	 * or immediate.
-	 */
-	switch (c->d & SrcMask) {
-	case SrcNone:
+	if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
+	    || ctxt->mode == X86EMUL_MODE_REAL) {
+		/* set real mode segment descriptor */
+		set_desc_base(&seg_desc, selector << 4);
+		set_desc_limit(&seg_desc, 0xffff);
+		seg_desc.type = 3;
+		seg_desc.p = 1;
+		seg_desc.s = 1;
+		goto load;
+	}
+
+	/* NULL selector is not valid for TR, CS and SS */
+	if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
+	    && null_selector)
+		goto exception;
+
+	/* TR should be in GDT only */
+	if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
+		goto exception;
+
+	if (null_selector) /* for NULL selector skip all following checks */
+		goto load;
+
+	ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+
+	err_code = selector & 0xfffc;
+	err_vec = GP_VECTOR;
+
+	/* can't load system descriptor into segment selecor */
+	if (seg <= VCPU_SREG_GS && !seg_desc.s)
+		goto exception;
+
+	if (!seg_desc.p) {
+		err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+		goto exception;
+	}
+
+	rpl = selector & 3;
+	dpl = seg_desc.dpl;
+	cpl = ops->cpl(ctxt->vcpu);
+
+	switch (seg) {
+	case VCPU_SREG_SS:
+		/*
+		 * segment is not a writable data segment or segment
+		 * selector's RPL != CPL or segment selector's RPL != CPL
+		 */
+		if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
+			goto exception;
 		break;
-	case SrcReg:
-		decode_register_operand(&c->src, c, 0);
+	case VCPU_SREG_CS:
+		if (!(seg_desc.type & 8))
+			goto exception;
+
+		if (seg_desc.type & 4) {
+			/* conforming */
+			if (dpl > cpl)
+				goto exception;
+		} else {
+			/* nonconforming */
+			if (rpl > cpl || dpl != cpl)
+				goto exception;
+		}
+		/* CS(RPL) <- CPL */
+		selector = (selector & 0xfffc) | cpl;
 		break;
-	case SrcMem16:
-		c->src.bytes = 2;
-		goto srcmem_common;
-	case SrcMem32:
-		c->src.bytes = 4;
-		goto srcmem_common;
-	case SrcMem:
-		c->src.bytes = (c->d & ByteOp) ? 1 :
-							   c->op_bytes;
-		/* Don't fetch the address for invlpg: it could be unmapped. */
-		if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
-			break;
-	srcmem_common:
+	case VCPU_SREG_TR:
+		if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
+			goto exception;
+		break;
+	case VCPU_SREG_LDTR:
+		if (seg_desc.s || seg_desc.type != 2)
+			goto exception;
+		break;
+	default: /*  DS, ES, FS, or GS */
 		/*
-		 * For instructions with a ModR/M byte, switch to register
-		 * access if Mod = 3.
+		 * segment is not a data or readable code segment or
+		 * ((segment is a data or nonconforming code segment)
+		 * and (both RPL and CPL > DPL))
 		 */
-		if ((c->d & ModRM) && c->modrm_mod == 3) {
-			c->src.type = OP_REG;
-			c->src.val = c->modrm_val;
-			c->src.ptr = c->modrm_ptr;
-			break;
-		}
-		c->src.type = OP_MEM;
-		c->src.ptr = (unsigned long *)c->modrm_ea;
-		c->src.val = 0;
+		if ((seg_desc.type & 0xa) == 0x8 ||
+		    (((seg_desc.type & 0xc) != 0xc) &&
+		     (rpl > dpl && cpl > dpl)))
+			goto exception;
 		break;
-	case SrcImm:
-	case SrcImmU:
-		c->src.type = OP_IMM;
-		c->src.ptr = (unsigned long *)c->eip;
-		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		if (c->src.bytes == 8)
-			c->src.bytes = 4;
-		/* NB. Immediates are sign-extended as necessary. */
-		switch (c->src.bytes) {
+	}
+
+	if (seg_desc.s) {
+		/* mark segment as accessed */
+		seg_desc.type |= 1;
+		ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc);
+		if (ret != X86EMUL_CONTINUE)
+			return ret;
+	}
+load:
+	ops->set_segment_selector(selector, seg, ctxt->vcpu);
+	ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
+	return X86EMUL_CONTINUE;
+exception:
+	emulate_exception(ctxt, err_vec, err_code, true);
+	return X86EMUL_PROPAGATE_FAULT;
+}
+
+static inline int writeback(struct x86_emulate_ctxt *ctxt,
+			    struct x86_emulate_ops *ops)
+{
+	int rc;
+	struct decode_cache *c = &ctxt->decode;
+	u32 err;
+
+	switch (c->dst.type) {
+	case OP_REG:
+		/* The 4-byte case *is* correct:
+		 * in 64-bit mode we zero-extend.
+		 */
+		switch (c->dst.bytes) {
 		case 1:
-			c->src.val = insn_fetch(s8, 1, c->eip);
+			*(u8 *)c->dst.ptr = (u8)c->dst.val;
 			break;
 		case 2:
-			c->src.val = insn_fetch(s16, 2, c->eip);
+			*(u16 *)c->dst.ptr = (u16)c->dst.val;
 			break;
 		case 4:
-			c->src.val = insn_fetch(s32, 4, c->eip);
+			*c->dst.ptr = (u32)c->dst.val;
+			break;	/* 64b: zero-ext */
+		case 8:
+			*c->dst.ptr = c->dst.val;
 			break;
 		}
-		if ((c->d & SrcMask) == SrcImmU) {
-			switch (c->src.bytes) {
-			case 1:
-				c->src.val &= 0xff;
-				break;
-			case 2:
-				c->src.val &= 0xffff;
-				break;
-			case 4:
-				c->src.val &= 0xffffffff;
-				break;
-			}
-		}
 		break;
-	case SrcImmByte:
-	case SrcImmUByte:
-		c->src.type = OP_IMM;
-		c->src.ptr = (unsigned long *)c->eip;
-		c->src.bytes = 1;
-		if ((c->d & SrcMask) == SrcImmByte)
-			c->src.val = insn_fetch(s8, 1, c->eip);
+	case OP_MEM:
+		if (c->lock_prefix)
+			rc = ops->cmpxchg_emulated(
+					(unsigned long)c->dst.ptr,
+					&c->dst.orig_val,
+					&c->dst.val,
+					c->dst.bytes,
+					&err,
+					ctxt->vcpu);
 		else
-			c->src.val = insn_fetch(u8, 1, c->eip);
-		break;
-	case SrcAcc:
-		c->src.type = OP_REG;
-		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->src.ptr = &c->regs[VCPU_REGS_RAX];
-		switch (c->src.bytes) {
-			case 1:
-				c->src.val = *(u8 *)c->src.ptr;
-				break;
-			case 2:
-				c->src.val = *(u16 *)c->src.ptr;
-				break;
-			case 4:
-				c->src.val = *(u32 *)c->src.ptr;
-				break;
-			case 8:
-				c->src.val = *(u64 *)c->src.ptr;
-				break;
-		}
-		break;
-	case SrcOne:
-		c->src.bytes = 1;
-		c->src.val = 1;
-		break;
-	case SrcSI:
-		c->src.type = OP_MEM;
-		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->src.ptr = (unsigned long *)
-			register_address(c,  seg_override_base(ctxt, ops, c),
-					 c->regs[VCPU_REGS_RSI]);
-		c->src.val = 0;
+			rc = ops->write_emulated(
+					(unsigned long)c->dst.ptr,
+					&c->dst.val,
+					c->dst.bytes,
+					&err,
+					ctxt->vcpu);
+		if (rc == X86EMUL_PROPAGATE_FAULT)
+			emulate_pf(ctxt,
+					      (unsigned long)c->dst.ptr, err);
+		if (rc != X86EMUL_CONTINUE)
+			return rc;
 		break;
-	case SrcImmFAddr:
-		c->src.type = OP_IMM;
-		c->src.ptr = (unsigned long *)c->eip;
-		c->src.bytes = c->op_bytes + 2;
-		insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
+	case OP_NONE:
+		/* no writeback */
 		break;
-	case SrcMemFAddr:
-		c->src.type = OP_MEM;
-		c->src.ptr = (unsigned long *)c->modrm_ea;
-		c->src.bytes = c->op_bytes + 2;
+	default:
 		break;
 	}
+	return X86EMUL_CONTINUE;
+}
 
-	/*
-	 * Decode and fetch the second source operand: register, memory
-	 * or immediate.
-	 */
-	switch (c->d & Src2Mask) {
-	case Src2None:
-		break;
-	case Src2CL:
-		c->src2.bytes = 1;
-		c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
-		break;
-	case Src2ImmByte:
-		c->src2.type = OP_IMM;
-		c->src2.ptr = (unsigned long *)c->eip;
-		c->src2.bytes = 1;
-		c->src2.val = insn_fetch(u8, 1, c->eip);
-		break;
-	case Src2One:
-		c->src2.bytes = 1;
-		c->src2.val = 1;
-		break;
-	}
+static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
+				struct x86_emulate_ops *ops)
+{
+	struct decode_cache *c = &ctxt->decode;
 
-	/* Decode and fetch the destination operand: register or memory. */
-	switch (c->d & DstMask) {
-	case ImplicitOps:
-		/* Special instructions do their own operand decoding. */
-		return 0;
-	case DstReg:
-		decode_register_operand(&c->dst, c,
-			 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
-		break;
-	case DstMem:
-	case DstMem64:
-		if ((c->d & ModRM) && c->modrm_mod == 3) {
-			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-			c->dst.type = OP_REG;
-			c->dst.val = c->dst.orig_val = c->modrm_val;
-			c->dst.ptr = c->modrm_ptr;
-			break;
-		}
-		c->dst.type = OP_MEM;
-		c->dst.ptr = (unsigned long *)c->modrm_ea;
-		if ((c->d & DstMask) == DstMem64)
-			c->dst.bytes = 8;
-		else
-			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.val = 0;
-		if (c->d & BitOp) {
-			unsigned long mask = ~(c->dst.bytes * 8 - 1);
+	c->dst.type  = OP_MEM;
+	c->dst.bytes = c->op_bytes;
+	c->dst.val = c->src.val;
+	register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
+	c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops),
+					       c->regs[VCPU_REGS_RSP]);
+}
 
-			c->dst.ptr = (void *)c->dst.ptr +
-						   (c->src.val & mask) / 8;
-		}
-		break;
-	case DstAcc:
-		c->dst.type = OP_REG;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.ptr = &c->regs[VCPU_REGS_RAX];
-		switch (c->dst.bytes) {
-			case 1:
-				c->dst.val = *(u8 *)c->dst.ptr;
-				break;
-			case 2:
-				c->dst.val = *(u16 *)c->dst.ptr;
-				break;
-			case 4:
-				c->dst.val = *(u32 *)c->dst.ptr;
-				break;
-			case 8:
-				c->dst.val = *(u64 *)c->dst.ptr;
-				break;
-		}
-		c->dst.orig_val = c->dst.val;
-		break;
-	case DstDI:
-		c->dst.type = OP_MEM;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.ptr = (unsigned long *)
-			register_address(c, es_base(ctxt, ops),
-					 c->regs[VCPU_REGS_RDI]);
-		c->dst.val = 0;
-		break;
-	}
+static int emulate_pop(struct x86_emulate_ctxt *ctxt,
+		       struct x86_emulate_ops *ops,
+		       void *dest, int len)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int rc;
 
-done:
-	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+	rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops),
+						       c->regs[VCPU_REGS_RSP]),
+			   dest, len);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
+	return rc;
 }
 
-static int read_emulated(struct x86_emulate_ctxt *ctxt,
-			 struct x86_emulate_ops *ops,
-			 unsigned long addr, void *dest, unsigned size)
+static int emulate_popf(struct x86_emulate_ctxt *ctxt,
+		       struct x86_emulate_ops *ops,
+		       void *dest, int len)
 {
 	int rc;
-	struct read_cache *mc = &ctxt->decode.mem_read;
-	u32 err;
+	unsigned long val, change_mask;
+	int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+	int cpl = ops->cpl(ctxt->vcpu);
 
-	while (size) {
-		int n = min(size, 8u);
-		size -= n;
-		if (mc->pos < mc->end)
-			goto read_cached;
+	rc = emulate_pop(ctxt, ops, &val, len);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
 
-		rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
-					ctxt->vcpu);
-		if (rc == X86EMUL_PROPAGATE_FAULT)
-			emulate_pf(ctxt, addr, err);
-		if (rc != X86EMUL_CONTINUE)
-			return rc;
-		mc->end += n;
+	change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF
+		| EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID;
 
-	read_cached:
-		memcpy(dest, mc->data + mc->pos, n);
-		mc->pos += n;
-		dest += n;
-		addr += n;
+	switch(ctxt->mode) {
+	case X86EMUL_MODE_PROT64:
+	case X86EMUL_MODE_PROT32:
+	case X86EMUL_MODE_PROT16:
+		if (cpl == 0)
+			change_mask |= EFLG_IOPL;
+		if (cpl <= iopl)
+			change_mask |= EFLG_IF;
+		break;
+	case X86EMUL_MODE_VM86:
+		if (iopl < 3) {
+			emulate_gp(ctxt, 0);
+			return X86EMUL_PROPAGATE_FAULT;
+		}
+		change_mask |= EFLG_IF;
+		break;
+	default: /* real mode */
+		change_mask |= (EFLG_IOPL | EFLG_IF);
+		break;
 	}
-	return X86EMUL_CONTINUE;
+
+	*(unsigned long *)dest =
+		(ctxt->eflags & ~change_mask) | (val & change_mask);
+
+	return rc;
 }
 
-static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
-			   struct x86_emulate_ops *ops,
-			   unsigned int size, unsigned short port,
-			   void *dest)
+static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
+			      struct x86_emulate_ops *ops, int seg)
 {
-	struct read_cache *rc = &ctxt->decode.io_read;
+	struct decode_cache *c = &ctxt->decode;
 
-	if (rc->pos == rc->end) { /* refill pio read ahead */
-		struct decode_cache *c = &ctxt->decode;
-		unsigned int in_page, n;
-		unsigned int count = c->rep_prefix ?
-			address_mask(c, c->regs[VCPU_REGS_RCX]) : 1;
-		in_page = (ctxt->eflags & EFLG_DF) ?
-			offset_in_page(c->regs[VCPU_REGS_RDI]) :
-			PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]);
-		n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
-			count);
-		if (n == 0)
-			n = 1;
-		rc->pos = rc->end = 0;
-		if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu))
-			return 0;
-		rc->end = n * size;
-	}
+	c->src.val = ops->get_segment_selector(seg, ctxt->vcpu);
 
-	memcpy(dest, rc->data + rc->pos, size);
-	rc->pos += size;
-	return 1;
+	emulate_push(ctxt, ops);
 }
 
-static u32 desc_limit_scaled(struct desc_struct *desc)
+static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
+			     struct x86_emulate_ops *ops, int seg)
 {
-	u32 limit = get_desc_limit(desc);
+	struct decode_cache *c = &ctxt->decode;
+	unsigned long selector;
+	int rc;
 
-	return desc->g ? (limit << 12) | 0xfff : limit;
+	rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg);
+	return rc;
 }
 
-static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
-				     struct x86_emulate_ops *ops,
-				     u16 selector, struct desc_ptr *dt)
+static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
+			  struct x86_emulate_ops *ops)
 {
-	if (selector & 1 << 2) {
-		struct desc_struct desc;
-		memset (dt, 0, sizeof *dt);
-		if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu))
-			return;
+	struct decode_cache *c = &ctxt->decode;
+	unsigned long old_esp = c->regs[VCPU_REGS_RSP];
+	int rc = X86EMUL_CONTINUE;
+	int reg = VCPU_REGS_RAX;
 
-		dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
-		dt->address = get_desc_base(&desc);
-	} else
-		ops->get_gdt(dt, ctxt->vcpu);
-}
+	while (reg <= VCPU_REGS_RDI) {
+		(reg == VCPU_REGS_RSP) ?
+		(c->src.val = old_esp) : (c->src.val = c->regs[reg]);
 
-/* allowed just for 8 bytes segments */
-static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-				   struct x86_emulate_ops *ops,
-				   u16 selector, struct desc_struct *desc)
-{
-	struct desc_ptr dt;
-	u16 index = selector >> 3;
-	int ret;
-	u32 err;
-	ulong addr;
+		emulate_push(ctxt, ops);
 
-	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+		rc = writeback(ctxt, ops);
+		if (rc != X86EMUL_CONTINUE)
+			return rc;
 
-	if (dt.size < index * 8 + 7) {
-		emulate_gp(ctxt, selector & 0xfffc);
-		return X86EMUL_PROPAGATE_FAULT;
+		++reg;
 	}
-	addr = dt.address + index * 8;
-	ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,  &err);
-	if (ret == X86EMUL_PROPAGATE_FAULT)
-		emulate_pf(ctxt, addr, err);
 
-       return ret;
+	/* Disable writeback. */
+	c->dst.type = OP_NONE;
+
+	return rc;
 }
 
-/* allowed just for 8 bytes segments */
-static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-				    struct x86_emulate_ops *ops,
-				    u16 selector, struct desc_struct *desc)
+static int emulate_popa(struct x86_emulate_ctxt *ctxt,
+			struct x86_emulate_ops *ops)
 {
-	struct desc_ptr dt;
-	u16 index = selector >> 3;
-	u32 err;
-	ulong addr;
-	int ret;
+	struct decode_cache *c = &ctxt->decode;
+	int rc = X86EMUL_CONTINUE;
+	int reg = VCPU_REGS_RDI;
 
-	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+	while (reg >= VCPU_REGS_RAX) {
+		if (reg == VCPU_REGS_RSP) {
+			register_address_increment(c, &c->regs[VCPU_REGS_RSP],
+							c->op_bytes);
+			--reg;
+		}
 
-	if (dt.size < index * 8 + 7) {
-		emulate_gp(ctxt, selector & 0xfffc);
-		return X86EMUL_PROPAGATE_FAULT;
+		rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
+		if (rc != X86EMUL_CONTINUE)
+			break;
+		--reg;
 	}
-
-	addr = dt.address + index * 8;
-	ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
-	if (ret == X86EMUL_PROPAGATE_FAULT)
-		emulate_pf(ctxt, addr, err);
-
-	return ret;
+	return rc;
 }
 
-static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-				   struct x86_emulate_ops *ops,
-				   u16 selector, int seg)
+static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
+			     struct x86_emulate_ops *ops)
 {
-	struct desc_struct seg_desc;
-	u8 dpl, rpl, cpl;
-	unsigned err_vec = GP_VECTOR;
-	u32 err_code = 0;
-	bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
-	int ret;
+	struct decode_cache *c = &ctxt->decode;
+	int rc = X86EMUL_CONTINUE;
+	unsigned long temp_eip = 0;
+	unsigned long temp_eflags = 0;
+	unsigned long cs = 0;
+	unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF |
+			     EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF |
+			     EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */
+	unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP;
 
-	memset(&seg_desc, 0, sizeof seg_desc);
+	/* TODO: Add stack limit check */
 
-	if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
-	    || ctxt->mode == X86EMUL_MODE_REAL) {
-		/* set real mode segment descriptor */
-		set_desc_base(&seg_desc, selector << 4);
-		set_desc_limit(&seg_desc, 0xffff);
-		seg_desc.type = 3;
-		seg_desc.p = 1;
-		seg_desc.s = 1;
-		goto load;
-	}
+	rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes);
 
-	/* NULL selector is not valid for TR, CS and SS */
-	if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
-	    && null_selector)
-		goto exception;
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
 
-	/* TR should be in GDT only */
-	if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
-		goto exception;
+	if (temp_eip & ~0xffff) {
+		emulate_gp(ctxt, 0);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
 
-	if (null_selector) /* for NULL selector skip all following checks */
-		goto load;
+	rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
 
-	ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc);
-	if (ret != X86EMUL_CONTINUE)
-		return ret;
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
 
-	err_code = selector & 0xfffc;
-	err_vec = GP_VECTOR;
+	rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes);
 
-	/* can't load system descriptor into segment selecor */
-	if (seg <= VCPU_SREG_GS && !seg_desc.s)
-		goto exception;
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
 
-	if (!seg_desc.p) {
-		err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
-		goto exception;
-	}
+	rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
 
-	rpl = selector & 3;
-	dpl = seg_desc.dpl;
-	cpl = ops->cpl(ctxt->vcpu);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
 
-	switch (seg) {
-	case VCPU_SREG_SS:
-		/*
-		 * segment is not a writable data segment or segment
-		 * selector's RPL != CPL or segment selector's RPL != CPL
-		 */
-		if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
-			goto exception;
-		break;
-	case VCPU_SREG_CS:
-		if (!(seg_desc.type & 8))
-			goto exception;
+	c->eip = temp_eip;
 
-		if (seg_desc.type & 4) {
-			/* conforming */
-			if (dpl > cpl)
-				goto exception;
-		} else {
-			/* nonconforming */
-			if (rpl > cpl || dpl != cpl)
-				goto exception;
-		}
-		/* CS(RPL) <- CPL */
-		selector = (selector & 0xfffc) | cpl;
-		break;
-	case VCPU_SREG_TR:
-		if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
-			goto exception;
-		break;
-	case VCPU_SREG_LDTR:
-		if (seg_desc.s || seg_desc.type != 2)
-			goto exception;
-		break;
-	default: /*  DS, ES, FS, or GS */
-		/*
-		 * segment is not a data or readable code segment or
-		 * ((segment is a data or nonconforming code segment)
-		 * and (both RPL and CPL > DPL))
-		 */
-		if ((seg_desc.type & 0xa) == 0x8 ||
-		    (((seg_desc.type & 0xc) != 0xc) &&
-		     (rpl > dpl && cpl > dpl)))
-			goto exception;
-		break;
-	}
 
-	if (seg_desc.s) {
-		/* mark segment as accessed */
-		seg_desc.type |= 1;
-		ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc);
-		if (ret != X86EMUL_CONTINUE)
-			return ret;
+	if (c->op_bytes == 4)
+		ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
+	else if (c->op_bytes == 2) {
+		ctxt->eflags &= ~0xffff;
+		ctxt->eflags |= temp_eflags;
 	}
-load:
-	ops->set_segment_selector(selector, seg, ctxt->vcpu);
-	ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
-	return X86EMUL_CONTINUE;
-exception:
-	emulate_exception(ctxt, err_vec, err_code, true);
-	return X86EMUL_PROPAGATE_FAULT;
+
+	ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
+	ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
+
+	return rc;
 }
 
-static inline int writeback(struct x86_emulate_ctxt *ctxt,
-			    struct x86_emulate_ops *ops)
+static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
+				    struct x86_emulate_ops* ops)
 {
-	int rc;
-	struct decode_cache *c = &ctxt->decode;
-	u32 err;
-
-	switch (c->dst.type) {
-	case OP_REG:
-		/* The 4-byte case *is* correct:
-		 * in 64-bit mode we zero-extend.
-		 */
-		switch (c->dst.bytes) {
-		case 1:
-			*(u8 *)c->dst.ptr = (u8)c->dst.val;
-			break;
-		case 2:
-			*(u16 *)c->dst.ptr = (u16)c->dst.val;
-			break;
-		case 4:
-			*c->dst.ptr = (u32)c->dst.val;
-			break;	/* 64b: zero-ext */
-		case 8:
-			*c->dst.ptr = c->dst.val;
-			break;
-		}
-		break;
-	case OP_MEM:
-		if (c->lock_prefix)
-			rc = ops->cmpxchg_emulated(
-					(unsigned long)c->dst.ptr,
-					&c->dst.orig_val,
-					&c->dst.val,
-					c->dst.bytes,
-					&err,
-					ctxt->vcpu);
-		else
-			rc = ops->write_emulated(
-					(unsigned long)c->dst.ptr,
-					&c->dst.val,
-					c->dst.bytes,
-					&err,
-					ctxt->vcpu);
-		if (rc == X86EMUL_PROPAGATE_FAULT)
-			emulate_pf(ctxt,
-					      (unsigned long)c->dst.ptr, err);
-		if (rc != X86EMUL_CONTINUE)
-			return rc;
-		break;
-	case OP_NONE:
-		/* no writeback */
-		break;
+	switch(ctxt->mode) {
+	case X86EMUL_MODE_REAL:
+		return emulate_iret_real(ctxt, ops);
+	case X86EMUL_MODE_VM86:
+	case X86EMUL_MODE_PROT16:
+	case X86EMUL_MODE_PROT32:
+	case X86EMUL_MODE_PROT64:
 	default:
-		break;
+		/* iret from protected mode unimplemented yet */
+		return X86EMUL_UNHANDLEABLE;
 	}
-	return X86EMUL_CONTINUE;
 }
 
-static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
+static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
 				struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
 
-	c->dst.type  = OP_MEM;
-	c->dst.bytes = c->op_bytes;
-	c->dst.val = c->src.val;
-	register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
-	c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops),
-					       c->regs[VCPU_REGS_RSP]);
+	return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
 }
 
-static int emulate_pop(struct x86_emulate_ctxt *ctxt,
-		       struct x86_emulate_ops *ops,
-		       void *dest, int len)
-{
-	struct decode_cache *c = &ctxt->decode;
-	int rc;
-
-	rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops),
-						       c->regs[VCPU_REGS_RSP]),
-			   dest, len);
-	if (rc != X86EMUL_CONTINUE)
-		return rc;
-
-	register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
-	return rc;
-}
-
-static int emulate_popf(struct x86_emulate_ctxt *ctxt,
-		       struct x86_emulate_ops *ops,
-		       void *dest, int len)
-{
-	int rc;
-	unsigned long val, change_mask;
-	int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
-	int cpl = ops->cpl(ctxt->vcpu);
-
-	rc = emulate_pop(ctxt, ops, &val, len);
-	if (rc != X86EMUL_CONTINUE)
-		return rc;
-
-	change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF
-		| EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID;
-
-	switch(ctxt->mode) {
-	case X86EMUL_MODE_PROT64:
-	case X86EMUL_MODE_PROT32:
-	case X86EMUL_MODE_PROT16:
-		if (cpl == 0)
-			change_mask |= EFLG_IOPL;
-		if (cpl <= iopl)
-			change_mask |= EFLG_IF;
-		break;
-	case X86EMUL_MODE_VM86:
-		if (iopl < 3) {
-			emulate_gp(ctxt, 0);
-			return X86EMUL_PROPAGATE_FAULT;
-		}
-		change_mask |= EFLG_IF;
-		break;
-	default: /* real mode */
-		change_mask |= (EFLG_IOPL | EFLG_IF);
-		break;
-	}
-
-	*(unsigned long *)dest =
-		(ctxt->eflags & ~change_mask) | (val & change_mask);
-
-	return rc;
-}
-
-static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
-			      struct x86_emulate_ops *ops, int seg)
-{
-	struct decode_cache *c = &ctxt->decode;
-
-	c->src.val = ops->get_segment_selector(seg, ctxt->vcpu);
-
-	emulate_push(ctxt, ops);
-}
-
-static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
-			     struct x86_emulate_ops *ops, int seg)
-{
-	struct decode_cache *c = &ctxt->decode;
-	unsigned long selector;
-	int rc;
-
-	rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
-	if (rc != X86EMUL_CONTINUE)
-		return rc;
-
-	rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg);
-	return rc;
-}
-
-static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
-			  struct x86_emulate_ops *ops)
-{
-	struct decode_cache *c = &ctxt->decode;
-	unsigned long old_esp = c->regs[VCPU_REGS_RSP];
-	int rc = X86EMUL_CONTINUE;
-	int reg = VCPU_REGS_RAX;
-
-	while (reg <= VCPU_REGS_RDI) {
-		(reg == VCPU_REGS_RSP) ?
-		(c->src.val = old_esp) : (c->src.val = c->regs[reg]);
-
-		emulate_push(ctxt, ops);
-
-		rc = writeback(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			return rc;
-
-		++reg;
-	}
-
-	/* Disable writeback. */
-	c->dst.type = OP_NONE;
-
-	return rc;
-}
-
-static int emulate_popa(struct x86_emulate_ctxt *ctxt,
-			struct x86_emulate_ops *ops)
-{
-	struct decode_cache *c = &ctxt->decode;
-	int rc = X86EMUL_CONTINUE;
-	int reg = VCPU_REGS_RDI;
-
-	while (reg >= VCPU_REGS_RAX) {
-		if (reg == VCPU_REGS_RSP) {
-			register_address_increment(c, &c->regs[VCPU_REGS_RSP],
-							c->op_bytes);
-			--reg;
-		}
-
-		rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
-		if (rc != X86EMUL_CONTINUE)
-			break;
-		--reg;
-	}
-	return rc;
-}
-
-static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
-			     struct x86_emulate_ops *ops)
-{
-	struct decode_cache *c = &ctxt->decode;
-	int rc = X86EMUL_CONTINUE;
-	unsigned long temp_eip = 0;
-	unsigned long temp_eflags = 0;
-	unsigned long cs = 0;
-	unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF |
-			     EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF |
-			     EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */
-	unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP;
-
-	/* TODO: Add stack limit check */
-
-	rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes);
-
-	if (rc != X86EMUL_CONTINUE)
-		return rc;
-
-	if (temp_eip & ~0xffff) {
-		emulate_gp(ctxt, 0);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
-
-	rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
-
-	if (rc != X86EMUL_CONTINUE)
-		return rc;
-
-	rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes);
-
-	if (rc != X86EMUL_CONTINUE)
-		return rc;
-
-	rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
-
-	if (rc != X86EMUL_CONTINUE)
-		return rc;
-
-	c->eip = temp_eip;
-
-
-	if (c->op_bytes == 4)
-		ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
-	else if (c->op_bytes == 2) {
-		ctxt->eflags &= ~0xffff;
-		ctxt->eflags |= temp_eflags;
-	}
-
-	ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
-	ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
-
-	return rc;
-}
-
-static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
-				    struct x86_emulate_ops* ops)
-{
-	switch(ctxt->mode) {
-	case X86EMUL_MODE_REAL:
-		return emulate_iret_real(ctxt, ops);
-	case X86EMUL_MODE_VM86:
-	case X86EMUL_MODE_PROT16:
-	case X86EMUL_MODE_PROT32:
-	case X86EMUL_MODE_PROT64:
-	default:
-		/* iret from protected mode unimplemented yet */
-		return X86EMUL_UNHANDLEABLE;
-	}
-}
-
-static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
-				struct x86_emulate_ops *ops)
-{
-	struct decode_cache *c = &ctxt->decode;
-
-	return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
-}
-
-static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
+static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
 {
 	struct decode_cache *c = &ctxt->decode;
 	switch (c->modrm_reg) {
@@ -2625,6 +2253,378 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
 }
 
 int
+x86_decode_insn(struct x86_emulate_ctxt *ctxt)
+{
+	struct x86_emulate_ops *ops = ctxt->ops;
+	struct decode_cache *c = &ctxt->decode;
+	int rc = X86EMUL_CONTINUE;
+	int mode = ctxt->mode;
+	int def_op_bytes, def_ad_bytes, dual, goffset;
+	struct opcode opcode, *g_mod012, *g_mod3;
+
+	/* we cannot decode insn before we complete previous rep insn */
+	WARN_ON(ctxt->restart);
+
+	c->eip = ctxt->eip;
+	c->fetch.start = c->fetch.end = c->eip;
+	ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
+
+	switch (mode) {
+	case X86EMUL_MODE_REAL:
+	case X86EMUL_MODE_VM86:
+	case X86EMUL_MODE_PROT16:
+		def_op_bytes = def_ad_bytes = 2;
+		break;
+	case X86EMUL_MODE_PROT32:
+		def_op_bytes = def_ad_bytes = 4;
+		break;
+#ifdef CONFIG_X86_64
+	case X86EMUL_MODE_PROT64:
+		def_op_bytes = 4;
+		def_ad_bytes = 8;
+		break;
+#endif
+	default:
+		return -1;
+	}
+
+	c->op_bytes = def_op_bytes;
+	c->ad_bytes = def_ad_bytes;
+
+	/* Legacy prefixes. */
+	for (;;) {
+		switch (c->b = insn_fetch(u8, 1, c->eip)) {
+		case 0x66:	/* operand-size override */
+			/* switch between 2/4 bytes */
+			c->op_bytes = def_op_bytes ^ 6;
+			break;
+		case 0x67:	/* address-size override */
+			if (mode == X86EMUL_MODE_PROT64)
+				/* switch between 4/8 bytes */
+				c->ad_bytes = def_ad_bytes ^ 12;
+			else
+				/* switch between 2/4 bytes */
+				c->ad_bytes = def_ad_bytes ^ 6;
+			break;
+		case 0x26:	/* ES override */
+		case 0x2e:	/* CS override */
+		case 0x36:	/* SS override */
+		case 0x3e:	/* DS override */
+			set_seg_override(c, (c->b >> 3) & 3);
+			break;
+		case 0x64:	/* FS override */
+		case 0x65:	/* GS override */
+			set_seg_override(c, c->b & 7);
+			break;
+		case 0x40 ... 0x4f: /* REX */
+			if (mode != X86EMUL_MODE_PROT64)
+				goto done_prefixes;
+			c->rex_prefix = c->b;
+			continue;
+		case 0xf0:	/* LOCK */
+			c->lock_prefix = 1;
+			break;
+		case 0xf2:	/* REPNE/REPNZ */
+			c->rep_prefix = REPNE_PREFIX;
+			break;
+		case 0xf3:	/* REP/REPE/REPZ */
+			c->rep_prefix = REPE_PREFIX;
+			break;
+		default:
+			goto done_prefixes;
+		}
+
+		/* Any legacy prefix after a REX prefix nullifies its effect. */
+
+		c->rex_prefix = 0;
+	}
+
+done_prefixes:
+
+	/* REX prefix. */
+	if (c->rex_prefix)
+		if (c->rex_prefix & 8)
+			c->op_bytes = 8;	/* REX.W */
+
+	/* Opcode byte(s). */
+	opcode = opcode_table[c->b];
+	if (opcode.flags == 0) {
+		/* Two-byte opcode? */
+		if (c->b == 0x0f) {
+			c->twobyte = 1;
+			c->b = insn_fetch(u8, 1, c->eip);
+			opcode = twobyte_table[c->b];
+		}
+	}
+	c->d = opcode.flags;
+
+	if (c->d & Group) {
+		dual = c->d & GroupDual;
+		c->modrm = insn_fetch(u8, 1, c->eip);
+		--c->eip;
+
+		if (c->d & GroupDual) {
+			g_mod012 = opcode.u.gdual->mod012;
+			g_mod3 = opcode.u.gdual->mod3;
+		} else
+			g_mod012 = g_mod3 = opcode.u.group;
+
+		c->d &= ~(Group | GroupDual);
+
+		goffset = (c->modrm >> 3) & 7;
+
+		if ((c->modrm >> 6) == 3)
+			opcode = g_mod3[goffset];
+		else
+			opcode = g_mod012[goffset];
+		c->d |= opcode.flags;
+	}
+
+	c->execute = opcode.u.execute;
+
+	/* Unrecognised? */
+	if (c->d == 0 || (c->d & Undefined)) {
+		DPRINTF("Cannot emulate %02x\n", c->b);
+		return -1;
+	}
+
+	if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
+		c->op_bytes = 8;
+
+	/* ModRM and SIB bytes. */
+	if (c->d & ModRM)
+		rc = decode_modrm(ctxt, ops);
+	else if (c->d & MemAbs)
+		rc = decode_abs(ctxt, ops);
+	if (rc != X86EMUL_CONTINUE)
+		goto done;
+
+	if (!c->has_seg_override)
+		set_seg_override(c, VCPU_SREG_DS);
+
+	if (!(!c->twobyte && c->b == 0x8d))
+		c->modrm_ea += seg_override_base(ctxt, ops, c);
+
+	if (c->ad_bytes != 8)
+		c->modrm_ea = (u32)c->modrm_ea;
+
+	if (c->rip_relative)
+		c->modrm_ea += c->eip;
+
+	/*
+	 * Decode and fetch the source operand: register, memory
+	 * or immediate.
+	 */
+	switch (c->d & SrcMask) {
+	case SrcNone:
+		break;
+	case SrcReg:
+		decode_register_operand(&c->src, c, 0);
+		break;
+	case SrcMem16:
+		c->src.bytes = 2;
+		goto srcmem_common;
+	case SrcMem32:
+		c->src.bytes = 4;
+		goto srcmem_common;
+	case SrcMem:
+		c->src.bytes = (c->d & ByteOp) ? 1 :
+							   c->op_bytes;
+		/* Don't fetch the address for invlpg: it could be unmapped. */
+		if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
+			break;
+	srcmem_common:
+		/*
+		 * For instructions with a ModR/M byte, switch to register
+		 * access if Mod = 3.
+		 */
+		if ((c->d & ModRM) && c->modrm_mod == 3) {
+			c->src.type = OP_REG;
+			c->src.val = c->modrm_val;
+			c->src.ptr = c->modrm_ptr;
+			break;
+		}
+		c->src.type = OP_MEM;
+		c->src.ptr = (unsigned long *)c->modrm_ea;
+		c->src.val = 0;
+		break;
+	case SrcImm:
+	case SrcImmU:
+		c->src.type = OP_IMM;
+		c->src.ptr = (unsigned long *)c->eip;
+		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		if (c->src.bytes == 8)
+			c->src.bytes = 4;
+		/* NB. Immediates are sign-extended as necessary. */
+		switch (c->src.bytes) {
+		case 1:
+			c->src.val = insn_fetch(s8, 1, c->eip);
+			break;
+		case 2:
+			c->src.val = insn_fetch(s16, 2, c->eip);
+			break;
+		case 4:
+			c->src.val = insn_fetch(s32, 4, c->eip);
+			break;
+		}
+		if ((c->d & SrcMask) == SrcImmU) {
+			switch (c->src.bytes) {
+			case 1:
+				c->src.val &= 0xff;
+				break;
+			case 2:
+				c->src.val &= 0xffff;
+				break;
+			case 4:
+				c->src.val &= 0xffffffff;
+				break;
+			}
+		}
+		break;
+	case SrcImmByte:
+	case SrcImmUByte:
+		c->src.type = OP_IMM;
+		c->src.ptr = (unsigned long *)c->eip;
+		c->src.bytes = 1;
+		if ((c->d & SrcMask) == SrcImmByte)
+			c->src.val = insn_fetch(s8, 1, c->eip);
+		else
+			c->src.val = insn_fetch(u8, 1, c->eip);
+		break;
+	case SrcAcc:
+		c->src.type = OP_REG;
+		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->src.ptr = &c->regs[VCPU_REGS_RAX];
+		switch (c->src.bytes) {
+			case 1:
+				c->src.val = *(u8 *)c->src.ptr;
+				break;
+			case 2:
+				c->src.val = *(u16 *)c->src.ptr;
+				break;
+			case 4:
+				c->src.val = *(u32 *)c->src.ptr;
+				break;
+			case 8:
+				c->src.val = *(u64 *)c->src.ptr;
+				break;
+		}
+		break;
+	case SrcOne:
+		c->src.bytes = 1;
+		c->src.val = 1;
+		break;
+	case SrcSI:
+		c->src.type = OP_MEM;
+		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->src.ptr = (unsigned long *)
+			register_address(c,  seg_override_base(ctxt, ops, c),
+					 c->regs[VCPU_REGS_RSI]);
+		c->src.val = 0;
+		break;
+	case SrcImmFAddr:
+		c->src.type = OP_IMM;
+		c->src.ptr = (unsigned long *)c->eip;
+		c->src.bytes = c->op_bytes + 2;
+		insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
+		break;
+	case SrcMemFAddr:
+		c->src.type = OP_MEM;
+		c->src.ptr = (unsigned long *)c->modrm_ea;
+		c->src.bytes = c->op_bytes + 2;
+		break;
+	}
+
+	/*
+	 * Decode and fetch the second source operand: register, memory
+	 * or immediate.
+	 */
+	switch (c->d & Src2Mask) {
+	case Src2None:
+		break;
+	case Src2CL:
+		c->src2.bytes = 1;
+		c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
+		break;
+	case Src2ImmByte:
+		c->src2.type = OP_IMM;
+		c->src2.ptr = (unsigned long *)c->eip;
+		c->src2.bytes = 1;
+		c->src2.val = insn_fetch(u8, 1, c->eip);
+		break;
+	case Src2One:
+		c->src2.bytes = 1;
+		c->src2.val = 1;
+		break;
+	}
+
+	/* Decode and fetch the destination operand: register or memory. */
+	switch (c->d & DstMask) {
+	case ImplicitOps:
+		/* Special instructions do their own operand decoding. */
+		return 0;
+	case DstReg:
+		decode_register_operand(&c->dst, c,
+			 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
+		break;
+	case DstMem:
+	case DstMem64:
+		if ((c->d & ModRM) && c->modrm_mod == 3) {
+			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+			c->dst.type = OP_REG;
+			c->dst.val = c->dst.orig_val = c->modrm_val;
+			c->dst.ptr = c->modrm_ptr;
+			break;
+		}
+		c->dst.type = OP_MEM;
+		c->dst.ptr = (unsigned long *)c->modrm_ea;
+		if ((c->d & DstMask) == DstMem64)
+			c->dst.bytes = 8;
+		else
+			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->dst.val = 0;
+		if (c->d & BitOp) {
+			unsigned long mask = ~(c->dst.bytes * 8 - 1);
+
+			c->dst.ptr = (void *)c->dst.ptr +
+						   (c->src.val & mask) / 8;
+		}
+		break;
+	case DstAcc:
+		c->dst.type = OP_REG;
+		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+		switch (c->dst.bytes) {
+			case 1:
+				c->dst.val = *(u8 *)c->dst.ptr;
+				break;
+			case 2:
+				c->dst.val = *(u16 *)c->dst.ptr;
+				break;
+			case 4:
+				c->dst.val = *(u32 *)c->dst.ptr;
+				break;
+			case 8:
+				c->dst.val = *(u64 *)c->dst.ptr;
+				break;
+		}
+		c->dst.orig_val = c->dst.val;
+		break;
+	case DstDI:
+		c->dst.type = OP_MEM;
+		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->dst.ptr = (unsigned long *)
+			register_address(c, es_base(ctxt, ops),
+					 c->regs[VCPU_REGS_RDI]);
+		c->dst.val = 0;
+		break;
+	}
+
+done:
+	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+}
+
+int
 x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 {
 	struct x86_emulate_ops *ops = ctxt->ops;
-- 
cgit v0.10.2


From 73fba5f4fe3e08bd7acb18a65b53643445c8f028 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:53 +0300
Subject: KVM: x86 emulator: move decode tables downwards

So they can reference execution functions.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c6f4359..70a7cb4 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -117,242 +117,6 @@ struct group_dual {
 	struct opcode mod3[8];
 };
 
-#define D(_y) { .flags = (_y) }
-#define N    D(0)
-#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
-#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) }
-#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
-
-static struct opcode group1[] = {
-	X7(D(Lock)), N
-};
-
-static struct opcode group1A[] = {
-	D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N,
-};
-
-static struct opcode group3[] = {
-	D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM),
-	D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
-	X4(D(Undefined)),
-};
-
-static struct opcode group4[] = {
-	D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock),
-	N, N, N, N, N, N,
-};
-
-static struct opcode group5[] = {
-	D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
-	D(SrcMem | ModRM | Stack), N,
-	D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps),
-	D(SrcMem | ModRM | Stack), N,
-};
-
-static struct group_dual group7 = { {
-	N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv),
-	D(SrcNone | ModRM | DstMem | Mov), N,
-	D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv),
-}, {
-	D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv),
-	D(SrcNone | ModRM | DstMem | Mov), N,
-	D(SrcMem16 | ModRM | Mov | Priv), N,
-} };
-
-static struct opcode group8[] = {
-	N, N, N, N,
-	D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock),
-	D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock),
-};
-
-static struct group_dual group9 = { {
-	N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N,
-}, {
-	N, N, N, N, N, N, N, N,
-} };
-
-static struct opcode opcode_table[256] = {
-	/* 0x00 - 0x07 */
-	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
-	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
-	D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm),
-	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
-	/* 0x08 - 0x0F */
-	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
-	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
-	D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm),
-	D(ImplicitOps | Stack | No64), N,
-	/* 0x10 - 0x17 */
-	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
-	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
-	D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm),
-	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
-	/* 0x18 - 0x1F */
-	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
-	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
-	D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm),
-	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
-	/* 0x20 - 0x27 */
-	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
-	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
-	D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N,
-	/* 0x28 - 0x2F */
-	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
-	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
-	D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N,
-	/* 0x30 - 0x37 */
-	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
-	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
-	D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N,
-	/* 0x38 - 0x3F */
-	D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM),
-	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
-	D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm),
-	N, N,
-	/* 0x40 - 0x4F */
-	X16(D(DstReg)),
-	/* 0x50 - 0x57 */
-	X8(D(SrcReg | Stack)),
-	/* 0x58 - 0x5F */
-	X8(D(DstReg | Stack)),
-	/* 0x60 - 0x67 */
-	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
-	N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
-	N, N, N, N,
-	/* 0x68 - 0x6F */
-	D(SrcImm | Mov | Stack), N, D(SrcImmByte | Mov | Stack), N,
-	D(DstDI | ByteOp | Mov | String), D(DstDI | Mov | String), /* insb, insw/insd */
-	D(SrcSI | ByteOp | ImplicitOps | String), D(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */
-	/* 0x70 - 0x7F */
-	X16(D(SrcImmByte)),
-	/* 0x80 - 0x87 */
-	G(ByteOp | DstMem | SrcImm | ModRM | Group, group1),
-	G(DstMem | SrcImm | ModRM | Group, group1),
-	G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
-	G(DstMem | SrcImmByte | ModRM | Group, group1),
-	D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM),
-	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
-	/* 0x88 - 0x8F */
-	D(ByteOp | DstMem | SrcReg | ModRM | Mov), D(DstMem | SrcReg | ModRM | Mov),
-	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem | ModRM | Mov),
-	D(DstMem | SrcNone | ModRM | Mov), D(ModRM | DstReg),
-	D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
-	/* 0x90 - 0x97 */
-	D(DstReg), D(DstReg), D(DstReg), D(DstReg),	D(DstReg), D(DstReg), D(DstReg), D(DstReg),
-	/* 0x98 - 0x9F */
-	N, N, D(SrcImmFAddr | No64), N,
-	D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N,
-	/* 0xA0 - 0xA7 */
-	D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs),
-	D(ByteOp | DstMem | SrcAcc | Mov | MemAbs), D(DstMem | SrcAcc | Mov | MemAbs),
-	D(ByteOp | SrcSI | DstDI | Mov | String), D(SrcSI | DstDI | Mov | String),
-	D(ByteOp | SrcSI | DstDI | String), D(SrcSI | DstDI | String),
-	/* 0xA8 - 0xAF */
-	D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm), D(ByteOp | DstDI | Mov | String), D(DstDI | Mov | String),
-	D(ByteOp | SrcSI | DstAcc | Mov | String), D(SrcSI | DstAcc | Mov | String),
-	D(ByteOp | DstDI | String), D(DstDI | String),
-	/* 0xB0 - 0xB7 */
-	X8(D(ByteOp | DstReg | SrcImm | Mov)),
-	/* 0xB8 - 0xBF */
-	X8(D(DstReg | SrcImm | Mov)),
-	/* 0xC0 - 0xC7 */
-	D(ByteOp | DstMem | SrcImm | ModRM), D(DstMem | SrcImmByte | ModRM),
-	N, D(ImplicitOps | Stack), N, N,
-	D(ByteOp | DstMem | SrcImm | ModRM | Mov), D(DstMem | SrcImm | ModRM | Mov),
-	/* 0xC8 - 0xCF */
-	N, N, N, D(ImplicitOps | Stack),
-	D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps),
-	/* 0xD0 - 0xD7 */
-	D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM),
-	D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM),
-	N, N, N, N,
-	/* 0xD8 - 0xDF */
-	N, N, N, N, N, N, N, N,
-	/* 0xE0 - 0xE7 */
-	N, N, N, N,
-	D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc),
-	D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc),
-	/* 0xE8 - 0xEF */
-	D(SrcImm | Stack), D(SrcImm | ImplicitOps),
-	D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
-	D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc),
-	D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc),
-	/* 0xF0 - 0xF7 */
-	N, N, N, N,
-	D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3),
-	/* 0xF8 - 0xFF */
-	D(ImplicitOps), N, D(ImplicitOps), D(ImplicitOps),
-	D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
-};
-
-static struct opcode twobyte_table[256] = {
-	/* 0x00 - 0x0F */
-	N, GD(0, &group7), N, N,
-	N, D(ImplicitOps), D(ImplicitOps | Priv), N,
-	D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
-	N, D(ImplicitOps | ModRM), N, N,
-	/* 0x10 - 0x1F */
-	N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N,
-	/* 0x20 - 0x2F */
-	D(ModRM | ImplicitOps | Priv), D(ModRM | Priv),
-	D(ModRM | ImplicitOps | Priv), D(ModRM | Priv),
-	N, N, N, N,
-	N, N, N, N, N, N, N, N,
-	/* 0x30 - 0x3F */
-	D(ImplicitOps | Priv), N, D(ImplicitOps | Priv), N,
-	D(ImplicitOps), D(ImplicitOps | Priv), N, N,
-	N, N, N, N, N, N, N, N,
-	/* 0x40 - 0x4F */
-	X16(D(DstReg | SrcMem | ModRM | Mov)),
-	/* 0x50 - 0x5F */
-	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
-	/* 0x60 - 0x6F */
-	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
-	/* 0x70 - 0x7F */
-	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
-	/* 0x80 - 0x8F */
-	X16(D(SrcImm)),
-	/* 0x90 - 0x9F */
-	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
-	/* 0xA0 - 0xA7 */
-	D(ImplicitOps | Stack), D(ImplicitOps | Stack),
-	N, D(DstMem | SrcReg | ModRM | BitOp),
-	D(DstMem | SrcReg | Src2ImmByte | ModRM),
-	D(DstMem | SrcReg | Src2CL | ModRM), N, N,
-	/* 0xA8 - 0xAF */
-	D(ImplicitOps | Stack), D(ImplicitOps | Stack),
-	N, D(DstMem | SrcReg | ModRM | BitOp | Lock),
-	D(DstMem | SrcReg | Src2ImmByte | ModRM),
-	D(DstMem | SrcReg | Src2CL | ModRM),
-	D(ModRM), N,
-	/* 0xB0 - 0xB7 */
-	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
-	N, D(DstMem | SrcReg | ModRM | BitOp | Lock),
-	N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov),
-	    D(DstReg | SrcMem16 | ModRM | Mov),
-	/* 0xB8 - 0xBF */
-	N, N,
-	G(0, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
-	N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov),
-	    D(DstReg | SrcMem16 | ModRM | Mov),
-	/* 0xC0 - 0xCF */
-	N, N, N, D(DstMem | SrcReg | ModRM | Mov),
-	N, N, N, GD(0, &group9),
-	N, N, N, N, N, N, N, N,
-	/* 0xD0 - 0xDF */
-	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
-	/* 0xE0 - 0xEF */
-	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
-	/* 0xF0 - 0xFF */
-	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
-};
-
-#undef D
-#undef N
-#undef G
-#undef GD
-#undef I
-
 /* EFLAGS bit definitions. */
 #define EFLG_ID (1<<21)
 #define EFLG_VIP (1<<20)
@@ -2252,6 +2016,242 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
 	op->ptr = (unsigned long *)register_address(c,  base, c->regs[reg]);
 }
 
+#define D(_y) { .flags = (_y) }
+#define N    D(0)
+#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
+#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) }
+#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
+
+static struct opcode group1[] = {
+	X7(D(Lock)), N
+};
+
+static struct opcode group1A[] = {
+	D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N,
+};
+
+static struct opcode group3[] = {
+	D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM),
+	D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
+	X4(D(Undefined)),
+};
+
+static struct opcode group4[] = {
+	D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock),
+	N, N, N, N, N, N,
+};
+
+static struct opcode group5[] = {
+	D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
+	D(SrcMem | ModRM | Stack), N,
+	D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps),
+	D(SrcMem | ModRM | Stack), N,
+};
+
+static struct group_dual group7 = { {
+	N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv),
+	D(SrcNone | ModRM | DstMem | Mov), N,
+	D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv),
+}, {
+	D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv),
+	D(SrcNone | ModRM | DstMem | Mov), N,
+	D(SrcMem16 | ModRM | Mov | Priv), N,
+} };
+
+static struct opcode group8[] = {
+	N, N, N, N,
+	D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock),
+	D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock),
+};
+
+static struct group_dual group9 = { {
+	N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N,
+}, {
+	N, N, N, N, N, N, N, N,
+} };
+
+static struct opcode opcode_table[256] = {
+	/* 0x00 - 0x07 */
+	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
+	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+	D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm),
+	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+	/* 0x08 - 0x0F */
+	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
+	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+	D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm),
+	D(ImplicitOps | Stack | No64), N,
+	/* 0x10 - 0x17 */
+	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
+	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+	D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm),
+	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+	/* 0x18 - 0x1F */
+	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
+	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+	D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm),
+	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+	/* 0x20 - 0x27 */
+	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
+	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+	D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N,
+	/* 0x28 - 0x2F */
+	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
+	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+	D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N,
+	/* 0x30 - 0x37 */
+	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
+	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+	D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N,
+	/* 0x38 - 0x3F */
+	D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM),
+	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+	D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm),
+	N, N,
+	/* 0x40 - 0x4F */
+	X16(D(DstReg)),
+	/* 0x50 - 0x57 */
+	X8(D(SrcReg | Stack)),
+	/* 0x58 - 0x5F */
+	X8(D(DstReg | Stack)),
+	/* 0x60 - 0x67 */
+	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+	N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
+	N, N, N, N,
+	/* 0x68 - 0x6F */
+	D(SrcImm | Mov | Stack), N, D(SrcImmByte | Mov | Stack), N,
+	D(DstDI | ByteOp | Mov | String), D(DstDI | Mov | String), /* insb, insw/insd */
+	D(SrcSI | ByteOp | ImplicitOps | String), D(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */
+	/* 0x70 - 0x7F */
+	X16(D(SrcImmByte)),
+	/* 0x80 - 0x87 */
+	G(ByteOp | DstMem | SrcImm | ModRM | Group, group1),
+	G(DstMem | SrcImm | ModRM | Group, group1),
+	G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
+	G(DstMem | SrcImmByte | ModRM | Group, group1),
+	D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM),
+	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
+	/* 0x88 - 0x8F */
+	D(ByteOp | DstMem | SrcReg | ModRM | Mov), D(DstMem | SrcReg | ModRM | Mov),
+	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem | ModRM | Mov),
+	D(DstMem | SrcNone | ModRM | Mov), D(ModRM | DstReg),
+	D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
+	/* 0x90 - 0x97 */
+	D(DstReg), D(DstReg), D(DstReg), D(DstReg),	D(DstReg), D(DstReg), D(DstReg), D(DstReg),
+	/* 0x98 - 0x9F */
+	N, N, D(SrcImmFAddr | No64), N,
+	D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N,
+	/* 0xA0 - 0xA7 */
+	D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs),
+	D(ByteOp | DstMem | SrcAcc | Mov | MemAbs), D(DstMem | SrcAcc | Mov | MemAbs),
+	D(ByteOp | SrcSI | DstDI | Mov | String), D(SrcSI | DstDI | Mov | String),
+	D(ByteOp | SrcSI | DstDI | String), D(SrcSI | DstDI | String),
+	/* 0xA8 - 0xAF */
+	D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm), D(ByteOp | DstDI | Mov | String), D(DstDI | Mov | String),
+	D(ByteOp | SrcSI | DstAcc | Mov | String), D(SrcSI | DstAcc | Mov | String),
+	D(ByteOp | DstDI | String), D(DstDI | String),
+	/* 0xB0 - 0xB7 */
+	X8(D(ByteOp | DstReg | SrcImm | Mov)),
+	/* 0xB8 - 0xBF */
+	X8(D(DstReg | SrcImm | Mov)),
+	/* 0xC0 - 0xC7 */
+	D(ByteOp | DstMem | SrcImm | ModRM), D(DstMem | SrcImmByte | ModRM),
+	N, D(ImplicitOps | Stack), N, N,
+	D(ByteOp | DstMem | SrcImm | ModRM | Mov), D(DstMem | SrcImm | ModRM | Mov),
+	/* 0xC8 - 0xCF */
+	N, N, N, D(ImplicitOps | Stack),
+	D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps),
+	/* 0xD0 - 0xD7 */
+	D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM),
+	D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM),
+	N, N, N, N,
+	/* 0xD8 - 0xDF */
+	N, N, N, N, N, N, N, N,
+	/* 0xE0 - 0xE7 */
+	N, N, N, N,
+	D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc),
+	D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc),
+	/* 0xE8 - 0xEF */
+	D(SrcImm | Stack), D(SrcImm | ImplicitOps),
+	D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
+	D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc),
+	D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc),
+	/* 0xF0 - 0xF7 */
+	N, N, N, N,
+	D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3),
+	/* 0xF8 - 0xFF */
+	D(ImplicitOps), N, D(ImplicitOps), D(ImplicitOps),
+	D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
+};
+
+static struct opcode twobyte_table[256] = {
+	/* 0x00 - 0x0F */
+	N, GD(0, &group7), N, N,
+	N, D(ImplicitOps), D(ImplicitOps | Priv), N,
+	D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
+	N, D(ImplicitOps | ModRM), N, N,
+	/* 0x10 - 0x1F */
+	N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N,
+	/* 0x20 - 0x2F */
+	D(ModRM | ImplicitOps | Priv), D(ModRM | Priv),
+	D(ModRM | ImplicitOps | Priv), D(ModRM | Priv),
+	N, N, N, N,
+	N, N, N, N, N, N, N, N,
+	/* 0x30 - 0x3F */
+	D(ImplicitOps | Priv), N, D(ImplicitOps | Priv), N,
+	D(ImplicitOps), D(ImplicitOps | Priv), N, N,
+	N, N, N, N, N, N, N, N,
+	/* 0x40 - 0x4F */
+	X16(D(DstReg | SrcMem | ModRM | Mov)),
+	/* 0x50 - 0x5F */
+	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+	/* 0x60 - 0x6F */
+	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+	/* 0x70 - 0x7F */
+	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+	/* 0x80 - 0x8F */
+	X16(D(SrcImm)),
+	/* 0x90 - 0x9F */
+	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+	/* 0xA0 - 0xA7 */
+	D(ImplicitOps | Stack), D(ImplicitOps | Stack),
+	N, D(DstMem | SrcReg | ModRM | BitOp),
+	D(DstMem | SrcReg | Src2ImmByte | ModRM),
+	D(DstMem | SrcReg | Src2CL | ModRM), N, N,
+	/* 0xA8 - 0xAF */
+	D(ImplicitOps | Stack), D(ImplicitOps | Stack),
+	N, D(DstMem | SrcReg | ModRM | BitOp | Lock),
+	D(DstMem | SrcReg | Src2ImmByte | ModRM),
+	D(DstMem | SrcReg | Src2CL | ModRM),
+	D(ModRM), N,
+	/* 0xB0 - 0xB7 */
+	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
+	N, D(DstMem | SrcReg | ModRM | BitOp | Lock),
+	N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov),
+	    D(DstReg | SrcMem16 | ModRM | Mov),
+	/* 0xB8 - 0xBF */
+	N, N,
+	G(0, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
+	N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov),
+	    D(DstReg | SrcMem16 | ModRM | Mov),
+	/* 0xC0 - 0xCF */
+	N, N, N, D(DstMem | SrcReg | ModRM | Mov),
+	N, N, N, GD(0, &group9),
+	N, N, N, N, N, N, N, N,
+	/* 0xD0 - 0xDF */
+	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+	/* 0xE0 - 0xEF */
+	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+	/* 0xF0 - 0xFF */
+	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
+};
+
+#undef D
+#undef N
+#undef G
+#undef GD
+#undef I
+
 int
 x86_decode_insn(struct x86_emulate_ctxt *ctxt)
 {
-- 
cgit v0.10.2


From d0e533255d3811382c97b594ff7ab19b9b036814 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:54 +0300
Subject: KVM: x86 emulator: allow repeat macro arguments to contain commas

Needed for repeating instructions with execution functions.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 70a7cb4..7e9bcda 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -94,14 +94,14 @@
 #define Src2One     (3<<29)
 #define Src2Mask    (7<<29)
 
-#define X2(x) x, x
-#define X3(x) X2(x), x
-#define X4(x) X2(x), X2(x)
-#define X5(x) X4(x), x
-#define X6(x) X4(x), X2(x)
-#define X7(x) X4(x), X3(x)
-#define X8(x) X4(x), X4(x)
-#define X16(x) X8(x), X8(x)
+#define X2(x...) x, x
+#define X3(x...) X2(x), x
+#define X4(x...) X2(x), X2(x)
+#define X5(x...) X4(x), x
+#define X6(x...) X4(x), X2(x)
+#define X7(x...) X4(x), X3(x)
+#define X8(x...) X4(x), X4(x)
+#define X16(x...) X8(x), X8(x)
 
 struct opcode {
 	u32 flags;
-- 
cgit v0.10.2


From 63540382ccb83d2857964858c1ac7eb7d37de497 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Jul 2010 15:11:55 +0300
Subject: KVM: x86 emulator: convert some push instructions to direct decode

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7e9bcda..904fc1c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2016,6 +2016,12 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
 	op->ptr = (unsigned long *)register_address(c,  base, c->regs[reg]);
 }
 
+static int em_push(struct x86_emulate_ctxt *ctxt)
+{
+	emulate_push(ctxt, ctxt->ops);
+	return X86EMUL_CONTINUE;
+}
+
 #define D(_y) { .flags = (_y) }
 #define N    D(0)
 #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
@@ -2111,7 +2117,7 @@ static struct opcode opcode_table[256] = {
 	/* 0x40 - 0x4F */
 	X16(D(DstReg)),
 	/* 0x50 - 0x57 */
-	X8(D(SrcReg | Stack)),
+	X8(I(SrcReg | Stack, em_push)),
 	/* 0x58 - 0x5F */
 	X8(D(DstReg | Stack)),
 	/* 0x60 - 0x67 */
@@ -2119,7 +2125,8 @@ static struct opcode opcode_table[256] = {
 	N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
 	N, N, N, N,
 	/* 0x68 - 0x6F */
-	D(SrcImm | Mov | Stack), N, D(SrcImmByte | Mov | Stack), N,
+	I(SrcImm | Mov | Stack, em_push), N,
+	I(SrcImmByte | Mov | Stack, em_push), N,
 	D(DstDI | ByteOp | Mov | String), D(DstDI | Mov | String), /* insb, insw/insd */
 	D(SrcSI | ByteOp | ImplicitOps | String), D(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */
 	/* 0x70 - 0x7F */
@@ -2786,9 +2793,6 @@ special_insn:
 	case 0x48 ... 0x4f: /* dec r16/r32 */
 		emulate_1op("dec", c->dst, ctxt->eflags);
 		break;
-	case 0x50 ... 0x57:  /* push reg */
-		emulate_push(ctxt, ops);
-		break;
 	case 0x58 ... 0x5f: /* pop reg */
 	pop_instruction:
 		rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
@@ -2810,10 +2814,6 @@ special_insn:
 			goto cannot_emulate;
 		c->dst.val = (s32) c->src.val;
 		break;
-	case 0x68: /* push imm */
-	case 0x6a: /* push imm8 */
-		emulate_push(ctxt, ops);
-		break;
 	case 0x6c:		/* insb */
 	case 0x6d:		/* insw/insd */
 		c->dst.bytes = min(c->dst.bytes, 4u);
-- 
cgit v0.10.2


From e85d28f8e8cef09b8e424448ccedb7244cfbf147 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 29 Jul 2010 15:11:52 +0300
Subject: KVM: x86 emulator: don't update vcpu state if instruction is
 restarted

No need to update vcpu state since instruction is in the middle of the
emulation.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 33deb75..3cbe803 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4057,32 +4057,27 @@ restart:
 		return handle_emulation_failure(vcpu);
 	}
 
-	toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
-	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
-	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
-	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+	r = EMULATE_DONE;
 
-	if (vcpu->arch.emulate_ctxt.exception >= 0) {
+	if (vcpu->arch.emulate_ctxt.exception >= 0)
 		inject_emulated_exception(vcpu);
-		return EMULATE_DONE;
-	}
-
-	if (vcpu->arch.pio.count) {
+	else if (vcpu->arch.pio.count) {
 		if (!vcpu->arch.pio.in)
 			vcpu->arch.pio.count = 0;
-		return EMULATE_DO_MMIO;
-	}
-
-	if (vcpu->mmio_needed) {
+		r = EMULATE_DO_MMIO;
+	} else if (vcpu->mmio_needed) {
 		if (vcpu->mmio_is_write)
 			vcpu->mmio_needed = 0;
-		return EMULATE_DO_MMIO;
-	}
-
-	if (vcpu->arch.emulate_ctxt.restart)
+		r = EMULATE_DO_MMIO;
+	} else if (vcpu->arch.emulate_ctxt.restart)
 		goto restart;
 
-	return EMULATE_DONE;
+	toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
+	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+
+	return r;
 }
 EXPORT_SYMBOL_GPL(emulate_instruction);
 
-- 
cgit v0.10.2


From 9928ff608b1b6ba10fafde85f57970a83a181331 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 1 Aug 2010 18:35:24 +0300
Subject: KVM: x86 emulator: fix LMSW able to clear cr0.pe

LMSW is documented not to be able to clear cr0.pe; make it so.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 904fc1c..4d49514 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3211,7 +3211,7 @@ twobyte_insn:
 			c->dst.val = ops->get_cr(0, ctxt->vcpu);
 			break;
 		case 6: /* lmsw */
-			ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) |
+			ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) |
 				    (c->src.val & 0x0f), ctxt->vcpu);
 			c->dst.type = OP_NONE;
 			break;
-- 
cgit v0.10.2


From 4fc40f076f4fa289dd546990b597351c9cdad985 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 2 Aug 2010 12:47:51 +0300
Subject: KVM: x86 emulator: check io permissions only once for string pio

Do not recheck io permission on every iteration.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0f901c1..8762411 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -224,6 +224,7 @@ struct x86_emulate_ctxt {
 	int interruptibility;
 
 	bool restart; /* restart string instruction after writeback */
+	bool perm_ok; /* do not check permissions if true */
 
 	int exception; /* exception that happens during emulation or -1 */
 	u32 error_code; /* error code for exception */
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4d49514..760e2b0 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1621,9 +1621,15 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
 				 struct x86_emulate_ops *ops,
 				 u16 port, u16 len)
 {
+	if (ctxt->perm_ok)
+		return true;
+
 	if (emulator_bad_iopl(ctxt, ops))
 		if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
 			return false;
+
+	ctxt->perm_ok = true;
+
 	return true;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3cbe803..35c0f4e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3997,6 +3997,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
 		vcpu->arch.emulate_ctxt.interruptibility = 0;
 		vcpu->arch.emulate_ctxt.exception = -1;
+		vcpu->arch.emulate_ctxt.perm_ok = false;
 
 		r = x86_decode_insn(&vcpu->arch.emulate_ctxt);
 		trace_kvm_emulate_insn_start(vcpu);
-- 
cgit v0.10.2


From 251464c464cf7df7d6d548f1065f49a3ecd08118 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 2 Aug 2010 16:12:08 +0800
Subject: KVM: MMU: using kvm_set_pfn_accessed() instead of
 mark_page_accessed()

It's a small cleanup that using using kvm_set_pfn_accessed() instead
of mark_page_accessed()

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 82f7622..e430a38 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -309,7 +309,7 @@ static void update_spte(u64 *sptep, u64 new_spte)
 	else {
 		old_spte = __xchg_spte(sptep, new_spte);
 		if (old_spte & shadow_accessed_mask)
-			mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
+			kvm_set_pfn_accessed(spte_to_pfn(old_spte));
 	}
 }
 
-- 
cgit v0.10.2


From 8672b7217a234c41d425a63b171af809e1169842 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 2 Aug 2010 16:14:04 +0800
Subject: KVM: MMU: move bits lost judgement into a separate function

Introduce spte_has_volatile_bits() function to judge whether spte
bits will miss, it's more readable and can help us to cleanup code
later

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e430a38..c07b9a2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -299,6 +299,20 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte)
 #endif
 }
 
+static bool spte_has_volatile_bits(u64 spte)
+{
+	if (!shadow_accessed_mask)
+		return false;
+
+	if (!is_shadow_present_pte(spte))
+		return false;
+
+	if (spte & shadow_accessed_mask)
+		return false;
+
+	return true;
+}
+
 static void update_spte(u64 *sptep, u64 new_spte)
 {
 	u64 old_spte;
@@ -679,14 +693,14 @@ static void set_spte_track_bits(u64 *sptep, u64 new_spte)
 	pfn_t pfn;
 	u64 old_spte = *sptep;
 
-	if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) ||
-	      old_spte & shadow_accessed_mask) {
+	if (!spte_has_volatile_bits(old_spte))
 		__set_spte(sptep, new_spte);
-	} else
+	else
 		old_spte = __xchg_spte(sptep, new_spte);
 
 	if (!is_rmap_spte(old_spte))
 		return;
+
 	pfn = spte_to_pfn(old_spte);
 	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
 		kvm_set_pfn_accessed(pfn);
-- 
cgit v0.10.2


From 4132779b1718f066ec2d06a71c8958039865cd49 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 2 Aug 2010 16:15:08 +0800
Subject: KVM: MMU: mark page dirty only when page is really written

Mark page dirty only when this page is really written, it's more exacter,
and also can fix dirty page marking in speculation path

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c07b9a2..ff95d41 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -307,24 +307,42 @@ static bool spte_has_volatile_bits(u64 spte)
 	if (!is_shadow_present_pte(spte))
 		return false;
 
-	if (spte & shadow_accessed_mask)
+	if ((spte & shadow_accessed_mask) &&
+	      (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
 		return false;
 
 	return true;
 }
 
+static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
+{
+	return (old_spte & bit_mask) && !(new_spte & bit_mask);
+}
+
 static void update_spte(u64 *sptep, u64 new_spte)
 {
-	u64 old_spte;
+	u64 mask, old_spte = *sptep;
+
+	WARN_ON(!is_rmap_spte(new_spte));
 
-	if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) ||
-	      !is_rmap_spte(*sptep))
+	new_spte |= old_spte & shadow_dirty_mask;
+
+	mask = shadow_accessed_mask;
+	if (is_writable_pte(old_spte))
+		mask |= shadow_dirty_mask;
+
+	if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
 		__set_spte(sptep, new_spte);
-	else {
+	else
 		old_spte = __xchg_spte(sptep, new_spte);
-		if (old_spte & shadow_accessed_mask)
-			kvm_set_pfn_accessed(spte_to_pfn(old_spte));
-	}
+
+	if (!shadow_accessed_mask)
+		return;
+
+	if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
+		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
+	if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
+		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 }
 
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
@@ -704,7 +722,7 @@ static void set_spte_track_bits(u64 *sptep, u64 new_spte)
 	pfn = spte_to_pfn(old_spte);
 	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
 		kvm_set_pfn_accessed(pfn);
-	if (is_writable_pte(old_spte))
+	if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
 		kvm_set_pfn_dirty(pfn);
 }
 
@@ -759,13 +777,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 		}
 		spte = rmap_next(kvm, rmapp, spte);
 	}
-	if (write_protected) {
-		pfn_t pfn;
-
-		spte = rmap_next(kvm, rmapp, NULL);
-		pfn = spte_to_pfn(*spte);
-		kvm_set_pfn_dirty(pfn);
-	}
 
 	/* check for huge page mappings */
 	for (i = PT_DIRECTORY_LEVEL;
@@ -1938,7 +1949,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	 * whether the guest actually used the pte (in order to detect
 	 * demand paging).
 	 */
-	spte = shadow_base_present_pte | shadow_dirty_mask;
+	spte = shadow_base_present_pte;
 	if (!speculative)
 		spte |= shadow_accessed_mask;
 	if (!dirty)
@@ -1999,8 +2010,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		mark_page_dirty(vcpu->kvm, gfn);
 
 set_pte:
-	if (is_writable_pte(*sptep) && !is_writable_pte(spte))
-		kvm_set_pfn_dirty(pfn);
 	update_spte(sptep, spte);
 done:
 	return ret;
-- 
cgit v0.10.2


From 52c65a30a5c6f31cd66dba57c22d18cafa5e327f Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 2 Aug 2010 16:46:44 +0200
Subject: KVM: SVM: Check for nested vmrun intercept before emulating vmrun

This patch lets the nested vmrun fail if the L1 hypervisor
has not intercepted vmrun. This fixes the "vmrun intercept
check" unit test.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 116e034..a0e5c7e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2014,6 +2014,14 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 	return true;
 }
 
+static bool nested_vmcb_checks(struct vmcb *vmcb)
+{
+	if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
+		return false;
+
+	return true;
+}
+
 static bool nested_svm_vmrun(struct vcpu_svm *svm)
 {
 	struct vmcb *nested_vmcb;
@@ -2028,6 +2036,17 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	if (!nested_vmcb)
 		return false;
 
+	if (!nested_vmcb_checks(nested_vmcb)) {
+		nested_vmcb->control.exit_code    = SVM_EXIT_ERR;
+		nested_vmcb->control.exit_code_hi = 0;
+		nested_vmcb->control.exit_info_1  = 0;
+		nested_vmcb->control.exit_info_2  = 0;
+
+		nested_svm_unmap(page);
+
+		return false;
+	}
+
 	trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa,
 			       nested_vmcb->save.rip,
 			       nested_vmcb->control.int_ctl,
-- 
cgit v0.10.2


From dbe7758482a870f30a86bdeefebf4fc260afef11 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 2 Aug 2010 16:46:45 +0200
Subject: KVM: SVM: Check for asid != 0 on nested vmrun

This patch lets a nested vmrun fail if the L1 hypervisor
left the asid zero. This fixes the asid_zero unit test.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a0e5c7e..af5b9ea 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2019,6 +2019,9 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
 	if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
 		return false;
 
+	if (vmcb->control.asid == 0)
+		return false;
+
 	return true;
 }
 
-- 
cgit v0.10.2


From 09ee57cdae3156aa3b74f378a0c57ef657c90f38 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 1 Aug 2010 12:07:29 +0300
Subject: KVM: x86 emulator: push segment override out of decode_modrm()

Let it compute modrm_seg instead, and have the caller apply it.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 8762411..cbdf767 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -198,6 +198,7 @@ struct decode_cache {
 	u8 modrm_mod;
 	u8 modrm_reg;
 	u8 modrm_rm;
+	u8 modrm_seg;
 	u8 use_modrm_ea;
 	bool rip_relative;
 	unsigned long modrm_ea;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 760e2b0..471f12a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -593,6 +593,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 	c->modrm_rm |= (c->modrm & 0x07);
 	c->modrm_ea = 0;
 	c->use_modrm_ea = 1;
+	c->modrm_seg = VCPU_SREG_DS;
 
 	if (c->modrm_mod == 3) {
 		c->modrm_ptr = decode_register(c->modrm_rm,
@@ -649,8 +650,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 		}
 		if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
 		    (c->modrm_rm == 6 && c->modrm_mod != 0))
-			if (!c->has_seg_override)
-				set_seg_override(c, VCPU_SREG_SS);
+			c->modrm_seg = VCPU_SREG_SS;
 		c->modrm_ea = (u16)c->modrm_ea;
 	} else {
 		/* 32/64-bit ModR/M decode. */
@@ -2405,9 +2405,11 @@ done_prefixes:
 		c->op_bytes = 8;
 
 	/* ModRM and SIB bytes. */
-	if (c->d & ModRM)
+	if (c->d & ModRM) {
 		rc = decode_modrm(ctxt, ops);
-	else if (c->d & MemAbs)
+		if (!c->has_seg_override)
+			set_seg_override(c, c->modrm_seg);
+	} else if (c->d & MemAbs)
 		rc = decode_abs(ctxt, ops);
 	if (rc != X86EMUL_CONTINUE)
 		goto done;
-- 
cgit v0.10.2


From 1a6440aef6d63252e6c80aff651147b5f8c737e9 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 1 Aug 2010 12:35:10 +0300
Subject: KVM: x86 emulator: use correct type for memory address in operands

Currently we use a void pointer for memory addresses.  That's wrong since
these are guest virtual addresses which are not directly dereferencable by
the host.

Use the correct type, unsigned long.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index cbdf767..0c835f7 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -156,7 +156,10 @@ struct operand {
 		unsigned long orig_val;
 		u64 orig_val64;
 	};
-	unsigned long *ptr;
+	union {
+		unsigned long *reg;
+		unsigned long mem;
+	} addr;
 	union {
 		unsigned long val;
 		u64 val64;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 471f12a..5f45f66 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -489,7 +489,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
 
 static int read_descriptor(struct x86_emulate_ctxt *ctxt,
 			   struct x86_emulate_ops *ops,
-			   void *ptr,
+			   ulong addr,
 			   u16 *size, unsigned long *address, int op_bytes)
 {
 	int rc;
@@ -497,12 +497,10 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
 	if (op_bytes == 2)
 		op_bytes = 3;
 	*address = 0;
-	rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
-			   ctxt->vcpu, NULL);
+	rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
-	rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
-			   ctxt->vcpu, NULL);
+	rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL);
 	return rc;
 }
 
@@ -552,21 +550,21 @@ static void decode_register_operand(struct operand *op,
 		reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
 	op->type = OP_REG;
 	if ((c->d & ByteOp) && !inhibit_bytereg) {
-		op->ptr = decode_register(reg, c->regs, highbyte_regs);
-		op->val = *(u8 *)op->ptr;
+		op->addr.reg = decode_register(reg, c->regs, highbyte_regs);
+		op->val = *(u8 *)op->addr.reg;
 		op->bytes = 1;
 	} else {
-		op->ptr = decode_register(reg, c->regs, 0);
+		op->addr.reg = decode_register(reg, c->regs, 0);
 		op->bytes = c->op_bytes;
 		switch (op->bytes) {
 		case 2:
-			op->val = *(u16 *)op->ptr;
+			op->val = *(u16 *)op->addr.reg;
 			break;
 		case 4:
-			op->val = *(u32 *)op->ptr;
+			op->val = *(u32 *)op->addr.reg;
 			break;
 		case 8:
-			op->val = *(u64 *) op->ptr;
+			op->val = *(u64 *) op->addr.reg;
 			break;
 		}
 	}
@@ -976,23 +974,23 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 		 */
 		switch (c->dst.bytes) {
 		case 1:
-			*(u8 *)c->dst.ptr = (u8)c->dst.val;
+			*(u8 *)c->dst.addr.reg = (u8)c->dst.val;
 			break;
 		case 2:
-			*(u16 *)c->dst.ptr = (u16)c->dst.val;
+			*(u16 *)c->dst.addr.reg = (u16)c->dst.val;
 			break;
 		case 4:
-			*c->dst.ptr = (u32)c->dst.val;
+			*c->dst.addr.reg = (u32)c->dst.val;
 			break;	/* 64b: zero-ext */
 		case 8:
-			*c->dst.ptr = c->dst.val;
+			*c->dst.addr.reg = c->dst.val;
 			break;
 		}
 		break;
 	case OP_MEM:
 		if (c->lock_prefix)
 			rc = ops->cmpxchg_emulated(
-					(unsigned long)c->dst.ptr,
+					c->dst.addr.mem,
 					&c->dst.orig_val,
 					&c->dst.val,
 					c->dst.bytes,
@@ -1000,14 +998,13 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 					ctxt->vcpu);
 		else
 			rc = ops->write_emulated(
-					(unsigned long)c->dst.ptr,
+					c->dst.addr.mem,
 					&c->dst.val,
 					c->dst.bytes,
 					&err,
 					ctxt->vcpu);
 		if (rc == X86EMUL_PROPAGATE_FAULT)
-			emulate_pf(ctxt,
-					      (unsigned long)c->dst.ptr, err);
+			emulate_pf(ctxt, c->dst.addr.mem, err);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		break;
@@ -1029,8 +1026,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
 	c->dst.bytes = c->op_bytes;
 	c->dst.val = c->src.val;
 	register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
-	c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops),
-					       c->regs[VCPU_REGS_RSP]);
+	c->dst.addr.mem = register_address(c, ss_base(ctxt, ops),
+					   c->regs[VCPU_REGS_RSP]);
 }
 
 static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -2019,7 +2016,7 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
 	int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
 
 	register_address_increment(c, &c->regs[reg], df * op->bytes);
-	op->ptr = (unsigned long *)register_address(c,  base, c->regs[reg]);
+	op->addr.mem = register_address(c,  base, c->regs[reg]);
 }
 
 static int em_push(struct x86_emulate_ctxt *ctxt)
@@ -2456,17 +2453,17 @@ done_prefixes:
 		if ((c->d & ModRM) && c->modrm_mod == 3) {
 			c->src.type = OP_REG;
 			c->src.val = c->modrm_val;
-			c->src.ptr = c->modrm_ptr;
+			c->src.addr.reg = c->modrm_ptr;
 			break;
 		}
 		c->src.type = OP_MEM;
-		c->src.ptr = (unsigned long *)c->modrm_ea;
+		c->src.addr.mem = c->modrm_ea;
 		c->src.val = 0;
 		break;
 	case SrcImm:
 	case SrcImmU:
 		c->src.type = OP_IMM;
-		c->src.ptr = (unsigned long *)c->eip;
+		c->src.addr.mem = c->eip;
 		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
 		if (c->src.bytes == 8)
 			c->src.bytes = 4;
@@ -2499,7 +2496,7 @@ done_prefixes:
 	case SrcImmByte:
 	case SrcImmUByte:
 		c->src.type = OP_IMM;
-		c->src.ptr = (unsigned long *)c->eip;
+		c->src.addr.mem = c->eip;
 		c->src.bytes = 1;
 		if ((c->d & SrcMask) == SrcImmByte)
 			c->src.val = insn_fetch(s8, 1, c->eip);
@@ -2509,19 +2506,19 @@ done_prefixes:
 	case SrcAcc:
 		c->src.type = OP_REG;
 		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->src.ptr = &c->regs[VCPU_REGS_RAX];
+		c->src.addr.reg = &c->regs[VCPU_REGS_RAX];
 		switch (c->src.bytes) {
 			case 1:
-				c->src.val = *(u8 *)c->src.ptr;
+				c->src.val = *(u8 *)c->src.addr.reg;
 				break;
 			case 2:
-				c->src.val = *(u16 *)c->src.ptr;
+				c->src.val = *(u16 *)c->src.addr.reg;
 				break;
 			case 4:
-				c->src.val = *(u32 *)c->src.ptr;
+				c->src.val = *(u32 *)c->src.addr.reg;
 				break;
 			case 8:
-				c->src.val = *(u64 *)c->src.ptr;
+				c->src.val = *(u64 *)c->src.addr.reg;
 				break;
 		}
 		break;
@@ -2532,20 +2529,20 @@ done_prefixes:
 	case SrcSI:
 		c->src.type = OP_MEM;
 		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->src.ptr = (unsigned long *)
+		c->src.addr.mem =
 			register_address(c,  seg_override_base(ctxt, ops, c),
 					 c->regs[VCPU_REGS_RSI]);
 		c->src.val = 0;
 		break;
 	case SrcImmFAddr:
 		c->src.type = OP_IMM;
-		c->src.ptr = (unsigned long *)c->eip;
+		c->src.addr.mem = c->eip;
 		c->src.bytes = c->op_bytes + 2;
 		insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
 		break;
 	case SrcMemFAddr:
 		c->src.type = OP_MEM;
-		c->src.ptr = (unsigned long *)c->modrm_ea;
+		c->src.addr.mem = c->modrm_ea;
 		c->src.bytes = c->op_bytes + 2;
 		break;
 	}
@@ -2563,7 +2560,7 @@ done_prefixes:
 		break;
 	case Src2ImmByte:
 		c->src2.type = OP_IMM;
-		c->src2.ptr = (unsigned long *)c->eip;
+		c->src2.addr.mem = c->eip;
 		c->src2.bytes = 1;
 		c->src2.val = insn_fetch(u8, 1, c->eip);
 		break;
@@ -2588,11 +2585,11 @@ done_prefixes:
 			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
 			c->dst.type = OP_REG;
 			c->dst.val = c->dst.orig_val = c->modrm_val;
-			c->dst.ptr = c->modrm_ptr;
+			c->dst.addr.reg = c->modrm_ptr;
 			break;
 		}
 		c->dst.type = OP_MEM;
-		c->dst.ptr = (unsigned long *)c->modrm_ea;
+		c->dst.addr.mem = c->modrm_ea;
 		if ((c->d & DstMask) == DstMem64)
 			c->dst.bytes = 8;
 		else
@@ -2601,26 +2598,26 @@ done_prefixes:
 		if (c->d & BitOp) {
 			unsigned long mask = ~(c->dst.bytes * 8 - 1);
 
-			c->dst.ptr = (void *)c->dst.ptr +
+			c->dst.addr.mem = c->dst.addr.mem +
 						   (c->src.val & mask) / 8;
 		}
 		break;
 	case DstAcc:
 		c->dst.type = OP_REG;
 		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+		c->dst.addr.reg = &c->regs[VCPU_REGS_RAX];
 		switch (c->dst.bytes) {
 			case 1:
-				c->dst.val = *(u8 *)c->dst.ptr;
+				c->dst.val = *(u8 *)c->dst.addr.reg;
 				break;
 			case 2:
-				c->dst.val = *(u16 *)c->dst.ptr;
+				c->dst.val = *(u16 *)c->dst.addr.reg;
 				break;
 			case 4:
-				c->dst.val = *(u32 *)c->dst.ptr;
+				c->dst.val = *(u32 *)c->dst.addr.reg;
 				break;
 			case 8:
-				c->dst.val = *(u64 *)c->dst.ptr;
+				c->dst.val = *(u64 *)c->dst.addr.reg;
 				break;
 		}
 		c->dst.orig_val = c->dst.val;
@@ -2628,7 +2625,7 @@ done_prefixes:
 	case DstDI:
 		c->dst.type = OP_MEM;
 		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.ptr = (unsigned long *)
+		c->dst.addr.mem =
 			register_address(c, es_base(ctxt, ops),
 					 c->regs[VCPU_REGS_RDI]);
 		c->dst.val = 0;
@@ -2696,7 +2693,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 	}
 
 	if (c->src.type == OP_MEM) {
-		rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr,
+		rc = read_emulated(ctxt, ops, c->src.addr.mem,
 					c->src.valptr, c->src.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
@@ -2704,7 +2701,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 	}
 
 	if (c->src2.type == OP_MEM) {
-		rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr,
+		rc = read_emulated(ctxt, ops, c->src2.addr.mem,
 					&c->src2.val, c->src2.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
@@ -2716,7 +2713,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 
 	if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
 		/* optimisation - avoid slow emulated read if Mov */
-		rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr,
+		rc = read_emulated(ctxt, ops, c->dst.addr.mem,
 				   &c->dst.val, c->dst.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
@@ -2880,16 +2877,16 @@ special_insn:
 		/* Write back the register source. */
 		switch (c->dst.bytes) {
 		case 1:
-			*(u8 *) c->src.ptr = (u8) c->dst.val;
+			*(u8 *) c->src.addr.reg = (u8) c->dst.val;
 			break;
 		case 2:
-			*(u16 *) c->src.ptr = (u16) c->dst.val;
+			*(u16 *) c->src.addr.reg = (u16) c->dst.val;
 			break;
 		case 4:
-			*c->src.ptr = (u32) c->dst.val;
+			*c->src.addr.reg = (u32) c->dst.val;
 			break;	/* 64b reg: zero-extend */
 		case 8:
-			*c->src.ptr = c->dst.val;
+			*c->src.addr.reg = c->dst.val;
 			break;
 		}
 		/*
@@ -2936,15 +2933,15 @@ special_insn:
 			goto done;
 		break;
 	case 0x90: /* nop / xchg r8,rax */
-		if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) {
+		if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) {
 			c->dst.type = OP_NONE;  /* nop */
 			break;
 		}
 	case 0x91 ... 0x97: /* xchg reg,rax */
 		c->src.type = OP_REG;
 		c->src.bytes = c->op_bytes;
-		c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
-		c->src.val = *(c->src.ptr);
+		c->src.addr.reg = &c->regs[VCPU_REGS_RAX];
+		c->src.val = *(c->src.addr.reg);
 		goto xchg;
 	case 0x9c: /* pushf */
 		c->src.val =  (unsigned long) ctxt->eflags;
@@ -2952,7 +2949,7 @@ special_insn:
 		break;
 	case 0x9d: /* popf */
 		c->dst.type = OP_REG;
-		c->dst.ptr = (unsigned long *) &ctxt->eflags;
+		c->dst.addr.reg = &ctxt->eflags;
 		c->dst.bytes = c->op_bytes;
 		rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
 		if (rc != X86EMUL_CONTINUE)
@@ -2963,7 +2960,7 @@ special_insn:
 		goto mov;
 	case 0xa6 ... 0xa7:	/* cmps */
 		c->dst.type = OP_NONE; /* Disable writeback. */
-		DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
+		DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem);
 		goto cmp;
 	case 0xa8 ... 0xa9:	/* test ax, imm */
 		goto test;
@@ -2982,7 +2979,7 @@ special_insn:
 		break;
 	case 0xc3: /* ret */
 		c->dst.type = OP_REG;
-		c->dst.ptr = &c->eip;
+		c->dst.addr.reg = &c->eip;
 		c->dst.bytes = c->op_bytes;
 		goto pop_instruction;
 	case 0xc6 ... 0xc7:	/* mov (sole member of Grp11) */
@@ -3184,7 +3181,7 @@ twobyte_insn:
 			c->dst.type = OP_NONE;
 			break;
 		case 2: /* lgdt */
-			rc = read_descriptor(ctxt, ops, c->src.ptr,
+			rc = read_descriptor(ctxt, ops, c->src.addr.mem,
 					     &size, &address, c->op_bytes);
 			if (rc != X86EMUL_CONTINUE)
 				goto done;
@@ -3204,7 +3201,7 @@ twobyte_insn:
 					goto cannot_emulate;
 				}
 			} else {
-				rc = read_descriptor(ctxt, ops, c->src.ptr,
+				rc = read_descriptor(ctxt, ops, c->src.addr.mem,
 						     &size, &address,
 						     c->op_bytes);
 				if (rc != X86EMUL_CONTINUE)
@@ -3399,7 +3396,7 @@ twobyte_insn:
 		} else {
 			/* Failure: write the value we saw to EAX. */
 			c->dst.type = OP_REG;
-			c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+			c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX];
 		}
 		break;
 	case 0xb3:
-- 
cgit v0.10.2


From 4515453964e78ce556a98c56aeb675ed8d48b8de Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 1 Aug 2010 12:39:53 +0300
Subject: KVM: x86 emulator: simplify xchg decode tables

Use X8() to avoid repetition.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5f45f66..c7176df 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2147,7 +2147,7 @@ static struct opcode opcode_table[256] = {
 	D(DstMem | SrcNone | ModRM | Mov), D(ModRM | DstReg),
 	D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
 	/* 0x90 - 0x97 */
-	D(DstReg), D(DstReg), D(DstReg), D(DstReg),	D(DstReg), D(DstReg), D(DstReg), D(DstReg),
+	X8(D(DstReg)),
 	/* 0x98 - 0x9F */
 	N, N, D(SrcImmFAddr | No64), N,
 	D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N,
-- 
cgit v0.10.2


From 3d9e77dff81c8be21ec0e7950ae06d1bddff8066 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 1 Aug 2010 12:41:59 +0300
Subject: KVM: x86 emulator: use SrcAcc to simplify xchg decoding

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c7176df..b7da0e3 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2147,7 +2147,7 @@ static struct opcode opcode_table[256] = {
 	D(DstMem | SrcNone | ModRM | Mov), D(ModRM | DstReg),
 	D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
 	/* 0x90 - 0x97 */
-	X8(D(DstReg)),
+	X8(D(SrcAcc | DstReg)),
 	/* 0x98 - 0x9F */
 	N, N, D(SrcImmFAddr | No64), N,
 	D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N,
@@ -2932,16 +2932,9 @@ special_insn:
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
-	case 0x90: /* nop / xchg r8,rax */
-		if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) {
-			c->dst.type = OP_NONE;  /* nop */
-			break;
-		}
-	case 0x91 ... 0x97: /* xchg reg,rax */
-		c->src.type = OP_REG;
-		c->src.bytes = c->op_bytes;
-		c->src.addr.reg = &c->regs[VCPU_REGS_RAX];
-		c->src.val = *(c->src.addr.reg);
+	case 0x90 ... 0x97: /* nop / xchg reg, rax */
+		if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX])
+			goto done;
 		goto xchg;
 	case 0x9c: /* pushf */
 		c->src.val =  (unsigned long) ctxt->eflags;
-- 
cgit v0.10.2


From 91ff3cb43cb3dd8810d726dfa1f3736dc9aea1df Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 1 Aug 2010 12:53:09 +0300
Subject: KVM: x86 emulator: put register operand fetch into a function

The code is repeated three times, put it into fetch_register_operand()

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b7da0e3..898a55b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -539,6 +539,24 @@ static int test_cc(unsigned int condition, unsigned int flags)
 	return (!!rc ^ (condition & 1));
 }
 
+static void fetch_register_operand(struct operand *op)
+{
+	switch (op->bytes) {
+	case 1:
+		op->val = *(u8 *)op->addr.reg;
+		break;
+	case 2:
+		op->val = *(u16 *)op->addr.reg;
+		break;
+	case 4:
+		op->val = *(u32 *)op->addr.reg;
+		break;
+	case 8:
+		op->val = *(u64 *)op->addr.reg;
+		break;
+	}
+}
+
 static void decode_register_operand(struct operand *op,
 				    struct decode_cache *c,
 				    int inhibit_bytereg)
@@ -551,23 +569,12 @@ static void decode_register_operand(struct operand *op,
 	op->type = OP_REG;
 	if ((c->d & ByteOp) && !inhibit_bytereg) {
 		op->addr.reg = decode_register(reg, c->regs, highbyte_regs);
-		op->val = *(u8 *)op->addr.reg;
 		op->bytes = 1;
 	} else {
 		op->addr.reg = decode_register(reg, c->regs, 0);
 		op->bytes = c->op_bytes;
-		switch (op->bytes) {
-		case 2:
-			op->val = *(u16 *)op->addr.reg;
-			break;
-		case 4:
-			op->val = *(u32 *)op->addr.reg;
-			break;
-		case 8:
-			op->val = *(u64 *) op->addr.reg;
-			break;
-		}
 	}
+	fetch_register_operand(op);
 	op->orig_val = op->val;
 }
 
@@ -2507,20 +2514,7 @@ done_prefixes:
 		c->src.type = OP_REG;
 		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
 		c->src.addr.reg = &c->regs[VCPU_REGS_RAX];
-		switch (c->src.bytes) {
-			case 1:
-				c->src.val = *(u8 *)c->src.addr.reg;
-				break;
-			case 2:
-				c->src.val = *(u16 *)c->src.addr.reg;
-				break;
-			case 4:
-				c->src.val = *(u32 *)c->src.addr.reg;
-				break;
-			case 8:
-				c->src.val = *(u64 *)c->src.addr.reg;
-				break;
-		}
+		fetch_register_operand(&c->src);
 		break;
 	case SrcOne:
 		c->src.bytes = 1;
@@ -2606,20 +2600,7 @@ done_prefixes:
 		c->dst.type = OP_REG;
 		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
 		c->dst.addr.reg = &c->regs[VCPU_REGS_RAX];
-		switch (c->dst.bytes) {
-			case 1:
-				c->dst.val = *(u8 *)c->dst.addr.reg;
-				break;
-			case 2:
-				c->dst.val = *(u16 *)c->dst.addr.reg;
-				break;
-			case 4:
-				c->dst.val = *(u32 *)c->dst.addr.reg;
-				break;
-			case 8:
-				c->dst.val = *(u64 *)c->dst.addr.reg;
-				break;
-		}
+		fetch_register_operand(&c->dst);
 		c->dst.orig_val = c->dst.val;
 		break;
 	case DstDI:
-- 
cgit v0.10.2


From d4709c78eeff2b272e0b9727748b72371b0e71ab Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 1 Aug 2010 13:53:19 +0300
Subject: KVM: x86 emulator: drop use_modrm_ea

Unused (and has never been).

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0c835f7..e425444 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -202,7 +202,6 @@ struct decode_cache {
 	u8 modrm_reg;
 	u8 modrm_rm;
 	u8 modrm_seg;
-	u8 use_modrm_ea;
 	bool rip_relative;
 	unsigned long modrm_ea;
 	void *modrm_ptr;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 898a55b..7d2c715 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -597,7 +597,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 	c->modrm_reg |= (c->modrm & 0x38) >> 3;
 	c->modrm_rm |= (c->modrm & 0x07);
 	c->modrm_ea = 0;
-	c->use_modrm_ea = 1;
 	c->modrm_seg = VCPU_SREG_DS;
 
 	if (c->modrm_mod == 3) {
-- 
cgit v0.10.2


From 1e87e3efe764285133866a14ddc71cf211f022c2 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 1 Aug 2010 14:42:51 +0300
Subject: KVM: x86 emulator: simplify REX.W check

(x && (x & y)) == (x & y)

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7d2c715..a832019 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2358,9 +2358,8 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt)
 done_prefixes:
 
 	/* REX prefix. */
-	if (c->rex_prefix)
-		if (c->rex_prefix & 8)
-			c->op_bytes = 8;	/* REX.W */
+	if (c->rex_prefix & 8)
+		c->op_bytes = 8;	/* REX.W */
 
 	/* Opcode byte(s). */
 	opcode = opcode_table[c->b];
-- 
cgit v0.10.2


From 7f9b4b75be866de938a3094413a60554f7e66e4d Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 1 Aug 2010 14:46:54 +0300
Subject: KVM: x86 emulator: introduce Op3264 for mov cr and mov dr
 instructions

The operands for these instructions are 32 bits or 64 bits, depending on
long mode, and ignoring REX prefixes, or the operand size prefix.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a832019..b7adfcc 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -83,6 +83,7 @@
 #define Group       (1<<14)     /* Bits 3:5 of modrm byte extend opcode */
 #define GroupDual   (1<<15)     /* Alternate decoding of mod == 3 */
 /* Misc flags */
+#define Op3264      (1<<24) /* Operand is 64b in long mode, 32b otherwise */
 #define Undefined   (1<<25) /* No Such Instruction */
 #define Lock        (1<<26) /* lock prefix is allowed for the instruction */
 #define Priv        (1<<27) /* instruction generates #GP if current CPL != 0 */
@@ -2406,6 +2407,13 @@ done_prefixes:
 	if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
 		c->op_bytes = 8;
 
+	if (c->d & Op3264) {
+		if (mode == X86EMUL_MODE_PROT64)
+			c->op_bytes = 8;
+		else
+			c->op_bytes = 4;
+	}
+
 	/* ModRM and SIB bytes. */
 	if (c->d & ModRM) {
 		rc = decode_modrm(ctxt, ops);
-- 
cgit v0.10.2


From cecc9e39161898eb767a6b797e27a1660b3eb27e Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 1 Aug 2010 14:48:44 +0300
Subject: KVM: x86 emulator: mark mov cr and mov dr as 64-bit instructions in
 long mode

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b7adfcc..20752dc 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2210,8 +2210,8 @@ static struct opcode twobyte_table[256] = {
 	/* 0x10 - 0x1F */
 	N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N,
 	/* 0x20 - 0x2F */
-	D(ModRM | ImplicitOps | Priv), D(ModRM | Priv),
-	D(ModRM | ImplicitOps | Priv), D(ModRM | Priv),
+	D(ModRM | ImplicitOps | Priv | Op3264), D(ModRM | Priv | Op3264),
+	D(ModRM | ImplicitOps | Priv | Op3264), D(ModRM | Priv | Op3264),
 	N, N, N, N,
 	N, N, N, N, N, N, N, N,
 	/* 0x30 - 0x3F */
-- 
cgit v0.10.2


From 1a0c7d44e4553ffb4902ec15549a9b855cd05a59 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 1 Aug 2010 14:25:22 +0300
Subject: KVM: x86 emulator: use struct operand for mov reg,cr and mov cr,reg
 for reg op

This is an ordinary modrm source or destination; use the standard structure
representing it.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 20752dc..562e034 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2210,8 +2210,8 @@ static struct opcode twobyte_table[256] = {
 	/* 0x10 - 0x1F */
 	N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N,
 	/* 0x20 - 0x2F */
-	D(ModRM | ImplicitOps | Priv | Op3264), D(ModRM | Priv | Op3264),
-	D(ModRM | ImplicitOps | Priv | Op3264), D(ModRM | Priv | Op3264),
+	D(ModRM | DstMem | Priv | Op3264), D(ModRM | Priv | Op3264),
+	D(ModRM | SrcMem | Priv | Op3264), D(ModRM | Priv | Op3264),
 	N, N, N, N,
 	N, N, N, N, N, N, N, N,
 	/* 0x30 - 0x3F */
@@ -3240,8 +3240,7 @@ twobyte_insn:
 			emulate_ud(ctxt);
 			goto done;
 		}
-		c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
-		c->dst.type = OP_NONE;	/* no writeback */
+		c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
 		break;
 	case 0x21: /* mov from dr to reg */
 		if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
@@ -3253,7 +3252,7 @@ twobyte_insn:
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x22: /* mov reg, cr */
-		if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) {
+		if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
 			emulate_gp(ctxt, 0);
 			goto done;
 		}
-- 
cgit v0.10.2


From b27f38563d956135a5e80aca749b399ac5f3158a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 1 Aug 2010 14:25:22 +0300
Subject: KVM: x86 emulator: use struct operand for mov reg,dr and mov dr,reg
 for reg op

This is an ordinary modrm source or destination; use the standard structure
representing it.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 562e034..628fb5d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2210,8 +2210,8 @@ static struct opcode twobyte_table[256] = {
 	/* 0x10 - 0x1F */
 	N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N,
 	/* 0x20 - 0x2F */
-	D(ModRM | DstMem | Priv | Op3264), D(ModRM | Priv | Op3264),
-	D(ModRM | SrcMem | Priv | Op3264), D(ModRM | Priv | Op3264),
+	D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264),
+	D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264),
 	N, N, N, N,
 	N, N, N, N, N, N, N, N,
 	/* 0x30 - 0x3F */
@@ -3248,8 +3248,7 @@ twobyte_insn:
 			emulate_ud(ctxt);
 			goto done;
 		}
-		ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu);
-		c->dst.type = OP_NONE;	/* no writeback */
+		ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
 		break;
 	case 0x22: /* mov reg, cr */
 		if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
@@ -3265,7 +3264,7 @@ twobyte_insn:
 			goto done;
 		}
 
-		if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] &
+		if (ops->set_dr(c->modrm_reg, c->src.val &
 				((ctxt->mode == X86EMUL_MODE_PROT64) ?
 				 ~0ULL : ~0U), ctxt->vcpu) < 0) {
 			/* #UD condition is already handled by the code above */
-- 
cgit v0.10.2


From 5a506b125f1c97c846654ebacc913a136284e42b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 1 Aug 2010 15:10:29 +0300
Subject: KVM: x86 emulator: add NoAccess flag for memory instructions that
 skip access

Use for INVLPG, which accesses the tlb, not memory.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 628fb5d..80efe76 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -83,6 +83,7 @@
 #define Group       (1<<14)     /* Bits 3:5 of modrm byte extend opcode */
 #define GroupDual   (1<<15)     /* Alternate decoding of mod == 3 */
 /* Misc flags */
+#define NoAccess    (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
 #define Op3264      (1<<24) /* Operand is 64b in long mode, 32b otherwise */
 #define Undefined   (1<<25) /* No Such Instruction */
 #define Lock        (1<<26) /* lock prefix is allowed for the instruction */
@@ -2067,7 +2068,8 @@ static struct opcode group5[] = {
 static struct group_dual group7 = { {
 	N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv),
 	D(SrcNone | ModRM | DstMem | Mov), N,
-	D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv),
+	D(SrcMem16 | ModRM | Mov | Priv),
+	D(SrcMem | ModRM | ByteOp | Priv | NoAccess),
 }, {
 	D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv),
 	D(SrcNone | ModRM | DstMem | Mov), N,
@@ -2456,7 +2458,7 @@ done_prefixes:
 		c->src.bytes = (c->d & ByteOp) ? 1 :
 							   c->op_bytes;
 		/* Don't fetch the address for invlpg: it could be unmapped. */
-		if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
+		if (c->d & NoAccess)
 			break;
 	srcmem_common:
 		/*
-- 
cgit v0.10.2


From 342fc63095e2d676f209b202d41a3f670dd9bf08 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 1 Aug 2010 15:13:22 +0300
Subject: KVM: x86 emulator: switch LEA to use SrcMem decoding

The NoAccess flag will prevent memory from being accessed.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 80efe76..b8aa667 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2153,7 +2153,7 @@ static struct opcode opcode_table[256] = {
 	/* 0x88 - 0x8F */
 	D(ByteOp | DstMem | SrcReg | ModRM | Mov), D(DstMem | SrcReg | ModRM | Mov),
 	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem | ModRM | Mov),
-	D(DstMem | SrcNone | ModRM | Mov), D(ModRM | DstReg),
+	D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg),
 	D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
 	/* 0x90 - 0x97 */
 	X8(D(SrcAcc | DstReg)),
@@ -2895,7 +2895,7 @@ special_insn:
 		c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
 		break;
 	case 0x8d: /* lea r16/r32, m */
-		c->dst.val = c->modrm_ea;
+		c->dst.val = c->src.addr.mem;
 		break;
 	case 0x8e: { /* mov seg, r/m16 */
 		uint16_t sel;
-- 
cgit v0.10.2


From 1f6f05800e2fdd815ac63e3264071d26d429f491 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 1 Aug 2010 15:19:22 +0300
Subject: KVM: x86 emulator: change invlpg emulation to use src.mem.addr

Instead of using modrm_ea, which will soon be gone.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b8aa667..eda6941 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3206,7 +3206,7 @@ twobyte_insn:
 			emulate_ud(ctxt);
 			goto done;
 		case 7: /* invlpg*/
-			emulate_invlpg(ctxt->vcpu, c->modrm_ea);
+			emulate_invlpg(ctxt->vcpu, c->src.addr.mem);
 			/* Disable writeback. */
 			c->dst.type = OP_NONE;
 			break;
-- 
cgit v0.10.2


From 2dbd0dd711e6c0ca6a2be9e6d93bbeb339386638 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 1 Aug 2010 15:40:19 +0300
Subject: KVM: x86 emulator: Decode memory operands directly into a 'struct
 operand'

Since modrm operand can be either register or memory, decoding it into
a 'struct operand', which can represent both, is simpler.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index e425444..1e4a72c 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -203,9 +203,6 @@ struct decode_cache {
 	u8 modrm_rm;
 	u8 modrm_seg;
 	bool rip_relative;
-	unsigned long modrm_ea;
-	void *modrm_ptr;
-	unsigned long modrm_val;
 	struct fetch_cache fetch;
 	struct read_cache io_read;
 	struct read_cache mem_read;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index eda6941..955d480 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -581,12 +581,14 @@ static void decode_register_operand(struct operand *op,
 }
 
 static int decode_modrm(struct x86_emulate_ctxt *ctxt,
-			struct x86_emulate_ops *ops)
+			struct x86_emulate_ops *ops,
+			struct operand *op)
 {
 	struct decode_cache *c = &ctxt->decode;
 	u8 sib;
 	int index_reg = 0, base_reg = 0, scale;
 	int rc = X86EMUL_CONTINUE;
+	ulong modrm_ea = 0;
 
 	if (c->rex_prefix) {
 		c->modrm_reg = (c->rex_prefix & 4) << 1;	/* REX.R */
@@ -598,16 +600,19 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 	c->modrm_mod |= (c->modrm & 0xc0) >> 6;
 	c->modrm_reg |= (c->modrm & 0x38) >> 3;
 	c->modrm_rm |= (c->modrm & 0x07);
-	c->modrm_ea = 0;
 	c->modrm_seg = VCPU_SREG_DS;
 
 	if (c->modrm_mod == 3) {
-		c->modrm_ptr = decode_register(c->modrm_rm,
+		op->type = OP_REG;
+		op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		op->addr.reg = decode_register(c->modrm_rm,
 					       c->regs, c->d & ByteOp);
-		c->modrm_val = *(unsigned long *)c->modrm_ptr;
+		fetch_register_operand(op);
 		return rc;
 	}
 
+	op->type = OP_MEM;
+
 	if (c->ad_bytes == 2) {
 		unsigned bx = c->regs[VCPU_REGS_RBX];
 		unsigned bp = c->regs[VCPU_REGS_RBP];
@@ -618,46 +623,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 		switch (c->modrm_mod) {
 		case 0:
 			if (c->modrm_rm == 6)
-				c->modrm_ea += insn_fetch(u16, 2, c->eip);
+				modrm_ea += insn_fetch(u16, 2, c->eip);
 			break;
 		case 1:
-			c->modrm_ea += insn_fetch(s8, 1, c->eip);
+			modrm_ea += insn_fetch(s8, 1, c->eip);
 			break;
 		case 2:
-			c->modrm_ea += insn_fetch(u16, 2, c->eip);
+			modrm_ea += insn_fetch(u16, 2, c->eip);
 			break;
 		}
 		switch (c->modrm_rm) {
 		case 0:
-			c->modrm_ea += bx + si;
+			modrm_ea += bx + si;
 			break;
 		case 1:
-			c->modrm_ea += bx + di;
+			modrm_ea += bx + di;
 			break;
 		case 2:
-			c->modrm_ea += bp + si;
+			modrm_ea += bp + si;
 			break;
 		case 3:
-			c->modrm_ea += bp + di;
+			modrm_ea += bp + di;
 			break;
 		case 4:
-			c->modrm_ea += si;
+			modrm_ea += si;
 			break;
 		case 5:
-			c->modrm_ea += di;
+			modrm_ea += di;
 			break;
 		case 6:
 			if (c->modrm_mod != 0)
-				c->modrm_ea += bp;
+				modrm_ea += bp;
 			break;
 		case 7:
-			c->modrm_ea += bx;
+			modrm_ea += bx;
 			break;
 		}
 		if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
 		    (c->modrm_rm == 6 && c->modrm_mod != 0))
 			c->modrm_seg = VCPU_SREG_SS;
-		c->modrm_ea = (u16)c->modrm_ea;
+		modrm_ea = (u16)modrm_ea;
 	} else {
 		/* 32/64-bit ModR/M decode. */
 		if ((c->modrm_rm & 7) == 4) {
@@ -667,48 +672,51 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 			scale = sib >> 6;
 
 			if ((base_reg & 7) == 5 && c->modrm_mod == 0)
-				c->modrm_ea += insn_fetch(s32, 4, c->eip);
+				modrm_ea += insn_fetch(s32, 4, c->eip);
 			else
-				c->modrm_ea += c->regs[base_reg];
+				modrm_ea += c->regs[base_reg];
 			if (index_reg != 4)
-				c->modrm_ea += c->regs[index_reg] << scale;
+				modrm_ea += c->regs[index_reg] << scale;
 		} else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
 			if (ctxt->mode == X86EMUL_MODE_PROT64)
 				c->rip_relative = 1;
 		} else
-			c->modrm_ea += c->regs[c->modrm_rm];
+			modrm_ea += c->regs[c->modrm_rm];
 		switch (c->modrm_mod) {
 		case 0:
 			if (c->modrm_rm == 5)
-				c->modrm_ea += insn_fetch(s32, 4, c->eip);
+				modrm_ea += insn_fetch(s32, 4, c->eip);
 			break;
 		case 1:
-			c->modrm_ea += insn_fetch(s8, 1, c->eip);
+			modrm_ea += insn_fetch(s8, 1, c->eip);
 			break;
 		case 2:
-			c->modrm_ea += insn_fetch(s32, 4, c->eip);
+			modrm_ea += insn_fetch(s32, 4, c->eip);
 			break;
 		}
 	}
+	op->addr.mem = modrm_ea;
 done:
 	return rc;
 }
 
 static int decode_abs(struct x86_emulate_ctxt *ctxt,
-		      struct x86_emulate_ops *ops)
+		      struct x86_emulate_ops *ops,
+		      struct operand *op)
 {
 	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 
+	op->type = OP_MEM;
 	switch (c->ad_bytes) {
 	case 2:
-		c->modrm_ea = insn_fetch(u16, 2, c->eip);
+		op->addr.mem = insn_fetch(u16, 2, c->eip);
 		break;
 	case 4:
-		c->modrm_ea = insn_fetch(u32, 4, c->eip);
+		op->addr.mem = insn_fetch(u32, 4, c->eip);
 		break;
 	case 8:
-		c->modrm_ea = insn_fetch(u64, 8, c->eip);
+		op->addr.mem = insn_fetch(u64, 8, c->eip);
 		break;
 	}
 done:
@@ -2280,6 +2288,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt)
 	int mode = ctxt->mode;
 	int def_op_bytes, def_ad_bytes, dual, goffset;
 	struct opcode opcode, *g_mod012, *g_mod3;
+	struct operand memop = { .type = OP_NONE };
 
 	/* we cannot decode insn before we complete previous rep insn */
 	WARN_ON(ctxt->restart);
@@ -2418,25 +2427,25 @@ done_prefixes:
 
 	/* ModRM and SIB bytes. */
 	if (c->d & ModRM) {
-		rc = decode_modrm(ctxt, ops);
+		rc = decode_modrm(ctxt, ops, &memop);
 		if (!c->has_seg_override)
 			set_seg_override(c, c->modrm_seg);
 	} else if (c->d & MemAbs)
-		rc = decode_abs(ctxt, ops);
+		rc = decode_abs(ctxt, ops, &memop);
 	if (rc != X86EMUL_CONTINUE)
 		goto done;
 
 	if (!c->has_seg_override)
 		set_seg_override(c, VCPU_SREG_DS);
 
-	if (!(!c->twobyte && c->b == 0x8d))
-		c->modrm_ea += seg_override_base(ctxt, ops, c);
+	if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d))
+		memop.addr.mem += seg_override_base(ctxt, ops, c);
 
-	if (c->ad_bytes != 8)
-		c->modrm_ea = (u32)c->modrm_ea;
+	if (memop.type == OP_MEM && c->ad_bytes != 8)
+		memop.addr.mem = (u32)memop.addr.mem;
 
-	if (c->rip_relative)
-		c->modrm_ea += c->eip;
+	if (memop.type == OP_MEM && c->rip_relative)
+		memop.addr.mem += c->eip;
 
 	/*
 	 * Decode and fetch the source operand: register, memory
@@ -2449,31 +2458,16 @@ done_prefixes:
 		decode_register_operand(&c->src, c, 0);
 		break;
 	case SrcMem16:
-		c->src.bytes = 2;
+		memop.bytes = 2;
 		goto srcmem_common;
 	case SrcMem32:
-		c->src.bytes = 4;
+		memop.bytes = 4;
 		goto srcmem_common;
 	case SrcMem:
-		c->src.bytes = (c->d & ByteOp) ? 1 :
+		memop.bytes = (c->d & ByteOp) ? 1 :
 							   c->op_bytes;
-		/* Don't fetch the address for invlpg: it could be unmapped. */
-		if (c->d & NoAccess)
-			break;
 	srcmem_common:
-		/*
-		 * For instructions with a ModR/M byte, switch to register
-		 * access if Mod = 3.
-		 */
-		if ((c->d & ModRM) && c->modrm_mod == 3) {
-			c->src.type = OP_REG;
-			c->src.val = c->modrm_val;
-			c->src.addr.reg = c->modrm_ptr;
-			break;
-		}
-		c->src.type = OP_MEM;
-		c->src.addr.mem = c->modrm_ea;
-		c->src.val = 0;
+		c->src = memop;
 		break;
 	case SrcImm:
 	case SrcImmU:
@@ -2543,9 +2537,8 @@ done_prefixes:
 		insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
 		break;
 	case SrcMemFAddr:
-		c->src.type = OP_MEM;
-		c->src.addr.mem = c->modrm_ea;
-		c->src.bytes = c->op_bytes + 2;
+		memop.bytes = c->op_bytes + 2;
+		goto srcmem_common;
 		break;
 	}
 
@@ -2583,26 +2576,18 @@ done_prefixes:
 		break;
 	case DstMem:
 	case DstMem64:
-		if ((c->d & ModRM) && c->modrm_mod == 3) {
-			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-			c->dst.type = OP_REG;
-			c->dst.val = c->dst.orig_val = c->modrm_val;
-			c->dst.addr.reg = c->modrm_ptr;
-			break;
-		}
-		c->dst.type = OP_MEM;
-		c->dst.addr.mem = c->modrm_ea;
+		c->dst = memop;
 		if ((c->d & DstMask) == DstMem64)
 			c->dst.bytes = 8;
 		else
 			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.val = 0;
-		if (c->d & BitOp) {
+		if (c->dst.type == OP_MEM && (c->d & BitOp)) {
 			unsigned long mask = ~(c->dst.bytes * 8 - 1);
 
 			c->dst.addr.mem = c->dst.addr.mem +
 						   (c->src.val & mask) / 8;
 		}
+		c->dst.orig_val = c->dst.val;
 		break;
 	case DstAcc:
 		c->dst.type = OP_REG;
@@ -2682,11 +2667,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 	}
 
 	if (c->src.type == OP_MEM) {
+		if (c->d & NoAccess)
+			goto no_fetch;
 		rc = read_emulated(ctxt, ops, c->src.addr.mem,
 					c->src.valptr, c->src.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		c->src.orig_val64 = c->src.val64;
+	no_fetch:
+		;
 	}
 
 	if (c->src2.type == OP_MEM) {
-- 
cgit v0.10.2


From 34698d8c61bd3fc86b2e99c3d1ad9ef140b3eb0d Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Wed, 4 Aug 2010 14:41:04 +0300
Subject: KVM: x86 emulator: Fix nop emulation

If a nop instruction is encountered, we jump directly to the done label.
This skip updating rip. Break from the switch case instead

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 955d480..ddbad15 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2912,7 +2912,7 @@ special_insn:
 		break;
 	case 0x90 ... 0x97: /* nop / xchg reg, rax */
 		if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX])
-			goto done;
+			break;
 		goto xchg;
 	case 0x9c: /* pushf */
 		c->src.val =  (unsigned long) ctxt->eflags;
-- 
cgit v0.10.2


From 96bc451a153297bf1f99ef2d633d512ea349ae7a Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:47:42 +0200
Subject: KVM: PPC: Introduce shared page

For transparent variable sharing between the hypervisor and guest, I introduce
a shared page. This shared page will contain all the registers the guest can
read and write safely without exiting guest context.

This patch only implements the stubs required for the basic structure of the
shared page. The actual register moving follows.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index b0b23c0..53edacd 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -25,6 +25,7 @@
 #include <linux/interrupt.h>
 #include <linux/types.h>
 #include <linux/kvm_types.h>
+#include <linux/kvm_para.h>
 #include <asm/kvm_asm.h>
 
 #define KVM_MAX_VCPUS 1
@@ -290,6 +291,7 @@ struct kvm_vcpu_arch {
 	struct tasklet_struct tasklet;
 	u64 dec_jiffies;
 	unsigned long pending_exceptions;
+	struct kvm_vcpu_arch_shared *shared;
 
 #ifdef CONFIG_PPC_BOOK3S
 	struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 2d48f6a..1485ba8 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -20,6 +20,11 @@
 #ifndef __POWERPC_KVM_PARA_H__
 #define __POWERPC_KVM_PARA_H__
 
+#include <linux/types.h>
+
+struct kvm_vcpu_arch_shared {
+};
+
 #ifdef __KERNEL__
 
 static inline int kvm_para_available(void)
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 1c0607d..60e7db4 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -400,6 +400,7 @@ int main(void)
 	DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
 	DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7));
 	DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid));
+	DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared));
 
 	/* book3s */
 #ifdef CONFIG_PPC_BOOK3S
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 73c0a3f..e7b1f3f 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -123,8 +123,14 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	if (err)
 		goto free_vcpu;
 
+	vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO);
+	if (!vcpu->arch.shared)
+		goto uninit_vcpu;
+
 	return vcpu;
 
+uninit_vcpu:
+	kvm_vcpu_uninit(vcpu);
 free_vcpu:
 	kmem_cache_free(kvm_vcpu_cache, vcpu_44x);
 out:
@@ -135,6 +141,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 
+	free_page((unsigned long)vcpu->arch.shared);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vcpu_44x);
 }
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index a3cef30..b3385dd 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1242,6 +1242,10 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	if (err)
 		goto free_shadow_vcpu;
 
+	vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO);
+	if (!vcpu->arch.shared)
+		goto uninit_vcpu;
+
 	vcpu->arch.host_retip = kvm_return_point;
 	vcpu->arch.host_msr = mfmsr();
 #ifdef CONFIG_PPC_BOOK3S_64
@@ -1268,10 +1272,12 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 
 	err = kvmppc_mmu_init(vcpu);
 	if (err < 0)
-		goto free_shadow_vcpu;
+		goto uninit_vcpu;
 
 	return vcpu;
 
+uninit_vcpu:
+	kvm_vcpu_uninit(vcpu);
 free_shadow_vcpu:
 	kfree(vcpu_book3s->shadow_vcpu);
 free_vcpu:
@@ -1284,6 +1290,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
 
+	free_page((unsigned long)vcpu->arch.shared);
 	kvm_vcpu_uninit(vcpu);
 	kfree(vcpu_book3s->shadow_vcpu);
 	vfree(vcpu_book3s);
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index e8a00b0..71750f2 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -117,8 +117,14 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	if (err)
 		goto uninit_vcpu;
 
+	vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO);
+	if (!vcpu->arch.shared)
+		goto uninit_tlb;
+
 	return vcpu;
 
+uninit_tlb:
+	kvmppc_e500_tlb_uninit(vcpu_e500);
 uninit_vcpu:
 	kvm_vcpu_uninit(vcpu);
 free_vcpu:
@@ -131,6 +137,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 
+	free_page((unsigned long)vcpu->arch.shared);
 	kvmppc_e500_tlb_uninit(vcpu_e500);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
-- 
cgit v0.10.2


From 666e7252a15b7fc4a116e65deaf6da5e4ce660e3 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:47:43 +0200
Subject: KVM: PPC: Convert MSR to shared page

One of the most obvious registers to share with the guest directly is the
MSR. The MSR contains the "interrupts enabled" flag which the guest has to
toggle in critical sections.

So in order to bring the overhead of interrupt en- and disabling down, let's
put msr into the shared page. Keep in mind that even though you can fully read
its contents, writing to it doesn't always update all state. There are a few
safe fields that don't require hypervisor interaction. See the documentation
for a list of MSR bits that are safe to be set from inside the guest.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 53edacd..ba20f90 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -211,7 +211,6 @@ struct kvm_vcpu_arch {
 	u32 cr;
 #endif
 
-	ulong msr;
 #ifdef CONFIG_PPC_BOOK3S
 	ulong shadow_msr;
 	ulong hflags;
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 1485ba8..a17dc52 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -23,6 +23,7 @@
 #include <linux/types.h>
 
 struct kvm_vcpu_arch_shared {
+	__u64 msr;
 };
 
 #ifdef __KERNEL__
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 60e7db4..1221bcd 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -394,13 +394,13 @@ int main(void)
 	DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
 	DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
 	DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
-	DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.msr));
 	DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4));
 	DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
 	DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
 	DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7));
 	DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid));
 	DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared));
+	DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr));
 
 	/* book3s */
 #ifdef CONFIG_PPC_BOOK3S
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 9b9b5cd..9f71b8d 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -221,14 +221,14 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int gtlb_index,
 
 int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
 {
-	unsigned int as = !!(vcpu->arch.msr & MSR_IS);
+	unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS);
 
 	return kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
 }
 
 int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
 {
-	unsigned int as = !!(vcpu->arch.msr & MSR_DS);
+	unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS);
 
 	return kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
 }
@@ -354,7 +354,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
 
 	stlbe.word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf);
 	stlbe.word2 = kvmppc_44x_tlb_shadow_attrib(flags,
-	                                            vcpu->arch.msr & MSR_PR);
+	                                            vcpu->arch.shared->msr & MSR_PR);
 	stlbe.tid = !(asid & 0xff);
 
 	/* Keep track of the reference so we can properly release it later. */
@@ -423,7 +423,7 @@ static int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
 
 	/* Does it match current guest AS? */
 	/* XXX what about IS != DS? */
-	if (get_tlb_ts(tlbe) != !!(vcpu->arch.msr & MSR_IS))
+	if (get_tlb_ts(tlbe) != !!(vcpu->arch.shared->msr & MSR_IS))
 		return 0;
 
 	gpa = get_tlb_raddr(tlbe);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index b3385dd..2efe692 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -115,31 +115,31 @@ static u32 kvmppc_get_dec(struct kvm_vcpu *vcpu)
 
 static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.shadow_msr = vcpu->arch.msr;
+	ulong smsr = vcpu->arch.shared->msr;
+
 	/* Guest MSR values */
-	vcpu->arch.shadow_msr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE |
-				 MSR_BE | MSR_DE;
+	smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE;
 	/* Process MSR values */
-	vcpu->arch.shadow_msr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR |
-				 MSR_EE;
+	smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
 	/* External providers the guest reserved */
-	vcpu->arch.shadow_msr |= (vcpu->arch.msr & vcpu->arch.guest_owned_ext);
+	smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext);
 	/* 64-bit Process MSR values */
 #ifdef CONFIG_PPC_BOOK3S_64
-	vcpu->arch.shadow_msr |= MSR_ISF | MSR_HV;
+	smsr |= MSR_ISF | MSR_HV;
 #endif
+	vcpu->arch.shadow_msr = smsr;
 }
 
 void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
 {
-	ulong old_msr = vcpu->arch.msr;
+	ulong old_msr = vcpu->arch.shared->msr;
 
 #ifdef EXIT_DEBUG
 	printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr);
 #endif
 
 	msr &= to_book3s(vcpu)->msr_mask;
-	vcpu->arch.msr = msr;
+	vcpu->arch.shared->msr = msr;
 	kvmppc_recalc_shadow_msr(vcpu);
 
 	if (msr & (MSR_WE|MSR_POW)) {
@@ -149,21 +149,21 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
 		}
 	}
 
-	if ((vcpu->arch.msr & (MSR_PR|MSR_IR|MSR_DR)) !=
+	if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) !=
 		   (old_msr & (MSR_PR|MSR_IR|MSR_DR))) {
 		kvmppc_mmu_flush_segments(vcpu);
 		kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
 	}
 
 	/* Preload FPU if it's enabled */
-	if (vcpu->arch.msr & MSR_FP)
+	if (vcpu->arch.shared->msr & MSR_FP)
 		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
 }
 
 void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags)
 {
 	vcpu->arch.srr0 = kvmppc_get_pc(vcpu);
-	vcpu->arch.srr1 = vcpu->arch.msr | flags;
+	vcpu->arch.srr1 = vcpu->arch.shared->msr | flags;
 	kvmppc_set_pc(vcpu, to_book3s(vcpu)->hior + vec);
 	vcpu->arch.mmu.reset_msr(vcpu);
 }
@@ -254,11 +254,11 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
 
 	switch (priority) {
 	case BOOK3S_IRQPRIO_DECREMENTER:
-		deliver = vcpu->arch.msr & MSR_EE;
+		deliver = vcpu->arch.shared->msr & MSR_EE;
 		vec = BOOK3S_INTERRUPT_DECREMENTER;
 		break;
 	case BOOK3S_IRQPRIO_EXTERNAL:
-		deliver = vcpu->arch.msr & MSR_EE;
+		deliver = vcpu->arch.shared->msr & MSR_EE;
 		vec = BOOK3S_INTERRUPT_EXTERNAL;
 		break;
 	case BOOK3S_IRQPRIO_SYSTEM_RESET:
@@ -437,7 +437,7 @@ static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
 static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data,
 			 struct kvmppc_pte *pte)
 {
-	int relocated = (vcpu->arch.msr & (data ? MSR_DR : MSR_IR));
+	int relocated = (vcpu->arch.shared->msr & (data ? MSR_DR : MSR_IR));
 	int r;
 
 	if (relocated) {
@@ -545,8 +545,8 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	int page_found = 0;
 	struct kvmppc_pte pte;
 	bool is_mmio = false;
-	bool dr = (vcpu->arch.msr & MSR_DR) ? true : false;
-	bool ir = (vcpu->arch.msr & MSR_IR) ? true : false;
+	bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false;
+	bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false;
 	u64 vsid;
 
 	relocated = data ? dr : ir;
@@ -563,7 +563,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		pte.vpage = eaddr >> 12;
 	}
 
-	switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) {
+	switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
 	case 0:
 		pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12));
 		break;
@@ -571,7 +571,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	case MSR_IR:
 		vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid);
 
-		if ((vcpu->arch.msr & (MSR_DR|MSR_IR)) == MSR_DR)
+		if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR)
 			pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12));
 		else
 			pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12));
@@ -596,14 +596,16 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		/* Page not found in guest PTE entries */
 		vcpu->arch.dear = kvmppc_get_fault_dar(vcpu);
 		to_book3s(vcpu)->dsisr = to_svcpu(vcpu)->fault_dsisr;
-		vcpu->arch.msr |= (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
+		vcpu->arch.shared->msr |=
+			(to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
 		kvmppc_book3s_queue_irqprio(vcpu, vec);
 	} else if (page_found == -EPERM) {
 		/* Storage protection */
 		vcpu->arch.dear = kvmppc_get_fault_dar(vcpu);
 		to_book3s(vcpu)->dsisr = to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE;
 		to_book3s(vcpu)->dsisr |= DSISR_PROTFAULT;
-		vcpu->arch.msr |= (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
+		vcpu->arch.shared->msr |=
+			(to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
 		kvmppc_book3s_queue_irqprio(vcpu, vec);
 	} else if (page_found == -EINVAL) {
 		/* Page not found in guest SLB */
@@ -695,9 +697,11 @@ static int kvmppc_read_inst(struct kvm_vcpu *vcpu)
 
 	ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
 	if (ret == -ENOENT) {
-		vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 33, 33, 1);
-		vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 34, 36, 0);
-		vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 42, 47, 0);
+		ulong msr = vcpu->arch.shared->msr;
+
+		msr = kvmppc_set_field(msr, 33, 33, 1);
+		msr = kvmppc_set_field(msr, 34, 36, 0);
+		vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0);
 		kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE);
 		return EMULATE_AGAIN;
 	}
@@ -736,7 +740,7 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
 	if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)
 		return RESUME_GUEST;
 
-	if (!(vcpu->arch.msr & msr)) {
+	if (!(vcpu->arch.shared->msr & msr)) {
 		kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
 		return RESUME_GUEST;
 	}
@@ -804,7 +808,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	if ((exit_nr != 0x900) && (exit_nr != 0x500))
 		printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | dar=0x%lx | msr=0x%lx\n",
 			exit_nr, kvmppc_get_pc(vcpu), kvmppc_get_fault_dar(vcpu),
-			vcpu->arch.msr);
+			vcpu->arch.shared->msr);
 #endif
 	kvm_resched(vcpu);
 	switch (exit_nr) {
@@ -836,7 +840,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
 			r = RESUME_GUEST;
 		} else {
-			vcpu->arch.msr |= to_svcpu(vcpu)->shadow_srr1 & 0x58000000;
+			vcpu->arch.shared->msr |=
+				to_svcpu(vcpu)->shadow_srr1 & 0x58000000;
 			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
 			kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
 			r = RESUME_GUEST;
@@ -904,7 +909,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 program_interrupt:
 		flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull;
 
-		if (vcpu->arch.msr & MSR_PR) {
+		if (vcpu->arch.shared->msr & MSR_PR) {
 #ifdef EXIT_DEBUG
 			printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
 #endif
@@ -1052,7 +1057,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	regs->ctr = kvmppc_get_ctr(vcpu);
 	regs->lr = kvmppc_get_lr(vcpu);
 	regs->xer = kvmppc_get_xer(vcpu);
-	regs->msr = vcpu->arch.msr;
+	regs->msr = vcpu->arch.shared->msr;
 	regs->srr0 = vcpu->arch.srr0;
 	regs->srr1 = vcpu->arch.srr1;
 	regs->pid = vcpu->arch.pid;
@@ -1353,7 +1358,7 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	local_irq_enable();
 
 	/* Preload FPU if it's enabled */
-	if (vcpu->arch.msr & MSR_FP)
+	if (vcpu->arch.shared->msr & MSR_FP)
 		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
 
 	ret = __kvmppc_vcpu_entry(kvm_run, vcpu);
diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
index 3292d76..449bce5 100644
--- a/arch/powerpc/kvm/book3s_32_mmu.c
+++ b/arch/powerpc/kvm/book3s_32_mmu.c
@@ -133,7 +133,7 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,
 		else
 			bat = &vcpu_book3s->ibat[i];
 
-		if (vcpu->arch.msr & MSR_PR) {
+		if (vcpu->arch.shared->msr & MSR_PR) {
 			if (!bat->vp)
 				continue;
 		} else {
@@ -214,8 +214,8 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
 			pte->raddr = (pteg[i+1] & ~(0xFFFULL)) | (eaddr & 0xFFF);
 			pp = pteg[i+1] & 3;
 
-			if ((sre->Kp &&  (vcpu->arch.msr & MSR_PR)) ||
-			    (sre->Ks && !(vcpu->arch.msr & MSR_PR)))
+			if ((sre->Kp &&  (vcpu->arch.shared->msr & MSR_PR)) ||
+			    (sre->Ks && !(vcpu->arch.shared->msr & MSR_PR)))
 				pp |= 4;
 
 			pte->may_write = false;
@@ -334,7 +334,7 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 	struct kvmppc_sr *sr;
 	u64 gvsid = esid;
 
-	if (vcpu->arch.msr & (MSR_DR|MSR_IR)) {
+	if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
 		sr = find_sr(to_book3s(vcpu), ea);
 		if (sr->valid)
 			gvsid = sr->vsid;
@@ -343,7 +343,7 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 	/* In case we only have one of MSR_IR or MSR_DR set, let's put
 	   that in the real-mode context (and hope RM doesn't access
 	   high memory) */
-	switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) {
+	switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
 	case 0:
 		*vsid = VSID_REAL | esid;
 		break;
@@ -363,7 +363,7 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 		BUG();
 	}
 
-	if (vcpu->arch.msr & MSR_PR)
+	if (vcpu->arch.shared->msr & MSR_PR)
 		*vsid |= VSID_PR;
 
 	return 0;
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index 0b51ef8..67b8c38 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -86,7 +86,7 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
 	struct kvmppc_sid_map *map;
 	u16 sid_map_mask;
 
-	if (vcpu->arch.msr & MSR_PR)
+	if (vcpu->arch.shared->msr & MSR_PR)
 		gvsid |= VSID_PR;
 
 	sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
@@ -253,7 +253,7 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
 	u16 sid_map_mask;
 	static int backwards_map = 0;
 
-	if (vcpu->arch.msr & MSR_PR)
+	if (vcpu->arch.shared->msr & MSR_PR)
 		gvsid |= VSID_PR;
 
 	/* We might get collisions that trap in preceding order, so let's
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index 4025ea2..58aa840 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -180,9 +180,9 @@ do_second:
 		goto no_page_found;
 	}
 
-	if ((vcpu->arch.msr & MSR_PR) && slbe->Kp)
+	if ((vcpu->arch.shared->msr & MSR_PR) && slbe->Kp)
 		key = 4;
-	else if (!(vcpu->arch.msr & MSR_PR) && slbe->Ks)
+	else if (!(vcpu->arch.shared->msr & MSR_PR) && slbe->Ks)
 		key = 4;
 
 	for (i=0; i<16; i+=2) {
@@ -381,7 +381,7 @@ static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu)
 	for (i = 1; i < vcpu_book3s->slb_nr; i++)
 		vcpu_book3s->slb[i].valid = false;
 
-	if (vcpu->arch.msr & MSR_IR) {
+	if (vcpu->arch.shared->msr & MSR_IR) {
 		kvmppc_mmu_flush_segments(vcpu);
 		kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
 	}
@@ -446,13 +446,13 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 	struct kvmppc_slb *slb;
 	u64 gvsid = esid;
 
-	if (vcpu->arch.msr & (MSR_DR|MSR_IR)) {
+	if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
 		slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), ea);
 		if (slb)
 			gvsid = slb->vsid;
 	}
 
-	switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) {
+	switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
 	case 0:
 		*vsid = VSID_REAL | esid;
 		break;
@@ -473,7 +473,7 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 		break;
 	}
 
-	if (vcpu->arch.msr & MSR_PR)
+	if (vcpu->arch.shared->msr & MSR_PR)
 		*vsid |= VSID_PR;
 
 	return 0;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 384179a..71c1f90 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -66,7 +66,7 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
 	struct kvmppc_sid_map *map;
 	u16 sid_map_mask;
 
-	if (vcpu->arch.msr & MSR_PR)
+	if (vcpu->arch.shared->msr & MSR_PR)
 		gvsid |= VSID_PR;
 
 	sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
@@ -191,7 +191,7 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
 	u16 sid_map_mask;
 	static int backwards_map = 0;
 
-	if (vcpu->arch.msr & MSR_PR)
+	if (vcpu->arch.shared->msr & MSR_PR)
 		gvsid |= VSID_PR;
 
 	/* We might get collisions that trap in preceding order, so let's
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index c85f906..35d3c16 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -86,14 +86,15 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	case 31:
 		switch (get_xop(inst)) {
 		case OP_31_XOP_MFMSR:
-			kvmppc_set_gpr(vcpu, get_rt(inst), vcpu->arch.msr);
+			kvmppc_set_gpr(vcpu, get_rt(inst),
+				       vcpu->arch.shared->msr);
 			break;
 		case OP_31_XOP_MTMSRD:
 		{
 			ulong rs = kvmppc_get_gpr(vcpu, get_rs(inst));
 			if (inst & 0x10000) {
-				vcpu->arch.msr &= ~(MSR_RI | MSR_EE);
-				vcpu->arch.msr |= rs & (MSR_RI | MSR_EE);
+				vcpu->arch.shared->msr &= ~(MSR_RI | MSR_EE);
+				vcpu->arch.shared->msr |= rs & (MSR_RI | MSR_EE);
 			} else
 				kvmppc_set_msr(vcpu, rs);
 			break;
@@ -204,7 +205,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				ra = kvmppc_get_gpr(vcpu, get_ra(inst));
 
 			addr = (ra + rb) & ~31ULL;
-			if (!(vcpu->arch.msr & MSR_SF))
+			if (!(vcpu->arch.shared->msr & MSR_SF))
 				addr &= 0xffffffff;
 			vaddr = addr;
 
diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c
index 474f2e2..626e6ef 100644
--- a/arch/powerpc/kvm/book3s_paired_singles.c
+++ b/arch/powerpc/kvm/book3s_paired_singles.c
@@ -165,9 +165,10 @@ static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt)
 static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store)
 {
 	u64 dsisr;
+	struct kvm_vcpu_arch_shared *shared = vcpu->arch.shared;
 
-	vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 33, 36, 0);
-	vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 42, 47, 0);
+	shared->msr = kvmppc_set_field(shared->msr, 33, 36, 0);
+	shared->msr = kvmppc_set_field(shared->msr, 42, 47, 0);
 	vcpu->arch.dear = eaddr;
 	/* Page Fault */
 	dsisr = kvmppc_set_field(0, 33, 33, 1);
@@ -658,7 +659,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	if (!kvmppc_inst_is_paired_single(vcpu, inst))
 		return EMULATE_FAIL;
 
-	if (!(vcpu->arch.msr & MSR_FP)) {
+	if (!(vcpu->arch.shared->msr & MSR_FP)) {
 		kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL);
 		return EMULATE_AGAIN;
 	}
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 8d4e35f..4ec9d49 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -62,7 +62,7 @@ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
 {
 	int i;
 
-	printk("pc:   %08lx msr:  %08lx\n", vcpu->arch.pc, vcpu->arch.msr);
+	printk("pc:   %08lx msr:  %08llx\n", vcpu->arch.pc, vcpu->arch.shared->msr);
 	printk("lr:   %08lx ctr:  %08lx\n", vcpu->arch.lr, vcpu->arch.ctr);
 	printk("srr0: %08lx srr1: %08lx\n", vcpu->arch.srr0, vcpu->arch.srr1);
 
@@ -169,34 +169,34 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 		break;
 	case BOOKE_IRQPRIO_CRITICAL:
 	case BOOKE_IRQPRIO_WATCHDOG:
-		allowed = vcpu->arch.msr & MSR_CE;
+		allowed = vcpu->arch.shared->msr & MSR_CE;
 		msr_mask = MSR_ME;
 		break;
 	case BOOKE_IRQPRIO_MACHINE_CHECK:
-		allowed = vcpu->arch.msr & MSR_ME;
+		allowed = vcpu->arch.shared->msr & MSR_ME;
 		msr_mask = 0;
 		break;
 	case BOOKE_IRQPRIO_EXTERNAL:
 	case BOOKE_IRQPRIO_DECREMENTER:
 	case BOOKE_IRQPRIO_FIT:
-		allowed = vcpu->arch.msr & MSR_EE;
+		allowed = vcpu->arch.shared->msr & MSR_EE;
 		msr_mask = MSR_CE|MSR_ME|MSR_DE;
 		break;
 	case BOOKE_IRQPRIO_DEBUG:
-		allowed = vcpu->arch.msr & MSR_DE;
+		allowed = vcpu->arch.shared->msr & MSR_DE;
 		msr_mask = MSR_ME;
 		break;
 	}
 
 	if (allowed) {
 		vcpu->arch.srr0 = vcpu->arch.pc;
-		vcpu->arch.srr1 = vcpu->arch.msr;
+		vcpu->arch.srr1 = vcpu->arch.shared->msr;
 		vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority];
 		if (update_esr == true)
 			vcpu->arch.esr = vcpu->arch.queued_esr;
 		if (update_dear == true)
 			vcpu->arch.dear = vcpu->arch.queued_dear;
-		kvmppc_set_msr(vcpu, vcpu->arch.msr & msr_mask);
+		kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask);
 
 		clear_bit(priority, &vcpu->arch.pending_exceptions);
 	}
@@ -265,7 +265,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 
 	case BOOKE_INTERRUPT_PROGRAM:
-		if (vcpu->arch.msr & MSR_PR) {
+		if (vcpu->arch.shared->msr & MSR_PR) {
 			/* Program traps generated by user-level software must be handled
 			 * by the guest kernel. */
 			kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr);
@@ -467,7 +467,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.pc = 0;
-	vcpu->arch.msr = 0;
+	vcpu->arch.shared->msr = 0;
 	kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */
 
 	vcpu->arch.shadow_pid = 1;
@@ -490,7 +490,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	regs->ctr = vcpu->arch.ctr;
 	regs->lr = vcpu->arch.lr;
 	regs->xer = kvmppc_get_xer(vcpu);
-	regs->msr = vcpu->arch.msr;
+	regs->msr = vcpu->arch.shared->msr;
 	regs->srr0 = vcpu->arch.srr0;
 	regs->srr1 = vcpu->arch.srr1;
 	regs->pid = vcpu->arch.pid;
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index d59bcca..88258ac 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -54,12 +54,12 @@ extern unsigned long kvmppc_booke_handlers;
  * changing. */
 static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
 {
-	if ((new_msr & MSR_PR) != (vcpu->arch.msr & MSR_PR))
+	if ((new_msr & MSR_PR) != (vcpu->arch.shared->msr & MSR_PR))
 		kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR);
 
-	vcpu->arch.msr = new_msr;
+	vcpu->arch.shared->msr = new_msr;
 
-	if (vcpu->arch.msr & MSR_WE) {
+	if (vcpu->arch.shared->msr & MSR_WE) {
 		kvm_vcpu_block(vcpu);
 		kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
 	};
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index cbc790e..b115203 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -62,7 +62,7 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 		case OP_31_XOP_MFMSR:
 			rt = get_rt(inst);
-			kvmppc_set_gpr(vcpu, rt, vcpu->arch.msr);
+			kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->msr);
 			kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS);
 			break;
 
@@ -74,13 +74,13 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 		case OP_31_XOP_WRTEE:
 			rs = get_rs(inst);
-			vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
+			vcpu->arch.shared->msr = (vcpu->arch.shared->msr & ~MSR_EE)
 					| (kvmppc_get_gpr(vcpu, rs) & MSR_EE);
 			kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
 			break;
 
 		case OP_31_XOP_WRTEEI:
-			vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
+			vcpu->arch.shared->msr = (vcpu->arch.shared->msr & ~MSR_EE)
 							 | (inst & MSR_EE);
 			kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
 			break;
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 380a78c..0498469 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -415,7 +415,8 @@ lightweight_exit:
 	lwz	r8, VCPU_GPR(r8)(r4)
 	lwz	r3, VCPU_PC(r4)
 	mtsrr0	r3
-	lwz	r3, VCPU_MSR(r4)
+	lwz	r3, VCPU_SHARED(r4)
+	lwz	r3, VCPU_SHARED_MSR(r3)
 	oris	r3, r3, KVMPPC_MSR_MASK@h
 	ori	r3, r3, KVMPPC_MSR_MASK@l
 	mtsrr1	r3
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 21011e1..092a390 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -314,10 +314,10 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 		| MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID;
 	stlbe->mas2 = (gvaddr & MAS2_EPN)
 		| e500_shadow_mas2_attrib(gtlbe->mas2,
-				vcpu_e500->vcpu.arch.msr & MSR_PR);
+				vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
 	stlbe->mas3 = (hpaddr & MAS3_RPN)
 		| e500_shadow_mas3_attrib(gtlbe->mas3,
-				vcpu_e500->vcpu.arch.msr & MSR_PR);
+				vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
 	stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN;
 
 	trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2,
@@ -576,28 +576,28 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 
 int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
 {
-	unsigned int as = !!(vcpu->arch.msr & MSR_IS);
+	unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS);
 
 	return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as);
 }
 
 int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
 {
-	unsigned int as = !!(vcpu->arch.msr & MSR_DS);
+	unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS);
 
 	return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as);
 }
 
 void kvmppc_mmu_itlb_miss(struct kvm_vcpu *vcpu)
 {
-	unsigned int as = !!(vcpu->arch.msr & MSR_IS);
+	unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS);
 
 	kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.pc, as);
 }
 
 void kvmppc_mmu_dtlb_miss(struct kvm_vcpu *vcpu)
 {
-	unsigned int as = !!(vcpu->arch.msr & MSR_DS);
+	unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS);
 
 	kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.fault_dear, as);
 }
diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h
index d28e301..458946b 100644
--- a/arch/powerpc/kvm/e500_tlb.h
+++ b/arch/powerpc/kvm/e500_tlb.h
@@ -171,7 +171,7 @@ static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
 
 	/* Does it match current guest AS? */
 	/* XXX what about IS != DS? */
-	if (get_tlb_ts(tlbe) != !!(vcpu->arch.msr & MSR_IS))
+	if (get_tlb_ts(tlbe) != !!(vcpu->arch.shared->msr & MSR_IS))
 		return 0;
 
 	gpa = get_tlb_raddr(tlbe);
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 72a4ad8..22f6fa2 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -38,7 +38,8 @@
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
-	return !(v->arch.msr & MSR_WE) || !!(v->arch.pending_exceptions);
+	return !(v->arch.shared->msr & MSR_WE) ||
+	       !!(v->arch.pending_exceptions);
 }
 
 
-- 
cgit v0.10.2


From d562de48de68b60b3d2522e7d8273d7112034ee6 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:47:44 +0200
Subject: KVM: PPC: Convert DSISR to shared page

The DSISR register contains information about a data page fault. It is fully
read/write from inside the guest context and we don't need to worry about
interacting based on writes of this register.

This patch converts all users of the current field to the shared page.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 8274a2d..b5b1961 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -85,7 +85,6 @@ struct kvmppc_vcpu_book3s {
 	u64 hid[6];
 	u64 gqr[8];
 	int slb_nr;
-	u32 dsisr;
 	u64 sdr1;
 	u64 hior;
 	u64 msr_mask;
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index a17dc52..9f7565b 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -24,6 +24,7 @@
 
 struct kvm_vcpu_arch_shared {
 	__u64 msr;
+	__u32 dsisr;
 };
 
 #ifdef __KERNEL__
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 2efe692..eb401b6 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -595,15 +595,16 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	if (page_found == -ENOENT) {
 		/* Page not found in guest PTE entries */
 		vcpu->arch.dear = kvmppc_get_fault_dar(vcpu);
-		to_book3s(vcpu)->dsisr = to_svcpu(vcpu)->fault_dsisr;
+		vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
 		vcpu->arch.shared->msr |=
 			(to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
 		kvmppc_book3s_queue_irqprio(vcpu, vec);
 	} else if (page_found == -EPERM) {
 		/* Storage protection */
 		vcpu->arch.dear = kvmppc_get_fault_dar(vcpu);
-		to_book3s(vcpu)->dsisr = to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE;
-		to_book3s(vcpu)->dsisr |= DSISR_PROTFAULT;
+		vcpu->arch.shared->dsisr =
+			to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE;
+		vcpu->arch.shared->dsisr |= DSISR_PROTFAULT;
 		vcpu->arch.shared->msr |=
 			(to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
 		kvmppc_book3s_queue_irqprio(vcpu, vec);
@@ -867,7 +868,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr);
 		} else {
 			vcpu->arch.dear = dar;
-			to_book3s(vcpu)->dsisr = to_svcpu(vcpu)->fault_dsisr;
+			vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
 			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
 			kvmppc_mmu_pte_flush(vcpu, vcpu->arch.dear, ~0xFFFUL);
 			r = RESUME_GUEST;
@@ -994,7 +995,7 @@ program_interrupt:
 	}
 	case BOOK3S_INTERRUPT_ALIGNMENT:
 		if (kvmppc_read_inst(vcpu) == EMULATE_DONE) {
-			to_book3s(vcpu)->dsisr = kvmppc_alignment_dsisr(vcpu,
+			vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu,
 				kvmppc_get_last_inst(vcpu));
 			vcpu->arch.dear = kvmppc_alignment_dar(vcpu,
 				kvmppc_get_last_inst(vcpu));
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index 35d3c16..9982ff1 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -221,7 +221,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				else if (r == -EPERM)
 					dsisr |= DSISR_PROTFAULT;
 
-				to_book3s(vcpu)->dsisr = dsisr;
+				vcpu->arch.shared->dsisr = dsisr;
 				to_svcpu(vcpu)->fault_dsisr = dsisr;
 
 				kvmppc_book3s_queue_irqprio(vcpu,
@@ -327,7 +327,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 		to_book3s(vcpu)->sdr1 = spr_val;
 		break;
 	case SPRN_DSISR:
-		to_book3s(vcpu)->dsisr = spr_val;
+		vcpu->arch.shared->dsisr = spr_val;
 		break;
 	case SPRN_DAR:
 		vcpu->arch.dear = spr_val;
@@ -440,7 +440,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1);
 		break;
 	case SPRN_DSISR:
-		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->dsisr);
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dsisr);
 		break;
 	case SPRN_DAR:
 		kvmppc_set_gpr(vcpu, rt, vcpu->arch.dear);
diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c
index 626e6ef..749dfbd 100644
--- a/arch/powerpc/kvm/book3s_paired_singles.c
+++ b/arch/powerpc/kvm/book3s_paired_singles.c
@@ -173,7 +173,7 @@ static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store)
 	/* Page Fault */
 	dsisr = kvmppc_set_field(0, 33, 33, 1);
 	if (is_store)
-		to_book3s(vcpu)->dsisr = kvmppc_set_field(dsisr, 38, 38, 1);
+		shared->dsisr = kvmppc_set_field(dsisr, 38, 38, 1);
 	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE);
 }
 
-- 
cgit v0.10.2


From 5e030186dfc4e4e47c84d2557b17e4aa06c76f96 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:47:45 +0200
Subject: KVM: PPC: Convert DAR to shared page.

The DAR register contains the address a data page fault occured at. This
register behaves pretty much like a simple data storage register that gets
written to on data faults. There is no hypervisor interaction required on
read or write.

This patch converts all users of the current field to the shared page.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index ba20f90..c852408 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -231,7 +231,6 @@ struct kvm_vcpu_arch {
 	ulong csrr1;
 	ulong dsrr0;
 	ulong dsrr1;
-	ulong dear;
 	ulong esr;
 	u32 dec;
 	u32 decar;
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 9f7565b..ec72a1c 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -23,6 +23,7 @@
 #include <linux/types.h>
 
 struct kvm_vcpu_arch_shared {
+	__u64 dar;
 	__u64 msr;
 	__u32 dsisr;
 };
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index eb401b6..4d46f8b 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -594,14 +594,14 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	if (page_found == -ENOENT) {
 		/* Page not found in guest PTE entries */
-		vcpu->arch.dear = kvmppc_get_fault_dar(vcpu);
+		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
 		vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
 		vcpu->arch.shared->msr |=
 			(to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
 		kvmppc_book3s_queue_irqprio(vcpu, vec);
 	} else if (page_found == -EPERM) {
 		/* Storage protection */
-		vcpu->arch.dear = kvmppc_get_fault_dar(vcpu);
+		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
 		vcpu->arch.shared->dsisr =
 			to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE;
 		vcpu->arch.shared->dsisr |= DSISR_PROTFAULT;
@@ -610,7 +610,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		kvmppc_book3s_queue_irqprio(vcpu, vec);
 	} else if (page_found == -EINVAL) {
 		/* Page not found in guest SLB */
-		vcpu->arch.dear = kvmppc_get_fault_dar(vcpu);
+		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
 		kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80);
 	} else if (!is_mmio &&
 		   kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) {
@@ -867,17 +867,17 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) {
 			r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr);
 		} else {
-			vcpu->arch.dear = dar;
+			vcpu->arch.shared->dar = dar;
 			vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
 			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-			kvmppc_mmu_pte_flush(vcpu, vcpu->arch.dear, ~0xFFFUL);
+			kvmppc_mmu_pte_flush(vcpu, dar, ~0xFFFUL);
 			r = RESUME_GUEST;
 		}
 		break;
 	}
 	case BOOK3S_INTERRUPT_DATA_SEGMENT:
 		if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) {
-			vcpu->arch.dear = kvmppc_get_fault_dar(vcpu);
+			vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
 			kvmppc_book3s_queue_irqprio(vcpu,
 				BOOK3S_INTERRUPT_DATA_SEGMENT);
 		}
@@ -997,7 +997,7 @@ program_interrupt:
 		if (kvmppc_read_inst(vcpu) == EMULATE_DONE) {
 			vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu,
 				kvmppc_get_last_inst(vcpu));
-			vcpu->arch.dear = kvmppc_alignment_dar(vcpu,
+			vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu,
 				kvmppc_get_last_inst(vcpu));
 			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
 		}
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index 9982ff1..c147864 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -212,7 +212,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			r = kvmppc_st(vcpu, &addr, 32, zeros, true);
 			if ((r == -ENOENT) || (r == -EPERM)) {
 				*advance = 0;
-				vcpu->arch.dear = vaddr;
+				vcpu->arch.shared->dar = vaddr;
 				to_svcpu(vcpu)->fault_dar = vaddr;
 
 				dsisr = DSISR_ISSTORE;
@@ -330,7 +330,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 		vcpu->arch.shared->dsisr = spr_val;
 		break;
 	case SPRN_DAR:
-		vcpu->arch.dear = spr_val;
+		vcpu->arch.shared->dar = spr_val;
 		break;
 	case SPRN_HIOR:
 		to_book3s(vcpu)->hior = spr_val;
@@ -443,7 +443,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dsisr);
 		break;
 	case SPRN_DAR:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.dear);
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar);
 		break;
 	case SPRN_HIOR:
 		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hior);
diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c
index 749dfbd..807576f 100644
--- a/arch/powerpc/kvm/book3s_paired_singles.c
+++ b/arch/powerpc/kvm/book3s_paired_singles.c
@@ -169,7 +169,7 @@ static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store)
 
 	shared->msr = kvmppc_set_field(shared->msr, 33, 36, 0);
 	shared->msr = kvmppc_set_field(shared->msr, 42, 47, 0);
-	vcpu->arch.dear = eaddr;
+	shared->dar = eaddr;
 	/* Page Fault */
 	dsisr = kvmppc_set_field(0, 33, 33, 1);
 	if (is_store)
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 4ec9d49..4aab6d2 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -195,7 +195,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 		if (update_esr == true)
 			vcpu->arch.esr = vcpu->arch.queued_esr;
 		if (update_dear == true)
-			vcpu->arch.dear = vcpu->arch.queued_dear;
+			vcpu->arch.shared->dar = vcpu->arch.queued_dear;
 		kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask);
 
 		clear_bit(priority, &vcpu->arch.pending_exceptions);
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index b115203..51ef453 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -105,7 +105,7 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 
 	switch (sprn) {
 	case SPRN_DEAR:
-		vcpu->arch.dear = spr_val; break;
+		vcpu->arch.shared->dar = spr_val; break;
 	case SPRN_ESR:
 		vcpu->arch.esr = spr_val; break;
 	case SPRN_DBCR0:
@@ -200,7 +200,7 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 	case SPRN_IVPR:
 		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivpr); break;
 	case SPRN_DEAR:
-		kvmppc_set_gpr(vcpu, rt, vcpu->arch.dear); break;
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar); break;
 	case SPRN_ESR:
 		kvmppc_set_gpr(vcpu, rt, vcpu->arch.esr); break;
 	case SPRN_DBCR0:
-- 
cgit v0.10.2


From de7906c36ca1e22a3e3600e95c6a4e2c1e4e2e9c Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:47:46 +0200
Subject: KVM: PPC: Convert SRR0 and SRR1 to shared page

The SRR0 and SRR1 registers contain cached values of the PC and MSR
respectively. They get written to by the hypervisor when an interrupt
occurs or directly by the kernel. They are also used to tell the rfi(d)
instruction where to jump to.

Because it only gets touched on defined events that, it's very simple to
share with the guest. Hypervisor and guest both have full r/w access.

This patch converts all users of the current field to the shared page.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index c852408..5255d75 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -225,8 +225,6 @@ struct kvm_vcpu_arch {
 	ulong sprg5;
 	ulong sprg6;
 	ulong sprg7;
-	ulong srr0;
-	ulong srr1;
 	ulong csrr0;
 	ulong csrr1;
 	ulong dsrr0;
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index ec72a1c..d7fc6c2 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -23,6 +23,8 @@
 #include <linux/types.h>
 
 struct kvm_vcpu_arch_shared {
+	__u64 srr0;
+	__u64 srr1;
 	__u64 dar;
 	__u64 msr;
 	__u32 dsisr;
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 4d46f8b..afa0dd4 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -162,8 +162,8 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
 
 void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags)
 {
-	vcpu->arch.srr0 = kvmppc_get_pc(vcpu);
-	vcpu->arch.srr1 = vcpu->arch.shared->msr | flags;
+	vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu);
+	vcpu->arch.shared->srr1 = vcpu->arch.shared->msr | flags;
 	kvmppc_set_pc(vcpu, to_book3s(vcpu)->hior + vec);
 	vcpu->arch.mmu.reset_msr(vcpu);
 }
@@ -1059,8 +1059,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	regs->lr = kvmppc_get_lr(vcpu);
 	regs->xer = kvmppc_get_xer(vcpu);
 	regs->msr = vcpu->arch.shared->msr;
-	regs->srr0 = vcpu->arch.srr0;
-	regs->srr1 = vcpu->arch.srr1;
+	regs->srr0 = vcpu->arch.shared->srr0;
+	regs->srr1 = vcpu->arch.shared->srr1;
 	regs->pid = vcpu->arch.pid;
 	regs->sprg0 = vcpu->arch.sprg0;
 	regs->sprg1 = vcpu->arch.sprg1;
@@ -1086,8 +1086,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	kvmppc_set_lr(vcpu, regs->lr);
 	kvmppc_set_xer(vcpu, regs->xer);
 	kvmppc_set_msr(vcpu, regs->msr);
-	vcpu->arch.srr0 = regs->srr0;
-	vcpu->arch.srr1 = regs->srr1;
+	vcpu->arch.shared->srr0 = regs->srr0;
+	vcpu->arch.shared->srr1 = regs->srr1;
 	vcpu->arch.sprg0 = regs->sprg0;
 	vcpu->arch.sprg1 = regs->sprg1;
 	vcpu->arch.sprg2 = regs->sprg2;
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index c147864..f333cb4 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -73,8 +73,8 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		switch (get_xop(inst)) {
 		case OP_19_XOP_RFID:
 		case OP_19_XOP_RFI:
-			kvmppc_set_pc(vcpu, vcpu->arch.srr0);
-			kvmppc_set_msr(vcpu, vcpu->arch.srr1);
+			kvmppc_set_pc(vcpu, vcpu->arch.shared->srr0);
+			kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1);
 			*advance = 0;
 			break;
 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 4aab6d2..793df28 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -64,7 +64,8 @@ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
 
 	printk("pc:   %08lx msr:  %08llx\n", vcpu->arch.pc, vcpu->arch.shared->msr);
 	printk("lr:   %08lx ctr:  %08lx\n", vcpu->arch.lr, vcpu->arch.ctr);
-	printk("srr0: %08lx srr1: %08lx\n", vcpu->arch.srr0, vcpu->arch.srr1);
+	printk("srr0: %08llx srr1: %08llx\n", vcpu->arch.shared->srr0,
+					    vcpu->arch.shared->srr1);
 
 	printk("exceptions: %08lx\n", vcpu->arch.pending_exceptions);
 
@@ -189,8 +190,8 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	}
 
 	if (allowed) {
-		vcpu->arch.srr0 = vcpu->arch.pc;
-		vcpu->arch.srr1 = vcpu->arch.shared->msr;
+		vcpu->arch.shared->srr0 = vcpu->arch.pc;
+		vcpu->arch.shared->srr1 = vcpu->arch.shared->msr;
 		vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority];
 		if (update_esr == true)
 			vcpu->arch.esr = vcpu->arch.queued_esr;
@@ -491,8 +492,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	regs->lr = vcpu->arch.lr;
 	regs->xer = kvmppc_get_xer(vcpu);
 	regs->msr = vcpu->arch.shared->msr;
-	regs->srr0 = vcpu->arch.srr0;
-	regs->srr1 = vcpu->arch.srr1;
+	regs->srr0 = vcpu->arch.shared->srr0;
+	regs->srr1 = vcpu->arch.shared->srr1;
 	regs->pid = vcpu->arch.pid;
 	regs->sprg0 = vcpu->arch.sprg0;
 	regs->sprg1 = vcpu->arch.sprg1;
@@ -518,8 +519,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	vcpu->arch.lr = regs->lr;
 	kvmppc_set_xer(vcpu, regs->xer);
 	kvmppc_set_msr(vcpu, regs->msr);
-	vcpu->arch.srr0 = regs->srr0;
-	vcpu->arch.srr1 = regs->srr1;
+	vcpu->arch.shared->srr0 = regs->srr0;
+	vcpu->arch.shared->srr1 = regs->srr1;
 	vcpu->arch.sprg0 = regs->sprg0;
 	vcpu->arch.sprg1 = regs->sprg1;
 	vcpu->arch.sprg2 = regs->sprg2;
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 51ef453..1260f5f 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -31,8 +31,8 @@
 
 static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.pc = vcpu->arch.srr0;
-	kvmppc_set_msr(vcpu, vcpu->arch.srr1);
+	vcpu->arch.pc = vcpu->arch.shared->srr0;
+	kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1);
 }
 
 int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 4568ec3..ad0fa4f 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -242,9 +242,11 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 
 			switch (sprn) {
 			case SPRN_SRR0:
-				kvmppc_set_gpr(vcpu, rt, vcpu->arch.srr0); break;
+				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->srr0);
+				break;
 			case SPRN_SRR1:
-				kvmppc_set_gpr(vcpu, rt, vcpu->arch.srr1); break;
+				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->srr1);
+				break;
 			case SPRN_PVR:
 				kvmppc_set_gpr(vcpu, rt, vcpu->arch.pvr); break;
 			case SPRN_PIR:
@@ -320,9 +322,11 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			rs = get_rs(inst);
 			switch (sprn) {
 			case SPRN_SRR0:
-				vcpu->arch.srr0 = kvmppc_get_gpr(vcpu, rs); break;
+				vcpu->arch.shared->srr0 = kvmppc_get_gpr(vcpu, rs);
+				break;
 			case SPRN_SRR1:
-				vcpu->arch.srr1 = kvmppc_get_gpr(vcpu, rs); break;
+				vcpu->arch.shared->srr1 = kvmppc_get_gpr(vcpu, rs);
+				break;
 
 			/* XXX We need to context-switch the timebase for
 			 * watchdog and FIT. */
-- 
cgit v0.10.2


From a73a9599e03eef1324d5aeecaebc1b339d2e1664 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:47:47 +0200
Subject: KVM: PPC: Convert SPRG[0-4] to shared page

When in kernel mode there are 4 additional registers available that are
simple data storage. Instead of exiting to the hypervisor to read and
write those, we can just share them with the guest using the page.

This patch converts all users of the current field to the shared page.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 5255d75..221cf85 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -217,10 +217,6 @@ struct kvm_vcpu_arch {
 	ulong guest_owned_ext;
 #endif
 	u32 mmucr;
-	ulong sprg0;
-	ulong sprg1;
-	ulong sprg2;
-	ulong sprg3;
 	ulong sprg4;
 	ulong sprg5;
 	ulong sprg6;
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index d7fc6c2..e402999 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -23,6 +23,10 @@
 #include <linux/types.h>
 
 struct kvm_vcpu_arch_shared {
+	__u64 sprg0;
+	__u64 sprg1;
+	__u64 sprg2;
+	__u64 sprg3;
 	__u64 srr0;
 	__u64 srr1;
 	__u64 dar;
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index afa0dd4..cfd7fe5 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1062,10 +1062,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	regs->srr0 = vcpu->arch.shared->srr0;
 	regs->srr1 = vcpu->arch.shared->srr1;
 	regs->pid = vcpu->arch.pid;
-	regs->sprg0 = vcpu->arch.sprg0;
-	regs->sprg1 = vcpu->arch.sprg1;
-	regs->sprg2 = vcpu->arch.sprg2;
-	regs->sprg3 = vcpu->arch.sprg3;
+	regs->sprg0 = vcpu->arch.shared->sprg0;
+	regs->sprg1 = vcpu->arch.shared->sprg1;
+	regs->sprg2 = vcpu->arch.shared->sprg2;
+	regs->sprg3 = vcpu->arch.shared->sprg3;
 	regs->sprg5 = vcpu->arch.sprg4;
 	regs->sprg6 = vcpu->arch.sprg5;
 	regs->sprg7 = vcpu->arch.sprg6;
@@ -1088,10 +1088,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	kvmppc_set_msr(vcpu, regs->msr);
 	vcpu->arch.shared->srr0 = regs->srr0;
 	vcpu->arch.shared->srr1 = regs->srr1;
-	vcpu->arch.sprg0 = regs->sprg0;
-	vcpu->arch.sprg1 = regs->sprg1;
-	vcpu->arch.sprg2 = regs->sprg2;
-	vcpu->arch.sprg3 = regs->sprg3;
+	vcpu->arch.shared->sprg0 = regs->sprg0;
+	vcpu->arch.shared->sprg1 = regs->sprg1;
+	vcpu->arch.shared->sprg2 = regs->sprg2;
+	vcpu->arch.shared->sprg3 = regs->sprg3;
 	vcpu->arch.sprg5 = regs->sprg4;
 	vcpu->arch.sprg6 = regs->sprg5;
 	vcpu->arch.sprg7 = regs->sprg6;
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 793df28..b2c8c42 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -495,10 +495,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	regs->srr0 = vcpu->arch.shared->srr0;
 	regs->srr1 = vcpu->arch.shared->srr1;
 	regs->pid = vcpu->arch.pid;
-	regs->sprg0 = vcpu->arch.sprg0;
-	regs->sprg1 = vcpu->arch.sprg1;
-	regs->sprg2 = vcpu->arch.sprg2;
-	regs->sprg3 = vcpu->arch.sprg3;
+	regs->sprg0 = vcpu->arch.shared->sprg0;
+	regs->sprg1 = vcpu->arch.shared->sprg1;
+	regs->sprg2 = vcpu->arch.shared->sprg2;
+	regs->sprg3 = vcpu->arch.shared->sprg3;
 	regs->sprg5 = vcpu->arch.sprg4;
 	regs->sprg6 = vcpu->arch.sprg5;
 	regs->sprg7 = vcpu->arch.sprg6;
@@ -521,10 +521,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	kvmppc_set_msr(vcpu, regs->msr);
 	vcpu->arch.shared->srr0 = regs->srr0;
 	vcpu->arch.shared->srr1 = regs->srr1;
-	vcpu->arch.sprg0 = regs->sprg0;
-	vcpu->arch.sprg1 = regs->sprg1;
-	vcpu->arch.sprg2 = regs->sprg2;
-	vcpu->arch.sprg3 = regs->sprg3;
+	vcpu->arch.shared->sprg0 = regs->sprg0;
+	vcpu->arch.shared->sprg1 = regs->sprg1;
+	vcpu->arch.shared->sprg2 = regs->sprg2;
+	vcpu->arch.shared->sprg3 = regs->sprg3;
 	vcpu->arch.sprg5 = regs->sprg4;
 	vcpu->arch.sprg6 = regs->sprg5;
 	vcpu->arch.sprg7 = regs->sprg6;
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index ad0fa4f..454869b 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -263,13 +263,17 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 				kvmppc_set_gpr(vcpu, rt, get_tb()); break;
 
 			case SPRN_SPRG0:
-				kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg0); break;
+				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg0);
+				break;
 			case SPRN_SPRG1:
-				kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg1); break;
+				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg1);
+				break;
 			case SPRN_SPRG2:
-				kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg2); break;
+				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg2);
+				break;
 			case SPRN_SPRG3:
-				kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg3); break;
+				kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg3);
+				break;
 			/* Note: SPRG4-7 are user-readable, so we don't get
 			 * a trap. */
 
@@ -341,13 +345,17 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 				break;
 
 			case SPRN_SPRG0:
-				vcpu->arch.sprg0 = kvmppc_get_gpr(vcpu, rs); break;
+				vcpu->arch.shared->sprg0 = kvmppc_get_gpr(vcpu, rs);
+				break;
 			case SPRN_SPRG1:
-				vcpu->arch.sprg1 = kvmppc_get_gpr(vcpu, rs); break;
+				vcpu->arch.shared->sprg1 = kvmppc_get_gpr(vcpu, rs);
+				break;
 			case SPRN_SPRG2:
-				vcpu->arch.sprg2 = kvmppc_get_gpr(vcpu, rs); break;
+				vcpu->arch.shared->sprg2 = kvmppc_get_gpr(vcpu, rs);
+				break;
 			case SPRN_SPRG3:
-				vcpu->arch.sprg3 = kvmppc_get_gpr(vcpu, rs); break;
+				vcpu->arch.shared->sprg3 = kvmppc_get_gpr(vcpu, rs);
+				break;
 
 			default:
 				emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, rs);
-- 
cgit v0.10.2


From 2a342ed57756ad5d8af5456959433884367e5ab2 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:47:48 +0200
Subject: KVM: PPC: Implement hypervisor interface

To communicate with KVM directly we need to plumb some sort of interface
between the guest and KVM. Usually those interfaces use hypercalls.

This hypercall implementation is described in the last patch of the series
in a special documentation file. Please read that for further information.

This patch implements stubs to handle KVM PPC hypercalls on the host and
guest side alike.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index e402999..556fd59 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -21,6 +21,7 @@
 #define __POWERPC_KVM_PARA_H__
 
 #include <linux/types.h>
+#include <linux/of.h>
 
 struct kvm_vcpu_arch_shared {
 	__u64 sprg0;
@@ -34,16 +35,127 @@ struct kvm_vcpu_arch_shared {
 	__u32 dsisr;
 };
 
+#define KVM_SC_MAGIC_R0		0x4b564d21 /* "KVM!" */
+#define HC_VENDOR_KVM		(42 << 16)
+#define HC_EV_SUCCESS		0
+#define HC_EV_UNIMPLEMENTED	12
+
 #ifdef __KERNEL__
 
+#ifdef CONFIG_KVM_GUEST
+
+static inline int kvm_para_available(void)
+{
+	struct device_node *hyper_node;
+
+	hyper_node = of_find_node_by_path("/hypervisor");
+	if (!hyper_node)
+		return 0;
+
+	if (!of_device_is_compatible(hyper_node, "linux,kvm"))
+		return 0;
+
+	return 1;
+}
+
+extern unsigned long kvm_hypercall(unsigned long *in,
+				   unsigned long *out,
+				   unsigned long nr);
+
+#else
+
 static inline int kvm_para_available(void)
 {
 	return 0;
 }
 
+static unsigned long kvm_hypercall(unsigned long *in,
+				   unsigned long *out,
+				   unsigned long nr)
+{
+	return HC_EV_UNIMPLEMENTED;
+}
+
+#endif
+
+static inline long kvm_hypercall0_1(unsigned int nr, unsigned long *r2)
+{
+	unsigned long in[8];
+	unsigned long out[8];
+	unsigned long r;
+
+	r = kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+	*r2 = out[0];
+
+	return r;
+}
+
+static inline long kvm_hypercall0(unsigned int nr)
+{
+	unsigned long in[8];
+	unsigned long out[8];
+
+	return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+}
+
+static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
+{
+	unsigned long in[8];
+	unsigned long out[8];
+
+	in[0] = p1;
+	return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+}
+
+static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
+				  unsigned long p2)
+{
+	unsigned long in[8];
+	unsigned long out[8];
+
+	in[0] = p1;
+	in[1] = p2;
+	return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+}
+
+static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
+				  unsigned long p2, unsigned long p3)
+{
+	unsigned long in[8];
+	unsigned long out[8];
+
+	in[0] = p1;
+	in[1] = p2;
+	in[2] = p3;
+	return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+}
+
+static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
+				  unsigned long p2, unsigned long p3,
+				  unsigned long p4)
+{
+	unsigned long in[8];
+	unsigned long out[8];
+
+	in[0] = p1;
+	in[1] = p2;
+	in[2] = p3;
+	in[3] = p4;
+	return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+}
+
+
 static inline unsigned int kvm_arch_para_features(void)
 {
-	return 0;
+	unsigned long r;
+
+	if (!kvm_para_available())
+		return 0;
+
+	if(kvm_hypercall0_1(KVM_HC_FEATURES, &r))
+		return 0;
+
+	return r;
 }
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 18d139e..ecb3bc7 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -107,6 +107,7 @@ extern int kvmppc_booke_init(void);
 extern void kvmppc_booke_exit(void);
 
 extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
+extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
 
 /*
  * Cuts out inst bits with ordering according to spec.
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 1dda701..3a6955d 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -127,6 +127,8 @@ ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC),)
 obj-y				+= ppc_save_regs.o
 endif
 
+obj-$(CONFIG_KVM_GUEST)		+= kvm.o
+
 # Disable GCOV in odd or sensitive code
 GCOV_PROFILE_prom_init.o := n
 GCOV_PROFILE_ftrace.o := n
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
new file mode 100644
index 0000000..4f85505
--- /dev/null
+++ b/arch/powerpc/kernel/kvm.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *     Alexander Graf <agraf@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/init.h>
+#include <linux/kvm_para.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+
+#include <asm/reg.h>
+#include <asm/kvm_ppc.h>
+#include <asm/sections.h>
+#include <asm/cacheflush.h>
+#include <asm/disassemble.h>
+
+unsigned long kvm_hypercall(unsigned long *in,
+			    unsigned long *out,
+			    unsigned long nr)
+{
+	unsigned long register r0 asm("r0");
+	unsigned long register r3 asm("r3") = in[0];
+	unsigned long register r4 asm("r4") = in[1];
+	unsigned long register r5 asm("r5") = in[2];
+	unsigned long register r6 asm("r6") = in[3];
+	unsigned long register r7 asm("r7") = in[4];
+	unsigned long register r8 asm("r8") = in[5];
+	unsigned long register r9 asm("r9") = in[6];
+	unsigned long register r10 asm("r10") = in[7];
+	unsigned long register r11 asm("r11") = nr;
+	unsigned long register r12 asm("r12");
+
+	asm volatile("bl	kvm_hypercall_start"
+		     : "=r"(r0), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6),
+		       "=r"(r7), "=r"(r8), "=r"(r9), "=r"(r10), "=r"(r11),
+		       "=r"(r12)
+		     : "r"(r3), "r"(r4), "r"(r5), "r"(r6), "r"(r7), "r"(r8),
+		       "r"(r9), "r"(r10), "r"(r11)
+		     : "memory", "cc", "xer", "ctr", "lr");
+
+	out[0] = r4;
+	out[1] = r5;
+	out[2] = r6;
+	out[3] = r7;
+	out[4] = r8;
+	out[5] = r9;
+	out[6] = r10;
+	out[7] = r11;
+
+	return r3;
+}
+EXPORT_SYMBOL_GPL(kvm_hypercall);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index cfd7fe5..5cb5f0d 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -947,10 +947,10 @@ program_interrupt:
 		break;
 	}
 	case BOOK3S_INTERRUPT_SYSCALL:
-		// XXX make user settable
 		if (vcpu->arch.osi_enabled &&
 		    (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) &&
 		    (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) {
+			/* MOL hypercalls */
 			u64 *gprs = run->osi.gprs;
 			int i;
 
@@ -959,8 +959,13 @@ program_interrupt:
 				gprs[i] = kvmppc_get_gpr(vcpu, i);
 			vcpu->arch.osi_needed = 1;
 			r = RESUME_HOST_NV;
-
+		} else if (!(vcpu->arch.shared->msr & MSR_PR) &&
+		    (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
+			/* KVM PV hypercalls */
+			kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
+			r = RESUME_GUEST;
 		} else {
+			/* Guest syscalls */
 			vcpu->stat.syscall_exits++;
 			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
 			r = RESUME_GUEST;
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index b2c8c42..13e0747 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -338,7 +338,15 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 
 	case BOOKE_INTERRUPT_SYSCALL:
-		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SYSCALL);
+		if (!(vcpu->arch.shared->msr & MSR_PR) &&
+		    (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
+			/* KVM PV hypercalls */
+			kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
+			r = RESUME_GUEST;
+		} else {
+			/* Guest syscalls */
+			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SYSCALL);
+		}
 		kvmppc_account_exit(vcpu, SYSCALL_EXITS);
 		r = RESUME_GUEST;
 		break;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 22f6fa2..a4cf4b4 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -42,6 +42,38 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 	       !!(v->arch.pending_exceptions);
 }
 
+int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
+{
+	int nr = kvmppc_get_gpr(vcpu, 11);
+	int r;
+	unsigned long __maybe_unused param1 = kvmppc_get_gpr(vcpu, 3);
+	unsigned long __maybe_unused param2 = kvmppc_get_gpr(vcpu, 4);
+	unsigned long __maybe_unused param3 = kvmppc_get_gpr(vcpu, 5);
+	unsigned long __maybe_unused param4 = kvmppc_get_gpr(vcpu, 6);
+	unsigned long r2 = 0;
+
+	if (!(vcpu->arch.shared->msr & MSR_SF)) {
+		/* 32 bit mode */
+		param1 &= 0xffffffff;
+		param2 &= 0xffffffff;
+		param3 &= 0xffffffff;
+		param4 &= 0xffffffff;
+	}
+
+	switch (nr) {
+	case HC_VENDOR_KVM | KVM_HC_FEATURES:
+		r = HC_EV_SUCCESS;
+
+		/* Second return value is in r4 */
+		kvmppc_set_gpr(vcpu, 4, r2);
+		break;
+	default:
+		r = HC_EV_UNIMPLEMENTED;
+		break;
+	}
+
+	return r;
+}
 
 int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index d731092..3b8080e 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -17,6 +17,7 @@
 
 #define KVM_HC_VAPIC_POLL_IRQ		1
 #define KVM_HC_MMU_OP			2
+#define KVM_HC_FEATURES			3
 
 /*
  * hypercalls use architecture specific
-- 
cgit v0.10.2


From 5c6cedf488a1144ac4f683f3ea1a642533d1dcd2 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:47:49 +0200
Subject: KVM: PPC: Add PV guest critical sections

When running in hooked code we need a way to disable interrupts without
clobbering any interrupts or exiting out to the hypervisor.

To achieve this, we have an additional critical field in the shared page. If
that field is equal to the r1 register of the guest, it tells the hypervisor
that we're in such a critical section and thus may not receive any interrupts.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 556fd59..4577e7b 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -24,6 +24,7 @@
 #include <linux/of.h>
 
 struct kvm_vcpu_arch_shared {
+	__u64 critical;		/* Guest may not get interrupts if == r1 */
 	__u64 sprg0;
 	__u64 sprg1;
 	__u64 sprg2;
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 5cb5f0d..d6227ff 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -251,14 +251,28 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
 	int deliver = 1;
 	int vec = 0;
 	ulong flags = 0ULL;
+	ulong crit_raw = vcpu->arch.shared->critical;
+	ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
+	bool crit;
+
+	/* Truncate crit indicators in 32 bit mode */
+	if (!(vcpu->arch.shared->msr & MSR_SF)) {
+		crit_raw &= 0xffffffff;
+		crit_r1 &= 0xffffffff;
+	}
+
+	/* Critical section when crit == r1 */
+	crit = (crit_raw == crit_r1);
+	/* ... and we're in supervisor mode */
+	crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
 
 	switch (priority) {
 	case BOOK3S_IRQPRIO_DECREMENTER:
-		deliver = vcpu->arch.shared->msr & MSR_EE;
+		deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit;
 		vec = BOOK3S_INTERRUPT_DECREMENTER;
 		break;
 	case BOOK3S_IRQPRIO_EXTERNAL:
-		deliver = vcpu->arch.shared->msr & MSR_EE;
+		deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit;
 		vec = BOOK3S_INTERRUPT_EXTERNAL;
 		break;
 	case BOOK3S_IRQPRIO_SYSTEM_RESET:
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 13e0747..104d0ee 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -147,6 +147,20 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	int allowed = 0;
 	ulong uninitialized_var(msr_mask);
 	bool update_esr = false, update_dear = false;
+	ulong crit_raw = vcpu->arch.shared->critical;
+	ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
+	bool crit;
+
+	/* Truncate crit indicators in 32 bit mode */
+	if (!(vcpu->arch.shared->msr & MSR_SF)) {
+		crit_raw &= 0xffffffff;
+		crit_r1 &= 0xffffffff;
+	}
+
+	/* Critical section when crit == r1 */
+	crit = (crit_raw == crit_r1);
+	/* ... and we're in supervisor mode */
+	crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
 
 	switch (priority) {
 	case BOOKE_IRQPRIO_DTLB_MISS:
@@ -181,6 +195,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	case BOOKE_IRQPRIO_DECREMENTER:
 	case BOOKE_IRQPRIO_FIT:
 		allowed = vcpu->arch.shared->msr & MSR_EE;
+		allowed = allowed && !crit;
 		msr_mask = MSR_CE|MSR_ME|MSR_DE;
 		break;
 	case BOOKE_IRQPRIO_DEBUG:
-- 
cgit v0.10.2


From fad93fe1d452960eb838109222cc949eb77f2859 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:47:50 +0200
Subject: KVM: PPC: Add PV guest scratch registers

While running in hooked code we need to store register contents out because
we must not clobber any registers.

So let's add some fields to the shared page we can just happily write to.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 4577e7b..5be00c9 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -24,6 +24,9 @@
 #include <linux/of.h>
 
 struct kvm_vcpu_arch_shared {
+	__u64 scratch1;
+	__u64 scratch2;
+	__u64 scratch3;
 	__u64 critical;		/* Guest may not get interrupts if == r1 */
 	__u64 sprg0;
 	__u64 sprg1;
-- 
cgit v0.10.2


From 90bba358873dc96a6746f0df453a0a8ca3d6b86e Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:47:51 +0200
Subject: KVM: PPC: Tell guest about pending interrupts

When the guest turns on interrupts again, it needs to know if we have an
interrupt pending for it. Because if so, it should rather get out of guest
context and get the interrupt.

So we introduce a new field in the shared page that we use to tell the guest
that there's a pending interrupt lying around.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 5be00c9..0653b0d 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -37,6 +37,7 @@ struct kvm_vcpu_arch_shared {
 	__u64 dar;
 	__u64 msr;
 	__u32 dsisr;
+	__u32 int_pending;	/* Tells the guest if we have an interrupt */
 };
 
 #define KVM_SC_MAGIC_R0		0x4b564d21 /* "KVM!" */
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index d6227ff..06229fe 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -337,6 +337,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
 void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 {
 	unsigned long *pending = &vcpu->arch.pending_exceptions;
+	unsigned long old_pending = vcpu->arch.pending_exceptions;
 	unsigned int priority;
 
 #ifdef EXIT_DEBUG
@@ -356,6 +357,12 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 					 BITS_PER_BYTE * sizeof(*pending),
 					 priority + 1);
 	}
+
+	/* Tell the guest about our interrupt status */
+	if (*pending)
+		vcpu->arch.shared->int_pending = 1;
+	else if (old_pending)
+		vcpu->arch.shared->int_pending = 0;
 }
 
 void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 104d0ee..c604277 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -224,6 +224,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 {
 	unsigned long *pending = &vcpu->arch.pending_exceptions;
+	unsigned long old_pending = vcpu->arch.pending_exceptions;
 	unsigned int priority;
 
 	priority = __ffs(*pending);
@@ -235,6 +236,12 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 		                         BITS_PER_BYTE * sizeof(*pending),
 		                         priority + 1);
 	}
+
+	/* Tell the guest about our interrupt status */
+	if (*pending)
+		vcpu->arch.shared->int_pending = 1;
+	else if (old_pending)
+		vcpu->arch.shared->int_pending = 0;
 }
 
 /**
-- 
cgit v0.10.2


From 28e83b4fa7f8bd114940fa933ac8cbe80969eba2 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:47:52 +0200
Subject: KVM: PPC: Make PAM a define

On PowerPC it's very normal to not support all of the physical RAM in real mode.
To check if we're matching on the shared page or not, we need to know the limits
so we can restrain ourselves to that range.

So let's make it a define instead of open-coding it. And while at it, let's also
increase it.

Signed-off-by: Alexander Graf <agraf@suse.de>

v2 -> v3:

  - RMO -> PAM (non-magic page)
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 221cf85..1674da8 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -48,6 +48,9 @@
 #define HPTEG_HASH_NUM_VPTE		(1 << HPTEG_HASH_BITS_VPTE)
 #define HPTEG_HASH_NUM_VPTE_LONG	(1 << HPTEG_HASH_BITS_VPTE_LONG)
 
+/* Physical Address Mask - allowed range of real mode RAM access */
+#define KVM_PAM			0x0fffffffffffffffULL
+
 struct kvm;
 struct kvm_run;
 struct kvm_vcpu;
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 06229fe..0ed5376 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -465,7 +465,7 @@ static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data,
 		r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data);
 	} else {
 		pte->eaddr = eaddr;
-		pte->raddr = eaddr & 0xffffffff;
+		pte->raddr = eaddr & KVM_PAM;
 		pte->vpage = VSID_REAL | eaddr >> 12;
 		pte->may_read = true;
 		pte->may_write = true;
@@ -579,7 +579,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		pte.may_execute = true;
 		pte.may_read = true;
 		pte.may_write = true;
-		pte.raddr = eaddr & 0xffffffff;
+		pte.raddr = eaddr & KVM_PAM;
 		pte.eaddr = eaddr;
 		pte.vpage = eaddr >> 12;
 	}
-- 
cgit v0.10.2


From beb03f14da9ceff76ff08cbb8af064b52dc21f7e Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:47:53 +0200
Subject: KVM: PPC: First magic page steps

We will be introducing a method to project the shared page in guest context.
As soon as we're talking about this coupling, the shared page is colled magic
page.

This patch introduces simple defines, so the follow-up patches are easier to
read.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 1674da8..e1da775 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -287,6 +287,8 @@ struct kvm_vcpu_arch {
 	u64 dec_jiffies;
 	unsigned long pending_exceptions;
 	struct kvm_vcpu_arch_shared *shared;
+	unsigned long magic_page_pa; /* phys addr to map the magic page to */
+	unsigned long magic_page_ea; /* effect. addr to map the magic page to */
 
 #ifdef CONFIG_PPC_BOOK3S
 	struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index 3b8080e..ac2015a 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -18,6 +18,7 @@
 #define KVM_HC_VAPIC_POLL_IRQ		1
 #define KVM_HC_MMU_OP			2
 #define KVM_HC_FEATURES			3
+#define KVM_HC_PPC_MAP_MAGIC_PAGE	4
 
 /*
  * hypercalls use architecture specific
-- 
cgit v0.10.2


From e8508940a88691ad3d1c46608cd968eb4be9cbc5 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:47:54 +0200
Subject: KVM: PPC: Magic Page Book3s support

We need to override EA as well as PA lookups for the magic page. When the guest
tells us to project it, the magic page overrides any guest mappings.

In order to reflect that, we need to hook into all the MMU layers of KVM to
force map the magic page if necessary.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index b5b1961..00cf8b0 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -130,6 +130,7 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
 			   bool upper, u32 val);
 extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
 extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
+extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 
 extern u32 kvmppc_trampoline_lowmem;
 extern u32 kvmppc_trampoline_enter;
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 0ed5376..eee97b5 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -419,6 +419,25 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
 	}
 }
 
+pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	ulong mp_pa = vcpu->arch.magic_page_pa;
+
+	/* Magic page override */
+	if (unlikely(mp_pa) &&
+	    unlikely(((gfn << PAGE_SHIFT) & KVM_PAM) ==
+		     ((mp_pa & PAGE_MASK) & KVM_PAM))) {
+		ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
+		pfn_t pfn;
+
+		pfn = (pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT;
+		get_page(pfn_to_page(pfn));
+		return pfn;
+	}
+
+	return gfn_to_pfn(vcpu->kvm, gfn);
+}
+
 /* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To
  * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to
  * emulate 32 bytes dcbz length.
@@ -554,6 +573,13 @@ mmio:
 
 static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
+	ulong mp_pa = vcpu->arch.magic_page_pa;
+
+	if (unlikely(mp_pa) &&
+	    unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) {
+		return 1;
+	}
+
 	return kvm_is_visible_gfn(vcpu->kvm, gfn);
 }
 
@@ -1257,6 +1283,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	struct kvmppc_vcpu_book3s *vcpu_book3s;
 	struct kvm_vcpu *vcpu;
 	int err = -ENOMEM;
+	unsigned long p;
 
 	vcpu_book3s = vmalloc(sizeof(struct kvmppc_vcpu_book3s));
 	if (!vcpu_book3s)
@@ -1274,8 +1301,10 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	if (err)
 		goto free_shadow_vcpu;
 
-	vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO);
-	if (!vcpu->arch.shared)
+	p = __get_free_page(GFP_KERNEL|__GFP_ZERO);
+	/* the real shared page fills the last 4k of our page */
+	vcpu->arch.shared = (void*)(p + PAGE_SIZE - 4096);
+	if (!p)
 		goto uninit_vcpu;
 
 	vcpu->arch.host_retip = kvm_return_point;
@@ -1322,7 +1351,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
 
-	free_page((unsigned long)vcpu->arch.shared);
+	free_page((unsigned long)vcpu->arch.shared & PAGE_MASK);
 	kvm_vcpu_uninit(vcpu);
 	kfree(vcpu_book3s->shadow_vcpu);
 	vfree(vcpu_book3s);
diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
index 449bce5..a7d121a 100644
--- a/arch/powerpc/kvm/book3s_32_mmu.c
+++ b/arch/powerpc/kvm/book3s_32_mmu.c
@@ -281,8 +281,24 @@ static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 				      struct kvmppc_pte *pte, bool data)
 {
 	int r;
+	ulong mp_ea = vcpu->arch.magic_page_ea;
 
 	pte->eaddr = eaddr;
+
+	/* Magic page override */
+	if (unlikely(mp_ea) &&
+	    unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) &&
+	    !(vcpu->arch.shared->msr & MSR_PR)) {
+		pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data);
+		pte->raddr = vcpu->arch.magic_page_pa | (pte->raddr & 0xfff);
+		pte->raddr &= KVM_PAM;
+		pte->may_execute = true;
+		pte->may_read = true;
+		pte->may_write = true;
+
+		return 0;
+	}
+
 	r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data);
 	if (r < 0)
 	       r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, data, true);
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index 67b8c38..05e8c9e 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -147,7 +147,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
 	struct hpte_cache *pte;
 
 	/* Get host physical address for gpa */
-	hpaddr = gfn_to_pfn(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT);
+	hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
 	if (kvm_is_error_hva(hpaddr)) {
 		printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n",
 				 orig_pte->eaddr);
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index 58aa840..d7889ef 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -163,6 +163,22 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	bool found = false;
 	bool perm_err = false;
 	int second = 0;
+	ulong mp_ea = vcpu->arch.magic_page_ea;
+
+	/* Magic page override */
+	if (unlikely(mp_ea) &&
+	    unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) &&
+	    !(vcpu->arch.shared->msr & MSR_PR)) {
+		gpte->eaddr = eaddr;
+		gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data);
+		gpte->raddr = vcpu->arch.magic_page_pa | (gpte->raddr & 0xfff);
+		gpte->raddr &= KVM_PAM;
+		gpte->may_execute = true;
+		gpte->may_read = true;
+		gpte->may_write = true;
+
+		return 0;
+	}
 
 	slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, eaddr);
 	if (!slbe)
@@ -445,6 +461,7 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 	ulong ea = esid << SID_SHIFT;
 	struct kvmppc_slb *slb;
 	u64 gvsid = esid;
+	ulong mp_ea = vcpu->arch.magic_page_ea;
 
 	if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
 		slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), ea);
@@ -464,7 +481,7 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 		break;
 	case MSR_DR|MSR_IR:
 		if (!slb)
-			return -ENOENT;
+			goto no_slb;
 
 		*vsid = gvsid;
 		break;
@@ -477,6 +494,17 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 		*vsid |= VSID_PR;
 
 	return 0;
+
+no_slb:
+	/* Catch magic page case */
+	if (unlikely(mp_ea) &&
+	    unlikely(esid == (mp_ea >> SID_SHIFT)) &&
+	    !(vcpu->arch.shared->msr & MSR_PR)) {
+		*vsid = VSID_REAL | esid;
+		return 0;
+	}
+
+	return -EINVAL;
 }
 
 static bool kvmppc_mmu_book3s_64_is_dcbz32(struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 71c1f90..6cdd19a 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -101,18 +101,13 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
 	struct kvmppc_sid_map *map;
 
 	/* Get host physical address for gpa */
-	hpaddr = gfn_to_pfn(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT);
+	hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
 	if (kvm_is_error_hva(hpaddr)) {
 		printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr);
 		return -EINVAL;
 	}
 	hpaddr <<= PAGE_SHIFT;
-#if PAGE_SHIFT == 12
-#elif PAGE_SHIFT == 16
-	hpaddr |= orig_pte->raddr & 0xf000;
-#else
-#error Unknown page size
-#endif
+	hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK);
 
 	/* and write the mapping ea -> hpa into the pt */
 	vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid);
-- 
cgit v0.10.2


From 5fc87407b55f5799418f4dc5931232c2bc06d077 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:47:55 +0200
Subject: KVM: PPC: Expose magic page support to guest

Now that we have the shared page in place and the MMU code knows about
the magic page, we can expose that capability to the guest!

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 0653b0d..7438ab3 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -45,6 +45,8 @@ struct kvm_vcpu_arch_shared {
 #define HC_EV_SUCCESS		0
 #define HC_EV_UNIMPLEMENTED	12
 
+#define KVM_FEATURE_MAGIC_PAGE	1
+
 #ifdef __KERNEL__
 
 #ifdef CONFIG_KVM_GUEST
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index a4cf4b4..fecfe04 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -61,8 +61,19 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 	}
 
 	switch (nr) {
+	case HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE:
+	{
+		vcpu->arch.magic_page_pa = param1;
+		vcpu->arch.magic_page_ea = param2;
+
+		r = HC_EV_SUCCESS;
+		break;
+	}
 	case HC_VENDOR_KVM | KVM_HC_FEATURES:
 		r = HC_EV_SUCCESS;
+#if defined(CONFIG_PPC_BOOK3S) /* XXX Missing magic page on BookE */
+		r2 |= (1 << KVM_FEATURE_MAGIC_PAGE);
+#endif
 
 		/* Second return value is in r4 */
 		kvmppc_set_gpr(vcpu, 4, r2);
-- 
cgit v0.10.2


From ba492962363a02c45836be205f339be48093e1be Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:47:56 +0200
Subject: KVM: Move kvm_guest_init out of generic code

Currently x86 is the only architecture that uses kvm_guest_init(). With
PowerPC we're getting a second user, but the signature is different there
and we don't need to export it, as it uses the normal kernel init framework.

So let's move the x86 specific definition of that function over to the x86
specfic header file.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 05eba5e..7b562b6 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -158,6 +158,12 @@ static inline unsigned int kvm_arch_para_features(void)
 	return cpuid_eax(KVM_CPUID_FEATURES);
 }
 
+#ifdef CONFIG_KVM_GUEST
+void __init kvm_guest_init(void);
+#else
+#define kvm_guest_init() do { } while (0)
 #endif
 
+#endif /* __KERNEL__ */
+
 #endif /* _ASM_X86_KVM_PARA_H */
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index ac2015a..47a070b 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -26,11 +26,6 @@
 #include <asm/kvm_para.h>
 
 #ifdef __KERNEL__
-#ifdef CONFIG_KVM_GUEST
-void __init kvm_guest_init(void);
-#else
-#define kvm_guest_init() do { } while (0)
-#endif
 
 static inline int kvm_para_has_feature(unsigned int feature)
 {
-- 
cgit v0.10.2


From d17051cb8d223dffd6bb847b0565ef1654f8e0e1 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:47:57 +0200
Subject: KVM: PPC: Generic KVM PV guest support

We have all the hypervisor pieces in place now, but the guest parts are still
missing.

This patch implements basic awareness of KVM when running Linux as guest. It
doesn't do anything with it yet though.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 3a6955d..be257b0 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -127,7 +127,7 @@ ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC),)
 obj-y				+= ppc_save_regs.o
 endif
 
-obj-$(CONFIG_KVM_GUEST)		+= kvm.o
+obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvm_emul.o
 
 # Disable GCOV in odd or sensitive code
 GCOV_PROFILE_prom_init.o := n
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 1221bcd..37486ca 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -465,6 +465,21 @@ int main(void)
 	DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr));
 #endif /* CONFIG_PPC_BOOK3S */
 #endif
+
+#ifdef CONFIG_KVM_GUEST
+	DEFINE(KVM_MAGIC_SCRATCH1, offsetof(struct kvm_vcpu_arch_shared,
+					    scratch1));
+	DEFINE(KVM_MAGIC_SCRATCH2, offsetof(struct kvm_vcpu_arch_shared,
+					    scratch2));
+	DEFINE(KVM_MAGIC_SCRATCH3, offsetof(struct kvm_vcpu_arch_shared,
+					    scratch3));
+	DEFINE(KVM_MAGIC_INT, offsetof(struct kvm_vcpu_arch_shared,
+				       int_pending));
+	DEFINE(KVM_MAGIC_MSR, offsetof(struct kvm_vcpu_arch_shared, msr));
+	DEFINE(KVM_MAGIC_CRITICAL, offsetof(struct kvm_vcpu_arch_shared,
+					    critical));
+#endif
+
 #ifdef CONFIG_44x
 	DEFINE(PGD_T_LOG2, PGD_T_LOG2);
 	DEFINE(PTE_T_LOG2, PTE_T_LOG2);
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 4f85505..a5ece71 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -30,6 +30,9 @@
 #include <asm/cacheflush.h>
 #include <asm/disassemble.h>
 
+#define KVM_MAGIC_PAGE		(-4096L)
+#define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x)
+
 unsigned long kvm_hypercall(unsigned long *in,
 			    unsigned long *out,
 			    unsigned long nr)
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
new file mode 100644
index 0000000..5cfa2ae
--- /dev/null
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -0,0 +1,36 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2010
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+
+/* Hypercall entry point. Will be patched with device tree instructions. */
+
+.global kvm_hypercall_start
+kvm_hypercall_start:
+	li	r3, -1
+	nop
+	nop
+	nop
+	blr
+
+#define KVM_MAGIC_PAGE		(-4096)
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index 81c9208..956154f 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -21,6 +21,16 @@ source "arch/powerpc/platforms/44x/Kconfig"
 source "arch/powerpc/platforms/40x/Kconfig"
 source "arch/powerpc/platforms/amigaone/Kconfig"
 
+config KVM_GUEST
+	bool "KVM Guest support"
+	default y
+	---help---
+	  This option enables various optimizations for running under the KVM
+	  hypervisor. Overhead for the kernel when not running inside KVM should
+	  be minimal.
+
+	  In case of doubt, say Y
+
 config PPC_NATIVE
 	bool
 	depends on 6xx || PPC64
-- 
cgit v0.10.2


From 73a18109829e7696226a9fd4062d339e7c6ee130 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:47:58 +0200
Subject: KVM: PPC: KVM PV guest stubs

We will soon start and replace instructions from the text section with
other, paravirtualized versions. To ease the readability of those patches
I split out the generic looping and magic page mapping code out.

This patch still only contains stubs. But at least it loops through the
text section :).

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index a5ece71..e93366f 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -33,6 +33,62 @@
 #define KVM_MAGIC_PAGE		(-4096L)
 #define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x)
 
+#define KVM_MASK_RT		0x03e00000
+
+static bool kvm_patching_worked = true;
+
+static inline void kvm_patch_ins(u32 *inst, u32 new_inst)
+{
+	*inst = new_inst;
+	flush_icache_range((ulong)inst, (ulong)inst + 4);
+}
+
+static void kvm_map_magic_page(void *data)
+{
+	kvm_hypercall2(KVM_HC_PPC_MAP_MAGIC_PAGE,
+		       KVM_MAGIC_PAGE,  /* Physical Address */
+		       KVM_MAGIC_PAGE); /* Effective Address */
+}
+
+static void kvm_check_ins(u32 *inst)
+{
+	u32 _inst = *inst;
+	u32 inst_no_rt = _inst & ~KVM_MASK_RT;
+	u32 inst_rt = _inst & KVM_MASK_RT;
+
+	switch (inst_no_rt) {
+	}
+
+	switch (_inst) {
+	}
+}
+
+static void kvm_use_magic_page(void)
+{
+	u32 *p;
+	u32 *start, *end;
+	u32 tmp;
+
+	/* Tell the host to map the magic page to -4096 on all CPUs */
+	on_each_cpu(kvm_map_magic_page, NULL, 1);
+
+	/* Quick self-test to see if the mapping works */
+	if (__get_user(tmp, (u32*)KVM_MAGIC_PAGE)) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	/* Now loop through all code and find instructions */
+	start = (void*)_stext;
+	end = (void*)_etext;
+
+	for (p = start; p < end; p++)
+		kvm_check_ins(p);
+
+	printk(KERN_INFO "KVM: Live patching for a fast VM %s\n",
+			 kvm_patching_worked ? "worked" : "failed");
+}
+
 unsigned long kvm_hypercall(unsigned long *in,
 			    unsigned long *out,
 			    unsigned long nr)
@@ -69,3 +125,42 @@ unsigned long kvm_hypercall(unsigned long *in,
 	return r3;
 }
 EXPORT_SYMBOL_GPL(kvm_hypercall);
+
+static int kvm_para_setup(void)
+{
+	extern u32 kvm_hypercall_start;
+	struct device_node *hyper_node;
+	u32 *insts;
+	int len, i;
+
+	hyper_node = of_find_node_by_path("/hypervisor");
+	if (!hyper_node)
+		return -1;
+
+	insts = (u32*)of_get_property(hyper_node, "hcall-instructions", &len);
+	if (len % 4)
+		return -1;
+	if (len > (4 * 4))
+		return -1;
+
+	for (i = 0; i < (len / 4); i++)
+		kvm_patch_ins(&(&kvm_hypercall_start)[i], insts[i]);
+
+	return 0;
+}
+
+static int __init kvm_guest_init(void)
+{
+	if (!kvm_para_available())
+		return 0;
+
+	if (kvm_para_setup())
+		return 0;
+
+	if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE))
+		kvm_use_magic_page();
+
+	return 0;
+}
+
+postcore_initcall(kvm_guest_init);
-- 
cgit v0.10.2


From d1293c927568f5b5b8dd3fa263a98683cf8556dc Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:47:59 +0200
Subject: KVM: PPC: PV instructions to loads and stores

Some instructions can simply be replaced by load and store instructions to
or from the magic page.

This patch replaces often called instructions that fall into the above category.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index e93366f..9ec572c 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -33,7 +33,34 @@
 #define KVM_MAGIC_PAGE		(-4096L)
 #define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x)
 
+#define KVM_INST_LWZ		0x80000000
+#define KVM_INST_STW		0x90000000
+#define KVM_INST_LD		0xe8000000
+#define KVM_INST_STD		0xf8000000
+#define KVM_INST_NOP		0x60000000
+#define KVM_INST_B		0x48000000
+#define KVM_INST_B_MASK		0x03ffffff
+#define KVM_INST_B_MAX		0x01ffffff
+
 #define KVM_MASK_RT		0x03e00000
+#define KVM_INST_MFMSR		0x7c0000a6
+#define KVM_INST_MFSPR_SPRG0	0x7c1042a6
+#define KVM_INST_MFSPR_SPRG1	0x7c1142a6
+#define KVM_INST_MFSPR_SPRG2	0x7c1242a6
+#define KVM_INST_MFSPR_SPRG3	0x7c1342a6
+#define KVM_INST_MFSPR_SRR0	0x7c1a02a6
+#define KVM_INST_MFSPR_SRR1	0x7c1b02a6
+#define KVM_INST_MFSPR_DAR	0x7c1302a6
+#define KVM_INST_MFSPR_DSISR	0x7c1202a6
+
+#define KVM_INST_MTSPR_SPRG0	0x7c1043a6
+#define KVM_INST_MTSPR_SPRG1	0x7c1143a6
+#define KVM_INST_MTSPR_SPRG2	0x7c1243a6
+#define KVM_INST_MTSPR_SPRG3	0x7c1343a6
+#define KVM_INST_MTSPR_SRR0	0x7c1a03a6
+#define KVM_INST_MTSPR_SRR1	0x7c1b03a6
+#define KVM_INST_MTSPR_DAR	0x7c1303a6
+#define KVM_INST_MTSPR_DSISR	0x7c1203a6
 
 static bool kvm_patching_worked = true;
 
@@ -43,6 +70,34 @@ static inline void kvm_patch_ins(u32 *inst, u32 new_inst)
 	flush_icache_range((ulong)inst, (ulong)inst + 4);
 }
 
+static void kvm_patch_ins_ld(u32 *inst, long addr, u32 rt)
+{
+#ifdef CONFIG_64BIT
+	kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc));
+#else
+	kvm_patch_ins(inst, KVM_INST_LWZ | rt | ((addr + 4) & 0x0000fffc));
+#endif
+}
+
+static void kvm_patch_ins_lwz(u32 *inst, long addr, u32 rt)
+{
+	kvm_patch_ins(inst, KVM_INST_LWZ | rt | (addr & 0x0000ffff));
+}
+
+static void kvm_patch_ins_std(u32 *inst, long addr, u32 rt)
+{
+#ifdef CONFIG_64BIT
+	kvm_patch_ins(inst, KVM_INST_STD | rt | (addr & 0x0000fffc));
+#else
+	kvm_patch_ins(inst, KVM_INST_STW | rt | ((addr + 4) & 0x0000fffc));
+#endif
+}
+
+static void kvm_patch_ins_stw(u32 *inst, long addr, u32 rt)
+{
+	kvm_patch_ins(inst, KVM_INST_STW | rt | (addr & 0x0000fffc));
+}
+
 static void kvm_map_magic_page(void *data)
 {
 	kvm_hypercall2(KVM_HC_PPC_MAP_MAGIC_PAGE,
@@ -57,6 +112,60 @@ static void kvm_check_ins(u32 *inst)
 	u32 inst_rt = _inst & KVM_MASK_RT;
 
 	switch (inst_no_rt) {
+	/* Loads */
+	case KVM_INST_MFMSR:
+		kvm_patch_ins_ld(inst, magic_var(msr), inst_rt);
+		break;
+	case KVM_INST_MFSPR_SPRG0:
+		kvm_patch_ins_ld(inst, magic_var(sprg0), inst_rt);
+		break;
+	case KVM_INST_MFSPR_SPRG1:
+		kvm_patch_ins_ld(inst, magic_var(sprg1), inst_rt);
+		break;
+	case KVM_INST_MFSPR_SPRG2:
+		kvm_patch_ins_ld(inst, magic_var(sprg2), inst_rt);
+		break;
+	case KVM_INST_MFSPR_SPRG3:
+		kvm_patch_ins_ld(inst, magic_var(sprg3), inst_rt);
+		break;
+	case KVM_INST_MFSPR_SRR0:
+		kvm_patch_ins_ld(inst, magic_var(srr0), inst_rt);
+		break;
+	case KVM_INST_MFSPR_SRR1:
+		kvm_patch_ins_ld(inst, magic_var(srr1), inst_rt);
+		break;
+	case KVM_INST_MFSPR_DAR:
+		kvm_patch_ins_ld(inst, magic_var(dar), inst_rt);
+		break;
+	case KVM_INST_MFSPR_DSISR:
+		kvm_patch_ins_lwz(inst, magic_var(dsisr), inst_rt);
+		break;
+
+	/* Stores */
+	case KVM_INST_MTSPR_SPRG0:
+		kvm_patch_ins_std(inst, magic_var(sprg0), inst_rt);
+		break;
+	case KVM_INST_MTSPR_SPRG1:
+		kvm_patch_ins_std(inst, magic_var(sprg1), inst_rt);
+		break;
+	case KVM_INST_MTSPR_SPRG2:
+		kvm_patch_ins_std(inst, magic_var(sprg2), inst_rt);
+		break;
+	case KVM_INST_MTSPR_SPRG3:
+		kvm_patch_ins_std(inst, magic_var(sprg3), inst_rt);
+		break;
+	case KVM_INST_MTSPR_SRR0:
+		kvm_patch_ins_std(inst, magic_var(srr0), inst_rt);
+		break;
+	case KVM_INST_MTSPR_SRR1:
+		kvm_patch_ins_std(inst, magic_var(srr1), inst_rt);
+		break;
+	case KVM_INST_MTSPR_DAR:
+		kvm_patch_ins_std(inst, magic_var(dar), inst_rt);
+		break;
+	case KVM_INST_MTSPR_DSISR:
+		kvm_patch_ins_stw(inst, magic_var(dsisr), inst_rt);
+		break;
 	}
 
 	switch (_inst) {
-- 
cgit v0.10.2


From d1290b15e7f139e24150cc6e6d8e904214359e8a Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:48:00 +0200
Subject: KVM: PPC: PV tlbsync to nop

With our current MMU scheme we don't need to know about the tlbsync instruction.
So we can just nop it out.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 9ec572c..3258922 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -62,6 +62,8 @@
 #define KVM_INST_MTSPR_DAR	0x7c1303a6
 #define KVM_INST_MTSPR_DSISR	0x7c1203a6
 
+#define KVM_INST_TLBSYNC	0x7c00046c
+
 static bool kvm_patching_worked = true;
 
 static inline void kvm_patch_ins(u32 *inst, u32 new_inst)
@@ -98,6 +100,11 @@ static void kvm_patch_ins_stw(u32 *inst, long addr, u32 rt)
 	kvm_patch_ins(inst, KVM_INST_STW | rt | (addr & 0x0000fffc));
 }
 
+static void kvm_patch_ins_nop(u32 *inst)
+{
+	kvm_patch_ins(inst, KVM_INST_NOP);
+}
+
 static void kvm_map_magic_page(void *data)
 {
 	kvm_hypercall2(KVM_HC_PPC_MAP_MAGIC_PAGE,
@@ -166,6 +173,11 @@ static void kvm_check_ins(u32 *inst)
 	case KVM_INST_MTSPR_DSISR:
 		kvm_patch_ins_stw(inst, magic_var(dsisr), inst_rt);
 		break;
+
+	/* Nops */
+	case KVM_INST_TLBSYNC:
+		kvm_patch_ins_nop(inst);
+		break;
 	}
 
 	switch (_inst) {
-- 
cgit v0.10.2


From 2d4f567103ff5a931e773f2e356b4eb303115deb Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:48:01 +0200
Subject: KVM: PPC: Introduce kvm_tmp framework

We will soon require more sophisticated methods to replace single instructions
with multiple instructions. We do that by branching to a memory region where we
write replacement code for the instruction to.

This region needs to be within 32 MB of the patched instruction though, because
that's the furthest we can jump with immediate branches.

So we keep 1MB of free space around in bss. After we're done initing we can just
tell the mm system that the unused pages are free, but until then we have enough
space to fit all our code in.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 3258922..926f93f 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -65,6 +65,8 @@
 #define KVM_INST_TLBSYNC	0x7c00046c
 
 static bool kvm_patching_worked = true;
+static char kvm_tmp[1024 * 1024];
+static int kvm_tmp_index;
 
 static inline void kvm_patch_ins(u32 *inst, u32 new_inst)
 {
@@ -105,6 +107,23 @@ static void kvm_patch_ins_nop(u32 *inst)
 	kvm_patch_ins(inst, KVM_INST_NOP);
 }
 
+static u32 *kvm_alloc(int len)
+{
+	u32 *p;
+
+	if ((kvm_tmp_index + len) > ARRAY_SIZE(kvm_tmp)) {
+		printk(KERN_ERR "KVM: No more space (%d + %d)\n",
+				kvm_tmp_index, len);
+		kvm_patching_worked = false;
+		return NULL;
+	}
+
+	p = (void*)&kvm_tmp[kvm_tmp_index];
+	kvm_tmp_index += len;
+
+	return p;
+}
+
 static void kvm_map_magic_page(void *data)
 {
 	kvm_hypercall2(KVM_HC_PPC_MAP_MAGIC_PAGE,
@@ -270,17 +289,36 @@ static int kvm_para_setup(void)
 	return 0;
 }
 
+static __init void kvm_free_tmp(void)
+{
+	unsigned long start, end;
+
+	start = (ulong)&kvm_tmp[kvm_tmp_index + (PAGE_SIZE - 1)] & PAGE_MASK;
+	end = (ulong)&kvm_tmp[ARRAY_SIZE(kvm_tmp)] & PAGE_MASK;
+
+	/* Free the tmp space we don't need */
+	for (; start < end; start += PAGE_SIZE) {
+		ClearPageReserved(virt_to_page(start));
+		init_page_count(virt_to_page(start));
+		free_page(start);
+		totalram_pages++;
+	}
+}
+
 static int __init kvm_guest_init(void)
 {
 	if (!kvm_para_available())
-		return 0;
+		goto free_tmp;
 
 	if (kvm_para_setup())
-		return 0;
+		goto free_tmp;
 
 	if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE))
 		kvm_use_magic_page();
 
+free_tmp:
+	kvm_free_tmp();
+
 	return 0;
 }
 
-- 
cgit v0.10.2


From 71ee8e34fe26252b11668a95708783ec9c58cbda Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:48:02 +0200
Subject: KVM: PPC: Introduce branch patching helper

We will need to patch several instruction streams over to a different
code path, so we need a way to patch a single instruction with a branch
somewhere else.

This patch adds a helper to facilitate this patching.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 926f93f..239a70d 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -107,6 +107,20 @@ static void kvm_patch_ins_nop(u32 *inst)
 	kvm_patch_ins(inst, KVM_INST_NOP);
 }
 
+static void kvm_patch_ins_b(u32 *inst, int addr)
+{
+#ifdef CONFIG_RELOCATABLE
+	/* On relocatable kernels interrupts handlers and our code
+	   can be in different regions, so we don't patch them */
+
+	extern u32 __end_interrupts;
+	if ((ulong)inst < (ulong)&__end_interrupts)
+		return;
+#endif
+
+	kvm_patch_ins(inst, KVM_INST_B | (addr & KVM_INST_B_MASK));
+}
+
 static u32 *kvm_alloc(int len)
 {
 	u32 *p;
-- 
cgit v0.10.2


From 92234722ed631f472f1c4d79d35d8e5cf6910002 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:48:03 +0200
Subject: KVM: PPC: PV assembler helpers

When we hook an instruction we need to make sure we don't clobber any of
the registers at that point. So we write them out to scratch space in the
magic page. To make sure we don't fall into a race with another piece of
hooked code, we need to disable interrupts.

To make the later patches and code in general easier readable, let's introduce
a set of defines that save and restore r30, r31 and cr. Let's also define some
helpers to read the lower 32 bits of a 64 bit field on 32 bit systems.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
index 5cfa2ae..1dac72d 100644
--- a/arch/powerpc/kernel/kvm_emul.S
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -34,3 +34,33 @@ kvm_hypercall_start:
 	blr
 
 #define KVM_MAGIC_PAGE		(-4096)
+
+#ifdef CONFIG_64BIT
+#define LL64(reg, offs, reg2)	ld	reg, (offs)(reg2)
+#define STL64(reg, offs, reg2)	std	reg, (offs)(reg2)
+#else
+#define LL64(reg, offs, reg2)	lwz	reg, (offs + 4)(reg2)
+#define STL64(reg, offs, reg2)	stw	reg, (offs + 4)(reg2)
+#endif
+
+#define SCRATCH_SAVE							\
+	/* Enable critical section. We are critical if			\
+	   shared->critical == r1 */					\
+	STL64(r1, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0);		\
+									\
+	/* Save state */						\
+	PPC_STL	r31, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH1)(0);		\
+	PPC_STL	r30, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH2)(0);		\
+	mfcr	r31;							\
+	stw	r31, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH3)(0);
+
+#define SCRATCH_RESTORE							\
+	/* Restore state */						\
+	PPC_LL	r31, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH1)(0);		\
+	lwz	r30, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH3)(0);		\
+	mtcr	r30;							\
+	PPC_LL	r30, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH2)(0);		\
+									\
+	/* Disable critical section. We are critical if			\
+	   shared->critical == r1 and r2 is always != r1 */		\
+	STL64(r2, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0);
-- 
cgit v0.10.2


From 819a63dc792b0888edd3eda306a9e1e049dcbb1c Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:48:04 +0200
Subject: KVM: PPC: PV mtmsrd L=1

The PowerPC ISA has a special instruction for mtmsr that only changes the EE
and RI bits, namely the L=1 form.

Since that one is reasonably often occuring and simple to implement, let's
go with this first. Writing EE=0 is always just a store. Doing EE=1 also
requires us to check for pending interrupts and if necessary exit back to the
hypervisor.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 239a70d..717ab0d 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -63,6 +63,7 @@
 #define KVM_INST_MTSPR_DSISR	0x7c1203a6
 
 #define KVM_INST_TLBSYNC	0x7c00046c
+#define KVM_INST_MTMSRD_L1	0x7c010164
 
 static bool kvm_patching_worked = true;
 static char kvm_tmp[1024 * 1024];
@@ -138,6 +139,43 @@ static u32 *kvm_alloc(int len)
 	return p;
 }
 
+extern u32 kvm_emulate_mtmsrd_branch_offs;
+extern u32 kvm_emulate_mtmsrd_reg_offs;
+extern u32 kvm_emulate_mtmsrd_len;
+extern u32 kvm_emulate_mtmsrd[];
+
+static void kvm_patch_ins_mtmsrd(u32 *inst, u32 rt)
+{
+	u32 *p;
+	int distance_start;
+	int distance_end;
+	ulong next_inst;
+
+	p = kvm_alloc(kvm_emulate_mtmsrd_len * 4);
+	if (!p)
+		return;
+
+	/* Find out where we are and put everything there */
+	distance_start = (ulong)p - (ulong)inst;
+	next_inst = ((ulong)inst + 4);
+	distance_end = next_inst - (ulong)&p[kvm_emulate_mtmsrd_branch_offs];
+
+	/* Make sure we only write valid b instructions */
+	if (distance_start > KVM_INST_B_MAX) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	/* Modify the chunk to fit the invocation */
+	memcpy(p, kvm_emulate_mtmsrd, kvm_emulate_mtmsrd_len * 4);
+	p[kvm_emulate_mtmsrd_branch_offs] |= distance_end & KVM_INST_B_MASK;
+	p[kvm_emulate_mtmsrd_reg_offs] |= rt;
+	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtmsrd_len * 4);
+
+	/* Patch the invocation */
+	kvm_patch_ins_b(inst, distance_start);
+}
+
 static void kvm_map_magic_page(void *data)
 {
 	kvm_hypercall2(KVM_HC_PPC_MAP_MAGIC_PAGE,
@@ -211,6 +249,13 @@ static void kvm_check_ins(u32 *inst)
 	case KVM_INST_TLBSYNC:
 		kvm_patch_ins_nop(inst);
 		break;
+
+	/* Rewrites */
+	case KVM_INST_MTMSRD_L1:
+		/* We use r30 and r31 during the hook */
+		if (get_rt(inst_rt) < 30)
+			kvm_patch_ins_mtmsrd(inst, inst_rt);
+		break;
 	}
 
 	switch (_inst) {
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
index 1dac72d..10dc4a6 100644
--- a/arch/powerpc/kernel/kvm_emul.S
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -64,3 +64,59 @@ kvm_hypercall_start:
 	/* Disable critical section. We are critical if			\
 	   shared->critical == r1 and r2 is always != r1 */		\
 	STL64(r2, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0);
+
+.global kvm_emulate_mtmsrd
+kvm_emulate_mtmsrd:
+
+	SCRATCH_SAVE
+
+	/* Put MSR & ~(MSR_EE|MSR_RI) in r31 */
+	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+	lis	r30, (~(MSR_EE | MSR_RI))@h
+	ori	r30, r30, (~(MSR_EE | MSR_RI))@l
+	and	r31, r31, r30
+
+	/* OR the register's (MSR_EE|MSR_RI) on MSR */
+kvm_emulate_mtmsrd_reg:
+	andi.	r30, r0, (MSR_EE|MSR_RI)
+	or	r31, r31, r30
+
+	/* Put MSR back into magic page */
+	STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	/* Check if we have to fetch an interrupt */
+	lwz	r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0)
+	cmpwi	r31, 0
+	beq+	no_check
+
+	/* Check if we may trigger an interrupt */
+	andi.	r30, r30, MSR_EE
+	beq	no_check
+
+	SCRATCH_RESTORE
+
+	/* Nag hypervisor */
+	tlbsync
+
+	b	kvm_emulate_mtmsrd_branch
+
+no_check:
+
+	SCRATCH_RESTORE
+
+	/* Go back to caller */
+kvm_emulate_mtmsrd_branch:
+	b	.
+kvm_emulate_mtmsrd_end:
+
+.global kvm_emulate_mtmsrd_branch_offs
+kvm_emulate_mtmsrd_branch_offs:
+	.long (kvm_emulate_mtmsrd_branch - kvm_emulate_mtmsrd) / 4
+
+.global kvm_emulate_mtmsrd_reg_offs
+kvm_emulate_mtmsrd_reg_offs:
+	.long (kvm_emulate_mtmsrd_reg - kvm_emulate_mtmsrd) / 4
+
+.global kvm_emulate_mtmsrd_len
+kvm_emulate_mtmsrd_len:
+	.long (kvm_emulate_mtmsrd_end - kvm_emulate_mtmsrd) / 4
-- 
cgit v0.10.2


From 7810927760a0d16d7a41be4dab895fbbf9445bc0 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:48:05 +0200
Subject: KVM: PPC: PV mtmsrd L=0 and mtmsr

There is also a form of mtmsr where all bits need to be addressed. While the
PPC64 Linux kernel behaves resonably well here, on PPC32 we do not have an
L=1 form. It does mtmsr even for simple things like only changing EE.

So we need to hook into that one as well and check for a mask of bits that we
deem safe to change from within guest context.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 717ab0d..8ac57e2 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -63,7 +63,9 @@
 #define KVM_INST_MTSPR_DSISR	0x7c1203a6
 
 #define KVM_INST_TLBSYNC	0x7c00046c
+#define KVM_INST_MTMSRD_L0	0x7c000164
 #define KVM_INST_MTMSRD_L1	0x7c010164
+#define KVM_INST_MTMSR		0x7c000124
 
 static bool kvm_patching_worked = true;
 static char kvm_tmp[1024 * 1024];
@@ -176,6 +178,49 @@ static void kvm_patch_ins_mtmsrd(u32 *inst, u32 rt)
 	kvm_patch_ins_b(inst, distance_start);
 }
 
+extern u32 kvm_emulate_mtmsr_branch_offs;
+extern u32 kvm_emulate_mtmsr_reg1_offs;
+extern u32 kvm_emulate_mtmsr_reg2_offs;
+extern u32 kvm_emulate_mtmsr_reg3_offs;
+extern u32 kvm_emulate_mtmsr_orig_ins_offs;
+extern u32 kvm_emulate_mtmsr_len;
+extern u32 kvm_emulate_mtmsr[];
+
+static void kvm_patch_ins_mtmsr(u32 *inst, u32 rt)
+{
+	u32 *p;
+	int distance_start;
+	int distance_end;
+	ulong next_inst;
+
+	p = kvm_alloc(kvm_emulate_mtmsr_len * 4);
+	if (!p)
+		return;
+
+	/* Find out where we are and put everything there */
+	distance_start = (ulong)p - (ulong)inst;
+	next_inst = ((ulong)inst + 4);
+	distance_end = next_inst - (ulong)&p[kvm_emulate_mtmsr_branch_offs];
+
+	/* Make sure we only write valid b instructions */
+	if (distance_start > KVM_INST_B_MAX) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	/* Modify the chunk to fit the invocation */
+	memcpy(p, kvm_emulate_mtmsr, kvm_emulate_mtmsr_len * 4);
+	p[kvm_emulate_mtmsr_branch_offs] |= distance_end & KVM_INST_B_MASK;
+	p[kvm_emulate_mtmsr_reg1_offs] |= rt;
+	p[kvm_emulate_mtmsr_reg2_offs] |= rt;
+	p[kvm_emulate_mtmsr_reg3_offs] |= rt;
+	p[kvm_emulate_mtmsr_orig_ins_offs] = *inst;
+	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtmsr_len * 4);
+
+	/* Patch the invocation */
+	kvm_patch_ins_b(inst, distance_start);
+}
+
 static void kvm_map_magic_page(void *data)
 {
 	kvm_hypercall2(KVM_HC_PPC_MAP_MAGIC_PAGE,
@@ -256,6 +301,12 @@ static void kvm_check_ins(u32 *inst)
 		if (get_rt(inst_rt) < 30)
 			kvm_patch_ins_mtmsrd(inst, inst_rt);
 		break;
+	case KVM_INST_MTMSR:
+	case KVM_INST_MTMSRD_L0:
+		/* We use r30 and r31 during the hook */
+		if (get_rt(inst_rt) < 30)
+			kvm_patch_ins_mtmsr(inst, inst_rt);
+		break;
 	}
 
 	switch (_inst) {
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
index 10dc4a6..8cd22f4 100644
--- a/arch/powerpc/kernel/kvm_emul.S
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -120,3 +120,87 @@ kvm_emulate_mtmsrd_reg_offs:
 .global kvm_emulate_mtmsrd_len
 kvm_emulate_mtmsrd_len:
 	.long (kvm_emulate_mtmsrd_end - kvm_emulate_mtmsrd) / 4
+
+
+#define MSR_SAFE_BITS (MSR_EE | MSR_CE | MSR_ME | MSR_RI)
+#define MSR_CRITICAL_BITS ~MSR_SAFE_BITS
+
+.global kvm_emulate_mtmsr
+kvm_emulate_mtmsr:
+
+	SCRATCH_SAVE
+
+	/* Fetch old MSR in r31 */
+	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	/* Find the changed bits between old and new MSR */
+kvm_emulate_mtmsr_reg1:
+	xor	r31, r0, r31
+
+	/* Check if we need to really do mtmsr */
+	LOAD_REG_IMMEDIATE(r30, MSR_CRITICAL_BITS)
+	and.	r31, r31, r30
+
+	/* No critical bits changed? Maybe we can stay in the guest. */
+	beq	maybe_stay_in_guest
+
+do_mtmsr:
+
+	SCRATCH_RESTORE
+
+	/* Just fire off the mtmsr if it's critical */
+kvm_emulate_mtmsr_orig_ins:
+	mtmsr	r0
+
+	b	kvm_emulate_mtmsr_branch
+
+maybe_stay_in_guest:
+
+	/* Check if we have to fetch an interrupt */
+	lwz	r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0)
+	cmpwi	r31, 0
+	beq+	no_mtmsr
+
+	/* Check if we may trigger an interrupt */
+kvm_emulate_mtmsr_reg2:
+	andi.	r31, r0, MSR_EE
+	beq	no_mtmsr
+
+	b	do_mtmsr
+
+no_mtmsr:
+
+	/* Put MSR into magic page because we don't call mtmsr */
+kvm_emulate_mtmsr_reg3:
+	STL64(r0, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	SCRATCH_RESTORE
+
+	/* Go back to caller */
+kvm_emulate_mtmsr_branch:
+	b	.
+kvm_emulate_mtmsr_end:
+
+.global kvm_emulate_mtmsr_branch_offs
+kvm_emulate_mtmsr_branch_offs:
+	.long (kvm_emulate_mtmsr_branch - kvm_emulate_mtmsr) / 4
+
+.global kvm_emulate_mtmsr_reg1_offs
+kvm_emulate_mtmsr_reg1_offs:
+	.long (kvm_emulate_mtmsr_reg1 - kvm_emulate_mtmsr) / 4
+
+.global kvm_emulate_mtmsr_reg2_offs
+kvm_emulate_mtmsr_reg2_offs:
+	.long (kvm_emulate_mtmsr_reg2 - kvm_emulate_mtmsr) / 4
+
+.global kvm_emulate_mtmsr_reg3_offs
+kvm_emulate_mtmsr_reg3_offs:
+	.long (kvm_emulate_mtmsr_reg3 - kvm_emulate_mtmsr) / 4
+
+.global kvm_emulate_mtmsr_orig_ins_offs
+kvm_emulate_mtmsr_orig_ins_offs:
+	.long (kvm_emulate_mtmsr_orig_ins - kvm_emulate_mtmsr) / 4
+
+.global kvm_emulate_mtmsr_len
+kvm_emulate_mtmsr_len:
+	.long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4
-- 
cgit v0.10.2


From 644bfa013fd589b0df2470a66bcd104318ef24cd Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:48:06 +0200
Subject: KVM: PPC: PV wrteei

On BookE the preferred way to write the EE bit is the wrteei instruction. It
already encodes the EE bit in the instruction.

So in order to get BookE some speedups as well, let's also PV'nize thati
instruction.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 8ac57e2..e936817 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -67,6 +67,9 @@
 #define KVM_INST_MTMSRD_L1	0x7c010164
 #define KVM_INST_MTMSR		0x7c000124
 
+#define KVM_INST_WRTEEI_0	0x7c000146
+#define KVM_INST_WRTEEI_1	0x7c008146
+
 static bool kvm_patching_worked = true;
 static char kvm_tmp[1024 * 1024];
 static int kvm_tmp_index;
@@ -221,6 +224,47 @@ static void kvm_patch_ins_mtmsr(u32 *inst, u32 rt)
 	kvm_patch_ins_b(inst, distance_start);
 }
 
+#ifdef CONFIG_BOOKE
+
+extern u32 kvm_emulate_wrteei_branch_offs;
+extern u32 kvm_emulate_wrteei_ee_offs;
+extern u32 kvm_emulate_wrteei_len;
+extern u32 kvm_emulate_wrteei[];
+
+static void kvm_patch_ins_wrteei(u32 *inst)
+{
+	u32 *p;
+	int distance_start;
+	int distance_end;
+	ulong next_inst;
+
+	p = kvm_alloc(kvm_emulate_wrteei_len * 4);
+	if (!p)
+		return;
+
+	/* Find out where we are and put everything there */
+	distance_start = (ulong)p - (ulong)inst;
+	next_inst = ((ulong)inst + 4);
+	distance_end = next_inst - (ulong)&p[kvm_emulate_wrteei_branch_offs];
+
+	/* Make sure we only write valid b instructions */
+	if (distance_start > KVM_INST_B_MAX) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	/* Modify the chunk to fit the invocation */
+	memcpy(p, kvm_emulate_wrteei, kvm_emulate_wrteei_len * 4);
+	p[kvm_emulate_wrteei_branch_offs] |= distance_end & KVM_INST_B_MASK;
+	p[kvm_emulate_wrteei_ee_offs] |= (*inst & MSR_EE);
+	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrteei_len * 4);
+
+	/* Patch the invocation */
+	kvm_patch_ins_b(inst, distance_start);
+}
+
+#endif
+
 static void kvm_map_magic_page(void *data)
 {
 	kvm_hypercall2(KVM_HC_PPC_MAP_MAGIC_PAGE,
@@ -310,6 +354,12 @@ static void kvm_check_ins(u32 *inst)
 	}
 
 	switch (_inst) {
+#ifdef CONFIG_BOOKE
+	case KVM_INST_WRTEEI_0:
+	case KVM_INST_WRTEEI_1:
+		kvm_patch_ins_wrteei(inst);
+		break;
+#endif
 	}
 }
 
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
index 8cd22f4..3199f65 100644
--- a/arch/powerpc/kernel/kvm_emul.S
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -204,3 +204,44 @@ kvm_emulate_mtmsr_orig_ins_offs:
 .global kvm_emulate_mtmsr_len
 kvm_emulate_mtmsr_len:
 	.long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4
+
+
+
+.global kvm_emulate_wrteei
+kvm_emulate_wrteei:
+
+	SCRATCH_SAVE
+
+	/* Fetch old MSR in r31 */
+	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	/* Remove MSR_EE from old MSR */
+	li	r30, 0
+	ori	r30, r30, MSR_EE
+	andc	r31, r31, r30
+
+	/* OR new MSR_EE onto the old MSR */
+kvm_emulate_wrteei_ee:
+	ori	r31, r31, 0
+
+	/* Write new MSR value back */
+	STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	SCRATCH_RESTORE
+
+	/* Go back to caller */
+kvm_emulate_wrteei_branch:
+	b	.
+kvm_emulate_wrteei_end:
+
+.global kvm_emulate_wrteei_branch_offs
+kvm_emulate_wrteei_branch_offs:
+	.long (kvm_emulate_wrteei_branch - kvm_emulate_wrteei) / 4
+
+.global kvm_emulate_wrteei_ee_offs
+kvm_emulate_wrteei_ee_offs:
+	.long (kvm_emulate_wrteei_ee - kvm_emulate_wrteei) / 4
+
+.global kvm_emulate_wrteei_len
+kvm_emulate_wrteei_len:
+	.long (kvm_emulate_wrteei_end - kvm_emulate_wrteei) / 4
-- 
cgit v0.10.2


From d7d3c2ea99c4845611997cf728af88c4c232e908 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:48:07 +0200
Subject: KVM: PPC: Add Documentation about PV interface

We just introduced a new PV interface that screams for documentation. So here
it is - a shiny new and awesome text file describing the internal works of
the PPC KVM paravirtual interface.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/Documentation/kvm/ppc-pv.txt b/Documentation/kvm/ppc-pv.txt
new file mode 100644
index 0000000..41ee16d
--- /dev/null
+++ b/Documentation/kvm/ppc-pv.txt
@@ -0,0 +1,179 @@
+The PPC KVM paravirtual interface
+=================================
+
+The basic execution principle by which KVM on PowerPC works is to run all kernel
+space code in PR=1 which is user space. This way we trap all privileged
+instructions and can emulate them accordingly.
+
+Unfortunately that is also the downfall. There are quite some privileged
+instructions that needlessly return us to the hypervisor even though they
+could be handled differently.
+
+This is what the PPC PV interface helps with. It takes privileged instructions
+and transforms them into unprivileged ones with some help from the hypervisor.
+This cuts down virtualization costs by about 50% on some of my benchmarks.
+
+The code for that interface can be found in arch/powerpc/kernel/kvm*
+
+Querying for existence
+======================
+
+To find out if we're running on KVM or not, we leverage the device tree. When
+Linux is running on KVM, a node /hypervisor exists. That node contains a
+compatible property with the value "linux,kvm".
+
+Once you determined you're running under a PV capable KVM, you can now use
+hypercalls as described below.
+
+KVM hypercalls
+==============
+
+Inside the device tree's /hypervisor node there's a property called
+'hypercall-instructions'. This property contains at most 4 opcodes that make
+up the hypercall. To call a hypercall, just call these instructions.
+
+The parameters are as follows:
+
+	Register	IN			OUT
+
+	r0		-			volatile
+	r3		1st parameter		Return code
+	r4		2nd parameter		1st output value
+	r5		3rd parameter		2nd output value
+	r6		4th parameter		3rd output value
+	r7		5th parameter		4th output value
+	r8		6th parameter		5th output value
+	r9		7th parameter		6th output value
+	r10		8th parameter		7th output value
+	r11		hypercall number	8th output value
+	r12		-			volatile
+
+Hypercall definitions are shared in generic code, so the same hypercall numbers
+apply for x86 and powerpc alike with the exception that each KVM hypercall
+also needs to be ORed with the KVM vendor code which is (42 << 16).
+
+Return codes can be as follows:
+
+	Code		Meaning
+
+	0		Success
+	12		Hypercall not implemented
+	<0		Error
+
+The magic page
+==============
+
+To enable communication between the hypervisor and guest there is a new shared
+page that contains parts of supervisor visible register state. The guest can
+map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE.
+
+With this hypercall issued the guest always gets the magic page mapped at the
+desired location in effective and physical address space. For now, we always
+map the page to -4096. This way we can access it using absolute load and store
+functions. The following instruction reads the first field of the magic page:
+
+	ld	rX, -4096(0)
+
+The interface is designed to be extensible should there be need later to add
+additional registers to the magic page. If you add fields to the magic page,
+also define a new hypercall feature to indicate that the host can give you more
+registers. Only if the host supports the additional features, make use of them.
+
+The magic page has the following layout as described in
+arch/powerpc/include/asm/kvm_para.h:
+
+struct kvm_vcpu_arch_shared {
+	__u64 scratch1;
+	__u64 scratch2;
+	__u64 scratch3;
+	__u64 critical;		/* Guest may not get interrupts if == r1 */
+	__u64 sprg0;
+	__u64 sprg1;
+	__u64 sprg2;
+	__u64 sprg3;
+	__u64 srr0;
+	__u64 srr1;
+	__u64 dar;
+	__u64 msr;
+	__u32 dsisr;
+	__u32 int_pending;	/* Tells the guest if we have an interrupt */
+};
+
+Additions to the page must only occur at the end. Struct fields are always 32
+or 64 bit aligned, depending on them being 32 or 64 bit wide respectively.
+
+MSR bits
+========
+
+The MSR contains bits that require hypervisor intervention and bits that do
+not require direct hypervisor intervention because they only get interpreted
+when entering the guest or don't have any impact on the hypervisor's behavior.
+
+The following bits are safe to be set inside the guest:
+
+  MSR_EE
+  MSR_RI
+  MSR_CR
+  MSR_ME
+
+If any other bit changes in the MSR, please still use mtmsr(d).
+
+Patched instructions
+====================
+
+The "ld" and "std" instructions are transormed to "lwz" and "stw" instructions
+respectively on 32 bit systems with an added offset of 4 to accomodate for big
+endianness.
+
+The following is a list of mapping the Linux kernel performs when running as
+guest. Implementing any of those mappings is optional, as the instruction traps
+also act on the shared page. So calling privileged instructions still works as
+before.
+
+From			To
+====			==
+
+mfmsr	rX		ld	rX, magic_page->msr
+mfsprg	rX, 0		ld	rX, magic_page->sprg0
+mfsprg	rX, 1		ld	rX, magic_page->sprg1
+mfsprg	rX, 2		ld	rX, magic_page->sprg2
+mfsprg	rX, 3		ld	rX, magic_page->sprg3
+mfsrr0	rX		ld	rX, magic_page->srr0
+mfsrr1	rX		ld	rX, magic_page->srr1
+mfdar	rX		ld	rX, magic_page->dar
+mfdsisr	rX		lwz	rX, magic_page->dsisr
+
+mtmsr	rX		std	rX, magic_page->msr
+mtsprg	0, rX		std	rX, magic_page->sprg0
+mtsprg	1, rX		std	rX, magic_page->sprg1
+mtsprg	2, rX		std	rX, magic_page->sprg2
+mtsprg	3, rX		std	rX, magic_page->sprg3
+mtsrr0	rX		std	rX, magic_page->srr0
+mtsrr1	rX		std	rX, magic_page->srr1
+mtdar	rX		std	rX, magic_page->dar
+mtdsisr	rX		stw	rX, magic_page->dsisr
+
+tlbsync			nop
+
+mtmsrd	rX, 0		b	<special mtmsr section>
+mtmsr	rX		b	<special mtmsr section>
+
+mtmsrd	rX, 1		b	<special mtmsrd section>
+
+[BookE only]
+wrteei	[0|1]		b	<special wrteei section>
+
+
+Some instructions require more logic to determine what's going on than a load
+or store instruction can deliver. To enable patching of those, we keep some
+RAM around where we can live translate instructions to. What happens is the
+following:
+
+	1) copy emulation code to memory
+	2) patch that code to fit the emulated instruction
+	3) patch that code to return to the original pc + 4
+	4) patch the original instruction to branch to the new code
+
+That way we can inject an arbitrary amount of code as replacement for a single
+instruction. This allows us to check for pending interrupts when setting EE=1
+for example.
-- 
cgit v0.10.2


From 15711e9c927bfc08e66791cbf0ca7887c0880768 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:48:08 +0200
Subject: KVM: PPC: Add get_pvinfo interface to query hypercall instructions

We need to tell the guest the opcodes that make up a hypercall through
interfaces that are controlled by userspace. So we need to add a call
for userspace to allow it to query those opcodes so it can pass them
on.

This is required because the hypercall opcodes can change based on
the hypervisor conditions. If we're running in hardware accelerated
hypervisor mode, a hypercall looks different from when we're running
without hardware acceleration.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 5f5b649..44d9893 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -1032,6 +1032,29 @@ are defined as follows:
    eax, ebx, ecx, edx: the values returned by the cpuid instruction for
          this function/index combination
 
+4.46 KVM_PPC_GET_PVINFO
+
+Capability: KVM_CAP_PPC_GET_PVINFO
+Architectures: ppc
+Type: vm ioctl
+Parameters: struct kvm_ppc_pvinfo (out)
+Returns: 0 on success, !0 on error
+
+struct kvm_ppc_pvinfo {
+	__u32 flags;
+	__u32 hcall[4];
+	__u8  pad[108];
+};
+
+This ioctl fetches PV specific information that need to be passed to the guest
+using the device tree or other means from vm context.
+
+For now the only implemented piece of information distributed here is an array
+of 4 instructions that make up a hypercall.
+
+If any additional field gets added to this structure later on, a bit for that
+additional piece of information will be set in the flags bitmap.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index fecfe04..6a53a3f 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -191,6 +191,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PPC_UNSET_IRQ:
 	case KVM_CAP_ENABLE_CAP:
 	case KVM_CAP_PPC_OSI:
+	case KVM_CAP_PPC_GET_PVINFO:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -578,16 +579,53 @@ out:
 	return r;
 }
 
+static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
+{
+	u32 inst_lis = 0x3c000000;
+	u32 inst_ori = 0x60000000;
+	u32 inst_nop = 0x60000000;
+	u32 inst_sc = 0x44000002;
+	u32 inst_imm_mask = 0xffff;
+
+	/*
+	 * The hypercall to get into KVM from within guest context is as
+	 * follows:
+	 *
+	 *    lis r0, r0, KVM_SC_MAGIC_R0@h
+	 *    ori r0, KVM_SC_MAGIC_R0@l
+	 *    sc
+	 *    nop
+	 */
+	pvinfo->hcall[0] = inst_lis | ((KVM_SC_MAGIC_R0 >> 16) & inst_imm_mask);
+	pvinfo->hcall[1] = inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask);
+	pvinfo->hcall[2] = inst_sc;
+	pvinfo->hcall[3] = inst_nop;
+
+	return 0;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp,
                        unsigned int ioctl, unsigned long arg)
 {
+	void __user *argp = (void __user *)arg;
 	long r;
 
 	switch (ioctl) {
+	case KVM_PPC_GET_PVINFO: {
+		struct kvm_ppc_pvinfo pvinfo;
+		r = kvm_vm_ioctl_get_pvinfo(&pvinfo);
+		if (copy_to_user(argp, &pvinfo, sizeof(pvinfo))) {
+			r = -EFAULT;
+			goto out;
+		}
+
+		break;
+	}
 	default:
 		r = -ENOTTY;
 	}
 
+out:
 	return r;
 }
 
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 636fc38..3707704 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -414,6 +414,14 @@ struct kvm_enable_cap {
 	__u8  pad[64];
 };
 
+/* for KVM_PPC_GET_PVINFO */
+struct kvm_ppc_pvinfo {
+	/* out */
+	__u32 flags;
+	__u32 hcall[4];
+	__u8  pad[108];
+};
+
 #define KVMIO 0xAE
 
 /*
@@ -530,6 +538,7 @@ struct kvm_enable_cap {
 #ifdef __KVM_HAVE_XCRS
 #define KVM_CAP_XCRS 56
 #endif
+#define KVM_CAP_PPC_GET_PVINFO 57
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -664,6 +673,8 @@ struct kvm_clock_data {
 /* Available with KVM_CAP_PIT_STATE2 */
 #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
 #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
+/* Available with KVM_CAP_PPC_GET_PVINFO */
+#define KVM_PPC_GET_PVINFO	  _IOW(KVMIO,  0xa1, struct kvm_ppc_pvinfo)
 
 /*
  * ioctls for vcpu fds
-- 
cgit v0.10.2


From 5302104235f0e9f05781b92a4ab25d20e4537f56 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 15:04:16 +0200
Subject: KVM: PPC: Book3S_32 MMU debug compile fixes

Due to previous changes, the Book3S_32 guest MMU code didn't compile properly
when enabling debugging.

This patch repairs the broken code paths, making it possible to define DEBUG_MMU
and friends again.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
index a7d121a..5bf4bf8 100644
--- a/arch/powerpc/kvm/book3s_32_mmu.c
+++ b/arch/powerpc/kvm/book3s_32_mmu.c
@@ -104,7 +104,7 @@ static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3
 	pteg = (vcpu_book3s->sdr1 & 0xffff0000) | hash;
 
 	dprintk("MMU: pc=0x%lx eaddr=0x%lx sdr1=0x%llx pteg=0x%x vsid=0x%x\n",
-		vcpu_book3s->vcpu.arch.pc, eaddr, vcpu_book3s->sdr1, pteg,
+		kvmppc_get_pc(&vcpu_book3s->vcpu), eaddr, vcpu_book3s->sdr1, pteg,
 		sre->vsid);
 
 	r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT);
@@ -269,7 +269,7 @@ no_page_found:
 		dprintk_pte("KVM MMU: No PTE found (sdr1=0x%llx ptegp=0x%lx)\n",
 			    to_book3s(vcpu)->sdr1, ptegp);
 		for (i=0; i<16; i+=2) {
-			dprintk_pte("   %02d: 0x%x - 0x%x (0x%llx)\n",
+			dprintk_pte("   %02d: 0x%x - 0x%x (0x%x)\n",
 				    i, pteg[i], pteg[i+1], ptem);
 		}
 	}
-- 
cgit v0.10.2


From 2e0908afaf03675d22e40ce45a66b8d2070214ac Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 15:04:17 +0200
Subject: KVM: PPC: RCU'ify the Book3s MMU

So far we've been running all code without locking of any sort. This wasn't
really an issue because I didn't see any parallel access to the shadow MMU
code coming.

But then I started to implement dirty bitmapping to MOL which has the video
code in its own thread, so suddenly we had the dirty bitmap code run in
parallel to the shadow mmu code. And with that came trouble.

So I went ahead and made the MMU modifying functions as parallelizable as
I could think of. I hope I didn't screw up too much RCU logic :-). If you
know your way around RCU and locking and what needs to be done when, please
take a look at this patch.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e1da775..fafc71a 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -165,6 +165,7 @@ struct hpte_cache {
 	struct hlist_node list_pte;
 	struct hlist_node list_vpte;
 	struct hlist_node list_vpte_long;
+	struct rcu_head rcu_head;
 	u64 host_va;
 	u64 pfn;
 	ulong slot;
@@ -295,6 +296,7 @@ struct kvm_vcpu_arch {
 	struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
 	struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG];
 	int hpte_cache_count;
+	spinlock_t mmu_lock;
 #endif
 };
 
diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
index 4868d4a..b643893 100644
--- a/arch/powerpc/kvm/book3s_mmu_hpte.c
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -60,68 +60,94 @@ void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
 	u64 index;
 
+	spin_lock(&vcpu->arch.mmu_lock);
+
 	/* Add to ePTE list */
 	index = kvmppc_mmu_hash_pte(pte->pte.eaddr);
-	hlist_add_head(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]);
+	hlist_add_head_rcu(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]);
 
 	/* Add to vPTE list */
 	index = kvmppc_mmu_hash_vpte(pte->pte.vpage);
-	hlist_add_head(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]);
+	hlist_add_head_rcu(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]);
 
 	/* Add to vPTE_long list */
 	index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage);
-	hlist_add_head(&pte->list_vpte_long,
-		       &vcpu->arch.hpte_hash_vpte_long[index]);
+	hlist_add_head_rcu(&pte->list_vpte_long,
+			   &vcpu->arch.hpte_hash_vpte_long[index]);
+
+	spin_unlock(&vcpu->arch.mmu_lock);
+}
+
+static void free_pte_rcu(struct rcu_head *head)
+{
+	struct hpte_cache *pte = container_of(head, struct hpte_cache, rcu_head);
+	kmem_cache_free(hpte_cache, pte);
 }
 
 static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
+	/* pte already invalidated? */
+	if (hlist_unhashed(&pte->list_pte))
+		return;
+
 	dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n",
 		    pte->pte.eaddr, pte->pte.vpage, pte->host_va);
 
 	/* Different for 32 and 64 bit */
 	kvmppc_mmu_invalidate_pte(vcpu, pte);
 
+	spin_lock(&vcpu->arch.mmu_lock);
+
+	hlist_del_init_rcu(&pte->list_pte);
+	hlist_del_init_rcu(&pte->list_vpte);
+	hlist_del_init_rcu(&pte->list_vpte_long);
+
+	spin_unlock(&vcpu->arch.mmu_lock);
+
 	if (pte->pte.may_write)
 		kvm_release_pfn_dirty(pte->pfn);
 	else
 		kvm_release_pfn_clean(pte->pfn);
 
-	hlist_del(&pte->list_pte);
-	hlist_del(&pte->list_vpte);
-	hlist_del(&pte->list_vpte_long);
-
 	vcpu->arch.hpte_cache_count--;
-	kmem_cache_free(hpte_cache, pte);
+	call_rcu(&pte->rcu_head, free_pte_rcu);
 }
 
 static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
 {
 	struct hpte_cache *pte;
-	struct hlist_node *node, *tmp;
+	struct hlist_node *node;
 	int i;
 
+	rcu_read_lock();
+
 	for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
 		struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
 
-		hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long)
+		hlist_for_each_entry_rcu(pte, node, list, list_vpte_long)
 			invalidate_pte(vcpu, pte);
 	}
+
+	rcu_read_unlock();
 }
 
 static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea)
 {
 	struct hlist_head *list;
-	struct hlist_node *node, *tmp;
+	struct hlist_node *node;
 	struct hpte_cache *pte;
 
 	/* Find the list of entries in the map */
 	list = &vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];
 
+	rcu_read_lock();
+
 	/* Check the list for matching entries and invalidate */
-	hlist_for_each_entry_safe(pte, node, tmp, list, list_pte)
+	hlist_for_each_entry_rcu(pte, node, list, list_pte)
 		if ((pte->pte.eaddr & ~0xfffUL) == guest_ea)
 			invalidate_pte(vcpu, pte);
+
+	rcu_read_unlock();
 }
 
 void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
@@ -156,33 +182,41 @@ void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
 static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
 {
 	struct hlist_head *list;
-	struct hlist_node *node, *tmp;
+	struct hlist_node *node;
 	struct hpte_cache *pte;
 	u64 vp_mask = 0xfffffffffULL;
 
 	list = &vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
 
+	rcu_read_lock();
+
 	/* Check the list for matching entries and invalidate */
-	hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte)
+	hlist_for_each_entry_rcu(pte, node, list, list_vpte)
 		if ((pte->pte.vpage & vp_mask) == guest_vp)
 			invalidate_pte(vcpu, pte);
+
+	rcu_read_unlock();
 }
 
 /* Flush with mask 0xffffff000 */
 static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp)
 {
 	struct hlist_head *list;
-	struct hlist_node *node, *tmp;
+	struct hlist_node *node;
 	struct hpte_cache *pte;
 	u64 vp_mask = 0xffffff000ULL;
 
 	list = &vcpu->arch.hpte_hash_vpte_long[
 		kvmppc_mmu_hash_vpte_long(guest_vp)];
 
+	rcu_read_lock();
+
 	/* Check the list for matching entries and invalidate */
-	hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long)
+	hlist_for_each_entry_rcu(pte, node, list, list_vpte_long)
 		if ((pte->pte.vpage & vp_mask) == guest_vp)
 			invalidate_pte(vcpu, pte);
+
+	rcu_read_unlock();
 }
 
 void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
@@ -206,21 +240,25 @@ void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
 
 void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
 {
-	struct hlist_node *node, *tmp;
+	struct hlist_node *node;
 	struct hpte_cache *pte;
 	int i;
 
 	dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx - 0x%lx\n",
 		    vcpu->arch.hpte_cache_count, pa_start, pa_end);
 
+	rcu_read_lock();
+
 	for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
 		struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
 
-		hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long)
+		hlist_for_each_entry_rcu(pte, node, list, list_vpte_long)
 			if ((pte->pte.raddr >= pa_start) &&
 			    (pte->pte.raddr < pa_end))
 				invalidate_pte(vcpu, pte);
 	}
+
+	rcu_read_unlock();
 }
 
 struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
@@ -259,6 +297,8 @@ int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
 	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long,
 				  ARRAY_SIZE(vcpu->arch.hpte_hash_vpte_long));
 
+	spin_lock_init(&vcpu->arch.mmu_lock);
+
 	return 0;
 }
 
-- 
cgit v0.10.2


From 49451389ecc2b4336c305678c210b25fadd18994 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 29 Jul 2010 15:04:18 +0200
Subject: KVM: PPC: correctly check gfn_to_pfn() return value

On failure gfn_to_pfn returns bad_page so use correct function to check
for that.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index 05e8c9e..343452c 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -148,7 +148,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
 
 	/* Get host physical address for gpa */
 	hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
-	if (kvm_is_error_hva(hpaddr)) {
+	if (is_error_pfn(hpaddr)) {
 		printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n",
 				 orig_pte->eaddr);
 		return -EINVAL;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 6cdd19a..672b149 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -102,7 +102,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
 
 	/* Get host physical address for gpa */
 	hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
-	if (kvm_is_error_hva(hpaddr)) {
+	if (is_error_pfn(hpaddr)) {
 		printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr);
 		return -EINVAL;
 	}
-- 
cgit v0.10.2


From 2d27fc5eac0205588cb59ae138062e5e96695276 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 15:04:19 +0200
Subject: KVM: PPC: Add book3s_32 tlbie flush acceleration

On Book3s_32 the tlbie instruction flushed effective addresses by the mask
0x0ffff000. This is pretty hard to reflect with a hash that hashes ~0xfff, so
to speed up that target we should also keep a special hash around for it.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index fafc71a..bba3b9b 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -42,9 +42,11 @@
 
 #define HPTEG_CACHE_NUM			(1 << 15)
 #define HPTEG_HASH_BITS_PTE		13
+#define HPTEG_HASH_BITS_PTE_LONG	12
 #define HPTEG_HASH_BITS_VPTE		13
 #define HPTEG_HASH_BITS_VPTE_LONG	5
 #define HPTEG_HASH_NUM_PTE		(1 << HPTEG_HASH_BITS_PTE)
+#define HPTEG_HASH_NUM_PTE_LONG		(1 << HPTEG_HASH_BITS_PTE_LONG)
 #define HPTEG_HASH_NUM_VPTE		(1 << HPTEG_HASH_BITS_VPTE)
 #define HPTEG_HASH_NUM_VPTE_LONG	(1 << HPTEG_HASH_BITS_VPTE_LONG)
 
@@ -163,6 +165,7 @@ struct kvmppc_mmu {
 
 struct hpte_cache {
 	struct hlist_node list_pte;
+	struct hlist_node list_pte_long;
 	struct hlist_node list_vpte;
 	struct hlist_node list_vpte_long;
 	struct rcu_head rcu_head;
@@ -293,6 +296,7 @@ struct kvm_vcpu_arch {
 
 #ifdef CONFIG_PPC_BOOK3S
 	struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
+	struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
 	struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
 	struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG];
 	int hpte_cache_count;
diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
index b643893..02c64ab 100644
--- a/arch/powerpc/kvm/book3s_mmu_hpte.c
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -45,6 +45,12 @@ static inline u64 kvmppc_mmu_hash_pte(u64 eaddr)
 	return hash_64(eaddr >> PTE_SIZE, HPTEG_HASH_BITS_PTE);
 }
 
+static inline u64 kvmppc_mmu_hash_pte_long(u64 eaddr)
+{
+	return hash_64((eaddr & 0x0ffff000) >> PTE_SIZE,
+		       HPTEG_HASH_BITS_PTE_LONG);
+}
+
 static inline u64 kvmppc_mmu_hash_vpte(u64 vpage)
 {
 	return hash_64(vpage & 0xfffffffffULL, HPTEG_HASH_BITS_VPTE);
@@ -66,6 +72,11 @@ void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 	index = kvmppc_mmu_hash_pte(pte->pte.eaddr);
 	hlist_add_head_rcu(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]);
 
+	/* Add to ePTE_long list */
+	index = kvmppc_mmu_hash_pte_long(pte->pte.eaddr);
+	hlist_add_head_rcu(&pte->list_pte_long,
+			   &vcpu->arch.hpte_hash_pte_long[index]);
+
 	/* Add to vPTE list */
 	index = kvmppc_mmu_hash_vpte(pte->pte.vpage);
 	hlist_add_head_rcu(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]);
@@ -99,6 +110,7 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 	spin_lock(&vcpu->arch.mmu_lock);
 
 	hlist_del_init_rcu(&pte->list_pte);
+	hlist_del_init_rcu(&pte->list_pte_long);
 	hlist_del_init_rcu(&pte->list_vpte);
 	hlist_del_init_rcu(&pte->list_vpte_long);
 
@@ -150,10 +162,28 @@ static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea)
 	rcu_read_unlock();
 }
 
-void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
+static void kvmppc_mmu_pte_flush_long(struct kvm_vcpu *vcpu, ulong guest_ea)
 {
-	u64 i;
+	struct hlist_head *list;
+	struct hlist_node *node;
+	struct hpte_cache *pte;
+
+	/* Find the list of entries in the map */
+	list = &vcpu->arch.hpte_hash_pte_long[
+			kvmppc_mmu_hash_pte_long(guest_ea)];
 
+	rcu_read_lock();
+
+	/* Check the list for matching entries and invalidate */
+	hlist_for_each_entry_rcu(pte, node, list, list_pte_long)
+		if ((pte->pte.eaddr & 0x0ffff000UL) == guest_ea)
+			invalidate_pte(vcpu, pte);
+
+	rcu_read_unlock();
+}
+
+void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
+{
 	dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx & 0x%lx\n",
 		    vcpu->arch.hpte_cache_count, guest_ea, ea_mask);
 
@@ -164,9 +194,7 @@ void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
 		kvmppc_mmu_pte_flush_page(vcpu, guest_ea);
 		break;
 	case 0x0ffff000:
-		/* 32-bit flush w/o segment, go through all possible segments */
-		for (i = 0; i < 0x100000000ULL; i += 0x10000000ULL)
-			kvmppc_mmu_pte_flush(vcpu, guest_ea | i, ~0xfffUL);
+		kvmppc_mmu_pte_flush_long(vcpu, guest_ea);
 		break;
 	case 0:
 		/* Doing a complete flush -> start from scratch */
@@ -292,6 +320,8 @@ int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
 	/* init hpte lookup hashes */
 	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte,
 				  ARRAY_SIZE(vcpu->arch.hpte_hash_pte));
+	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte_long,
+				  ARRAY_SIZE(vcpu->arch.hpte_hash_pte_long));
 	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte,
 				  ARRAY_SIZE(vcpu->arch.hpte_hash_vpte));
 	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long,
-- 
cgit v0.10.2


From 0e677903878ef90e09a45507255c0b1e36166064 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 15:04:20 +0200
Subject: KVM: PPC: Use MSR_DR for external load_up

Book3S_32 requires MSR_DR to be disabled during load_up_xxx while on Book3S_64
it's supposed to be enabled. I misread the code and disabled it in both cases,
potentially breaking the PS3 which has a really small RMA.

This patch makes KVM work on the PS3 again.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index 506d5c3..229d3d6 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -202,8 +202,25 @@ _GLOBAL(kvmppc_rmcall)
 
 #if defined(CONFIG_PPC_BOOK3S_32)
 #define STACK_LR	INT_FRAME_SIZE+4
+
+/* load_up_xxx have to run with MSR_DR=0 on Book3S_32 */
+#define MSR_EXT_START						\
+	PPC_STL	r20, _NIP(r1);					\
+	mfmsr	r20;						\
+	LOAD_REG_IMMEDIATE(r3, MSR_DR|MSR_EE);			\
+	andc	r3,r20,r3;		/* Disable DR,EE */	\
+	mtmsr	r3;						\
+	sync
+
+#define MSR_EXT_END						\
+	mtmsr	r20;			/* Enable DR,EE */	\
+	sync;							\
+	PPC_LL	r20, _NIP(r1)
+
 #elif defined(CONFIG_PPC_BOOK3S_64)
 #define STACK_LR	_LINK
+#define MSR_EXT_START
+#define MSR_EXT_END
 #endif
 
 /*
@@ -215,19 +232,12 @@ _GLOBAL(kvmppc_load_up_ ## what);				\
 	PPC_STLU r1, -INT_FRAME_SIZE(r1);			\
 	mflr	r3;						\
 	PPC_STL	r3, STACK_LR(r1);				\
-	PPC_STL	r20, _NIP(r1);					\
-	mfmsr	r20;						\
-	LOAD_REG_IMMEDIATE(r3, MSR_DR|MSR_EE);			\
-	andc	r3,r20,r3;		/* Disable DR,EE */	\
-	mtmsr	r3;						\
-	sync;							\
+	MSR_EXT_START;						\
 								\
 	bl	FUNC(load_up_ ## what);				\
 								\
-	mtmsr	r20;			/* Enable DR,EE */	\
-	sync;							\
+	MSR_EXT_END;						\
 	PPC_LL	r3, STACK_LR(r1);				\
-	PPC_LL	r20, _NIP(r1);					\
 	mtlr	r3;						\
 	addi	r1, r1, INT_FRAME_SIZE;				\
 	blr
-- 
cgit v0.10.2


From 2b05d71fefc3b83e686bead355c6d35e440c4261 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 15:04:21 +0200
Subject: KVM: PPC: Make long relocations be ulong

On Book3S KVM we directly expose some asm pointers to C code as
variables. These need to be relocated and thus break on relocatable
kernels.

To make sure we can at least build, let's mark them as long instead
of u32 where 64bit relocations don't work.

This fixes the following build error:

WARNING: 2 bad relocations^M
> c000000000008590 R_PPC64_ADDR32    .text+0x4000000000008460^M
> c000000000008594 R_PPC64_ADDR32    .text+0x4000000000008598^M

Please keep in mind that actually using KVM on a relocated kernel
might still break. This only fixes the compile problem.

Reported-by: Subrata Modak <subrata@linux.vnet.ibm.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 00cf8b0..f04f516 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -132,8 +132,8 @@ extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
 extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 
-extern u32 kvmppc_trampoline_lowmem;
-extern u32 kvmppc_trampoline_enter;
+extern ulong kvmppc_trampoline_lowmem;
+extern ulong kvmppc_trampoline_enter;
 extern void kvmppc_rmcall(ulong srr0, ulong srr1);
 extern void kvmppc_load_up_fpu(void);
 extern void kvmppc_load_up_altivec(void);
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index 229d3d6..2b9c908 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -252,10 +252,10 @@ define_load_up(vsx)
 
 .global kvmppc_trampoline_lowmem
 kvmppc_trampoline_lowmem:
-	.long kvmppc_handler_lowmem_trampoline - CONFIG_KERNEL_START
+	PPC_LONG kvmppc_handler_lowmem_trampoline - CONFIG_KERNEL_START
 
 .global kvmppc_trampoline_enter
 kvmppc_trampoline_enter:
-	.long kvmppc_handler_trampoline_enter - CONFIG_KERNEL_START
+	PPC_LONG kvmppc_handler_trampoline_enter - CONFIG_KERNEL_START
 
 #include "book3s_segment.S"
-- 
cgit v0.10.2


From a58ddea556f8877ccf7caa046b6d6b32982f5b1d Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 15:04:22 +0200
Subject: KVM: PPC: Move KVM trampolines before __end_interrupts

When using a relocatable kernel we need to make sure that the trampline code
and the interrupt handlers are both copied to low memory. The only way to do
this reliably is to put them in the copied section.

This patch should make relocated kernels work with KVM.

KVM-Stable-Tag
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index f53029a..1667a07 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -299,6 +299,12 @@ slb_miss_user_pseries:
 	b	.				/* prevent spec. execution */
 #endif /* __DISABLED__ */
 
+/* KVM's trampoline code needs to be close to the interrupt handlers */
+
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#include "../kvm/book3s_rmhandlers.S"
+#endif
+
 	.align	7
 	.globl	__end_interrupts
 __end_interrupts:
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index c571cd3..f0dd577 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -166,12 +166,6 @@ exception_marker:
 #include "exceptions-64s.S"
 #endif
 
-/* KVM trampoline code needs to be close to the interrupt handlers */
-
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-#include "../kvm/book3s_rmhandlers.S"
-#endif
-
 _GLOBAL(generic_secondary_thread_init)
 	mr	r24,r3
 
-- 
cgit v0.10.2


From d3ad6243293d92c82530a50c77d71bb0a0a42fdc Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Thu, 5 Aug 2010 16:34:39 +0800
Subject: KVM: x86 emulator: simplify two-byte opcode check

Two-byte opcode always start with 0x0F and the decode flags
of opcode 0xF0 is always 0, so remove dup check.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index ddbad15..a9a4a0b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2375,13 +2375,11 @@ done_prefixes:
 
 	/* Opcode byte(s). */
 	opcode = opcode_table[c->b];
-	if (opcode.flags == 0) {
-		/* Two-byte opcode? */
-		if (c->b == 0x0f) {
-			c->twobyte = 1;
-			c->b = insn_fetch(u8, 1, c->eip);
-			opcode = twobyte_table[c->b];
-		}
+	/* Two-byte opcode? */
+	if (c->b == 0x0f) {
+		c->twobyte = 1;
+		c->b = insn_fetch(u8, 1, c->eip);
+		opcode = twobyte_table[c->b];
 	}
 	c->d = opcode.flags;
 
-- 
cgit v0.10.2


From 160ce1f1a8fe64b3e2686ae73fbf051ccfe7c7ef Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Wed, 4 Aug 2010 05:44:24 +0300
Subject: KVM: x86 emulator: Allow accessing IDT via emulator ops

The patch adds a new member get_idt() to x86_emulate_ops.
It also adds a function to get the idt in order to be used by the emulator.

This is needed for real mode interrupt injection and the emulation of int
instructions.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 1e4a72c..1bbf2b6 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -139,6 +139,7 @@ struct x86_emulate_ops {
 	void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
 	unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu);
 	void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
+	void (*get_idt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
 	ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
 	int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
 	int (*cpl)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 35c0f4e..768197a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3790,6 +3790,11 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
 	kvm_x86_ops->get_gdt(vcpu, dt);
 }
 
+static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
+{
+	kvm_x86_ops->get_idt(vcpu, dt);
+}
+
 static unsigned long emulator_get_cached_segment_base(int seg,
 						      struct kvm_vcpu *vcpu)
 {
@@ -3883,6 +3888,7 @@ static struct x86_emulate_ops emulate_ops = {
 	.set_segment_selector = emulator_set_segment_selector,
 	.get_cached_segment_base = emulator_get_cached_segment_base,
 	.get_gdt             = emulator_get_gdt,
+	.get_idt	     = emulator_get_idt,
 	.get_cr              = emulator_get_cr,
 	.set_cr              = emulator_set_cr,
 	.cpl                 = emulator_get_cpl,
-- 
cgit v0.10.2


From 6e154e56b4d7a6a28c54f0984e13d3f8defc4755 Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Wed, 4 Aug 2010 14:38:06 +0300
Subject: KVM: x86 emulator: Add into, int, and int3 instructions (opcodes
 0xcc-0xce)

This adds support for int instructions to the emulator.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a9a4a0b..5205d68 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1180,6 +1180,67 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
 	return rc;
 }
 
+int emulate_int_real(struct x86_emulate_ctxt *ctxt,
+			       struct x86_emulate_ops *ops, int irq)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int rc = X86EMUL_CONTINUE;
+	struct desc_ptr dt;
+	gva_t cs_addr;
+	gva_t eip_addr;
+	u16 cs, eip;
+	u32 err;
+
+	/* TODO: Add limit checks */
+	c->src.val = ctxt->eflags;
+	emulate_push(ctxt, ops);
+
+	ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
+
+	c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+	emulate_push(ctxt, ops);
+
+	c->src.val = c->eip;
+	emulate_push(ctxt, ops);
+
+	ops->get_idt(&dt, ctxt->vcpu);
+
+	eip_addr = dt.address + (irq << 2);
+	cs_addr = dt.address + (irq << 2) + 2;
+
+	rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	c->eip = eip;
+
+	return rc;
+}
+
+static int emulate_int(struct x86_emulate_ctxt *ctxt,
+		       struct x86_emulate_ops *ops, int irq)
+{
+	switch(ctxt->mode) {
+	case X86EMUL_MODE_REAL:
+		return emulate_int_real(ctxt, ops, irq);
+	case X86EMUL_MODE_VM86:
+	case X86EMUL_MODE_PROT16:
+	case X86EMUL_MODE_PROT32:
+	case X86EMUL_MODE_PROT64:
+	default:
+		/* Protected mode interrupts unimplemented yet */
+		return X86EMUL_UNHANDLEABLE;
+	}
+}
+
 static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
 			     struct x86_emulate_ops *ops)
 {
@@ -2616,6 +2677,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 	int saved_dst_type = c->dst.type;
+	int irq; /* Used for int 3, int, and into */
 
 	ctxt->decode.mem_read.pos = 0;
 
@@ -2960,6 +3022,22 @@ special_insn:
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
+	case 0xcc:		/* int3 */
+		irq = 3;
+		goto do_interrupt;
+	case 0xcd:		/* int n */
+		irq = c->src.val;
+	do_interrupt:
+		rc = emulate_int(ctxt, ops, irq);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
+		break;
+	case 0xce:		/* into */
+		if (ctxt->eflags & EFLG_OF) {
+			irq = 4;
+			goto do_interrupt;
+		}
+		break;
 	case 0xcf:		/* iret */
 		rc = emulate_iret(ctxt, ops);
 
-- 
cgit v0.10.2


From 06cb704611caf40e531a3835809283f14f5307d5 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Wed, 4 Aug 2010 15:36:53 +0800
Subject: KVM: x86 emulator: use SrcAcc to simplify stos decoding

Use SrcAcc to simplify stos decoding.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5205d68..6c1e4d6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2235,7 +2235,8 @@ static struct opcode opcode_table[256] = {
 	D(ByteOp | SrcSI | DstDI | Mov | String), D(SrcSI | DstDI | Mov | String),
 	D(ByteOp | SrcSI | DstDI | String), D(SrcSI | DstDI | String),
 	/* 0xA8 - 0xAF */
-	D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm), D(ByteOp | DstDI | Mov | String), D(DstDI | Mov | String),
+	D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm),
+	D(ByteOp | SrcAcc | DstDI | Mov | String), D(SrcAcc | DstDI | Mov | String),
 	D(ByteOp | SrcSI | DstAcc | Mov | String), D(SrcSI | DstAcc | Mov | String),
 	D(ByteOp | DstDI | String), D(DstDI | String),
 	/* 0xB0 - 0xB7 */
@@ -2996,8 +2997,6 @@ special_insn:
 	case 0xa8 ... 0xa9:	/* test ax, imm */
 		goto test;
 	case 0xaa ... 0xab:	/* stos */
-		c->dst.val = c->regs[VCPU_REGS_RAX];
-		break;
 	case 0xac ... 0xad:	/* lods */
 		goto mov;
 	case 0xae ... 0xaf:	/* scas */
-- 
cgit v0.10.2


From 36089fed70337f4d96a5c3aa7fadc4095b707f73 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Wed, 4 Aug 2010 15:38:18 +0800
Subject: KVM: x86 emulator: disable writeback when decode dest operand

This patch change to disable writeback when decode dest
operand if the dest type is ImplicitOps or not specified.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 6c1e4d6..e0216eb 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2627,9 +2627,6 @@ done_prefixes:
 
 	/* Decode and fetch the destination operand: register or memory. */
 	switch (c->d & DstMask) {
-	case ImplicitOps:
-		/* Special instructions do their own operand decoding. */
-		return 0;
 	case DstReg:
 		decode_register_operand(&c->dst, c,
 			 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
@@ -2664,6 +2661,11 @@ done_prefixes:
 					 c->regs[VCPU_REGS_RDI]);
 		c->dst.val = 0;
 		break;
+	case ImplicitOps:
+		/* Special instructions do their own operand decoding. */
+	default:
+		c->dst.type = OP_NONE; /* Disable writeback. */
+		return 0;
 	}
 
 done:
@@ -3115,7 +3117,6 @@ special_insn:
 	case 0xf5:	/* cmc */
 		/* complement carry flag from eflags reg */
 		ctxt->eflags ^= EFLG_CF;
-		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xf6 ... 0xf7:	/* Grp3 */
 		if (!emulate_grp3(ctxt, ops))
@@ -3123,16 +3124,13 @@ special_insn:
 		break;
 	case 0xf8: /* clc */
 		ctxt->eflags &= ~EFLG_CF;
-		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xfa: /* cli */
 		if (emulator_bad_iopl(ctxt, ops)) {
 			emulate_gp(ctxt, 0);
 			goto done;
-		} else {
+		} else
 			ctxt->eflags &= ~X86_EFLAGS_IF;
-			c->dst.type = OP_NONE;	/* Disable writeback. */
-		}
 		break;
 	case 0xfb: /* sti */
 		if (emulator_bad_iopl(ctxt, ops)) {
@@ -3141,16 +3139,13 @@ special_insn:
 		} else {
 			ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
 			ctxt->eflags |= X86_EFLAGS_IF;
-			c->dst.type = OP_NONE;	/* Disable writeback. */
 		}
 		break;
 	case 0xfc: /* cld */
 		ctxt->eflags &= ~EFLG_DF;
-		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xfd: /* std */
 		ctxt->eflags |= EFLG_DF;
-		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xfe: /* Grp4 */
 	grp45:
@@ -3287,16 +3282,13 @@ twobyte_insn:
 		break;
 	case 0x06:
 		emulate_clts(ctxt->vcpu);
-		c->dst.type = OP_NONE;
 		break;
 	case 0x09:		/* wbinvd */
 		kvm_emulate_wbinvd(ctxt->vcpu);
-		c->dst.type = OP_NONE;
 		break;
 	case 0x08:		/* invd */
 	case 0x0d:		/* GrpP (prefetch) */
 	case 0x18:		/* Grp16 (prefetch/nop) */
-		c->dst.type = OP_NONE;
 		break;
 	case 0x20: /* mov cr, reg */
 		switch (c->modrm_reg) {
@@ -3349,7 +3341,6 @@ twobyte_insn:
 			goto done;
 		}
 		rc = X86EMUL_CONTINUE;
-		c->dst.type = OP_NONE;
 		break;
 	case 0x32:
 		/* rdmsr */
@@ -3361,7 +3352,6 @@ twobyte_insn:
 			c->regs[VCPU_REGS_RDX] = msr_data >> 32;
 		}
 		rc = X86EMUL_CONTINUE;
-		c->dst.type = OP_NONE;
 		break;
 	case 0x34:		/* sysenter */
 		rc = emulate_sysenter(ctxt, ops);
@@ -3385,7 +3375,6 @@ twobyte_insn:
 	case 0x80 ... 0x8f: /* jnz rel, etc*/
 		if (test_cc(c->b, ctxt->eflags))
 			jmp_rel(c, c->src.val);
-		c->dst.type = OP_NONE;
 		break;
 	case 0xa0:	  /* push fs */
 		emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
-- 
cgit v0.10.2


From c034da8b927dc682fe7944895d67f99f07e3740f Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Wed, 4 Aug 2010 15:38:59 +0800
Subject: KVM: x86 emulator: using SrcOne for instruction d0/d1 decoding

Using SrcOne for instruction d0/d1 decoding.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e0216eb..d711d6a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2251,7 +2251,7 @@ static struct opcode opcode_table[256] = {
 	N, N, N, D(ImplicitOps | Stack),
 	D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps),
 	/* 0xD0 - 0xD7 */
-	D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM),
+	D(ByteOp | DstMem | SrcOne | ModRM), D(DstMem | SrcOne | ModRM),
 	D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM),
 	N, N, N, N,
 	/* 0xD8 - 0xDF */
@@ -3046,7 +3046,6 @@ special_insn:
 			goto done;
 		break;
 	case 0xd0 ... 0xd1:	/* Grp2 */
-		c->src.val = 1;
 		emulate_grp2(ctxt);
 		break;
 	case 0xd2 ... 0xd3:	/* Grp2 */
-- 
cgit v0.10.2


From 8744aa9aad56be756a58126b429f176898631c3f Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Thu, 5 Aug 2010 15:42:49 +0300
Subject: KVM: x86 emulator: Add stc instruction (opcode 0xf9)

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d711d6a..175b416 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2269,7 +2269,7 @@ static struct opcode opcode_table[256] = {
 	N, N, N, N,
 	D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3),
 	/* 0xF8 - 0xFF */
-	D(ImplicitOps), N, D(ImplicitOps), D(ImplicitOps),
+	D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps),
 	D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
 };
 
@@ -3124,6 +3124,9 @@ special_insn:
 	case 0xf8: /* clc */
 		ctxt->eflags &= ~EFLG_CF;
 		break;
+	case 0xf9: /* stc */
+		ctxt->eflags |= EFLG_CF;
+		break;
 	case 0xfa: /* cli */
 		if (emulator_bad_iopl(ctxt, ops)) {
 			emulate_gp(ctxt, 0);
-- 
cgit v0.10.2


From 35c843c4857e2a818d1d951d87c40ee2cf5c1be8 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Mon, 9 Aug 2010 11:34:56 +0800
Subject: KVM: x86 emulator: fix negative bit offset BitOp instruction
 emulation

If bit offset operands is a negative number, BitOp instruction
will return wrong value. This patch fix it.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 175b416..5fc441c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -723,6 +723,22 @@ done:
 	return rc;
 }
 
+static void fetch_bit_operand(struct decode_cache *c)
+{
+	long sv, mask;
+
+	if (c->dst.type == OP_MEM) {
+		mask = ~(c->dst.bytes * 8 - 1);
+
+		if (c->src.bytes == 2)
+			sv = (s16)c->src.val & (s16)mask;
+		else if (c->src.bytes == 4)
+			sv = (s32)c->src.val & (s32)mask;
+
+		c->dst.addr.mem += (sv >> 3);
+	}
+}
+
 static int read_emulated(struct x86_emulate_ctxt *ctxt,
 			 struct x86_emulate_ops *ops,
 			 unsigned long addr, void *dest, unsigned size)
@@ -2638,12 +2654,8 @@ done_prefixes:
 			c->dst.bytes = 8;
 		else
 			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		if (c->dst.type == OP_MEM && (c->d & BitOp)) {
-			unsigned long mask = ~(c->dst.bytes * 8 - 1);
-
-			c->dst.addr.mem = c->dst.addr.mem +
-						   (c->src.val & mask) / 8;
-		}
+		if (c->d & BitOp)
+			fetch_bit_operand(c);
 		c->dst.orig_val = c->dst.val;
 		break;
 	case DstAcc:
-- 
cgit v0.10.2


From 3885f18fe3034a10b3e3923885d70d31ba522844 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Mon, 9 Aug 2010 11:37:37 +0800
Subject: KVM: x86 emulator: do not adjust the address for immediate source

adjust the dst address for a register source but not adjust the
address for an immediate source.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5fc441c..9b81cde 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -727,7 +727,7 @@ static void fetch_bit_operand(struct decode_cache *c)
 {
 	long sv, mask;
 
-	if (c->dst.type == OP_MEM) {
+	if (c->dst.type == OP_MEM && c->src.type == OP_REG) {
 		mask = ~(c->dst.bytes * 8 - 1);
 
 		if (c->src.bytes == 2)
-- 
cgit v0.10.2


From ba7ff2b76dcf05c4681c2648019b8301ada6f3df Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Mon, 9 Aug 2010 11:39:14 +0800
Subject: KVM: x86 emulator: mask group 8 instruction as BitOp

Mask group 8 instruction as BitOp, so we can share the
code for adjust the source operand.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 9b81cde..a9b2b9e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -737,6 +737,9 @@ static void fetch_bit_operand(struct decode_cache *c)
 
 		c->dst.addr.mem += (sv >> 3);
 	}
+
+	/* only subword offset */
+	c->src.val &= (c->dst.bytes << 3) - 1;
 }
 
 static int read_emulated(struct x86_emulate_ctxt *ctxt,
@@ -2336,7 +2339,7 @@ static struct opcode twobyte_table[256] = {
 	    D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xB8 - 0xBF */
 	N, N,
-	G(0, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
+	G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
 	N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov),
 	    D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xC0 - 0xCF */
@@ -3419,8 +3422,6 @@ twobyte_insn:
 		break;
 	case 0xab:
 	      bts:		/* bts */
-		/* only subword offset */
-		c->src.val &= (c->dst.bytes << 3) - 1;
 		emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
 		break;
 	case 0xac: /* shrd imm8, r, r/m */
@@ -3448,8 +3449,6 @@ twobyte_insn:
 		break;
 	case 0xb3:
 	      btr:		/* btr */
-		/* only subword offset */
-		c->src.val &= (c->dst.bytes << 3) - 1;
 		emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
 		break;
 	case 0xb6 ... 0xb7:	/* movzx */
@@ -3471,8 +3470,6 @@ twobyte_insn:
 		break;
 	case 0xbb:
 	      btc:		/* btc */
-		/* only subword offset */
-		c->src.val &= (c->dst.bytes << 3) - 1;
 		emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
 		break;
 	case 0xbe ... 0xbf:	/* movsx */
-- 
cgit v0.10.2


From 3f9f53b0d599aabb03db35208fb31768568ca83f Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Sun, 8 Aug 2010 21:11:37 +0300
Subject: KVM: x86 emulator: Add unary mul, imul, div, and idiv instructions

This adds unary mul, imul, div, and idiv instructions (group 3 r/m 4-7).

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a9b2b9e..f0415ea 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -315,6 +315,31 @@ struct group_dual {
 		}							\
 	} while (0)
 
+#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix)		\
+	do {								\
+		unsigned long _tmp;					\
+									\
+		__asm__ __volatile__ (					\
+			_PRE_EFLAGS("0", "4", "1")			\
+			_op _suffix " %5; "				\
+			_POST_EFLAGS("0", "4", "1")			\
+			: "=m" (_eflags), "=&r" (_tmp),			\
+			  "+a" (_rax), "+d" (_rdx)			\
+			: "i" (EFLAGS_MASK), "m" ((_src).val),		\
+			  "a" (_rax), "d" (_rdx));			\
+	} while (0)
+
+/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
+#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags)			\
+	do {									\
+		switch((_src).bytes) {						\
+		case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \
+		case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx,  _eflags, "w"); break; \
+		case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \
+		case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \
+		}							\
+	} while (0)
+
 /* Fetch next part of the instruction being emulated. */
 #define insn_fetch(_type, _size, _eip)                                  \
 ({	unsigned long _x;						\
@@ -1373,6 +1398,8 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
 			       struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
+	unsigned long *rax = &c->regs[VCPU_REGS_RAX];
+	unsigned long *rdx = &c->regs[VCPU_REGS_RDX];
 
 	switch (c->modrm_reg) {
 	case 0 ... 1:	/* test */
@@ -1384,6 +1411,18 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
 	case 3:	/* neg */
 		emulate_1op("neg", c->dst, ctxt->eflags);
 		break;
+	case 4: /* mul */
+		emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags);
+		break;
+	case 5: /* imul */
+		emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags);
+		break;
+	case 6: /* div */
+		emulate_1op_rax_rdx("div", c->src, *rax, *rdx, ctxt->eflags);
+		break;
+	case 7: /* idiv */
+		emulate_1op_rax_rdx("idiv", c->src, *rax, *rdx, ctxt->eflags);
+		break;
 	default:
 		return 0;
 	}
@@ -2138,7 +2177,7 @@ static struct opcode group1A[] = {
 static struct opcode group3[] = {
 	D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM),
 	D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
-	X4(D(Undefined)),
+	X4(D(SrcMem | ModRM)),
 };
 
 static struct opcode group4[] = {
-- 
cgit v0.10.2


From 8c5eee30a942cb3154f14f12407755ed7da74bbc Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Sun, 8 Aug 2010 21:11:38 +0300
Subject: KVM: x86 emulator: Fix emulate_grp3 return values

This patch lets emulate_grp3() return X86EMUL_* return codes instead
of hardcoded ones.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f0415ea..8617c34 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1424,9 +1424,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
 		emulate_1op_rax_rdx("idiv", c->src, *rax, *rdx, ctxt->eflags);
 		break;
 	default:
-		return 0;
+		return X86EMUL_UNHANDLEABLE;
 	}
-	return 1;
+	return X86EMUL_CONTINUE;
 }
 
 static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
@@ -3172,7 +3172,7 @@ special_insn:
 		ctxt->eflags ^= EFLG_CF;
 		break;
 	case 0xf6 ... 0xf7:	/* Grp3 */
-		if (!emulate_grp3(ctxt, ops))
+		if (emulate_grp3(ctxt, ops) != X86EMUL_CONTINUE)
 			goto cannot_emulate;
 		break;
 	case 0xf8: /* clc */
-- 
cgit v0.10.2


From d9574a25afc3cd7ccd6a0bc05252bb84189e4021 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Tue, 10 Aug 2010 13:48:22 +0800
Subject: KVM: x86 emulator: add bsf/bsr instruction emulation

Add bsf/bsr instruction emulation (opcode 0x0f 0xbc~0xbd)

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8617c34..f6b124f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2379,8 +2379,8 @@ static struct opcode twobyte_table[256] = {
 	/* 0xB8 - 0xBF */
 	N, N,
 	G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
-	N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov),
-	    D(DstReg | SrcMem16 | ModRM | Mov),
+	D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xC0 - 0xCF */
 	N, N, N, D(DstMem | SrcReg | ModRM | Mov),
 	N, N, N, GD(0, &group9),
@@ -3511,6 +3511,30 @@ twobyte_insn:
 	      btc:		/* btc */
 		emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
 		break;
+	case 0xbc: {		/* bsf */
+		u8 zf;
+		__asm__ ("bsf %2, %0; setz %1"
+			 : "=r"(c->dst.val), "=q"(zf)
+			 : "r"(c->src.val));
+		ctxt->eflags &= ~X86_EFLAGS_ZF;
+		if (zf) {
+			ctxt->eflags |= X86_EFLAGS_ZF;
+			c->dst.type = OP_NONE;	/* Disable writeback. */
+		}
+		break;
+	}
+	case 0xbd: {		/* bsr */
+		u8 zf;
+		__asm__ ("bsr %2, %0; setz %1"
+			 : "=r"(c->dst.val), "=q"(zf)
+			 : "r"(c->src.val));
+		ctxt->eflags &= ~X86_EFLAGS_ZF;
+		if (zf) {
+			ctxt->eflags |= X86_EFLAGS_ZF;
+			c->dst.type = OP_NONE;	/* Disable writeback. */
+		}
+		break;
+	}
 	case 0xbe ... 0xbf:	/* movsx */
 		c->dst.bytes = c->op_bytes;
 		c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
-- 
cgit v0.10.2


From 8ec4722dd2aab9b69befb919549ea0a5bfc9e670 Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Mon, 16 Aug 2010 00:47:01 +0300
Subject: KVM: Separate emulation context initialization in a separate function

The code for initializing the emulation context is duplicated at two
locations (emulate_instruction() and kvm_task_switch()). Separate it
in a separate function and call it from there.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 768197a..c0004eb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3931,6 +3931,28 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
 		kvm_queue_exception(vcpu, ctxt->exception);
 }
 
+static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
+{
+	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+	int cs_db, cs_l;
+
+	cache_all_regs(vcpu);
+
+	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+
+	vcpu->arch.emulate_ctxt.vcpu = vcpu;
+	vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+	vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
+	vcpu->arch.emulate_ctxt.mode =
+		(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
+		(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
+		? X86EMUL_MODE_VM86 : cs_l
+		? X86EMUL_MODE_PROT64 :	cs_db
+		? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+	memset(c, 0, sizeof(struct decode_cache));
+	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+}
+
 static int handle_emulation_failure(struct kvm_vcpu *vcpu)
 {
 	++vcpu->stat.insn_emulation_fail;
@@ -3987,20 +4009,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	cache_all_regs(vcpu);
 
 	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
-		int cs_db, cs_l;
-		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-
-		vcpu->arch.emulate_ctxt.vcpu = vcpu;
-		vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
-		vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
-		vcpu->arch.emulate_ctxt.mode =
-			(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
-			(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
-			? X86EMUL_MODE_VM86 : cs_l
-			? X86EMUL_MODE_PROT64 :	cs_db
-			? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
-		memset(c, 0, sizeof(struct decode_cache));
-		memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+		init_emulate_ctxt(vcpu);
 		vcpu->arch.emulate_ctxt.interruptibility = 0;
 		vcpu->arch.emulate_ctxt.exception = -1;
 		vcpu->arch.emulate_ctxt.perm_ok = false;
@@ -5052,22 +5061,9 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 		    bool has_error_code, u32 error_code)
 {
 	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
-	int cs_db, cs_l, ret;
-	cache_all_regs(vcpu);
-
-	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+	int ret;
 
-	vcpu->arch.emulate_ctxt.vcpu = vcpu;
-	vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
-	vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
-	vcpu->arch.emulate_ctxt.mode =
-		(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
-		(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
-		? X86EMUL_MODE_VM86 : cs_l
-		? X86EMUL_MODE_PROT64 :	cs_db
-		? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
-	memset(c, 0, sizeof(struct decode_cache));
-	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+	init_emulate_ctxt(vcpu);
 
 	ret = emulator_task_switch(&vcpu->arch.emulate_ctxt,
 				   tss_selector, reason, has_error_code,
-- 
cgit v0.10.2


From 646bab55a278ceb1cf43b1f80d3dd468be62a421 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Tue, 17 Aug 2010 10:08:52 +0800
Subject: KVM: PPC: fix leakage of error page in kvmppc_patch_dcbz()

Add kvm_release_page_clean() after is_error_page() to avoid
leakage of error page.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index eee97b5..7656b6d 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -455,8 +455,10 @@ static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
 	int i;
 
 	hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
-	if (is_error_page(hpage))
+	if (is_error_page(hpage)) {
+		kvm_release_page_clean(hpage);
 		return;
+	}
 
 	hpage_offset = pte->raddr & ~PAGE_MASK;
 	hpage_offset &= ~0xFFFULL;
-- 
cgit v0.10.2


From 31be40b3985f09c0c89b9e28a8206df32adba842 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Tue, 17 Aug 2010 09:17:30 +0800
Subject: KVM: x86 emulator: put register operand write back to a function

Introduce function write_register_operand() to write back the
register operand.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f6b124f..0037130 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1020,6 +1020,25 @@ exception:
 	return X86EMUL_PROPAGATE_FAULT;
 }
 
+static void write_register_operand(struct operand *op)
+{
+	/* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
+	switch (op->bytes) {
+	case 1:
+		*(u8 *)op->addr.reg = (u8)op->val;
+		break;
+	case 2:
+		*(u16 *)op->addr.reg = (u16)op->val;
+		break;
+	case 4:
+		*op->addr.reg = (u32)op->val;
+		break;	/* 64b: zero-extend */
+	case 8:
+		*op->addr.reg = op->val;
+		break;
+	}
+}
+
 static inline int writeback(struct x86_emulate_ctxt *ctxt,
 			    struct x86_emulate_ops *ops)
 {
@@ -1029,23 +1048,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 
 	switch (c->dst.type) {
 	case OP_REG:
-		/* The 4-byte case *is* correct:
-		 * in 64-bit mode we zero-extend.
-		 */
-		switch (c->dst.bytes) {
-		case 1:
-			*(u8 *)c->dst.addr.reg = (u8)c->dst.val;
-			break;
-		case 2:
-			*(u16 *)c->dst.addr.reg = (u16)c->dst.val;
-			break;
-		case 4:
-			*c->dst.addr.reg = (u32)c->dst.val;
-			break;	/* 64b: zero-ext */
-		case 8:
-			*c->dst.addr.reg = c->dst.val;
-			break;
-		}
+		write_register_operand(&c->dst);
 		break;
 	case OP_MEM:
 		if (c->lock_prefix)
@@ -2970,25 +2973,13 @@ special_insn:
 	case 0x86 ... 0x87:	/* xchg */
 	xchg:
 		/* Write back the register source. */
-		switch (c->dst.bytes) {
-		case 1:
-			*(u8 *) c->src.addr.reg = (u8) c->dst.val;
-			break;
-		case 2:
-			*(u16 *) c->src.addr.reg = (u16) c->dst.val;
-			break;
-		case 4:
-			*c->src.addr.reg = (u32) c->dst.val;
-			break;	/* 64b reg: zero-extend */
-		case 8:
-			*c->src.addr.reg = c->dst.val;
-			break;
-		}
+		c->src.val = c->dst.val;
+		write_register_operand(&c->src);
 		/*
 		 * Write back the memory destination with implicit LOCK
 		 * prefix.
 		 */
-		c->dst.val = c->src.val;
+		c->dst.val = c->src.orig_val;
 		c->lock_prefix = 1;
 		break;
 	case 0x88 ... 0x8b:	/* mov */
-- 
cgit v0.10.2


From 92f738a52b53dc13b5dd5753634bdb8c59ac9815 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Tue, 17 Aug 2010 09:19:34 +0800
Subject: KVM: x86 emulator: add XADD instruction emulation

Add XADD instruction emulation (opcode 0x0f 0xc0~0xc1)

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0037130..0c08bff 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2385,7 +2385,8 @@ static struct opcode twobyte_table[256] = {
 	D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
 	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xC0 - 0xCF */
-	N, N, N, D(DstMem | SrcReg | ModRM | Mov),
+	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
+	N, D(DstMem | SrcReg | ModRM | Mov),
 	N, N, N, GD(0, &group9),
 	N, N, N, N, N, N, N, N,
 	/* 0xD0 - 0xDF */
@@ -3531,6 +3532,12 @@ twobyte_insn:
 		c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
 							(s16) c->src.val;
 		break;
+	case 0xc0 ... 0xc1:	/* xadd */
+		emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
+		/* Write back the register source. */
+		c->src.val = c->dst.orig_val;
+		write_register_operand(&c->src);
+		break;
 	case 0xc3:		/* movnti */
 		c->dst.bytes = c->op_bytes;
 		c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
-- 
cgit v0.10.2


From 9cf4c4fcb59704f0c50f6aa7933399db5d2edf1a Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Mon, 16 Aug 2010 17:51:20 +0200
Subject: KVM: x86: explain 'no-kvmclock' kernel parameter

no-kvmclock kernel parameter is missing its explanation in
Documentation/kernel-parameters.txt. Add it.

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 8dd7248..6bd2375 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1693,6 +1693,8 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	nojitter	[IA64] Disables jitter checking for ITC timers.
 
+	no-kvmclock	[X86,KVM] Disable paravirtualized KVM clock driver
+
 	nolapic		[X86-32,APIC] Do not enable or use the local APIC.
 
 	nolapic_timer	[X86-32,APIC] Do not use the local APIC timer.
-- 
cgit v0.10.2


From ee45b58efebc826ea2ade310f6e311702d4a5ab9 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Fri, 6 Aug 2010 17:10:07 +0800
Subject: KVM: x86 emulator: add setcc instruction emulation

Add setcc instruction emulation (opcode 0x0f 0x90~0x9f)

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0c08bff..df349f3 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2362,7 +2362,7 @@ static struct opcode twobyte_table[256] = {
 	/* 0x80 - 0x8F */
 	X16(D(SrcImm)),
 	/* 0x90 - 0x9F */
-	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+	X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
 	/* 0xA0 - 0xA7 */
 	D(ImplicitOps | Stack), D(ImplicitOps | Stack),
 	N, D(DstMem | SrcReg | ModRM | BitOp),
@@ -3424,6 +3424,9 @@ twobyte_insn:
 		if (test_cc(c->b, ctxt->eflags))
 			jmp_rel(c, c->src.val);
 		break;
+	case 0x90 ... 0x9f:     /* setcc r/m8 */
+		c->dst.val = test_cc(c->b, ctxt->eflags);
+		break;
 	case 0xa0:	  /* push fs */
 		emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
 		break;
-- 
cgit v0.10.2


From c483c02ad35256206d6c45d7170fef1e33a43e9c Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Fri, 6 Aug 2010 15:36:36 +0800
Subject: KVM: x86 emulator: remove useless label from x86_emulate_insn()

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index df349f3..78541e8 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2787,16 +2787,12 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 		c->eip = ctxt->eip;
 	}
 
-	if (c->src.type == OP_MEM) {
-		if (c->d & NoAccess)
-			goto no_fetch;
+	if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
 		rc = read_emulated(ctxt, ops, c->src.addr.mem,
 					c->src.valptr, c->src.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		c->src.orig_val64 = c->src.val64;
-	no_fetch:
-		;
 	}
 
 	if (c->src2.type == OP_MEM) {
-- 
cgit v0.10.2


From 943858e27544cd10e6095093a40be911a31892b1 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Fri, 6 Aug 2010 11:36:51 +0800
Subject: KVM: x86 emulator: introduce DstImmUByte for dst operand decode

Introduce DstImmUByte for dst operand decode, which
will be used for out instruction.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 78541e8..dc074a0 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -54,6 +54,7 @@
 #define DstAcc      (4<<1)	/* Destination Accumulator */
 #define DstDI       (5<<1)	/* Destination is in ES:(E)DI */
 #define DstMem64    (6<<1)	/* 64bit memory operand */
+#define DstImmUByte (7<<1)	/* 8-bit unsigned immediate operand */
 #define DstMask     (7<<1)
 /* Source operand type. */
 #define SrcNone     (0<<4)	/* No source operand. */
@@ -2693,6 +2694,12 @@ done_prefixes:
 		decode_register_operand(&c->dst, c,
 			 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
 		break;
+	case DstImmUByte:
+		c->dst.type = OP_IMM;
+		c->dst.addr.mem = c->eip;
+		c->dst.bytes = 1;
+		c->dst.val = insn_fetch(u8, 1, c->eip);
+		break;
 	case DstMem:
 	case DstMem64:
 		c->dst = memop;
-- 
cgit v0.10.2


From 41167be544603e077b866a2922737556dc2294e8 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Fri, 6 Aug 2010 11:45:12 +0800
Subject: KVM: x86 emulator: change OUT instruction to use dst instead of src

Change OUT instruction to use dst instead of src, so we can
reuse those code for all out instructions.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index dc074a0..8e12e1b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2321,12 +2321,12 @@ static struct opcode opcode_table[256] = {
 	/* 0xE0 - 0xE7 */
 	N, N, N, N,
 	D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc),
-	D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc),
+	D(ByteOp | SrcAcc | DstImmUByte), D(SrcAcc | DstImmUByte),
 	/* 0xE8 - 0xEF */
 	D(SrcImm | Stack), D(SrcImm | ImplicitOps),
 	D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
 	D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc),
-	D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc),
+	D(ByteOp | SrcAcc | ImplicitOps), D(SrcAcc | ImplicitOps),
 	/* 0xF0 - 0xF7 */
 	N, N, N, N,
 	D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3),
@@ -3148,15 +3148,16 @@ special_insn:
 		break;
 	case 0xee: /* out dx,al */
 	case 0xef: /* out dx,(e/r)ax */
-		c->src.val = c->regs[VCPU_REGS_RDX];
+		c->dst.val = c->regs[VCPU_REGS_RDX];
 	do_io_out:
-		c->dst.bytes = min(c->dst.bytes, 4u);
-		if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
+		c->src.bytes = min(c->src.bytes, 4u);
+		if (!emulator_io_permited(ctxt, ops, c->dst.val,
+					  c->src.bytes)) {
 			emulate_gp(ctxt, 0);
 			goto done;
 		}
-		ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
-				      ctxt->vcpu);
+		ops->pio_out_emulated(c->src.bytes, c->dst.val,
+				      &c->src.val, 1, ctxt->vcpu);
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xf4:              /* hlt */
-- 
cgit v0.10.2


From a13a63faa6237001ed80d4f4051fc028dace10d9 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Fri, 6 Aug 2010 11:46:12 +0800
Subject: KVM: x86 emulator: remove dup code of in/out instruction

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8e12e1b..cffe7c2 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2923,28 +2923,12 @@ special_insn:
 		break;
 	case 0x6c:		/* insb */
 	case 0x6d:		/* insw/insd */
-		c->dst.bytes = min(c->dst.bytes, 4u);
-		if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
-					  c->dst.bytes)) {
-			emulate_gp(ctxt, 0);
-			goto done;
-		}
-		if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
-				     c->regs[VCPU_REGS_RDX], &c->dst.val))
-			goto done; /* IO is needed, skip writeback */
-		break;
+		c->src.val = c->regs[VCPU_REGS_RDX];
+		goto do_io_in;
 	case 0x6e:		/* outsb */
 	case 0x6f:		/* outsw/outsd */
-		c->src.bytes = min(c->src.bytes, 4u);
-		if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
-					  c->src.bytes)) {
-			emulate_gp(ctxt, 0);
-			goto done;
-		}
-		ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
-				      &c->src.val, 1, ctxt->vcpu);
-
-		c->dst.type = OP_NONE; /* nothing to writeback */
+		c->dst.val = c->regs[VCPU_REGS_RDX];
+		goto do_io_out;
 		break;
 	case 0x70 ... 0x7f: /* jcc (short) */
 		if (test_cc(c->b, ctxt->eflags))
-- 
cgit v0.10.2


From 5c56e1cf7a758c4772e2470b4346a8219ec7f44e Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 17 Aug 2010 11:17:51 +0300
Subject: KVM: x86 emulator: fix INTn emulation not pushing EFLAGS and CS

emulate_push() only schedules a push; it doesn't actually push anything.
Call writeback() to flush out the write.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index cffe7c2..b89a20e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1232,7 +1232,7 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
 			       struct x86_emulate_ops *ops, int irq)
 {
 	struct decode_cache *c = &ctxt->decode;
-	int rc = X86EMUL_CONTINUE;
+	int rc;
 	struct desc_ptr dt;
 	gva_t cs_addr;
 	gva_t eip_addr;
@@ -1242,14 +1242,25 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
 	/* TODO: Add limit checks */
 	c->src.val = ctxt->eflags;
 	emulate_push(ctxt, ops);
+	rc = writeback(ctxt, ops);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
 
 	ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
 
 	c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
 	emulate_push(ctxt, ops);
+	rc = writeback(ctxt, ops);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
 
 	c->src.val = c->eip;
 	emulate_push(ctxt, ops);
+	rc = writeback(ctxt, ops);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	c->dst.type = OP_NONE;
 
 	ops->get_idt(&dt, ctxt->vcpu);
 
-- 
cgit v0.10.2


From f6b33fc5046642b669c3197bf08639172e4cffad Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 17 Aug 2010 11:20:37 +0300
Subject: KVM: x86 emulator: implement SCAS (opcodes AE, AF)

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b89a20e..09c9210 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2311,7 +2311,7 @@ static struct opcode opcode_table[256] = {
 	D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm),
 	D(ByteOp | SrcAcc | DstDI | Mov | String), D(SrcAcc | DstDI | Mov | String),
 	D(ByteOp | SrcSI | DstAcc | Mov | String), D(SrcSI | DstAcc | Mov | String),
-	D(ByteOp | DstDI | String), D(DstDI | String),
+	D(ByteOp | SrcAcc | DstDI | String), D(SrcAcc | DstDI | String),
 	/* 0xB0 - 0xB7 */
 	X8(D(ByteOp | DstReg | SrcImm | Mov)),
 	/* 0xB8 - 0xBF */
@@ -3046,8 +3046,7 @@ special_insn:
 	case 0xac ... 0xad:	/* lods */
 		goto mov;
 	case 0xae ... 0xaf:	/* scas */
-		DPRINTF("Urk! I don't handle SCAS.\n");
-		goto cannot_emulate;
+		goto cmp;
 	case 0xb0 ... 0xbf: /* mov r, imm */
 		goto mov;
 	case 0xc0 ... 0xc1:
-- 
cgit v0.10.2


From 0fa6ccbd281221bc7d46aff82d846e1f4c1985df Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 17 Aug 2010 11:22:17 +0300
Subject: KVM: x86 emulator: fix REPZ/REPNZ termination condition

EFLAGS.ZF needs to be checked after each iteration, not before.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 09c9210..aab62d5 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2781,28 +2781,10 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 		ctxt->restart = true;
 		/* All REP prefixes have the same first termination condition */
 		if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
-		string_done:
 			ctxt->restart = false;
 			ctxt->eip = c->eip;
 			goto done;
 		}
-		/* The second termination condition only applies for REPE
-		 * and REPNE. Test if the repeat string operation prefix is
-		 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
-		 * corresponding termination condition according to:
-		 * 	- if REPE/REPZ and ZF = 0 then done
-		 * 	- if REPNE/REPNZ and ZF = 1 then done
-		 */
-		if ((c->b == 0xa6) || (c->b == 0xa7) ||
-		    (c->b == 0xae) || (c->b == 0xaf)) {
-			if ((c->rep_prefix == REPE_PREFIX) &&
-			    ((ctxt->eflags & EFLG_ZF) == 0))
-				goto string_done;
-			if ((c->rep_prefix == REPNE_PREFIX) &&
-			    ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))
-				goto string_done;
-		}
-		c->eip = ctxt->eip;
 	}
 
 	if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
@@ -3229,20 +3211,37 @@ writeback:
 	if (c->rep_prefix && (c->d & String)) {
 		struct read_cache *rc = &ctxt->decode.io_read;
 		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
+		/* The second termination condition only applies for REPE
+		 * and REPNE. Test if the repeat string operation prefix is
+		 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
+		 * corresponding termination condition according to:
+		 * 	- if REPE/REPZ and ZF = 0 then done
+		 * 	- if REPNE/REPNZ and ZF = 1 then done
+		 */
+		if (((c->b == 0xa6) || (c->b == 0xa7) ||
+		     (c->b == 0xae) || (c->b == 0xaf))
+		    && (((c->rep_prefix == REPE_PREFIX) &&
+			 ((ctxt->eflags & EFLG_ZF) == 0))
+			|| ((c->rep_prefix == REPNE_PREFIX) &&
+			    ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
+			ctxt->restart = false;
 		/*
 		 * Re-enter guest when pio read ahead buffer is empty or,
 		 * if it is not used, after each 1024 iteration.
 		 */
-		if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) ||
-		    (rc->end != 0 && rc->end == rc->pos))
+		else if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) ||
+			 (rc->end != 0 && rc->end == rc->pos)) {
 			ctxt->restart = false;
+			c->eip = ctxt->eip;
+		}
 	}
 	/*
 	 * reset read cache here in case string instruction is restared
 	 * without decoding
 	 */
 	ctxt->decode.mem_read.end = 0;
-	ctxt->eip = c->eip;
+	if (!ctxt->restart)
+		ctxt->eip = c->eip;
 
 done:
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
-- 
cgit v0.10.2


From e8b6fa70e3545f0afd63434dbd0c5220d47205f6 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Wed, 18 Aug 2010 16:43:13 +0800
Subject: KVM: x86 emulator: add CBW/CWDE/CDQE instruction emulation

Add CBW/CWDE/CDQE instruction emulation.(opcode 0x98)
Used by FreeBSD's boot loader.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index aab62d5..312dda5 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2300,7 +2300,7 @@ static struct opcode opcode_table[256] = {
 	/* 0x90 - 0x97 */
 	X8(D(SrcAcc | DstReg)),
 	/* 0x98 - 0x9F */
-	N, N, D(SrcImmFAddr | No64), N,
+	D(DstAcc | SrcNone), N, D(SrcImmFAddr | No64), N,
 	D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N,
 	/* 0xA0 - 0xA7 */
 	D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs),
@@ -3003,6 +3003,13 @@ special_insn:
 		if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX])
 			break;
 		goto xchg;
+	case 0x98: /* cbw/cwde/cdqe */
+		switch (c->op_bytes) {
+		case 2: c->dst.val = (s8)c->dst.val; break;
+		case 4: c->dst.val = (s16)c->dst.val; break;
+		case 8: c->dst.val = (s32)c->dst.val; break;
+		}
+		break;
 	case 0x9c: /* pushf */
 		c->src.val =  (unsigned long) ctxt->eflags;
 		emulate_push(ctxt, ops);
-- 
cgit v0.10.2


From f2f31845341d22e4f20438b05e83d58e71b723b5 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Wed, 18 Aug 2010 16:38:21 +0800
Subject: KVM: x86 emulator: add LOOP/LOOPcc instruction emulation

Add LOOP/LOOPcc instruction emulation (opcode 0xe0~0xe2).

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 312dda5..2f816ed 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2330,7 +2330,7 @@ static struct opcode opcode_table[256] = {
 	/* 0xD8 - 0xDF */
 	N, N, N, N, N, N, N, N,
 	/* 0xE0 - 0xE7 */
-	N, N, N, N,
+	X3(D(SrcImmByte)), N,
 	D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc),
 	D(ByteOp | SrcAcc | DstImmUByte), D(SrcAcc | DstImmUByte),
 	/* 0xE8 - 0xEF */
@@ -3084,6 +3084,12 @@ special_insn:
 		c->src.val = c->regs[VCPU_REGS_RCX];
 		emulate_grp2(ctxt);
 		break;
+	case 0xe0 ... 0xe2:	/* loop/loopz/loopnz */
+		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
+		if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 &&
+		    (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags)))
+			jmp_rel(c, c->src.val);
+		break;
 	case 0xe4: 	/* inb */
 	case 0xe5: 	/* in */
 		goto do_io_in;
-- 
cgit v0.10.2


From b3b3d25a12986fb08666823db3e9a74649a71925 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 16 Aug 2010 17:49:52 +0300
Subject: KVM: x86 emulator: pass destination type to ____emulate_2op()

We'll need it later so we can use a register for the destination.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 2f816ed..7818c91 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -194,13 +194,13 @@ struct group_dual {
 #define ON64(x)
 #endif
 
-#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix)	\
+#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \
 	do {								\
 		__asm__ __volatile__ (					\
 			_PRE_EFLAGS("0", "4", "2")			\
 			_op _suffix " %"_x"3,%1; "			\
 			_POST_EFLAGS("0", "4", "2")			\
-			: "=m" (_eflags), "=m" ((_dst).val),		\
+			: "=m" (_eflags), "=m" (*(_dsttype*)&(_dst).val),\
 			  "=&r" (_tmp)					\
 			: _y ((_src).val), "i" (EFLAGS_MASK));		\
 	} while (0)
@@ -213,13 +213,13 @@ struct group_dual {
 									\
 		switch ((_dst).bytes) {					\
 		case 2:							\
-			____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \
+			____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\
 			break;						\
 		case 4:							\
-			____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \
+			____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\
 			break;						\
 		case 8:							\
-			ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \
+			ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \
 			break;						\
 		}							\
 	} while (0)
@@ -229,7 +229,7 @@ struct group_dual {
 		unsigned long _tmp;					     \
 		switch ((_dst).bytes) {				             \
 		case 1:							     \
-			____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b");  \
+			____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \
 			break;						     \
 		default:						     \
 			__emulate_2op_nobyte(_op, _src, _dst, _eflags,	     \
-- 
cgit v0.10.2


From fb2c264105c64511dbd1a7488b482960895aace4 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 16 Aug 2010 17:50:56 +0300
Subject: KVM: x86 emulator: Use a register for ____emulate_2op() destination

Most x86 two operand instructions allow the destination to be a memory operand,
but IMUL (for example) requires that the destination be a register.  Change
____emulate_2op() to take a register for both source and destination so we
can invoke IMUL.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7818c91..81b0f88 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -200,7 +200,7 @@ struct group_dual {
 			_PRE_EFLAGS("0", "4", "2")			\
 			_op _suffix " %"_x"3,%1; "			\
 			_POST_EFLAGS("0", "4", "2")			\
-			: "=m" (_eflags), "=m" (*(_dsttype*)&(_dst).val),\
+			: "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\
 			  "=&r" (_tmp)					\
 			: _y ((_src).val), "i" (EFLAGS_MASK));		\
 	} while (0)
-- 
cgit v0.10.2


From 7af04fc05cc185869271927eb470de3d25064b4a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 18 Aug 2010 14:16:35 +0300
Subject: KVM: x86 emulator: implement DAS (opcode 2F)

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 81b0f88..83ded7c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2175,6 +2175,45 @@ static int em_push(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_das(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+	u8 al, old_al;
+	bool af, cf, old_cf;
+
+	cf = ctxt->eflags & X86_EFLAGS_CF;
+	al = c->dst.val;
+
+	old_al = al;
+	old_cf = cf;
+	cf = false;
+	af = ctxt->eflags & X86_EFLAGS_AF;
+	if ((al & 0x0f) > 9 || af) {
+		al -= 6;
+		cf = old_cf | (al >= 250);
+		af = true;
+	} else {
+		af = false;
+	}
+	if (old_al > 0x99 || old_cf) {
+		al -= 0x60;
+		cf = true;
+	}
+
+	c->dst.val = al;
+	/* Set PF, ZF, SF */
+	c->src.type = OP_IMM;
+	c->src.val = 0;
+	c->src.bytes = 1;
+	emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
+	ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
+	if (cf)
+		ctxt->eflags |= X86_EFLAGS_CF;
+	if (af)
+		ctxt->eflags |= X86_EFLAGS_AF;
+	return X86EMUL_CONTINUE;
+}
+
 #define D(_y) { .flags = (_y) }
 #define N    D(0)
 #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
@@ -2258,7 +2297,8 @@ static struct opcode opcode_table[256] = {
 	/* 0x28 - 0x2F */
 	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
 	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
-	D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N,
+	D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm),
+	N, I(ByteOp | DstAcc | No64, em_das),
 	/* 0x30 - 0x37 */
 	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
 	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
-- 
cgit v0.10.2


From 0ef753b8c323f5b8d75d7dc57ceef6b35982afdb Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 18 Aug 2010 14:51:45 +0300
Subject: KVM: x86 emulator: implement CALL FAR (FF /3)

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 83ded7c..3133577 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2214,6 +2214,40 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_call_far(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+	u16 sel, old_cs;
+	ulong old_eip;
+	int rc;
+
+	old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+	old_eip = c->eip;
+
+	memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+	if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS))
+		return X86EMUL_CONTINUE;
+
+	c->eip = 0;
+	memcpy(&c->eip, c->src.valptr, c->op_bytes);
+
+	c->src.val = old_cs;
+	emulate_push(ctxt, ctxt->ops);
+	rc = writeback(ctxt, ctxt->ops);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	c->src.val = old_eip;
+	emulate_push(ctxt, ctxt->ops);
+	rc = writeback(ctxt, ctxt->ops);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	c->dst.type = OP_NONE;
+
+	return X86EMUL_CONTINUE;
+}
+
 #define D(_y) { .flags = (_y) }
 #define N    D(0)
 #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
@@ -2241,7 +2275,8 @@ static struct opcode group4[] = {
 
 static struct opcode group5[] = {
 	D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
-	D(SrcMem | ModRM | Stack), N,
+	D(SrcMem | ModRM | Stack),
+	I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far),
 	D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps),
 	D(SrcMem | ModRM | Stack), N,
 };
-- 
cgit v0.10.2


From b250e605895d02cede78922d034f7825af72a8b5 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 18 Aug 2010 15:11:24 +0300
Subject: KVM: x86 emulator: add SrcImmU16 operand type

Used for RET NEAR instructions.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 3133577..db80e28 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -72,6 +72,7 @@
 #define SrcImmFAddr (0xb<<4)	/* Source is immediate far address */
 #define SrcMemFAddr (0xc<<4)	/* Source is far address in memory */
 #define SrcAcc      (0xd<<4)	/* Source Accumulator */
+#define SrcImmU16   (0xe<<4)    /* Immediate operand, unsigned, 16 bits */
 #define SrcMask     (0xf<<4)
 /* Generic ModRM decode. */
 #define ModRM       (1<<8)
@@ -2678,13 +2679,17 @@ done_prefixes:
 	srcmem_common:
 		c->src = memop;
 		break;
+	case SrcImmU16:
+		c->src.bytes = 2;
+		goto srcimm;
 	case SrcImm:
 	case SrcImmU:
-		c->src.type = OP_IMM;
-		c->src.addr.mem = c->eip;
 		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
 		if (c->src.bytes == 8)
 			c->src.bytes = 4;
+	srcimm:
+		c->src.type = OP_IMM;
+		c->src.addr.mem = c->eip;
 		/* NB. Immediates are sign-extended as necessary. */
 		switch (c->src.bytes) {
 		case 1:
@@ -2697,7 +2702,8 @@ done_prefixes:
 			c->src.val = insn_fetch(s32, 4, c->eip);
 			break;
 		}
-		if ((c->d & SrcMask) == SrcImmU) {
+		if ((c->d & SrcMask) == SrcImmU
+		    || (c->d & SrcMask) == SrcImmU16) {
 			switch (c->src.bytes) {
 			case 1:
 				c->src.val &= 0xff;
-- 
cgit v0.10.2


From 40ece7c7297da90e54e147d3bfbb4531f9fbc570 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 18 Aug 2010 15:12:09 +0300
Subject: KVM: x86 emulator: implement RET imm16 (opcode C2)

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index db80e28..9e58f50 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2249,6 +2249,21 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int rc;
+
+	c->dst.type = OP_REG;
+	c->dst.addr.reg = &c->eip;
+	c->dst.bytes = c->op_bytes;
+	rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+	register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val);
+	return X86EMUL_CONTINUE;
+}
+
 #define D(_y) { .flags = (_y) }
 #define N    D(0)
 #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
@@ -2394,7 +2409,9 @@ static struct opcode opcode_table[256] = {
 	X8(D(DstReg | SrcImm | Mov)),
 	/* 0xC0 - 0xC7 */
 	D(ByteOp | DstMem | SrcImm | ModRM), D(DstMem | SrcImmByte | ModRM),
-	N, D(ImplicitOps | Stack), N, N,
+	I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
+	D(ImplicitOps | Stack),
+	N, N,
 	D(ByteOp | DstMem | SrcImm | ModRM | Mov), D(DstMem | SrcImm | ModRM | Mov),
 	/* 0xC8 - 0xCF */
 	N, N, N, D(ImplicitOps | Stack),
-- 
cgit v0.10.2


From f3a1b9f49647133e8c6eb6a68399ed8dbd61554a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 18 Aug 2010 18:25:25 +0300
Subject: KVM: x86 emulator: implement IMUL REG, R/M, imm8 (opcode 6B)

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 9e58f50..618386f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2264,6 +2264,15 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+
+	c->dst.val = c->src2.val;
+	emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags);
+	return X86EMUL_CONTINUE;
+}
+
 #define D(_y) { .flags = (_y) }
 #define N    D(0)
 #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
@@ -2371,7 +2380,8 @@ static struct opcode opcode_table[256] = {
 	N, N, N, N,
 	/* 0x68 - 0x6F */
 	I(SrcImm | Mov | Stack, em_push), N,
-	I(SrcImmByte | Mov | Stack, em_push), N,
+	I(SrcImmByte | Mov | Stack, em_push),
+	I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
 	D(DstDI | ByteOp | Mov | String), D(DstDI | Mov | String), /* insb, insw/insd */
 	D(SrcSI | ByteOp | ImplicitOps | String), D(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */
 	/* 0x70 - 0x7F */
-- 
cgit v0.10.2


From 5c82aa29988c0160d91f75cceebd0a07d8f2406b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 18 Aug 2010 18:31:43 +0300
Subject: KVM: x86 emulator: implement IMUL REG, R/M (opcode 0F AF)

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 618386f..a4d2a46 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2264,15 +2264,22 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
-static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
+static int em_imul(struct x86_emulate_ctxt *ctxt)
 {
 	struct decode_cache *c = &ctxt->decode;
 
-	c->dst.val = c->src2.val;
 	emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags);
 	return X86EMUL_CONTINUE;
 }
 
+static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+
+	c->dst.val = c->src2.val;
+	return em_imul(ctxt);
+}
+
 #define D(_y) { .flags = (_y) }
 #define N    D(0)
 #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
@@ -2488,7 +2495,7 @@ static struct opcode twobyte_table[256] = {
 	N, D(DstMem | SrcReg | ModRM | BitOp | Lock),
 	D(DstMem | SrcReg | Src2ImmByte | ModRM),
 	D(DstMem | SrcReg | Src2CL | ModRM),
-	D(ModRM), N,
+	D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
 	/* 0xB0 - 0xB7 */
 	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
 	N, D(DstMem | SrcReg | ModRM | BitOp | Lock),
-- 
cgit v0.10.2


From 7077aec0bcd2f827aeb84ccc56c6f4367c376436 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 18 Aug 2010 18:53:43 +0300
Subject: KVM: x86 emulator: remove SrcImplicit

Useless.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a4d2a46..7f7fc64 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -58,7 +58,6 @@
 #define DstMask     (7<<1)
 /* Source operand type. */
 #define SrcNone     (0<<4)	/* No source operand. */
-#define SrcImplicit (0<<4)	/* Source operand is implicit in the opcode. */
 #define SrcReg      (1<<4)	/* Register operand. */
 #define SrcMem      (2<<4)	/* Memory operand. */
 #define SrcMem16    (3<<4)	/* Memory operand (16-bit). */
@@ -2435,7 +2434,7 @@ static struct opcode opcode_table[256] = {
 	D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps),
 	/* 0xD0 - 0xD7 */
 	D(ByteOp | DstMem | SrcOne | ModRM), D(DstMem | SrcOne | ModRM),
-	D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM),
+	D(ByteOp | DstMem | ModRM), D(DstMem | ModRM),
 	N, N, N, N,
 	/* 0xD8 - 0xDF */
 	N, N, N, N, N, N, N, N,
-- 
cgit v0.10.2


From 48bb5d3c401679e41e7a7f06ca31b3e54a6168f7 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 18 Aug 2010 18:54:34 +0300
Subject: KVM: x86 emulator: implement RDTSC (opcode 0F 31)

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7f7fc64..ed192d2 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2279,6 +2279,22 @@ static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
 	return em_imul(ctxt);
 }
 
+static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
+{
+	unsigned cpl = ctxt->ops->cpl(ctxt->vcpu);
+	struct decode_cache *c = &ctxt->decode;
+	u64 tsc = 0;
+
+	if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) {
+		emulate_gp(ctxt, 0);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
+	ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
+	c->regs[VCPU_REGS_RAX] = (u32)tsc;
+	c->regs[VCPU_REGS_RDX] = tsc >> 32;
+	return X86EMUL_CONTINUE;
+}
+
 #define D(_y) { .flags = (_y) }
 #define N    D(0)
 #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
@@ -2469,7 +2485,8 @@ static struct opcode twobyte_table[256] = {
 	N, N, N, N,
 	N, N, N, N, N, N, N, N,
 	/* 0x30 - 0x3F */
-	D(ImplicitOps | Priv), N, D(ImplicitOps | Priv), N,
+	D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc),
+	D(ImplicitOps | Priv), N,
 	D(ImplicitOps), D(ImplicitOps | Priv), N, N,
 	N, N, N, N, N, N, N, N,
 	/* 0x40 - 0x4F */
-- 
cgit v0.10.2


From 39f21ee546cf7d563d813c5fb4473431c1d8fce7 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 18 Aug 2010 19:20:21 +0300
Subject: KVM: x86 emulator: consolidate immediate decode into a function

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index ed192d2..95543a6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2541,6 +2541,55 @@ static struct opcode twobyte_table[256] = {
 #undef GD
 #undef I
 
+static unsigned imm_size(struct decode_cache *c)
+{
+	unsigned size;
+
+	size = (c->d & ByteOp) ? 1 : c->op_bytes;
+	if (size == 8)
+		size = 4;
+	return size;
+}
+
+static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
+		      unsigned size, bool sign_extension)
+{
+	struct decode_cache *c = &ctxt->decode;
+	struct x86_emulate_ops *ops = ctxt->ops;
+	int rc = X86EMUL_CONTINUE;
+
+	op->type = OP_IMM;
+	op->bytes = size;
+	op->addr.mem = c->eip;
+	/* NB. Immediates are sign-extended as necessary. */
+	switch (op->bytes) {
+	case 1:
+		op->val = insn_fetch(s8, 1, c->eip);
+		break;
+	case 2:
+		op->val = insn_fetch(s16, 2, c->eip);
+		break;
+	case 4:
+		op->val = insn_fetch(s32, 4, c->eip);
+		break;
+	}
+	if (!sign_extension) {
+		switch (op->bytes) {
+		case 1:
+			op->val &= 0xff;
+			break;
+		case 2:
+			op->val &= 0xffff;
+			break;
+		case 4:
+			op->val &= 0xffffffff;
+			break;
+		}
+	}
+done:
+	return rc;
+}
+
 int
 x86_decode_insn(struct x86_emulate_ctxt *ctxt)
 {
@@ -2730,52 +2779,19 @@ done_prefixes:
 		c->src = memop;
 		break;
 	case SrcImmU16:
-		c->src.bytes = 2;
-		goto srcimm;
+		rc = decode_imm(ctxt, &c->src, 2, false);
+		break;
 	case SrcImm:
+		rc = decode_imm(ctxt, &c->src, imm_size(c), true);
+		break;
 	case SrcImmU:
-		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		if (c->src.bytes == 8)
-			c->src.bytes = 4;
-	srcimm:
-		c->src.type = OP_IMM;
-		c->src.addr.mem = c->eip;
-		/* NB. Immediates are sign-extended as necessary. */
-		switch (c->src.bytes) {
-		case 1:
-			c->src.val = insn_fetch(s8, 1, c->eip);
-			break;
-		case 2:
-			c->src.val = insn_fetch(s16, 2, c->eip);
-			break;
-		case 4:
-			c->src.val = insn_fetch(s32, 4, c->eip);
-			break;
-		}
-		if ((c->d & SrcMask) == SrcImmU
-		    || (c->d & SrcMask) == SrcImmU16) {
-			switch (c->src.bytes) {
-			case 1:
-				c->src.val &= 0xff;
-				break;
-			case 2:
-				c->src.val &= 0xffff;
-				break;
-			case 4:
-				c->src.val &= 0xffffffff;
-				break;
-			}
-		}
+		rc = decode_imm(ctxt, &c->src, imm_size(c), false);
 		break;
 	case SrcImmByte:
+		rc = decode_imm(ctxt, &c->src, 1, true);
+		break;
 	case SrcImmUByte:
-		c->src.type = OP_IMM;
-		c->src.addr.mem = c->eip;
-		c->src.bytes = 1;
-		if ((c->d & SrcMask) == SrcImmByte)
-			c->src.val = insn_fetch(s8, 1, c->eip);
-		else
-			c->src.val = insn_fetch(u8, 1, c->eip);
+		rc = decode_imm(ctxt, &c->src, 1, false);
 		break;
 	case SrcAcc:
 		c->src.type = OP_REG;
@@ -2807,6 +2823,9 @@ done_prefixes:
 		break;
 	}
 
+	if (rc != X86EMUL_CONTINUE)
+		goto done;
+
 	/*
 	 * Decode and fetch the second source operand: register, memory
 	 * or immediate.
@@ -2819,10 +2838,7 @@ done_prefixes:
 		c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
 		break;
 	case Src2ImmByte:
-		c->src2.type = OP_IMM;
-		c->src2.addr.mem = c->eip;
-		c->src2.bytes = 1;
-		c->src2.val = insn_fetch(u8, 1, c->eip);
+		rc = decode_imm(ctxt, &c->src2, 1, true);
 		break;
 	case Src2One:
 		c->src2.bytes = 1;
@@ -2830,6 +2846,9 @@ done_prefixes:
 		break;
 	}
 
+	if (rc != X86EMUL_CONTINUE)
+		goto done;
+
 	/* Decode and fetch the destination operand: register or memory. */
 	switch (c->d & DstMask) {
 	case DstReg:
-- 
cgit v0.10.2


From 7db41eb76244ae623de842e818e459755968a33b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 18 Aug 2010 19:25:28 +0300
Subject: KVM: x86 emulator: add Src2Imm decoding

Needed for 3-operand IMUL.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 95543a6..f456d7e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -95,6 +95,7 @@
 #define Src2CL      (1<<29)
 #define Src2ImmByte (2<<29)
 #define Src2One     (3<<29)
+#define Src2Imm     (4<<29)
 #define Src2Mask    (7<<29)
 
 #define X2(x...) x, x
@@ -2844,6 +2845,9 @@ done_prefixes:
 		c->src2.bytes = 1;
 		c->src2.val = 1;
 		break;
+	case Src2Imm:
+		rc = decode_imm(ctxt, &c->src2, imm_size(c), true);
+		break;
 	}
 
 	if (rc != X86EMUL_CONTINUE)
-- 
cgit v0.10.2


From d46164dbd936bc11c7d2abed62f05b31c7a79ae7 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 18 Aug 2010 19:29:33 +0300
Subject: KVM: x86 emulator: implement IMUL REG, R/M, IMM (opcode 69)

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f456d7e..55849c3 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2402,7 +2402,8 @@ static struct opcode opcode_table[256] = {
 	N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
 	N, N, N, N,
 	/* 0x68 - 0x6F */
-	I(SrcImm | Mov | Stack, em_push), N,
+	I(SrcImm | Mov | Stack, em_push),
+	I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
 	I(SrcImmByte | Mov | Stack, em_push),
 	I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
 	D(DstDI | ByteOp | Mov | String), D(DstDI | Mov | String), /* insb, insw/insd */
-- 
cgit v0.10.2


From 61429142802b068609ffd8ef48d891e05eeea0b9 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 19 Aug 2010 15:13:00 +0300
Subject: KVM: x86 emulator: implement CWD (opcode 99)

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 55849c3..e257f22 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2280,6 +2280,18 @@ static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
 	return em_imul(ctxt);
 }
 
+static int em_cwd(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+
+	c->dst.type = OP_REG;
+	c->dst.bytes = c->src.bytes;
+	c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
+	c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1);
+
+	return X86EMUL_CONTINUE;
+}
+
 static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
 {
 	unsigned cpl = ctxt->ops->cpl(ctxt->vcpu);
@@ -2425,7 +2437,8 @@ static struct opcode opcode_table[256] = {
 	/* 0x90 - 0x97 */
 	X8(D(SrcAcc | DstReg)),
 	/* 0x98 - 0x9F */
-	D(DstAcc | SrcNone), N, D(SrcImmFAddr | No64), N,
+	D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
+	D(SrcImmFAddr | No64), N,
 	D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N,
 	/* 0xA0 - 0xA7 */
 	D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs),
-- 
cgit v0.10.2


From e0df7b9f6cee43c01d6f4a8491bccfd410cb86e1 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Thu, 19 Aug 2010 18:11:05 -0700
Subject: KVM: abstract kvm x86 mmu->n_free_mmu_pages

"free" is a poor name for this value.  In this context, it means,
"the number of mmu pages which this kvm instance should be able to
allocate."  But "free" implies much more that the objects are there
and ready for use.  "available" is a much better description, especially
when you see how it is calculated.

In this patch, we abstract its use into a function.  We'll soon
replace the function's contents by calculating the value in a
different way.

All of the reads of n_free_mmu_pages are taken care of in this
patch.  The modification sites will be handled in a patch
later in the series.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Tim Pepper <lnxninja@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ff95d41..625b178 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1696,7 +1696,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 	int used_pages;
 	LIST_HEAD(invalid_list);
 
-	used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
+	used_pages = kvm->arch.n_alloc_mmu_pages - kvm_mmu_available_pages(kvm);
 	used_pages = max(0, used_pages);
 
 	/*
@@ -2959,18 +2959,15 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
 
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
-	int free_pages;
 	LIST_HEAD(invalid_list);
 
-	free_pages = vcpu->kvm->arch.n_free_mmu_pages;
-	while (free_pages < KVM_REFILL_PAGES &&
+	while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
 	       !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
 		struct kvm_mmu_page *sp;
 
 		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
 				  struct kvm_mmu_page, link);
-		free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
-						       &invalid_list);
+		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
 		++vcpu->kvm->stat.mmu_recycled;
 	}
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
@@ -3145,7 +3142,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
 		npages = kvm->arch.n_alloc_mmu_pages -
-			 kvm->arch.n_free_mmu_pages;
+			 kvm_mmu_available_pages(kvm);
 		cache_count += npages;
 		if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
 			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index be66759..c3a689a 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -50,9 +50,14 @@
 
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
 
+static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
+{
+	return kvm->arch.n_free_mmu_pages;
+}
+
 static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
-	if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
+	if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES))
 		__kvm_mmu_free_some_pages(vcpu);
 }
 
-- 
cgit v0.10.2


From 39de71ec5397f374aed95e99509372d605e1407c Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Thu, 19 Aug 2010 18:11:14 -0700
Subject: KVM: rename x86 kvm->arch.n_alloc_mmu_pages

arch.n_alloc_mmu_pages is a poor choice of name. This value truly
means, "the number of pages which _may_ be allocated".  But,
reading the name, "n_alloc_mmu_pages" implies "the number of allocated
mmu pages", which is dead wrong.

It's really the high watermark, so let's give it a name to match:
nr_max_mmu_pages.  This change will make the next few patches
much more obvious and easy to read.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Tim Pepper <lnxninja@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c52e2eb..0296368 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -369,7 +369,7 @@ struct kvm_vcpu_arch {
 struct kvm_arch {
 	unsigned int n_free_mmu_pages;
 	unsigned int n_requested_mmu_pages;
-	unsigned int n_alloc_mmu_pages;
+	unsigned int n_max_mmu_pages;
 	atomic_t invlpg_counter;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
 	/*
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 625b178..6979e7d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1696,7 +1696,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 	int used_pages;
 	LIST_HEAD(invalid_list);
 
-	used_pages = kvm->arch.n_alloc_mmu_pages - kvm_mmu_available_pages(kvm);
+	used_pages = kvm->arch.n_max_mmu_pages - kvm_mmu_available_pages(kvm);
 	used_pages = max(0, used_pages);
 
 	/*
@@ -1721,9 +1721,9 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 	}
 	else
 		kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
-					 - kvm->arch.n_alloc_mmu_pages;
+					 - kvm->arch.n_max_mmu_pages;
 
-	kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
+	kvm->arch.n_max_mmu_pages = kvm_nr_mmu_pages;
 }
 
 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -3141,7 +3141,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
-		npages = kvm->arch.n_alloc_mmu_pages -
+		npages = kvm->arch.n_max_mmu_pages -
 			 kvm_mmu_available_pages(kvm);
 		cache_count += npages;
 		if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c0004eb..4b4d283 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2759,7 +2759,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
 
 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
 {
-	return kvm->arch.n_alloc_mmu_pages;
+	return kvm->arch.n_max_mmu_pages;
 }
 
 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-- 
cgit v0.10.2


From 49d5ca26636cb8feb05aff92fc4dba3e494ec683 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Thu, 19 Aug 2010 18:11:28 -0700
Subject: KVM: replace x86 kvm n_free_mmu_pages with n_used_mmu_pages

Doing this makes the code much more readable.  That's
borne out by the fact that this patch removes code.  "used"
also happens to be the number that we need to return back to
the slab code when our shrinker gets called.  Keeping this
value as opposed to free makes the next patch simpler.

So, 'struct kvm' is kzalloc()'d.  'struct kvm_arch' is a
structure member (and not a pointer) of 'struct kvm'.  That
means they start out zeroed.  I _think_ they get initialized
properly by kvm_mmu_change_mmu_pages().  But, that only happens
via kvm ioctls.

Another benefit of storing 'used' intead of 'free' is
that the values are consistent from the moment the structure is
allocated: no negative "used" value.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Tim Pepper <lnxninja@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0296368..e01b728 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -367,7 +367,7 @@ struct kvm_vcpu_arch {
 };
 
 struct kvm_arch {
-	unsigned int n_free_mmu_pages;
+	unsigned int n_used_mmu_pages;
 	unsigned int n_requested_mmu_pages;
 	unsigned int n_max_mmu_pages;
 	atomic_t invlpg_counter;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6979e7d..ff39b85 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -980,7 +980,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	if (!sp->role.direct)
 		__free_page(virt_to_page(sp->gfns));
 	kmem_cache_free(mmu_page_header_cache, sp);
-	++kvm->arch.n_free_mmu_pages;
+	--kvm->arch.n_used_mmu_pages;
 }
 
 static unsigned kvm_page_table_hashfn(gfn_t gfn)
@@ -1003,7 +1003,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
 	sp->multimapped = 0;
 	sp->parent_pte = parent_pte;
-	--vcpu->kvm->arch.n_free_mmu_pages;
+	++vcpu->kvm->arch.n_used_mmu_pages;
 	return sp;
 }
 
@@ -1689,41 +1689,32 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 
 /*
  * Changing the number of mmu pages allocated to the vm
- * Note: if kvm_nr_mmu_pages is too small, you will get dead lock
+ * Note: if goal_nr_mmu_pages is too small, you will get dead lock
  */
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
 {
-	int used_pages;
 	LIST_HEAD(invalid_list);
-
-	used_pages = kvm->arch.n_max_mmu_pages - kvm_mmu_available_pages(kvm);
-	used_pages = max(0, used_pages);
-
 	/*
 	 * If we set the number of mmu pages to be smaller be than the
 	 * number of actived pages , we must to free some mmu pages before we
 	 * change the value
 	 */
 
-	if (used_pages > kvm_nr_mmu_pages) {
-		while (used_pages > kvm_nr_mmu_pages &&
+	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
+		while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
 			!list_empty(&kvm->arch.active_mmu_pages)) {
 			struct kvm_mmu_page *page;
 
 			page = container_of(kvm->arch.active_mmu_pages.prev,
 					    struct kvm_mmu_page, link);
-			used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
+			kvm_mmu_prepare_zap_page(kvm, page,
 							       &invalid_list);
 		}
 		kvm_mmu_commit_zap_page(kvm, &invalid_list);
-		kvm_nr_mmu_pages = used_pages;
-		kvm->arch.n_free_mmu_pages = 0;
+		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
 	}
-	else
-		kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
-					 - kvm->arch.n_max_mmu_pages;
 
-	kvm->arch.n_max_mmu_pages = kvm_nr_mmu_pages;
+	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
 }
 
 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index c3a689a..f05a03d 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -52,7 +52,8 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
-	return kvm->arch.n_free_mmu_pages;
+	return kvm->arch.n_max_mmu_pages -
+		kvm->arch.n_used_mmu_pages;
 }
 
 static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
-- 
cgit v0.10.2


From 45221ab6684a82a5b60208b76d6f8bfb1bbcb969 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Thu, 19 Aug 2010 18:11:37 -0700
Subject: KVM: create aggregate kvm_total_used_mmu_pages value

Of slab shrinkers, the VM code says:

 * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
 * querying the cache size, so a fastpath for that case is appropriate.

and it *means* it.  Look at how it calls the shrinkers:

    nr_before = (*shrinker->shrink)(0, gfp_mask);
    shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);

So, if you do anything stupid in your shrinker, the VM will doubly
punish you.

The mmu_shrink() function takes the global kvm_lock, then acquires
every VM's kvm->mmu_lock in sequence.  If we have 100 VMs, then
we're going to take 101 locks.  We do it twice, so each call takes
202 locks.  If we're under memory pressure, we can have each cpu
trying to do this.  It can get really hairy, and we've seen lock
spinning in mmu_shrink() be the dominant entry in profiles.

This is guaranteed to optimize at least half of those lock
aquisitions away.  It removes the need to take any of the locks
when simply trying to count objects.

A 'percpu_counter' can be a large object, but we only have one
of these for the entire system.  There are not any better
alternatives at the moment, especially ones that handle CPU
hotplug.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Tim Pepper <lnxninja@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ff39b85..33d7af5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -178,6 +178,7 @@ typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
 static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *rmap_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
+static struct percpu_counter kvm_total_used_mmu_pages;
 
 static u64 __read_mostly shadow_trap_nonpresent_pte;
 static u64 __read_mostly shadow_notrap_nonpresent_pte;
@@ -971,6 +972,18 @@ static int is_empty_shadow_page(u64 *spt)
 }
 #endif
 
+/*
+ * This value is the sum of all of the kvm instances's
+ * kvm->arch.n_used_mmu_pages values.  We need a global,
+ * aggregate version in order to make the slab shrinker
+ * faster
+ */
+static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
+{
+	kvm->arch.n_used_mmu_pages += nr;
+	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
+}
+
 static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	ASSERT(is_empty_shadow_page(sp->spt));
@@ -980,7 +993,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	if (!sp->role.direct)
 		__free_page(virt_to_page(sp->gfns));
 	kmem_cache_free(mmu_page_header_cache, sp);
-	--kvm->arch.n_used_mmu_pages;
+	kvm_mod_used_mmu_pages(kvm, -1);
 }
 
 static unsigned kvm_page_table_hashfn(gfn_t gfn)
@@ -1003,7 +1016,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
 	sp->multimapped = 0;
 	sp->parent_pte = parent_pte;
-	++vcpu->kvm->arch.n_used_mmu_pages;
+	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
 	return sp;
 }
 
@@ -3122,23 +3135,22 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 {
 	struct kvm *kvm;
 	struct kvm *kvm_freed = NULL;
-	int cache_count = 0;
+
+	if (nr_to_scan == 0)
+		goto out;
 
 	spin_lock(&kvm_lock);
 
 	list_for_each_entry(kvm, &vm_list, vm_list) {
-		int npages, idx, freed_pages;
+		int idx, freed_pages;
 		LIST_HEAD(invalid_list);
 
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
-		npages = kvm->arch.n_max_mmu_pages -
-			 kvm_mmu_available_pages(kvm);
-		cache_count += npages;
-		if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
+		if (!kvm_freed && nr_to_scan > 0 &&
+		    kvm->arch.n_used_mmu_pages > 0) {
 			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
 							  &invalid_list);
-			cache_count -= freed_pages;
 			kvm_freed = kvm;
 		}
 		nr_to_scan--;
@@ -3152,7 +3164,8 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 
 	spin_unlock(&kvm_lock);
 
-	return cache_count;
+out:
+	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
 }
 
 static struct shrinker mmu_shrinker = {
@@ -3195,6 +3208,7 @@ int kvm_mmu_module_init(void)
 	if (!mmu_page_header_cache)
 		goto nomem;
 
+	percpu_counter_init(&kvm_total_used_mmu_pages, 0);
 	register_shrinker(&mmu_shrinker);
 
 	return 0;
-- 
cgit v0.10.2


From 09b5f4d3c4aa2d4928c0a3723a8de26a76b6339e Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Mon, 23 Aug 2010 14:56:54 +0800
Subject: KVM: x86 emulator: add LDS/LES/LFS/LGS/LSS instruction emulation

Add LDS/LES/LFS/LGS/LSS instruction emulation.
(opcode 0xc4, 0xc5, 0x0f 0xb2, 0x0f 0xb4~0xb5)

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e257f22..aece501 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1514,6 +1514,23 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
 	return rc;
 }
 
+static int emulate_load_segment(struct x86_emulate_ctxt *ctxt,
+			   struct x86_emulate_ops *ops, int seg)
+{
+	struct decode_cache *c = &ctxt->decode;
+	unsigned short sel;
+	int rc;
+
+	memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+
+	rc = load_segment_descriptor(ctxt, ops, sel, seg);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	c->dst.val = c->src.val;
+	return rc;
+}
+
 static inline void
 setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 			struct x86_emulate_ops *ops, struct desc_struct *cs,
@@ -2458,7 +2475,7 @@ static struct opcode opcode_table[256] = {
 	D(ByteOp | DstMem | SrcImm | ModRM), D(DstMem | SrcImmByte | ModRM),
 	I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
 	D(ImplicitOps | Stack),
-	N, N,
+	D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
 	D(ByteOp | DstMem | SrcImm | ModRM | Mov), D(DstMem | SrcImm | ModRM | Mov),
 	/* 0xC8 - 0xCF */
 	N, N, N, D(ImplicitOps | Stack),
@@ -2529,9 +2546,9 @@ static struct opcode twobyte_table[256] = {
 	D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
 	/* 0xB0 - 0xB7 */
 	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
-	N, D(DstMem | SrcReg | ModRM | BitOp | Lock),
-	N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov),
-	    D(DstReg | SrcMem16 | ModRM | Mov),
+	D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock),
+	D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM),
+	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xB8 - 0xBF */
 	N, N,
 	G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
@@ -3214,6 +3231,16 @@ special_insn:
 		c->dst.addr.reg = &c->eip;
 		c->dst.bytes = c->op_bytes;
 		goto pop_instruction;
+	case 0xc4:		/* les */
+		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
+		break;
+	case 0xc5:		/* lds */
+		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
+		break;
 	case 0xc6 ... 0xc7:	/* mov (sole member of Grp11) */
 	mov:
 		c->dst.val = c->src.val;
@@ -3659,10 +3686,25 @@ twobyte_insn:
 			c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX];
 		}
 		break;
+	case 0xb2:		/* lss */
+		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
+		break;
 	case 0xb3:
 	      btr:		/* btr */
 		emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
 		break;
+	case 0xb4:		/* lfs */
+		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
+		break;
+	case 0xb5:		/* lgs */
+		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
+		break;
 	case 0xb6 ... 0xb7:	/* movzx */
 		c->dst.bytes = c->op_bytes;
 		c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
-- 
cgit v0.10.2


From e4abac67b756680c63af369f053d11991616aeb4 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Thu, 19 Aug 2010 14:25:48 +0800
Subject: KVM: x86 emulator: add JrCXZ instruction emulation

Add JrCXZ instruction emulation (opcode 0xe3)
Used by FreeBSD boot loader.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index aece501..312e798 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2487,7 +2487,7 @@ static struct opcode opcode_table[256] = {
 	/* 0xD8 - 0xDF */
 	N, N, N, N, N, N, N, N,
 	/* 0xE0 - 0xE7 */
-	X3(D(SrcImmByte)), N,
+	X4(D(SrcImmByte)),
 	D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc),
 	D(ByteOp | SrcAcc | DstImmUByte), D(SrcAcc | DstImmUByte),
 	/* 0xE8 - 0xEF */
@@ -3285,6 +3285,10 @@ special_insn:
 		    (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags)))
 			jmp_rel(c, c->src.val);
 		break;
+	case 0xe3:	/* jcxz/jecxz/jrcxz */
+		if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0)
+			jmp_rel(c, c->src.val);
+		break;
 	case 0xe4: 	/* inb */
 	case 0xe5: 	/* in */
 		goto do_io_in;
-- 
cgit v0.10.2


From 80b63faf028fba79e630d3643b0e615bddf4067b Mon Sep 17 00:00:00 2001
From: Xiaotian Feng <dfeng@redhat.com>
Date: Tue, 24 Aug 2010 10:31:07 +0800
Subject: KVM: MMU: fix regression from rework mmu_shrink() code

Latest kvm mmu_shrink code rework makes kernel changes kvm->arch.n_used_mmu_pages/
kvm->arch.n_max_mmu_pages at kvm_mmu_free_page/kvm_mmu_alloc_page, which is called
by kvm_mmu_commit_zap_page. So the kvm->arch.n_used_mmu_pages or
kvm_mmu_available_pages(vcpu->kvm) is unchanged after kvm_mmu_prepare_zap_page(),
This caused kvm_mmu_change_mmu_pages/__kvm_mmu_free_some_pages loops forever.
Moving kvm_mmu_commit_zap_page would make the while loop performs as normal.

Reported-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Xiaotian Feng <dfeng@redhat.com>
Tested-by: Avi Kivity <avi@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Tim Pepper <lnxninja@linux.vnet.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 33d7af5..c2ac700 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1720,10 +1720,9 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
 
 			page = container_of(kvm->arch.active_mmu_pages.prev,
 					    struct kvm_mmu_page, link);
-			kvm_mmu_prepare_zap_page(kvm, page,
-							       &invalid_list);
+			kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
+			kvm_mmu_commit_zap_page(kvm, &invalid_list);
 		}
-		kvm_mmu_commit_zap_page(kvm, &invalid_list);
 		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
 	}
 
@@ -2972,9 +2971,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
 				  struct kvm_mmu_page, link);
 		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
+		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 		++vcpu->kvm->stat.mmu_recycled;
 	}
-	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 }
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
-- 
cgit v0.10.2


From 45bf21a8ce7a2884f067a702a5c7683684846ce1 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Mon, 23 Aug 2010 16:13:15 +0800
Subject: KVM: MMU: fix missing percpu counter destroy

commit ad05c88266b4cce1c820928ce8a0fb7690912ba1
(KVM: create aggregate kvm_total_used_mmu_pages value)
introduce percpu counter kvm_total_used_mmu_pages but never
destroy it, this may cause oops when rmmod & modprobe.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Acked-by: Tim Pepper <lnxninja@linux.vnet.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c2ac700..54a5026 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3185,6 +3185,7 @@ static void mmu_destroy_caches(void)
 void kvm_mmu_module_exit(void)
 {
 	mmu_destroy_caches();
+	percpu_counter_destroy(&kvm_total_used_mmu_pages);
 	unregister_shrinker(&mmu_shrinker);
 }
 
@@ -3207,7 +3208,9 @@ int kvm_mmu_module_init(void)
 	if (!mmu_page_header_cache)
 		goto nomem;
 
-	percpu_counter_init(&kvm_total_used_mmu_pages, 0);
+	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
+		goto nomem;
+
 	register_shrinker(&mmu_shrinker);
 
 	return 0;
-- 
cgit v0.10.2


From ae38436b78a8abff767e2ac10e2cd663a7eef476 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@redhat.com>
Date: Thu, 19 Aug 2010 22:07:15 -1000
Subject: KVM: x86: Drop vm_init_tsc

This is used only by the VMX code, and is not done properly;
if the TSC is indeed backwards, it is out of sync, and will
need proper handling in the logic at each and every CPU change.
For now, drop this test during init as misguided.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e01b728..6056a23 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -394,7 +394,6 @@ struct kvm_arch {
 	gpa_t ept_identity_map_addr;
 
 	unsigned long irq_sources_bitmap;
-	u64 vm_init_tsc;
 	s64 kvmclock_offset;
 
 	struct kvm_xen_hvm_config xen_hvm_config;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 751a2d2..4fbab24 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2512,7 +2512,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 {
 	u32 host_sysenter_cs, msr_low, msr_high;
 	u32 junk;
-	u64 host_pat, tsc_this, tsc_base;
+	u64 host_pat, tsc_this;
 	unsigned long a;
 	struct desc_ptr dt;
 	int i;
@@ -2653,12 +2653,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 		vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
 	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
 
-	tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
-	rdtscll(tsc_this);
-	if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc)
-		tsc_base = tsc_this;
-
-	guest_write_tsc(0, tsc_base);
+	tsc_this = native_read_tsc();
+	guest_write_tsc(0, tsc_this);
 
 	return 0;
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4b4d283..8b0c51a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5495,8 +5495,6 @@ struct  kvm *kvm_arch_create_vm(void)
 	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
 	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
 
-	rdtscll(kvm->arch.vm_init_tsc);
-
 	return kvm;
 }
 
-- 
cgit v0.10.2


From f4e1b3c8bd2a044cd0ccf80595bfd088a49fe60b Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@redhat.com>
Date: Thu, 19 Aug 2010 22:07:16 -1000
Subject: KVM: x86: Convert TSC writes to TSC offset writes

Change svm / vmx to be the same internally and write TSC offset
instead of bare TSC in helper functions.  Isolated as a single
patch to contain code movement.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index af5b9ea..e06f00d 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -701,6 +701,20 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
 	seg->base = 0;
 }
 
+static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	u64 g_tsc_offset = 0;
+
+	if (is_nested(svm)) {
+		g_tsc_offset = svm->vmcb->control.tsc_offset -
+			       svm->nested.hsave->control.tsc_offset;
+		svm->nested.hsave->control.tsc_offset = offset;
+	}
+
+	svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
+}
+
 static void init_vmcb(struct vcpu_svm *svm)
 {
 	struct vmcb_control_area *control = &svm->vmcb->control;
@@ -901,7 +915,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
 	svm->asid_generation = 0;
 	init_vmcb(svm);
-	svm->vmcb->control.tsc_offset = 0-native_read_tsc();
+	svm_write_tsc_offset(&svm->vcpu, 0-native_read_tsc());
 
 	err = fx_init(&svm->vcpu);
 	if (err)
@@ -2566,20 +2580,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	switch (ecx) {
-	case MSR_IA32_TSC: {
-		u64 tsc_offset = data - native_read_tsc();
-		u64 g_tsc_offset = 0;
-
-		if (is_nested(svm)) {
-			g_tsc_offset = svm->vmcb->control.tsc_offset -
-				       svm->nested.hsave->control.tsc_offset;
-			svm->nested.hsave->control.tsc_offset = tsc_offset;
-		}
-
-		svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset;
-
+	case MSR_IA32_TSC:
+		svm_write_tsc_offset(vcpu, data - native_read_tsc());
 		break;
-	}
 	case MSR_STAR:
 		svm->vmcb->save.star = data;
 		break;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4fbab24..d9bec5e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1149,9 +1149,9 @@ static u64 guest_read_tsc(void)
  * writes 'guest_tsc' into guest's timestamp counter "register"
  * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
  */
-static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
+static void vmx_write_tsc_offset(u64 offset)
 {
-	vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
+	vmcs_write64(TSC_OFFSET, offset);
 }
 
 /*
@@ -1255,7 +1255,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 		break;
 	case MSR_IA32_TSC:
 		rdtscll(host_tsc);
-		guest_write_tsc(data, host_tsc);
+		vmx_write_tsc_offset(data - host_tsc);
 		break;
 	case MSR_IA32_CR_PAT:
 		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
@@ -2512,7 +2512,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 {
 	u32 host_sysenter_cs, msr_low, msr_high;
 	u32 junk;
-	u64 host_pat, tsc_this;
+	u64 host_pat;
 	unsigned long a;
 	struct desc_ptr dt;
 	int i;
@@ -2653,8 +2653,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 		vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
 	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
 
-	tsc_this = native_read_tsc();
-	guest_write_tsc(0, tsc_this);
+	vmx_write_tsc_offset(0-native_read_tsc());
 
 	return 0;
 }
-- 
cgit v0.10.2


From 99e3e30aee1a326a98bf3a5f47b8622219c685f3 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@redhat.com>
Date: Thu, 19 Aug 2010 22:07:17 -1000
Subject: KVM: x86: Move TSC offset writes to common code

Also, ensure that the storing of the offset and the reading of the TSC
are never preempted by taking a spinlock.  While the lock is overkill
now, it is useful later in this patch series.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6056a23..a215153 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -395,6 +395,7 @@ struct kvm_arch {
 
 	unsigned long irq_sources_bitmap;
 	s64 kvmclock_offset;
+	spinlock_t tsc_write_lock;
 
 	struct kvm_xen_hvm_config xen_hvm_config;
 
@@ -521,6 +522,8 @@ struct kvm_x86_ops {
 
 	bool (*has_wbinvd_exit)(void);
 
+	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+
 	const struct trace_print_flags *exit_reasons_str;
 };
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e06f00d..ea41c55 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -915,7 +915,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
 	svm->asid_generation = 0;
 	init_vmcb(svm);
-	svm_write_tsc_offset(&svm->vcpu, 0-native_read_tsc());
+	kvm_write_tsc(&svm->vcpu, 0);
 
 	err = fx_init(&svm->vcpu);
 	if (err)
@@ -2581,7 +2581,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 
 	switch (ecx) {
 	case MSR_IA32_TSC:
-		svm_write_tsc_offset(vcpu, data - native_read_tsc());
+		kvm_write_tsc(vcpu, data);
 		break;
 	case MSR_STAR:
 		svm->vmcb->save.star = data;
@@ -3551,6 +3551,8 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.set_supported_cpuid = svm_set_supported_cpuid,
 
 	.has_wbinvd_exit = svm_has_wbinvd_exit,
+
+	.write_tsc_offset = svm_write_tsc_offset,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d9bec5e..138746d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1146,10 +1146,9 @@ static u64 guest_read_tsc(void)
 }
 
 /*
- * writes 'guest_tsc' into guest's timestamp counter "register"
- * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
+ * writes 'offset' into guest's timestamp counter offset register
  */
-static void vmx_write_tsc_offset(u64 offset)
+static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
 	vmcs_write64(TSC_OFFSET, offset);
 }
@@ -1224,7 +1223,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct shared_msr_entry *msr;
-	u64 host_tsc;
 	int ret = 0;
 
 	switch (msr_index) {
@@ -1254,8 +1252,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 		vmcs_writel(GUEST_SYSENTER_ESP, data);
 		break;
 	case MSR_IA32_TSC:
-		rdtscll(host_tsc);
-		vmx_write_tsc_offset(data - host_tsc);
+		kvm_write_tsc(vcpu, data);
 		break;
 	case MSR_IA32_CR_PAT:
 		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
@@ -2653,7 +2650,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 		vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
 	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
 
-	vmx_write_tsc_offset(0-native_read_tsc());
+	kvm_write_tsc(&vmx->vcpu, 0);
 
 	return 0;
 }
@@ -4348,6 +4345,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.set_supported_cpuid = vmx_set_supported_cpuid,
 
 	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
+
+	.write_tsc_offset = vmx_write_tsc_offset,
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8b0c51a..886132b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -895,6 +895,22 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
 
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 
+void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
+{
+	struct kvm *kvm = vcpu->kvm;
+	u64 offset;
+	unsigned long flags;
+
+	spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+	offset = data - native_read_tsc();
+	kvm_x86_ops->write_tsc_offset(vcpu, offset);
+	spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+
+	/* Reset of TSC must disable overshoot protection below */
+	vcpu->arch.hv_clock.tsc_timestamp = 0;
+}
+EXPORT_SYMBOL_GPL(kvm_write_tsc);
+
 static void kvm_write_guest_time(struct kvm_vcpu *v)
 {
 	struct timespec ts;
@@ -5495,6 +5511,8 @@ struct  kvm *kvm_arch_create_vm(void)
 	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
 	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
 
+	spin_lock_init(&kvm->arch.tsc_write_lock);
+
 	return kvm;
 }
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index b7a4047..2d6385e 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -68,4 +68,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 
+void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
+
 #endif
-- 
cgit v0.10.2


From f38e098ff3a315bb74abbb4a35cba11bbea8e2fa Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@redhat.com>
Date: Thu, 19 Aug 2010 22:07:20 -1000
Subject: KVM: x86: TSC reset compensation

Attempt to synchronize TSCs which are reset to the same value.  In the
case of a reliable hardware TSC, we can just re-use the same offset, but
on non-reliable hardware, we can get closer by adjusting the offset to
match the elapsed time.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a215153..57b4394 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -396,6 +396,9 @@ struct kvm_arch {
 	unsigned long irq_sources_bitmap;
 	s64 kvmclock_offset;
 	spinlock_t tsc_write_lock;
+	u64 last_tsc_nsec;
+	u64 last_tsc_offset;
+	u64 last_tsc_write;
 
 	struct kvm_xen_hvm_config xen_hvm_config;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 886132b..e7da14c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -898,11 +898,40 @@ static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 {
 	struct kvm *kvm = vcpu->kvm;
-	u64 offset;
+	u64 offset, ns, elapsed;
 	unsigned long flags;
+	struct timespec ts;
 
 	spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 	offset = data - native_read_tsc();
+	ktime_get_ts(&ts);
+	monotonic_to_bootbased(&ts);
+	ns = timespec_to_ns(&ts);
+	elapsed = ns - kvm->arch.last_tsc_nsec;
+
+	/*
+	 * Special case: identical write to TSC within 5 seconds of
+	 * another CPU is interpreted as an attempt to synchronize
+	 * (the 5 seconds is to accomodate host load / swapping).
+	 *
+	 * In that case, for a reliable TSC, we can match TSC offsets,
+	 * or make a best guest using kernel_ns value.
+	 */
+	if (data == kvm->arch.last_tsc_write && elapsed < 5ULL * NSEC_PER_SEC) {
+		if (!check_tsc_unstable()) {
+			offset = kvm->arch.last_tsc_offset;
+			pr_debug("kvm: matched tsc offset for %llu\n", data);
+		} else {
+			u64 tsc_delta = elapsed * __get_cpu_var(cpu_tsc_khz);
+			tsc_delta = tsc_delta / USEC_PER_SEC;
+			offset += tsc_delta;
+			pr_debug("kvm: adjusted tsc offset by %llu\n", tsc_delta);
+		}
+		ns = kvm->arch.last_tsc_nsec;
+	}
+	kvm->arch.last_tsc_nsec = ns;
+	kvm->arch.last_tsc_write = data;
+	kvm->arch.last_tsc_offset = offset;
 	kvm_x86_ops->write_tsc_offset(vcpu, offset);
 	spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
-- 
cgit v0.10.2


From 8cfdc0008542b57caadbfe013da163131a8293f4 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@redhat.com>
Date: Thu, 19 Aug 2010 22:07:21 -1000
Subject: KVM: x86: Make cpu_tsc_khz updates use local CPU

This simplifies much of the init code; we can now simply always
call tsc_khz_changed, optionally passing it a new value, or letting
it figure out the existing value (while interrupts are disabled, and
thus, by inference from the rule, not raceful against CPU hotplug or
frequency updates, which will issue IPIs to the local CPU to perform
this very same task).

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e7da14c..699c6b8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -895,6 +895,15 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
 
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 
+static inline int kvm_tsc_changes_freq(void)
+{
+	int cpu = get_cpu();
+	int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+		  cpufreq_quick_get(cpu) != 0;
+	put_cpu();
+	return ret;
+}
+
 void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 {
 	struct kvm *kvm = vcpu->kvm;
@@ -940,7 +949,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 }
 EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
-static void kvm_write_guest_time(struct kvm_vcpu *v)
+static int kvm_write_guest_time(struct kvm_vcpu *v)
 {
 	struct timespec ts;
 	unsigned long flags;
@@ -949,24 +958,27 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 	unsigned long this_tsc_khz;
 
 	if ((!vcpu->time_page))
-		return;
-
-	this_tsc_khz = get_cpu_var(cpu_tsc_khz);
-	if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
-		kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
-		vcpu->hv_clock_tsc_khz = this_tsc_khz;
-	}
-	put_cpu_var(cpu_tsc_khz);
+		return 0;
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
 	kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
 	ktime_get_ts(&ts);
 	monotonic_to_bootbased(&ts);
+	this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
 	local_irq_restore(flags);
 
-	/* With all the info we got, fill in the values */
+	if (unlikely(this_tsc_khz == 0)) {
+		kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
+		return 1;
+	}
 
+	if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
+		kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
+		vcpu->hv_clock_tsc_khz = this_tsc_khz;
+	}
+
+	/* With all the info we got, fill in the values */
 	vcpu->hv_clock.system_time = ts.tv_nsec +
 				     (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
 
@@ -987,6 +999,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 	kunmap_atomic(shared_kaddr, KM_USER0);
 
 	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
+	return 0;
 }
 
 static int kvm_request_guest_time_update(struct kvm_vcpu *v)
@@ -1853,12 +1866,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
-	if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
-		unsigned long khz = cpufreq_quick_get(cpu);
-		if (!khz)
-			khz = tsc_khz;
-		per_cpu(cpu_tsc_khz, cpu) = khz;
-	}
 	kvm_request_guest_time_update(vcpu);
 }
 
@@ -4152,9 +4159,23 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
 }
 EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
 
-static void bounce_off(void *info)
+static void tsc_bad(void *info)
+{
+	__get_cpu_var(cpu_tsc_khz) = 0;
+}
+
+static void tsc_khz_changed(void *data)
 {
-	/* nothing */
+	struct cpufreq_freqs *freq = data;
+	unsigned long khz = 0;
+
+	if (data)
+		khz = freq->new;
+	else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+		khz = cpufreq_quick_get(raw_smp_processor_id());
+	if (!khz)
+		khz = tsc_khz;
+	__get_cpu_var(cpu_tsc_khz) = khz;
 }
 
 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
@@ -4165,11 +4186,51 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 	struct kvm_vcpu *vcpu;
 	int i, send_ipi = 0;
 
+	/*
+	 * We allow guests to temporarily run on slowing clocks,
+	 * provided we notify them after, or to run on accelerating
+	 * clocks, provided we notify them before.  Thus time never
+	 * goes backwards.
+	 *
+	 * However, we have a problem.  We can't atomically update
+	 * the frequency of a given CPU from this function; it is
+	 * merely a notifier, which can be called from any CPU.
+	 * Changing the TSC frequency at arbitrary points in time
+	 * requires a recomputation of local variables related to
+	 * the TSC for each VCPU.  We must flag these local variables
+	 * to be updated and be sure the update takes place with the
+	 * new frequency before any guests proceed.
+	 *
+	 * Unfortunately, the combination of hotplug CPU and frequency
+	 * change creates an intractable locking scenario; the order
+	 * of when these callouts happen is undefined with respect to
+	 * CPU hotplug, and they can race with each other.  As such,
+	 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
+	 * undefined; you can actually have a CPU frequency change take
+	 * place in between the computation of X and the setting of the
+	 * variable.  To protect against this problem, all updates of
+	 * the per_cpu tsc_khz variable are done in an interrupt
+	 * protected IPI, and all callers wishing to update the value
+	 * must wait for a synchronous IPI to complete (which is trivial
+	 * if the caller is on the CPU already).  This establishes the
+	 * necessary total order on variable updates.
+	 *
+	 * Note that because a guest time update may take place
+	 * anytime after the setting of the VCPU's request bit, the
+	 * correct TSC value must be set before the request.  However,
+	 * to ensure the update actually makes it to any guest which
+	 * starts running in hardware virtualization between the set
+	 * and the acquisition of the spinlock, we must also ping the
+	 * CPU after setting the request bit.
+	 *
+	 */
+
 	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
 		return 0;
 	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
 		return 0;
-	per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
+
+	smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
 
 	spin_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
@@ -4179,7 +4240,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 			if (!kvm_request_guest_time_update(vcpu))
 				continue;
 			if (vcpu->cpu != smp_processor_id())
-				send_ipi++;
+				send_ipi = 1;
 		}
 	}
 	spin_unlock(&kvm_lock);
@@ -4197,32 +4258,48 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 		 * guest context is entered kvmclock will be updated,
 		 * so the guest will not see stale values.
 		 */
-		smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
+		smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
 	}
 	return 0;
 }
 
 static struct notifier_block kvmclock_cpufreq_notifier_block = {
-        .notifier_call  = kvmclock_cpufreq_notifier
+	.notifier_call  = kvmclock_cpufreq_notifier
+};
+
+static int kvmclock_cpu_notifier(struct notifier_block *nfb,
+					unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+
+	switch (action) {
+		case CPU_ONLINE:
+		case CPU_DOWN_FAILED:
+			smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
+			break;
+		case CPU_DOWN_PREPARE:
+			smp_call_function_single(cpu, tsc_bad, NULL, 1);
+			break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block kvmclock_cpu_notifier_block = {
+	.notifier_call  = kvmclock_cpu_notifier,
+	.priority = -INT_MAX
 };
 
 static void kvm_timer_init(void)
 {
 	int cpu;
 
+	register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
 	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
 		cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
 					  CPUFREQ_TRANSITION_NOTIFIER);
-		for_each_online_cpu(cpu) {
-			unsigned long khz = cpufreq_get(cpu);
-			if (!khz)
-				khz = tsc_khz;
-			per_cpu(cpu_tsc_khz, cpu) = khz;
-		}
-	} else {
-		for_each_possible_cpu(cpu)
-			per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
 	}
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
 }
 
 static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
@@ -4324,6 +4401,7 @@ void kvm_arch_exit(void)
 	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
 		cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
 					    CPUFREQ_TRANSITION_NOTIFIER);
+	unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
 	kvm_x86_ops = NULL;
 	kvm_mmu_module_exit();
 }
@@ -4739,8 +4817,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_mmu_unload(vcpu);
 		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
 			__kvm_migrate_timers(vcpu);
-		if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu))
-			kvm_write_guest_time(vcpu);
+		if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) {
+			r = kvm_write_guest_time(vcpu);
+			if (unlikely(r))
+				goto out;
+		}
 		if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
 			kvm_mmu_sync_roots(vcpu);
 		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
@@ -5423,17 +5504,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 
 int kvm_arch_hardware_enable(void *garbage)
 {
-	/*
-	 * Since this may be called from a hotplug notifcation,
-	 * we can't get the CPU frequency directly.
-	 */
-	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
-		int cpu = raw_smp_processor_id();
-		per_cpu(cpu_tsc_khz, cpu) = 0;
-	}
-
 	kvm_shared_msr_cpu_online();
-
 	return kvm_x86_ops->hardware_enable(garbage);
 }
 
-- 
cgit v0.10.2


From 6755bae8e69093b2994b6f29cd3eaecdf610374e Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@redhat.com>
Date: Thu, 19 Aug 2010 22:07:22 -1000
Subject: KVM: x86: Warn about unstable TSC

If creating an SMP guest with unstable host TSC, issue a warning

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 699c6b8..a8dee58 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5457,6 +5457,10 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 						unsigned int id)
 {
+	if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
+		printk_once(KERN_WARNING
+		"kvm: SMP vm created on host with unstable TSC; "
+		"guest TSC will not be reliable\n");
 	return kvm_x86_ops->vcpu_create(kvm, id);
 }
 
-- 
cgit v0.10.2


From e48672fa25e879f7ae21785c7efd187738139593 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@redhat.com>
Date: Thu, 19 Aug 2010 22:07:23 -1000
Subject: KVM: x86: Unify TSC logic

Move the TSC control logic from the vendor backends into x86.c
by adding adjust_tsc_offset to x86 ops.  Now all TSC decisions
can be done in one place.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 57b4394..5ab1c3f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -255,7 +255,6 @@ struct kvm_mmu {
 };
 
 struct kvm_vcpu_arch {
-	u64 host_tsc;
 	/*
 	 * rip and regs accesses must go through
 	 * kvm_{register,rip}_{read,write} functions.
@@ -336,9 +335,10 @@ struct kvm_vcpu_arch {
 
 	gpa_t time;
 	struct pvclock_vcpu_time_info hv_clock;
-	unsigned int hv_clock_tsc_khz;
+	unsigned int hw_tsc_khz;
 	unsigned int time_offset;
 	struct page *time_page;
+	u64 last_host_tsc;
 
 	bool nmi_pending;
 	bool nmi_injected;
@@ -520,6 +520,7 @@ struct kvm_x86_ops {
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
 	int (*get_lpage_level)(void);
 	bool (*rdtscp_supported)(void);
+	void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment);
 
 	void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ea41c55..ff28f65 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -715,6 +715,15 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 	svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
 }
 
+static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	svm->vmcb->control.tsc_offset += adjustment;
+	if (is_nested(svm))
+		svm->nested.hsave->control.tsc_offset += adjustment;
+}
+
 static void init_vmcb(struct vcpu_svm *svm)
 {
 	struct vmcb_control_area *control = &svm->vmcb->control;
@@ -961,20 +970,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	int i;
 
 	if (unlikely(cpu != vcpu->cpu)) {
-		u64 delta;
-
-		if (check_tsc_unstable()) {
-			/*
-			 * Make sure that the guest sees a monotonically
-			 * increasing TSC.
-			 */
-			delta = vcpu->arch.host_tsc - native_read_tsc();
-			svm->vmcb->control.tsc_offset += delta;
-			if (is_nested(svm))
-				svm->nested.hsave->control.tsc_offset += delta;
-		}
-		vcpu->cpu = cpu;
-		kvm_migrate_timers(vcpu);
 		svm->asid_generation = 0;
 	}
 
@@ -990,8 +985,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 	++vcpu->stat.host_state_reload;
 	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
 		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-
-	vcpu->arch.host_tsc = native_read_tsc();
 }
 
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -3553,6 +3546,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.has_wbinvd_exit = svm_has_wbinvd_exit,
 
 	.write_tsc_offset = svm_write_tsc_offset,
+	.adjust_tsc_offset = svm_adjust_tsc_offset,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 138746d..275a81d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -505,7 +505,6 @@ static void __vcpu_clear(void *arg)
 		vmcs_clear(vmx->vmcs);
 	if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
 		per_cpu(current_vmcs, cpu) = NULL;
-	rdtscll(vmx->vcpu.arch.host_tsc);
 	list_del(&vmx->local_vcpus_link);
 	vmx->vcpu.cpu = -1;
 	vmx->launched = 0;
@@ -881,7 +880,6 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u64 tsc_this, delta, new_offset;
 	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
 
 	if (!vmm_exclusive)
@@ -898,14 +896,12 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
 		unsigned long sysenter_esp;
 
-		kvm_migrate_timers(vcpu);
 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 		local_irq_disable();
 		list_add(&vmx->local_vcpus_link,
 			 &per_cpu(vcpus_on_cpu, cpu));
 		local_irq_enable();
 
-		vcpu->cpu = cpu;
 		/*
 		 * Linux uses per-cpu TSS and GDT, so set these when switching
 		 * processors.
@@ -915,16 +911,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
 		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
-
-		/*
-		 * Make sure the time stamp counter is monotonous.
-		 */
-		rdtscll(tsc_this);
-		if (tsc_this < vcpu->arch.host_tsc) {
-			delta = vcpu->arch.host_tsc - tsc_this;
-			new_offset = vmcs_read64(TSC_OFFSET) + delta;
-			vmcs_write64(TSC_OFFSET, new_offset);
-		}
 	}
 }
 
@@ -1153,6 +1139,12 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 	vmcs_write64(TSC_OFFSET, offset);
 }
 
+static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+{
+	u64 offset = vmcs_read64(TSC_OFFSET);
+	vmcs_write64(TSC_OFFSET, offset + adjustment);
+}
+
 /*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
@@ -4108,6 +4100,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 
 	cpu = get_cpu();
 	vmx_vcpu_load(&vmx->vcpu, cpu);
+	vmx->vcpu.cpu = cpu;
 	err = vmx_vcpu_setup(vmx);
 	vmx_vcpu_put(&vmx->vcpu);
 	put_cpu();
@@ -4347,6 +4340,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
 	.write_tsc_offset = vmx_write_tsc_offset,
+	.adjust_tsc_offset = vmx_adjust_tsc_offset,
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a8dee58..468fafa 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -973,9 +973,9 @@ static int kvm_write_guest_time(struct kvm_vcpu *v)
 		return 1;
 	}
 
-	if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
+	if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
 		kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
-		vcpu->hv_clock_tsc_khz = this_tsc_khz;
+		vcpu->hw_tsc_khz = this_tsc_khz;
 	}
 
 	/* With all the info we got, fill in the values */
@@ -1866,13 +1866,24 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
-	kvm_request_guest_time_update(vcpu);
+	if (unlikely(vcpu->cpu != cpu)) {
+		/* Make sure TSC doesn't go backwards */
+		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
+				native_read_tsc() - vcpu->arch.last_host_tsc;
+		if (tsc_delta < 0)
+			mark_tsc_unstable("KVM discovered backwards TSC");
+		if (check_tsc_unstable())
+			kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
+		kvm_migrate_timers(vcpu);
+		vcpu->cpu = cpu;
+	}
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	kvm_x86_ops->vcpu_put(vcpu);
 	kvm_put_guest_fpu(vcpu);
+	vcpu->arch.last_host_tsc = native_read_tsc();
 }
 
 static int is_efer_nx(void)
-- 
cgit v0.10.2


From 48434c20e18d59001469699fcaaf9cf30b815a20 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@redhat.com>
Date: Thu, 19 Aug 2010 22:07:24 -1000
Subject: KVM: x86: Fix deep C-state TSC desynchronization

When CPUs with unstable TSCs enter deep C-state, TSC may stop
running.  This causes us to require resynchronization.  Since
we can't tell when this may potentially happen, we assume the
worst by forcing re-compensation for it at every point the VCPU
task is descheduled.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 468fafa..9396b3f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1866,7 +1866,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
-	if (unlikely(vcpu->cpu != cpu)) {
+	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
 		/* Make sure TSC doesn't go backwards */
 		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
 				native_read_tsc() - vcpu->arch.last_host_tsc;
-- 
cgit v0.10.2


From 759379dd68c2885d1fafa433083d4487e710a685 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@redhat.com>
Date: Thu, 19 Aug 2010 22:07:25 -1000
Subject: KVM: x86: Add helper functions for time computation

Add a helper function to compute the kernel time and convert nanoseconds
back to CPU specific cycles.  Note that these must not be called in preemptible
context, as that would mean the kernel could enter software suspend state,
which would cause non-atomic operation.

Also, convert the KVM_SET_CLOCK / KVM_GET_CLOCK ioctls to use the kernel
time helper, these should be bootbased as well.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9396b3f..4bcb120 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -893,6 +893,16 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
 		 hv_clock->tsc_to_system_mul);
 }
 
+static inline u64 get_kernel_ns(void)
+{
+	struct timespec ts;
+
+	WARN_ON(preemptible());
+	ktime_get_ts(&ts);
+	monotonic_to_bootbased(&ts);
+	return timespec_to_ns(&ts);
+}
+
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 
 static inline int kvm_tsc_changes_freq(void)
@@ -904,18 +914,24 @@ static inline int kvm_tsc_changes_freq(void)
 	return ret;
 }
 
+static inline u64 nsec_to_cycles(u64 nsec)
+{
+	WARN_ON(preemptible());
+	if (kvm_tsc_changes_freq())
+		printk_once(KERN_WARNING
+		 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
+	return (nsec * __get_cpu_var(cpu_tsc_khz)) / USEC_PER_SEC;
+}
+
 void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 {
 	struct kvm *kvm = vcpu->kvm;
 	u64 offset, ns, elapsed;
 	unsigned long flags;
-	struct timespec ts;
 
 	spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 	offset = data - native_read_tsc();
-	ktime_get_ts(&ts);
-	monotonic_to_bootbased(&ts);
-	ns = timespec_to_ns(&ts);
+	ns = get_kernel_ns();
 	elapsed = ns - kvm->arch.last_tsc_nsec;
 
 	/*
@@ -931,10 +947,9 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 			offset = kvm->arch.last_tsc_offset;
 			pr_debug("kvm: matched tsc offset for %llu\n", data);
 		} else {
-			u64 tsc_delta = elapsed * __get_cpu_var(cpu_tsc_khz);
-			tsc_delta = tsc_delta / USEC_PER_SEC;
-			offset += tsc_delta;
-			pr_debug("kvm: adjusted tsc offset by %llu\n", tsc_delta);
+			u64 delta = nsec_to_cycles(elapsed);
+			offset += delta;
+			pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
 		}
 		ns = kvm->arch.last_tsc_nsec;
 	}
@@ -951,11 +966,11 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
 static int kvm_write_guest_time(struct kvm_vcpu *v)
 {
-	struct timespec ts;
 	unsigned long flags;
 	struct kvm_vcpu_arch *vcpu = &v->arch;
 	void *shared_kaddr;
 	unsigned long this_tsc_khz;
+	s64 kernel_ns;
 
 	if ((!vcpu->time_page))
 		return 0;
@@ -963,8 +978,7 @@ static int kvm_write_guest_time(struct kvm_vcpu *v)
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
 	kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
-	ktime_get_ts(&ts);
-	monotonic_to_bootbased(&ts);
+	kernel_ns = get_kernel_ns();
 	this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
 	local_irq_restore(flags);
 
@@ -979,9 +993,7 @@ static int kvm_write_guest_time(struct kvm_vcpu *v)
 	}
 
 	/* With all the info we got, fill in the values */
-	vcpu->hv_clock.system_time = ts.tv_nsec +
-				     (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
-
+	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
 	vcpu->hv_clock.flags = 0;
 
 	/*
@@ -3263,7 +3275,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_SET_CLOCK: {
-		struct timespec now;
 		struct kvm_clock_data user_ns;
 		u64 now_ns;
 		s64 delta;
@@ -3277,19 +3288,16 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			goto out;
 
 		r = 0;
-		ktime_get_ts(&now);
-		now_ns = timespec_to_ns(&now);
+		now_ns = get_kernel_ns();
 		delta = user_ns.clock - now_ns;
 		kvm->arch.kvmclock_offset = delta;
 		break;
 	}
 	case KVM_GET_CLOCK: {
-		struct timespec now;
 		struct kvm_clock_data user_ns;
 		u64 now_ns;
 
-		ktime_get_ts(&now);
-		now_ns = timespec_to_ns(&now);
+		now_ns = get_kernel_ns();
 		user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
 		user_ns.flags = 0;
 
-- 
cgit v0.10.2


From 46543ba45fc4b64ca32655efdc8d9c599b4164e2 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@redhat.com>
Date: Thu, 19 Aug 2010 22:07:26 -1000
Subject: KVM: x86: Robust TSC compensation

Make the match of TSC find TSC writes that are close to each other
instead of perfectly identical; this allows the compensator to also
work in migration / suspend scenarios.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4bcb120..4ff0c27 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -928,21 +928,27 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 	struct kvm *kvm = vcpu->kvm;
 	u64 offset, ns, elapsed;
 	unsigned long flags;
+	s64 sdiff;
 
 	spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 	offset = data - native_read_tsc();
 	ns = get_kernel_ns();
 	elapsed = ns - kvm->arch.last_tsc_nsec;
+	sdiff = data - kvm->arch.last_tsc_write;
+	if (sdiff < 0)
+		sdiff = -sdiff;
 
 	/*
-	 * Special case: identical write to TSC within 5 seconds of
+	 * Special case: close write to TSC within 5 seconds of
 	 * another CPU is interpreted as an attempt to synchronize
-	 * (the 5 seconds is to accomodate host load / swapping).
+	 * The 5 seconds is to accomodate host load / swapping as
+	 * well as any reset of TSC during the boot process.
 	 *
 	 * In that case, for a reliable TSC, we can match TSC offsets,
-	 * or make a best guest using kernel_ns value.
+	 * or make a best guest using elapsed value.
 	 */
-	if (data == kvm->arch.last_tsc_write && elapsed < 5ULL * NSEC_PER_SEC) {
+	if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) &&
+	    elapsed < 5ULL * NSEC_PER_SEC) {
 		if (!check_tsc_unstable()) {
 			offset = kvm->arch.last_tsc_offset;
 			pr_debug("kvm: matched tsc offset for %llu\n", data);
-- 
cgit v0.10.2


From ca84d1a24c376e0841f35db08dab7b829c8c0b1e Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@redhat.com>
Date: Thu, 19 Aug 2010 22:07:28 -1000
Subject: KVM: x86: Add clock sync request to hardware enable

If there are active VCPUs which are marked as belonging to
a particular hardware CPU, request a clock sync for them when
enabling hardware; the TSC could be desynchronized on a newly
arriving CPU, and we need to recompute guests system time
relative to boot after a suspend event.

This covers both cases.

Note that it is acceptable to take the spinlock, as either
no other tasks will be running and no locks held (BSP after
resume), or other tasks will be guaranteed to drop the lock
relatively quickly (AP on CPU_STARTING).

Noting we now get clock synchronization requests for VCPUs
which are starting up (or restarting), it is tempting to
attempt to remove the arch/x86/kvm/x86.c CPU hot-notifiers
at this time, however it is not correct to do so; they are
required for systems with non-constant TSC as the frequency
may not be known immediately after the processor has started
until the cpufreq driver has had a chance to run and query
the chipset.

Updated: implement better locking semantics for hardware_enable

Removed the hack of dropping and retaking the lock by adding the
semantic that we always hold kvm_lock when hardware_enable is
called.  The one place that doesn't need to worry about it is
resume, as resuming a frozen CPU, the spinlock won't be taken.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4ff0c27..d0764a2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5533,7 +5533,15 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 
 int kvm_arch_hardware_enable(void *garbage)
 {
+	struct kvm *kvm;
+	struct kvm_vcpu *vcpu;
+	int i;
+
 	kvm_shared_msr_cpu_online();
+	list_for_each_entry(kvm, &vm_list, vm_list)
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			if (vcpu->cpu == smp_processor_id())
+				kvm_request_guest_time_update(vcpu);
 	return kvm_x86_ops->hardware_enable(garbage);
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5186e72..da117a6 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1961,7 +1961,9 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
 	case CPU_STARTING:
 		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
 		       cpu);
+		spin_lock(&kvm_lock);
 		hardware_enable(NULL);
+		spin_unlock(&kvm_lock);
 		break;
 	}
 	return NOTIFY_OK;
@@ -2168,8 +2170,10 @@ static int kvm_suspend(struct sys_device *dev, pm_message_t state)
 
 static int kvm_resume(struct sys_device *dev)
 {
-	if (kvm_usage_count)
+	if (kvm_usage_count) {
+		WARN_ON(spin_is_locked(&kvm_lock));
 		hardware_enable(NULL);
+	}
 	return 0;
 }
 
-- 
cgit v0.10.2


From 347bb4448c2155eb2310923ccaa4be5677649003 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@redhat.com>
Date: Thu, 19 Aug 2010 22:07:29 -1000
Subject: x86: pvclock: Move scale_delta into common header

The scale_delta function for shift / multiply with 31-bit
precision moves to a common header so it can be used by both
kernel and kvm module.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index cd02f32..7f7e577 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -12,4 +12,42 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
 			    struct pvclock_vcpu_time_info *vcpu,
 			    struct timespec *ts);
 
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+	u64 product;
+#ifdef __i386__
+	u32 tmp1, tmp2;
+#endif
+
+	if (shift < 0)
+		delta >>= -shift;
+	else
+		delta <<= shift;
+
+#ifdef __i386__
+	__asm__ (
+		"mul  %5       ; "
+		"mov  %4,%%eax ; "
+		"mov  %%edx,%4 ; "
+		"mul  %5       ; "
+		"xor  %5,%5    ; "
+		"add  %4,%%eax ; "
+		"adc  %5,%%edx ; "
+		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
+		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif defined(__x86_64__)
+	__asm__ (
+		"mul %%rdx ; shrd $32,%%rdx,%%rax"
+		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+	return product;
+}
+
 #endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 239427c..bab3b9e 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -82,7 +82,8 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
 static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
 {
 	u64 delta = native_read_tsc() - shadow->tsc_timestamp;
-	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+	return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul,
+				   shadow->tsc_shift);
 }
 
 /*
-- 
cgit v0.10.2


From 1d5f066e0b63271b67eac6d3752f8aa96adcbddb Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@redhat.com>
Date: Thu, 19 Aug 2010 22:07:30 -1000
Subject: KVM: x86: Fix a possible backwards warp of kvmclock

Kernel time, which advances in discrete steps may progress much slower
than TSC.  As a result, when kvmclock is adjusted to a new base, the
apparent time to the guest, which runs at a much higher, nsec scaled
rate based on the current TSC, may have already been observed to have
a larger value (kernel_ns + scaled tsc) than the value to which we are
setting it (kernel_ns + 0).

We must instead compute the clock as potentially observed by the guest
for kernel_ns to make sure it does not go backwards.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5ab1c3f..789e946 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -339,6 +339,8 @@ struct kvm_vcpu_arch {
 	unsigned int time_offset;
 	struct page *time_page;
 	u64 last_host_tsc;
+	u64 last_guest_tsc;
+	u64 last_kernel_ns;
 
 	bool nmi_pending;
 	bool nmi_injected;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d0764a2..d4d33f9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -55,6 +55,7 @@
 #include <asm/mce.h>
 #include <asm/i387.h>
 #include <asm/xcr.h>
+#include <asm/pvclock.h>
 
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS						\
@@ -976,14 +977,15 @@ static int kvm_write_guest_time(struct kvm_vcpu *v)
 	struct kvm_vcpu_arch *vcpu = &v->arch;
 	void *shared_kaddr;
 	unsigned long this_tsc_khz;
-	s64 kernel_ns;
+	s64 kernel_ns, max_kernel_ns;
+	u64 tsc_timestamp;
 
 	if ((!vcpu->time_page))
 		return 0;
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
-	kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
+	kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
 	kernel_ns = get_kernel_ns();
 	this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
 	local_irq_restore(flags);
@@ -993,13 +995,49 @@ static int kvm_write_guest_time(struct kvm_vcpu *v)
 		return 1;
 	}
 
+	/*
+	 * Time as measured by the TSC may go backwards when resetting the base
+	 * tsc_timestamp.  The reason for this is that the TSC resolution is
+	 * higher than the resolution of the other clock scales.  Thus, many
+	 * possible measurments of the TSC correspond to one measurement of any
+	 * other clock, and so a spread of values is possible.  This is not a
+	 * problem for the computation of the nanosecond clock; with TSC rates
+	 * around 1GHZ, there can only be a few cycles which correspond to one
+	 * nanosecond value, and any path through this code will inevitably
+	 * take longer than that.  However, with the kernel_ns value itself,
+	 * the precision may be much lower, down to HZ granularity.  If the
+	 * first sampling of TSC against kernel_ns ends in the low part of the
+	 * range, and the second in the high end of the range, we can get:
+	 *
+	 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
+	 *
+	 * As the sampling errors potentially range in the thousands of cycles,
+	 * it is possible such a time value has already been observed by the
+	 * guest.  To protect against this, we must compute the system time as
+	 * observed by the guest and ensure the new system time is greater.
+	 */
+	max_kernel_ns = 0;
+	if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
+		max_kernel_ns = vcpu->last_guest_tsc -
+				vcpu->hv_clock.tsc_timestamp;
+		max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
+				    vcpu->hv_clock.tsc_to_system_mul,
+				    vcpu->hv_clock.tsc_shift);
+		max_kernel_ns += vcpu->last_kernel_ns;
+	}
+
 	if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
 		kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
 		vcpu->hw_tsc_khz = this_tsc_khz;
 	}
 
+	if (max_kernel_ns > kernel_ns)
+		kernel_ns = max_kernel_ns;
+
 	/* With all the info we got, fill in the values */
+	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
 	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
+	vcpu->last_kernel_ns = kernel_ns;
 	vcpu->hv_clock.flags = 0;
 
 	/*
@@ -4931,6 +4969,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (hw_breakpoint_active())
 		hw_breakpoint_restore();
 
+	kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+
 	atomic_set(&vcpu->guest_mode, 0);
 	smp_wmb();
 	local_irq_enable();
-- 
cgit v0.10.2


From f392eb2546170e539668a5ab8df6c1254d15a201 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@redhat.com>
Date: Thu, 19 Aug 2010 22:07:33 -1000
Subject: KVM: x86: Add timekeeping documentation

Basic informational document about x86 timekeeping and how KVM
is affected.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/Documentation/kvm/timekeeping.txt b/Documentation/kvm/timekeeping.txt
new file mode 100644
index 0000000..0c5033a
--- /dev/null
+++ b/Documentation/kvm/timekeeping.txt
@@ -0,0 +1,612 @@
+
+	Timekeeping Virtualization for X86-Based Architectures
+
+	Zachary Amsden <zamsden@redhat.com>
+	Copyright (c) 2010, Red Hat.  All rights reserved.
+
+1) Overview
+2) Timing Devices
+3) TSC Hardware
+4) Virtualization Problems
+
+=========================================================================
+
+1) Overview
+
+One of the most complicated parts of the X86 platform, and specifically,
+the virtualization of this platform is the plethora of timing devices available
+and the complexity of emulating those devices.  In addition, virtualization of
+time introduces a new set of challenges because it introduces a multiplexed
+division of time beyond the control of the guest CPU.
+
+First, we will describe the various timekeeping hardware available, then
+present some of the problems which arise and solutions available, giving
+specific recommendations for certain classes of KVM guests.
+
+The purpose of this document is to collect data and information relevant to
+timekeeping which may be difficult to find elsewhere, specifically,
+information relevant to KVM and hardware-based virtualization.
+
+=========================================================================
+
+2) Timing Devices
+
+First we discuss the basic hardware devices available.  TSC and the related
+KVM clock are special enough to warrant a full exposition and are described in
+the following section.
+
+2.1) i8254 - PIT
+
+One of the first timer devices available is the programmable interrupt timer,
+or PIT.  The PIT has a fixed frequency 1.193182 MHz base clock and three
+channels which can be programmed to deliver periodic or one-shot interrupts.
+These three channels can be configured in different modes and have individual
+counters.  Channel 1 and 2 were not available for general use in the original
+IBM PC, and historically were connected to control RAM refresh and the PC
+speaker.  Now the PIT is typically integrated as part of an emulated chipset
+and a separate physical PIT is not used.
+
+The PIT uses I/O ports 0x40 - 0x43.  Access to the 16-bit counters is done
+using single or multiple byte access to the I/O ports.  There are 6 modes
+available, but not all modes are available to all timers, as only timer 2
+has a connected gate input, required for modes 1 and 5.  The gate line is
+controlled by port 61h, bit 0, as illustrated in the following diagram.
+
+ --------------             ----------------
+|              |           |                |
+|  1.1932 MHz  |---------->| CLOCK      OUT | ---------> IRQ 0
+|    Clock     |   |       |                |
+ --------------    |    +->| GATE  TIMER 0  |
+                   |        ----------------
+                   |
+                   |        ----------------
+                   |       |                |
+                   |------>| CLOCK      OUT | ---------> 66.3 KHZ DRAM
+                   |       |                |            (aka /dev/null)
+                   |    +->| GATE  TIMER 1  |
+                   |        ----------------
+                   |
+                   |        ----------------
+                   |       |                |
+                   |------>| CLOCK      OUT | ---------> Port 61h, bit 5
+                           |                |      |
+Port 61h, bit 0 ---------->| GATE  TIMER 2  |       \_.----   ____
+                            ----------------         _|    )--|LPF|---Speaker
+                                                    / *----   \___/
+Port 61h, bit 1 -----------------------------------/
+
+The timer modes are now described.
+
+Mode 0: Single Timeout.   This is a one-shot software timeout that counts down
+ when the gate is high (always true for timers 0 and 1).  When the count
+ reaches zero, the output goes high.
+
+Mode 1: Triggered One-shot.  The output is intially set high.  When the gate
+ line is set high, a countdown is initiated (which does not stop if the gate is
+ lowered), during which the output is set low.  When the count reaches zero,
+ the output goes high.
+
+Mode 2: Rate Generator.  The output is initially set high.  When the countdown
+ reaches 1, the output goes low for one count and then returns high.  The value
+ is reloaded and the countdown automatically resumes.  If the gate line goes
+ low, the count is halted.  If the output is low when the gate is lowered, the
+ output automatically goes high (this only affects timer 2).
+
+Mode 3: Square Wave.   This generates a high / low square wave.  The count
+ determines the length of the pulse, which alternates between high and low
+ when zero is reached.  The count only proceeds when gate is high and is
+ automatically reloaded on reaching zero.  The count is decremented twice at
+ each clock to generate a full high / low cycle at the full periodic rate.
+ If the count is even, the clock remains high for N/2 counts and low for N/2
+ counts; if the clock is odd, the clock is high for (N+1)/2 counts and low
+ for (N-1)/2 counts.  Only even values are latched by the counter, so odd
+ values are not observed when reading.  This is the intended mode for timer 2,
+ which generates sine-like tones by low-pass filtering the square wave output.
+
+Mode 4: Software Strobe.  After programming this mode and loading the counter,
+ the output remains high until the counter reaches zero.  Then the output
+ goes low for 1 clock cycle and returns high.  The counter is not reloaded.
+ Counting only occurs when gate is high.
+
+Mode 5: Hardware Strobe.  After programming and loading the counter, the
+ output remains high.  When the gate is raised, a countdown is initiated
+ (which does not stop if the gate is lowered).  When the counter reaches zero,
+ the output goes low for 1 clock cycle and then returns high.  The counter is
+ not reloaded.
+
+In addition to normal binary counting, the PIT supports BCD counting.  The
+command port, 0x43 is used to set the counter and mode for each of the three
+timers.
+
+PIT commands, issued to port 0x43, using the following bit encoding:
+
+Bit 7-4: Command (See table below)
+Bit 3-1: Mode (000 = Mode 0, 101 = Mode 5, 11X = undefined)
+Bit 0  : Binary (0) / BCD (1)
+
+Command table:
+
+0000 - Latch Timer 0 count for port 0x40
+	sample and hold the count to be read in port 0x40;
+	additional commands ignored until counter is read;
+	mode bits ignored.
+
+0001 - Set Timer 0 LSB mode for port 0x40
+	set timer to read LSB only and force MSB to zero;
+	mode bits set timer mode
+
+0010 - Set Timer 0 MSB mode for port 0x40
+	set timer to read MSB only and force LSB to zero;
+	mode bits set timer mode
+
+0011 - Set Timer 0 16-bit mode for port 0x40
+	set timer to read / write LSB first, then MSB;
+	mode bits set timer mode
+
+0100 - Latch Timer 1 count for port 0x41 - as described above
+0101 - Set Timer 1 LSB mode for port 0x41 - as described above
+0110 - Set Timer 1 MSB mode for port 0x41 - as described above
+0111 - Set Timer 1 16-bit mode for port 0x41 - as described above
+
+1000 - Latch Timer 2 count for port 0x42 - as described above
+1001 - Set Timer 2 LSB mode for port 0x42 - as described above
+1010 - Set Timer 2 MSB mode for port 0x42 - as described above
+1011 - Set Timer 2 16-bit mode for port 0x42 as described above
+
+1101 - General counter latch
+	Latch combination of counters into corresponding ports
+	Bit 3 = Counter 2
+	Bit 2 = Counter 1
+	Bit 1 = Counter 0
+	Bit 0 = Unused
+
+1110 - Latch timer status
+	Latch combination of counter mode into corresponding ports
+	Bit 3 = Counter 2
+	Bit 2 = Counter 1
+	Bit 1 = Counter 0
+
+	The output of ports 0x40-0x42 following this command will be:
+
+	Bit 7 = Output pin
+	Bit 6 = Count loaded (0 if timer has expired)
+	Bit 5-4 = Read / Write mode
+	    01 = MSB only
+	    10 = LSB only
+	    11 = LSB / MSB (16-bit)
+	Bit 3-1 = Mode
+	Bit 0 = Binary (0) / BCD mode (1)
+
+2.2) RTC
+
+The second device which was available in the original PC was the MC146818 real
+time clock.  The original device is now obsolete, and usually emulated by the
+system chipset, sometimes by an HPET and some frankenstein IRQ routing.
+
+The RTC is accessed through CMOS variables, which uses an index register to
+control which bytes are read.  Since there is only one index register, read
+of the CMOS and read of the RTC require lock protection (in addition, it is
+dangerous to allow userspace utilities such as hwclock to have direct RTC
+access, as they could corrupt kernel reads and writes of CMOS memory).
+
+The RTC generates an interrupt which is usually routed to IRQ 8.  The interrupt
+can function as a periodic timer, an additional once a day alarm, and can issue
+interrupts after an update of the CMOS registers by the MC146818 is complete.
+The type of interrupt is signalled in the RTC status registers.
+
+The RTC will update the current time fields by battery power even while the
+system is off.  The current time fields should not be read while an update is
+in progress, as indicated in the status register.
+
+The clock uses a 32.768kHz crystal, so bits 6-4 of register A should be
+programmed to a 32kHz divider if the RTC is to count seconds.
+
+This is the RAM map originally used for the RTC/CMOS:
+
+Location    Size    Description
+------------------------------------------
+00h         byte    Current second (BCD)
+01h         byte    Seconds alarm (BCD)
+02h         byte    Current minute (BCD)
+03h         byte    Minutes alarm (BCD)
+04h         byte    Current hour (BCD)
+05h         byte    Hours alarm (BCD)
+06h         byte    Current day of week (BCD)
+07h         byte    Current day of month (BCD)
+08h         byte    Current month (BCD)
+09h         byte    Current year (BCD)
+0Ah         byte    Register A
+                       bit 7   = Update in progress
+                       bit 6-4 = Divider for clock
+                                  000 = 4.194 MHz
+                                  001 = 1.049 MHz
+                                  010 = 32 kHz
+                                  10X = test modes
+                                  110 = reset / disable
+                                  111 = reset / disable
+                       bit 3-0 = Rate selection for periodic interrupt
+                                  000 = periodic timer disabled
+                                  001 = 3.90625 uS
+                                  010 = 7.8125 uS
+                                  011 = .122070 mS
+                                  100 = .244141 mS
+                                     ...
+                                 1101 = 125 mS
+                                 1110 = 250 mS
+                                 1111 = 500 mS
+0Bh         byte    Register B
+                       bit 7   = Run (0) / Halt (1)
+                       bit 6   = Periodic interrupt enable
+                       bit 5   = Alarm interrupt enable
+                       bit 4   = Update-ended interrupt enable
+                       bit 3   = Square wave interrupt enable
+                       bit 2   = BCD calendar (0) / Binary (1)
+                       bit 1   = 12-hour mode (0) / 24-hour mode (1)
+                       bit 0   = 0 (DST off) / 1 (DST enabled)
+OCh         byte    Register C (read only)
+                       bit 7   = interrupt request flag (IRQF)
+                       bit 6   = periodic interrupt flag (PF)
+                       bit 5   = alarm interrupt flag (AF)
+                       bit 4   = update interrupt flag (UF)
+                       bit 3-0 = reserved
+ODh         byte    Register D (read only)
+                       bit 7   = RTC has power
+                       bit 6-0 = reserved
+32h         byte    Current century BCD (*)
+  (*) location vendor specific and now determined from ACPI global tables
+
+2.3) APIC
+
+On Pentium and later processors, an on-board timer is available to each CPU
+as part of the Advanced Programmable Interrupt Controller.  The APIC is
+accessed through memory-mapped registers and provides interrupt service to each
+CPU, used for IPIs and local timer interrupts.
+
+Although in theory the APIC is a safe and stable source for local interrupts,
+in practice, many bugs and glitches have occurred due to the special nature of
+the APIC CPU-local memory-mapped hardware.  Beware that CPU errata may affect
+the use of the APIC and that workarounds may be required.  In addition, some of
+these workarounds pose unique constraints for virtualization - requiring either
+extra overhead incurred from extra reads of memory-mapped I/O or additional
+functionality that may be more computationally expensive to implement.
+
+Since the APIC is documented quite well in the Intel and AMD manuals, we will
+avoid repetition of the detail here.  It should be pointed out that the APIC
+timer is programmed through the LVT (local vector timer) register, is capable
+of one-shot or periodic operation, and is based on the bus clock divided down
+by the programmable divider register.
+
+2.4) HPET
+
+HPET is quite complex, and was originally intended to replace the PIT / RTC
+support of the X86 PC.  It remains to be seen whether that will be the case, as
+the de facto standard of PC hardware is to emulate these older devices.  Some
+systems designated as legacy free may support only the HPET as a hardware timer
+device.
+
+The HPET spec is rather loose and vague, requiring at least 3 hardware timers,
+but allowing implementation freedom to support many more.  It also imposes no
+fixed rate on the timer frequency, but does impose some extremal values on
+frequency, error and slew.
+
+In general, the HPET is recommended as a high precision (compared to PIT /RTC)
+time source which is independent of local variation (as there is only one HPET
+in any given system).  The HPET is also memory-mapped, and its presence is
+indicated through ACPI tables by the BIOS.
+
+Detailed specification of the HPET is beyond the current scope of this
+document, as it is also very well documented elsewhere.
+
+2.5) Offboard Timers
+
+Several cards, both proprietary (watchdog boards) and commonplace (e1000) have
+timing chips built into the cards which may have registers which are accessible
+to kernel or user drivers.  To the author's knowledge, using these to generate
+a clocksource for a Linux or other kernel has not yet been attempted and is in
+general frowned upon as not playing by the agreed rules of the game.  Such a
+timer device would require additional support to be virtualized properly and is
+not considered important at this time as no known operating system does this.
+
+=========================================================================
+
+3) TSC Hardware
+
+The TSC or time stamp counter is relatively simple in theory; it counts
+instruction cycles issued by the processor, which can be used as a measure of
+time.  In practice, due to a number of problems, it is the most complicated
+timekeeping device to use.
+
+The TSC is represented internally as a 64-bit MSR which can be read with the
+RDMSR, RDTSC, or RDTSCP (when available) instructions.  In the past, hardware
+limitations made it possible to write the TSC, but generally on old hardware it
+was only possible to write the low 32-bits of the 64-bit counter, and the upper
+32-bits of the counter were cleared.  Now, however, on Intel processors family
+0Fh, for models 3, 4 and 6, and family 06h, models e and f, this restriction
+has been lifted and all 64-bits are writable.  On AMD systems, the ability to
+write the TSC MSR is not an architectural guarantee.
+
+The TSC is accessible from CPL-0 and conditionally, for CPL > 0 software by
+means of the CR4.TSD bit, which when enabled, disables CPL > 0 TSC access.
+
+Some vendors have implemented an additional instruction, RDTSCP, which returns
+atomically not just the TSC, but an indicator which corresponds to the
+processor number.  This can be used to index into an array of TSC variables to
+determine offset information in SMP systems where TSCs are not synchronized.
+The presence of this instruction must be determined by consulting CPUID feature
+bits.
+
+Both VMX and SVM provide extension fields in the virtualization hardware which
+allows the guest visible TSC to be offset by a constant.  Newer implementations
+promise to allow the TSC to additionally be scaled, but this hardware is not
+yet widely available.
+
+3.1) TSC synchronization
+
+The TSC is a CPU-local clock in most implementations.  This means, on SMP
+platforms, the TSCs of different CPUs may start at different times depending
+on when the CPUs are powered on.  Generally, CPUs on the same die will share
+the same clock, however, this is not always the case.
+
+The BIOS may attempt to resynchronize the TSCs during the poweron process and
+the operating system or other system software may attempt to do this as well.
+Several hardware limitations make the problem worse - if it is not possible to
+write the full 64-bits of the TSC, it may be impossible to match the TSC in
+newly arriving CPUs to that of the rest of the system, resulting in
+unsynchronized TSCs.  This may be done by BIOS or system software, but in
+practice, getting a perfectly synchronized TSC will not be possible unless all
+values are read from the same clock, which generally only is possible on single
+socket systems or those with special hardware support.
+
+3.2) TSC and CPU hotplug
+
+As touched on already, CPUs which arrive later than the boot time of the system
+may not have a TSC value that is synchronized with the rest of the system.
+Either system software, BIOS, or SMM code may actually try to establish the TSC
+to a value matching the rest of the system, but a perfect match is usually not
+a guarantee.  This can have the effect of bringing a system from a state where
+TSC is synchronized back to a state where TSC synchronization flaws, however
+small, may be exposed to the OS and any virtualization environment.
+
+3.3) TSC and multi-socket / NUMA
+
+Multi-socket systems, especially large multi-socket systems are likely to have
+individual clocksources rather than a single, universally distributed clock.
+Since these clocks are driven by different crystals, they will not have
+perfectly matched frequency, and temperature and electrical variations will
+cause the CPU clocks, and thus the TSCs to drift over time.  Depending on the
+exact clock and bus design, the drift may or may not be fixed in absolute
+error, and may accumulate over time.
+
+In addition, very large systems may deliberately slew the clocks of individual
+cores.  This technique, known as spread-spectrum clocking, reduces EMI at the
+clock frequency and harmonics of it, which may be required to pass FCC
+standards for telecommunications and computer equipment.
+
+It is recommended not to trust the TSCs to remain synchronized on NUMA or
+multiple socket systems for these reasons.
+
+3.4) TSC and C-states
+
+C-states, or idling states of the processor, especially C1E and deeper sleep
+states may be problematic for TSC as well.  The TSC may stop advancing in such
+a state, resulting in a TSC which is behind that of other CPUs when execution
+is resumed.  Such CPUs must be detected and flagged by the operating system
+based on CPU and chipset identifications.
+
+The TSC in such a case may be corrected by catching it up to a known external
+clocksource.
+
+3.5) TSC frequency change / P-states
+
+To make things slightly more interesting, some CPUs may change frequency.  They
+may or may not run the TSC at the same rate, and because the frequency change
+may be staggered or slewed, at some points in time, the TSC rate may not be
+known other than falling within a range of values.  In this case, the TSC will
+not be a stable time source, and must be calibrated against a known, stable,
+external clock to be a usable source of time.
+
+Whether the TSC runs at a constant rate or scales with the P-state is model
+dependent and must be determined by inspecting CPUID, chipset or vendor
+specific MSR fields.
+
+In addition, some vendors have known bugs where the P-state is actually
+compensated for properly during normal operation, but when the processor is
+inactive, the P-state may be raised temporarily to service cache misses from
+other processors.  In such cases, the TSC on halted CPUs could advance faster
+than that of non-halted processors.  AMD Turion processors are known to have
+this problem.
+
+3.6) TSC and STPCLK / T-states
+
+External signals given to the processor may also have the effect of stopping
+the TSC.  This is typically done for thermal emergency power control to prevent
+an overheating condition, and typically, there is no way to detect that this
+condition has happened.
+
+3.7) TSC virtualization - VMX
+
+VMX provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP
+instructions, which is enough for full virtualization of TSC in any manner.  In
+addition, VMX allows passing through the host TSC plus an additional TSC_OFFSET
+field specified in the VMCS.  Special instructions must be used to read and
+write the VMCS field.
+
+3.8) TSC virtualization - SVM
+
+SVM provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP
+instructions, which is enough for full virtualization of TSC in any manner.  In
+addition, SVM allows passing through the host TSC plus an additional offset
+field specified in the SVM control block.
+
+3.9) TSC feature bits in Linux
+
+In summary, there is no way to guarantee the TSC remains in perfect
+synchronization unless it is explicitly guaranteed by the architecture.  Even
+if so, the TSCs in multi-sockets or NUMA systems may still run independently
+despite being locally consistent.
+
+The following feature bits are used by Linux to signal various TSC attributes,
+but they can only be taken to be meaningful for UP or single node systems.
+
+X86_FEATURE_TSC 		: The TSC is available in hardware
+X86_FEATURE_RDTSCP		: The RDTSCP instruction is available
+X86_FEATURE_CONSTANT_TSC 	: The TSC rate is unchanged with P-states
+X86_FEATURE_NONSTOP_TSC		: The TSC does not stop in C-states
+X86_FEATURE_TSC_RELIABLE	: TSC sync checks are skipped (VMware)
+
+4) Virtualization Problems
+
+Timekeeping is especially problematic for virtualization because a number of
+challenges arise.  The most obvious problem is that time is now shared between
+the host and, potentially, a number of virtual machines.  Thus the virtual
+operating system does not run with 100% usage of the CPU, despite the fact that
+it may very well make that assumption.  It may expect it to remain true to very
+exacting bounds when interrupt sources are disabled, but in reality only its
+virtual interrupt sources are disabled, and the machine may still be preempted
+at any time.  This causes problems as the passage of real time, the injection
+of machine interrupts and the associated clock sources are no longer completely
+synchronized with real time.
+
+This same problem can occur on native harware to a degree, as SMM mode may
+steal cycles from the naturally on X86 systems when SMM mode is used by the
+BIOS, but not in such an extreme fashion.  However, the fact that SMM mode may
+cause similar problems to virtualization makes it a good justification for
+solving many of these problems on bare metal.
+
+4.1) Interrupt clocking
+
+One of the most immediate problems that occurs with legacy operating systems
+is that the system timekeeping routines are often designed to keep track of
+time by counting periodic interrupts.  These interrupts may come from the PIT
+or the RTC, but the problem is the same: the host virtualization engine may not
+be able to deliver the proper number of interrupts per second, and so guest
+time may fall behind.  This is especially problematic if a high interrupt rate
+is selected, such as 1000 HZ, which is unfortunately the default for many Linux
+guests.
+
+There are three approaches to solving this problem; first, it may be possible
+to simply ignore it.  Guests which have a separate time source for tracking
+'wall clock' or 'real time' may not need any adjustment of their interrupts to
+maintain proper time.  If this is not sufficient, it may be necessary to inject
+additional interrupts into the guest in order to increase the effective
+interrupt rate.  This approach leads to complications in extreme conditions,
+where host load or guest lag is too much to compensate for, and thus another
+solution to the problem has risen: the guest may need to become aware of lost
+ticks and compensate for them internally.  Although promising in theory, the
+implementation of this policy in Linux has been extremely error prone, and a
+number of buggy variants of lost tick compensation are distributed across
+commonly used Linux systems.
+
+Windows uses periodic RTC clocking as a means of keeping time internally, and
+thus requires interrupt slewing to keep proper time.  It does use a low enough
+rate (ed: is it 18.2 Hz?) however that it has not yet been a problem in
+practice.
+
+4.2) TSC sampling and serialization
+
+As the highest precision time source available, the cycle counter of the CPU
+has aroused much interest from developers.  As explained above, this timer has
+many problems unique to its nature as a local, potentially unstable and
+potentially unsynchronized source.  One issue which is not unique to the TSC,
+but is highlighted because of its very precise nature is sampling delay.  By
+definition, the counter, once read is already old.  However, it is also
+possible for the counter to be read ahead of the actual use of the result.
+This is a consequence of the superscalar execution of the instruction stream,
+which may execute instructions out of order.  Such execution is called
+non-serialized.  Forcing serialized execution is necessary for precise
+measurement with the TSC, and requires a serializing instruction, such as CPUID
+or an MSR read.
+
+Since CPUID may actually be virtualized by a trap and emulate mechanism, this
+serialization can pose a performance issue for hardware virtualization.  An
+accurate time stamp counter reading may therefore not always be available, and
+it may be necessary for an implementation to guard against "backwards" reads of
+the TSC as seen from other CPUs, even in an otherwise perfectly synchronized
+system.
+
+4.3) Timespec aliasing
+
+Additionally, this lack of serialization from the TSC poses another challenge
+when using results of the TSC when measured against another time source.  As
+the TSC is much higher precision, many possible values of the TSC may be read
+while another clock is still expressing the same value.
+
+That is, you may read (T,T+10) while external clock C maintains the same value.
+Due to non-serialized reads, you may actually end up with a range which
+fluctuates - from (T-1.. T+10).  Thus, any time calculated from a TSC, but
+calibrated against an external value may have a range of valid values.
+Re-calibrating this computation may actually cause time, as computed after the
+calibration, to go backwards, compared with time computed before the
+calibration.
+
+This problem is particularly pronounced with an internal time source in Linux,
+the kernel time, which is expressed in the theoretically high resolution
+timespec - but which advances in much larger granularity intervals, sometimes
+at the rate of jiffies, and possibly in catchup modes, at a much larger step.
+
+This aliasing requires care in the computation and recalibration of kvmclock
+and any other values derived from TSC computation (such as TSC virtualization
+itself).
+
+4.4) Migration
+
+Migration of a virtual machine raises problems for timekeeping in two ways.
+First, the migration itself may take time, during which interrupts cannot be
+delivered, and after which, the guest time may need to be caught up.  NTP may
+be able to help to some degree here, as the clock correction required is
+typically small enough to fall in the NTP-correctable window.
+
+An additional concern is that timers based off the TSC (or HPET, if the raw bus
+clock is exposed) may now be running at different rates, requiring compensation
+in some way in the hypervisor by virtualizing these timers.  In addition,
+migrating to a faster machine may preclude the use of a passthrough TSC, as a
+faster clock cannot be made visible to a guest without the potential of time
+advancing faster than usual.  A slower clock is less of a problem, as it can
+always be caught up to the original rate.  KVM clock avoids these problems by
+simply storing multipliers and offsets against the TSC for the guest to convert
+back into nanosecond resolution values.
+
+4.5) Scheduling
+
+Since scheduling may be based on precise timing and firing of interrupts, the
+scheduling algorithms of an operating system may be adversely affected by
+virtualization.  In theory, the effect is random and should be universally
+distributed, but in contrived as well as real scenarios (guest device access,
+causes of virtualization exits, possible context switch), this may not always
+be the case.  The effect of this has not been well studied.
+
+In an attempt to work around this, several implementations have provided a
+paravirtualized scheduler clock, which reveals the true amount of CPU time for
+which a virtual machine has been running.
+
+4.6) Watchdogs
+
+Watchdog timers, such as the lock detector in Linux may fire accidentally when
+running under hardware virtualization due to timer interrupts being delayed or
+misinterpretation of the passage of real time.  Usually, these warnings are
+spurious and can be ignored, but in some circumstances it may be necessary to
+disable such detection.
+
+4.7) Delays and precision timing
+
+Precise timing and delays may not be possible in a virtualized system.  This
+can happen if the system is controlling physical hardware, or issues delays to
+compensate for slower I/O to and from devices.  The first issue is not solvable
+in general for a virtualized system; hardware control software can't be
+adequately virtualized without a full real-time operating system, which would
+require an RT aware virtualization platform.
+
+The second issue may cause performance problems, but this is unlikely to be a
+significant issue.  In many cases these delays may be eliminated through
+configuration or paravirtualization.
+
+4.8) Covert channels and leaks
+
+In addition to the above problems, time information will inevitably leak to the
+guest about the host in anything but a perfect implementation of virtualized
+time.  This may allow the guest to infer the presence of a hypervisor (as in a
+red-pill type detection), and it may allow information to leak between guests
+by using CPU utilization itself as a signalling channel.  Preventing such
+problems would require completely isolated virtual time which may not track
+real time any longer.  This may be useful in certain security or QA contexts,
+but in general isn't recommended for real-world deployment scenarios.
-- 
cgit v0.10.2


From 45888a0c6edc305495b6bd72a30e66bc40b324c6 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sun, 22 Aug 2010 19:08:57 +0800
Subject: export __get_user_pages_fast() function

This function is used by KVM to pin process's page in the atomic context.

Define the 'weak' function to avoid other architecture not support it

Acked-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/mm/util.c b/mm/util.c
index 4735ea4..73dac81 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -245,6 +245,19 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 }
 #endif
 
+/*
+ * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
+ * back to the regular GUP.
+ * If the architecture not support this fucntion, simply return with no
+ * page pinned
+ */
+int __attribute__((weak)) __get_user_pages_fast(unsigned long start,
+				 int nr_pages, int write, struct page **pages)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__get_user_pages_fast);
+
 /**
  * get_user_pages_fast() - pin user pages in memory
  * @start:	starting user address
-- 
cgit v0.10.2


From 887c08ac191efb103e33e589aacbc2ce1a3f131e Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sun, 22 Aug 2010 19:10:28 +0800
Subject: KVM: MMU: introduce hva_to_pfn_atomic function

Introduce hva_to_pfn_atomic(), it's the fast path and can used in atomic
context, the later patch will use it

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c13cc48..307d0e2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -296,6 +296,7 @@ void kvm_release_page_dirty(struct page *page);
 void kvm_set_page_dirty(struct page *page);
 void kvm_set_page_accessed(struct page *page);
 
+pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
 pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
 			 struct kvm_memory_slot *slot, gfn_t gfn);
@@ -518,6 +519,12 @@ static inline void kvm_guest_exit(void)
 	current->flags &= ~PF_VCPU;
 }
 
+static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
+					       gfn_t gfn)
+{
+	return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
+}
+
 static inline gpa_t gfn_to_gpa(gfn_t gfn)
 {
 	return (gpa_t)gfn << PAGE_SHIFT;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index da117a6..08bd304 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -927,11 +927,6 @@ int memslot_id(struct kvm *kvm, gfn_t gfn)
 	return memslot - slots->memslots;
 }
 
-static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
-{
-	return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
-}
-
 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_memory_slot *slot;
@@ -943,19 +938,25 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_hva);
 
-static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
+static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
 {
 	struct page *page[1];
 	int npages;
 	pfn_t pfn;
 
-	might_sleep();
-
-	npages = get_user_pages_fast(addr, 1, 1, page);
+	if (atomic)
+		npages = __get_user_pages_fast(addr, 1, 1, page);
+	else {
+		might_sleep();
+		npages = get_user_pages_fast(addr, 1, 1, page);
+	}
 
 	if (unlikely(npages != 1)) {
 		struct vm_area_struct *vma;
 
+		if (atomic)
+			goto return_fault_page;
+
 		down_read(&current->mm->mmap_sem);
 		if (is_hwpoison_address(addr)) {
 			up_read(&current->mm->mmap_sem);
@@ -968,6 +969,7 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
 		if (vma == NULL || addr < vma->vm_start ||
 		    !(vma->vm_flags & VM_PFNMAP)) {
 			up_read(&current->mm->mmap_sem);
+return_fault_page:
 			get_page(fault_page);
 			return page_to_pfn(fault_page);
 		}
@@ -981,6 +983,12 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
 	return pfn;
 }
 
+pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
+{
+	return hva_to_pfn(kvm, addr, true);
+}
+EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
+
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 {
 	unsigned long addr;
@@ -991,7 +999,7 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 		return page_to_pfn(bad_page);
 	}
 
-	return hva_to_pfn(kvm, addr);
+	return hva_to_pfn(kvm, addr, false);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn);
 
@@ -999,7 +1007,7 @@ pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
 			 struct kvm_memory_slot *slot, gfn_t gfn)
 {
 	unsigned long addr = gfn_to_hva_memslot(slot, gfn);
-	return hva_to_pfn(kvm, addr);
+	return hva_to_pfn(kvm, addr, false);
 }
 
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
-- 
cgit v0.10.2


From 48987781eb1d1e8ded41f55cd5806615fda92c6e Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sun, 22 Aug 2010 19:11:43 +0800
Subject: KVM: MMU: introduce gfn_to_page_many_atomic() function

Introduce this function to get consecutive gfn's pages, it can reduce
gup's overload, used by later patch

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 307d0e2..b837ec8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -289,6 +289,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 void kvm_disable_largepages(void);
 void kvm_arch_flush_shadow(struct kvm *kvm);
 
+int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
+			    int nr_pages);
+
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
 void kvm_release_page_clean(struct page *page);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 08bd304..2eb0b75 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -927,15 +927,25 @@ int memslot_id(struct kvm *kvm, gfn_t gfn)
 	return memslot - slots->memslots;
 }
 
-unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn,
+				     gfn_t *nr_pages)
 {
 	struct kvm_memory_slot *slot;
 
 	slot = gfn_to_memslot(kvm, gfn);
 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
 		return bad_hva();
+
+	if (nr_pages)
+		*nr_pages = slot->npages - (gfn - slot->base_gfn);
+
 	return gfn_to_hva_memslot(slot, gfn);
 }
+
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+{
+	return gfn_to_hva_many(kvm, gfn, NULL);
+}
 EXPORT_SYMBOL_GPL(gfn_to_hva);
 
 static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
@@ -1010,6 +1020,23 @@ pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
 	return hva_to_pfn(kvm, addr, false);
 }
 
+int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
+								  int nr_pages)
+{
+	unsigned long addr;
+	gfn_t entry;
+
+	addr = gfn_to_hva_many(kvm, gfn, &entry);
+	if (kvm_is_error_hva(addr))
+		return -1;
+
+	if (entry < nr_pages)
+		return 0;
+
+	return __get_user_pages_fast(addr, nr_pages, 1, pages);
+}
+EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
+
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 {
 	pfn_t pfn;
-- 
cgit v0.10.2


From 957ed9effd80b04482cbdce8c95bdf803a656b94 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sun, 22 Aug 2010 19:12:48 +0800
Subject: KVM: MMU: prefetch ptes when intercepted guest #PF

Support prefetch ptes when intercept guest #PF, avoid to #PF by later
access

If we meet any failure in the prefetch path, we will exit it and
not try other ptes to avoid become heavy path

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 54a5026..b0037a77 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -89,6 +89,8 @@ module_param(oos_shadow, bool, 0644);
 	}
 #endif
 
+#define PTE_PREFETCH_NUM		8
+
 #define PT_FIRST_AVAIL_BITS_SHIFT 9
 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
 
@@ -400,7 +402,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
 	if (r)
 		goto out;
 	r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
-				   rmap_desc_cache, 4);
+				   rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
 	if (r)
 		goto out;
 	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -2089,6 +2091,105 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 {
 }
 
+static struct kvm_memory_slot *
+pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
+{
+	struct kvm_memory_slot *slot;
+
+	slot = gfn_to_memslot(vcpu->kvm, gfn);
+	if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
+	      (no_dirty_log && slot->dirty_bitmap))
+		slot = NULL;
+
+	return slot;
+}
+
+static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
+				     bool no_dirty_log)
+{
+	struct kvm_memory_slot *slot;
+	unsigned long hva;
+
+	slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log);
+	if (!slot) {
+		get_page(bad_page);
+		return page_to_pfn(bad_page);
+	}
+
+	hva = gfn_to_hva_memslot(slot, gfn);
+
+	return hva_to_pfn_atomic(vcpu->kvm, hva);
+}
+
+static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
+				    struct kvm_mmu_page *sp,
+				    u64 *start, u64 *end)
+{
+	struct page *pages[PTE_PREFETCH_NUM];
+	unsigned access = sp->role.access;
+	int i, ret;
+	gfn_t gfn;
+
+	gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
+	if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK))
+		return -1;
+
+	ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
+	if (ret <= 0)
+		return -1;
+
+	for (i = 0; i < ret; i++, gfn++, start++)
+		mmu_set_spte(vcpu, start, ACC_ALL,
+			     access, 0, 0, 1, NULL,
+			     sp->role.level, gfn,
+			     page_to_pfn(pages[i]), true, true);
+
+	return 0;
+}
+
+static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu_page *sp, u64 *sptep)
+{
+	u64 *spte, *start = NULL;
+	int i;
+
+	WARN_ON(!sp->role.direct);
+
+	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+	spte = sp->spt + i;
+
+	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+		if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
+			if (!start)
+				continue;
+			if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
+				break;
+			start = NULL;
+		} else if (!start)
+			start = spte;
+	}
+}
+
+static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+	struct kvm_mmu_page *sp;
+
+	/*
+	 * Since it's no accessed bit on EPT, it's no way to
+	 * distinguish between actually accessed translations
+	 * and prefetched, so disable pte prefetch if EPT is
+	 * enabled.
+	 */
+	if (!shadow_accessed_mask)
+		return;
+
+	sp = page_header(__pa(sptep));
+	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+		return;
+
+	__direct_pte_prefetch(vcpu, sp, sptep);
+}
+
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 			int level, gfn_t gfn, pfn_t pfn)
 {
@@ -2102,6 +2203,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
 				     0, write, 1, &pt_write,
 				     level, gfn, pfn, false, true);
+			direct_pte_prefetch(vcpu, iterator.sptep);
 			++vcpu->stat.pf_fixed;
 			break;
 		}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 51ef909..872ff26 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -310,6 +310,77 @@ static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
 	return r || curr_pte != gw->ptes[level - 1];
 }
 
+static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+	struct kvm_mmu_page *sp;
+	pt_element_t gptep[PTE_PREFETCH_NUM];
+	gpa_t first_pte_gpa;
+	int offset = 0, i;
+	u64 *spte;
+
+	sp = page_header(__pa(sptep));
+
+	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+		return;
+
+	if (sp->role.direct)
+		return __direct_pte_prefetch(vcpu, sp, sptep);
+
+	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+
+	if (PTTYPE == 32)
+		offset = sp->role.quadrant << PT64_LEVEL_BITS;
+
+	first_pte_gpa = gfn_to_gpa(sp->gfn) +
+				(offset + i) * sizeof(pt_element_t);
+
+	if (kvm_read_guest_atomic(vcpu->kvm, first_pte_gpa, gptep,
+					sizeof(gptep)) < 0)
+		return;
+
+	spte = sp->spt + i;
+
+	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+		pt_element_t gpte;
+		unsigned pte_access;
+		gfn_t gfn;
+		pfn_t pfn;
+		bool dirty;
+
+		if (spte == sptep)
+			continue;
+
+		if (*spte != shadow_trap_nonpresent_pte)
+			continue;
+
+		gpte = gptep[i];
+
+		if (!is_present_gpte(gpte) ||
+		      is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)) {
+			if (!sp->unsync)
+				__set_spte(spte, shadow_notrap_nonpresent_pte);
+			continue;
+		}
+
+		if (!(gpte & PT_ACCESSED_MASK))
+			continue;
+
+		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+		gfn = gpte_to_gfn(gpte);
+		dirty = is_dirty_gpte(gpte);
+		pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
+				      (pte_access & ACC_WRITE_MASK) && dirty);
+		if (is_error_pfn(pfn)) {
+			kvm_release_pfn_clean(pfn);
+			break;
+		}
+
+		mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
+			     dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
+			     pfn, true, true);
+	}
+}
+
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
  */
@@ -391,6 +462,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
 		     user_fault, write_fault, dirty, ptwrite, it.level,
 		     gw->gfn, pfn, false, true);
+	FNAME(pte_prefetch)(vcpu, it.sptep);
 
 	return it.sptep;
 
-- 
cgit v0.10.2


From 189be38db3dde12699a8b9dc22d33e8c95efe110 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sun, 22 Aug 2010 19:13:33 +0800
Subject: KVM: MMU: combine guest pte read between fetch and pte prefetch

Combine guest pte read between guest pte check in the fetch path and pte prefetch

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 872ff26..a4e8389 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -67,6 +67,7 @@ struct guest_walker {
 	int level;
 	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
 	pt_element_t ptes[PT_MAX_FULL_LEVELS];
+	pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
 	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
 	unsigned pt_access;
 	unsigned pte_access;
@@ -302,21 +303,33 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
 				struct guest_walker *gw, int level)
 {
-	int r;
 	pt_element_t curr_pte;
-
-	r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1],
+	gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
+	u64 mask;
+	int r, index;
+
+	if (level == PT_PAGE_TABLE_LEVEL) {
+		mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
+		base_gpa = pte_gpa & ~mask;
+		index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
+
+		r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
+				gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
+		curr_pte = gw->prefetch_ptes[index];
+	} else
+		r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
 				  &curr_pte, sizeof(curr_pte));
+
 	return r || curr_pte != gw->ptes[level - 1];
 }
 
-static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep)
+static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
+				u64 *sptep)
 {
 	struct kvm_mmu_page *sp;
-	pt_element_t gptep[PTE_PREFETCH_NUM];
-	gpa_t first_pte_gpa;
-	int offset = 0, i;
+	pt_element_t *gptep = gw->prefetch_ptes;
 	u64 *spte;
+	int i;
 
 	sp = page_header(__pa(sptep));
 
@@ -327,17 +340,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep)
 		return __direct_pte_prefetch(vcpu, sp, sptep);
 
 	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
-
-	if (PTTYPE == 32)
-		offset = sp->role.quadrant << PT64_LEVEL_BITS;
-
-	first_pte_gpa = gfn_to_gpa(sp->gfn) +
-				(offset + i) * sizeof(pt_element_t);
-
-	if (kvm_read_guest_atomic(vcpu->kvm, first_pte_gpa, gptep,
-					sizeof(gptep)) < 0)
-		return;
-
 	spte = sp->spt + i;
 
 	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
@@ -462,7 +464,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
 		     user_fault, write_fault, dirty, ptwrite, it.level,
 		     gw->gfn, pfn, false, true);
-	FNAME(pte_prefetch)(vcpu, it.sptep);
+	FNAME(pte_prefetch)(vcpu, gw, it.sptep);
 
 	return it.sptep;
 
-- 
cgit v0.10.2


From fc678d67fee1acccf21322318dd833b892a572e4 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 24 Aug 2010 15:48:50 +0200
Subject: KVM: S390: take a full byte as ext_param indicator

Currenty the ext_param field only distinguishes between "config change" and
"vring interrupt". We can do a lot more with it though, so let's enable a
full byte of possible values and constants to #defines while at it.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/s390/include/asm/kvm_virtio.h b/arch/s390/include/asm/kvm_virtio.h
index acdfdff..3f5d100 100644
--- a/arch/s390/include/asm/kvm_virtio.h
+++ b/arch/s390/include/asm/kvm_virtio.h
@@ -54,4 +54,10 @@ struct kvm_vqconfig {
  * This is pagesize for historical reasons. */
 #define KVM_S390_VIRTIO_RING_ALIGN	4096
 
+
+/* These values are supposed to be in ext_params on an interrupt */
+#define VIRTIO_PARAM_MASK		0xff
+#define VIRTIO_PARAM_VRING_INTERRUPT	0x0
+#define VIRTIO_PARAM_CONFIG_CHANGED	0x1
+
 #endif
diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c
index 4e298bc..68cef4d 100644
--- a/drivers/s390/kvm/kvm_virtio.c
+++ b/drivers/s390/kvm/kvm_virtio.c
@@ -334,7 +334,7 @@ static void kvm_extint_handler(u16 code)
 {
 	struct virtqueue *vq;
 	u16 subcode;
-	int config_changed;
+	u32 param;
 
 	subcode = S390_lowcore.cpu_addr;
 	if ((subcode & 0xff00) != VIRTIO_SUBCODE_64)
@@ -343,18 +343,25 @@ static void kvm_extint_handler(u16 code)
 	/* The LSB might be overloaded, we have to mask it */
 	vq = (struct virtqueue *)(S390_lowcore.ext_params2 & ~1UL);
 
-	/* We use the LSB of extparam, to decide, if this interrupt is a config
-	 * change or a "standard" interrupt */
-	config_changed = S390_lowcore.ext_params & 1;
+	/* We use ext_params to decide what this interrupt means */
+	param = S390_lowcore.ext_params & VIRTIO_PARAM_MASK;
 
-	if (config_changed) {
+	switch (param) {
+	case VIRTIO_PARAM_CONFIG_CHANGED:
+	{
 		struct virtio_driver *drv;
 		drv = container_of(vq->vdev->dev.driver,
 				   struct virtio_driver, driver);
 		if (drv->config_changed)
 			drv->config_changed(vq->vdev);
-	} else
+
+		break;
+	}
+	case VIRTIO_PARAM_VRING_INTERRUPT:
+	default:
 		vring_interrupt(0, vq);
+		break;
+	}
 }
 
 /*
-- 
cgit v0.10.2


From cefa33e2f8f7852abb42f22ec25a6084a931c5ac Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 24 Aug 2010 15:48:51 +0200
Subject: KVM: S390: Add virtio hotplug add support

The one big missing feature in s390-virtio was hotplugging. This is no more.
This patch implements hotplug add support, so you can on the fly add new devices
in the guest.

Keep in mind that this needs a patch for qemu to actually leverage the
functionality.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/s390/include/asm/kvm_virtio.h b/arch/s390/include/asm/kvm_virtio.h
index 3f5d100..72f6141 100644
--- a/arch/s390/include/asm/kvm_virtio.h
+++ b/arch/s390/include/asm/kvm_virtio.h
@@ -59,5 +59,6 @@ struct kvm_vqconfig {
 #define VIRTIO_PARAM_MASK		0xff
 #define VIRTIO_PARAM_VRING_INTERRUPT	0x0
 #define VIRTIO_PARAM_CONFIG_CHANGED	0x1
+#define VIRTIO_PARAM_DEV_ADD		0x2
 
 #endif
diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c
index 68cef4d..5a46b8c 100644
--- a/drivers/s390/kvm/kvm_virtio.c
+++ b/drivers/s390/kvm/kvm_virtio.c
@@ -32,6 +32,7 @@
  * The pointer to our (page) of device descriptions.
  */
 static void *kvm_devices;
+struct work_struct hotplug_work;
 
 struct kvm_device {
 	struct virtio_device vdev;
@@ -328,6 +329,47 @@ static void scan_devices(void)
 }
 
 /*
+ * match for a kvm device with a specific desc pointer
+ */
+static int match_desc(struct device *dev, void *data)
+{
+	if ((ulong)to_kvmdev(dev_to_virtio(dev))->desc == (ulong)data)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * hotplug_device tries to find changes in the device page.
+ */
+static void hotplug_devices(struct work_struct *dummy)
+{
+	unsigned int i;
+	struct kvm_device_desc *d;
+	struct device *dev;
+
+	for (i = 0; i < PAGE_SIZE; i += desc_size(d)) {
+		d = kvm_devices + i;
+
+		/* end of list */
+		if (d->type == 0)
+			break;
+
+		/* device already exists */
+		dev = device_find_child(kvm_root, d, match_desc);
+		if (dev) {
+			/* XXX check for hotplug remove */
+			put_device(dev);
+			continue;
+		}
+
+		/* new device */
+		printk(KERN_INFO "Adding new virtio device %p\n", d);
+		add_kvm_device(d, i);
+	}
+}
+
+/*
  * we emulate the request_irq behaviour on top of s390 extints
  */
 static void kvm_extint_handler(u16 code)
@@ -357,6 +399,9 @@ static void kvm_extint_handler(u16 code)
 
 		break;
 	}
+	case VIRTIO_PARAM_DEV_ADD:
+		schedule_work(&hotplug_work);
+		break;
 	case VIRTIO_PARAM_VRING_INTERRUPT:
 	default:
 		vring_interrupt(0, vq);
@@ -390,6 +435,8 @@ static int __init kvm_devices_init(void)
 
 	kvm_devices = (void *) real_memory_size;
 
+	INIT_WORK(&hotplug_work, hotplug_devices);
+
 	ctl_set_bit(0, 9);
 	register_external_interrupt(0x2603, kvm_extint_handler);
 
-- 
cgit v0.10.2


From a3c321c6e27e6134f1e8eece5f9954e8121fdc12 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 24 Aug 2010 15:48:52 +0200
Subject: KVM: S390: Export kvm_virtio.h

As suggested by Christian, we should expose headers to user space with
information that might be valuable there. The s390 virtio interface is
one of those cases. It defines an ABI between hypervisor and guest, so
it should be exposed to user space.

Reported-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index 42e512b..287d7bb 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -5,6 +5,7 @@ header-y += chsc.h
 header-y += cmb.h
 header-y += dasd.h
 header-y += debug.h
+header-y += kvm_virtio.h
 header-y += monwriter.h
 header-y += qeth.h
 header-y += schid.h
-- 
cgit v0.10.2


From cc4feed57fcd4934b89aaac51d64dbff921e2f2b Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Wed, 25 Aug 2010 14:10:53 +0800
Subject: KVM: x86 emulator: add CALL FAR instruction emulation (opcode 9a)

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 312e798..1702ea8 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2455,7 +2455,7 @@ static struct opcode opcode_table[256] = {
 	X8(D(SrcAcc | DstReg)),
 	/* 0x98 - 0x9F */
 	D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
-	D(SrcImmFAddr | No64), N,
+	I(SrcImmFAddr | No64, em_call_far), N,
 	D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N,
 	/* 0xA0 - 0xA7 */
 	D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs),
-- 
cgit v0.10.2


From 6e2fb2cadd9a523ff5494d4c4d53c0d3e0024691 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 25 Aug 2010 12:47:41 +0300
Subject: KVM: x86 emulator: Rename variable that shadows another local
 variable.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 1702ea8..42d42ca 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3421,7 +3421,7 @@ writeback:
 				&c->dst);
 
 	if (c->rep_prefix && (c->d & String)) {
-		struct read_cache *rc = &ctxt->decode.io_read;
+		struct read_cache *r = &ctxt->decode.io_read;
 		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
 		/* The second termination condition only applies for REPE
 		 * and REPNE. Test if the repeat string operation prefix is
@@ -3441,8 +3441,8 @@ writeback:
 		 * Re-enter guest when pio read ahead buffer is empty or,
 		 * if it is not used, after each 1024 iteration.
 		 */
-		else if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) ||
-			 (rc->end != 0 && rc->end == rc->pos)) {
+		else if ((r->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) ||
+			 (r->end != 0 && r->end == r->pos)) {
 			ctxt->restart = false;
 			c->eip = ctxt->eip;
 		}
-- 
cgit v0.10.2


From 3e2f65d57a0c1897fcc3287eeb41f117f4d021e5 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 25 Aug 2010 12:47:42 +0300
Subject: KVM: x86 emulator: move string instruction completion check into
 separate function

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 42d42ca..3dcbc1d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2933,6 +2933,28 @@ done:
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 }
 
+static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+
+	/* The second termination condition only applies for REPE
+	 * and REPNE. Test if the repeat string operation prefix is
+	 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
+	 * corresponding termination condition according to:
+	 * 	- if REPE/REPZ and ZF = 0 then done
+	 * 	- if REPNE/REPNZ and ZF = 1 then done
+	 */
+	if (((c->b == 0xa6) || (c->b == 0xa7) ||
+	     (c->b == 0xae) || (c->b == 0xaf))
+	    && (((c->rep_prefix == REPE_PREFIX) &&
+		 ((ctxt->eflags & EFLG_ZF) == 0))
+		|| ((c->rep_prefix == REPNE_PREFIX) &&
+		    ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
+		return true;
+
+	return false;
+}
+
 int
 x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 {
@@ -3423,19 +3445,8 @@ writeback:
 	if (c->rep_prefix && (c->d & String)) {
 		struct read_cache *r = &ctxt->decode.io_read;
 		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
-		/* The second termination condition only applies for REPE
-		 * and REPNE. Test if the repeat string operation prefix is
-		 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
-		 * corresponding termination condition according to:
-		 * 	- if REPE/REPZ and ZF = 0 then done
-		 * 	- if REPNE/REPNZ and ZF = 1 then done
-		 */
-		if (((c->b == 0xa6) || (c->b == 0xa7) ||
-		     (c->b == 0xae) || (c->b == 0xaf))
-		    && (((c->rep_prefix == REPE_PREFIX) &&
-			 ((ctxt->eflags & EFLG_ZF) == 0))
-			|| ((c->rep_prefix == REPNE_PREFIX) &&
-			    ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
+
+		if (string_insn_completed(ctxt))
 			ctxt->restart = false;
 		/*
 		 * Re-enter guest when pio read ahead buffer is empty or,
-- 
cgit v0.10.2


From d2ddd1c48364e4161052d6089f06b2cf3c50496b Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 25 Aug 2010 12:47:43 +0300
Subject: KVM: x86 emulator: get rid of "restart" in emulation context.

x86_emulate_insn() will return 1 if instruction can be restarted
without re-entering a guest.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 1bbf2b6..1bf1140 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -224,7 +224,6 @@ struct x86_emulate_ctxt {
 	/* interruptibility state, as a result of execution of STI or MOV SS */
 	int interruptibility;
 
-	bool restart; /* restart string instruction after writeback */
 	bool perm_ok; /* do not check permissions if true */
 
 	int exception; /* exception that happens during emulation or -1 */
@@ -255,6 +254,9 @@ struct x86_emulate_ctxt {
 #endif
 
 int x86_decode_insn(struct x86_emulate_ctxt *ctxt);
+#define EMULATION_FAILED -1
+#define EMULATION_OK 0
+#define EMULATION_RESTART 1
 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 			 u16 tss_selector, int reason,
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 3dcbc1d..ec35a71 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -437,7 +437,6 @@ static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
 	ctxt->exception = vec;
 	ctxt->error_code = error;
 	ctxt->error_code_valid = valid;
-	ctxt->restart = false;
 }
 
 static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
@@ -2633,9 +2632,6 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt)
 	struct opcode opcode, *g_mod012, *g_mod3;
 	struct operand memop = { .type = OP_NONE };
 
-	/* we cannot decode insn before we complete previous rep insn */
-	WARN_ON(ctxt->restart);
-
 	c->eip = ctxt->eip;
 	c->fetch.start = c->fetch.end = c->eip;
 	ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
@@ -2985,10 +2981,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 	}
 
 	if (c->rep_prefix && (c->d & String)) {
-		ctxt->restart = true;
 		/* All REP prefixes have the same first termination condition */
 		if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
-			ctxt->restart = false;
 			ctxt->eip = c->eip;
 			goto done;
 		}
@@ -3446,28 +3440,29 @@ writeback:
 		struct read_cache *r = &ctxt->decode.io_read;
 		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
 
-		if (string_insn_completed(ctxt))
-			ctxt->restart = false;
-		/*
-		 * Re-enter guest when pio read ahead buffer is empty or,
-		 * if it is not used, after each 1024 iteration.
-		 */
-		else if ((r->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) ||
-			 (r->end != 0 && r->end == r->pos)) {
-			ctxt->restart = false;
-			c->eip = ctxt->eip;
+		if (!string_insn_completed(ctxt)) {
+			/*
+			 * Re-enter guest when pio read ahead buffer is empty
+			 * or, if it is not used, after each 1024 iteration.
+			 */
+			if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) &&
+			    (r->end == 0 || r->end != r->pos)) {
+				/*
+				 * Reset read cache. Usually happens before
+				 * decode, but since instruction is restarted
+				 * we have to do it here.
+				 */
+				ctxt->decode.mem_read.end = 0;
+				return EMULATION_RESTART;
+			}
+			goto done; /* skip rip writeback */
 		}
 	}
-	/*
-	 * reset read cache here in case string instruction is restared
-	 * without decoding
-	 */
-	ctxt->decode.mem_read.end = 0;
-	if (!ctxt->restart)
-		ctxt->eip = c->eip;
+
+	ctxt->eip = c->eip;
 
 done:
-	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+	return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
 
 twobyte_insn:
 	switch (c->b) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d4d33f9..bc96ac9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4181,18 +4181,17 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 restart:
 	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
 
-	if (r) { /* emulation failed */
+	if (r == EMULATION_FAILED) {
 		if (reexecute_instruction(vcpu, cr2))
 			return EMULATE_DONE;
 
 		return handle_emulation_failure(vcpu);
 	}
 
-	r = EMULATE_DONE;
-
-	if (vcpu->arch.emulate_ctxt.exception >= 0)
+	if (vcpu->arch.emulate_ctxt.exception >= 0) {
 		inject_emulated_exception(vcpu);
-	else if (vcpu->arch.pio.count) {
+		r = EMULATE_DONE;
+	} else if (vcpu->arch.pio.count) {
 		if (!vcpu->arch.pio.in)
 			vcpu->arch.pio.count = 0;
 		r = EMULATE_DO_MMIO;
@@ -4200,8 +4199,10 @@ restart:
 		if (vcpu->mmio_is_write)
 			vcpu->mmio_needed = 0;
 		r = EMULATE_DO_MMIO;
-	} else if (vcpu->arch.emulate_ctxt.restart)
+	} else if (r == EMULATION_RESTART)
 		goto restart;
+	else
+		r = EMULATE_DONE;
 
 	toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
 	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
@@ -5100,8 +5101,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (!irqchip_in_kernel(vcpu->kvm))
 		kvm_set_cr8(vcpu, kvm_run->cr8);
 
-	if (vcpu->arch.pio.count || vcpu->mmio_needed ||
-	    vcpu->arch.emulate_ctxt.restart) {
+	if (vcpu->arch.pio.count || vcpu->mmio_needed) {
 		if (vcpu->mmio_needed) {
 			memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
 			vcpu->mmio_read_completed = 1;
-- 
cgit v0.10.2


From 081bca0e6b87d0c7b9ade7ffee1f44aca336a8fa Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 26 Aug 2010 11:06:15 +0300
Subject: KVM: x86 emulator: refuse SrcMemFAddr (e.g. LDS) with register
 operand

SrcMemFAddr is not defined with the modrm operand designating a register
instead of a memory address.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index ec35a71..2b9b0fe 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2974,6 +2974,11 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 		goto done;
 	}
 
+	if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
+		emulate_ud(ctxt);
+		goto done;
+	}
+
 	/* Privileged instruction can be executed only in CPL=0 */
 	if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
 		emulate_gp(ctxt, 0);
-- 
cgit v0.10.2


From 8d8f4e9f66ab36e4fcc75eca1e828af8466309f1 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 26 Aug 2010 11:56:06 +0300
Subject: KVM: x86 emulator: support byte/word opcode pairs

Many x86 instructions come in byte and word variants distinguished with bit
0 of the opcode.  Add macros to aid in defining them.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 2b9b0fe..1a230b5 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2330,6 +2330,9 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
 #define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) }
 #define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
 
+#define D2bv(_f)      D((_f) | ByteOp), D(_f)
+#define I2bv(_f, _e)  I((_f) | ByteOp, _e), I(_f, _e)
+
 static struct opcode group1[] = {
 	X7(D(Lock)), N
 };
@@ -2572,6 +2575,9 @@ static struct opcode twobyte_table[256] = {
 #undef GD
 #undef I
 
+#undef D2bv
+#undef I2bv
+
 static unsigned imm_size(struct decode_cache *c)
 {
 	unsigned size;
-- 
cgit v0.10.2


From 5315fbb223086c078c979d16734844ccff12f087 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 26 Aug 2010 11:56:07 +0300
Subject: KVM: x86 emulator: simplify ALU block (opcodes 00-3F) decode flags

Use the new byte/word dual opcode decode.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 1a230b5..277e667 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2385,42 +2385,34 @@ static struct group_dual group9 = { {
 
 static struct opcode opcode_table[256] = {
 	/* 0x00 - 0x07 */
-	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
-	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
-	D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm),
+	D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM),
+	D2bv(DstAcc | SrcImm),
 	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
 	/* 0x08 - 0x0F */
-	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
-	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
-	D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm),
+	D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM),
+	D2bv(DstAcc | SrcImm),
 	D(ImplicitOps | Stack | No64), N,
 	/* 0x10 - 0x17 */
-	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
-	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
-	D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm),
+	D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM),
+	D2bv(DstAcc | SrcImm),
 	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
 	/* 0x18 - 0x1F */
-	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
-	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
-	D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm),
+	D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM),
+	D2bv(DstAcc | SrcImm),
 	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
 	/* 0x20 - 0x27 */
-	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
-	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
-	D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N,
+	D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM),
+	D2bv(DstAcc | SrcImm), N, N,
 	/* 0x28 - 0x2F */
-	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
-	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
-	D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm),
+	D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM),
+	D2bv(DstAcc | SrcImm),
 	N, I(ByteOp | DstAcc | No64, em_das),
 	/* 0x30 - 0x37 */
-	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
-	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
-	D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N,
+	D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM),
+	D2bv(DstAcc | SrcImm), N, N,
 	/* 0x38 - 0x3F */
-	D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM),
-	D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
-	D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm),
+	D2bv(DstMem | SrcReg | ModRM), D2bv(DstReg | SrcMem | ModRM),
+	D2bv(DstAcc | SrcImm),
 	N, N,
 	/* 0x40 - 0x4F */
 	X16(D(DstReg)),
-- 
cgit v0.10.2


From 48fe67b5f7f71bb954dc97b18096cef12f6618b4 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 26 Aug 2010 11:56:08 +0300
Subject: KVM: x86 emulator: simplify string instruction decode flags

Use the new byte/word dual opcode decode.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 277e667..749322e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2429,8 +2429,8 @@ static struct opcode opcode_table[256] = {
 	I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
 	I(SrcImmByte | Mov | Stack, em_push),
 	I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
-	D(DstDI | ByteOp | Mov | String), D(DstDI | Mov | String), /* insb, insw/insd */
-	D(SrcSI | ByteOp | ImplicitOps | String), D(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */
+	D2bv(DstDI | Mov | String), /* insb, insw/insd */
+	D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */
 	/* 0x70 - 0x7F */
 	X16(D(SrcImmByte)),
 	/* 0x80 - 0x87 */
@@ -2454,13 +2454,12 @@ static struct opcode opcode_table[256] = {
 	/* 0xA0 - 0xA7 */
 	D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs),
 	D(ByteOp | DstMem | SrcAcc | Mov | MemAbs), D(DstMem | SrcAcc | Mov | MemAbs),
-	D(ByteOp | SrcSI | DstDI | Mov | String), D(SrcSI | DstDI | Mov | String),
-	D(ByteOp | SrcSI | DstDI | String), D(SrcSI | DstDI | String),
+	D2bv(SrcSI | DstDI | Mov | String), D2bv(SrcSI | DstDI | String),
 	/* 0xA8 - 0xAF */
 	D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm),
-	D(ByteOp | SrcAcc | DstDI | Mov | String), D(SrcAcc | DstDI | Mov | String),
-	D(ByteOp | SrcSI | DstAcc | Mov | String), D(SrcSI | DstAcc | Mov | String),
-	D(ByteOp | SrcAcc | DstDI | String), D(SrcAcc | DstDI | String),
+	D2bv(SrcAcc | DstDI | Mov | String),
+	D2bv(SrcSI | DstAcc | Mov | String),
+	D2bv(SrcAcc | DstDI | String),
 	/* 0xB0 - 0xB7 */
 	X8(D(ByteOp | DstReg | SrcImm | Mov)),
 	/* 0xB8 - 0xBF */
-- 
cgit v0.10.2


From 76e8e68d4435bb894a1a03be853a55a4a2b45247 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 26 Aug 2010 11:56:09 +0300
Subject: KVM: x86 emulator: simplify instruction decode flags for opcodes
 80-8F

Use the new byte/word dual opcode decode.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 749322e..661013f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2438,11 +2438,10 @@ static struct opcode opcode_table[256] = {
 	G(DstMem | SrcImm | ModRM | Group, group1),
 	G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
 	G(DstMem | SrcImmByte | ModRM | Group, group1),
-	D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM),
-	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
+	D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock),
 	/* 0x88 - 0x8F */
-	D(ByteOp | DstMem | SrcReg | ModRM | Mov), D(DstMem | SrcReg | ModRM | Mov),
-	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem | ModRM | Mov),
+	D2bv(DstMem | SrcReg | ModRM | Mov),
+	D2bv(DstReg | SrcMem | ModRM | Mov),
 	D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg),
 	D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
 	/* 0x90 - 0x97 */
-- 
cgit v0.10.2


From 50748613d16f55cbf7da14bc6e92b7cb1cd4fa7d Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 26 Aug 2010 11:56:10 +0300
Subject: KVM: x86 emulator: simplify instruction decode flags for opcodes
 A0-AF

Use the new byte/word dual opcode decode.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 661013f..d59e54b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2451,11 +2451,11 @@ static struct opcode opcode_table[256] = {
 	I(SrcImmFAddr | No64, em_call_far), N,
 	D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N,
 	/* 0xA0 - 0xA7 */
-	D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs),
-	D(ByteOp | DstMem | SrcAcc | Mov | MemAbs), D(DstMem | SrcAcc | Mov | MemAbs),
+	D2bv(DstAcc | SrcMem | Mov | MemAbs),
+	D2bv(DstMem | SrcAcc | Mov | MemAbs),
 	D2bv(SrcSI | DstDI | Mov | String), D2bv(SrcSI | DstDI | String),
 	/* 0xA8 - 0xAF */
-	D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm),
+	D2bv(DstAcc | SrcImm),
 	D2bv(SrcAcc | DstDI | Mov | String),
 	D2bv(SrcSI | DstAcc | Mov | String),
 	D2bv(SrcAcc | DstDI | String),
-- 
cgit v0.10.2


From d2c6c7adb181eac5b18dbefdf24c0e6745470939 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 26 Aug 2010 11:56:11 +0300
Subject: KVM: x86 emulator: simplify instruction decode flags for opcodes
 C0-DF

Use the new byte/word dual opcode decode.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d59e54b..02566c1 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2464,17 +2464,16 @@ static struct opcode opcode_table[256] = {
 	/* 0xB8 - 0xBF */
 	X8(D(DstReg | SrcImm | Mov)),
 	/* 0xC0 - 0xC7 */
-	D(ByteOp | DstMem | SrcImm | ModRM), D(DstMem | SrcImmByte | ModRM),
+	D2bv(DstMem | SrcImmByte | ModRM),
 	I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
 	D(ImplicitOps | Stack),
 	D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
-	D(ByteOp | DstMem | SrcImm | ModRM | Mov), D(DstMem | SrcImm | ModRM | Mov),
+	D2bv(DstMem | SrcImm | ModRM | Mov),
 	/* 0xC8 - 0xCF */
 	N, N, N, D(ImplicitOps | Stack),
 	D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps),
 	/* 0xD0 - 0xD7 */
-	D(ByteOp | DstMem | SrcOne | ModRM), D(DstMem | SrcOne | ModRM),
-	D(ByteOp | DstMem | ModRM), D(DstMem | ModRM),
+	D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
 	N, N, N, N,
 	/* 0xD8 - 0xDF */
 	N, N, N, N, N, N, N, N,
-- 
cgit v0.10.2


From d269e3961a65bbf6a76a8dc37b70cb578216e2c0 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 26 Aug 2010 11:56:12 +0300
Subject: KVM: x86 emulator: simplify instruction decode flags for opcodes
 E0-FF

Use the new byte/word dual opcode decode.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 02566c1..b43572a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2479,13 +2479,11 @@ static struct opcode opcode_table[256] = {
 	N, N, N, N, N, N, N, N,
 	/* 0xE0 - 0xE7 */
 	X4(D(SrcImmByte)),
-	D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc),
-	D(ByteOp | SrcAcc | DstImmUByte), D(SrcAcc | DstImmUByte),
+	D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte),
 	/* 0xE8 - 0xEF */
 	D(SrcImm | Stack), D(SrcImm | ImplicitOps),
 	D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
-	D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc),
-	D(ByteOp | SrcAcc | ImplicitOps), D(SrcAcc | ImplicitOps),
+	D2bv(SrcNone | DstAcc),	D2bv(SrcAcc | ImplicitOps),
 	/* 0xF0 - 0xF7 */
 	N, N, N, N,
 	D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3),
-- 
cgit v0.10.2


From 739ae406068211b235b488f247aab349e486c382 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 26 Aug 2010 11:56:13 +0300
Subject: KVM: x86 emulator: simplify instruction decode flags for opcodes 0F
 00-FF

Use the new byte/word dual opcode decode.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b43572a..58e715c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2534,7 +2534,7 @@ static struct opcode twobyte_table[256] = {
 	D(DstMem | SrcReg | Src2CL | ModRM),
 	D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
 	/* 0xB0 - 0xB7 */
-	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
+	D2bv(DstMem | SrcReg | ModRM | Lock),
 	D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock),
 	D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM),
 	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
@@ -2544,7 +2544,7 @@ static struct opcode twobyte_table[256] = {
 	D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
 	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xC0 - 0xCF */
-	D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock),
+	D2bv(DstMem | SrcReg | ModRM | Lock),
 	N, D(DstMem | SrcReg | ModRM | Mov),
 	N, N, N, GD(0, &group9),
 	N, N, N, N, N, N, N, N,
-- 
cgit v0.10.2


From f6b3597bded9ed261b42fdcb5e741489cb5ccbfe Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 26 Aug 2010 11:59:00 +0300
Subject: KVM: x86 emulator: add macros for executing instructions that may
 trap

Like DIV and IDIV.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 58e715c..e96cce1 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -331,6 +331,27 @@ struct group_dual {
 			  "a" (_rax), "d" (_rdx));			\
 	} while (0)
 
+#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \
+	do {								\
+		unsigned long _tmp;					\
+									\
+		__asm__ __volatile__ (					\
+			_PRE_EFLAGS("0", "5", "1")			\
+			"1: \n\t"					\
+			_op _suffix " %6; "				\
+			"2: \n\t"					\
+			_POST_EFLAGS("0", "5", "1")			\
+			".pushsection .fixup,\"ax\" \n\t"		\
+			"3: movb $1, %4 \n\t"				\
+			"jmp 2b \n\t"					\
+			".popsection \n\t"				\
+			_ASM_EXTABLE(1b, 3b)				\
+			: "=m" (_eflags), "=&r" (_tmp),			\
+			  "+a" (_rax), "+d" (_rdx), "+qm"(_ex)		\
+			: "i" (EFLAGS_MASK), "m" ((_src).val),		\
+			  "a" (_rax), "d" (_rdx));			\
+	} while (0)
+
 /* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
 #define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags)			\
 	do {									\
@@ -342,6 +363,28 @@ struct group_dual {
 		}							\
 	} while (0)
 
+#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex)	\
+	do {								\
+		switch((_src).bytes) {					\
+		case 1:							\
+			__emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx,	\
+						 _eflags, "b", _ex);	\
+			break;						\
+		case 2:							\
+			__emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
+						 _eflags, "w", _ex);	\
+			break;						\
+		case 4:							\
+			__emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
+						 _eflags, "l", _ex);	\
+			break;						\
+		case 8: ON64(						\
+			__emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
+						 _eflags, "q", _ex));	\
+			break;						\
+		}							\
+	} while (0)
+
 /* Fetch next part of the instruction being emulated. */
 #define insn_fetch(_type, _size, _eip)                                  \
 ({	unsigned long _x;						\
-- 
cgit v0.10.2


From 34d1f4905eb66478a890ea808ec58bc842e6e589 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 26 Aug 2010 11:59:01 +0300
Subject: KVM: x86 emulator: trap and propagate #DE from DIV and IDIV

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e96cce1..917b9b5 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -504,6 +504,12 @@ static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
 	emulate_exception(ctxt, TS_VECTOR, err, true);
 }
 
+static int emulate_de(struct x86_emulate_ctxt *ctxt)
+{
+	emulate_exception(ctxt, DE_VECTOR, 0, false);
+	return X86EMUL_PROPAGATE_FAULT;
+}
+
 static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
 			      struct x86_emulate_ops *ops,
 			      unsigned long eip, u8 *dest)
@@ -1458,6 +1464,7 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
 	struct decode_cache *c = &ctxt->decode;
 	unsigned long *rax = &c->regs[VCPU_REGS_RAX];
 	unsigned long *rdx = &c->regs[VCPU_REGS_RDX];
+	u8 de = 0;
 
 	switch (c->modrm_reg) {
 	case 0 ... 1:	/* test */
@@ -1476,14 +1483,18 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
 		emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags);
 		break;
 	case 6: /* div */
-		emulate_1op_rax_rdx("div", c->src, *rax, *rdx, ctxt->eflags);
+		emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx,
+				       ctxt->eflags, de);
 		break;
 	case 7: /* idiv */
-		emulate_1op_rax_rdx("idiv", c->src, *rax, *rdx, ctxt->eflags);
+		emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx,
+				       ctxt->eflags, de);
 		break;
 	default:
 		return X86EMUL_UNHANDLEABLE;
 	}
+	if (de)
+		return emulate_de(ctxt);
 	return X86EMUL_CONTINUE;
 }
 
@@ -3413,8 +3424,9 @@ special_insn:
 		ctxt->eflags ^= EFLG_CF;
 		break;
 	case 0xf6 ... 0xf7:	/* Grp3 */
-		if (emulate_grp3(ctxt, ops) != X86EMUL_CONTINUE)
-			goto cannot_emulate;
+		rc = emulate_grp3(ctxt, ops);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
 		break;
 	case 0xf8: /* clc */
 		ctxt->eflags &= ~EFLG_CF;
-- 
cgit v0.10.2


From 217fc9cfca21a0bc2f4246183ebd8ee9863b019d Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 26 Aug 2010 13:38:03 +0300
Subject: KVM: Fix build error due to 64-bit division in nsec_to_cycles()

Use do_div() instead.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bc96ac9..bdba1d0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -56,6 +56,7 @@
 #include <asm/i387.h>
 #include <asm/xcr.h>
 #include <asm/pvclock.h>
+#include <asm/div64.h>
 
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS						\
@@ -917,11 +918,15 @@ static inline int kvm_tsc_changes_freq(void)
 
 static inline u64 nsec_to_cycles(u64 nsec)
 {
+	u64 ret;
+
 	WARN_ON(preemptible());
 	if (kvm_tsc_changes_freq())
 		printk_once(KERN_WARNING
 		 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
-	return (nsec * __get_cpu_var(cpu_tsc_khz)) / USEC_PER_SEC;
+	ret = nsec * __get_cpu_var(cpu_tsc_khz);
+	do_div(ret, USEC_PER_SEC);
+	return ret;
 }
 
 void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
-- 
cgit v0.10.2


From 6230f7fc0453c5bc5daa8e053773021e1c4a2f16 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 26 Aug 2010 18:34:55 +0300
Subject: KVM: x86 emulator: simplify ALU opcode block decode further

The ALU opcode block is very regular; introduce D6ALU() to define decode
flags for 6 instructions at a time.

Suggested by Paolo Bonzini.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 917b9b5..8bfa3e3 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2387,6 +2387,11 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
 #define D2bv(_f)      D((_f) | ByteOp), D(_f)
 #define I2bv(_f, _e)  I((_f) | ByteOp, _e), I(_f, _e)
 
+#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM),			\
+		D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock),		\
+		D2bv(((_f) & ~Lock) | DstAcc | SrcImm)
+
+
 static struct opcode group1[] = {
 	X7(D(Lock)), N
 };
@@ -2439,35 +2444,25 @@ static struct group_dual group9 = { {
 
 static struct opcode opcode_table[256] = {
 	/* 0x00 - 0x07 */
-	D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM),
-	D2bv(DstAcc | SrcImm),
+	D6ALU(Lock),
 	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
 	/* 0x08 - 0x0F */
-	D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM),
-	D2bv(DstAcc | SrcImm),
+	D6ALU(Lock),
 	D(ImplicitOps | Stack | No64), N,
 	/* 0x10 - 0x17 */
-	D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM),
-	D2bv(DstAcc | SrcImm),
+	D6ALU(Lock),
 	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
 	/* 0x18 - 0x1F */
-	D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM),
-	D2bv(DstAcc | SrcImm),
+	D6ALU(Lock),
 	D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
 	/* 0x20 - 0x27 */
-	D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM),
-	D2bv(DstAcc | SrcImm), N, N,
+	D6ALU(Lock), N, N,
 	/* 0x28 - 0x2F */
-	D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM),
-	D2bv(DstAcc | SrcImm),
-	N, I(ByteOp | DstAcc | No64, em_das),
+	D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das),
 	/* 0x30 - 0x37 */
-	D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM),
-	D2bv(DstAcc | SrcImm), N, N,
+	D6ALU(Lock), N, N,
 	/* 0x38 - 0x3F */
-	D2bv(DstMem | SrcReg | ModRM), D2bv(DstReg | SrcMem | ModRM),
-	D2bv(DstAcc | SrcImm),
-	N, N,
+	D6ALU(0), N, N,
 	/* 0x40 - 0x4F */
 	X16(D(DstReg)),
 	/* 0x50 - 0x57 */
@@ -2618,6 +2613,7 @@ static struct opcode twobyte_table[256] = {
 
 #undef D2bv
 #undef I2bv
+#undef D6ALU
 
 static unsigned imm_size(struct decode_cache *c)
 {
-- 
cgit v0.10.2


From 989044ee0fdc6c22a11ea1d22e2a3d17463cb564 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 30 Aug 2010 12:01:56 +0200
Subject: KVM: PPC: Fix CONFIG_KVM_GUEST && !CONFIG_KVM case

When CONFIG_KVM_GUEST is selected, but CONFIG_KVM is not, we were missing
some defines in asm-offsets.c and included too many headers at other places.

This patch makes above configuration work.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 37486ca..6d92b4e 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -48,11 +48,11 @@
 #ifdef CONFIG_PPC_ISERIES
 #include <asm/iseries/alpaca.h>
 #endif
-#ifdef CONFIG_KVM
+#if defined(CONFIG_KVM) || defined(CONFIG_KVM_GUEST)
 #include <linux/kvm_host.h>
-#ifndef CONFIG_BOOKE
-#include <asm/kvm_book3s.h>
 #endif
+#if defined(CONFIG_KVM) && defined(CONFIG_PPC_BOOK3S)
+#include <asm/kvm_book3s.h>
 #endif
 
 #ifdef CONFIG_PPC32
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index e936817..d3a2cc5 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -25,7 +25,6 @@
 #include <linux/of.h>
 
 #include <asm/reg.h>
-#include <asm/kvm_ppc.h>
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
 #include <asm/disassemble.h>
-- 
cgit v0.10.2


From 23e7a7944f3779155e2f6bbc831b544eb925f387 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 27 Aug 2010 17:15:06 +0800
Subject: KVM: pit: Do not check pending pit timer in vcpu thread

Pit interrupt injection was done by workqueue, so no need to check
pending pit timer in vcpu thread which could lead unnecessary
unblocking of vcpu.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index ddeb231..2ad40a4 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -232,15 +232,6 @@ static void pit_latch_status(struct kvm *kvm, int channel)
 	}
 }
 
-int pit_has_pending_timer(struct kvm_vcpu *vcpu)
-{
-	struct kvm_pit *pit = vcpu->kvm->arch.vpit;
-
-	if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack)
-		return atomic_read(&pit->pit_state.pit_timer.pending);
-	return 0;
-}
-
 static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
 {
 	struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 2095a04..f994da4 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -33,12 +33,7 @@
  */
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
-	int ret;
-
-	ret = pit_has_pending_timer(vcpu);
-	ret |= apic_has_pending_timer(vcpu);
-
-	return ret;
+	return apic_has_pending_timer(vcpu);
 }
 EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
 
-- 
cgit v0.10.2


From 9ad17b10011702cb56c5e32e41ecd5fe281c3574 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sat, 28 Aug 2010 19:19:42 +0800
Subject: KVM: MMU: fix compile warning in audit code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix:

arch/x86/kvm/mmu.c: In function ‘kvm_mmu_unprotect_page’:
arch/x86/kvm/mmu.c:1741: warning: format ‘%lx’ expects type ‘long unsigned int’, but argument 3 has type ‘gfn_t’
arch/x86/kvm/mmu.c:1745: warning: format ‘%lx’ expects type ‘long unsigned int’, but argument 3 has type ‘gfn_t’
arch/x86/kvm/mmu.c: In function ‘mmu_unshadow’:
arch/x86/kvm/mmu.c:1761: warning: format ‘%lx’ expects type ‘long unsigned int’, but argument 3 has type ‘gfn_t’
arch/x86/kvm/mmu.c: In function ‘set_spte’:
arch/x86/kvm/mmu.c:2005: warning: format ‘%lx’ expects type ‘long unsigned int’, but argument 3 has type ‘gfn_t’
arch/x86/kvm/mmu.c: In function ‘mmu_set_spte’:
arch/x86/kvm/mmu.c:2033: warning: format ‘%lx’ expects type ‘long unsigned int’, but argument 7 has type ‘gfn_t’

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b0037a77..59bf1d9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1738,11 +1738,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 	LIST_HEAD(invalid_list);
 	int r;
 
-	pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
+	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
 	r = 0;
 
 	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
-		pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
+		pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
 			 sp->role.word);
 		r = 1;
 		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
@@ -1758,7 +1758,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
 	LIST_HEAD(invalid_list);
 
 	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
-		pgprintk("%s: zap %lx %x\n",
+		pgprintk("%s: zap %llx %x\n",
 			 __func__, gfn, sp->role.word);
 		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 	}
@@ -2002,7 +2002,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			goto set_pte;
 
 		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
-			pgprintk("%s: found shadow page for %lx, marking ro\n",
+			pgprintk("%s: found shadow page for %llx, marking ro\n",
 				 __func__, gfn);
 			ret = 1;
 			pte_access &= ~ACC_WRITE_MASK;
@@ -2031,7 +2031,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	int rmap_count;
 
 	pgprintk("%s: spte %llx access %x write_fault %d"
-		 " user_fault %d gfn %lx\n",
+		 " user_fault %d gfn %llx\n",
 		 __func__, *sptep, pt_access,
 		 write_fault, user_fault, gfn);
 
@@ -2050,7 +2050,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			__set_spte(sptep, shadow_trap_nonpresent_pte);
 			kvm_flush_remote_tlbs(vcpu->kvm);
 		} else if (pfn != spte_to_pfn(*sptep)) {
-			pgprintk("hfn old %lx new %lx\n",
+			pgprintk("hfn old %llx new %llx\n",
 				 spte_to_pfn(*sptep), pfn);
 			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
 			kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2067,7 +2067,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	}
 
 	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
-	pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
+	pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
 		 is_large_pte(*sptep)? "2MB" : "4kB",
 		 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
 		 *sptep, sptep);
@@ -3651,9 +3651,9 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 		if (!gfn_to_memslot(kvm, gfn)) {
 			if (!printk_ratelimit())
 				return;
-			printk(KERN_ERR "%s: no memslot for gfn %ld\n",
+			printk(KERN_ERR "%s: no memslot for gfn %llx\n",
 					 audit_msg, gfn);
-			printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
+			printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n",
 			       audit_msg, (long int)(sptep - rev_sp->spt),
 					rev_sp->gfn);
 			dump_stack();
@@ -3728,7 +3728,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
 		while (spte) {
 			if (is_writable_pte(*spte))
 				printk(KERN_ERR "%s: (%s) shadow page has "
-				"writable mappings: gfn %lx role %x\n",
+				"writable mappings: gfn %llx role %x\n",
 			       __func__, audit_msg, sp->gfn,
 			       sp->role.word);
 			spte = rmap_next(vcpu->kvm, rmapp, spte);
-- 
cgit v0.10.2


From 0beb8d660425aab339ff68e6f4d4528739e8fc4f Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sat, 28 Aug 2010 19:20:47 +0800
Subject: KVM: MMU: check rmap for every spte

The read-only spte also has reverse mapping, so fix the code to check them,
also modify the function name to fit its doing

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 59bf1d9..1c784b9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3644,40 +3644,38 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 	struct kvm_mmu_page *rev_sp;
 	gfn_t gfn;
 
-	if (is_writable_pte(*sptep)) {
-		rev_sp = page_header(__pa(sptep));
-		gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
 
-		if (!gfn_to_memslot(kvm, gfn)) {
-			if (!printk_ratelimit())
-				return;
-			printk(KERN_ERR "%s: no memslot for gfn %llx\n",
-					 audit_msg, gfn);
-			printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n",
-			       audit_msg, (long int)(sptep - rev_sp->spt),
-					rev_sp->gfn);
-			dump_stack();
-			return;
-		}
+	rev_sp = page_header(__pa(sptep));
+	gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
 
-		rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
-		if (!*rmapp) {
-			if (!printk_ratelimit())
-				return;
-			printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
-					 audit_msg, *sptep);
-			dump_stack();
-		}
+	if (!gfn_to_memslot(kvm, gfn)) {
+		if (!printk_ratelimit())
+			return;
+		printk(KERN_ERR "%s: no memslot for gfn %llx\n",
+				 audit_msg, gfn);
+		printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n",
+		       audit_msg, (long int)(sptep - rev_sp->spt),
+				rev_sp->gfn);
+		dump_stack();
+		return;
 	}
 
+	rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
+	if (!*rmapp) {
+		if (!printk_ratelimit())
+			return;
+		printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
+				 audit_msg, *sptep);
+		dump_stack();
+	}
 }
 
-void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
+void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
 {
 	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
 }
 
-static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
+static void check_mappings_rmap(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu_page *sp;
 	int i;
@@ -3689,12 +3687,9 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
 			continue;
 
 		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-			u64 ent = pt[i];
-
-			if (!(ent & PT_PRESENT_MASK))
-				continue;
-			if (!is_writable_pte(ent))
+			if (!is_rmap_spte(pt[i]))
 				continue;
+
 			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
 		}
 	}
@@ -3703,7 +3698,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
 
 static void audit_rmap(struct kvm_vcpu *vcpu)
 {
-	check_writable_mappings_rmap(vcpu);
+	check_mappings_rmap(vcpu);
 	count_rmaps(vcpu);
 }
 
@@ -3746,7 +3741,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
 	audit_write_protection(vcpu);
 	if (strcmp("pre pte write", audit_msg) != 0)
 		audit_mappings(vcpu);
-	audit_writable_sptes_have_rmaps(vcpu);
+	audit_sptes_have_rmaps(vcpu);
 	dbg = olddbg;
 }
 
-- 
cgit v0.10.2


From bc32ce2152406431acf4daf4a81dc1664bb7b91b Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sat, 28 Aug 2010 19:22:46 +0800
Subject: KVM: MMU: fix wrong not write protected sp report

The audit code reports some sp not write protected in current code, it's just the
bug in audit_write_protection(), since:

- the invalid sp not need write protected
- using uninitialize local variable('gfn')
- call kvm_mmu_audit() out of mmu_lock's protection

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1c784b9..68575dc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3708,16 +3708,17 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
 	struct kvm_memory_slot *slot;
 	unsigned long *rmapp;
 	u64 *spte;
-	gfn_t gfn;
 
 	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
 		if (sp->role.direct)
 			continue;
 		if (sp->unsync)
 			continue;
+		if (sp->role.invalid)
+			continue;
 
 		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
-		rmapp = &slot->rmap[gfn - slot->base_gfn];
+		rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
 
 		spte = rmap_next(vcpu->kvm, rmapp, NULL);
 		while (spte) {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a4e8389..a0f2feb 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -504,7 +504,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	unsigned long mmu_seq;
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
-	kvm_mmu_audit(vcpu, "pre page fault");
 
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
@@ -542,6 +541,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
+
+	kvm_mmu_audit(vcpu, "pre page fault");
 	kvm_mmu_free_some_pages(vcpu);
 	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
 			     level, &write_pt, pfn);
-- 
cgit v0.10.2


From 365fb3fdf6769d3553999d8eb6cc2a8c56c747c1 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sat, 28 Aug 2010 19:24:13 +0800
Subject: KVM: MMU: rewrite audit_mappings_page() function

There is a bugs in this function, we call gfn_to_pfn() and kvm_mmu_gva_to_gpa_read() in
atomic context(kvm_mmu_audit() is called under the spinlock(mmu_lock)'s protection).

This patch fix it by:
- introduce gfn_to_pfn_atomic instead of gfn_to_pfn
- get the mapping gfn from kvm_mmu_page_get_gfn()

And it adds 'notrap' ptes check in unsync/direct sps

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 68575dc..0d91f60 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3487,15 +3487,6 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
 
 static const char *audit_msg;
 
-static gva_t canonicalize(gva_t gva)
-{
-#ifdef CONFIG_X86_64
-	gva = (long long)(gva << 16) >> 16;
-#endif
-	return gva;
-}
-
-
 typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
 
 static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
@@ -3550,39 +3541,53 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
 	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
 
 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
-		u64 ent = pt[i];
+		u64 *sptep = pt + i;
+		struct kvm_mmu_page *sp;
+		gfn_t gfn;
+		pfn_t pfn;
+		hpa_t hpa;
 
-		if (ent == shadow_trap_nonpresent_pte)
-			continue;
+		sp = page_header(__pa(sptep));
 
-		va = canonicalize(va);
-		if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
-			audit_mappings_page(vcpu, ent, va, level - 1);
-		else {
-			gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
-			gfn_t gfn = gpa >> PAGE_SHIFT;
-			pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
-			hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
+		if (sp->unsync) {
+			if (level != PT_PAGE_TABLE_LEVEL) {
+				printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
+						audit_msg, sp, level);
+				return;
+			}
 
-			if (is_error_pfn(pfn)) {
-				kvm_release_pfn_clean(pfn);
-				continue;
+			if (*sptep == shadow_notrap_nonpresent_pte) {
+				printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
+						audit_msg, sp);
+				return;
 			}
+		}
 
-			if (is_shadow_present_pte(ent)
-			    && (ent & PT64_BASE_ADDR_MASK) != hpa)
-				printk(KERN_ERR "xx audit error: (%s) levels %d"
-				       " gva %lx gpa %llx hpa %llx ent %llx %d\n",
-				       audit_msg, vcpu->arch.mmu.root_level,
-				       va, gpa, hpa, ent,
-				       is_shadow_present_pte(ent));
-			else if (ent == shadow_notrap_nonpresent_pte
-				 && !is_error_hpa(hpa))
-				printk(KERN_ERR "audit: (%s) notrap shadow,"
-				       " valid guest gva %lx\n", audit_msg, va);
-			kvm_release_pfn_clean(pfn);
+		if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
+			printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
+					audit_msg, sp);
+			return;
+		}
+
+		if (!is_shadow_present_pte(*sptep) ||
+		      !is_last_spte(*sptep, level))
+			return;
+
+		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+		pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
 
+		if (is_error_pfn(pfn)) {
+			kvm_release_pfn_clean(pfn);
+			return;
 		}
+
+		hpa =  pfn << PAGE_SHIFT;
+
+		if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
+			printk(KERN_ERR "xx audit error: (%s) levels %d"
+					   " gva %lx pfn %llx hpa %llx ent %llxn",
+					   audit_msg, vcpu->arch.mmu.root_level,
+					   va, pfn, hpa, *sptep);
 	}
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b837ec8..f2ecdd5 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -300,6 +300,7 @@ void kvm_set_page_dirty(struct page *page);
 void kvm_set_page_accessed(struct page *page);
 
 pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
+pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
 pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
 			 struct kvm_memory_slot *slot, gfn_t gfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2eb0b75..c7a57b4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -999,7 +999,7 @@ pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
 }
 EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
 
-pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic)
 {
 	unsigned long addr;
 
@@ -1009,7 +1009,18 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 		return page_to_pfn(bad_page);
 	}
 
-	return hva_to_pfn(kvm, addr, false);
+	return hva_to_pfn(kvm, addr, atomic);
+}
+
+pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
+{
+	return __gfn_to_pfn(kvm, gfn, true);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
+
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+{
+	return __gfn_to_pfn(kvm, gfn, false);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn);
 
-- 
cgit v0.10.2


From 8e0e8afa82018a3c751ea474eb47dfc65f00f4c3 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sat, 28 Aug 2010 19:25:09 +0800
Subject: KVM: MMU: remove count_rmaps()

Nothing is checked in count_rmaps(), so remove it

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0d91f60..0bff4d5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3606,43 +3606,6 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
 						    2);
 }
 
-static int count_rmaps(struct kvm_vcpu *vcpu)
-{
-	struct kvm *kvm = vcpu->kvm;
-	struct kvm_memslots *slots;
-	int nmaps = 0;
-	int i, j, k, idx;
-
-	idx = srcu_read_lock(&kvm->srcu);
-	slots = kvm_memslots(kvm);
-	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
-		struct kvm_memory_slot *m = &slots->memslots[i];
-		struct kvm_rmap_desc *d;
-
-		for (j = 0; j < m->npages; ++j) {
-			unsigned long *rmapp = &m->rmap[j];
-
-			if (!*rmapp)
-				continue;
-			if (!(*rmapp & 1)) {
-				++nmaps;
-				continue;
-			}
-			d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-			while (d) {
-				for (k = 0; k < RMAP_EXT; ++k)
-					if (d->sptes[k])
-						++nmaps;
-					else
-						break;
-				d = d->more;
-			}
-		}
-	}
-	srcu_read_unlock(&kvm->srcu, idx);
-	return nmaps;
-}
-
 void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 {
 	unsigned long *rmapp;
@@ -3704,7 +3667,6 @@ static void check_mappings_rmap(struct kvm_vcpu *vcpu)
 static void audit_rmap(struct kvm_vcpu *vcpu)
 {
 	check_mappings_rmap(vcpu);
-	count_rmaps(vcpu);
 }
 
 static void audit_write_protection(struct kvm_vcpu *vcpu)
-- 
cgit v0.10.2


From c41a15dd4632499b9c1a00871e160276999767d9 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 30 Aug 2010 10:46:56 +0300
Subject: KVM: Fix pio trace direction

out = write, in = read, not the other way round.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bdba1d0..d0ba857 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3743,7 +3743,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
 	if (vcpu->arch.pio.count)
 		goto data_avail;
 
-	trace_kvm_pio(1, port, size, 1);
+	trace_kvm_pio(0, port, size, 1);
 
 	vcpu->arch.pio.port = port;
 	vcpu->arch.pio.in = 1;
@@ -3771,7 +3771,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port,
 			      const void *val, unsigned int count,
 			      struct kvm_vcpu *vcpu)
 {
-	trace_kvm_pio(0, port, size, 1);
+	trace_kvm_pio(1, port, size, 1);
 
 	vcpu->arch.pio.port = port;
 	vcpu->arch.pio.in = 0;
-- 
cgit v0.10.2


From 678041ad9dc82eedc598f709e8a3d620139d4105 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 31 Aug 2010 19:13:13 -0300
Subject: KVM: SVM: reset mmu context in init_vmcb

Since commit aad827034e419fa no mmu reinitialization is performed
via init_vmcb.

Zero vcpu->arch.cr0 and pass the reset value as a parameter to
kvm_set_cr0.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ff28f65..60bc1e5 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -827,8 +827,8 @@ static void init_vmcb(struct vcpu_svm *svm)
 	 * This is the guest-visible cr0 value.
 	 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
 	 */
-	svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
-	(void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
+	svm->vcpu.arch.cr0 = 0;
+	(void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
 
 	save->cr4 = X86_CR4_PAE;
 	/* rdx = ?? */
-- 
cgit v0.10.2


From eaa48512ba9df32aab8be5fceec10f3f80369379 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 31 Aug 2010 19:13:14 -0300
Subject: KVM: SVM: init_vmcb should reset vcpu->efer

Otherwise EFER_LMA bit is retained across a SIPI reset.

Fixes guest cpu onlining.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 60bc1e5..a1a83b9 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -816,7 +816,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
 	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
 
-	save->efer = EFER_SVME;
+	svm_set_efer(&svm->vcpu, 0);
 	save->dr6 = 0xffff0ff0;
 	save->dr7 = 0x400;
 	save->rflags = 2;
-- 
cgit v0.10.2


From e90aa41e6ca76cd7be021d4d5560e64954cd4585 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 1 Sep 2010 10:23:35 +0300
Subject: KVM: Don't save/restore MSR_IA32_PERF_STATUS

It is read/only; restoring it only results in annoying messages.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d0ba857..1c97238 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -739,7 +739,7 @@ static u32 msrs_to_save[] = {
 #ifdef CONFIG_X86_64
 	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-	MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
 };
 
 static unsigned num_msrs_to_save;
-- 
cgit v0.10.2


From b9eac5f4d146dc6cb88c8e6d891f8abe60493338 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 3 Aug 2010 14:46:56 +0300
Subject: KVM: x86 emulator: use single stage decoding for mov instructions

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8bfa3e3..c0715ae 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2378,6 +2378,13 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_mov(struct x86_emulate_ctxt *ctxt)
+{
+	struct decode_cache *c = &ctxt->decode;
+	c->dst.val = c->src.val;
+	return X86EMUL_CONTINUE;
+}
+
 #define D(_y) { .flags = (_y) }
 #define N    D(0)
 #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
@@ -2489,8 +2496,8 @@ static struct opcode opcode_table[256] = {
 	G(DstMem | SrcImmByte | ModRM | Group, group1),
 	D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock),
 	/* 0x88 - 0x8F */
-	D2bv(DstMem | SrcReg | ModRM | Mov),
-	D2bv(DstReg | SrcMem | ModRM | Mov),
+	I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
+	I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
 	D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg),
 	D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
 	/* 0x90 - 0x97 */
@@ -2500,24 +2507,25 @@ static struct opcode opcode_table[256] = {
 	I(SrcImmFAddr | No64, em_call_far), N,
 	D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N,
 	/* 0xA0 - 0xA7 */
-	D2bv(DstAcc | SrcMem | Mov | MemAbs),
-	D2bv(DstMem | SrcAcc | Mov | MemAbs),
-	D2bv(SrcSI | DstDI | Mov | String), D2bv(SrcSI | DstDI | String),
+	I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
+	I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov),
+	I2bv(SrcSI | DstDI | Mov | String, em_mov),
+	D2bv(SrcSI | DstDI | String),
 	/* 0xA8 - 0xAF */
 	D2bv(DstAcc | SrcImm),
-	D2bv(SrcAcc | DstDI | Mov | String),
-	D2bv(SrcSI | DstAcc | Mov | String),
+	I2bv(SrcAcc | DstDI | Mov | String, em_mov),
+	I2bv(SrcSI | DstAcc | Mov | String, em_mov),
 	D2bv(SrcAcc | DstDI | String),
 	/* 0xB0 - 0xB7 */
-	X8(D(ByteOp | DstReg | SrcImm | Mov)),
+	X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
 	/* 0xB8 - 0xBF */
-	X8(D(DstReg | SrcImm | Mov)),
+	X8(I(DstReg | SrcImm | Mov, em_mov)),
 	/* 0xC0 - 0xC7 */
 	D2bv(DstMem | SrcImmByte | ModRM),
 	I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
 	D(ImplicitOps | Stack),
 	D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
-	D2bv(DstMem | SrcImm | ModRM | Mov),
+	I2bv(DstMem | SrcImm | ModRM | Mov, em_mov),
 	/* 0xC8 - 0xCF */
 	N, N, N, D(ImplicitOps | Stack),
 	D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps),
@@ -3212,8 +3220,6 @@ special_insn:
 		c->dst.val = c->src.orig_val;
 		c->lock_prefix = 1;
 		break;
-	case 0x88 ... 0x8b:	/* mov */
-		goto mov;
 	case 0x8c:  /* mov r/m, sreg */
 		if (c->modrm_reg > VCPU_SREG_GS) {
 			emulate_ud(ctxt);
@@ -3271,22 +3277,14 @@ special_insn:
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
-	case 0xa0 ... 0xa3:	/* mov */
-	case 0xa4 ... 0xa5:	/* movs */
-		goto mov;
 	case 0xa6 ... 0xa7:	/* cmps */
 		c->dst.type = OP_NONE; /* Disable writeback. */
 		DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem);
 		goto cmp;
 	case 0xa8 ... 0xa9:	/* test ax, imm */
 		goto test;
-	case 0xaa ... 0xab:	/* stos */
-	case 0xac ... 0xad:	/* lods */
-		goto mov;
 	case 0xae ... 0xaf:	/* scas */
 		goto cmp;
-	case 0xb0 ... 0xbf: /* mov r, imm */
-		goto mov;
 	case 0xc0 ... 0xc1:
 		emulate_grp2(ctxt);
 		break;
@@ -3305,10 +3303,6 @@ special_insn:
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
-	case 0xc6 ... 0xc7:	/* mov (sole member of Grp11) */
-	mov:
-		c->dst.val = c->src.val;
-		break;
 	case 0xcb:		/* ret far */
 		rc = emulate_ret_far(ctxt, ops);
 		if (rc != X86EMUL_CONTINUE)
-- 
cgit v0.10.2


From a4d4a7c1880db98a521bc27c15348185fa30c256 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 3 Aug 2010 15:05:46 +0300
Subject: KVM: x86 emulator: fix group 11 decoding for reg != 0

These are all undefined.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c0715ae..9940d16 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2449,6 +2449,10 @@ static struct group_dual group9 = { {
 	N, N, N, N, N, N, N, N,
 } };
 
+static struct opcode group11[] = {
+	I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)),
+};
+
 static struct opcode opcode_table[256] = {
 	/* 0x00 - 0x07 */
 	D6ALU(Lock),
@@ -2525,7 +2529,7 @@ static struct opcode opcode_table[256] = {
 	I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
 	D(ImplicitOps | Stack),
 	D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
-	I2bv(DstMem | SrcImm | ModRM | Mov, em_mov),
+	G(ByteOp, group11), G(0, group11),
 	/* 0xC8 - 0xCF */
 	N, N, N, D(ImplicitOps | Stack),
 	D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps),
-- 
cgit v0.10.2


From 7d9ddaedd8a9d0442fda5b5a90f22a33becbd235 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 30 Aug 2010 17:12:28 +0300
Subject: KVM: x86 emulator: clean up control flow in x86_emulate_insn()

x86_emulate_insn() is full of things like

    if (rc != X86EMUL_CONTINUE)
        goto done;
    break;

consolidate all of those at the end of the switch statement.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 9940d16..27d2c22 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3098,8 +3098,6 @@ special_insn:
 		break;
 	case 0x07:		/* pop es */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0x08 ... 0x0d:
 	      or:		/* or */
@@ -3117,8 +3115,6 @@ special_insn:
 		break;
 	case 0x17:		/* pop ss */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0x18 ... 0x1d:
 	      sbb:		/* sbb */
@@ -3129,8 +3125,6 @@ special_insn:
 		break;
 	case 0x1f:		/* pop ds */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0x20 ... 0x25:
 	      and:		/* and */
@@ -3157,18 +3151,12 @@ special_insn:
 	case 0x58 ... 0x5f: /* pop reg */
 	pop_instruction:
 		rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0x60:	/* pusha */
 		rc = emulate_pusha(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0x61:	/* popa */
 		rc = emulate_popa(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0x63:		/* movsxd */
 		if (ctxt->mode != X86EMUL_MODE_PROT64)
@@ -3255,8 +3243,6 @@ special_insn:
 	}
 	case 0x8f:		/* pop (sole member of Grp1a) */
 		rc = emulate_grp1a(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0x90 ... 0x97: /* nop / xchg reg, rax */
 		if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX])
@@ -3278,8 +3264,6 @@ special_insn:
 		c->dst.addr.reg = &ctxt->eflags;
 		c->dst.bytes = c->op_bytes;
 		rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0xa6 ... 0xa7:	/* cmps */
 		c->dst.type = OP_NONE; /* Disable writeback. */
@@ -3299,18 +3283,12 @@ special_insn:
 		goto pop_instruction;
 	case 0xc4:		/* les */
 		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0xc5:		/* lds */
 		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0xcb:		/* ret far */
 		rc = emulate_ret_far(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0xcc:		/* int3 */
 		irq = 3;
@@ -3319,8 +3297,6 @@ special_insn:
 		irq = c->src.val;
 	do_interrupt:
 		rc = emulate_int(ctxt, ops, irq);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0xce:		/* into */
 		if (ctxt->eflags & EFLG_OF) {
@@ -3330,9 +3306,6 @@ special_insn:
 		break;
 	case 0xcf:		/* iret */
 		rc = emulate_iret(ctxt, ops);
-
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0xd0 ... 0xd1:	/* Grp2 */
 		emulate_grp2(ctxt);
@@ -3419,8 +3392,6 @@ special_insn:
 		break;
 	case 0xf6 ... 0xf7:	/* Grp3 */
 		rc = emulate_grp3(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0xf8: /* clc */
 		ctxt->eflags &= ~EFLG_CF;
@@ -3453,8 +3424,6 @@ special_insn:
 	case 0xfe: /* Grp4 */
 	grp45:
 		rc = emulate_grp45(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0xff: /* Grp5 */
 		if (c->modrm_reg == 5)
@@ -3464,6 +3433,9 @@ special_insn:
 		goto cannot_emulate;
 	}
 
+	if (rc != X86EMUL_CONTINUE)
+		goto done;
+
 writeback:
 	rc = writeback(ctxt, ops);
 	if (rc != X86EMUL_CONTINUE)
@@ -3545,8 +3517,6 @@ twobyte_insn:
 				switch (c->modrm_rm) {
 				case 1:
 					rc = kvm_fix_hypercall(ctxt->vcpu);
-					if (rc != X86EMUL_CONTINUE)
-						goto done;
 					break;
 				default:
 					goto cannot_emulate;
@@ -3585,10 +3555,6 @@ twobyte_insn:
 		break;
 	case 0x05: 		/* syscall */
 		rc = emulate_syscall(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
-		else
-			goto writeback;
 		break;
 	case 0x06:
 		emulate_clts(ctxt->vcpu);
@@ -3665,17 +3631,9 @@ twobyte_insn:
 		break;
 	case 0x34:		/* sysenter */
 		rc = emulate_sysenter(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
-		else
-			goto writeback;
 		break;
 	case 0x35:		/* sysexit */
 		rc = emulate_sysexit(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
-		else
-			goto writeback;
 		break;
 	case 0x40 ... 0x4f:	/* cmov */
 		c->dst.val = c->dst.orig_val = c->src.val;
@@ -3694,8 +3652,6 @@ twobyte_insn:
 		break;
 	case 0xa1:	 /* pop fs */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0xa3:
 	      bt:		/* bt */
@@ -3713,8 +3669,6 @@ twobyte_insn:
 		break;
 	case 0xa9:	/* pop gs */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0xab:
 	      bts:		/* bts */
@@ -3745,8 +3699,6 @@ twobyte_insn:
 		break;
 	case 0xb2:		/* lss */
 		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0xb3:
 	      btr:		/* btr */
@@ -3754,13 +3706,9 @@ twobyte_insn:
 		break;
 	case 0xb4:		/* lfs */
 		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0xb5:		/* lgs */
 		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	case 0xb6 ... 0xb7:	/* movzx */
 		c->dst.bytes = c->op_bytes;
@@ -3825,12 +3773,14 @@ twobyte_insn:
 		break;
 	case 0xc7:		/* Grp9 (cmpxchg8b) */
 		rc = emulate_grp9(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
 		break;
 	default:
 		goto cannot_emulate;
 	}
+
+	if (rc != X86EMUL_CONTINUE)
+		goto done;
+
 	goto writeback;
 
 cannot_emulate:
-- 
cgit v0.10.2


From 9ed049c3b6230b68985da31f8243d4bec95e0b3a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 30 Aug 2010 12:18:24 +0300
Subject: KVM: i8259: Make ICW1 conform to spec

ICW is not a full reset, instead it resets a limited number of registers
in the PIC.  Change ICW1 emulation to only reset those registers.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 4b7b73c..6e77471 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -308,13 +308,17 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 	addr &= 1;
 	if (addr == 0) {
 		if (val & 0x10) {
-			kvm_pic_reset(s);	/* init */
-			/*
-			 * deassert a pending interrupt
-			 */
-			pic_irq_request(s->pics_state->kvm, 0);
-			s->init_state = 1;
 			s->init4 = val & 1;
+			s->last_irr = 0;
+			s->imr = 0;
+			s->priority_add = 0;
+			s->special_mask = 0;
+			s->read_reg_select = 0;
+			if (!s->init4) {
+				s->special_fully_nested_mode = 0;
+				s->auto_eoi = 0;
+			}
+			s->init_state = 1;
 			if (val & 0x02)
 				printk(KERN_ERR "single mode not supported");
 			if (val & 0x08)
-- 
cgit v0.10.2


From 84e0cefa8ddd5d5018d3b582e1e90585ed551757 Mon Sep 17 00:00:00 2001
From: Jes Sorensen <Jes.Sorensen@redhat.com>
Date: Wed, 1 Sep 2010 11:42:04 +0200
Subject: KVM: Fix guest kernel crash on MSR_K7_CLK_CTL

MSR_K7_CLK_CTL is a no longer documented MSR, which is only relevant
on said old AMD CPU models. This change returns the expected value,
which the Linux kernel is expecting to avoid writing back the MSR,
plus it ignores all writes to the MSR.

Signed-off-by: Jes Sorensen <Jes.Sorensen@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1c97238..f47db25 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1449,6 +1449,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
 			"0x%x data 0x%llx\n", msr, data);
 		break;
+	case MSR_K7_CLK_CTL:
+		/*
+		 * Ignore all writes to this no longer documented MSR.
+		 * Writes are only relevant for old K7 processors,
+		 * all pre-dating SVM, but a recommended workaround from
+		 * AMD for these chips. It is possible to speicify the
+		 * affected processor models on the command line, hence
+		 * the need to ignore the workaround.
+		 */
+		break;
 	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
 		if (kvm_hv_msr_partition_wide(msr)) {
 			int r;
@@ -1674,6 +1684,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_IA32_MCG_STATUS:
 	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
 		return get_msr_mce(vcpu, msr, pdata);
+	case MSR_K7_CLK_CTL:
+		/*
+		 * Provide expected ramp-up count for K7. All other
+		 * are set to zero, indicating minimum divisors for
+		 * every field.
+		 *
+		 * This prevents guest kernels on AMD host with CPU
+		 * type 6, model 8 and higher from exploding due to
+		 * the rdmsr failing.
+		 */
+		data = 0x20000000;
+		break;
 	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
 		if (kvm_hv_msr_partition_wide(msr)) {
 			int r;
-- 
cgit v0.10.2


From 8b1fe17cc7a8b2c62b400dcbfaebd96da6b4f58e Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 30 Aug 2010 18:22:53 +0800
Subject: KVM: MMU: support disable/enable mmu audit dynamicly

Add a r/w module parameter named 'mmu_audit', it can control audit
enable/disable:

enable:
  echo 1 > /sys/module/kvm/parameters/mmu_audit

disable:
  echo 0 > /sys/module/kvm/parameters/mmu_audit

This patch not change the logic

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 970bbd4..ddc131f 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -64,6 +64,13 @@ config KVM_AMD
 	  To compile this as a module, choose M here: the module
 	  will be called kvm-amd.
 
+config KVM_MMU_AUDIT
+	bool "Audit KVM MMU"
+	depends on KVM && TRACEPOINTS
+	---help---
+	 This option adds a R/W kVM module parameter 'mmu_audit', which allows
+	 audit  KVM MMU at runtime.
+
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0bff4d5..8b750ff 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -49,15 +49,21 @@
  */
 bool tdp_enabled = false;
 
-#undef MMU_DEBUG
+enum {
+	AUDIT_PRE_PAGE_FAULT,
+	AUDIT_POST_PAGE_FAULT,
+	AUDIT_PRE_PTE_WRITE,
+	AUDIT_POST_PTE_WRITE
+};
 
-#undef AUDIT
+char *audit_point_name[] = {
+	"pre page fault",
+	"post page fault",
+	"pre pte write",
+	"post pte write"
+};
 
-#ifdef AUDIT
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
-#endif
+#undef MMU_DEBUG
 
 #ifdef MMU_DEBUG
 
@@ -71,7 +77,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
 
 #endif
 
-#if defined(MMU_DEBUG) || defined(AUDIT)
+#ifdef MMU_DEBUG
 static int dbg = 0;
 module_param(dbg, bool, 0644);
 #endif
@@ -2964,7 +2970,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	kvm_mmu_access_page(vcpu, gfn);
 	kvm_mmu_free_some_pages(vcpu);
 	++vcpu->kvm->stat.mmu_pte_write;
-	kvm_mmu_audit(vcpu, "pre pte write");
+	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
 	if (guest_initiated) {
 		if (gfn == vcpu->arch.last_pt_write_gfn
 		    && !last_updated_pte_accessed(vcpu)) {
@@ -3037,7 +3043,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	}
 	mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-	kvm_mmu_audit(vcpu, "post pte write");
+	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
 		kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
@@ -3483,8 +3489,7 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
 
-#ifdef AUDIT
-
+#ifdef CONFIG_KVM_MMU_AUDIT
 static const char *audit_msg;
 
 typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
@@ -3699,18 +3704,68 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
 	}
 }
 
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
+static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
 {
-	int olddbg = dbg;
-
-	dbg = 0;
-	audit_msg = msg;
+	audit_msg = audit_point_name[audit_point];
 	audit_rmap(vcpu);
 	audit_write_protection(vcpu);
 	if (strcmp("pre pte write", audit_msg) != 0)
 		audit_mappings(vcpu);
 	audit_sptes_have_rmaps(vcpu);
-	dbg = olddbg;
 }
 
+static bool mmu_audit;
+
+static void mmu_audit_enable(void)
+{
+	int ret;
+
+	if (mmu_audit)
+		return;
+
+	ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+	WARN_ON(ret);
+
+	mmu_audit = true;
+}
+
+static void mmu_audit_disable(void)
+{
+	if (!mmu_audit)
+		return;
+
+	unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+	tracepoint_synchronize_unregister();
+	mmu_audit = false;
+}
+
+static int mmu_audit_set(const char *val, const struct kernel_param *kp)
+{
+	int ret;
+	unsigned long enable;
+
+	ret = strict_strtoul(val, 10, &enable);
+	if (ret < 0)
+		return -EINVAL;
+
+	switch (enable) {
+	case 0:
+		mmu_audit_disable();
+		break;
+	case 1:
+		mmu_audit_enable();
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct kernel_param_ops audit_param_ops = {
+	.set = mmu_audit_set,
+	.get = param_get_bool,
+};
+
+module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
 #endif
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3aab0f0..b60b4fd 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -195,6 +195,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
 
 	TP_ARGS(sp)
 );
+
+TRACE_EVENT(
+	kvm_mmu_audit,
+	TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
+	TP_ARGS(vcpu, audit_point),
+
+	TP_STRUCT__entry(
+		__field(struct kvm_vcpu *, vcpu)
+		__field(int, audit_point)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu = vcpu;
+		__entry->audit_point = audit_point;
+	),
+
+	TP_printk("vcpu:%d %s", __entry->vcpu->cpu,
+		  audit_point_name[__entry->audit_point])
+);
 #endif /* _TRACE_KVMMMU_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a0f2feb..debe770 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -542,7 +542,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 
-	kvm_mmu_audit(vcpu, "pre page fault");
+	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
 	kvm_mmu_free_some_pages(vcpu);
 	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
 			     level, &write_pt, pfn);
@@ -554,7 +554,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
 
 	++vcpu->stat.pf_fixed;
-	kvm_mmu_audit(vcpu, "post page fault (fixed)");
+	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return write_pt;
-- 
cgit v0.10.2


From 2f4f337248cd5660040b7e09b7287a7a0a861f3f Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 30 Aug 2010 18:24:10 +0800
Subject: KVM: MMU: move audit to a separate file

Move the audit code from arch/x86/kvm/mmu.c to arch/x86/kvm/mmu_audit.c

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8b750ff..d2dad65 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3490,282 +3490,5 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
 EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
 
 #ifdef CONFIG_KVM_MMU_AUDIT
-static const char *audit_msg;
-
-typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
-
-static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
-			    inspect_spte_fn fn)
-{
-	int i;
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-		u64 ent = sp->spt[i];
-
-		if (is_shadow_present_pte(ent)) {
-			if (!is_last_spte(ent, sp->role.level)) {
-				struct kvm_mmu_page *child;
-				child = page_header(ent & PT64_BASE_ADDR_MASK);
-				__mmu_spte_walk(kvm, child, fn);
-			} else
-				fn(kvm, &sp->spt[i]);
-		}
-	}
-}
-
-static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
-{
-	int i;
-	struct kvm_mmu_page *sp;
-
-	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
-		return;
-	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
-		hpa_t root = vcpu->arch.mmu.root_hpa;
-		sp = page_header(root);
-		__mmu_spte_walk(vcpu->kvm, sp, fn);
-		return;
-	}
-	for (i = 0; i < 4; ++i) {
-		hpa_t root = vcpu->arch.mmu.pae_root[i];
-
-		if (root && VALID_PAGE(root)) {
-			root &= PT64_BASE_ADDR_MASK;
-			sp = page_header(root);
-			__mmu_spte_walk(vcpu->kvm, sp, fn);
-		}
-	}
-	return;
-}
-
-static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
-				gva_t va, int level)
-{
-	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
-	int i;
-	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
-		u64 *sptep = pt + i;
-		struct kvm_mmu_page *sp;
-		gfn_t gfn;
-		pfn_t pfn;
-		hpa_t hpa;
-
-		sp = page_header(__pa(sptep));
-
-		if (sp->unsync) {
-			if (level != PT_PAGE_TABLE_LEVEL) {
-				printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
-						audit_msg, sp, level);
-				return;
-			}
-
-			if (*sptep == shadow_notrap_nonpresent_pte) {
-				printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
-						audit_msg, sp);
-				return;
-			}
-		}
-
-		if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
-			printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
-					audit_msg, sp);
-			return;
-		}
-
-		if (!is_shadow_present_pte(*sptep) ||
-		      !is_last_spte(*sptep, level))
-			return;
-
-		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
-		pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
-
-		if (is_error_pfn(pfn)) {
-			kvm_release_pfn_clean(pfn);
-			return;
-		}
-
-		hpa =  pfn << PAGE_SHIFT;
-
-		if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
-			printk(KERN_ERR "xx audit error: (%s) levels %d"
-					   " gva %lx pfn %llx hpa %llx ent %llxn",
-					   audit_msg, vcpu->arch.mmu.root_level,
-					   va, pfn, hpa, *sptep);
-	}
-}
-
-static void audit_mappings(struct kvm_vcpu *vcpu)
-{
-	unsigned i;
-
-	if (vcpu->arch.mmu.root_level == 4)
-		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
-	else
-		for (i = 0; i < 4; ++i)
-			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
-				audit_mappings_page(vcpu,
-						    vcpu->arch.mmu.pae_root[i],
-						    i << 30,
-						    2);
-}
-
-void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
-{
-	unsigned long *rmapp;
-	struct kvm_mmu_page *rev_sp;
-	gfn_t gfn;
-
-
-	rev_sp = page_header(__pa(sptep));
-	gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
-
-	if (!gfn_to_memslot(kvm, gfn)) {
-		if (!printk_ratelimit())
-			return;
-		printk(KERN_ERR "%s: no memslot for gfn %llx\n",
-				 audit_msg, gfn);
-		printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n",
-		       audit_msg, (long int)(sptep - rev_sp->spt),
-				rev_sp->gfn);
-		dump_stack();
-		return;
-	}
-
-	rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
-	if (!*rmapp) {
-		if (!printk_ratelimit())
-			return;
-		printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
-				 audit_msg, *sptep);
-		dump_stack();
-	}
-}
-
-void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
-{
-	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
-}
-
-static void check_mappings_rmap(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu_page *sp;
-	int i;
-
-	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
-		u64 *pt = sp->spt;
-
-		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
-			continue;
-
-		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-			if (!is_rmap_spte(pt[i]))
-				continue;
-
-			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
-		}
-	}
-	return;
-}
-
-static void audit_rmap(struct kvm_vcpu *vcpu)
-{
-	check_mappings_rmap(vcpu);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu_page *sp;
-	struct kvm_memory_slot *slot;
-	unsigned long *rmapp;
-	u64 *spte;
-
-	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
-		if (sp->role.direct)
-			continue;
-		if (sp->unsync)
-			continue;
-		if (sp->role.invalid)
-			continue;
-
-		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
-		rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
-
-		spte = rmap_next(vcpu->kvm, rmapp, NULL);
-		while (spte) {
-			if (is_writable_pte(*spte))
-				printk(KERN_ERR "%s: (%s) shadow page has "
-				"writable mappings: gfn %llx role %x\n",
-			       __func__, audit_msg, sp->gfn,
-			       sp->role.word);
-			spte = rmap_next(vcpu->kvm, rmapp, spte);
-		}
-	}
-}
-
-static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
-{
-	audit_msg = audit_point_name[audit_point];
-	audit_rmap(vcpu);
-	audit_write_protection(vcpu);
-	if (strcmp("pre pte write", audit_msg) != 0)
-		audit_mappings(vcpu);
-	audit_sptes_have_rmaps(vcpu);
-}
-
-static bool mmu_audit;
-
-static void mmu_audit_enable(void)
-{
-	int ret;
-
-	if (mmu_audit)
-		return;
-
-	ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
-	WARN_ON(ret);
-
-	mmu_audit = true;
-}
-
-static void mmu_audit_disable(void)
-{
-	if (!mmu_audit)
-		return;
-
-	unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
-	tracepoint_synchronize_unregister();
-	mmu_audit = false;
-}
-
-static int mmu_audit_set(const char *val, const struct kernel_param *kp)
-{
-	int ret;
-	unsigned long enable;
-
-	ret = strict_strtoul(val, 10, &enable);
-	if (ret < 0)
-		return -EINVAL;
-
-	switch (enable) {
-	case 0:
-		mmu_audit_disable();
-		break;
-	case 1:
-		mmu_audit_enable();
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static struct kernel_param_ops audit_param_ops = {
-	.set = mmu_audit_set,
-	.get = param_get_bool,
-};
-
-module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
+#include "mmu_audit.c"
 #endif
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
new file mode 100644
index 0000000..fb8a461
--- /dev/null
+++ b/arch/x86/kvm/mmu_audit.c
@@ -0,0 +1,297 @@
+/*
+ * mmu_audit.c:
+ *
+ * Audit code for KVM MMU
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ *   Marcelo Tosatti <mtosatti@redhat.com>
+ *   Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+static const char *audit_msg;
+
+typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
+
+static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
+			    inspect_spte_fn fn)
+{
+	int i;
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		u64 ent = sp->spt[i];
+
+		if (is_shadow_present_pte(ent)) {
+			if (!is_last_spte(ent, sp->role.level)) {
+				struct kvm_mmu_page *child;
+				child = page_header(ent & PT64_BASE_ADDR_MASK);
+				__mmu_spte_walk(kvm, child, fn);
+			} else
+				fn(kvm, &sp->spt[i]);
+		}
+	}
+}
+
+static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
+{
+	int i;
+	struct kvm_mmu_page *sp;
+
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		return;
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+		hpa_t root = vcpu->arch.mmu.root_hpa;
+		sp = page_header(root);
+		__mmu_spte_walk(vcpu->kvm, sp, fn);
+		return;
+	}
+	for (i = 0; i < 4; ++i) {
+		hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+		if (root && VALID_PAGE(root)) {
+			root &= PT64_BASE_ADDR_MASK;
+			sp = page_header(root);
+			__mmu_spte_walk(vcpu->kvm, sp, fn);
+		}
+	}
+	return;
+}
+
+static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
+				gva_t va, int level)
+{
+	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
+	int i;
+	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
+		u64 *sptep = pt + i;
+		struct kvm_mmu_page *sp;
+		gfn_t gfn;
+		pfn_t pfn;
+		hpa_t hpa;
+
+		sp = page_header(__pa(sptep));
+
+		if (sp->unsync) {
+			if (level != PT_PAGE_TABLE_LEVEL) {
+				printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
+						audit_msg, sp, level);
+				return;
+			}
+
+			if (*sptep == shadow_notrap_nonpresent_pte) {
+				printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
+						audit_msg, sp);
+				return;
+			}
+		}
+
+		if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
+			printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
+					audit_msg, sp);
+			return;
+		}
+
+		if (!is_shadow_present_pte(*sptep) ||
+		      !is_last_spte(*sptep, level))
+			return;
+
+		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+		pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+
+		if (is_error_pfn(pfn)) {
+			kvm_release_pfn_clean(pfn);
+			return;
+		}
+
+		hpa =  pfn << PAGE_SHIFT;
+
+		if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
+			printk(KERN_ERR "xx audit error: (%s) levels %d"
+					   " gva %lx pfn %llx hpa %llx ent %llxn",
+					   audit_msg, vcpu->arch.mmu.root_level,
+					   va, pfn, hpa, *sptep);
+	}
+}
+
+static void audit_mappings(struct kvm_vcpu *vcpu)
+{
+	unsigned i;
+
+	if (vcpu->arch.mmu.root_level == 4)
+		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
+	else
+		for (i = 0; i < 4; ++i)
+			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+				audit_mappings_page(vcpu,
+						    vcpu->arch.mmu.pae_root[i],
+						    i << 30,
+						    2);
+}
+
+void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
+{
+	unsigned long *rmapp;
+	struct kvm_mmu_page *rev_sp;
+	gfn_t gfn;
+
+
+	rev_sp = page_header(__pa(sptep));
+	gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
+
+	if (!gfn_to_memslot(kvm, gfn)) {
+		if (!printk_ratelimit())
+			return;
+		printk(KERN_ERR "%s: no memslot for gfn %llx\n",
+				 audit_msg, gfn);
+		printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n",
+		       audit_msg, (long int)(sptep - rev_sp->spt),
+				rev_sp->gfn);
+		dump_stack();
+		return;
+	}
+
+	rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
+	if (!*rmapp) {
+		if (!printk_ratelimit())
+			return;
+		printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
+				 audit_msg, *sptep);
+		dump_stack();
+	}
+}
+
+void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
+{
+	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
+}
+
+static void check_mappings_rmap(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_page *sp;
+	int i;
+
+	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+		u64 *pt = sp->spt;
+
+		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+			continue;
+
+		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+			if (!is_rmap_spte(pt[i]))
+				continue;
+
+			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
+		}
+	}
+	return;
+}
+
+static void audit_rmap(struct kvm_vcpu *vcpu)
+{
+	check_mappings_rmap(vcpu);
+}
+
+static void audit_write_protection(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_page *sp;
+	struct kvm_memory_slot *slot;
+	unsigned long *rmapp;
+	u64 *spte;
+
+	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+		if (sp->role.direct)
+			continue;
+		if (sp->unsync)
+			continue;
+		if (sp->role.invalid)
+			continue;
+
+		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
+		rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
+
+		spte = rmap_next(vcpu->kvm, rmapp, NULL);
+		while (spte) {
+			if (is_writable_pte(*spte))
+				printk(KERN_ERR "%s: (%s) shadow page has "
+				"writable mappings: gfn %llx role %x\n",
+			       __func__, audit_msg, sp->gfn,
+			       sp->role.word);
+			spte = rmap_next(vcpu->kvm, rmapp, spte);
+		}
+	}
+}
+
+static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
+{
+	audit_msg = audit_point_name[audit_point];
+	audit_rmap(vcpu);
+	audit_write_protection(vcpu);
+	if (strcmp("pre pte write", audit_msg) != 0)
+		audit_mappings(vcpu);
+	audit_sptes_have_rmaps(vcpu);
+}
+
+static bool mmu_audit;
+
+static void mmu_audit_enable(void)
+{
+	int ret;
+
+	if (mmu_audit)
+		return;
+
+	ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+	WARN_ON(ret);
+
+	mmu_audit = true;
+}
+
+static void mmu_audit_disable(void)
+{
+	if (!mmu_audit)
+		return;
+
+	unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+	tracepoint_synchronize_unregister();
+	mmu_audit = false;
+}
+
+static int mmu_audit_set(const char *val, const struct kernel_param *kp)
+{
+	int ret;
+	unsigned long enable;
+
+	ret = strict_strtoul(val, 10, &enable);
+	if (ret < 0)
+		return -EINVAL;
+
+	switch (enable) {
+	case 0:
+		mmu_audit_disable();
+		break;
+	case 1:
+		mmu_audit_enable();
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct kernel_param_ops audit_param_ops = {
+	.set = mmu_audit_set,
+	.get = param_get_bool,
+};
+
+module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
-- 
cgit v0.10.2


From 49edf87806f52a005152beaed9f4731862efc8fe Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 30 Aug 2010 18:25:03 +0800
Subject: KVM: MMU: improve active sp audit

Both audit_rmap() and audit_write_protection() need to walk all active sp, so
we can do these checking in a sp walking

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index fb8a461..8becb86 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -65,6 +65,16 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
 	return;
 }
 
+typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
+{
+	struct kvm_mmu_page *sp;
+
+	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
+		fn(kvm, sp);
+}
+
 static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
 				gva_t va, int level)
 {
@@ -175,67 +185,59 @@ void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
 	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
 }
 
-static void check_mappings_rmap(struct kvm_vcpu *vcpu)
+static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-	struct kvm_mmu_page *sp;
 	int i;
 
-	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
-		u64 *pt = sp->spt;
+	if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+		return;
 
-		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		if (!is_rmap_spte(sp->spt[i]))
 			continue;
 
-		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-			if (!is_rmap_spte(pt[i]))
-				continue;
-
-			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
-		}
+		inspect_spte_has_rmap(kvm, sp->spt + i);
 	}
-	return;
 }
 
-static void audit_rmap(struct kvm_vcpu *vcpu)
+void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-	check_mappings_rmap(vcpu);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu_page *sp;
 	struct kvm_memory_slot *slot;
 	unsigned long *rmapp;
 	u64 *spte;
 
-	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
-		if (sp->role.direct)
-			continue;
-		if (sp->unsync)
-			continue;
-		if (sp->role.invalid)
-			continue;
+	if (sp->role.direct || sp->unsync || sp->role.invalid)
+		return;
 
-		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
-		rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
+	slot = gfn_to_memslot(kvm, sp->gfn);
+	rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
 
-		spte = rmap_next(vcpu->kvm, rmapp, NULL);
-		while (spte) {
-			if (is_writable_pte(*spte))
-				printk(KERN_ERR "%s: (%s) shadow page has "
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		if (is_writable_pte(*spte))
+			printk(KERN_ERR "%s: (%s) shadow page has "
 				"writable mappings: gfn %llx role %x\n",
 			       __func__, audit_msg, sp->gfn,
 			       sp->role.word);
-			spte = rmap_next(vcpu->kvm, rmapp, spte);
-		}
+		spte = rmap_next(kvm, rmapp, spte);
 	}
 }
 
+static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	check_mappings_rmap(kvm, sp);
+	audit_write_protection(kvm, sp);
+}
+
+static void audit_all_active_sps(struct kvm *kvm)
+{
+	walk_all_active_sps(kvm, audit_sp);
+}
+
 static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
 {
 	audit_msg = audit_point_name[audit_point];
-	audit_rmap(vcpu);
-	audit_write_protection(vcpu);
+	audit_all_active_sps(vcpu->kvm);
 	if (strcmp("pre pte write", audit_msg) != 0)
 		audit_mappings(vcpu);
 	audit_sptes_have_rmaps(vcpu);
-- 
cgit v0.10.2


From eb2591865a234c6fb1162085d9b277236fa890b6 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 30 Aug 2010 18:25:51 +0800
Subject: KVM: MMU: improve spte audit

Both audit_mappings() and audit_sptes_have_rmaps() need to walk vcpu's page
table, so we can do these checking in a spte walking

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 8becb86..3bde186 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,23 +19,24 @@
 
 static const char *audit_msg;
 
-typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
+typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
 
-static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
-			    inspect_spte_fn fn)
+static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			    inspect_spte_fn fn, int level)
 {
 	int i;
 
 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-		u64 ent = sp->spt[i];
-
-		if (is_shadow_present_pte(ent)) {
-			if (!is_last_spte(ent, sp->role.level)) {
-				struct kvm_mmu_page *child;
-				child = page_header(ent & PT64_BASE_ADDR_MASK);
-				__mmu_spte_walk(kvm, child, fn);
-			} else
-				fn(kvm, &sp->spt[i]);
+		u64 *ent = sp->spt;
+
+		fn(vcpu, ent + i, level);
+
+		if (is_shadow_present_pte(ent[i]) &&
+		      !is_last_spte(ent[i], level)) {
+			struct kvm_mmu_page *child;
+
+			child = page_header(ent[i] & PT64_BASE_ADDR_MASK);
+			__mmu_spte_walk(vcpu, child, fn, level - 1);
 		}
 	}
 }
@@ -47,21 +48,25 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
 
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
+
 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
+
 		sp = page_header(root);
-		__mmu_spte_walk(vcpu->kvm, sp, fn);
+		__mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL);
 		return;
 	}
+
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
 		if (root && VALID_PAGE(root)) {
 			root &= PT64_BASE_ADDR_MASK;
 			sp = page_header(root);
-			__mmu_spte_walk(vcpu->kvm, sp, fn);
+			__mmu_spte_walk(vcpu, sp, fn, 2);
 		}
 	}
+
 	return;
 }
 
@@ -75,80 +80,55 @@ static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
 		fn(kvm, sp);
 }
 
-static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
-				gva_t va, int level)
+static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 {
-	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
-	int i;
-	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
-		u64 *sptep = pt + i;
-		struct kvm_mmu_page *sp;
-		gfn_t gfn;
-		pfn_t pfn;
-		hpa_t hpa;
-
-		sp = page_header(__pa(sptep));
-
-		if (sp->unsync) {
-			if (level != PT_PAGE_TABLE_LEVEL) {
-				printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
-						audit_msg, sp, level);
-				return;
-			}
-
-			if (*sptep == shadow_notrap_nonpresent_pte) {
-				printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
-						audit_msg, sp);
-				return;
-			}
-		}
+	struct kvm_mmu_page *sp;
+	gfn_t gfn;
+	pfn_t pfn;
+	hpa_t hpa;
 
-		if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
-			printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
-					audit_msg, sp);
+	sp = page_header(__pa(sptep));
+
+	if (sp->unsync) {
+		if (level != PT_PAGE_TABLE_LEVEL) {
+			printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
+				audit_msg, sp, level);
 			return;
 		}
 
-		if (!is_shadow_present_pte(*sptep) ||
-		      !is_last_spte(*sptep, level))
+		if (*sptep == shadow_notrap_nonpresent_pte) {
+			printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
+				audit_msg, sp);
 			return;
+		}
+	}
 
-		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
-		pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+	if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
+		printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
+			audit_msg, sp);
+		return;
+	}
 
-		if (is_error_pfn(pfn)) {
-			kvm_release_pfn_clean(pfn);
-			return;
-		}
+	if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
+		return;
 
-		hpa =  pfn << PAGE_SHIFT;
+	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+	pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
 
-		if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
-			printk(KERN_ERR "xx audit error: (%s) levels %d"
-					   " gva %lx pfn %llx hpa %llx ent %llxn",
-					   audit_msg, vcpu->arch.mmu.root_level,
-					   va, pfn, hpa, *sptep);
+	if (is_error_pfn(pfn)) {
+		kvm_release_pfn_clean(pfn);
+		return;
 	}
-}
 
-static void audit_mappings(struct kvm_vcpu *vcpu)
-{
-	unsigned i;
-
-	if (vcpu->arch.mmu.root_level == 4)
-		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
-	else
-		for (i = 0; i < 4; ++i)
-			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
-				audit_mappings_page(vcpu,
-						    vcpu->arch.mmu.pae_root[i],
-						    i << 30,
-						    2);
+	hpa =  pfn << PAGE_SHIFT;
+	if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
+		printk(KERN_ERR "xx audit error: (%s) levels %d"
+				   "pfn %llx hpa %llx ent %llxn",
+				   audit_msg, vcpu->arch.mmu.root_level,
+				   pfn, hpa, *sptep);
 }
 
-void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
+static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 {
 	unsigned long *rmapp;
 	struct kvm_mmu_page *rev_sp;
@@ -180,9 +160,10 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 	}
 }
 
-void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
+static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 {
-	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
+	if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level))
+		inspect_spte_has_rmap(vcpu->kvm, sptep);
 }
 
 static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -234,13 +215,22 @@ static void audit_all_active_sps(struct kvm *kvm)
 	walk_all_active_sps(kvm, audit_sp);
 }
 
+static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+	audit_sptes_have_rmaps(vcpu, sptep, level);
+	audit_mappings(vcpu, sptep, level);
+}
+
+static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
+{
+	mmu_spte_walk(vcpu, audit_spte);
+}
+
 static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
 {
 	audit_msg = audit_point_name[audit_point];
 	audit_all_active_sps(vcpu->kvm);
-	if (strcmp("pre pte write", audit_msg) != 0)
-		audit_mappings(vcpu);
-	audit_sptes_have_rmaps(vcpu);
+	audit_vcpu_spte(vcpu);
 }
 
 static bool mmu_audit;
-- 
cgit v0.10.2


From 30644b902c5eef5328d37a2e15f1921aaca2588b Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 30 Aug 2010 18:26:33 +0800
Subject: KVM: MMU: lower the aduit frequency

The audit is very high overhead, so we need lower the frequency to assure
the guest is running.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 3bde186..bd2b1be 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -17,6 +17,8 @@
  *
  */
 
+#include <linux/ratelimit.h>
+
 static const char *audit_msg;
 
 typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
@@ -228,6 +230,11 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
 
 static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
 {
+	static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
+
+	if (!__ratelimit(&ratelimit_state))
+		return;
+
 	audit_msg = audit_point_name[audit_point];
 	audit_all_active_sps(vcpu->kvm);
 	audit_vcpu_spte(vcpu);
-- 
cgit v0.10.2


From 55438cc751c32cfafac52938403a86069e25b1bf Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Thu, 2 Sep 2010 17:55:00 +0900
Subject: KVM: ia64: define kvm_lapic_enabled() to fix a compile error

The following patch

  commit 57ce1659316f4ca298919649f9b1b55862ac3826
  KVM: x86: In DM_LOWEST, only deliver interrupts to vcpus with enabled LAPIC's

ignored the fact that kvm_irq_delivery_to_apic() was also used by ia64.

We define kvm_lapic_enabled() to fix a compile error caused by this.
This will have the same effect as reverting the problematic patch for ia64.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h
index ee541ce..c5f92a9 100644
--- a/arch/ia64/kvm/lapic.h
+++ b/arch/ia64/kvm/lapic.h
@@ -25,5 +25,6 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
 #define kvm_apic_present(x) (true)
+#define kvm_lapic_enabled(x) (true)
 
 #endif
-- 
cgit v0.10.2


From bed1ed9860d3744cc6488831fa5672d5c7aff4be Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 2 Aug 2010 11:06:26 +0200
Subject: KVM: PPC: Move EXIT_DEBUG partially to tracepoints

We have a debug printk on every exit that is usually #ifdef'ed out. Using
tracepoints makes a lot more sense here though, as they can be dynamically
enabled.

This patch converts the most commonly used debug printks of EXIT_DEBUG to
tracepoints.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 7656b6d..37db61d 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -17,6 +17,7 @@
 #include <linux/kvm_host.h>
 #include <linux/err.h>
 #include <linux/slab.h>
+#include "trace.h"
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
@@ -35,7 +36,6 @@
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
 /* #define EXIT_DEBUG */
-/* #define EXIT_DEBUG_SIMPLE */
 /* #define DEBUG_EXT */
 
 static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
@@ -105,14 +105,6 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 	kvmppc_giveup_ext(vcpu, MSR_VSX);
 }
 
-#if defined(EXIT_DEBUG)
-static u32 kvmppc_get_dec(struct kvm_vcpu *vcpu)
-{
-	u64 jd = mftb() - vcpu->arch.dec_jiffies;
-	return vcpu->arch.dec - jd;
-}
-#endif
-
 static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
 {
 	ulong smsr = vcpu->arch.shared->msr;
@@ -850,16 +842,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	run->exit_reason = KVM_EXIT_UNKNOWN;
 	run->ready_for_interrupt_injection = 1;
-#ifdef EXIT_DEBUG
-	printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | dar=0x%lx | dec=0x%x | msr=0x%lx\n",
-		exit_nr, kvmppc_get_pc(vcpu), kvmppc_get_fault_dar(vcpu),
-		kvmppc_get_dec(vcpu), to_svcpu(vcpu)->shadow_srr1);
-#elif defined (EXIT_DEBUG_SIMPLE)
-	if ((exit_nr != 0x900) && (exit_nr != 0x500))
-		printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | dar=0x%lx | msr=0x%lx\n",
-			exit_nr, kvmppc_get_pc(vcpu), kvmppc_get_fault_dar(vcpu),
-			vcpu->arch.shared->msr);
-#endif
+
+	trace_kvm_book3s_exit(exit_nr, vcpu);
 	kvm_resched(vcpu);
 	switch (exit_nr) {
 	case BOOK3S_INTERRUPT_INST_STORAGE:
@@ -1091,9 +1075,7 @@ program_interrupt:
 		}
 	}
 
-#ifdef EXIT_DEBUG
-	printk(KERN_EMERG "KVM exit: vcpu=0x%p pc=0x%lx r=0x%x\n", vcpu, kvmppc_get_pc(vcpu), r);
-#endif
+	trace_kvm_book3s_reenter(r, vcpu);
 
 	return r;
 }
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index a8e8400..b5e9d81 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -98,6 +98,57 @@ TRACE_EVENT(kvm_gtlb_write,
 		__entry->word1, __entry->word2)
 );
 
+
+/*************************************************************************
+ *                         Book3S trace points                           *
+ *************************************************************************/
+
+#ifdef CONFIG_PPC_BOOK3S
+
+TRACE_EVENT(kvm_book3s_exit,
+	TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
+	TP_ARGS(exit_nr, vcpu),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	exit_nr		)
+		__field(	unsigned long,	pc		)
+		__field(	unsigned long,	msr		)
+		__field(	unsigned long,	dar		)
+		__field(	unsigned long,	srr1		)
+	),
+
+	TP_fast_assign(
+		__entry->exit_nr	= exit_nr;
+		__entry->pc		= kvmppc_get_pc(vcpu);
+		__entry->dar		= kvmppc_get_fault_dar(vcpu);
+		__entry->msr		= vcpu->arch.shared->msr;
+		__entry->srr1		= to_svcpu(vcpu)->shadow_srr1;
+	),
+
+	TP_printk("exit=0x%x | pc=0x%lx | msr=0x%lx | dar=0x%lx | srr1=0x%lx",
+		  __entry->exit_nr, __entry->pc, __entry->msr, __entry->dar,
+		  __entry->srr1)
+);
+
+TRACE_EVENT(kvm_book3s_reenter,
+	TP_PROTO(int r, struct kvm_vcpu *vcpu),
+	TP_ARGS(r, vcpu),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	r		)
+		__field(	unsigned long,	pc		)
+	),
+
+	TP_fast_assign(
+		__entry->r		= r;
+		__entry->pc		= kvmppc_get_pc(vcpu);
+	),
+
+	TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc)
+);
+
+#endif /* CONFIG_PPC_BOOK3S */
+
 #endif /* _TRACE_KVM_H */
 
 /* This part must be outside protection */
-- 
cgit v0.10.2


From 82fdee7bce546c3ce38dcf0db6096eea73dbe7bd Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 2 Aug 2010 11:38:54 +0200
Subject: KVM: PPC: Move book3s_64 mmu map debug print to trace point

This patch moves Book3s MMU debugging over to tracepoints.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 672b149..aa516ad 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -28,19 +28,13 @@
 #include <asm/machdep.h>
 #include <asm/mmu_context.h>
 #include <asm/hw_irq.h>
+#include "trace.h"
 
 #define PTE_SIZE 12
 #define VSID_ALL 0
 
-/* #define DEBUG_MMU */
 /* #define DEBUG_SLB */
 
-#ifdef DEBUG_MMU
-#define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__)
-#else
-#define dprintk_mmu(a, ...) do { } while(0)
-#endif
-
 #ifdef DEBUG_SLB
 #define dprintk_slb(a, ...) printk(KERN_INFO a, __VA_ARGS__)
 #else
@@ -156,10 +150,7 @@ map_again:
 	} else {
 		struct hpte_cache *pte = kvmppc_mmu_hpte_cache_next(vcpu);
 
-		dprintk_mmu("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx\n",
-			    ((rflags & HPTE_R_PP) == 3) ? '-' : 'w',
-			    (rflags & HPTE_R_N) ? '-' : 'x',
-			    orig_pte->eaddr, hpteg, va, orig_pte->vpage, hpaddr);
+		trace_kvm_book3s_64_mmu_map(rflags, hpteg, va, hpaddr, orig_pte);
 
 		/* The ppc_md code may give us a secondary entry even though we
 		   asked for a primary. Fix up. */
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index b5e9d81..8ed6f1c 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -147,6 +147,40 @@ TRACE_EVENT(kvm_book3s_reenter,
 	TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc)
 );
 
+#ifdef CONFIG_PPC_BOOK3S_64
+
+TRACE_EVENT(kvm_book3s_64_mmu_map,
+	TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr,
+		 struct kvmppc_pte *orig_pte),
+	TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte),
+
+	TP_STRUCT__entry(
+		__field(	unsigned char,		flag_w		)
+		__field(	unsigned char,		flag_x		)
+		__field(	unsigned long,		eaddr		)
+		__field(	unsigned long,		hpteg		)
+		__field(	unsigned long,		va		)
+		__field(	unsigned long long,	vpage		)
+		__field(	unsigned long,		hpaddr		)
+	),
+
+	TP_fast_assign(
+		__entry->flag_w	= ((rflags & HPTE_R_PP) == 3) ? '-' : 'w';
+		__entry->flag_x	= (rflags & HPTE_R_N) ? '-' : 'x';
+		__entry->eaddr	= orig_pte->eaddr;
+		__entry->hpteg	= hpteg;
+		__entry->va	= va;
+		__entry->vpage	= orig_pte->vpage;
+		__entry->hpaddr	= hpaddr;
+	),
+
+	TP_printk("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx",
+		  __entry->flag_w, __entry->flag_x, __entry->eaddr,
+		  __entry->hpteg, __entry->va, __entry->vpage, __entry->hpaddr)
+);
+
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
 #endif /* CONFIG_PPC_BOOK3S */
 
 #endif /* _TRACE_KVM_H */
-- 
cgit v0.10.2


From 4c4eea7769d0099ea09f9bdb7aed1cc61d57c9d6 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 2 Aug 2010 12:51:07 +0200
Subject: KVM: PPC: Add tracepoint for generic mmu map

This patch moves the generic mmu map debugging over to tracepoints.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
index 02c64ab..ac94bd9 100644
--- a/arch/powerpc/kvm/book3s_mmu_hpte.c
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -21,6 +21,7 @@
 #include <linux/kvm_host.h>
 #include <linux/hash.h>
 #include <linux/slab.h>
+#include "trace.h"
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
@@ -66,6 +67,8 @@ void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
 	u64 index;
 
+	trace_kvm_book3s_mmu_map(pte);
+
 	spin_lock(&vcpu->arch.mmu_lock);
 
 	/* Add to ePTE list */
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index 8ed6f1c..68a8444 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -181,6 +181,35 @@ TRACE_EVENT(kvm_book3s_64_mmu_map,
 
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
+TRACE_EVENT(kvm_book3s_mmu_map,
+	TP_PROTO(struct hpte_cache *pte),
+	TP_ARGS(pte),
+
+	TP_STRUCT__entry(
+		__field(	u64,		host_va		)
+		__field(	u64,		pfn		)
+		__field(	ulong,		eaddr		)
+		__field(	u64,		vpage		)
+		__field(	ulong,		raddr		)
+		__field(	int,		flags		)
+	),
+
+	TP_fast_assign(
+		__entry->host_va	= pte->host_va;
+		__entry->pfn		= pte->pfn;
+		__entry->eaddr		= pte->pte.eaddr;
+		__entry->vpage		= pte->pte.vpage;
+		__entry->raddr		= pte->pte.raddr;
+		__entry->flags		= (pte->pte.may_read ? 0x4 : 0) |
+					  (pte->pte.may_write ? 0x2 : 0) |
+					  (pte->pte.may_execute ? 0x1 : 0);
+	),
+
+	TP_printk("Map: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]",
+		  __entry->host_va, __entry->pfn, __entry->eaddr,
+		  __entry->vpage, __entry->raddr, __entry->flags)
+);
+
 #endif /* CONFIG_PPC_BOOK3S */
 
 #endif /* _TRACE_KVM_H */
-- 
cgit v0.10.2


From 8696ee431233171b3c1cc82bae0193efc4fef2ac Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 2 Aug 2010 12:55:19 +0200
Subject: KVM: PPC: Move pte invalidate debug code to tracepoint

This patch moves the SPTE flush debug printk over to tracepoints.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
index ac94bd9..3397152 100644
--- a/arch/powerpc/kvm/book3s_mmu_hpte.c
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -104,8 +104,7 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 	if (hlist_unhashed(&pte->list_pte))
 		return;
 
-	dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n",
-		    pte->pte.eaddr, pte->pte.vpage, pte->host_va);
+	trace_kvm_book3s_mmu_invalidate(pte);
 
 	/* Different for 32 and 64 bit */
 	kvmppc_mmu_invalidate_pte(vcpu, pte);
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index 68a8444..06ad93e 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -210,6 +210,35 @@ TRACE_EVENT(kvm_book3s_mmu_map,
 		  __entry->vpage, __entry->raddr, __entry->flags)
 );
 
+TRACE_EVENT(kvm_book3s_mmu_invalidate,
+	TP_PROTO(struct hpte_cache *pte),
+	TP_ARGS(pte),
+
+	TP_STRUCT__entry(
+		__field(	u64,		host_va		)
+		__field(	u64,		pfn		)
+		__field(	ulong,		eaddr		)
+		__field(	u64,		vpage		)
+		__field(	ulong,		raddr		)
+		__field(	int,		flags		)
+	),
+
+	TP_fast_assign(
+		__entry->host_va	= pte->host_va;
+		__entry->pfn		= pte->pfn;
+		__entry->eaddr		= pte->pte.eaddr;
+		__entry->vpage		= pte->pte.vpage;
+		__entry->raddr		= pte->pte.raddr;
+		__entry->flags		= (pte->pte.may_read ? 0x4 : 0) |
+					  (pte->pte.may_write ? 0x2 : 0) |
+					  (pte->pte.may_execute ? 0x1 : 0);
+	),
+
+	TP_printk("Flush: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]",
+		  __entry->host_va, __entry->pfn, __entry->eaddr,
+		  __entry->vpage, __entry->raddr, __entry->flags)
+);
+
 #endif /* CONFIG_PPC_BOOK3S */
 
 #endif /* _TRACE_KVM_H */
-- 
cgit v0.10.2


From c22c31963b4b0c23250e8f520a76427b3986b73b Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 2 Aug 2010 13:38:18 +0200
Subject: KVM: PPC: Fix sid map search after flush

After a flush the sid map contained lots of entries with 0 for their gvsid and
hvsid value. Unfortunately, 0 can be a real value the guest searches for when
looking up a vsid so it would incorrectly find the host's 0 hvsid mapping which
doesn't belong to our sid space.

So let's also check for the valid bit that indicated that the sid we're
looking at actually contains useful data.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index aa516ad..ebb1b5d 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -65,14 +65,14 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
 
 	sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
 	map = &to_book3s(vcpu)->sid_map[sid_map_mask];
-	if (map->guest_vsid == gvsid) {
+	if (map->valid && (map->guest_vsid == gvsid)) {
 		dprintk_slb("SLB: Searching: 0x%llx -> 0x%llx\n",
 			    gvsid, map->host_vsid);
 		return map;
 	}
 
 	map = &to_book3s(vcpu)->sid_map[SID_MAP_MASK - sid_map_mask];
-	if (map->guest_vsid == gvsid) {
+	if (map->valid && (map->guest_vsid == gvsid)) {
 		dprintk_slb("SLB: Searching 0x%llx -> 0x%llx\n",
 			    gvsid, map->host_vsid);
 		return map;
-- 
cgit v0.10.2


From c60b4cf70127941e2f944a7971a7f6b3ecb367ac Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 2 Aug 2010 13:40:30 +0200
Subject: KVM: PPC: Add tracepoints for generic spte flushes

The different ways of flusing shadow ptes have their own debug prints which use
stupid old printk.

Let's move them to tracepoints, making them easier available, faster and
possible to activate on demand

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
index 3397152..bd6a767 100644
--- a/arch/powerpc/kvm/book3s_mmu_hpte.c
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -31,14 +31,6 @@
 
 #define PTE_SIZE	12
 
-/* #define DEBUG_MMU */
-
-#ifdef DEBUG_MMU
-#define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__)
-#else
-#define dprintk_mmu(a, ...) do { } while(0)
-#endif
-
 static struct kmem_cache *hpte_cache;
 
 static inline u64 kvmppc_mmu_hash_pte(u64 eaddr)
@@ -186,9 +178,7 @@ static void kvmppc_mmu_pte_flush_long(struct kvm_vcpu *vcpu, ulong guest_ea)
 
 void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
 {
-	dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx & 0x%lx\n",
-		    vcpu->arch.hpte_cache_count, guest_ea, ea_mask);
-
+	trace_kvm_book3s_mmu_flush("", vcpu, guest_ea, ea_mask);
 	guest_ea &= ea_mask;
 
 	switch (ea_mask) {
@@ -251,8 +241,7 @@ static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp)
 
 void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
 {
-	dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
-		    vcpu->arch.hpte_cache_count, guest_vp, vp_mask);
+	trace_kvm_book3s_mmu_flush("v", vcpu, guest_vp, vp_mask);
 	guest_vp &= vp_mask;
 
 	switch(vp_mask) {
@@ -274,8 +263,7 @@ void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
 	struct hpte_cache *pte;
 	int i;
 
-	dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx - 0x%lx\n",
-		    vcpu->arch.hpte_cache_count, pa_start, pa_end);
+	trace_kvm_book3s_mmu_flush("p", vcpu, pa_start, pa_end);
 
 	rcu_read_lock();
 
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index 06ad93e..23f757a 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -239,6 +239,29 @@ TRACE_EVENT(kvm_book3s_mmu_invalidate,
 		  __entry->vpage, __entry->raddr, __entry->flags)
 );
 
+TRACE_EVENT(kvm_book3s_mmu_flush,
+	TP_PROTO(const char *type, struct kvm_vcpu *vcpu, unsigned long long p1,
+		 unsigned long long p2),
+	TP_ARGS(type, vcpu, p1, p2),
+
+	TP_STRUCT__entry(
+		__field(	int,			count		)
+		__field(	unsigned long long,	p1		)
+		__field(	unsigned long long,	p2		)
+		__field(	const char *,		type		)
+	),
+
+	TP_fast_assign(
+		__entry->count		= vcpu->arch.hpte_cache_count;
+		__entry->p1		= p1;
+		__entry->p2		= p2;
+		__entry->type		= type;
+	),
+
+	TP_printk("Flush %d %sPTEs: %llx - %llx",
+		  __entry->count, __entry->type, __entry->p1, __entry->p2)
+);
+
 #endif /* CONFIG_PPC_BOOK3S */
 
 #endif /* _TRACE_KVM_H */
-- 
cgit v0.10.2


From 4cb6b7ea0cd085e6613153ad69608cad6421abcc Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 2 Aug 2010 16:08:22 +0200
Subject: KVM: PPC: Preload magic page when in kernel mode

When the guest jumps into kernel mode and has the magic page mapped, theres a
very high chance that it will also use it. So let's detect that scenario and
map the segment accordingly.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 37db61d..54ca578 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -145,6 +145,16 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
 		   (old_msr & (MSR_PR|MSR_IR|MSR_DR))) {
 		kvmppc_mmu_flush_segments(vcpu);
 		kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
+
+		/* Preload magic page segment when in kernel mode */
+		if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) {
+			struct kvm_vcpu_arch *a = &vcpu->arch;
+
+			if (msr & MSR_DR)
+				kvmppc_mmu_map_segment(vcpu, a->magic_page_ea);
+			else
+				kvmppc_mmu_map_segment(vcpu, a->magic_page_pa);
+		}
 	}
 
 	/* Preload FPU if it's enabled */
-- 
cgit v0.10.2


From 2e602847d9c2d6b487bda62bbbe550db40ca912f Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 2 Aug 2010 20:11:39 +0200
Subject: KVM: PPC: Don't flush PTEs on NX/RO hit

When hitting a no-execute or read-only data/inst storage interrupt we were
flushing the respective PTE so we're sure it gets properly overwritten next.

According to the spec, this is unnecessary though. The guest issues a tlbie
anyways, so we're safe to just keep the PTE around and have it manually removed
from the guest, saving us a flush.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 54ca578..2fb528f 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -887,7 +887,6 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			vcpu->arch.shared->msr |=
 				to_svcpu(vcpu)->shadow_srr1 & 0x58000000;
 			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-			kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
 			r = RESUME_GUEST;
 		}
 		break;
@@ -913,7 +912,6 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			vcpu->arch.shared->dar = dar;
 			vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
 			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-			kvmppc_mmu_pte_flush(vcpu, dar, ~0xFFFUL);
 			r = RESUME_GUEST;
 		}
 		break;
-- 
cgit v0.10.2


From e7c1d14e3bf40b87e6a3f68964b36dbb2c875c0f Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 2 Aug 2010 21:24:48 +0200
Subject: KVM: PPC: Make invalidation code more reliable

There is a race condition in the pte invalidation code path where we can't
be sure if a pte was invalidated already. So let's move the spin lock around
to get rid of the race.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
index bd6a767..79751d8 100644
--- a/arch/powerpc/kvm/book3s_mmu_hpte.c
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -92,10 +92,6 @@ static void free_pte_rcu(struct rcu_head *head)
 
 static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
-	/* pte already invalidated? */
-	if (hlist_unhashed(&pte->list_pte))
-		return;
-
 	trace_kvm_book3s_mmu_invalidate(pte);
 
 	/* Different for 32 and 64 bit */
@@ -103,18 +99,24 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 
 	spin_lock(&vcpu->arch.mmu_lock);
 
+	/* pte already invalidated in between? */
+	if (hlist_unhashed(&pte->list_pte)) {
+		spin_unlock(&vcpu->arch.mmu_lock);
+		return;
+	}
+
 	hlist_del_init_rcu(&pte->list_pte);
 	hlist_del_init_rcu(&pte->list_pte_long);
 	hlist_del_init_rcu(&pte->list_vpte);
 	hlist_del_init_rcu(&pte->list_vpte_long);
 
-	spin_unlock(&vcpu->arch.mmu_lock);
-
 	if (pte->pte.may_write)
 		kvm_release_pfn_dirty(pte->pfn);
 	else
 		kvm_release_pfn_clean(pte->pfn);
 
+	spin_unlock(&vcpu->arch.mmu_lock);
+
 	vcpu->arch.hpte_cache_count--;
 	call_rcu(&pte->rcu_head, free_pte_rcu);
 }
-- 
cgit v0.10.2


From 928d78be54014e65498e289fdc3f82acc4b804a9 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 2 Aug 2010 21:25:33 +0200
Subject: KVM: PPC: Move slb debugging to tracepoints

This patch moves debugging printks for shadow SLB debugging over to tracepoints.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index ebb1b5d..321c931 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -33,14 +33,6 @@
 #define PTE_SIZE 12
 #define VSID_ALL 0
 
-/* #define DEBUG_SLB */
-
-#ifdef DEBUG_SLB
-#define dprintk_slb(a, ...) printk(KERN_INFO a, __VA_ARGS__)
-#else
-#define dprintk_slb(a, ...) do { } while(0)
-#endif
-
 void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
 	ppc_md.hpte_invalidate(pte->slot, pte->host_va,
@@ -66,20 +58,17 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
 	sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
 	map = &to_book3s(vcpu)->sid_map[sid_map_mask];
 	if (map->valid && (map->guest_vsid == gvsid)) {
-		dprintk_slb("SLB: Searching: 0x%llx -> 0x%llx\n",
-			    gvsid, map->host_vsid);
+		trace_kvm_book3s_slb_found(gvsid, map->host_vsid);
 		return map;
 	}
 
 	map = &to_book3s(vcpu)->sid_map[SID_MAP_MASK - sid_map_mask];
 	if (map->valid && (map->guest_vsid == gvsid)) {
-		dprintk_slb("SLB: Searching 0x%llx -> 0x%llx\n",
-			    gvsid, map->host_vsid);
+		trace_kvm_book3s_slb_found(gvsid, map->host_vsid);
 		return map;
 	}
 
-	dprintk_slb("SLB: Searching %d/%d: 0x%llx -> not found\n",
-		    sid_map_mask, SID_MAP_MASK - sid_map_mask, gvsid);
+	trace_kvm_book3s_slb_fail(sid_map_mask, gvsid);
 	return NULL;
 }
 
@@ -205,8 +194,7 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
 	map->guest_vsid = gvsid;
 	map->valid = true;
 
-	dprintk_slb("SLB: New mapping at %d: 0x%llx -> 0x%llx\n",
-		    sid_map_mask, gvsid, map->host_vsid);
+	trace_kvm_book3s_slb_map(sid_map_mask, gvsid, map->host_vsid);
 
 	return map;
 }
@@ -278,7 +266,7 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
 	to_svcpu(vcpu)->slb[slb_index].esid = slb_esid;
 	to_svcpu(vcpu)->slb[slb_index].vsid = slb_vsid;
 
-	dprintk_slb("slbmte %#llx, %#llx\n", slb_vsid, slb_esid);
+	trace_kvm_book3s_slbmte(slb_vsid, slb_esid);
 
 	return 0;
 }
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index 23f757a..3aca1b0 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -262,6 +262,79 @@ TRACE_EVENT(kvm_book3s_mmu_flush,
 		  __entry->count, __entry->type, __entry->p1, __entry->p2)
 );
 
+TRACE_EVENT(kvm_book3s_slb_found,
+	TP_PROTO(unsigned long long gvsid, unsigned long long hvsid),
+	TP_ARGS(gvsid, hvsid),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long long,	gvsid		)
+		__field(	unsigned long long,	hvsid		)
+	),
+
+	TP_fast_assign(
+		__entry->gvsid		= gvsid;
+		__entry->hvsid		= hvsid;
+	),
+
+	TP_printk("%llx -> %llx", __entry->gvsid, __entry->hvsid)
+);
+
+TRACE_EVENT(kvm_book3s_slb_fail,
+	TP_PROTO(u16 sid_map_mask, unsigned long long gvsid),
+	TP_ARGS(sid_map_mask, gvsid),
+
+	TP_STRUCT__entry(
+		__field(	unsigned short,		sid_map_mask	)
+		__field(	unsigned long long,	gvsid		)
+	),
+
+	TP_fast_assign(
+		__entry->sid_map_mask	= sid_map_mask;
+		__entry->gvsid		= gvsid;
+	),
+
+	TP_printk("%x/%x: %llx", __entry->sid_map_mask,
+		  SID_MAP_MASK - __entry->sid_map_mask, __entry->gvsid)
+);
+
+TRACE_EVENT(kvm_book3s_slb_map,
+	TP_PROTO(u16 sid_map_mask, unsigned long long gvsid,
+		 unsigned long long hvsid),
+	TP_ARGS(sid_map_mask, gvsid, hvsid),
+
+	TP_STRUCT__entry(
+		__field(	unsigned short,		sid_map_mask	)
+		__field(	unsigned long long,	guest_vsid	)
+		__field(	unsigned long long,	host_vsid	)
+	),
+
+	TP_fast_assign(
+		__entry->sid_map_mask	= sid_map_mask;
+		__entry->guest_vsid	= gvsid;
+		__entry->host_vsid	= hvsid;
+	),
+
+	TP_printk("%x: %llx -> %llx", __entry->sid_map_mask,
+		  __entry->guest_vsid, __entry->host_vsid)
+);
+
+TRACE_EVENT(kvm_book3s_slbmte,
+	TP_PROTO(u64 slb_vsid, u64 slb_esid),
+	TP_ARGS(slb_vsid, slb_esid),
+
+	TP_STRUCT__entry(
+		__field(	u64,	slb_vsid	)
+		__field(	u64,	slb_esid	)
+	),
+
+	TP_fast_assign(
+		__entry->slb_vsid	= slb_vsid;
+		__entry->slb_esid	= slb_esid;
+	),
+
+	TP_printk("%llx, %llx", __entry->slb_vsid, __entry->slb_esid)
+);
+
 #endif /* CONFIG_PPC_BOOK3S */
 
 #endif /* _TRACE_KVM_H */
-- 
cgit v0.10.2


From b9877ce2994cc812f00dbb2adb88c1749b6dac86 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 2 Aug 2010 21:48:53 +0200
Subject: KVM: PPC: Revert "KVM: PPC: Use kernel hash function"

It turns out the in-kernel hash function is sub-optimal for our subtle
hash inputs where every bit is significant. So let's revert to the original
hash functions.

This reverts commit 05340ab4f9a6626f7a2e8f9fe5397c61d494f445.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index 343452c..57dddeb 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -19,7 +19,6 @@
  */
 
 #include <linux/kvm_host.h>
-#include <linux/hash.h>
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
@@ -77,7 +76,14 @@ void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
  * a hash, so we don't waste cycles on looping */
 static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
 {
-	return hash_64(gvsid, SID_MAP_BITS);
+	return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
 }
 
 
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 321c931..e7c4d00 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -20,7 +20,6 @@
  */
 
 #include <linux/kvm_host.h>
-#include <linux/hash.h>
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
@@ -44,9 +43,17 @@ void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
  * a hash, so we don't waste cycles on looping */
 static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
 {
-	return hash_64(gvsid, SID_MAP_BITS);
+	return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
 }
 
+
 static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
 {
 	struct kvmppc_sid_map *map;
-- 
cgit v0.10.2


From cb24c50826e0722bffb0674f088954cd4980818b Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 2 Aug 2010 22:05:00 +0200
Subject: KVM: PPC: Remove unused define

The define VSID_ALL is unused. Let's remove it.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index e7c4d00..4040c8d 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -30,7 +30,6 @@
 #include "trace.h"
 
 #define PTE_SIZE 12
-#define VSID_ALL 0
 
 void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
-- 
cgit v0.10.2


From 7508e16c9f2a20f7721d7bc47c33a7b34c873a2c Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 3 Aug 2010 11:32:56 +0200
Subject: KVM: PPC: Add feature bitmap for magic page

We will soon add SR PV support to the shared page, so we need some
infrastructure that allows the guest to query for features KVM exports.

This patch adds a second return value to the magic mapping that
indicated to the guest which features are available.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 7438ab3..43c1b22 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -47,6 +47,8 @@ struct kvm_vcpu_arch_shared {
 
 #define KVM_FEATURE_MAGIC_PAGE	1
 
+#define KVM_MAGIC_FEAT_SR	(1 << 0)
+
 #ifdef __KERNEL__
 
 #ifdef CONFIG_KVM_GUEST
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index d3a2cc5..226882f 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -266,12 +266,20 @@ static void kvm_patch_ins_wrteei(u32 *inst)
 
 static void kvm_map_magic_page(void *data)
 {
-	kvm_hypercall2(KVM_HC_PPC_MAP_MAGIC_PAGE,
-		       KVM_MAGIC_PAGE,  /* Physical Address */
-		       KVM_MAGIC_PAGE); /* Effective Address */
+	u32 *features = data;
+
+	ulong in[8];
+	ulong out[8];
+
+	in[0] = KVM_MAGIC_PAGE;
+	in[1] = KVM_MAGIC_PAGE;
+
+	kvm_hypercall(in, out, HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE);
+
+	*features = out[0];
 }
 
-static void kvm_check_ins(u32 *inst)
+static void kvm_check_ins(u32 *inst, u32 features)
 {
 	u32 _inst = *inst;
 	u32 inst_no_rt = _inst & ~KVM_MASK_RT;
@@ -367,9 +375,10 @@ static void kvm_use_magic_page(void)
 	u32 *p;
 	u32 *start, *end;
 	u32 tmp;
+	u32 features;
 
 	/* Tell the host to map the magic page to -4096 on all CPUs */
-	on_each_cpu(kvm_map_magic_page, NULL, 1);
+	on_each_cpu(kvm_map_magic_page, &features, 1);
 
 	/* Quick self-test to see if the mapping works */
 	if (__get_user(tmp, (u32*)KVM_MAGIC_PAGE)) {
@@ -382,7 +391,7 @@ static void kvm_use_magic_page(void)
 	end = (void*)_etext;
 
 	for (p = start; p < end; p++)
-		kvm_check_ins(p);
+		kvm_check_ins(p, features);
 
 	printk(KERN_INFO "KVM: Live patching for a fast VM %s\n",
 			 kvm_patching_worked ? "worked" : "failed");
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 6a53a3f..496d7a5 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -66,6 +66,8 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 		vcpu->arch.magic_page_pa = param1;
 		vcpu->arch.magic_page_ea = param2;
 
+		r2 = 0;
+
 		r = HC_EV_SUCCESS;
 		break;
 	}
@@ -76,13 +78,14 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 #endif
 
 		/* Second return value is in r4 */
-		kvmppc_set_gpr(vcpu, 4, r2);
 		break;
 	default:
 		r = HC_EV_UNIMPLEMENTED;
 		break;
 	}
 
+	kvmppc_set_gpr(vcpu, 4, r2);
+
 	return r;
 }
 
-- 
cgit v0.10.2


From c1c88e2fa16f979ba3e99018a53962abe852b30f Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 2 Aug 2010 23:23:04 +0200
Subject: KVM: PPC: Move BAT handling code into spr handler

The current approach duplicates the spr->bat finding logic and makes it harder
to reuse the actually used variables. So let's move everything down to the spr
handler.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index f333cb4..4668465 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -264,7 +264,7 @@ void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, bool upper,
 	}
 }
 
-static u32 kvmppc_read_bat(struct kvm_vcpu *vcpu, int sprn)
+static struct kvmppc_bat *kvmppc_find_bat(struct kvm_vcpu *vcpu, int sprn)
 {
 	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
 	struct kvmppc_bat *bat;
@@ -286,35 +286,7 @@ static u32 kvmppc_read_bat(struct kvm_vcpu *vcpu, int sprn)
 		BUG();
 	}
 
-	if (sprn % 2)
-		return bat->raw >> 32;
-	else
-		return bat->raw;
-}
-
-static void kvmppc_write_bat(struct kvm_vcpu *vcpu, int sprn, u32 val)
-{
-	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
-	struct kvmppc_bat *bat;
-
-	switch (sprn) {
-	case SPRN_IBAT0U ... SPRN_IBAT3L:
-		bat = &vcpu_book3s->ibat[(sprn - SPRN_IBAT0U) / 2];
-		break;
-	case SPRN_IBAT4U ... SPRN_IBAT7L:
-		bat = &vcpu_book3s->ibat[4 + ((sprn - SPRN_IBAT4U) / 2)];
-		break;
-	case SPRN_DBAT0U ... SPRN_DBAT3L:
-		bat = &vcpu_book3s->dbat[(sprn - SPRN_DBAT0U) / 2];
-		break;
-	case SPRN_DBAT4U ... SPRN_DBAT7L:
-		bat = &vcpu_book3s->dbat[4 + ((sprn - SPRN_DBAT4U) / 2)];
-		break;
-	default:
-		BUG();
-	}
-
-	kvmppc_set_bat(vcpu, bat, !(sprn % 2), val);
+	return bat;
 }
 
 int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
@@ -339,12 +311,16 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 	case SPRN_IBAT4U ... SPRN_IBAT7L:
 	case SPRN_DBAT0U ... SPRN_DBAT3L:
 	case SPRN_DBAT4U ... SPRN_DBAT7L:
-		kvmppc_write_bat(vcpu, sprn, (u32)spr_val);
+	{
+		struct kvmppc_bat *bat = kvmppc_find_bat(vcpu, sprn);
+
+		kvmppc_set_bat(vcpu, bat, !(sprn % 2), (u32)spr_val);
 		/* BAT writes happen so rarely that we're ok to flush
 		 * everything here */
 		kvmppc_mmu_pte_flush(vcpu, 0, 0);
 		kvmppc_mmu_flush_segments(vcpu);
 		break;
+	}
 	case SPRN_HID0:
 		to_book3s(vcpu)->hid[0] = spr_val;
 		break;
@@ -434,8 +410,16 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 	case SPRN_IBAT4U ... SPRN_IBAT7L:
 	case SPRN_DBAT0U ... SPRN_DBAT3L:
 	case SPRN_DBAT4U ... SPRN_DBAT7L:
-		kvmppc_set_gpr(vcpu, rt, kvmppc_read_bat(vcpu, sprn));
+	{
+		struct kvmppc_bat *bat = kvmppc_find_bat(vcpu, sprn);
+
+		if (sprn % 2)
+			kvmppc_set_gpr(vcpu, rt, bat->raw >> 32);
+		else
+			kvmppc_set_gpr(vcpu, rt, bat->raw);
+
 		break;
+	}
 	case SPRN_SDR1:
 		kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1);
 		break;
-- 
cgit v0.10.2


From 8e8651783ff2458f31098be7c2abacf2fcab054a Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 3 Aug 2010 01:06:11 +0200
Subject: KVM: PPC: Interpret SR registers on demand

Right now we're examining the contents of Book3s_32's segment registers when
the register is written and put the interpreted contents into a struct.

There are two reasons this is bad. For starters, the struct has worse real-time
performance, as it occupies more ram. But the more important part is that with
segment registers being interpreted from their raw values, we can put them in
the shared page, allowing guests to mess with them directly.

This patch makes the internal representation of SRs be u32s.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index f04f516..0884652 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -38,15 +38,6 @@ struct kvmppc_slb {
 	bool class	: 1;
 };
 
-struct kvmppc_sr {
-	u32 raw;
-	u32 vsid;
-	bool Ks		: 1;
-	bool Kp		: 1;
-	bool nx		: 1;
-	bool valid	: 1;
-};
-
 struct kvmppc_bat {
 	u64 raw;
 	u32 bepi;
@@ -79,7 +70,7 @@ struct kvmppc_vcpu_book3s {
 		u64 vsid;
 	} slb_shadow[64];
 	u8 slb_shadow_max;
-	struct kvmppc_sr sr[16];
+	u32 sr[16];
 	struct kvmppc_bat ibat[8];
 	struct kvmppc_bat dbat[8];
 	u64 hid[6];
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 2fb528f..34472af 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1162,8 +1162,8 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 		}
 	} else {
 		for (i = 0; i < 16; i++) {
-			sregs->u.s.ppc32.sr[i] = vcpu3s->sr[i].raw;
-			sregs->u.s.ppc32.sr[i] = vcpu3s->sr[i].raw;
+			sregs->u.s.ppc32.sr[i] = vcpu3s->sr[i];
+			sregs->u.s.ppc32.sr[i] = vcpu3s->sr[i];
 		}
 		for (i = 0; i < 8; i++) {
 			sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw;
diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
index 5bf4bf8..d4ff76f 100644
--- a/arch/powerpc/kvm/book3s_32_mmu.c
+++ b/arch/powerpc/kvm/book3s_32_mmu.c
@@ -58,14 +58,39 @@ static inline bool check_debug_ip(struct kvm_vcpu *vcpu)
 #endif
 }
 
+static inline u32 sr_vsid(u32 sr_raw)
+{
+	return sr_raw & 0x0fffffff;
+}
+
+static inline bool sr_valid(u32 sr_raw)
+{
+	return (sr_raw & 0x80000000) ? false : true;
+}
+
+static inline bool sr_ks(u32 sr_raw)
+{
+	return (sr_raw & 0x40000000) ? true: false;
+}
+
+static inline bool sr_kp(u32 sr_raw)
+{
+	return (sr_raw & 0x20000000) ? true: false;
+}
+
+static inline bool sr_nx(u32 sr_raw)
+{
+	return (sr_raw & 0x10000000) ? true: false;
+}
+
 static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,
 					  struct kvmppc_pte *pte, bool data);
 static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 					     u64 *vsid);
 
-static struct kvmppc_sr *find_sr(struct kvmppc_vcpu_book3s *vcpu_book3s, gva_t eaddr)
+static u32 find_sr(struct kvmppc_vcpu_book3s *vcpu_book3s, gva_t eaddr)
 {
-	return &vcpu_book3s->sr[(eaddr >> 28) & 0xf];
+	return vcpu_book3s->sr[(eaddr >> 28) & 0xf];
 }
 
 static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
@@ -87,7 +112,7 @@ static void kvmppc_mmu_book3s_32_reset_msr(struct kvm_vcpu *vcpu)
 }
 
 static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3s,
-				      struct kvmppc_sr *sre, gva_t eaddr,
+				      u32 sre, gva_t eaddr,
 				      bool primary)
 {
 	u32 page, hash, pteg, htabmask;
@@ -96,7 +121,7 @@ static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3
 	page = (eaddr & 0x0FFFFFFF) >> 12;
 	htabmask = ((vcpu_book3s->sdr1 & 0x1FF) << 16) | 0xFFC0;
 
-	hash = ((sre->vsid ^ page) << 6);
+	hash = ((sr_vsid(sre) ^ page) << 6);
 	if (!primary)
 		hash = ~hash;
 	hash &= htabmask;
@@ -105,7 +130,7 @@ static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3
 
 	dprintk("MMU: pc=0x%lx eaddr=0x%lx sdr1=0x%llx pteg=0x%x vsid=0x%x\n",
 		kvmppc_get_pc(&vcpu_book3s->vcpu), eaddr, vcpu_book3s->sdr1, pteg,
-		sre->vsid);
+		sr_vsid(sre));
 
 	r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT);
 	if (kvm_is_error_hva(r))
@@ -113,10 +138,9 @@ static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3
 	return r | (pteg & ~PAGE_MASK);
 }
 
-static u32 kvmppc_mmu_book3s_32_get_ptem(struct kvmppc_sr *sre, gva_t eaddr,
-				    bool primary)
+static u32 kvmppc_mmu_book3s_32_get_ptem(u32 sre, gva_t eaddr, bool primary)
 {
-	return ((eaddr & 0x0fffffff) >> 22) | (sre->vsid << 7) |
+	return ((eaddr & 0x0fffffff) >> 22) | (sr_vsid(sre) << 7) |
 	       (primary ? 0 : 0x40) | 0x80000000;
 }
 
@@ -180,7 +204,7 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
 				     bool primary)
 {
 	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
-	struct kvmppc_sr *sre;
+	u32 sre;
 	hva_t ptegp;
 	u32 pteg[16];
 	u32 ptem = 0;
@@ -190,7 +214,7 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
 	sre = find_sr(vcpu_book3s, eaddr);
 
 	dprintk_pte("SR 0x%lx: vsid=0x%x, raw=0x%x\n", eaddr >> 28,
-		    sre->vsid, sre->raw);
+		    sr_vsid(sre), sre);
 
 	pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data);
 
@@ -214,8 +238,8 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
 			pte->raddr = (pteg[i+1] & ~(0xFFFULL)) | (eaddr & 0xFFF);
 			pp = pteg[i+1] & 3;
 
-			if ((sre->Kp &&  (vcpu->arch.shared->msr & MSR_PR)) ||
-			    (sre->Ks && !(vcpu->arch.shared->msr & MSR_PR)))
+			if ((sr_kp(sre) &&  (vcpu->arch.shared->msr & MSR_PR)) ||
+			    (sr_ks(sre) && !(vcpu->arch.shared->msr & MSR_PR)))
 				pp |= 4;
 
 			pte->may_write = false;
@@ -311,30 +335,13 @@ static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 
 static u32 kvmppc_mmu_book3s_32_mfsrin(struct kvm_vcpu *vcpu, u32 srnum)
 {
-	return to_book3s(vcpu)->sr[srnum].raw;
+	return to_book3s(vcpu)->sr[srnum];
 }
 
 static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum,
 					ulong value)
 {
-	struct kvmppc_sr *sre;
-
-	sre = &to_book3s(vcpu)->sr[srnum];
-
-	/* Flush any left-over shadows from the previous SR */
-
-	/* XXX Not necessary? */
-	/* kvmppc_mmu_pte_flush(vcpu, ((u64)sre->vsid) << 28, 0xf0000000ULL); */
-
-	/* And then put in the new SR */
-	sre->raw = value;
-	sre->vsid = (value & 0x0fffffff);
-	sre->valid = (value & 0x80000000) ? false : true;
-	sre->Ks = (value & 0x40000000) ? true : false;
-	sre->Kp = (value & 0x20000000) ? true : false;
-	sre->nx = (value & 0x10000000) ? true : false;
-
-	/* Map the new segment */
+	to_book3s(vcpu)->sr[srnum] = value;
 	kvmppc_mmu_map_segment(vcpu, srnum << SID_SHIFT);
 }
 
@@ -347,13 +354,13 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 					     u64 *vsid)
 {
 	ulong ea = esid << SID_SHIFT;
-	struct kvmppc_sr *sr;
+	u32 sr;
 	u64 gvsid = esid;
 
 	if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
 		sr = find_sr(to_book3s(vcpu), ea);
-		if (sr->valid)
-			gvsid = sr->vsid;
+		if (sr_valid(sr))
+			gvsid = sr_vsid(sr);
 	}
 
 	/* In case we only have one of MSR_IR or MSR_DR set, let's put
@@ -370,8 +377,8 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 		*vsid = VSID_REAL_DR | gvsid;
 		break;
 	case MSR_DR|MSR_IR:
-		if (sr->valid)
-			*vsid = sr->vsid;
+		if (sr_valid(sr))
+			*vsid = sr_vsid(sr);
 		else
 			*vsid = VSID_BAT | gvsid;
 		break;
-- 
cgit v0.10.2


From df1bfa25d81f9451715ccbbb67551e0f792ceec8 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 3 Aug 2010 02:29:27 +0200
Subject: KVM: PPC: Put segment registers in shared page

Now that the actual mtsr doesn't do anything anymore, we can move the sr
contents over to the shared page, so a guest can directly read and write
its sr contents from guest context.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 0884652..be8aac2 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -70,7 +70,6 @@ struct kvmppc_vcpu_book3s {
 		u64 vsid;
 	} slb_shadow[64];
 	u8 slb_shadow_max;
-	u32 sr[16];
 	struct kvmppc_bat ibat[8];
 	struct kvmppc_bat dbat[8];
 	u64 hid[6];
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 43c1b22..d79fd09 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -38,6 +38,7 @@ struct kvm_vcpu_arch_shared {
 	__u64 msr;
 	__u32 dsisr;
 	__u32 int_pending;	/* Tells the guest if we have an interrupt */
+	__u32 sr[16];
 };
 
 #define KVM_SC_MAGIC_R0		0x4b564d21 /* "KVM!" */
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 34472af..02a9cb1 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1161,10 +1161,9 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 			sregs->u.s.ppc64.slb[i].slbv = vcpu3s->slb[i].origv;
 		}
 	} else {
-		for (i = 0; i < 16; i++) {
-			sregs->u.s.ppc32.sr[i] = vcpu3s->sr[i];
-			sregs->u.s.ppc32.sr[i] = vcpu3s->sr[i];
-		}
+		for (i = 0; i < 16; i++)
+			sregs->u.s.ppc32.sr[i] = vcpu->arch.shared->sr[i];
+
 		for (i = 0; i < 8; i++) {
 			sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw;
 			sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw;
diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
index d4ff76f..c8cefdd 100644
--- a/arch/powerpc/kvm/book3s_32_mmu.c
+++ b/arch/powerpc/kvm/book3s_32_mmu.c
@@ -88,9 +88,9 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,
 static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 					     u64 *vsid);
 
-static u32 find_sr(struct kvmppc_vcpu_book3s *vcpu_book3s, gva_t eaddr)
+static u32 find_sr(struct kvm_vcpu *vcpu, gva_t eaddr)
 {
-	return vcpu_book3s->sr[(eaddr >> 28) & 0xf];
+	return vcpu->arch.shared->sr[(eaddr >> 28) & 0xf];
 }
 
 static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
@@ -211,7 +211,7 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
 	int i;
 	int found = 0;
 
-	sre = find_sr(vcpu_book3s, eaddr);
+	sre = find_sr(vcpu, eaddr);
 
 	dprintk_pte("SR 0x%lx: vsid=0x%x, raw=0x%x\n", eaddr >> 28,
 		    sr_vsid(sre), sre);
@@ -335,13 +335,13 @@ static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 
 static u32 kvmppc_mmu_book3s_32_mfsrin(struct kvm_vcpu *vcpu, u32 srnum)
 {
-	return to_book3s(vcpu)->sr[srnum];
+	return vcpu->arch.shared->sr[srnum];
 }
 
 static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum,
 					ulong value)
 {
-	to_book3s(vcpu)->sr[srnum] = value;
+	vcpu->arch.shared->sr[srnum] = value;
 	kvmppc_mmu_map_segment(vcpu, srnum << SID_SHIFT);
 }
 
@@ -358,7 +358,7 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 	u64 gvsid = esid;
 
 	if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
-		sr = find_sr(to_book3s(vcpu), ea);
+		sr = find_sr(vcpu, ea);
 		if (sr_valid(sr))
 			gvsid = sr_vsid(sr);
 	}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 496d7a5..028891c 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -66,7 +66,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 		vcpu->arch.magic_page_pa = param1;
 		vcpu->arch.magic_page_ea = param2;
 
-		r2 = 0;
+		r2 = KVM_MAGIC_FEAT_SR;
 
 		r = HC_EV_SUCCESS;
 		break;
-- 
cgit v0.10.2


From cbe487fac7fc016dbabbcbe83f11384e1803a56d Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 3 Aug 2010 10:39:35 +0200
Subject: KVM: PPC: Add mtsrin PV code

This is the guest side of the mtsr acceleration. Using this a guest can now
call mtsrin with almost no overhead as long as it ensures that it only uses
it with (MSR_IR|MSR_DR) == 0. Linux does that, so we're good.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/Documentation/kvm/ppc-pv.txt b/Documentation/kvm/ppc-pv.txt
index 41ee16d..922cf95 100644
--- a/Documentation/kvm/ppc-pv.txt
+++ b/Documentation/kvm/ppc-pv.txt
@@ -160,6 +160,9 @@ mtmsr	rX		b	<special mtmsr section>
 
 mtmsrd	rX, 1		b	<special mtmsrd section>
 
+[Book3S only]
+mtsrin	rX, rY		b	<special mtsrin section>
+
 [BookE only]
 wrteei	[0|1]		b	<special wrteei section>
 
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 6d92b4e..7f0d6fc 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -478,6 +478,7 @@ int main(void)
 	DEFINE(KVM_MAGIC_MSR, offsetof(struct kvm_vcpu_arch_shared, msr));
 	DEFINE(KVM_MAGIC_CRITICAL, offsetof(struct kvm_vcpu_arch_shared,
 					    critical));
+	DEFINE(KVM_MAGIC_SR, offsetof(struct kvm_vcpu_arch_shared, sr));
 #endif
 
 #ifdef CONFIG_44x
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 226882f..c8bab24 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -42,6 +42,7 @@
 #define KVM_INST_B_MAX		0x01ffffff
 
 #define KVM_MASK_RT		0x03e00000
+#define KVM_MASK_RB		0x0000f800
 #define KVM_INST_MFMSR		0x7c0000a6
 #define KVM_INST_MFSPR_SPRG0	0x7c1042a6
 #define KVM_INST_MFSPR_SPRG1	0x7c1142a6
@@ -69,6 +70,8 @@
 #define KVM_INST_WRTEEI_0	0x7c000146
 #define KVM_INST_WRTEEI_1	0x7c008146
 
+#define KVM_INST_MTSRIN		0x7c0001e4
+
 static bool kvm_patching_worked = true;
 static char kvm_tmp[1024 * 1024];
 static int kvm_tmp_index;
@@ -264,6 +267,51 @@ static void kvm_patch_ins_wrteei(u32 *inst)
 
 #endif
 
+#ifdef CONFIG_PPC_BOOK3S_32
+
+extern u32 kvm_emulate_mtsrin_branch_offs;
+extern u32 kvm_emulate_mtsrin_reg1_offs;
+extern u32 kvm_emulate_mtsrin_reg2_offs;
+extern u32 kvm_emulate_mtsrin_orig_ins_offs;
+extern u32 kvm_emulate_mtsrin_len;
+extern u32 kvm_emulate_mtsrin[];
+
+static void kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb)
+{
+	u32 *p;
+	int distance_start;
+	int distance_end;
+	ulong next_inst;
+
+	p = kvm_alloc(kvm_emulate_mtsrin_len * 4);
+	if (!p)
+		return;
+
+	/* Find out where we are and put everything there */
+	distance_start = (ulong)p - (ulong)inst;
+	next_inst = ((ulong)inst + 4);
+	distance_end = next_inst - (ulong)&p[kvm_emulate_mtsrin_branch_offs];
+
+	/* Make sure we only write valid b instructions */
+	if (distance_start > KVM_INST_B_MAX) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	/* Modify the chunk to fit the invocation */
+	memcpy(p, kvm_emulate_mtsrin, kvm_emulate_mtsrin_len * 4);
+	p[kvm_emulate_mtsrin_branch_offs] |= distance_end & KVM_INST_B_MASK;
+	p[kvm_emulate_mtsrin_reg1_offs] |= (rb << 10);
+	p[kvm_emulate_mtsrin_reg2_offs] |= rt;
+	p[kvm_emulate_mtsrin_orig_ins_offs] = *inst;
+	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtsrin_len * 4);
+
+	/* Patch the invocation */
+	kvm_patch_ins_b(inst, distance_start);
+}
+
+#endif
+
 static void kvm_map_magic_page(void *data)
 {
 	u32 *features = data;
@@ -360,6 +408,18 @@ static void kvm_check_ins(u32 *inst, u32 features)
 		break;
 	}
 
+	switch (inst_no_rt & ~KVM_MASK_RB) {
+#ifdef CONFIG_PPC_BOOK3S_32
+	case KVM_INST_MTSRIN:
+		if (features & KVM_MAGIC_FEAT_SR) {
+			u32 inst_rb = _inst & KVM_MASK_RB;
+			kvm_patch_ins_mtsrin(inst, inst_rt, inst_rb);
+		}
+		break;
+		break;
+#endif
+	}
+
 	switch (_inst) {
 #ifdef CONFIG_BOOKE
 	case KVM_INST_WRTEEI_0:
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
index 3199f65..a6e97e7 100644
--- a/arch/powerpc/kernel/kvm_emul.S
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -245,3 +245,53 @@ kvm_emulate_wrteei_ee_offs:
 .global kvm_emulate_wrteei_len
 kvm_emulate_wrteei_len:
 	.long (kvm_emulate_wrteei_end - kvm_emulate_wrteei) / 4
+
+
+.global kvm_emulate_mtsrin
+kvm_emulate_mtsrin:
+
+	SCRATCH_SAVE
+
+	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+	andi.	r31, r31, MSR_DR | MSR_IR
+	beq	kvm_emulate_mtsrin_reg1
+
+	SCRATCH_RESTORE
+
+kvm_emulate_mtsrin_orig_ins:
+	nop
+	b	kvm_emulate_mtsrin_branch
+
+kvm_emulate_mtsrin_reg1:
+	/* rX >> 26 */
+	rlwinm  r30,r0,6,26,29
+
+kvm_emulate_mtsrin_reg2:
+	stw	r0, (KVM_MAGIC_PAGE + KVM_MAGIC_SR)(r30)
+
+	SCRATCH_RESTORE
+
+	/* Go back to caller */
+kvm_emulate_mtsrin_branch:
+	b	.
+kvm_emulate_mtsrin_end:
+
+.global kvm_emulate_mtsrin_branch_offs
+kvm_emulate_mtsrin_branch_offs:
+	.long (kvm_emulate_mtsrin_branch - kvm_emulate_mtsrin) / 4
+
+.global kvm_emulate_mtsrin_reg1_offs
+kvm_emulate_mtsrin_reg1_offs:
+	.long (kvm_emulate_mtsrin_reg1 - kvm_emulate_mtsrin) / 4
+
+.global kvm_emulate_mtsrin_reg2_offs
+kvm_emulate_mtsrin_reg2_offs:
+	.long (kvm_emulate_mtsrin_reg2 - kvm_emulate_mtsrin) / 4
+
+.global kvm_emulate_mtsrin_orig_ins_offs
+kvm_emulate_mtsrin_orig_ins_offs:
+	.long (kvm_emulate_mtsrin_orig_ins - kvm_emulate_mtsrin) / 4
+
+.global kvm_emulate_mtsrin_len
+kvm_emulate_mtsrin_len:
+	.long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4
-- 
cgit v0.10.2


From 512ba59ed9c580b5e5575beda0041bb19a641127 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 5 Aug 2010 11:26:04 +0200
Subject: KVM: PPC: Make PV mtmsr work with r30 and r31

So far we've been restricting ourselves to r0-r29 as registers an mtmsr
instruction could use. This was bad, as there are some code paths in
Linux actually using r30.

So let's instead handle all registers gracefully and get rid of that
stupid limitation

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index c8bab24..10b681c 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -42,6 +42,7 @@
 #define KVM_INST_B_MAX		0x01ffffff
 
 #define KVM_MASK_RT		0x03e00000
+#define KVM_RT_30		0x03c00000
 #define KVM_MASK_RB		0x0000f800
 #define KVM_INST_MFMSR		0x7c0000a6
 #define KVM_INST_MFSPR_SPRG0	0x7c1042a6
@@ -82,6 +83,15 @@ static inline void kvm_patch_ins(u32 *inst, u32 new_inst)
 	flush_icache_range((ulong)inst, (ulong)inst + 4);
 }
 
+static void kvm_patch_ins_ll(u32 *inst, long addr, u32 rt)
+{
+#ifdef CONFIG_64BIT
+	kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc));
+#else
+	kvm_patch_ins(inst, KVM_INST_LWZ | rt | (addr & 0x0000fffc));
+#endif
+}
+
 static void kvm_patch_ins_ld(u32 *inst, long addr, u32 rt)
 {
 #ifdef CONFIG_64BIT
@@ -186,7 +196,6 @@ static void kvm_patch_ins_mtmsrd(u32 *inst, u32 rt)
 extern u32 kvm_emulate_mtmsr_branch_offs;
 extern u32 kvm_emulate_mtmsr_reg1_offs;
 extern u32 kvm_emulate_mtmsr_reg2_offs;
-extern u32 kvm_emulate_mtmsr_reg3_offs;
 extern u32 kvm_emulate_mtmsr_orig_ins_offs;
 extern u32 kvm_emulate_mtmsr_len;
 extern u32 kvm_emulate_mtmsr[];
@@ -216,9 +225,27 @@ static void kvm_patch_ins_mtmsr(u32 *inst, u32 rt)
 	/* Modify the chunk to fit the invocation */
 	memcpy(p, kvm_emulate_mtmsr, kvm_emulate_mtmsr_len * 4);
 	p[kvm_emulate_mtmsr_branch_offs] |= distance_end & KVM_INST_B_MASK;
-	p[kvm_emulate_mtmsr_reg1_offs] |= rt;
-	p[kvm_emulate_mtmsr_reg2_offs] |= rt;
-	p[kvm_emulate_mtmsr_reg3_offs] |= rt;
+
+	/* Make clobbered registers work too */
+	switch (get_rt(rt)) {
+	case 30:
+		kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg1_offs],
+				 magic_var(scratch2), KVM_RT_30);
+		kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg2_offs],
+				 magic_var(scratch2), KVM_RT_30);
+		break;
+	case 31:
+		kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg1_offs],
+				 magic_var(scratch1), KVM_RT_30);
+		kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg2_offs],
+				 magic_var(scratch1), KVM_RT_30);
+		break;
+	default:
+		p[kvm_emulate_mtmsr_reg1_offs] |= rt;
+		p[kvm_emulate_mtmsr_reg2_offs] |= rt;
+		break;
+	}
+
 	p[kvm_emulate_mtmsr_orig_ins_offs] = *inst;
 	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtmsr_len * 4);
 
@@ -402,9 +429,7 @@ static void kvm_check_ins(u32 *inst, u32 features)
 		break;
 	case KVM_INST_MTMSR:
 	case KVM_INST_MTMSRD_L0:
-		/* We use r30 and r31 during the hook */
-		if (get_rt(inst_rt) < 30)
-			kvm_patch_ins_mtmsr(inst, inst_rt);
+		kvm_patch_ins_mtmsr(inst, inst_rt);
 		break;
 	}
 
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
index a6e97e7..6530532 100644
--- a/arch/powerpc/kernel/kvm_emul.S
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -135,7 +135,8 @@ kvm_emulate_mtmsr:
 
 	/* Find the changed bits between old and new MSR */
 kvm_emulate_mtmsr_reg1:
-	xor	r31, r0, r31
+	ori	r30, r0, 0
+	xor	r31, r30, r31
 
 	/* Check if we need to really do mtmsr */
 	LOAD_REG_IMMEDIATE(r30, MSR_CRITICAL_BITS)
@@ -156,14 +157,17 @@ kvm_emulate_mtmsr_orig_ins:
 
 maybe_stay_in_guest:
 
+	/* Get the target register in r30 */
+kvm_emulate_mtmsr_reg2:
+	ori	r30, r0, 0
+
 	/* Check if we have to fetch an interrupt */
 	lwz	r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0)
 	cmpwi	r31, 0
 	beq+	no_mtmsr
 
 	/* Check if we may trigger an interrupt */
-kvm_emulate_mtmsr_reg2:
-	andi.	r31, r0, MSR_EE
+	andi.	r31, r30, MSR_EE
 	beq	no_mtmsr
 
 	b	do_mtmsr
@@ -171,8 +175,7 @@ kvm_emulate_mtmsr_reg2:
 no_mtmsr:
 
 	/* Put MSR into magic page because we don't call mtmsr */
-kvm_emulate_mtmsr_reg3:
-	STL64(r0, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+	STL64(r30, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
 
 	SCRATCH_RESTORE
 
@@ -193,10 +196,6 @@ kvm_emulate_mtmsr_reg1_offs:
 kvm_emulate_mtmsr_reg2_offs:
 	.long (kvm_emulate_mtmsr_reg2 - kvm_emulate_mtmsr) / 4
 
-.global kvm_emulate_mtmsr_reg3_offs
-kvm_emulate_mtmsr_reg3_offs:
-	.long (kvm_emulate_mtmsr_reg3 - kvm_emulate_mtmsr) / 4
-
 .global kvm_emulate_mtmsr_orig_ins_offs
 kvm_emulate_mtmsr_orig_ins_offs:
 	.long (kvm_emulate_mtmsr_orig_ins - kvm_emulate_mtmsr) / 4
-- 
cgit v0.10.2


From 9ee18b1e08e6a5648aeaaf998eabc72b5304cc17 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 5 Aug 2010 12:24:40 +0200
Subject: KVM: PPC: Update int_pending also on dequeue

When having a decrementor interrupt pending, the dequeuing happens manually
through an mtdec instruction. This instruction simply calls dequeue on that
interrupt, so the int_pending hint doesn't get updated.

This patch enables updating the int_pending hint also on dequeue, thus
correctly enabling guests to stay in guest contexts more often.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 02a9cb1..7adea63 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -201,6 +201,9 @@ static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
 {
 	clear_bit(kvmppc_book3s_vec2irqprio(vec),
 		  &vcpu->arch.pending_exceptions);
+
+	if (!vcpu->arch.pending_exceptions)
+		vcpu->arch.shared->int_pending = 0;
 }
 
 void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
-- 
cgit v0.10.2


From df08bd10266ce6132278f6b4ddc4bb0a12330b56 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 5 Aug 2010 15:44:41 +0200
Subject: KVM: PPC: Make PV mtmsrd L=1 work with r30 and r31

We had an arbitrary limitation in mtmsrd L=1 that kept us from using r30 and
r31 as input registers. Let's get rid of that and get more potential speedups!

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 10b681c..48a0338 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -158,6 +158,7 @@ static u32 *kvm_alloc(int len)
 
 extern u32 kvm_emulate_mtmsrd_branch_offs;
 extern u32 kvm_emulate_mtmsrd_reg_offs;
+extern u32 kvm_emulate_mtmsrd_orig_ins_offs;
 extern u32 kvm_emulate_mtmsrd_len;
 extern u32 kvm_emulate_mtmsrd[];
 
@@ -186,7 +187,21 @@ static void kvm_patch_ins_mtmsrd(u32 *inst, u32 rt)
 	/* Modify the chunk to fit the invocation */
 	memcpy(p, kvm_emulate_mtmsrd, kvm_emulate_mtmsrd_len * 4);
 	p[kvm_emulate_mtmsrd_branch_offs] |= distance_end & KVM_INST_B_MASK;
-	p[kvm_emulate_mtmsrd_reg_offs] |= rt;
+	switch (get_rt(rt)) {
+	case 30:
+		kvm_patch_ins_ll(&p[kvm_emulate_mtmsrd_reg_offs],
+				 magic_var(scratch2), KVM_RT_30);
+		break;
+	case 31:
+		kvm_patch_ins_ll(&p[kvm_emulate_mtmsrd_reg_offs],
+				 magic_var(scratch1), KVM_RT_30);
+		break;
+	default:
+		p[kvm_emulate_mtmsrd_reg_offs] |= rt;
+		break;
+	}
+
+	p[kvm_emulate_mtmsrd_orig_ins_offs] = *inst;
 	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtmsrd_len * 4);
 
 	/* Patch the invocation */
@@ -423,9 +438,7 @@ static void kvm_check_ins(u32 *inst, u32 features)
 
 	/* Rewrites */
 	case KVM_INST_MTMSRD_L1:
-		/* We use r30 and r31 during the hook */
-		if (get_rt(inst_rt) < 30)
-			kvm_patch_ins_mtmsrd(inst, inst_rt);
+		kvm_patch_ins_mtmsrd(inst, inst_rt);
 		break;
 	case KVM_INST_MTMSR:
 	case KVM_INST_MTMSRD_L0:
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
index 6530532..f2b1b25 100644
--- a/arch/powerpc/kernel/kvm_emul.S
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -78,7 +78,8 @@ kvm_emulate_mtmsrd:
 
 	/* OR the register's (MSR_EE|MSR_RI) on MSR */
 kvm_emulate_mtmsrd_reg:
-	andi.	r30, r0, (MSR_EE|MSR_RI)
+	ori	r30, r0, 0
+	andi.	r30, r30, (MSR_EE|MSR_RI)
 	or	r31, r31, r30
 
 	/* Put MSR back into magic page */
@@ -96,6 +97,7 @@ kvm_emulate_mtmsrd_reg:
 	SCRATCH_RESTORE
 
 	/* Nag hypervisor */
+kvm_emulate_mtmsrd_orig_ins:
 	tlbsync
 
 	b	kvm_emulate_mtmsrd_branch
@@ -117,6 +119,10 @@ kvm_emulate_mtmsrd_branch_offs:
 kvm_emulate_mtmsrd_reg_offs:
 	.long (kvm_emulate_mtmsrd_reg - kvm_emulate_mtmsrd) / 4
 
+.global kvm_emulate_mtmsrd_orig_ins_offs
+kvm_emulate_mtmsrd_orig_ins_offs:
+	.long (kvm_emulate_mtmsrd_orig_ins - kvm_emulate_mtmsrd) / 4
+
 .global kvm_emulate_mtmsrd_len
 kvm_emulate_mtmsrd_len:
 	.long (kvm_emulate_mtmsrd_end - kvm_emulate_mtmsrd) / 4
-- 
cgit v0.10.2


From ad0873763a83e7b31ba87a85ec2027dd6a9d7b55 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 17 Aug 2010 11:41:44 +0200
Subject: KVM: PPC: Force enable nap on KVM

There are some heuristics in the PPC power management code that try to find
out if the particular hardware we're running on supports proper power management
or just hangs the machine when going into nap mode.

Since we know that KVM is safe with nap, let's force enable it in the PV code
once we're certain that we are on a KVM VM.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 48a0338..669d989 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -582,6 +582,9 @@ static int __init kvm_guest_init(void)
 	if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE))
 		kvm_use_magic_page();
 
+	/* Enable napping */
+	powersave_nap = 1;
+
 free_tmp:
 	kvm_free_tmp();
 
-- 
cgit v0.10.2


From 8b6db3bc965c204db6868d4005808b4fdc9c46d7 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Sun, 15 Aug 2010 08:04:24 +0200
Subject: KVM: PPC: Implement correct SID mapping on Book3s_32

Up until now we were doing segment mappings wrong on Book3s_32. For Book3s_64
we were using a trick where we know that a single mmu_context gives us 16 bits
of context ids.

The mm system on Book3s_32 instead uses a clever algorithm to distribute VSIDs
across the available range, so a context id really only gives us 16 available
VSIDs.

To keep at least a few guest processes in the SID shadow, let's map a number of
contexts that we can use as VSID pool. This makes the code be actually correct
and shouldn't hurt performance too much.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index be8aac2..d62e703 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -60,6 +60,13 @@ struct kvmppc_sid_map {
 #define SID_MAP_NUM     (1 << SID_MAP_BITS)
 #define SID_MAP_MASK    (SID_MAP_NUM - 1)
 
+#ifdef CONFIG_PPC_BOOK3S_64
+#define SID_CONTEXTS	1
+#else
+#define SID_CONTEXTS	128
+#define VSID_POOL_SIZE	(SID_CONTEXTS * 16)
+#endif
+
 struct kvmppc_vcpu_book3s {
 	struct kvm_vcpu vcpu;
 	struct kvmppc_book3s_shadow_vcpu *shadow_vcpu;
@@ -78,10 +85,14 @@ struct kvmppc_vcpu_book3s {
 	u64 sdr1;
 	u64 hior;
 	u64 msr_mask;
-	u64 vsid_first;
 	u64 vsid_next;
+#ifdef CONFIG_PPC_BOOK3S_32
+	u32 vsid_pool[VSID_POOL_SIZE];
+#else
+	u64 vsid_first;
 	u64 vsid_max;
-	int context_id;
+#endif
+	int context_id[SID_CONTEXTS];
 	ulong prog_flags; /* flags to inject when giving a 700 trap */
 };
 
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index 57dddeb..9fecbfb 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -275,18 +275,15 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
 	backwards_map = !backwards_map;
 
 	/* Uh-oh ... out of mappings. Let's flush! */
-	if (vcpu_book3s->vsid_next >= vcpu_book3s->vsid_max) {
-		vcpu_book3s->vsid_next = vcpu_book3s->vsid_first;
+	if (vcpu_book3s->vsid_next >= VSID_POOL_SIZE) {
+		vcpu_book3s->vsid_next = 0;
 		memset(vcpu_book3s->sid_map, 0,
 		       sizeof(struct kvmppc_sid_map) * SID_MAP_NUM);
 		kvmppc_mmu_pte_flush(vcpu, 0, 0);
 		kvmppc_mmu_flush_segments(vcpu);
 	}
-	map->host_vsid = vcpu_book3s->vsid_next;
-
-	/* Would have to be 111 to be completely aligned with the rest of
-	   Linux, but that is just way too little space! */
-	vcpu_book3s->vsid_next+=1;
+	map->host_vsid = vcpu_book3s->vsid_pool[vcpu_book3s->vsid_next];
+	vcpu_book3s->vsid_next++;
 
 	map->guest_vsid = gvsid;
 	map->valid = true;
@@ -333,40 +330,38 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
 
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
 {
+	int i;
+
 	kvmppc_mmu_hpte_destroy(vcpu);
 	preempt_disable();
-	__destroy_context(to_book3s(vcpu)->context_id);
+	for (i = 0; i < SID_CONTEXTS; i++)
+		__destroy_context(to_book3s(vcpu)->context_id[i]);
 	preempt_enable();
 }
 
 /* From mm/mmu_context_hash32.c */
-#define CTX_TO_VSID(ctx) (((ctx) * (897 * 16)) & 0xffffff)
+#define CTX_TO_VSID(c, id)	((((c) * (897 * 16)) + (id * 0x111)) & 0xffffff)
 
 int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	int err;
 	ulong sdr1;
+	int i;
+	int j;
 
-	err = __init_new_context();
-	if (err < 0)
-		return -1;
-	vcpu3s->context_id = err;
-
-	vcpu3s->vsid_max = CTX_TO_VSID(vcpu3s->context_id + 1) - 1;
-	vcpu3s->vsid_first = CTX_TO_VSID(vcpu3s->context_id);
-
-#if 0 /* XXX still doesn't guarantee uniqueness */
-	/* We could collide with the Linux vsid space because the vsid
-	 * wraps around at 24 bits. We're safe if we do our own space
-	 * though, so let's always set the highest bit. */
+	for (i = 0; i < SID_CONTEXTS; i++) {
+		err = __init_new_context();
+		if (err < 0)
+			goto init_fail;
+		vcpu3s->context_id[i] = err;
 
-	vcpu3s->vsid_max |= 0x00800000;
-	vcpu3s->vsid_first |= 0x00800000;
-#endif
-	BUG_ON(vcpu3s->vsid_max < vcpu3s->vsid_first);
+		/* Remember context id for this combination */
+		for (j = 0; j < 16; j++)
+			vcpu3s->vsid_pool[(i * 16) + j] = CTX_TO_VSID(err, j);
+	}
 
-	vcpu3s->vsid_next = vcpu3s->vsid_first;
+	vcpu3s->vsid_next = 0;
 
 	/* Remember where the HTAB is */
 	asm ( "mfsdr1 %0" : "=r"(sdr1) );
@@ -376,4 +371,14 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
 	kvmppc_mmu_hpte_init(vcpu);
 
 	return 0;
+
+init_fail:
+	for (j = 0; j < i; j++) {
+		if (!vcpu3s->context_id[j])
+			continue;
+
+		__destroy_context(to_book3s(vcpu)->context_id[j]);
+	}
+
+	return -1;
 }
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 4040c8d..fa2f084 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -286,7 +286,7 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
 {
 	kvmppc_mmu_hpte_destroy(vcpu);
-	__destroy_context(to_book3s(vcpu)->context_id);
+	__destroy_context(to_book3s(vcpu)->context_id[0]);
 }
 
 int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
@@ -297,10 +297,10 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
 	err = __init_new_context();
 	if (err < 0)
 		return -1;
-	vcpu3s->context_id = err;
+	vcpu3s->context_id[0] = err;
 
-	vcpu3s->vsid_max = ((vcpu3s->context_id + 1) << USER_ESID_BITS) - 1;
-	vcpu3s->vsid_first = vcpu3s->context_id << USER_ESID_BITS;
+	vcpu3s->vsid_max = ((vcpu3s->context_id[0] + 1) << USER_ESID_BITS) - 1;
+	vcpu3s->vsid_first = vcpu3s->context_id[0] << USER_ESID_BITS;
 	vcpu3s->vsid_next = vcpu3s->vsid_first;
 
 	kvmppc_mmu_hpte_init(vcpu);
-- 
cgit v0.10.2


From 296c19d0b4072dd9594daeec532563e56bddd119 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Sun, 15 Aug 2010 08:39:19 +0200
Subject: KVM: PPC: Don't put MSR_POW in MSR

On Book3S a mtmsr with the MSR_POW bit set indicates that the OS is in
idle and only needs to be waked up on the next interrupt.

Now, unfortunately we let that bit slip into the stored MSR value which
is not what the real CPU does, so that we ended up executing code like
this:

	r = mfmsr();
	/* r containts MSR_POW */
	mtmsr(r | MSR_EE);

This obviously breaks, as we're going into idle mode in code sections that
don't expect to be idling.

This patch masks MSR_POW out of the stored MSR value on wakeup, making
guests happy again.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 7adea63..5833df7 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -134,10 +134,14 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
 	vcpu->arch.shared->msr = msr;
 	kvmppc_recalc_shadow_msr(vcpu);
 
-	if (msr & (MSR_WE|MSR_POW)) {
+	if (msr & MSR_POW) {
 		if (!vcpu->arch.pending_exceptions) {
 			kvm_vcpu_block(vcpu);
 			vcpu->stat.halt_wakeup++;
+
+			/* Unset POW bit after we woke up */
+			msr &= ~MSR_POW;
+			vcpu->arch.shared->msr = msr;
 		}
 	}
 
-- 
cgit v0.10.2


From 082decf29a9fe5bd5dcbfb26223e44edd9deabed Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollis_blanchard@mentor.com>
Date: Sat, 7 Aug 2010 10:33:56 -0700
Subject: KVM: PPC: initialize IVORs in addition to IVPR

Developers can now tell at a glace the exact type of the premature interrupt,
instead of just knowing that there was some premature interrupt.

Signed-off-by: Hollis Blanchard <hollis_blanchard@mentor.com>
Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index c604277..835f6d0 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -497,15 +497,19 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 /* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
+	int i;
+
 	vcpu->arch.pc = 0;
 	vcpu->arch.shared->msr = 0;
 	kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */
 
 	vcpu->arch.shadow_pid = 1;
 
-	/* Eye-catching number so we know if the guest takes an interrupt
-	 * before it's programmed its own IVPR. */
+	/* Eye-catching numbers so we know if the guest takes an interrupt
+	 * before it's programmed its own IVPR/IVORs. */
 	vcpu->arch.ivpr = 0x55550000;
+	for (i = 0; i < BOOKE_IRQPRIO_MAX; i++)
+		vcpu->arch.ivor[i] = 0x7700 | i * 4;
 
 	kvmppc_init_timing_stats(vcpu);
 
-- 
cgit v0.10.2


From 0b3bafc8e5867039e265869749abbb7ea6dd2c8b Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollis_blanchard@mentor.com>
Date: Sat, 7 Aug 2010 10:33:57 -0700
Subject: KVM: PPC: fix compilation of "dump tlbs" debug function

Missing local variable.

Signed-off-by: Hollis Blanchard <hollis_blanchard@mentor.com>
Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 9f71b8d..5f3cff8 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -47,6 +47,7 @@
 #ifdef DEBUG
 void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
 {
+	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 	struct kvmppc_44x_tlbe *tlbe;
 	int i;
 
-- 
cgit v0.10.2


From ebc65874e9e8f3b8bbbc69aa49acd7489cd41c52 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollis_blanchard@mentor.com>
Date: Sat, 7 Aug 2010 10:33:58 -0700
Subject: KVM: PPC: allow ppc440gp to pass the compatibility check

Match only the first part of cur_cpu_spec->platform.

440GP (the first 440 processor) is identified by the string "ppc440gp", while
all later 440 processors use simply "ppc440".

Signed-off-by: Hollis Blanchard <hollis_blanchard@mentor.com>
Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index e7b1f3f..74d0e74 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -43,7 +43,7 @@ int kvmppc_core_check_processor_compat(void)
 {
 	int r;
 
-	if (strcmp(cur_cpu_spec->platform, "ppc440") == 0)
+	if (strncmp(cur_cpu_spec->platform, "ppc440", 6) == 0)
 		r = 0;
 	else
 		r = -ENOTSUPP;
@@ -72,6 +72,7 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
 	/* Since the guest can directly access the timebase, it must know the
 	 * real timebase frequency. Accordingly, it must see the state of
 	 * CCR1[TCS]. */
+	/* XXX CCR1 doesn't exist on all 440 SoCs. */
 	vcpu->arch.ccr1 = mfspr(SPRN_CCR1);
 
 	for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++)
-- 
cgit v0.10.2


From 591bd8e7b4c8b9246d7a1c81ffbd28e35dc5de4e Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 17 Aug 2010 22:08:39 +0200
Subject: KVM: PPC: Enable napping only for Book3s_64

Before I incorrectly enabled napping also for BookE, which would result in
needless dcache flushes. Since we only need to force enable napping on
Book3s_64 because it doesn't go into MSR_POW otherwise, we can just #ifdef
that code to this particular platform.

Reported-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 669d989..428d0e5 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -582,8 +582,10 @@ static int __init kvm_guest_init(void)
 	if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE))
 		kvm_use_magic_page();
 
+#ifdef CONFIG_PPC_BOOK3S_64
 	/* Enable napping */
 	powersave_nap = 1;
+#endif
 
 free_tmp:
 	kvm_free_tmp();
-- 
cgit v0.10.2


From 17bd158006a33615270f9dba15c62f49bd447435 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 30 Aug 2010 10:44:15 +0200
Subject: KVM: PPC: Implement Level interrupts on Book3S

The current interrupt logic is just completely broken. We get a notification
from user space, telling us that an interrupt is there. But then user space
expects us that we just acknowledge an interrupt once we deliver it to the
guest.

This is not how real hardware works though. On real hardware, the interrupt
controller pulls the external interrupt line until it gets notified that the
interrupt was received.

So in reality we have two events: pulling and letting go of the interrupt line.

To maintain backwards compatibility, I added a new request for the pulling
part. The letting go part was implemented earlier already.

With this in place, we can now finally start guests that do not randomly stall
and stop to work at random times.

This patch implements above logic for Book3S.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index 6c5547d..18ea696 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -86,5 +86,6 @@ struct kvm_guest_debug_arch {
 
 #define KVM_INTERRUPT_SET	-1U
 #define KVM_INTERRUPT_UNSET	-2U
+#define KVM_INTERRUPT_SET_LEVEL	-3U
 
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index c5ea4cd..5b75046 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -58,6 +58,7 @@
 #define BOOK3S_INTERRUPT_INST_STORAGE	0x400
 #define BOOK3S_INTERRUPT_INST_SEGMENT	0x480
 #define BOOK3S_INTERRUPT_EXTERNAL	0x500
+#define BOOK3S_INTERRUPT_EXTERNAL_LEVEL	0x501
 #define BOOK3S_INTERRUPT_ALIGNMENT	0x600
 #define BOOK3S_INTERRUPT_PROGRAM	0x700
 #define BOOK3S_INTERRUPT_FP_UNAVAIL	0x800
@@ -84,7 +85,8 @@
 #define BOOK3S_IRQPRIO_EXTERNAL			13
 #define BOOK3S_IRQPRIO_DECREMENTER		14
 #define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR	15
-#define BOOK3S_IRQPRIO_MAX			16
+#define BOOK3S_IRQPRIO_EXTERNAL_LEVEL		16
+#define BOOK3S_IRQPRIO_MAX			17
 
 #define BOOK3S_HFLAG_DCBZ32			0x1
 #define BOOK3S_HFLAG_SLB			0x2
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 5833df7..e316847 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -186,6 +186,7 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
 	case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE;		break;
 	case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT;		break;
 	case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL;		break;
+	case 0x501: prio = BOOK3S_IRQPRIO_EXTERNAL_LEVEL;	break;
 	case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT;		break;
 	case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM;		break;
 	case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL;		break;
@@ -246,13 +247,19 @@ void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu)
 void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                 struct kvm_interrupt *irq)
 {
-	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
+	unsigned int vec = BOOK3S_INTERRUPT_EXTERNAL;
+
+	if (irq->irq == KVM_INTERRUPT_SET_LEVEL)
+		vec = BOOK3S_INTERRUPT_EXTERNAL_LEVEL;
+
+	kvmppc_book3s_queue_irqprio(vcpu, vec);
 }
 
 void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
                                   struct kvm_interrupt *irq)
 {
 	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
+	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
 }
 
 int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
@@ -281,6 +288,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
 		vec = BOOK3S_INTERRUPT_DECREMENTER;
 		break;
 	case BOOK3S_IRQPRIO_EXTERNAL:
+	case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
 		deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit;
 		vec = BOOK3S_INTERRUPT_EXTERNAL;
 		break;
@@ -343,6 +351,23 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
 	return deliver;
 }
 
+/*
+ * This function determines if an irqprio should be cleared once issued.
+ */
+static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority)
+{
+	switch (priority) {
+		case BOOK3S_IRQPRIO_DECREMENTER:
+			/* DEC interrupts get cleared by mtdec */
+			return false;
+		case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
+			/* External interrupts get cleared by userspace */
+			return false;
+	}
+
+	return true;
+}
+
 void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 {
 	unsigned long *pending = &vcpu->arch.pending_exceptions;
@@ -356,8 +381,7 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 	priority = __ffs(*pending);
 	while (priority < BOOK3S_IRQPRIO_MAX) {
 		if (kvmppc_book3s_irqprio_deliver(vcpu, priority) &&
-		    (priority != BOOK3S_IRQPRIO_DECREMENTER)) {
-			/* DEC interrupts get cleared by mtdec */
+		    clear_irqprio(vcpu, priority)) {
 			clear_bit(priority, &vcpu->arch.pending_exceptions);
 			break;
 		}
-- 
cgit v0.10.2


From 7b4203e8cb5c5d9bc49da62b7a6fa4ba876a1b3f Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 30 Aug 2010 13:50:45 +0200
Subject: KVM: PPC: Expose level based interrupt cap

Now that we have all the level interrupt magic in place, let's
expose the capability to user space, so it can make use of it!

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 028891c..2f87a16 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -192,6 +192,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PPC_SEGSTATE:
 	case KVM_CAP_PPC_PAIRED_SINGLES:
 	case KVM_CAP_PPC_UNSET_IRQ:
+	case KVM_CAP_PPC_IRQ_LEVEL:
 	case KVM_CAP_ENABLE_CAP:
 	case KVM_CAP_PPC_OSI:
 	case KVM_CAP_PPC_GET_PVINFO:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 3707704..919ae53 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -539,6 +539,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_XCRS 56
 #endif
 #define KVM_CAP_PPC_GET_PVINFO 57
+#define KVM_CAP_PPC_IRQ_LEVEL 58
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v0.10.2


From c5335f17651de5075313524ccc3881527268966f Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 30 Aug 2010 14:03:24 +0200
Subject: KVM: PPC: Implement level interrupts for BookE

BookE also wants to support level based interrupts, so let's implement
all the necessary logic there. We need to trick a bit here because the
irqprios are 1:1 assigned to architecture defined values. But since there
is some space left there, we can just pick a random one and move it later
on - it's internal anyways.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 835f6d0..77575d0 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -131,13 +131,19 @@ void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu)
 void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                 struct kvm_interrupt *irq)
 {
-	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_EXTERNAL);
+	unsigned int prio = BOOKE_IRQPRIO_EXTERNAL;
+
+	if (irq->irq == KVM_INTERRUPT_SET_LEVEL)
+		prio = BOOKE_IRQPRIO_EXTERNAL_LEVEL;
+
+	kvmppc_booke_queue_irqprio(vcpu, prio);
 }
 
 void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
                                   struct kvm_interrupt *irq)
 {
 	clear_bit(BOOKE_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
+	clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
 }
 
 /* Deliver the interrupt of the corresponding priority, if possible. */
@@ -150,6 +156,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	ulong crit_raw = vcpu->arch.shared->critical;
 	ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
 	bool crit;
+	bool keep_irq = false;
 
 	/* Truncate crit indicators in 32 bit mode */
 	if (!(vcpu->arch.shared->msr & MSR_SF)) {
@@ -162,6 +169,11 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	/* ... and we're in supervisor mode */
 	crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
 
+	if (priority == BOOKE_IRQPRIO_EXTERNAL_LEVEL) {
+		priority = BOOKE_IRQPRIO_EXTERNAL;
+		keep_irq = true;
+	}
+
 	switch (priority) {
 	case BOOKE_IRQPRIO_DTLB_MISS:
 	case BOOKE_IRQPRIO_DATA_STORAGE:
@@ -214,7 +226,8 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 			vcpu->arch.shared->dar = vcpu->arch.queued_dear;
 		kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask);
 
-		clear_bit(priority, &vcpu->arch.pending_exceptions);
+		if (!keep_irq)
+			clear_bit(priority, &vcpu->arch.pending_exceptions);
 	}
 
 	return allowed;
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index 88258ac..492bb70 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -46,7 +46,9 @@
 #define BOOKE_IRQPRIO_FIT 17
 #define BOOKE_IRQPRIO_DECREMENTER 18
 #define BOOKE_IRQPRIO_PERFORMANCE_MONITOR 19
-#define BOOKE_IRQPRIO_MAX 19
+/* Internal pseudo-irqprio for level triggered externals */
+#define BOOKE_IRQPRIO_EXTERNAL_LEVEL 20
+#define BOOKE_IRQPRIO_MAX 20
 
 extern unsigned long kvmppc_booke_handlers;
 
-- 
cgit v0.10.2


From 6f7a2bd41fa8d52cbf5f32fdf8ba659d4ce4ae59 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 31 Aug 2010 02:03:32 +0200
Subject: KVM: PPC: Document KVM_INTERRUPT ioctl

This adds some documentation for the KVM_INTERRUPT special cases that
PowerPC now implements.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 44d9893..24d6341 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -320,13 +320,13 @@ struct kvm_translation {
 4.15 KVM_INTERRUPT
 
 Capability: basic
-Architectures: x86
+Architectures: x86, ppc
 Type: vcpu ioctl
 Parameters: struct kvm_interrupt (in)
 Returns: 0 on success, -1 on error
 
 Queues a hardware interrupt vector to be injected.  This is only
-useful if in-kernel local APIC is not used.
+useful if in-kernel local APIC or equivalent is not used.
 
 /* for KVM_INTERRUPT */
 struct kvm_interrupt {
@@ -334,8 +334,37 @@ struct kvm_interrupt {
 	__u32 irq;
 };
 
+X86:
+
 Note 'irq' is an interrupt vector, not an interrupt pin or line.
 
+PPC:
+
+Queues an external interrupt to be injected. This ioctl is overleaded
+with 3 different irq values:
+
+a) KVM_INTERRUPT_SET
+
+  This injects an edge type external interrupt into the guest once it's ready
+  to receive interrupts. When injected, the interrupt is done.
+
+b) KVM_INTERRUPT_UNSET
+
+  This unsets any pending interrupt.
+
+  Only available with KVM_CAP_PPC_UNSET_IRQ.
+
+c) KVM_INTERRUPT_SET_LEVEL
+
+  This injects a level type external interrupt into the guest context. The
+  interrupt stays pending until a specific ioctl with KVM_INTERRUPT_UNSET
+  is triggered.
+
+  Only available with KVM_CAP_PPC_IRQ_LEVEL.
+
+Note that any value for 'irq' other than the ones stated above is invalid
+and incurs unexpected behavior.
+
 4.16 KVM_DEBUG_GUEST
 
 Capability: basic
-- 
cgit v0.10.2


From 21e537ba149be99c4d31a04949ca6e0770379662 Mon Sep 17 00:00:00 2001
From: Kyle Moffett <Kyle.D.Moffett@boeing.com>
Date: Mon, 30 Aug 2010 11:38:39 -0400
Subject: KVM: PPC: e500_tlb: Fix a minor copy-paste tracing bug

The kvmppc_e500_stlbe_invalidate() function was trying to pass too many
parameters to trace_kvm_stlb_inval().  This appears to be a bad
copy-paste from a call to trace_kvm_stlb_write().

Signed-off-by: Kyle Moffett <Kyle.D.Moffett@boeing.com>
Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 092a390..a413883 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -226,8 +226,7 @@ static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
 
 	kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel);
 	stlbe->mas1 = 0;
-	trace_kvm_stlb_inval(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2,
-			     stlbe->mas3, stlbe->mas7);
+	trace_kvm_stlb_inval(index_of(tlbsel, esel));
 }
 
 static void kvmppc_e500_tlb1_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
-- 
cgit v0.10.2


From 344941beb9926418663e171a347d1a31d727fe45 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 31 Aug 2010 03:45:39 +0200
Subject: KVM: PPC: Fix compile error in e500_tlb.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The e500_tlb.c file didn't compile for me due to the following error:

arch/powerpc/kvm/e500_tlb.c: In function ‘kvmppc_e500_shadow_map’:
arch/powerpc/kvm/e500_tlb.c:300: error: format ‘%lx’ expects type ‘long unsigned int’, but argument 2 has type ‘gfn_t’

So let's explicitly cast the argument to make printk happy.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index a413883..d6d6d47 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -297,7 +297,8 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	/* Get reference to new page. */
 	new_page = gfn_to_page(vcpu_e500->vcpu.kvm, gfn);
 	if (is_error_page(new_page)) {
-		printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn);
+		printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n",
+				(long)gfn);
 		kvm_release_page_clean(new_page);
 		return;
 	}
-- 
cgit v0.10.2


From d1e87c7ee65a20b10faf7e59dbe2cc934c32473b Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 31 Aug 2010 04:25:39 +0200
Subject: KVM: PPC: Add documentation for magic page enhancements

This documents how to detect additional features inside the magic
page when a guest maps it.

Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/Documentation/kvm/ppc-pv.txt b/Documentation/kvm/ppc-pv.txt
index 922cf95..a7f2244 100644
--- a/Documentation/kvm/ppc-pv.txt
+++ b/Documentation/kvm/ppc-pv.txt
@@ -102,6 +102,20 @@ struct kvm_vcpu_arch_shared {
 Additions to the page must only occur at the end. Struct fields are always 32
 or 64 bit aligned, depending on them being 32 or 64 bit wide respectively.
 
+Magic page features
+===================
+
+When mapping the magic page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE,
+a second return value is passed to the guest. This second return value contains
+a bitmap of available features inside the magic page.
+
+The following enhancements to the magic page are currently available:
+
+  KVM_MAGIC_FEAT_SR		Maps SR registers r/w in the magic page
+
+For enhanced features in the magic page, please check for the existence of the
+feature before using them!
+
 MSR bits
 ========
 
-- 
cgit v0.10.2


From 26e673c3003bc8f24bdbbdcb8bc91a78556f579a Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Fri, 3 Sep 2010 10:22:19 +0200
Subject: KVM: PPC: Move of include to __KERNEL__ section

We have to protect the include for linux/of.h by __KERNEL__ so it doesn't
accidently get referenced outside.

This patch fixes this and makes the tree compile again.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Alexander Graf <agraf@suse.de>

diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index d79fd09..50533f9 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -21,7 +21,6 @@
 #define __POWERPC_KVM_PARA_H__
 
 #include <linux/types.h>
-#include <linux/of.h>
 
 struct kvm_vcpu_arch_shared {
 	__u64 scratch1;
@@ -54,6 +53,8 @@ struct kvm_vcpu_arch_shared {
 
 #ifdef CONFIG_KVM_GUEST
 
+#include <linux/of.h>
+
 static inline int kvm_para_available(void)
 {
 	struct device_node *hyper_node;
-- 
cgit v0.10.2


From f87f928882d080eaec8b0d76aecff003d664697d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 2 Sep 2010 17:29:45 +0200
Subject: KVM: MMU: Fix 32 bit legacy paging with NPT

This patch fixes 32 bit legacy paging with NPT enabled. The
mmu_check_root call on the top-level of the loop causes
root_gfn to take values (in the tdp_enabled path) which are
outside of guest memory. So the mmu_check_root call fails at
some point in the loop interation causing the guest to
tiple-fault.
This patch changes the mmu_check_root calls to the places
where they are really necessary. As a side-effect it
introduces a check for the root of a pae page table too.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d2dad65..b2136f9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2387,6 +2387,10 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 	direct = !is_paging(vcpu);
+
+	if (mmu_check_root(vcpu, root_gfn))
+		return 1;
+
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
@@ -2398,10 +2402,10 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 				continue;
 			}
 			root_gfn = pdptr >> PAGE_SHIFT;
+			if (mmu_check_root(vcpu, root_gfn))
+				return 1;
 		} else if (vcpu->arch.mmu.root_level == 0)
 			root_gfn = 0;
-		if (mmu_check_root(vcpu, root_gfn))
-			return 1;
 		if (tdp_enabled) {
 			direct = 1;
 			root_gfn = i << 30;
-- 
cgit v0.10.2


From cda0008299a06f0d7218c6037c3c02d7a865e954 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 2 Sep 2010 17:29:46 +0200
Subject: KVM: SVM: Restore correct registers after sel_cr0 intercept emulation

This patch implements restoring of the correct rip, rsp, and
rax after the svm emulation in KVM injected a selective_cr0
write intercept into the guest hypervisor. The problem was
that the vmexit is emulated in the instruction emulation
which later commits the registers right after the write-cr0
instruction. So the l1 guest will continue to run with the
l2 rip, rsp and rax resulting in unpredictable behavior.

This patch is not the final word, it is just an easy patch
to fix the issue. The real fix will be done when the
instruction emulator is made aware of nested virtualization.
Until this is done this patch fixes the issue and provides
an easy way to fix this in -stable too.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a1a83b9..0765534 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -88,6 +88,14 @@ struct nested_state {
 	/* A VMEXIT is required but not yet emulated */
 	bool exit_required;
 
+	/*
+	 * If we vmexit during an instruction emulation we need this to restore
+	 * the l1 guest rip after the emulation
+	 */
+	unsigned long vmexit_rip;
+	unsigned long vmexit_rsp;
+	unsigned long vmexit_rax;
+
 	/* cache for intercepts of the guest */
 	u16 intercept_cr_read;
 	u16 intercept_cr_write;
@@ -1213,8 +1221,12 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 		if (old == new) {
 			/* cr0 write with ts and mp unchanged */
 			svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
-			if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE)
+			if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) {
+				svm->nested.vmexit_rip = kvm_rip_read(vcpu);
+				svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+				svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
 				return;
+			}
 		}
 	}
 
@@ -2430,6 +2442,23 @@ static int emulate_on_interception(struct vcpu_svm *svm)
 	return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
 }
 
+static int cr0_write_interception(struct vcpu_svm *svm)
+{
+	struct kvm_vcpu *vcpu = &svm->vcpu;
+	int r;
+
+	r = emulate_instruction(&svm->vcpu, 0, 0, 0);
+
+	if (svm->nested.vmexit_rip) {
+		kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
+		kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp);
+		kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax);
+		svm->nested.vmexit_rip = 0;
+	}
+
+	return r == EMULATE_DONE;
+}
+
 static int cr8_write_interception(struct vcpu_svm *svm)
 {
 	struct kvm_run *kvm_run = svm->vcpu.run;
@@ -2692,7 +2721,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_READ_CR4]			= emulate_on_interception,
 	[SVM_EXIT_READ_CR8]			= emulate_on_interception,
 	[SVM_EXIT_CR0_SEL_WRITE]		= emulate_on_interception,
-	[SVM_EXIT_WRITE_CR0]			= emulate_on_interception,
+	[SVM_EXIT_WRITE_CR0]			= cr0_write_interception,
 	[SVM_EXIT_WRITE_CR3]			= emulate_on_interception,
 	[SVM_EXIT_WRITE_CR4]			= emulate_on_interception,
 	[SVM_EXIT_WRITE_CR8]			= cr8_write_interception,
-- 
cgit v0.10.2


From b75f4eb34122b60ee4f07ec89973d1589002c68a Mon Sep 17 00:00:00 2001
From: "Roedel, Joerg" <Joerg.Roedel@amd.com>
Date: Fri, 3 Sep 2010 14:21:40 +0200
Subject: KVM: SVM: Clean up rip handling in vmrun emulation

This patch changes the rip handling in the vmrun emulation
path from using next_rip to the generic kvm register access
functions.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 0765534..fcbc491 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2069,7 +2069,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 		return false;
 	}
 
-	trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa,
+	trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
 			       nested_vmcb->save.rip,
 			       nested_vmcb->control.int_ctl,
 			       nested_vmcb->control.event_inj,
@@ -2098,7 +2098,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
 	hsave->save.cr4    = svm->vcpu.arch.cr4;
 	hsave->save.rflags = vmcb->save.rflags;
-	hsave->save.rip    = svm->next_rip;
+	hsave->save.rip    = kvm_rip_read(&svm->vcpu);
 	hsave->save.rsp    = vmcb->save.rsp;
 	hsave->save.rax    = vmcb->save.rax;
 	if (npt_enabled)
@@ -2270,8 +2270,8 @@ static int vmrun_interception(struct vcpu_svm *svm)
 	if (nested_svm_check_permissions(svm))
 		return 1;
 
-	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
-	skip_emulated_instruction(&svm->vcpu);
+	/* Save rip after vmrun instruction */
+	kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
 
 	if (!nested_svm_vmrun(svm))
 		return 1;
-- 
cgit v0.10.2


From b9a52c4b78ec254ee00cce47d75efd89b09f13dd Mon Sep 17 00:00:00 2001
From: Jes Sorensen <Jes.Sorensen@redhat.com>
Date: Thu, 9 Sep 2010 12:06:45 +0200
Subject: x86: Define MSR_EBC_FREQUENCY_ID

Signed-off-by: Jes Sorensen <Jes.Sorensen@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 986f779..83c4bb1 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -198,6 +198,7 @@
 #define MSR_IA32_TSC			0x00000010
 #define MSR_IA32_PLATFORM_ID		0x00000017
 #define MSR_IA32_EBL_CR_POWERON		0x0000002a
+#define MSR_EBC_FREQUENCY_ID		0x0000002c
 #define MSR_IA32_FEATURE_CONTROL        0x0000003a
 
 #define FEATURE_CONTROL_LOCKED				(1<<0)
-- 
cgit v0.10.2


From 7b91409822ed37f2a58974e49498bdbe92ddd93c Mon Sep 17 00:00:00 2001
From: Jes Sorensen <Jes.Sorensen@redhat.com>
Date: Thu, 9 Sep 2010 12:06:46 +0200
Subject: KVM: x86: Emulate MSR_EBC_FREQUENCY_ID

Some operating systems store data about the host processor at the
time of installation, and when booted on a more uptodate cpu tries
to read MSR_EBC_FREQUENCY_ID. This has been found with XP.

Signed-off-by: Jes Sorensen <Jes.Sorensen@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f47db25..9d43477 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1651,6 +1651,20 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case 0xcd: /* fsb frequency */
 		data = 3;
 		break;
+		/*
+		 * MSR_EBC_FREQUENCY_ID
+		 * Conservative value valid for even the basic CPU models.
+		 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
+		 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
+		 * and 266MHz for model 3, or 4. Set Core Clock
+		 * Frequency to System Bus Frequency Ratio to 1 (bits
+		 * 31:24) even though these are only valid for CPU
+		 * models > 2, however guests may end up dividing or
+		 * multiplying by zero otherwise.
+		 */
+	case MSR_EBC_FREQUENCY_ID:
+		data = 1 << 24;
+		break;
 	case MSR_IA32_APICBASE:
 		data = kvm_get_apic_base(vcpu);
 		break;
-- 
cgit v0.10.2


From 957446afce22df9a42b9482fcd55985f4037fe66 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:38 +0200
Subject: KVM: MMU: Check for root_level instead of long mode

The walk_addr function checks for !is_long_mode in its 64
bit version. But what is meant here is a check for pae
paging. Change the condition to really check for pae paging
so that it also works with nested nested paging.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index debe770..e4ad3dc 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -132,7 +132,7 @@ walk:
 	walker->level = vcpu->arch.mmu.root_level;
 	pte = vcpu->arch.cr3;
 #if PTTYPE == 64
-	if (!is_long_mode(vcpu)) {
+	if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
 		pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
 		trace_kvm_mmu_paging_element(pte, walker->level);
 		if (!is_present_gpte(pte)) {
@@ -205,7 +205,7 @@ walk:
 				(PTTYPE == 64 || is_pse(vcpu))) ||
 		    ((walker->level == PT_PDPE_LEVEL) &&
 				is_large_pte(pte) &&
-				is_long_mode(vcpu))) {
+				vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL)) {
 			int lvl = walker->level;
 
 			walker->gfn = gpte_to_gfn_lvl(pte, lvl);
-- 
cgit v0.10.2


From c5a78f2b649ae75ae788e7622ca5a586af2cb35a Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:39 +0200
Subject: KVM: MMU: Make tdp_enabled a mmu-context parameter

This patch changes the tdp_enabled flag from its global
meaning to the mmu-context and renames it to direct_map
there. This is necessary for Nested SVM with emulation of
Nested Paging where we need an extra MMU context to shadow
the Nested Nested Page Table.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 789e946..80ef28b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -249,6 +249,7 @@ struct kvm_mmu {
 	int root_level;
 	int shadow_root_level;
 	union kvm_mmu_page_role base_role;
+	bool direct_map;
 
 	u64 *pae_root;
 	u64 rsvd_bits_mask[2][4];
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b2136f9..5c28e97 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1448,7 +1448,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	if (role.direct)
 		role.cr4_pae = 0;
 	role.access = access;
-	if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+	if (!vcpu->arch.mmu.direct_map
+	    && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
 		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
 		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
 		role.quadrant = quadrant;
@@ -1973,7 +1974,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		spte |= shadow_user_mask;
 	if (level > PT_PAGE_TABLE_LEVEL)
 		spte |= PT_PAGE_SIZE_MASK;
-	if (tdp_enabled)
+	if (vcpu->arch.mmu.direct_map)
 		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
 			kvm_is_mmio_pfn(pfn));
 
@@ -1983,8 +1984,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	spte |= (u64)pfn << PAGE_SHIFT;
 
 	if ((pte_access & ACC_WRITE_MASK)
-	    || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
-		&& !user_fault)) {
+	    || (!vcpu->arch.mmu.direct_map && write_fault
+		&& !is_write_protection(vcpu) && !user_fault)) {
 
 		if (level > PT_PAGE_TABLE_LEVEL &&
 		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
@@ -1995,7 +1996,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 		spte |= PT_WRITABLE_MASK;
 
-		if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK))
+		if (!vcpu->arch.mmu.direct_map
+		    && !(pte_access & ACC_WRITE_MASK))
 			spte &= ~PT_USER_MASK;
 
 		/*
@@ -2371,7 +2373,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 		ASSERT(!VALID_PAGE(root));
 		if (mmu_check_root(vcpu, root_gfn))
 			return 1;
-		if (tdp_enabled) {
+		if (vcpu->arch.mmu.direct_map) {
 			direct = 1;
 			root_gfn = 0;
 		}
@@ -2406,7 +2408,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 				return 1;
 		} else if (vcpu->arch.mmu.root_level == 0)
 			root_gfn = 0;
-		if (tdp_enabled) {
+		if (vcpu->arch.mmu.direct_map) {
 			direct = 1;
 			root_gfn = i << 30;
 		}
@@ -2544,6 +2546,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 	context->root_level = 0;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
+	context->direct_map = true;
 	return 0;
 }
 
@@ -2663,6 +2666,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 	context->root_level = level;
 	context->shadow_root_level = level;
 	context->root_hpa = INVALID_PAGE;
+	context->direct_map = false;
 	return 0;
 }
 
@@ -2687,6 +2691,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 	context->root_level = PT32_ROOT_LEVEL;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
+	context->direct_map = false;
 	return 0;
 }
 
@@ -2708,6 +2713,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->invlpg = nonpaging_invlpg;
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
 	context->root_hpa = INVALID_PAGE;
+	context->direct_map = true;
 
 	if (!is_paging(vcpu)) {
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -3060,7 +3066,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
 	gpa_t gpa;
 	int r;
 
-	if (tdp_enabled)
+	if (vcpu->arch.mmu.direct_map)
 		return 0;
 
 	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
-- 
cgit v0.10.2


From f43addd46168110d572dcf69100cb215a4e9fd08 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:40 +0200
Subject: KVM: MMU: Make set_cr3 a function pointer in kvm_mmu

This is necessary to implement Nested Nested Paging. As a
side effect this allows some cleanups in the SVM nested
paging code.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 80ef28b..53ceded 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -236,6 +236,7 @@ struct kvm_pio_request {
  */
 struct kvm_mmu {
 	void (*new_cr3)(struct kvm_vcpu *vcpu);
+	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
 	int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
 	void (*free)(struct kvm_vcpu *vcpu);
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5c28e97..c8acb96 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2714,6 +2714,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = true;
+	context->set_cr3 = kvm_x86_ops->set_cr3;
 
 	if (!is_paging(vcpu)) {
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -2752,7 +2753,8 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 		r = paging32_init_context(vcpu);
 
 	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
-	vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
+	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
+	vcpu->arch.mmu.set_cr3           = kvm_x86_ops->set_cr3;
 
 	return r;
 }
@@ -2796,7 +2798,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 	if (r)
 		goto out;
 	/* set_cr3() should ensure TLB has been flushed */
-	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
+	vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
 out:
 	return r;
 }
-- 
cgit v0.10.2


From 1c97f0a04c74196880f22a563134c8f6d0b9d752 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:41 +0200
Subject: KVM: X86: Introduce a tdp_set_cr3 function

This patch introduces a special set_tdp_cr3 function pointer
in kvm_x86_ops which is only used for tpd enabled mmu
contexts. This allows to remove some hacks from svm code.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 53ceded..81a5147 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -526,6 +526,8 @@ struct kvm_x86_ops {
 	bool (*rdtscp_supported)(void);
 	void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment);
 
+	void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
+
 	void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
 
 	bool (*has_wbinvd_exit)(void);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c8acb96..a55f8d5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2714,7 +2714,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = true;
-	context->set_cr3 = kvm_x86_ops->set_cr3;
+	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
 
 	if (!is_paging(vcpu)) {
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index fcbc491..53c9039 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3216,9 +3216,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	savesegment(gs, gs_selector);
 	ldt_selector = kvm_read_ldt();
 	svm->vmcb->save.cr2 = vcpu->arch.cr2;
-	/* required for live migration with NPT */
-	if (npt_enabled)
-		svm->vmcb->save.cr3 = vcpu->arch.cr3;
 
 	clgi();
 
@@ -3340,16 +3337,22 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	if (npt_enabled) {
-		svm->vmcb->control.nested_cr3 = root;
-		force_new_asid(vcpu);
-		return;
-	}
-
 	svm->vmcb->save.cr3 = root;
 	force_new_asid(vcpu);
 }
 
+static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	svm->vmcb->control.nested_cr3 = root;
+
+	/* Also sync guest cr3 here in case we live migrate */
+	svm->vmcb->save.cr3 = vcpu->arch.cr3;
+
+	force_new_asid(vcpu);
+}
+
 static int is_disabled(void)
 {
 	u64 vm_cr;
@@ -3576,6 +3579,8 @@ static struct kvm_x86_ops svm_x86_ops = {
 
 	.write_tsc_offset = svm_write_tsc_offset,
 	.adjust_tsc_offset = svm_adjust_tsc_offset,
+
+	.set_tdp_cr3 = set_tdp_cr3,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 275a81d..ff7a8d4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4341,6 +4341,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
 	.write_tsc_offset = vmx_write_tsc_offset,
 	.adjust_tsc_offset = vmx_adjust_tsc_offset,
+
+	.set_tdp_cr3 = vmx_set_cr3,
 };
 
 static int __init vmx_init(void)
-- 
cgit v0.10.2


From 5777ed340d89cdc6c76a5c552337a3861b40a806 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:42 +0200
Subject: KVM: MMU: Introduce get_cr3 function pointer

This function pointer in the MMU context is required to
implement Nested Nested Paging.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 81a5147..6c97b8d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -237,6 +237,7 @@ struct kvm_pio_request {
 struct kvm_mmu {
 	void (*new_cr3)(struct kvm_vcpu *vcpu);
 	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
+	unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
 	int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
 	void (*free)(struct kvm_vcpu *vcpu);
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a55f8d5..e4a7de4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2365,7 +2365,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 	int direct = 0;
 	u64 pdptr;
 
-	root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
+	root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
 
 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -2562,6 +2562,11 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu)
 	mmu_free_roots(vcpu);
 }
 
+static unsigned long get_cr3(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.cr3;
+}
+
 static void inject_page_fault(struct kvm_vcpu *vcpu,
 			      u64 addr,
 			      u32 err_code)
@@ -2715,6 +2720,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = true;
 	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
+	context->get_cr3 = get_cr3;
 
 	if (!is_paging(vcpu)) {
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -2755,6 +2761,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
 	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
 	vcpu->arch.mmu.set_cr3           = kvm_x86_ops->set_cr3;
+	vcpu->arch.mmu.get_cr3           = get_cr3;
 
 	return r;
 }
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index e4ad3dc..13d0c06 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -130,7 +130,7 @@ walk:
 	present = true;
 	eperm = rsvd_fault = false;
 	walker->level = vcpu->arch.mmu.root_level;
-	pte = vcpu->arch.cr3;
+	pte = vcpu->arch.mmu.get_cr3(vcpu);
 #if PTTYPE == 64
 	if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
 		pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
@@ -143,7 +143,7 @@ walk:
 	}
 #endif
 	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
-	       (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
+	       (vcpu->arch.mmu.get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
 
 	pt_access = ACC_ALL;
 
-- 
cgit v0.10.2


From cb659db8a7d1ed558898f533a957dfc342f9499d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:43 +0200
Subject: KVM: MMU: Introduce inject_page_fault function pointer

This patch introduces an inject_page_fault function pointer
into struct kvm_mmu which will be used to inject a page
fault. This will be used later when Nested Nested Paging is
implemented.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6c97b8d..009a4a1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -239,6 +239,9 @@ struct kvm_mmu {
 	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
 	unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
 	int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
+	void (*inject_page_fault)(struct kvm_vcpu *vcpu,
+				  unsigned long addr,
+				  u32 error_code);
 	void (*free)(struct kvm_vcpu *vcpu);
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
 			    u32 *error);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e4a7de4..a751dfc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2571,7 +2571,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu,
 			      u64 addr,
 			      u32 err_code)
 {
-	kvm_inject_page_fault(vcpu, addr, err_code);
+	vcpu->arch.mmu.inject_page_fault(vcpu, addr, err_code);
 }
 
 static void paging_free(struct kvm_vcpu *vcpu)
@@ -2721,6 +2721,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->direct_map = true;
 	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
 	context->get_cr3 = get_cr3;
+	context->inject_page_fault = kvm_inject_page_fault;
 
 	if (!is_paging(vcpu)) {
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -2762,6 +2763,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
 	vcpu->arch.mmu.set_cr3           = kvm_x86_ops->set_cr3;
 	vcpu->arch.mmu.get_cr3           = get_cr3;
+	vcpu->arch.mmu.inject_page_fault = kvm_inject_page_fault;
 
 	return r;
 }
-- 
cgit v0.10.2


From 52fde8df7dd13d90f5f8dc43157418bff968d90a Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:44 +0200
Subject: KVM: MMU: Introduce kvm_init_shadow_mmu helper function

Some logic of the init_kvm_softmmu function is required to
build the Nested Nested Paging context. So factor the
required logic into a seperate function and export it.
Also make the whole init path suitable for more than one mmu
context.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a751dfc..9e48a77 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2532,10 +2532,9 @@ static void nonpaging_free(struct kvm_vcpu *vcpu)
 	mmu_free_roots(vcpu);
 }
 
-static int nonpaging_init_context(struct kvm_vcpu *vcpu)
+static int nonpaging_init_context(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu *context)
 {
-	struct kvm_mmu *context = &vcpu->arch.mmu;
-
 	context->new_cr3 = nonpaging_new_cr3;
 	context->page_fault = nonpaging_page_fault;
 	context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -2595,9 +2594,10 @@ static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
 #include "paging_tmpl.h"
 #undef PTTYPE
 
-static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
+static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu *context,
+				  int level)
 {
-	struct kvm_mmu *context = &vcpu->arch.mmu;
 	int maxphyaddr = cpuid_maxphyaddr(vcpu);
 	u64 exb_bit_rsvd = 0;
 
@@ -2656,9 +2656,11 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
 	}
 }
 
-static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
+static int paging64_init_context_common(struct kvm_vcpu *vcpu,
+					struct kvm_mmu *context,
+					int level)
 {
-	struct kvm_mmu *context = &vcpu->arch.mmu;
+	reset_rsvds_bits_mask(vcpu, context, level);
 
 	ASSERT(is_pae(vcpu));
 	context->new_cr3 = paging_new_cr3;
@@ -2675,17 +2677,17 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 	return 0;
 }
 
-static int paging64_init_context(struct kvm_vcpu *vcpu)
+static int paging64_init_context(struct kvm_vcpu *vcpu,
+				 struct kvm_mmu *context)
 {
-	reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
-	return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
+	return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
 }
 
-static int paging32_init_context(struct kvm_vcpu *vcpu)
+static int paging32_init_context(struct kvm_vcpu *vcpu,
+				 struct kvm_mmu *context)
 {
-	struct kvm_mmu *context = &vcpu->arch.mmu;
+	reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
 
-	reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging32_page_fault;
 	context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2700,10 +2702,10 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static int paging32E_init_context(struct kvm_vcpu *vcpu)
+static int paging32E_init_context(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu *context)
 {
-	reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
-	return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
+	return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
 }
 
 static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
@@ -2727,15 +2729,15 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
 		context->root_level = 0;
 	} else if (is_long_mode(vcpu)) {
-		reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
+		reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
 		context->gva_to_gpa = paging64_gva_to_gpa;
 		context->root_level = PT64_ROOT_LEVEL;
 	} else if (is_pae(vcpu)) {
-		reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
+		reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
 		context->gva_to_gpa = paging64_gva_to_gpa;
 		context->root_level = PT32E_ROOT_LEVEL;
 	} else {
-		reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
+		reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
 		context->gva_to_gpa = paging32_gva_to_gpa;
 		context->root_level = PT32_ROOT_LEVEL;
 	}
@@ -2743,24 +2745,32 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
+int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 {
 	int r;
-
 	ASSERT(vcpu);
 	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
 	if (!is_paging(vcpu))
-		r = nonpaging_init_context(vcpu);
+		r = nonpaging_init_context(vcpu, context);
 	else if (is_long_mode(vcpu))
-		r = paging64_init_context(vcpu);
+		r = paging64_init_context(vcpu, context);
 	else if (is_pae(vcpu))
-		r = paging32E_init_context(vcpu);
+		r = paging32E_init_context(vcpu, context);
 	else
-		r = paging32_init_context(vcpu);
+		r = paging32_init_context(vcpu, context);
 
 	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
 	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
+
+static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
+{
+	int r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
+
 	vcpu->arch.mmu.set_cr3           = kvm_x86_ops->set_cr3;
 	vcpu->arch.mmu.get_cr3           = get_cr3;
 	vcpu->arch.mmu.inject_page_fault = kvm_inject_page_fault;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index f05a03d..7086ca8 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -49,6 +49,7 @@
 #define PFERR_FETCH_MASK (1U << 4)
 
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
+int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
-- 
cgit v0.10.2


From 3241f22da85d26505b39f525a88f52ebd1235975 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:45 +0200
Subject: KVM: MMU: Let is_rsvd_bits_set take mmu context instead of vcpu

This patch changes is_rsvd_bits_set() function prototype to
take only a kvm_mmu context instead of a full vcpu.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9e48a77..86f7557 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2578,12 +2578,12 @@ static void paging_free(struct kvm_vcpu *vcpu)
 	nonpaging_free(vcpu);
 }
 
-static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
+static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
 {
 	int bit7;
 
 	bit7 = (gpte >> 7) & 1;
-	return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
+	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
 }
 
 #define PTTYPE 64
@@ -2859,7 +2859,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 		return;
         }
 
-	if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
+	if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
 		return;
 
 	++vcpu->kvm->stat.mmu_pte_updated;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 13d0c06..68ee1b7 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -168,7 +168,7 @@ walk:
 			break;
 		}
 
-		if (is_rsvd_bits_set(vcpu, pte, walker->level)) {
+		if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) {
 			rsvd_fault = true;
 			break;
 		}
@@ -327,6 +327,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 				u64 *sptep)
 {
 	struct kvm_mmu_page *sp;
+	struct kvm_mmu *mmu = &vcpu->arch.mmu;
 	pt_element_t *gptep = gw->prefetch_ptes;
 	u64 *spte;
 	int i;
@@ -358,7 +359,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 		gpte = gptep[i];
 
 		if (!is_present_gpte(gpte) ||
-		      is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)) {
+		      is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
 			if (!sp->unsync)
 				__set_spte(spte, shadow_notrap_nonpresent_pte);
 			continue;
@@ -713,7 +714,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			return -EINVAL;
 
 		gfn = gpte_to_gfn(gpte);
-		if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)
+		if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
 		      || gfn != sp->gfns[i] || !is_present_gpte(gpte)
 		      || !(gpte & PT_ACCESSED_MASK)) {
 			u64 nonpresent;
-- 
cgit v0.10.2


From 8df25a328a6ca3bd0f048278f4d5ae0a1f6fadc1 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:46 +0200
Subject: KVM: MMU: Track page fault data in struct vcpu

This patch introduces a struct with two new fields in
vcpu_arch for x86:

	* fault.address
	* fault.error_code

This will be used to correctly propagate page faults back
into the guest when we could have either an ordinary page
fault or a nested page fault. In the case of a nested page
fault the fault-address is different from the original
address that should be walked. So we need to keep track
about the real fault-address.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 1bf1140..5187dd8 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -229,7 +229,6 @@ struct x86_emulate_ctxt {
 	int exception; /* exception that happens during emulation or -1 */
 	u32 error_code; /* error code for exception */
 	bool error_code_valid;
-	unsigned long cr2; /* faulted address in case of #PF */
 
 	/* decode cache */
 	struct decode_cache decode;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 009a4a1..3fde5b3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -239,9 +239,7 @@ struct kvm_mmu {
 	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
 	unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
 	int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
-	void (*inject_page_fault)(struct kvm_vcpu *vcpu,
-				  unsigned long addr,
-				  u32 error_code);
+	void (*inject_page_fault)(struct kvm_vcpu *vcpu);
 	void (*free)(struct kvm_vcpu *vcpu);
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
 			    u32 *error);
@@ -288,6 +286,16 @@ struct kvm_vcpu_arch {
 	bool tpr_access_reporting;
 
 	struct kvm_mmu mmu;
+
+	/*
+	 * This struct is filled with the necessary information to propagate a
+	 * page fault into the guest
+	 */
+	struct {
+		u64      address;
+		unsigned error_code;
+	} fault;
+
 	/* only needed in kvm_pv_mmu_op() path, but it's hot so
 	 * put it here to avoid allocation */
 	struct kvm_pv_mmu_op_buffer mmu_op_buffer;
@@ -624,8 +632,7 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
-			   u32 error_code);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu);
 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
 
 int kvm_pic_set_irq(void *opaque, int irq, int level);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 27d2c22..2b08b78 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -487,11 +487,9 @@ static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
 	emulate_exception(ctxt, GP_VECTOR, err, true);
 }
 
-static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr,
-		       int err)
+static void emulate_pf(struct x86_emulate_ctxt *ctxt)
 {
-	ctxt->cr2 = addr;
-	emulate_exception(ctxt, PF_VECTOR, err, true);
+	emulate_exception(ctxt, PF_VECTOR, 0, true);
 }
 
 static void emulate_ud(struct x86_emulate_ctxt *ctxt)
@@ -834,7 +832,7 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
 		rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
 					ctxt->vcpu);
 		if (rc == X86EMUL_PROPAGATE_FAULT)
-			emulate_pf(ctxt, addr, err);
+			emulate_pf(ctxt);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		mc->end += n;
@@ -921,7 +919,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	addr = dt.address + index * 8;
 	ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,  &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT)
-		emulate_pf(ctxt, addr, err);
+		emulate_pf(ctxt);
 
        return ret;
 }
@@ -947,7 +945,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	addr = dt.address + index * 8;
 	ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT)
-		emulate_pf(ctxt, addr, err);
+		emulate_pf(ctxt);
 
 	return ret;
 }
@@ -1117,7 +1115,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 					&err,
 					ctxt->vcpu);
 		if (rc == X86EMUL_PROPAGATE_FAULT)
-			emulate_pf(ctxt, c->dst.addr.mem, err);
+			emulate_pf(ctxt);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		break;
@@ -1939,7 +1937,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 			    &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt, old_tss_base, err);
+		emulate_pf(ctxt);
 		return ret;
 	}
 
@@ -1949,7 +1947,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 			     &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt, old_tss_base, err);
+		emulate_pf(ctxt);
 		return ret;
 	}
 
@@ -1957,7 +1955,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 			    &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt, new_tss_base, err);
+		emulate_pf(ctxt);
 		return ret;
 	}
 
@@ -1970,7 +1968,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 				     ctxt->vcpu, &err);
 		if (ret == X86EMUL_PROPAGATE_FAULT) {
 			/* FIXME: need to provide precise fault address */
-			emulate_pf(ctxt, new_tss_base, err);
+			emulate_pf(ctxt);
 			return ret;
 		}
 	}
@@ -2081,7 +2079,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 			    &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt, old_tss_base, err);
+		emulate_pf(ctxt);
 		return ret;
 	}
 
@@ -2091,7 +2089,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 			     &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt, old_tss_base, err);
+		emulate_pf(ctxt);
 		return ret;
 	}
 
@@ -2099,7 +2097,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 			    &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt, new_tss_base, err);
+		emulate_pf(ctxt);
 		return ret;
 	}
 
@@ -2112,7 +2110,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 				     ctxt->vcpu, &err);
 		if (ret == X86EMUL_PROPAGATE_FAULT) {
 			/* FIXME: need to provide precise fault address */
-			emulate_pf(ctxt, new_tss_base, err);
+			emulate_pf(ctxt);
 			return ret;
 		}
 	}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 86f7557..9936727 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2566,11 +2566,9 @@ static unsigned long get_cr3(struct kvm_vcpu *vcpu)
 	return vcpu->arch.cr3;
 }
 
-static void inject_page_fault(struct kvm_vcpu *vcpu,
-			      u64 addr,
-			      u32 err_code)
+static void inject_page_fault(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.mmu.inject_page_fault(vcpu, addr, err_code);
+	vcpu->arch.mmu.inject_page_fault(vcpu);
 }
 
 static void paging_free(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 68ee1b7..d07f48a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -258,6 +258,10 @@ error:
 		walker->error_code |= PFERR_FETCH_MASK;
 	if (rsvd_fault)
 		walker->error_code |= PFERR_RSVD_MASK;
+
+	vcpu->arch.fault.address    = addr;
+	vcpu->arch.fault.error_code = walker->error_code;
+
 	trace_kvm_mmu_walker_error(walker->error_code);
 	return 0;
 }
@@ -521,7 +525,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	 */
 	if (!r) {
 		pgprintk("%s: guest page fault\n", __func__);
-		inject_page_fault(vcpu, addr, walker.error_code);
+		inject_page_fault(vcpu);
 		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
 		return 0;
 	}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9d43477..48b74d2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -329,11 +329,12 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 }
 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
 
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
-			   u32 error_code)
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu)
 {
+	unsigned error_code = vcpu->arch.fault.error_code;
+
 	++vcpu->stat.pf_guest;
-	vcpu->arch.cr2 = addr;
+	vcpu->arch.cr2 = vcpu->arch.fault.address;
 	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
 }
 
@@ -4080,7 +4081,7 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
 {
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 	if (ctxt->exception == PF_VECTOR)
-		kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code);
+		kvm_inject_page_fault(vcpu);
 	else if (ctxt->error_code_valid)
 		kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
 	else
-- 
cgit v0.10.2


From 1e301feb079e8ee6091bb75283e960fc33059a68 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:47 +0200
Subject: KVM: MMU: Introduce generic walk_addr function

This is the first patch in the series towards a generic
walk_addr implementation which could walk two-dimensional
page tables in the end. In this first step the walk_addr
function is renamed into walk_addr_generic which takes a
mmu context as an additional parameter.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index d07f48a..a704a81 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -114,9 +114,10 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
 /*
  * Fetch a guest pte for a guest virtual address
  */
-static int FNAME(walk_addr)(struct guest_walker *walker,
-			    struct kvm_vcpu *vcpu, gva_t addr,
-			    int write_fault, int user_fault, int fetch_fault)
+static int FNAME(walk_addr_generic)(struct guest_walker *walker,
+				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+				    gva_t addr, int write_fault,
+				    int user_fault, int fetch_fault)
 {
 	pt_element_t pte;
 	gfn_t table_gfn;
@@ -129,10 +130,11 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 walk:
 	present = true;
 	eperm = rsvd_fault = false;
-	walker->level = vcpu->arch.mmu.root_level;
-	pte = vcpu->arch.mmu.get_cr3(vcpu);
+	walker->level = mmu->root_level;
+	pte           = mmu->get_cr3(vcpu);
+
 #if PTTYPE == 64
-	if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
+	if (walker->level == PT32E_ROOT_LEVEL) {
 		pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
 		trace_kvm_mmu_paging_element(pte, walker->level);
 		if (!is_present_gpte(pte)) {
@@ -143,7 +145,7 @@ walk:
 	}
 #endif
 	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
-	       (vcpu->arch.mmu.get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
+	       (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
 
 	pt_access = ACC_ALL;
 
@@ -205,7 +207,7 @@ walk:
 				(PTTYPE == 64 || is_pse(vcpu))) ||
 		    ((walker->level == PT_PDPE_LEVEL) &&
 				is_large_pte(pte) &&
-				vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL)) {
+				mmu->root_level == PT64_ROOT_LEVEL)) {
 			int lvl = walker->level;
 
 			walker->gfn = gpte_to_gfn_lvl(pte, lvl);
@@ -266,6 +268,14 @@ error:
 	return 0;
 }
 
+static int FNAME(walk_addr)(struct guest_walker *walker,
+			    struct kvm_vcpu *vcpu, gva_t addr,
+			    int write_fault, int user_fault, int fetch_fault)
+{
+	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
+					write_fault, user_fault, fetch_fault);
+}
+
 static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			      u64 *spte, const void *pte)
 {
-- 
cgit v0.10.2


From c30a358d33e0e111f06e54a4a4125371e6b6693c Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:48 +0200
Subject: KVM: MMU: Add infrastructure for two-level page walker

This patch introduces a mmu-callback to translate gpa
addresses in the walk_addr code. This is later used to
translate l2_gpa addresses into l1_gpa addresses.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3fde5b3..4915b7c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -243,6 +243,7 @@ struct kvm_mmu {
 	void (*free)(struct kvm_vcpu *vcpu);
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
 			    u32 *error);
+	gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
 	void (*prefetch_page)(struct kvm_vcpu *vcpu,
 			      struct kvm_mmu_page *page);
 	int (*sync_page)(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 48b74d2..2364c2c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3448,6 +3448,11 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
 	kvm_x86_ops->get_segment(vcpu, var, seg);
 }
 
+static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+{
+	return gpa;
+}
+
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
@@ -5659,6 +5664,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+	vcpu->arch.mmu.translate_gpa = translate_gpa;
 	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 	else
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f2ecdd5..917e68f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -534,6 +534,11 @@ static inline gpa_t gfn_to_gpa(gfn_t gfn)
 	return (gpa_t)gfn << PAGE_SHIFT;
 }
 
+static inline gfn_t gpa_to_gfn(gpa_t gpa)
+{
+	return (gfn_t)(gpa >> PAGE_SHIFT);
+}
+
 static inline hpa_t pfn_to_hpa(pfn_t pfn)
 {
 	return (hpa_t)pfn << PAGE_SHIFT;
-- 
cgit v0.10.2


From 14dfe855f978181cd611ec018e5ceba860a98545 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:49 +0200
Subject: KVM: X86: Introduce pointer to mmu context used for gva_to_gpa

This patch introduces the walk_mmu pointer which points to
the mmu-context currently used for gva_to_gpa translations.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4915b7c..1b3eb8a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -286,9 +286,22 @@ struct kvm_vcpu_arch {
 	u64 ia32_misc_enable_msr;
 	bool tpr_access_reporting;
 
+	/*
+	 * Paging state of the vcpu
+	 *
+	 * If the vcpu runs in guest mode with two level paging this still saves
+	 * the paging mode of the l1 guest. This context is always used to
+	 * handle faults.
+	 */
 	struct kvm_mmu mmu;
 
 	/*
+	 * Pointer to the mmu context currently used for
+	 * gva_to_gpa translations.
+	 */
+	struct kvm_mmu *walk_mmu;
+
+	/*
 	 * This struct is filled with the necessary information to propagate a
 	 * page fault into the guest
 	 */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9936727..cb06ada 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2708,7 +2708,7 @@ static int paging32E_init_context(struct kvm_vcpu *vcpu,
 
 static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 {
-	struct kvm_mmu *context = &vcpu->arch.mmu;
+	struct kvm_mmu *context = vcpu->arch.walk_mmu;
 
 	context->new_cr3 = nonpaging_new_cr3;
 	context->page_fault = tdp_page_fault;
@@ -2767,11 +2767,11 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
-	int r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
+	int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
 
-	vcpu->arch.mmu.set_cr3           = kvm_x86_ops->set_cr3;
-	vcpu->arch.mmu.get_cr3           = get_cr3;
-	vcpu->arch.mmu.inject_page_fault = kvm_inject_page_fault;
+	vcpu->arch.walk_mmu->set_cr3           = kvm_x86_ops->set_cr3;
+	vcpu->arch.walk_mmu->get_cr3           = get_cr3;
+	vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
 
 	return r;
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2364c2c..4196fc7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3456,27 +3456,27 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
-	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
 }
 
  gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 	access |= PFERR_FETCH_MASK;
-	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
 }
 
 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 	access |= PFERR_WRITE_MASK;
-	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
 }
 
 /* uses this to access any guest's mapped memory without checking CPL */
 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
-	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error);
+	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error);
 }
 
 static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
@@ -3487,7 +3487,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
 	int r = X86EMUL_CONTINUE;
 
 	while (bytes) {
-		gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
+		gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
+							    error);
 		unsigned offset = addr & (PAGE_SIZE-1);
 		unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
 		int ret;
@@ -3542,8 +3543,9 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
 	int r = X86EMUL_CONTINUE;
 
 	while (bytes) {
-		gpa_t gpa =  vcpu->arch.mmu.gva_to_gpa(vcpu, addr,
-						       PFERR_WRITE_MASK, error);
+		gpa_t gpa =  vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
+							     PFERR_WRITE_MASK,
+							     error);
 		unsigned offset = addr & (PAGE_SIZE-1);
 		unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
 		int ret;
@@ -5663,6 +5665,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	kvm = vcpu->kvm;
 
 	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
+	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 	vcpu->arch.mmu.translate_gpa = translate_gpa;
 	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
-- 
cgit v0.10.2


From 6539e738f65a8f1fc7806295d5d701fba4008343 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:50 +0200
Subject: KVM: MMU: Implement nested gva_to_gpa functions

This patch adds the functions to do a nested l2_gva to
l1_gpa page table walk.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1b3eb8a..8ec3547 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -296,6 +296,16 @@ struct kvm_vcpu_arch {
 	struct kvm_mmu mmu;
 
 	/*
+	 * Paging state of an L2 guest (used for nested npt)
+	 *
+	 * This context will save all necessary information to walk page tables
+	 * of the an L2 guest. This context is only initialized for page table
+	 * walking and not for faulting since we never handle l2 page faults on
+	 * the host.
+	 */
+	struct kvm_mmu nested_mmu;
+
+	/*
 	 * Pointer to the mmu context currently used for
 	 * gva_to_gpa translations.
 	 */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cb06ada..1e215e8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2466,6 +2466,14 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
 	return vaddr;
 }
 
+static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
+					 u32 access, u32 *error)
+{
+	if (error)
+		*error = 0;
+	return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
+}
+
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 				u32 error_code)
 {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a704a81..eefe363 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -276,6 +276,16 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 					write_fault, user_fault, fetch_fault);
 }
 
+static int FNAME(walk_addr_nested)(struct guest_walker *walker,
+				   struct kvm_vcpu *vcpu, gva_t addr,
+				   int write_fault, int user_fault,
+				   int fetch_fault)
+{
+	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
+					addr, write_fault, user_fault,
+					fetch_fault);
+}
+
 static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			      u64 *spte, const void *pte)
 {
@@ -660,6 +670,27 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
 	return gpa;
 }
 
+static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
+				      u32 access, u32 *error)
+{
+	struct guest_walker walker;
+	gpa_t gpa = UNMAPPED_GVA;
+	int r;
+
+	r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr,
+				    access & PFERR_WRITE_MASK,
+				    access & PFERR_USER_MASK,
+				    access & PFERR_FETCH_MASK);
+
+	if (r) {
+		gpa = gfn_to_gpa(walker.gfn);
+		gpa |= vaddr & ~PAGE_MASK;
+	} else if (error)
+		*error = walker.error_code;
+
+	return gpa;
+}
+
 static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
 				 struct kvm_mmu_page *sp)
 {
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2d6385e..bf4dc2f 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -50,6 +50,11 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu)
 #endif
 }
 
+static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
+}
+
 static inline int is_pae(struct kvm_vcpu *vcpu)
 {
 	return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
-- 
cgit v0.10.2


From ec92fe44e7ff94d04d8305e49efcffd8773e1cf6 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:51 +0200
Subject: KVM: X86: Add kvm_read_guest_page_mmu function

This patch adds a function which can read from the guests
physical memory or from the guest's guest physical memory.
This will be used in the two-dimensional page table walker.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8ec3547..08bc383 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -657,6 +657,9 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu);
+int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+			    gfn_t gfn, void *data, int offset, int len,
+			    u32 access);
 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
 
 int kvm_pic_set_irq(void *opaque, int irq, int level);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4196fc7..a2efb70 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -370,6 +370,29 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
 EXPORT_SYMBOL_GPL(kvm_require_cpl);
 
 /*
+ * This function will be used to read from the physical memory of the currently
+ * running guest. The difference to kvm_read_guest_page is that this function
+ * can read from guest physical or from the guest's guest physical memory.
+ */
+int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+			    gfn_t ngfn, void *data, int offset, int len,
+			    u32 access)
+{
+	gfn_t real_gfn;
+	gpa_t ngpa;
+
+	ngpa     = gfn_to_gpa(ngfn);
+	real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
+	if (real_gfn == UNMAPPED_GVA)
+		return -EFAULT;
+
+	real_gfn = gpa_to_gfn(real_gfn);
+
+	return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
+
+/*
  * Load the pae pdptrs.  Return true is they are all valid.
  */
 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
-- 
cgit v0.10.2


From 2329d46d213d0721dafae18db29f54b196f11468 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:52 +0200
Subject: KVM: MMU: Make walk_addr_generic capable for two-level walking

This patch uses kvm_read_guest_page_tdp to make the
walk_addr_generic functions suitable for two-level page
table walking.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index eefe363..f4e09d3 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -124,6 +124,8 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	unsigned index, pt_access, uninitialized_var(pte_access);
 	gpa_t pte_gpa;
 	bool eperm, present, rsvd_fault;
+	int offset;
+	u32 access = 0;
 
 	trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
 				     fetch_fault);
@@ -153,12 +155,14 @@ walk:
 		index = PT_INDEX(addr, walker->level);
 
 		table_gfn = gpte_to_gfn(pte);
-		pte_gpa = gfn_to_gpa(table_gfn);
-		pte_gpa += index * sizeof(pt_element_t);
+		offset    = index * sizeof(pt_element_t);
+		pte_gpa   = gfn_to_gpa(table_gfn) + offset;
 		walker->table_gfn[walker->level - 1] = table_gfn;
 		walker->pte_gpa[walker->level - 1] = pte_gpa;
 
-		if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) {
+		if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte,
+					    offset, sizeof(pte),
+					    PFERR_USER_MASK|PFERR_WRITE_MASK)) {
 			present = false;
 			break;
 		}
@@ -209,15 +213,27 @@ walk:
 				is_large_pte(pte) &&
 				mmu->root_level == PT64_ROOT_LEVEL)) {
 			int lvl = walker->level;
+			gpa_t real_gpa;
+			gfn_t gfn;
 
-			walker->gfn = gpte_to_gfn_lvl(pte, lvl);
-			walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl))
-					>> PAGE_SHIFT;
+			gfn = gpte_to_gfn_lvl(pte, lvl);
+			gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
 
 			if (PTTYPE == 32 &&
 			    walker->level == PT_DIRECTORY_LEVEL &&
 			    is_cpuid_PSE36())
-				walker->gfn += pse36_gfn_delta(pte);
+				gfn += pse36_gfn_delta(pte);
+
+			access |= write_fault ? PFERR_WRITE_MASK : 0;
+			access |= fetch_fault ? PFERR_FETCH_MASK : 0;
+			access |= user_fault  ? PFERR_USER_MASK  : 0;
+
+			real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
+						      access);
+			if (real_gpa == UNMAPPED_GVA)
+				return 0;
+
+			walker->gfn = real_gpa >> PAGE_SHIFT;
 
 			break;
 		}
-- 
cgit v0.10.2


From 3d06b8bfd44ec421c386241f7c5af66c8200cbf4 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:53 +0200
Subject: KVM: MMU: Introduce kvm_read_nested_guest_page()

This patch introduces the kvm_read_guest_page_x86 function
which reads from the physical memory of the guest. If the
guest is running in guest-mode itself with nested paging
enabled it will read from the guest's guest physical memory
instead.
The patch also changes changes the code to use this function
where it is necessary.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a2efb70..46843ed 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -392,6 +392,13 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 }
 EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
 
+int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+			       void *data, int offset, int len, u32 access)
+{
+	return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
+				       data, offset, len, access);
+}
+
 /*
  * Load the pae pdptrs.  Return true is they are all valid.
  */
@@ -403,8 +410,9 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 	int ret;
 	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 
-	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
-				  offset * sizeof(u64), sizeof(pdpte));
+	ret = kvm_read_nested_guest_page(vcpu, pdpt_gfn, pdpte,
+					 offset * sizeof(u64), sizeof(pdpte),
+					 PFERR_USER_MASK|PFERR_WRITE_MASK);
 	if (ret < 0) {
 		ret = 0;
 		goto out;
@@ -433,6 +441,8 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 {
 	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 	bool changed = true;
+	int offset;
+	gfn_t gfn;
 	int r;
 
 	if (is_long_mode(vcpu) || !is_pae(vcpu))
@@ -442,7 +452,10 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 		      (unsigned long *)&vcpu->arch.regs_avail))
 		return true;
 
-	r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
+	gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT;
+	offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1);
+	r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
+				       PFERR_USER_MASK | PFERR_WRITE_MASK);
 	if (r < 0)
 		goto out;
 	changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
-- 
cgit v0.10.2


From 02f59dc9f1f51d2148d87d48f84adb455a4fd697 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:54 +0200
Subject: KVM: MMU: Introduce init_kvm_nested_mmu()

This patch introduces the init_kvm_nested_mmu() function
which is used to re-initialize the nested mmu when the l2
guest changes its paging mode.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1e215e8..a26f13b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2784,11 +2784,46 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 	return r;
 }
 
+static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
+
+	g_context->get_cr3           = get_cr3;
+	g_context->inject_page_fault = kvm_inject_page_fault;
+
+	/*
+	 * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
+	 * translation of l2_gpa to l1_gpa addresses is done using the
+	 * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
+	 * functions between mmu and nested_mmu are swapped.
+	 */
+	if (!is_paging(vcpu)) {
+		g_context->root_level = 0;
+		g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
+	} else if (is_long_mode(vcpu)) {
+		reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
+		g_context->root_level = PT64_ROOT_LEVEL;
+		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+	} else if (is_pae(vcpu)) {
+		reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
+		g_context->root_level = PT32E_ROOT_LEVEL;
+		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+	} else {
+		reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
+		g_context->root_level = PT32_ROOT_LEVEL;
+		g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
+	}
+
+	return 0;
+}
+
 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.update_pte.pfn = bad_pfn;
 
-	if (tdp_enabled)
+	if (mmu_is_nested(vcpu))
+		return init_kvm_nested_mmu(vcpu);
+	else if (tdp_enabled)
 		return init_kvm_tdp_mmu(vcpu);
 	else
 		return init_kvm_softmmu(vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 7086ca8..513abbb 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -47,6 +47,7 @@
 #define PFERR_USER_MASK (1U << 2)
 #define PFERR_RSVD_MASK (1U << 3)
 #define PFERR_FETCH_MASK (1U << 4)
+#define PFERR_NESTED_MASK (1U << 31)
 
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 46843ed..e4c76bf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3489,6 +3489,22 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
 	return gpa;
 }
 
+static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+{
+	gpa_t t_gpa;
+	u32 error;
+
+	BUG_ON(!mmu_is_nested(vcpu));
+
+	/* NPT walks are always user-walks */
+	access |= PFERR_USER_MASK;
+	t_gpa  = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error);
+	if (t_gpa == UNMAPPED_GVA)
+		vcpu->arch.fault.error_code |= PFERR_NESTED_MASK;
+
+	return t_gpa;
+}
+
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
@@ -5704,6 +5720,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 	vcpu->arch.mmu.translate_gpa = translate_gpa;
+	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
 	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 	else
-- 
cgit v0.10.2


From d4f8cf664e4c1fd579df6b6e6378335c9f79d790 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:55 +0200
Subject: KVM: MMU: Propagate the right fault back to the guest after
 gva_to_gpa

This patch implements logic to make sure that either a
page-fault/page-fault-vmexit or a nested-page-fault-vmexit
is propagated back to the guest.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 08bc383..574db6d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -660,6 +660,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu);
 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 			    gfn_t gfn, void *data, int offset, int len,
 			    u32 access);
+void kvm_propagate_fault(struct kvm_vcpu *vcpu);
 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
 
 int kvm_pic_set_irq(void *opaque, int irq, int level);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e4c76bf..0281d92 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -338,6 +338,22 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu)
 	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
 }
 
+void kvm_propagate_fault(struct kvm_vcpu *vcpu)
+{
+	u32 nested, error;
+
+	error   = vcpu->arch.fault.error_code;
+	nested  = error &  PFERR_NESTED_MASK;
+	error   = error & ~PFERR_NESTED_MASK;
+
+	vcpu->arch.fault.error_code = error;
+
+	if (mmu_is_nested(vcpu) && !nested)
+		vcpu->arch.nested_mmu.inject_page_fault(vcpu);
+	else
+		vcpu->arch.mmu.inject_page_fault(vcpu);
+}
+
 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.nmi_pending = 1;
@@ -4140,7 +4156,7 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
 {
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 	if (ctxt->exception == PF_VECTOR)
-		kvm_inject_page_fault(vcpu);
+		kvm_propagate_fault(vcpu);
 	else if (ctxt->error_code_valid)
 		kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
 	else
-- 
cgit v0.10.2


From d47f00a62b2e14b4a811b87bdb9ea1809693a377 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:56 +0200
Subject: KVM: X86: Propagate fetch faults

KVM currently ignores fetch faults in the instruction
emulator. With nested-npt we could have such faults. This
patch adds the code to handle these.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 2b08b78..aead72e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1198,6 +1198,9 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
 	*(unsigned long *)dest =
 		(ctxt->eflags & ~change_mask) | (val & change_mask);
 
+	if (rc == X86EMUL_PROPAGATE_FAULT)
+		emulate_pf(ctxt);
+
 	return rc;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0281d92..3101060 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4247,6 +4247,9 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		vcpu->arch.emulate_ctxt.perm_ok = false;
 
 		r = x86_decode_insn(&vcpu->arch.emulate_ctxt);
+		if (r == X86EMUL_PROPAGATE_FAULT)
+			goto done;
+
 		trace_kvm_emulate_insn_start(vcpu);
 
 		/* Only allow emulation of specific instructions on #UD
@@ -4305,6 +4308,7 @@ restart:
 		return handle_emulation_failure(vcpu);
 	}
 
+done:
 	if (vcpu->arch.emulate_ctxt.exception >= 0) {
 		inject_emulated_exception(vcpu);
 		r = EMULATE_DONE;
-- 
cgit v0.10.2


From ff03a073e715d49b5cfeeec862649b1df2481ae0 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:57 +0200
Subject: KVM: MMU: Add kvm_mmu parameter to load_pdptrs function

This function need to be able to load the pdptrs from any
mmu context currently in use. So change this function to
take an kvm_mmu parameter to fit these needs.
As a side effect this patch also moves the cached pdptrs
from vcpu_arch into the kvm_mmu struct.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 574db6d..9e70de3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -257,6 +257,8 @@ struct kvm_mmu {
 
 	u64 *pae_root;
 	u64 rsvd_bits_mask[2][4];
+
+	u64 pdptrs[4]; /* pae */
 };
 
 struct kvm_vcpu_arch {
@@ -276,7 +278,6 @@ struct kvm_vcpu_arch {
 	unsigned long cr4_guest_owned_bits;
 	unsigned long cr8;
 	u32 hflags;
-	u64 pdptrs[4]; /* pae */
 	u64 efer;
 	u64 apic_base;
 	struct kvm_lapic *apic;    /* kernel irqchip context */
@@ -592,7 +593,7 @@ void kvm_mmu_zap_all(struct kvm *kvm);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
 
-int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
+int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
 
 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 			  const void *val, int bytes);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 6491ac8..a37abe2 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -42,7 +42,7 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
 		      (unsigned long *)&vcpu->arch.regs_avail))
 		kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
 
-	return vcpu->arch.pdptrs[index];
+	return vcpu->arch.walk_mmu->pdptrs[index];
 }
 
 static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 53c9039..ca711cb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1010,7 +1010,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 	switch (reg) {
 	case VCPU_EXREG_PDPTR:
 		BUG_ON(!npt_enabled);
-		load_pdptrs(vcpu, vcpu->arch.cr3);
+		load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
 		break;
 	default:
 		BUG();
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ff7a8d4..1a7691a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1842,20 +1842,20 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
 		return;
 
 	if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
-		vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
-		vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
-		vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
-		vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]);
+		vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]);
+		vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]);
+		vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]);
+		vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]);
 	}
 }
 
 static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
 {
 	if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
-		vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
-		vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
-		vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
-		vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
+		vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
+		vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
+		vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
+		vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
 	}
 
 	__set_bit(VCPU_EXREG_PDPTR,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3101060..bbd9f4a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -418,17 +418,17 @@ int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
 /*
  * Load the pae pdptrs.  Return true is they are all valid.
  */
-int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
+int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
 {
 	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
 	int i;
 	int ret;
-	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
 
-	ret = kvm_read_nested_guest_page(vcpu, pdpt_gfn, pdpte,
-					 offset * sizeof(u64), sizeof(pdpte),
-					 PFERR_USER_MASK|PFERR_WRITE_MASK);
+	ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
+				      offset * sizeof(u64), sizeof(pdpte),
+				      PFERR_USER_MASK|PFERR_WRITE_MASK);
 	if (ret < 0) {
 		ret = 0;
 		goto out;
@@ -442,7 +442,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 	}
 	ret = 1;
 
-	memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
+	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
 	__set_bit(VCPU_EXREG_PDPTR,
 		  (unsigned long *)&vcpu->arch.regs_avail);
 	__set_bit(VCPU_EXREG_PDPTR,
@@ -455,7 +455,7 @@ EXPORT_SYMBOL_GPL(load_pdptrs);
 
 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 {
-	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+	u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
 	bool changed = true;
 	int offset;
 	gfn_t gfn;
@@ -474,7 +474,7 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 				       PFERR_USER_MASK | PFERR_WRITE_MASK);
 	if (r < 0)
 		goto out;
-	changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
+	changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
 out:
 
 	return changed;
@@ -513,7 +513,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 				return 1;
 		} else
 #endif
-		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3))
+		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
+						 vcpu->arch.cr3))
 			return 1;
 	}
 
@@ -602,7 +603,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 			return 1;
 	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
 		   && ((cr4 ^ old_cr4) & pdptr_bits)
-		   && !load_pdptrs(vcpu, vcpu->arch.cr3))
+		   && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))
 		return 1;
 
 	if (cr4 & X86_CR4_VMXE)
@@ -635,7 +636,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 		if (is_pae(vcpu)) {
 			if (cr3 & CR3_PAE_RESERVED_BITS)
 				return 1;
-			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3))
+			if (is_paging(vcpu) &&
+			    !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
 				return 1;
 		}
 		/*
@@ -5422,7 +5424,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
 	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
 	if (!is_long_mode(vcpu) && is_pae(vcpu)) {
-		load_pdptrs(vcpu, vcpu->arch.cr3);
+		load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
 		mmu_reset_needed = 1;
 	}
 
-- 
cgit v0.10.2


From d41d1895eb856b5d1c82f3be106b7a3e75e4216b Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:58 +0200
Subject: KVM: MMU: Introduce kvm_pdptr_read_mmu

This function is implemented to load the pdptr pointers of
the currently running guest (l1 or l2 guest). Therefore it
takes care about the current paging mode and can read pdptrs
out of l2 guest physical memory.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index a37abe2..975bb45 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -45,6 +45,13 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
 	return vcpu->arch.walk_mmu->pdptrs[index];
 }
 
+static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index)
+{
+	load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu));
+
+	return mmu->pdptrs[index];
+}
+
 static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
 {
 	ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a26f13b..a25173a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2398,7 +2398,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 
 		ASSERT(!VALID_PAGE(root));
 		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
-			pdptr = kvm_pdptr_read(vcpu, i);
+			pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
 			if (!is_present_gpte(pdptr)) {
 				vcpu->arch.mmu.pae_root[i] = 0;
 				continue;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index f4e09d3..a28f09b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -137,7 +137,7 @@ walk:
 
 #if PTTYPE == 64
 	if (walker->level == PT32E_ROOT_LEVEL) {
-		pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
+		pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
 		trace_kvm_mmu_paging_element(pte, walker->level);
 		if (!is_present_gpte(pte)) {
 			present = false;
-- 
cgit v0.10.2


From 651dd37a9ce6fdacdcd75da86619c62111efcbc2 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:59 +0200
Subject: KVM: MMU: Refactor mmu_alloc_roots function

This patch factors out the direct-mapping paths of the
mmu_alloc_roots function into a seperate function. This
makes it a lot easier to avoid all the unnecessary checks
done in the shadow path which may break when running direct.
In fact, this patch already fixes a problem when running PAE
guests on a PAE shadow page table.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a25173a..9cd5a717e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2357,42 +2357,77 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
 	return ret;
 }
 
-static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
+static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_page *sp;
+	int i;
+
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+		spin_lock(&vcpu->kvm->mmu_lock);
+		kvm_mmu_free_some_pages(vcpu);
+		sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
+				      1, ACC_ALL, NULL);
+		++sp->root_count;
+		spin_unlock(&vcpu->kvm->mmu_lock);
+		vcpu->arch.mmu.root_hpa = __pa(sp->spt);
+	} else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
+		for (i = 0; i < 4; ++i) {
+			hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+			ASSERT(!VALID_PAGE(root));
+			spin_lock(&vcpu->kvm->mmu_lock);
+			kvm_mmu_free_some_pages(vcpu);
+			sp = kvm_mmu_get_page(vcpu, i << 30, i << 30,
+					      PT32_ROOT_LEVEL, 1, ACC_ALL,
+					      NULL);
+			root = __pa(sp->spt);
+			++sp->root_count;
+			spin_unlock(&vcpu->kvm->mmu_lock);
+			vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+			vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+		}
+	} else
+		BUG();
+
+	return 0;
+}
+
+static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 {
 	int i;
 	gfn_t root_gfn;
 	struct kvm_mmu_page *sp;
-	int direct = 0;
 	u64 pdptr;
 
 	root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
 
-	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+	if (mmu_check_root(vcpu, root_gfn))
+		return 1;
+
+	/*
+	 * Do we shadow a long mode page table? If so we need to
+	 * write-protect the guests page table root.
+	 */
+	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 
 		ASSERT(!VALID_PAGE(root));
-		if (mmu_check_root(vcpu, root_gfn))
-			return 1;
-		if (vcpu->arch.mmu.direct_map) {
-			direct = 1;
-			root_gfn = 0;
-		}
+
 		spin_lock(&vcpu->kvm->mmu_lock);
 		kvm_mmu_free_some_pages(vcpu);
-		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
-				      PT64_ROOT_LEVEL, direct,
-				      ACC_ALL, NULL);
+		sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
+				      0, ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
 		spin_unlock(&vcpu->kvm->mmu_lock);
 		vcpu->arch.mmu.root_hpa = root;
 		return 0;
 	}
-	direct = !is_paging(vcpu);
-
-	if (mmu_check_root(vcpu, root_gfn))
-		return 1;
 
+	/*
+	 * We shadow a 32 bit page table. This may be a legacy 2-level
+	 * or a PAE 3-level page table.
+	 */
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
@@ -2406,16 +2441,11 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 			root_gfn = pdptr >> PAGE_SHIFT;
 			if (mmu_check_root(vcpu, root_gfn))
 				return 1;
-		} else if (vcpu->arch.mmu.root_level == 0)
-			root_gfn = 0;
-		if (vcpu->arch.mmu.direct_map) {
-			direct = 1;
-			root_gfn = i << 30;
 		}
 		spin_lock(&vcpu->kvm->mmu_lock);
 		kvm_mmu_free_some_pages(vcpu);
 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
-				      PT32_ROOT_LEVEL, direct,
+				      PT32_ROOT_LEVEL, 0,
 				      ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
@@ -2427,6 +2457,14 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.mmu.direct_map)
+		return mmu_alloc_direct_roots(vcpu);
+	else
+		return mmu_alloc_shadow_roots(vcpu);
+}
+
 static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 {
 	int i;
-- 
cgit v0.10.2


From 81407ca553c0c852b8cd3f38f3ec362d307f829b Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:31:00 +0200
Subject: KVM: MMU: Allow long mode shadows for legacy page tables

Currently the KVM softmmu implementation can not shadow a 32
bit legacy or PAE page table with a long mode page table.
This is a required feature for nested paging emulation
because the nested page table must alway be in host format.
So this patch implements the missing pieces to allow long
mode page tables for page table types.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9e70de3..bd59b48 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -256,6 +256,7 @@ struct kvm_mmu {
 	bool direct_map;
 
 	u64 *pae_root;
+	u64 *lm_root;
 	u64 rsvd_bits_mask[2][4];
 
 	u64 pdptrs[4]; /* pae */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9cd5a717e..dd76765 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1504,6 +1504,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
 	iterator->addr = addr;
 	iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
 	iterator->level = vcpu->arch.mmu.shadow_root_level;
+
+	if (iterator->level == PT64_ROOT_LEVEL &&
+	    vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
+	    !vcpu->arch.mmu.direct_map)
+		--iterator->level;
+
 	if (iterator->level == PT32E_ROOT_LEVEL) {
 		iterator->shadow_addr
 			= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
@@ -2314,7 +2320,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
 	spin_lock(&vcpu->kvm->mmu_lock);
-	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
+	    (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
+	     vcpu->arch.mmu.direct_map)) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 
 		sp = page_header(root);
@@ -2394,10 +2402,10 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 
 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 {
-	int i;
-	gfn_t root_gfn;
 	struct kvm_mmu_page *sp;
-	u64 pdptr;
+	u64 pdptr, pm_mask;
+	gfn_t root_gfn;
+	int i;
 
 	root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
 
@@ -2426,8 +2434,13 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 
 	/*
 	 * We shadow a 32 bit page table. This may be a legacy 2-level
-	 * or a PAE 3-level page table.
+	 * or a PAE 3-level page table. In either case we need to be aware that
+	 * the shadow page table may be a PAE or a long mode page table.
 	 */
+	pm_mask = PT_PRESENT_MASK;
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
+		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
+
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
@@ -2451,9 +2464,35 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 		++sp->root_count;
 		spin_unlock(&vcpu->kvm->mmu_lock);
 
-		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+		vcpu->arch.mmu.pae_root[i] = root | pm_mask;
+		vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
 	}
-	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+
+	/*
+	 * If we shadow a 32 bit page table with a long mode page
+	 * table we enter this path.
+	 */
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+		if (vcpu->arch.mmu.lm_root == NULL) {
+			/*
+			 * The additional page necessary for this is only
+			 * allocated on demand.
+			 */
+
+			u64 *lm_root;
+
+			lm_root = (void*)get_zeroed_page(GFP_KERNEL);
+			if (lm_root == NULL)
+				return 1;
+
+			lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
+
+			vcpu->arch.mmu.lm_root = lm_root;
+		}
+
+		vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
+	}
+
 	return 0;
 }
 
@@ -2470,9 +2509,12 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 	int i;
 	struct kvm_mmu_page *sp;
 
+	if (vcpu->arch.mmu.direct_map)
+		return;
+
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
-	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 		sp = page_header(root);
 		mmu_sync_children(vcpu, sp);
@@ -3253,6 +3295,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
 static void free_mmu_pages(struct kvm_vcpu *vcpu)
 {
 	free_page((unsigned long)vcpu->arch.mmu.pae_root);
+	if (vcpu->arch.mmu.lm_root != NULL)
+		free_page((unsigned long)vcpu->arch.mmu.lm_root);
 }
 
 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
-- 
cgit v0.10.2


From 2d48a985c7bbcd72b4e92e301ea96bf1252ffc61 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:31:01 +0200
Subject: KVM: MMU: Track NX state in struct kvm_mmu

With Nested Paging emulation the NX state between the two
MMU contexts may differ. To make sure that always the right
fault error code is recorded this patch moves the NX state
into struct kvm_mmu so that the code can distinguish between
L1 and L2 NX state.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bd59b48..b43686a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -259,6 +259,8 @@ struct kvm_mmu {
 	u64 *lm_root;
 	u64 rsvd_bits_mask[2][4];
 
+	bool nx;
+
 	u64 pdptrs[4]; /* pae */
 };
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index dd76765..95cbeed 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2634,6 +2634,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = true;
+	context->nx = false;
 	return 0;
 }
 
@@ -2687,7 +2688,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
 	int maxphyaddr = cpuid_maxphyaddr(vcpu);
 	u64 exb_bit_rsvd = 0;
 
-	if (!is_nx(vcpu))
+	if (!context->nx)
 		exb_bit_rsvd = rsvd_bits(63, 63);
 	switch (level) {
 	case PT32_ROOT_LEVEL:
@@ -2746,6 +2747,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
 					struct kvm_mmu *context,
 					int level)
 {
+	context->nx = is_nx(vcpu);
+
 	reset_rsvds_bits_mask(vcpu, context, level);
 
 	ASSERT(is_pae(vcpu));
@@ -2772,6 +2775,8 @@ static int paging64_init_context(struct kvm_vcpu *vcpu,
 static int paging32_init_context(struct kvm_vcpu *vcpu,
 				 struct kvm_mmu *context)
 {
+	context->nx = false;
+
 	reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
 
 	context->new_cr3 = paging_new_cr3;
@@ -2810,19 +2815,24 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
 	context->get_cr3 = get_cr3;
 	context->inject_page_fault = kvm_inject_page_fault;
+	context->nx = is_nx(vcpu);
 
 	if (!is_paging(vcpu)) {
+		context->nx = false;
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
 		context->root_level = 0;
 	} else if (is_long_mode(vcpu)) {
+		context->nx = is_nx(vcpu);
 		reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
 		context->gva_to_gpa = paging64_gva_to_gpa;
 		context->root_level = PT64_ROOT_LEVEL;
 	} else if (is_pae(vcpu)) {
+		context->nx = is_nx(vcpu);
 		reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
 		context->gva_to_gpa = paging64_gva_to_gpa;
 		context->root_level = PT32E_ROOT_LEVEL;
 	} else {
+		context->nx = false;
 		reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
 		context->gva_to_gpa = paging32_gva_to_gpa;
 		context->root_level = PT32_ROOT_LEVEL;
@@ -2878,17 +2888,21 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 	 * functions between mmu and nested_mmu are swapped.
 	 */
 	if (!is_paging(vcpu)) {
+		g_context->nx = false;
 		g_context->root_level = 0;
 		g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
 	} else if (is_long_mode(vcpu)) {
+		g_context->nx = is_nx(vcpu);
 		reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
 		g_context->root_level = PT64_ROOT_LEVEL;
 		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
 	} else if (is_pae(vcpu)) {
+		g_context->nx = is_nx(vcpu);
 		reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
 		g_context->root_level = PT32E_ROOT_LEVEL;
 		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
 	} else {
+		g_context->nx = false;
 		reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
 		g_context->root_level = PT32_ROOT_LEVEL;
 		g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a28f09b..2bdd843 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -105,7 +105,7 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
 
 	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
 #if PTTYPE == 64
-	if (is_nx(vcpu))
+	if (vcpu->arch.mmu.nx)
 		access &= ~(gpte >> PT64_NX_SHIFT);
 #endif
 	return access;
@@ -272,7 +272,7 @@ error:
 		walker->error_code |= PFERR_WRITE_MASK;
 	if (user_fault)
 		walker->error_code |= PFERR_USER_MASK;
-	if (fetch_fault && is_nx(vcpu))
+	if (fetch_fault && mmu->nx)
 		walker->error_code |= PFERR_FETCH_MASK;
 	if (rsvd_fault)
 		walker->error_code |= PFERR_RSVD_MASK;
-- 
cgit v0.10.2


From 5bd2edc341d11af175e759a546e4335ba3e0584f Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:31:02 +0200
Subject: KVM: SVM: Implement MMU helper functions for Nested Nested Paging

This patch adds the helper functions which will be used in
the mmu context for handling nested nested page faults.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ca711cb..9a9a440 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -104,6 +104,8 @@ struct nested_state {
 	u32 intercept_exceptions;
 	u64 intercept;
 
+	/* Nested Paging related state */
+	u64 nested_cr3;
 };
 
 #define MSRPM_OFFSETS	16
@@ -1600,6 +1602,34 @@ static int vmmcall_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
+static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	return svm->nested.nested_cr3;
+}
+
+static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
+				   unsigned long root)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	svm->vmcb->control.nested_cr3 = root;
+	force_new_asid(vcpu);
+}
+
+static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	svm->vmcb->control.exit_code = SVM_EXIT_NPF;
+	svm->vmcb->control.exit_code_hi = 0;
+	svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code;
+	svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address;
+
+	nested_svm_vmexit(svm);
+}
+
 static int nested_svm_check_permissions(struct vcpu_svm *svm)
 {
 	if (!(svm->vcpu.arch.efer & EFER_SVME)
-- 
cgit v0.10.2


From 4b16184c1ccafa4b0c188c622ea532fb90e6f5b0 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:31:03 +0200
Subject: KVM: SVM: Initialize Nested Nested MMU context on VMRUN

This patch adds code to initialize the Nested Nested Paging
MMU context when the L1 guest executes a VMRUN instruction
and has nested paging enabled in its VMCB.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 95cbeed..6e248d8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2962,6 +2962,7 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
 	mmu_free_roots(vcpu);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_unload);
 
 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 				  struct kvm_mmu_page *sp,
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9a9a440..3184772 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -294,6 +294,15 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
 	force_new_asid(vcpu);
 }
 
+static int get_npt_level(void)
+{
+#ifdef CONFIG_X86_64
+	return PT64_ROOT_LEVEL;
+#else
+	return PT32E_ROOT_LEVEL;
+#endif
+}
+
 static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
 	vcpu->arch.efer = efer;
@@ -1630,6 +1639,26 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu)
 	nested_svm_vmexit(svm);
 }
 
+static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+	int r;
+
+	r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
+
+	vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;
+	vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3;
+	vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
+	vcpu->arch.mmu.shadow_root_level = get_npt_level();
+	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
+
+	return r;
+}
+
+static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+}
+
 static int nested_svm_check_permissions(struct vcpu_svm *svm)
 {
 	if (!(svm->vcpu.arch.efer & EFER_SVME)
@@ -1998,6 +2027,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	kvm_clear_exception_queue(&svm->vcpu);
 	kvm_clear_interrupt_queue(&svm->vcpu);
 
+	svm->nested.nested_cr3 = 0;
+
 	/* Restore selected save entries */
 	svm->vmcb->save.es = hsave->save.es;
 	svm->vmcb->save.cs = hsave->save.cs;
@@ -2024,6 +2055,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 
 	nested_svm_unmap(page);
 
+	nested_svm_uninit_mmu_context(&svm->vcpu);
 	kvm_mmu_reset_context(&svm->vcpu);
 	kvm_mmu_load(&svm->vcpu);
 
@@ -2071,6 +2103,9 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
 	if (vmcb->control.asid == 0)
 		return false;
 
+	if (vmcb->control.nested_ctl && !npt_enabled)
+		return false;
+
 	return true;
 }
 
@@ -2143,6 +2178,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	else
 		svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
 
+	if (nested_vmcb->control.nested_ctl) {
+		kvm_mmu_unload(&svm->vcpu);
+		svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
+		nested_svm_init_mmu_context(&svm->vcpu);
+	}
+
 	/* Load the nested guest state */
 	svm->vmcb->save.es = nested_vmcb->save.es;
 	svm->vmcb->save.cs = nested_vmcb->save.cs;
@@ -3415,15 +3456,6 @@ static bool svm_cpu_has_accelerated_tpr(void)
 	return false;
 }
 
-static int get_npt_level(void)
-{
-#ifdef CONFIG_X86_64
-	return PT64_ROOT_LEVEL;
-#else
-	return PT32E_ROOT_LEVEL;
-#endif
-}
-
 static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
 	return 0;
-- 
cgit v0.10.2


From 55c5e464fcc28ee763d40561abf2b259131dd703 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:31:04 +0200
Subject: KVM: SVM: Expect two more candiates for exit_int_info

This patch adds INTR and NMI intercepts to the list of
expected intercepts with an exit_int_info set. While this
can't happen on bare metal it is architectural legal and may
happen with KVMs SVM emulation.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 3184772..de1930e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2991,7 +2991,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 
 	if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
 	    exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
-	    exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH)
+	    exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
+	    exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
 		printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
 		       "exit_code 0x%x\n",
 		       __func__, svm->vmcb->control.exit_int_info,
-- 
cgit v0.10.2


From 3d4aeaad8bb8f8084a414819934b73ab49c26c92 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:31:05 +0200
Subject: KVM: SVM: Report Nested Paging support to userspace

This patch implements the reporting of the nested paging
feature support to userspace.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index de1930e..36e6c88 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3481,6 +3481,10 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 		if (svm_has(SVM_FEATURE_NRIP))
 			entry->edx |= SVM_FEATURE_NRIP;
 
+		/* Support NPT for the guest if enabled */
+		if (npt_enabled)
+			entry->edx |= SVM_FEATURE_NPT;
+
 		break;
 	}
 }
-- 
cgit v0.10.2


From 4c62a2dc92518c5adf434df8e5c2283c6762672a Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:31:06 +0200
Subject: KVM: X86: Report SVM bit to userspace only when supported

This patch fixes a bug in KVM where it _always_ reports the
support of the SVM feature to userspace. But KVM only
supports SVM on AMD hardware and only when it is enabled in
the kernel module. This patch fixes the wrong reporting.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 36e6c88..e0f4da0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3469,6 +3469,10 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
 static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 {
 	switch (func) {
+	case 0x80000001:
+		if (nested)
+			entry->ecx |= (1 << 2); /* Set SVM bit */
+		break;
 	case 0x8000000A:
 		entry->eax = 1; /* SVM revision 1 */
 		entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bbd9f4a..3ff0a8f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2209,7 +2209,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX);
 	/* cpuid 0x80000001.ecx */
 	const u32 kvm_supported_word6_x86_features =
-		F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
+		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
 		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
 		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
 		0 /* SKINIT */ | 0 /* WDT */;
-- 
cgit v0.10.2


From c39cbd2a0012334714409eec1e9cf4d542e359e9 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 12 Sep 2010 16:39:11 +0200
Subject: KVM: Document that KVM_GET_SUPPORTED_CPUID may return emulated values

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 24d6341..b336266 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -1042,8 +1042,9 @@ number is just right, the 'nent' field is adjusted to the number of valid
 entries in the 'entries' array, which is then filled.
 
 The entries returned are the host cpuid as returned by the cpuid instruction,
-with unknown or unsupported features masked out.  The fields in each entry
-are defined as follows:
+with unknown or unsupported features masked out.  Some features (for example,
+x2apic), may not be present in the host cpu, but are exposed by kvm if it can
+emulate them efficiently. The fields in each entry are defined as follows:
 
   function: the eax value used to obtain the entry
   index: the ecx value used to obtain the entry (for entries that are
-- 
cgit v0.10.2


From b0bc3ee2b54fcea0df42cc9aa05103b1ccd89db0 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 13 Sep 2010 16:45:28 +0200
Subject: KVM: MMU: Fix regression with ept memory types merged into non-ept
 page tables

Commit "KVM: MMU: Make tdp_enabled a mmu-context parameter" made real-mode
set ->direct_map, and changed the code that merges in the memory type depend
on direct_map instead of tdp_enabled.  However, in this case what really
matters is tdp, not direct_map, since tdp changes the pte format regardless
of whether the mapping is direct or not.

As a result, real-mode shadow mappings got corrupted with ept memory types.
The result was a huge slowdown, likely due to the cache being disabled.

Change it back as the simplest fix for the regression (real fix is to move
all that to vmx code, and not use tdp_enabled as a synonym for ept).

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6e248d8..3ce56bf 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1980,7 +1980,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		spte |= shadow_user_mask;
 	if (level > PT_PAGE_TABLE_LEVEL)
 		spte |= PT_PAGE_SIZE_MASK;
-	if (vcpu->arch.mmu.direct_map)
+	if (tdp_enabled)
 		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
 			kvm_is_mmio_pfn(pfn));
 
-- 
cgit v0.10.2


From 3842d135ff246b6543f1df77f5600e12094a6845 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 27 Jul 2010 12:30:24 +0300
Subject: KVM: Check for pending events before attempting injection

Instead of blindly attempting to inject an event before each guest entry,
check for a possible event first in vcpu->requests.  Sites that can trigger
event injection are modified to set KVM_REQ_EVENT:

- interrupt, nmi window opening
- ppr updates
- i8259 output changes
- local apic irr changes
- rflags updates
- gif flag set
- event set on exit

This improves non-injecting entry performance, and sets the stage for
non-atomic injection.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 6e77471..ab1bb8f 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -67,6 +67,7 @@ static void pic_unlock(struct kvm_pic *s)
 		if (!found)
 			return;
 
+		kvm_make_request(KVM_REQ_EVENT, found);
 		kvm_vcpu_kick(found);
 	}
 }
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 77d8c0f..c6f2f15 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -259,9 +259,10 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
 
 static void apic_update_ppr(struct kvm_lapic *apic)
 {
-	u32 tpr, isrv, ppr;
+	u32 tpr, isrv, ppr, old_ppr;
 	int isr;
 
+	old_ppr = apic_get_reg(apic, APIC_PROCPRI);
 	tpr = apic_get_reg(apic, APIC_TASKPRI);
 	isr = apic_find_highest_isr(apic);
 	isrv = (isr != -1) ? isr : 0;
@@ -274,7 +275,10 @@ static void apic_update_ppr(struct kvm_lapic *apic)
 	apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
 		   apic, ppr, isr, isrv);
 
-	apic_set_reg(apic, APIC_PROCPRI, ppr);
+	if (old_ppr != ppr) {
+		apic_set_reg(apic, APIC_PROCPRI, ppr);
+		kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+	}
 }
 
 static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
@@ -391,6 +395,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 			break;
 		}
 
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
 		kvm_vcpu_kick(vcpu);
 		break;
 
@@ -416,6 +421,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 				       "INIT on a runnable vcpu %d\n",
 				       vcpu->vcpu_id);
 			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+			kvm_make_request(KVM_REQ_EVENT, vcpu);
 			kvm_vcpu_kick(vcpu);
 		} else {
 			apic_debug("Ignoring de-assert INIT to vcpu %d\n",
@@ -430,6 +436,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 			result = 1;
 			vcpu->arch.sipi_vector = vector;
 			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
+			kvm_make_request(KVM_REQ_EVENT, vcpu);
 			kvm_vcpu_kick(vcpu);
 		}
 		break;
@@ -475,6 +482,7 @@ static void apic_set_eoi(struct kvm_lapic *apic)
 		trigger_mode = IOAPIC_EDGE_TRIG;
 	if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
 		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
 }
 
 static void apic_send_ipi(struct kvm_lapic *apic)
@@ -1152,6 +1160,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
 	update_divide_count(apic);
 	start_apic_timer(apic);
 	apic->irr_pending = true;
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e0f4da0..1d2ea65 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2371,6 +2371,7 @@ static int stgi_interception(struct vcpu_svm *svm)
 
 	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
 	skip_emulated_instruction(&svm->vcpu);
+	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
 
 	enable_gif(svm);
 
@@ -2763,6 +2764,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
 {
 	struct kvm_run *kvm_run = svm->vcpu.run;
 
+	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
 	svm_clear_vintr(svm);
 	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
 	/*
@@ -3209,8 +3211,10 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 
 	svm->int3_injected = 0;
 
-	if (svm->vcpu.arch.hflags & HF_IRET_MASK)
+	if (svm->vcpu.arch.hflags & HF_IRET_MASK) {
 		svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+		kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+	}
 
 	svm->vcpu.arch.nmi_injected = false;
 	kvm_clear_exception_queue(&svm->vcpu);
@@ -3219,6 +3223,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 	if (!(exitintinfo & SVM_EXITINTINFO_VALID))
 		return;
 
+	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+
 	vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
 	type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1a7691a..2ce2e0b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3327,6 +3327,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
 
 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
 {
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	return 1;
 }
 
@@ -3339,6 +3340,8 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
 	cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
 
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
 	++vcpu->stat.irq_window_exits;
 
 	/*
@@ -3595,6 +3598,7 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
 	cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
 	++vcpu->stat.nmi_window_exits;
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
 	return 1;
 }
@@ -3828,6 +3832,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	if (!idtv_info_valid)
 		return;
 
+	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
+
 	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
 	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3ff0a8f..e719803 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -284,6 +284,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 	u32 prev_nr;
 	int class1, class2;
 
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
 	if (!vcpu->arch.exception.pending) {
 	queue:
 		vcpu->arch.exception.pending = true;
@@ -356,6 +358,7 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu)
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 {
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	vcpu->arch.nmi_pending = 1;
 }
 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
@@ -2418,6 +2421,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 		return -ENXIO;
 
 	kvm_queue_interrupt(vcpu, irq->irq, false);
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
 	return 0;
 }
@@ -2571,6 +2575,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
 		vcpu->arch.sipi_vector = events->sipi_vector;
 
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
 	return 0;
 }
 
@@ -4329,6 +4335,7 @@ done:
 
 	toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
 	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
 	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
 
@@ -4998,6 +5005,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	int r;
 	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
 		vcpu->run->request_interrupt_window;
+	bool req_event;
 
 	if (vcpu->requests) {
 		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
@@ -5045,8 +5053,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	local_irq_disable();
 
+	req_event = kvm_check_request(KVM_REQ_EVENT, vcpu);
+
 	if (!atomic_read(&vcpu->guest_mode) || vcpu->requests
 	    || need_resched() || signal_pending(current)) {
+		if (req_event)
+			kvm_make_request(KVM_REQ_EVENT, vcpu);
 		atomic_set(&vcpu->guest_mode, 0);
 		smp_wmb();
 		local_irq_enable();
@@ -5055,17 +5067,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		goto out;
 	}
 
-	inject_pending_event(vcpu);
+	if (req_event || req_int_win) {
+		inject_pending_event(vcpu);
 
-	/* enable NMI/IRQ window open exits if needed */
-	if (vcpu->arch.nmi_pending)
-		kvm_x86_ops->enable_nmi_window(vcpu);
-	else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
-		kvm_x86_ops->enable_irq_window(vcpu);
+		/* enable NMI/IRQ window open exits if needed */
+		if (vcpu->arch.nmi_pending)
+			kvm_x86_ops->enable_nmi_window(vcpu);
+		else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+			kvm_x86_ops->enable_irq_window(vcpu);
 
-	if (kvm_lapic_enabled(vcpu)) {
-		update_cr8_intercept(vcpu);
-		kvm_lapic_sync_to_vapic(vcpu);
+		if (kvm_lapic_enabled(vcpu)) {
+			update_cr8_intercept(vcpu);
+			kvm_lapic_sync_to_vapic(vcpu);
+		}
 	}
 
 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
@@ -5305,6 +5319,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 
 	vcpu->arch.exception.pending = false;
 
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
 	return 0;
 }
 
@@ -5368,6 +5384,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
 	vcpu->arch.mp_state = mp_state->mp_state;
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	return 0;
 }
 
@@ -5389,6 +5406,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
 	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
 	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	return EMULATE_DONE;
 }
 EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -5459,6 +5477,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	    !is_protmode(vcpu))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
 	return 0;
 }
 
@@ -5691,6 +5711,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 	vcpu->arch.dr6 = DR6_FIXED_1;
 	vcpu->arch.dr7 = DR7_FIXED_1;
 
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
 	return kvm_x86_ops->vcpu_reset(vcpu);
 }
 
@@ -6001,6 +6023,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 	    kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
 		rflags |= X86_EFLAGS_TF;
 	kvm_x86_ops->set_rflags(vcpu, rflags);
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_set_rflags);
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 917e68f..6022da1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -39,6 +39,7 @@
 #define KVM_REQ_KVMCLOCK_UPDATE    8
 #define KVM_REQ_KICK               9
 #define KVM_REQ_DEACTIVATE_FPU    10
+#define KVM_REQ_EVENT             11
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID	0
 
-- 
cgit v0.10.2


From 51aa01d13d4a64422cf8095205fc4a02322aca2c Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 20 Jul 2010 14:31:20 +0300
Subject: KVM: VMX: Split up vmx_complete_interrupts()

vmx_complete_interrupts() does too much, split it up:
 - vmx_vcpu_run() gets the "cache important vmcs fields" part
 - a new vmx_complete_atomic_exit() gets the parts that must be done atomically
 - a new vmx_recover_nmi_blocking() does what its name says
 - vmx_complete_interrupts() retains the event injection recovery code

This helps in reducing the work done in atomic context.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2ce2e0b..927d840 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -125,6 +125,7 @@ struct vcpu_vmx {
 	unsigned long         host_rsp;
 	int                   launched;
 	u8                    fail;
+	u32                   exit_intr_info;
 	u32                   idt_vectoring_info;
 	struct shared_msr_entry *guest_msrs;
 	int                   nmsrs;
@@ -3775,18 +3776,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 	vmcs_write32(TPR_THRESHOLD, irr);
 }
 
-static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
+static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
 {
-	u32 exit_intr_info;
-	u32 idt_vectoring_info = vmx->idt_vectoring_info;
-	bool unblock_nmi;
-	u8 vector;
-	int type;
-	bool idtv_info_valid;
-
-	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
-	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+	u32 exit_intr_info = vmx->exit_intr_info;
 
 	/* Handle machine checks before interrupts are enabled */
 	if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
@@ -3801,8 +3793,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 		asm("int $2");
 		kvm_after_handle_nmi(&vmx->vcpu);
 	}
+}
 
-	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
+{
+	u32 exit_intr_info = vmx->exit_intr_info;
+	bool unblock_nmi;
+	u8 vector;
+	bool idtv_info_valid;
+
+	idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 
 	if (cpu_has_virtual_nmis()) {
 		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
@@ -3824,6 +3824,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	} else if (unlikely(vmx->soft_vnmi_blocked))
 		vmx->vnmi_blocked_time +=
 			ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
+}
+
+static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
+{
+	u32 idt_vectoring_info = vmx->idt_vectoring_info;
+	u8 vector;
+	int type;
+	bool idtv_info_valid;
+
+	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 
 	vmx->vcpu.arch.nmi_injected = false;
 	kvm_clear_exception_queue(&vmx->vcpu);
@@ -4036,6 +4046,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
 	vmx->launched = 1;
 
+	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+	vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+	vmx_complete_atomic_exit(vmx);
+	vmx_recover_nmi_blocking(vmx);
 	vmx_complete_interrupts(vmx);
 }
 
-- 
cgit v0.10.2


From 537b37e2674b7e4390a490e03cae53ca9ca99e30 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 22 Jul 2010 12:54:21 +0300
Subject: KVM: VMX: Move real-mode interrupt injection fixup to
 vmx_complete_interrupts()

This allows reuse of vmx_complete_interrupts() for cancelling injections.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 927d840..541f0d2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -182,6 +182,7 @@ static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
+static void fixup_rmode_irq(struct vcpu_vmx *vmx);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -3828,11 +3829,15 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 
 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 {
-	u32 idt_vectoring_info = vmx->idt_vectoring_info;
+	u32 idt_vectoring_info;
 	u8 vector;
 	int type;
 	bool idtv_info_valid;
 
+	if (vmx->rmode.irq.pending)
+		fixup_rmode_irq(vmx);
+
+	idt_vectoring_info = vmx->idt_vectoring_info;
 	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 
 	vmx->vcpu.arch.nmi_injected = false;
@@ -4040,8 +4045,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	vcpu->arch.regs_dirty = 0;
 
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-	if (vmx->rmode.irq.pending)
-		fixup_rmode_irq(vmx);
 
 	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
 	vmx->launched = 1;
-- 
cgit v0.10.2


From 83422e17c19d61399cab7dbf9bf40ff9af2a7dd2 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 20 Jul 2010 14:43:23 +0300
Subject: KVM: VMX: Parameterize vmx_complete_interrupts() for both exit and
 entry

Currently vmx_complete_interrupts() can decode event information from vmx
exit fields into the generic kvm event queues.  Make it able to decode
the information from the entry fields as well by parametrizing it.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 541f0d2..3237f6c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -182,7 +182,7 @@ static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
-static void fixup_rmode_irq(struct vcpu_vmx *vmx);
+static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -3827,17 +3827,18 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 			ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
 }
 
-static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
+static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
+				      u32 idt_vectoring_info,
+				      int instr_len_field,
+				      int error_code_field)
 {
-	u32 idt_vectoring_info;
 	u8 vector;
 	int type;
 	bool idtv_info_valid;
 
 	if (vmx->rmode.irq.pending)
-		fixup_rmode_irq(vmx);
+		fixup_rmode_irq(vmx, &idt_vectoring_info);
 
-	idt_vectoring_info = vmx->idt_vectoring_info;
 	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 
 	vmx->vcpu.arch.nmi_injected = false;
@@ -3865,18 +3866,18 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 		break;
 	case INTR_TYPE_SOFT_EXCEPTION:
 		vmx->vcpu.arch.event_exit_inst_len =
-			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+			vmcs_read32(instr_len_field);
 		/* fall through */
 	case INTR_TYPE_HARD_EXCEPTION:
 		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
-			u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+			u32 err = vmcs_read32(error_code_field);
 			kvm_queue_exception_e(&vmx->vcpu, vector, err);
 		} else
 			kvm_queue_exception(&vmx->vcpu, vector);
 		break;
 	case INTR_TYPE_SOFT_INTR:
 		vmx->vcpu.arch.event_exit_inst_len =
-			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+			vmcs_read32(instr_len_field);
 		/* fall through */
 	case INTR_TYPE_EXT_INTR:
 		kvm_queue_interrupt(&vmx->vcpu, vector,
@@ -3887,24 +3888,31 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	}
 }
 
+static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
+{
+	__vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
+				  VM_EXIT_INSTRUCTION_LEN,
+				  IDT_VECTORING_ERROR_CODE);
+}
+
 /*
  * Failure to inject an interrupt should give us the information
  * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
  * when fetching the interrupt redirection bitmap in the real-mode
  * tss, this doesn't happen.  So we do it ourselves.
  */
-static void fixup_rmode_irq(struct vcpu_vmx *vmx)
+static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info)
 {
 	vmx->rmode.irq.pending = 0;
 	if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
 		return;
 	kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
-	if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
-		vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
-		vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
+	if (*idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
+		*idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
+		*idt_vectoring_info |= INTR_TYPE_EXT_INTR;
 		return;
 	}
-	vmx->idt_vectoring_info =
+	*idt_vectoring_info =
 		VECTORING_INFO_VALID_MASK
 		| INTR_TYPE_EXT_INTR
 		| vmx->rmode.irq.vector;
-- 
cgit v0.10.2


From b463a6f744a263fccd7da14db1afdc880371a280 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 20 Jul 2010 15:06:17 +0300
Subject: KVM: Non-atomic interrupt injection

Change the interrupt injection code to work from preemptible, interrupts
enabled context.  This works by adding a ->cancel_injection() operation
that undoes an injection in case we were not able to actually enter the guest
(this condition could never happen with atomic injection).

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b43686a..80224bf 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -552,6 +552,7 @@ struct kvm_x86_ops {
 	void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code,
 				bool reinject);
+	void (*cancel_injection)(struct kvm_vcpu *vcpu);
 	int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
 	int (*nmi_allowed)(struct kvm_vcpu *vcpu);
 	bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1d2ea65..1a85fc5 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3261,6 +3261,17 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 	}
 }
 
+static void svm_cancel_injection(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct vmcb_control_area *control = &svm->vmcb->control;
+
+	control->exit_int_info = control->event_inj;
+	control->exit_int_info_err = control->event_inj_err;
+	control->event_inj = 0;
+	svm_complete_interrupts(svm);
+}
+
 #ifdef CONFIG_X86_64
 #define R "r"
 #else
@@ -3631,6 +3642,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.set_irq = svm_set_irq,
 	.set_nmi = svm_inject_nmi,
 	.queue_exception = svm_queue_exception,
+	.cancel_injection = svm_cancel_injection,
 	.interrupt_allowed = svm_interrupt_allowed,
 	.nmi_allowed = svm_nmi_allowed,
 	.get_nmi_mask = svm_get_nmi_mask,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3237f6c..70af3db 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3895,6 +3895,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 				  IDT_VECTORING_ERROR_CODE);
 }
 
+static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
+{
+	__vmx_complete_interrupts(to_vmx(vcpu),
+				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
+				  VM_ENTRY_INSTRUCTION_LEN,
+				  VM_ENTRY_EXCEPTION_ERROR_CODE);
+
+	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+}
+
 /*
  * Failure to inject an interrupt should give us the information
  * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
@@ -4348,6 +4358,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.set_irq = vmx_inject_irq,
 	.set_nmi = vmx_inject_nmi,
 	.queue_exception = vmx_queue_exception,
+	.cancel_injection = vmx_cancel_injection,
 	.interrupt_allowed = vmx_interrupt_allowed,
 	.nmi_allowed = vmx_nmi_allowed,
 	.get_nmi_mask = vmx_get_nmi_mask,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e719803..a465bd2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5005,7 +5005,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	int r;
 	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
 		vcpu->run->request_interrupt_window;
-	bool req_event;
 
 	if (vcpu->requests) {
 		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
@@ -5041,6 +5040,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (unlikely(r))
 		goto out;
 
+	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+		inject_pending_event(vcpu);
+
+		/* enable NMI/IRQ window open exits if needed */
+		if (vcpu->arch.nmi_pending)
+			kvm_x86_ops->enable_nmi_window(vcpu);
+		else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+			kvm_x86_ops->enable_irq_window(vcpu);
+
+		if (kvm_lapic_enabled(vcpu)) {
+			update_cr8_intercept(vcpu);
+			kvm_lapic_sync_to_vapic(vcpu);
+		}
+	}
+
 	preempt_disable();
 
 	kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -5053,35 +5067,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	local_irq_disable();
 
-	req_event = kvm_check_request(KVM_REQ_EVENT, vcpu);
-
 	if (!atomic_read(&vcpu->guest_mode) || vcpu->requests
 	    || need_resched() || signal_pending(current)) {
-		if (req_event)
-			kvm_make_request(KVM_REQ_EVENT, vcpu);
 		atomic_set(&vcpu->guest_mode, 0);
 		smp_wmb();
 		local_irq_enable();
 		preempt_enable();
+		kvm_x86_ops->cancel_injection(vcpu);
 		r = 1;
 		goto out;
 	}
 
-	if (req_event || req_int_win) {
-		inject_pending_event(vcpu);
-
-		/* enable NMI/IRQ window open exits if needed */
-		if (vcpu->arch.nmi_pending)
-			kvm_x86_ops->enable_nmi_window(vcpu);
-		else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
-			kvm_x86_ops->enable_irq_window(vcpu);
-
-		if (kvm_lapic_enabled(vcpu)) {
-			update_cr8_intercept(vcpu);
-			kvm_lapic_sync_to_vapic(vcpu);
-		}
-	}
-
 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 
 	kvm_guest_enter();
-- 
cgit v0.10.2


From 625831a3f40d330c611fe37cf501d80d611921f9 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 22 Jul 2010 13:09:54 +0300
Subject: KVM: VMX: Move fixup_rmode_irq() to avoid forward declaration

No code changes.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 70af3db..3231593 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -182,7 +182,6 @@ static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
-static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -3827,6 +3826,29 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 			ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
 }
 
+/*
+ * Failure to inject an interrupt should give us the information
+ * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
+ * when fetching the interrupt redirection bitmap in the real-mode
+ * tss, this doesn't happen.  So we do it ourselves.
+ */
+static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info)
+{
+	vmx->rmode.irq.pending = 0;
+	if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
+		return;
+	kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
+	if (*idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
+		*idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
+		*idt_vectoring_info |= INTR_TYPE_EXT_INTR;
+		return;
+	}
+	*idt_vectoring_info =
+		VECTORING_INFO_VALID_MASK
+		| INTR_TYPE_EXT_INTR
+		| vmx->rmode.irq.vector;
+}
+
 static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
 				      u32 idt_vectoring_info,
 				      int instr_len_field,
@@ -3905,29 +3927,6 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
 }
 
-/*
- * Failure to inject an interrupt should give us the information
- * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
- * when fetching the interrupt redirection bitmap in the real-mode
- * tss, this doesn't happen.  So we do it ourselves.
- */
-static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info)
-{
-	vmx->rmode.irq.pending = 0;
-	if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
-		return;
-	kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
-	if (*idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
-		*idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
-		*idt_vectoring_info |= INTR_TYPE_EXT_INTR;
-		return;
-	}
-	*idt_vectoring_info =
-		VECTORING_INFO_VALID_MASK
-		| INTR_TYPE_EXT_INTR
-		| vmx->rmode.irq.vector;
-}
-
 #ifdef CONFIG_X86_64
 #define R "r"
 #define Q "q"
-- 
cgit v0.10.2


From 0959ffacf39b1ae7f56072b0c64429ee528100ca Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 14 Sep 2010 17:46:12 +0200
Subject: KVM: MMU: Don't track nested fault info in error-code

This patch moves the detection whether a page-fault was
nested or not out of the error code and moves it into a
separate variable in the fault struct.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 80224bf..519d6f7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -322,6 +322,7 @@ struct kvm_vcpu_arch {
 	struct {
 		u64      address;
 		unsigned error_code;
+		bool     nested;
 	} fault;
 
 	/* only needed in kvm_pv_mmu_op() path, but it's hot so
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 513abbb..7086ca8 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -47,7 +47,6 @@
 #define PFERR_USER_MASK (1U << 2)
 #define PFERR_RSVD_MASK (1U << 3)
 #define PFERR_FETCH_MASK (1U << 4)
-#define PFERR_NESTED_MASK (1U << 31)
 
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a465bd2..a51635e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -342,18 +342,12 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu)
 
 void kvm_propagate_fault(struct kvm_vcpu *vcpu)
 {
-	u32 nested, error;
-
-	error   = vcpu->arch.fault.error_code;
-	nested  = error &  PFERR_NESTED_MASK;
-	error   = error & ~PFERR_NESTED_MASK;
-
-	vcpu->arch.fault.error_code = error;
-
-	if (mmu_is_nested(vcpu) && !nested)
+	if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested)
 		vcpu->arch.nested_mmu.inject_page_fault(vcpu);
 	else
 		vcpu->arch.mmu.inject_page_fault(vcpu);
+
+	vcpu->arch.fault.nested = false;
 }
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
@@ -3524,7 +3518,7 @@ static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
 	access |= PFERR_USER_MASK;
 	t_gpa  = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error);
 	if (t_gpa == UNMAPPED_GVA)
-		vcpu->arch.fault.error_code |= PFERR_NESTED_MASK;
+		vcpu->arch.fault.nested = true;
 
 	return t_gpa;
 }
-- 
cgit v0.10.2


From 28e4639adf0c9f26f6bb56149b7ab547bf33bb95 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@redhat.com>
Date: Sat, 18 Sep 2010 14:38:12 -1000
Subject: KVM: x86: Fix kvmclock bug

If preempted after kvmclock values are updated, but before hardware
virtualization is entered, the last tsc time as read by the guest is
never set.  It underflows the next time kvmclock is updated if there
has not yet been a successful entry / exit into hardware virt.

Fix this by simply setting last_tsc to the newly read tsc value so
that any computed nsec advance of kvmclock is nulled.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a51635e..0b021e1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1095,6 +1095,7 @@ static int kvm_write_guest_time(struct kvm_vcpu *v)
 	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
 	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
 	vcpu->last_kernel_ns = kernel_ns;
+	vcpu->last_guest_tsc = tsc_timestamp;
 	vcpu->hv_clock.flags = 0;
 
 	/*
-- 
cgit v0.10.2


From f4f510508741680e423524c222f615276ca6222c Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 19 Sep 2010 18:44:07 +0200
Subject: KVM: Convert PIC lock from raw spinlock to ordinary spinlock

The PIC code used to be called from preempt_disable() context, which
wasn't very good for PREEMPT_RT.  That is no longer the case, so move
back from raw_spinlock_t to spinlock_t.

Signed-off-by: Avi Kivity <avi@redhat.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index ab1bb8f..dd54c5b 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -39,7 +39,7 @@ static void pic_irq_request(struct kvm *kvm, int level);
 static void pic_lock(struct kvm_pic *s)
 	__acquires(&s->lock)
 {
-	raw_spin_lock(&s->lock);
+	spin_lock(&s->lock);
 }
 
 static void pic_unlock(struct kvm_pic *s)
@@ -51,7 +51,7 @@ static void pic_unlock(struct kvm_pic *s)
 
 	s->wakeup_needed = false;
 
-	raw_spin_unlock(&s->lock);
+	spin_unlock(&s->lock);
 
 	if (wakeup) {
 		kvm_for_each_vcpu(i, vcpu, s->kvm) {
@@ -569,7 +569,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
 	s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
 	if (!s)
 		return NULL;
-	raw_spin_lock_init(&s->lock);
+	spin_lock_init(&s->lock);
 	s->kvm = kvm;
 	s->pics[0].elcr_mask = 0xf8;
 	s->pics[1].elcr_mask = 0xde;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 63c3145..ba910d1 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -60,7 +60,7 @@ struct kvm_kpic_state {
 };
 
 struct kvm_pic {
-	raw_spinlock_t lock;
+	spinlock_t lock;
 	bool wakeup_needed;
 	unsigned pending_acks;
 	struct kvm *kvm;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0b021e1..3adf692 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3011,18 +3011,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 	r = 0;
 	switch (chip->chip_id) {
 	case KVM_IRQCHIP_PIC_MASTER:
-		raw_spin_lock(&pic_irqchip(kvm)->lock);
+		spin_lock(&pic_irqchip(kvm)->lock);
 		memcpy(&pic_irqchip(kvm)->pics[0],
 			&chip->chip.pic,
 			sizeof(struct kvm_pic_state));
-		raw_spin_unlock(&pic_irqchip(kvm)->lock);
+		spin_unlock(&pic_irqchip(kvm)->lock);
 		break;
 	case KVM_IRQCHIP_PIC_SLAVE:
-		raw_spin_lock(&pic_irqchip(kvm)->lock);
+		spin_lock(&pic_irqchip(kvm)->lock);
 		memcpy(&pic_irqchip(kvm)->pics[1],
 			&chip->chip.pic,
 			sizeof(struct kvm_pic_state));
-		raw_spin_unlock(&pic_irqchip(kvm)->lock);
+		spin_unlock(&pic_irqchip(kvm)->lock);
 		break;
 	case KVM_IRQCHIP_IOAPIC:
 		r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
-- 
cgit v0.10.2


From a0a07cd2c5fc8703db8a07287cdde3d29a286082 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 20 Sep 2010 10:15:32 +0200
Subject: KVM: SVM: do not generate "external interrupt exit" if other exit is
 pending

Nested SVM checks for external interrupt after injecting nested exception.
In case there is external interrupt pending the code generates "external
interrupt exit" and overwrites previous exit info. If previously injected
exception already generated exit it will be lost.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Acked-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1a85fc5..c929d00 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1707,6 +1707,14 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
 	if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
 		return false;
 
+	/*
+	 * if vmexit was already requested (by intercepted exception
+	 * for instance) do not overwrite it with "external interrupt"
+	 * vmexit.
+	 */
+	if (svm->nested.exit_required)
+		return false;
+
 	svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
 	svm->vmcb->control.exit_info_1 = 0;
 	svm->vmcb->control.exit_info_2 = 0;
-- 
cgit v0.10.2


From 8475f94abfa40d6975923ff022280cf7250fc5fb Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 20 Sep 2010 22:16:45 +0800
Subject: KVM: fix the description of kvm-amd.nested in documentation

The default state of 'kvm-amd.nested' is enabled now, so fix the documentation

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 6bd2375..3170edb 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1127,7 +1127,7 @@ and is between 256 and 4096 characters. It is defined in the file
 			Default is 1 (enabled)
 
 	kvm-amd.nested=	[KVM,AMD] Allow nested virtualization in KVM/SVM.
-			Default is 0 (off)
+			Default is 1 (enabled)
 
 	kvm-amd.npt=	[KVM,AMD] Disable nested paging (virtualized MMU)
 			for all guests.
-- 
cgit v0.10.2


From a182d8737f0bdfb684b3255c8e266e71999e2225 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 20 Sep 2010 22:17:48 +0800
Subject: KVM: document 'kvm.mmu_audit' parameter

Document this parameter into Documentation/kernel-parameters.txt

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 3170edb..8dc2548 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1126,6 +1126,10 @@ and is between 256 and 4096 characters. It is defined in the file
 	kvm.oos_shadow=	[KVM] Disable out-of-sync shadow paging.
 			Default is 1 (enabled)
 
+	kvm.mmu_audit=	[KVM] This is a R/W parameter which allows audit
+			KVM MMU at runtime.
+			Default is 0 (off)
+
 	kvm-amd.nested=	[KVM,AMD] Allow nested virtualization in KVM/SVM.
 			Default is 1 (enabled)
 
-- 
cgit v0.10.2


From cb16a7b3872e9a806f16b1f09b59103fafc7b796 Mon Sep 17 00:00:00 2001
From: Hillf Danton <dhillf@gmail.com>
Date: Sat, 18 Sep 2010 08:41:02 +0800
Subject: KVM: MMU: fix counting of rmap entries in rmap_add()

It seems that rmap entries are under counted.

Signed-off-by: Hillf Danton <dhillf@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3ce56bf..c94c432 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -632,6 +632,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 		desc->sptes[0] = (u64 *)*rmapp;
 		desc->sptes[1] = spte;
 		*rmapp = (unsigned long)desc | 1;
+		++count;
 	} else {
 		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
 		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
@@ -644,7 +645,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 			desc = desc->more;
 		}
 		for (i = 0; desc->sptes[i]; ++i)
-			;
+			++count;
 		desc->sptes[i] = spte;
 	}
 	return count;
-- 
cgit v0.10.2


From 4ab8e02404fcbc16beefac66de24dbb2706fe2f3 Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Sun, 19 Sep 2010 14:34:05 +0200
Subject: KVM: x86 emulator: Expose emulate_int_real()

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 5187dd8..b36c6b3 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -260,5 +260,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 			 u16 tss_selector, int reason,
 			 bool has_error_code, u32 error_code);
-
+int emulate_int_real(struct x86_emulate_ctxt *ctxt,
+		     struct x86_emulate_ops *ops, int irq);
 #endif /* _ASM_X86_KVM_X86_EMULATE_H */
-- 
cgit v0.10.2


From 63995653ade16deacaea5b49ceaf6376314593ac Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Sun, 19 Sep 2010 14:34:06 +0200
Subject: KVM: Add kvm_inject_realmode_interrupt() wrapper

This adds a wrapper function kvm_inject_realmode_interrupt() around the
emulator function emulate_int_real() to allow real mode interrupt injection.

[avi: initialize operand and address sizes before emulating interrupts]
[avi: initialize rip for real mode interrupt injection]
[avi: clear interrupt pending flag after emulating interrupt injection]

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3adf692..7d28805 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4188,6 +4188,35 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
 }
 
+int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq)
+{
+	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+	int ret;
+
+	init_emulate_ctxt(vcpu);
+
+	vcpu->arch.emulate_ctxt.decode.op_bytes = 2;
+	vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
+	vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip;
+	ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
+
+	if (ret != X86EMUL_CONTINUE)
+		return EMULATE_FAIL;
+
+	vcpu->arch.emulate_ctxt.eip = c->eip;
+	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+
+	if (irq == NMI_VECTOR)
+		vcpu->arch.nmi_pending = false;
+	else
+		vcpu->arch.interrupt.pending = false;
+
+	return EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
+
 static int handle_emulation_failure(struct kvm_vcpu *vcpu)
 {
 	++vcpu->stat.insn_emulation_fail;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index bf4dc2f..2cea414 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -72,6 +72,7 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
 
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
+int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq);
 
 void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
 
-- 
cgit v0.10.2


From a92601bb707f6f49fd5563ef3d09928e70cc222e Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Sun, 19 Sep 2010 14:34:07 +0200
Subject: KVM: VMX: Emulated real mode interrupt injection

Replace the inject-as-software-interrupt hack we currently have with
emulated injection.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3231593..9d3f972 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -155,11 +155,6 @@ struct vcpu_vmx {
 			u32 limit;
 			u32 ar;
 		} tr, es, ds, fs, gs;
-		struct {
-			bool pending;
-			u8 vector;
-			unsigned rip;
-		} irq;
 	} rmode;
 	int vpid;
 	bool emulation_required;
@@ -1028,16 +1023,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	}
 
 	if (vmx->rmode.vm86_active) {
-		vmx->rmode.irq.pending = true;
-		vmx->rmode.irq.vector = nr;
-		vmx->rmode.irq.rip = kvm_rip_read(vcpu);
-		if (kvm_exception_is_soft(nr))
-			vmx->rmode.irq.rip +=
-				vmx->vcpu.arch.event_exit_inst_len;
-		intr_info |= INTR_TYPE_SOFT_INTR;
-		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
-		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
-		kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
+		if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE)
+			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 		return;
 	}
 
@@ -2816,16 +2803,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
 
 	++vcpu->stat.irq_injections;
 	if (vmx->rmode.vm86_active) {
-		vmx->rmode.irq.pending = true;
-		vmx->rmode.irq.vector = irq;
-		vmx->rmode.irq.rip = kvm_rip_read(vcpu);
-		if (vcpu->arch.interrupt.soft)
-			vmx->rmode.irq.rip +=
-				vmx->vcpu.arch.event_exit_inst_len;
-		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-			     irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
-		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
-		kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
+		if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE)
+			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 		return;
 	}
 	intr = irq | INTR_INFO_VALID_MASK;
@@ -2857,14 +2836,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 
 	++vcpu->stat.nmi_injections;
 	if (vmx->rmode.vm86_active) {
-		vmx->rmode.irq.pending = true;
-		vmx->rmode.irq.vector = NMI_VECTOR;
-		vmx->rmode.irq.rip = kvm_rip_read(vcpu);
-		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-			     NMI_VECTOR | INTR_TYPE_SOFT_INTR |
-			     INTR_INFO_VALID_MASK);
-		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
-		kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
+		if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE)
+			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 		return;
 	}
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -3826,29 +3799,6 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 			ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
 }
 
-/*
- * Failure to inject an interrupt should give us the information
- * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
- * when fetching the interrupt redirection bitmap in the real-mode
- * tss, this doesn't happen.  So we do it ourselves.
- */
-static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info)
-{
-	vmx->rmode.irq.pending = 0;
-	if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
-		return;
-	kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
-	if (*idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
-		*idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
-		*idt_vectoring_info |= INTR_TYPE_EXT_INTR;
-		return;
-	}
-	*idt_vectoring_info =
-		VECTORING_INFO_VALID_MASK
-		| INTR_TYPE_EXT_INTR
-		| vmx->rmode.irq.vector;
-}
-
 static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
 				      u32 idt_vectoring_info,
 				      int instr_len_field,
@@ -3858,9 +3808,6 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
 	int type;
 	bool idtv_info_valid;
 
-	if (vmx->rmode.irq.pending)
-		fixup_rmode_irq(vmx, &idt_vectoring_info);
-
 	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 
 	vmx->vcpu.arch.nmi_injected = false;
-- 
cgit v0.10.2


From 49e9d557f9b6e9639390b63b645f2def8dde5f1b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 19 Sep 2010 14:34:08 +0200
Subject: KVM: VMX: Respect interrupt window in big real mode

If an interrupt is pending, we need to stop emulation so we
can inject it.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9d3f972..28c72da 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3582,8 +3582,17 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	enum emulation_result err = EMULATE_DONE;
 	int ret = 1;
+	u32 cpu_exec_ctrl;
+	bool intr_window_requested;
+
+	cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
 
 	while (!guest_state_valid(vcpu)) {
+		if (intr_window_requested
+		    && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
+			return handle_interrupt_window(&vmx->vcpu);
+
 		err = emulate_instruction(vcpu, 0, 0, 0);
 
 		if (err == EMULATE_DO_MMIO) {
-- 
cgit v0.10.2


From 624d84cfe63b5afdd087bf5b2075a8a8cac5c83f Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 21 Sep 2010 19:59:44 +0200
Subject: KVM: cpu_relax() during spin waiting for reboot

It doesn't really matter, but if we spin, we should spin in a more relaxed
manner.  This way, if something goes wrong at least it won't contribute to
global warming.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c7a57b4..b8499f5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2022,7 +2022,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void)
 		/* spin while reset goes on */
 		local_irq_enable();
 		while (true)
-			;
+			cpu_relax();
 	}
 	/* Fault while not rebooting.  We want the trace. */
 	BUG();
-- 
cgit v0.10.2


From 5f4e3f882731c65b5d64a2ff743fda96eaebb9ee Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@redhat.com>
Date: Sat, 18 Sep 2010 14:38:13 -1000
Subject: KVM: x86: Make math work for other scales

The math in kvm_get_time_scale relies on the fact that
NSEC_PER_SEC < 2^32.  To use the same function to compute
arbitrary time scales, we must extend the first reduction
step to shrink the base rate to a 32-bit value, and
possibly reduce the scaled rate into a 32-bit as well.

Note we must take care to avoid an arithmetic overflow
when scaling up the tps32 value (this could not happen
with the fixed scaled value of NSEC_PER_SEC, but can
happen with scaled rates above 2^31.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7d28805..6666af8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -920,31 +920,35 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
 	return quotient;
 }
 
-static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
+static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
+			       s8 *pshift, u32 *pmultiplier)
 {
-	uint64_t nsecs = 1000000000LL;
+	uint64_t scaled64;
 	int32_t  shift = 0;
 	uint64_t tps64;
 	uint32_t tps32;
 
-	tps64 = tsc_khz * 1000LL;
-	while (tps64 > nsecs*2) {
+	tps64 = base_khz * 1000LL;
+	scaled64 = scaled_khz * 1000LL;
+	while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000UL) {
 		tps64 >>= 1;
 		shift--;
 	}
 
 	tps32 = (uint32_t)tps64;
-	while (tps32 <= (uint32_t)nsecs) {
-		tps32 <<= 1;
+	while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000UL) {
+		if (scaled64 & 0xffffffff00000000UL || tps32 & 0x80000000)
+			scaled64 >>= 1;
+		else
+			tps32 <<= 1;
 		shift++;
 	}
 
-	hv_clock->tsc_shift = shift;
-	hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
+	*pshift = shift;
+	*pmultiplier = div_frac(scaled64, tps32);
 
-	pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
-		 __func__, tsc_khz, hv_clock->tsc_shift,
-		 hv_clock->tsc_to_system_mul);
+	pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
+		 __func__, base_khz, scaled_khz, shift, *pmultiplier);
 }
 
 static inline u64 get_kernel_ns(void)
@@ -1084,7 +1088,9 @@ static int kvm_write_guest_time(struct kvm_vcpu *v)
 	}
 
 	if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
-		kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
+		kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
+				   &vcpu->hv_clock.tsc_shift,
+				   &vcpu->hv_clock.tsc_to_system_mul);
 		vcpu->hw_tsc_khz = this_tsc_khz;
 	}
 
-- 
cgit v0.10.2


From 34c238a1d1832d7b1f655641f52782e86396b30a Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@redhat.com>
Date: Sat, 18 Sep 2010 14:38:14 -1000
Subject: KVM: x86: Rename timer function

This just changes some names to better reflect the usage they
will be given.  Separated out to keep confusion to a minimum.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6666af8..ce57cd8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -892,7 +892,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 
 	/*
 	 * The guest calculates current wall clock time by adding
-	 * system time (updated by kvm_write_guest_time below) to the
+	 * system time (updated by kvm_guest_time_update below) to the
 	 * wall clock specified here.  guest system time equals host
 	 * system time for us, thus we must fill in host boot time here.
 	 */
@@ -1032,7 +1032,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 }
 EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
-static int kvm_write_guest_time(struct kvm_vcpu *v)
+static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
 	unsigned long flags;
 	struct kvm_vcpu_arch *vcpu = &v->arch;
@@ -1052,7 +1052,7 @@ static int kvm_write_guest_time(struct kvm_vcpu *v)
 	local_irq_restore(flags);
 
 	if (unlikely(this_tsc_khz == 0)) {
-		kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
+		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
 		return 1;
 	}
 
@@ -1128,7 +1128,7 @@ static int kvm_request_guest_time_update(struct kvm_vcpu *v)
 
 	if (!vcpu->time_page)
 		return 0;
-	kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
+	kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
 	return 1;
 }
 
@@ -5041,8 +5041,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_mmu_unload(vcpu);
 		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
 			__kvm_migrate_timers(vcpu);
-		if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) {
-			r = kvm_write_guest_time(vcpu);
+		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
+			r = kvm_guest_time_update(vcpu);
 			if (unlikely(r))
 				goto out;
 		}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6022da1..0b89d00 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -36,7 +36,7 @@
 #define KVM_REQ_PENDING_TIMER      5
 #define KVM_REQ_UNHALT             6
 #define KVM_REQ_MMU_SYNC           7
-#define KVM_REQ_KVMCLOCK_UPDATE    8
+#define KVM_REQ_CLOCK_UPDATE       8
 #define KVM_REQ_KICK               9
 #define KVM_REQ_DEACTIVATE_FPU    10
 #define KVM_REQ_EVENT             11
-- 
cgit v0.10.2


From c285545f813d7b0ce989fd34e42ad1fe785dc65d Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@redhat.com>
Date: Sat, 18 Sep 2010 14:38:15 -1000
Subject: KVM: x86: TSC catchup mode

Negate the effects of AN TYM spell while kvm thread is preempted by tracking
conversion factor to the highest TSC rate and catching the TSC up when it has
fallen behind the kernel view of time.  Note that once triggered, we don't
turn off catchup mode.

A slightly more clever version of this is possible, which only does catchup
when TSC rate drops, and which specifically targets only CPUs with broken
TSC, but since these all are considered unstable_tsc(), this patch covers
all necessary cases.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 519d6f7..9e6fe39 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -384,6 +384,9 @@ struct kvm_vcpu_arch {
 	u64 last_host_tsc;
 	u64 last_guest_tsc;
 	u64 last_kernel_ns;
+	u64 last_tsc_nsec;
+	u64 last_tsc_write;
+	bool tsc_catchup;
 
 	bool nmi_pending;
 	bool nmi_injected;
@@ -444,6 +447,9 @@ struct kvm_arch {
 	u64 last_tsc_nsec;
 	u64 last_tsc_offset;
 	u64 last_tsc_write;
+	u32 virtual_tsc_khz;
+	u32 virtual_tsc_mult;
+	s8 virtual_tsc_shift;
 
 	struct kvm_xen_hvm_config xen_hvm_config;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ce57cd8..bfcf8fd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -962,6 +962,7 @@ static inline u64 get_kernel_ns(void)
 }
 
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
+unsigned long max_tsc_khz;
 
 static inline int kvm_tsc_changes_freq(void)
 {
@@ -985,6 +986,24 @@ static inline u64 nsec_to_cycles(u64 nsec)
 	return ret;
 }
 
+static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz)
+{
+	/* Compute a scale to convert nanoseconds in TSC cycles */
+	kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
+			   &kvm->arch.virtual_tsc_shift,
+			   &kvm->arch.virtual_tsc_mult);
+	kvm->arch.virtual_tsc_khz = this_tsc_khz;
+}
+
+static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
+{
+	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
+				      vcpu->kvm->arch.virtual_tsc_mult,
+				      vcpu->kvm->arch.virtual_tsc_shift);
+	tsc += vcpu->arch.last_tsc_write;
+	return tsc;
+}
+
 void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 {
 	struct kvm *kvm = vcpu->kvm;
@@ -1029,6 +1048,8 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 
 	/* Reset of TSC must disable overshoot protection below */
 	vcpu->arch.hv_clock.tsc_timestamp = 0;
+	vcpu->arch.last_tsc_write = data;
+	vcpu->arch.last_tsc_nsec = ns;
 }
 EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
@@ -1041,22 +1062,42 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	s64 kernel_ns, max_kernel_ns;
 	u64 tsc_timestamp;
 
-	if ((!vcpu->time_page))
-		return 0;
-
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
 	kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
 	kernel_ns = get_kernel_ns();
 	this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
-	local_irq_restore(flags);
 
 	if (unlikely(this_tsc_khz == 0)) {
+		local_irq_restore(flags);
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
 		return 1;
 	}
 
 	/*
+	 * We may have to catch up the TSC to match elapsed wall clock
+	 * time for two reasons, even if kvmclock is used.
+	 *   1) CPU could have been running below the maximum TSC rate
+	 *   2) Broken TSC compensation resets the base at each VCPU
+	 *      entry to avoid unknown leaps of TSC even when running
+	 *      again on the same CPU.  This may cause apparent elapsed
+	 *      time to disappear, and the guest to stand still or run
+	 *	very slowly.
+	 */
+	if (vcpu->tsc_catchup) {
+		u64 tsc = compute_guest_tsc(v, kernel_ns);
+		if (tsc > tsc_timestamp) {
+			kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
+			tsc_timestamp = tsc;
+		}
+	}
+
+	local_irq_restore(flags);
+
+	if (!vcpu->time_page)
+		return 0;
+
+	/*
 	 * Time as measured by the TSC may go backwards when resetting the base
 	 * tsc_timestamp.  The reason for this is that the TSC resolution is
 	 * higher than the resolution of the other clock scales.  Thus, many
@@ -1122,16 +1163,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	return 0;
 }
 
-static int kvm_request_guest_time_update(struct kvm_vcpu *v)
-{
-	struct kvm_vcpu_arch *vcpu = &v->arch;
-
-	if (!vcpu->time_page)
-		return 0;
-	kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
-	return 1;
-}
-
 static bool msr_mtrr_valid(unsigned msr)
 {
 	switch (msr) {
@@ -1455,6 +1486,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		}
 
 		vcpu->arch.time = data;
+		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 
 		/* we verify if the enable bit is set... */
 		if (!(data & 1))
@@ -1470,8 +1502,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 			kvm_release_page_clean(vcpu->arch.time_page);
 			vcpu->arch.time_page = NULL;
 		}
-
-		kvm_request_guest_time_update(vcpu);
 		break;
 	}
 	case MSR_IA32_MCG_CTL:
@@ -2028,9 +2058,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 				native_read_tsc() - vcpu->arch.last_host_tsc;
 		if (tsc_delta < 0)
 			mark_tsc_unstable("KVM discovered backwards TSC");
-		if (check_tsc_unstable())
+		if (check_tsc_unstable()) {
 			kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
-		kvm_migrate_timers(vcpu);
+			vcpu->arch.tsc_catchup = 1;
+			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+		}
+		if (vcpu->cpu != cpu)
+			kvm_migrate_timers(vcpu);
 		vcpu->cpu = cpu;
 	}
 }
@@ -4461,8 +4495,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 		kvm_for_each_vcpu(i, vcpu, kvm) {
 			if (vcpu->cpu != freq->cpu)
 				continue;
-			if (!kvm_request_guest_time_update(vcpu))
-				continue;
+			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 			if (vcpu->cpu != smp_processor_id())
 				send_ipi = 1;
 		}
@@ -4517,11 +4550,20 @@ static void kvm_timer_init(void)
 {
 	int cpu;
 
+	max_tsc_khz = tsc_khz;
 	register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
 	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+#ifdef CONFIG_CPU_FREQ
+		struct cpufreq_policy policy;
+		memset(&policy, 0, sizeof(policy));
+		cpufreq_get_policy(&policy, get_cpu());
+		if (policy.cpuinfo.max_freq)
+			max_tsc_khz = policy.cpuinfo.max_freq;
+#endif
 		cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
 					  CPUFREQ_TRANSITION_NOTIFIER);
 	}
+	pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
 	for_each_online_cpu(cpu)
 		smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
 }
@@ -5752,7 +5794,7 @@ int kvm_arch_hardware_enable(void *garbage)
 	list_for_each_entry(kvm, &vm_list, vm_list)
 		kvm_for_each_vcpu(i, vcpu, kvm)
 			if (vcpu->cpu == smp_processor_id())
-				kvm_request_guest_time_update(vcpu);
+				kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 	return kvm_x86_ops->hardware_enable(garbage);
 }
 
@@ -5803,6 +5845,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	}
 	vcpu->arch.pio_data = page_address(page);
 
+	if (!kvm->arch.virtual_tsc_khz)
+		kvm_arch_set_tsc_khz(kvm, max_tsc_khz);
+
 	r = kvm_mmu_create(vcpu);
 	if (r < 0)
 		goto fail_free_pio_data;
-- 
cgit v0.10.2


From 19b6a85b78a5d4b466c537bdbf0eaecae5e2c4e2 Mon Sep 17 00:00:00 2001
From: Arjan Koers <0h61vkll2ly8@xutrox.com>
Date: Mon, 2 Aug 2010 23:35:28 +0200
Subject: KVM guest: Move a printk that's using the clock before it's ready

Fix a hang during SMP kernel boot on KVM that showed up
after commit 489fb490dbf8dab0249ad82b56688ae3842a79e8
(2.6.35) and 59aab522154a2f17b25335b63c1cf68a51fb6ae0
(2.6.34.1). The problem only occurs when
CONFIG_PRINTK_TIME is set.

KVM-Stable-Tag.
Signed-off-by: Arjan Koers <0h61vkll2ly8@xutrox.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index eb9b76c..ca43ce3 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -128,13 +128,15 @@ static struct clocksource kvm_clock = {
 static int kvm_register_clock(char *txt)
 {
 	int cpu = smp_processor_id();
-	int low, high;
+	int low, high, ret;
+
 	low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
 	high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
+	ret = native_write_msr_safe(msr_kvm_system_time, low, high);
 	printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
 	       cpu, high, low, txt);
 
-	return native_write_msr_safe(msr_kvm_system_time, low, high);
+	return ret;
 }
 
 #ifdef CONFIG_X86_LOCAL_APIC
-- 
cgit v0.10.2


From 07d6f555d536aad1d74bb8b41dae9385007ecc26 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 28 Sep 2010 16:37:42 +0200
Subject: KVM: VMX: Add AX to list of registers clobbered by guest switch

By chance this caused no harm so far. We overwrite AX during switch
to/from guest context, so we must declare this.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 28c72da..007be84 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4007,7 +4007,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 #endif
 		[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
 	      : "cc", "memory"
-		, R"bx", R"di", R"si"
+		, R"ax", R"bx", R"di", R"si"
 #ifdef CONFIG_X86_64
 		, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
 #endif
-- 
cgit v0.10.2


From 50933623e50d8730cc1a65853c153b3b4c93b629 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Sun, 26 Sep 2010 13:00:53 +0200
Subject: KVM: x86: Fix constant type in kvm_get_time_scale

Older gcc versions complain about the improper type (for x86-32), 4.5
seems to fix this silently. However, we should better use the right type
initially.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bfcf8fd..ffcb906 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -930,14 +930,14 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
 
 	tps64 = base_khz * 1000LL;
 	scaled64 = scaled_khz * 1000LL;
-	while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000UL) {
+	while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
 		tps64 >>= 1;
 		shift--;
 	}
 
 	tps32 = (uint32_t)tps64;
-	while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000UL) {
-		if (scaled64 & 0xffffffff00000000UL || tps32 & 0x80000000)
+	while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
+		if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
 			scaled64 >>= 1;
 		else
 			tps32 <<= 1;
-- 
cgit v0.10.2


From 7129eecac10681f69cb00c0323ee915feceb57eb Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Tue, 28 Sep 2010 16:33:32 +0800
Subject: KVM: x86 emulator: Eliminate compilation warning in x86_decode_insn()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Eliminate:
arch/x86/kvm/emulate.c:801: warning: ‘sv’ may be used uninitialized in this
function

on gcc 4.1.2

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index aead72e..d0df25d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -798,7 +798,7 @@ done:
 
 static void fetch_bit_operand(struct decode_cache *c)
 {
-	long sv, mask;
+	long sv = 0, mask;
 
 	if (c->dst.type == OP_MEM && c->src.type == OP_REG) {
 		mask = ~(c->dst.bytes * 8 - 1);
-- 
cgit v0.10.2


From 6292757fb0e758748fdb441861f8c50d397de9f0 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 27 Sep 2010 18:02:12 +0800
Subject: KVM: MMU: update 'root_hpa' out of loop in PAE shadow path

The value of 'vcpu->arch.mmu.pae_root' is not modified, so we can update
'root_hpa' out of the loop.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c94c432..3630046 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2393,8 +2393,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 			++sp->root_count;
 			spin_unlock(&vcpu->kvm->mmu_lock);
 			vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
-			vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
 		}
+		vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
 	} else
 		BUG();
 
@@ -2466,8 +2466,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 		spin_unlock(&vcpu->kvm->mmu_lock);
 
 		vcpu->arch.mmu.pae_root[i] = root | pm_mask;
-		vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
 	}
+	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
 
 	/*
 	 * If we shadow a 32 bit page table with a long mode page
-- 
cgit v0.10.2


From 20bd40dc6492da293993559555df07d467fd202e Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 27 Sep 2010 18:03:27 +0800
Subject: KVM: MMU: cleanup for error mask set while walk guest page table

Small cleanup for set page fault error code

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 2bdd843..a83ff37 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -224,9 +224,7 @@ walk:
 			    is_cpuid_PSE36())
 				gfn += pse36_gfn_delta(pte);
 
-			access |= write_fault ? PFERR_WRITE_MASK : 0;
-			access |= fetch_fault ? PFERR_FETCH_MASK : 0;
-			access |= user_fault  ? PFERR_USER_MASK  : 0;
+			access |= write_fault | fetch_fault | user_fault;
 
 			real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
 						      access);
@@ -268,10 +266,9 @@ error:
 	walker->error_code = 0;
 	if (present)
 		walker->error_code |= PFERR_PRESENT_MASK;
-	if (write_fault)
-		walker->error_code |= PFERR_WRITE_MASK;
-	if (user_fault)
-		walker->error_code |= PFERR_USER_MASK;
+
+	walker->error_code |= write_fault | user_fault;
+
 	if (fetch_fault && mmu->nx)
 		walker->error_code |= PFERR_FETCH_MASK;
 	if (rsvd_fault)
@@ -673,9 +670,9 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
 	int r;
 
 	r = FNAME(walk_addr)(&walker, vcpu, vaddr,
-			     !!(access & PFERR_WRITE_MASK),
-			     !!(access & PFERR_USER_MASK),
-			     !!(access & PFERR_FETCH_MASK));
+			     access & PFERR_WRITE_MASK,
+			     access & PFERR_USER_MASK,
+			     access & PFERR_FETCH_MASK);
 
 	if (r) {
 		gpa = gfn_to_gpa(walker.gfn);
-- 
cgit v0.10.2


From 33f91edb9211f5c0392071f9eb01958ec69f2193 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 27 Sep 2010 18:05:00 +0800
Subject: KVM: MMU: set access bit for direct mapping

Set access bit while setup up direct page table if it's nonpaing or npt enabled,
it's good for CPU's speculate access

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3630046..88203fa 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2240,7 +2240,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 			__set_spte(iterator.sptep,
 				   __pa(sp->spt)
 				   | PT_PRESENT_MASK | PT_WRITABLE_MASK
-				   | shadow_user_mask | shadow_x_mask);
+				   | shadow_user_mask | shadow_x_mask
+				   | shadow_accessed_mask);
 		}
 	}
 	return pt_write;
-- 
cgit v0.10.2


From 98224bf1d1783a25ccede29ab08309424ec8de25 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 27 Sep 2010 18:06:16 +0800
Subject: KVM: MMU: audit: fix vcpu's spte walking

After nested nested paging, it may using long mode to shadow 32/PAE paging
guest, so this patch fix it

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index bd2b1be..dcca3e7 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -51,7 +51,7 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
 
-	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 
 		sp = page_header(root);
-- 
cgit v0.10.2


From c42fffe3a3aa8c62b8028fff32d18156f5325c3b Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 27 Sep 2010 18:07:07 +0800
Subject: KVM: MMU: audit: unregister audit tracepoints before module unloaded

fix:

Call Trace:
 [<ffffffffa01e46ba>] ? kvm_mmu_pte_write+0x229/0x911 [kvm]
 [<ffffffffa01c6ba9>] ? gfn_to_memslot+0x39/0xa0 [kvm]
 [<ffffffffa01c6c26>] ? mark_page_dirty+0x16/0x2e [kvm]
 [<ffffffffa01c6d6f>] ? kvm_write_guest_page+0x67/0x7f [kvm]
 [<ffffffff81066fbd>] ? local_clock+0x2a/0x3b
 [<ffffffffa01d52ce>] emulator_write_phys+0x46/0x54 [kvm]
 ......
Code:  Bad RIP value.
RIP  [<ffffffffa0172056>] 0xffffffffa0172056
 RSP <ffff880134f69a70>
CR2: ffffffffa0172056

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 88203fa..afde64b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3355,15 +3355,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
 	return init_kvm_mmu(vcpu);
 }
 
-void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
-{
-	ASSERT(vcpu);
-
-	destroy_kvm_mmu(vcpu);
-	free_mmu_pages(vcpu);
-	mmu_free_memory_caches(vcpu);
-}
-
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 {
 	struct kvm_mmu_page *sp;
@@ -3662,4 +3653,16 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
 
 #ifdef CONFIG_KVM_MMU_AUDIT
 #include "mmu_audit.c"
+#else
+static void mmu_audit_disable(void) { }
 #endif
+
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+	ASSERT(vcpu);
+
+	destroy_kvm_mmu(vcpu);
+	free_mmu_pages(vcpu);
+	mmu_free_memory_caches(vcpu);
+	mmu_audit_disable();
+}
-- 
cgit v0.10.2


From 38904e128778c38809daf44a1dabc7f25fa8d83e Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 27 Sep 2010 18:07:59 +0800
Subject: KVM: MMU: audit: introduce audit_printk to cleanup audit code

Introduce audit_printk, and record audit point instead audit name

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index dcca3e7..66219af 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,7 +19,11 @@
 
 #include <linux/ratelimit.h>
 
-static const char *audit_msg;
+static int audit_point;
+
+#define audit_printk(fmt, args...)		\
+	printk(KERN_ERR "audit: (%s) error: "	\
+		fmt, audit_point_name[audit_point], ##args)
 
 typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
 
@@ -93,21 +97,18 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 
 	if (sp->unsync) {
 		if (level != PT_PAGE_TABLE_LEVEL) {
-			printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
-				audit_msg, sp, level);
+			audit_printk("unsync sp: %p level = %d\n", sp, level);
 			return;
 		}
 
 		if (*sptep == shadow_notrap_nonpresent_pte) {
-			printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
-				audit_msg, sp);
+			audit_printk("notrap spte in unsync sp: %p\n", sp);
 			return;
 		}
 	}
 
 	if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
-		printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
-			audit_msg, sp);
+		audit_printk("notrap spte in direct sp: %p\n", sp);
 		return;
 	}
 
@@ -124,10 +125,8 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 
 	hpa =  pfn << PAGE_SHIFT;
 	if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
-		printk(KERN_ERR "xx audit error: (%s) levels %d"
-				   "pfn %llx hpa %llx ent %llxn",
-				   audit_msg, vcpu->arch.mmu.root_level,
-				   pfn, hpa, *sptep);
+		audit_printk("levels %d pfn %llx hpa %llx ent %llxn",
+				   vcpu->arch.mmu.root_level, pfn, hpa, *sptep);
 }
 
 static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
@@ -143,11 +142,9 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 	if (!gfn_to_memslot(kvm, gfn)) {
 		if (!printk_ratelimit())
 			return;
-		printk(KERN_ERR "%s: no memslot for gfn %llx\n",
-				 audit_msg, gfn);
-		printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n",
-		       audit_msg, (long int)(sptep - rev_sp->spt),
-				rev_sp->gfn);
+		audit_printk("no memslot for gfn %llx\n", gfn);
+		audit_printk("index %ld of sp (gfn=%llx)\n",
+		       (long int)(sptep - rev_sp->spt), rev_sp->gfn);
 		dump_stack();
 		return;
 	}
@@ -156,8 +153,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 	if (!*rmapp) {
 		if (!printk_ratelimit())
 			return;
-		printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
-				 audit_msg, *sptep);
+		audit_printk("no rmap for writable spte %llx\n", *sptep);
 		dump_stack();
 	}
 }
@@ -198,10 +194,8 @@ void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 	spte = rmap_next(kvm, rmapp, NULL);
 	while (spte) {
 		if (is_writable_pte(*spte))
-			printk(KERN_ERR "%s: (%s) shadow page has "
-				"writable mappings: gfn %llx role %x\n",
-			       __func__, audit_msg, sp->gfn,
-			       sp->role.word);
+			audit_printk("shadow page has writable mappings: gfn "
+				     "%llx role %x\n", sp->gfn, sp->role.word);
 		spte = rmap_next(kvm, rmapp, spte);
 	}
 }
@@ -228,14 +222,14 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
 	mmu_spte_walk(vcpu, audit_spte);
 }
 
-static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
+static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
 {
 	static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
 
 	if (!__ratelimit(&ratelimit_state))
 		return;
 
-	audit_msg = audit_point_name[audit_point];
+	audit_point = point;
 	audit_all_active_sps(vcpu->kvm);
 	audit_vcpu_spte(vcpu);
 }
-- 
cgit v0.10.2


From 6903074c367cfb13166c2974d6a886fdc7a00d21 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 27 Sep 2010 18:09:29 +0800
Subject: KVM: MMU: audit: check whether have unsync sps after root sync

After root synced, all unsync sps are synced, this patch add a check to make
sure it's no unsync sps in VCPU's page table

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index afde64b..ba7e764 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -53,14 +53,18 @@ enum {
 	AUDIT_PRE_PAGE_FAULT,
 	AUDIT_POST_PAGE_FAULT,
 	AUDIT_PRE_PTE_WRITE,
-	AUDIT_POST_PTE_WRITE
+	AUDIT_POST_PTE_WRITE,
+	AUDIT_PRE_SYNC,
+	AUDIT_POST_SYNC
 };
 
 char *audit_point_name[] = {
 	"pre page fault",
 	"post page fault",
 	"pre pte write",
-	"post pte write"
+	"post pte write",
+	"pre sync",
+	"post sync"
 };
 
 #undef MMU_DEBUG
@@ -2516,6 +2520,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
+
+	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
 	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 		sp = page_header(root);
@@ -2531,6 +2537,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 			mmu_sync_children(vcpu, sp);
 		}
 	}
+	trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
 }
 
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 66219af..4aee32c 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -164,6 +164,14 @@ static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 		inspect_spte_has_rmap(vcpu->kvm, sptep);
 }
 
+static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+	struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+	if (audit_point == AUDIT_POST_SYNC && sp->unsync)
+		audit_printk("meet unsync sp(%p) after sync root.\n", sp);
+}
+
 static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	int i;
@@ -179,7 +187,7 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
 	}
 }
 
-void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	struct kvm_memory_slot *slot;
 	unsigned long *rmapp;
@@ -215,6 +223,7 @@ static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 {
 	audit_sptes_have_rmaps(vcpu, sptep, level);
 	audit_mappings(vcpu, sptep, level);
+	audit_spte_after_sync(vcpu, sptep, level);
 }
 
 static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
-- 
cgit v0.10.2


From 3377078027dc54dc2a5acb2efa09587e7ac1cbd9 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 28 Sep 2010 17:03:14 +0800
Subject: KVM: MMU: move access code parsing to FNAME(walk_addr) function

Move access code parsing from caller site to FNAME(walk_addr) function

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a83ff37..9a5f7bb 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -116,16 +116,18 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
  */
 static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
-				    gva_t addr, int write_fault,
-				    int user_fault, int fetch_fault)
+				    gva_t addr, u32 access)
 {
 	pt_element_t pte;
 	gfn_t table_gfn;
 	unsigned index, pt_access, uninitialized_var(pte_access);
 	gpa_t pte_gpa;
 	bool eperm, present, rsvd_fault;
-	int offset;
-	u32 access = 0;
+	int offset, write_fault, user_fault, fetch_fault;
+
+	write_fault = access & PFERR_WRITE_MASK;
+	user_fault = access & PFERR_USER_MASK;
+	fetch_fault = access & PFERR_FETCH_MASK;
 
 	trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
 				     fetch_fault);
@@ -215,6 +217,7 @@ walk:
 			int lvl = walker->level;
 			gpa_t real_gpa;
 			gfn_t gfn;
+			u32 ac;
 
 			gfn = gpte_to_gfn_lvl(pte, lvl);
 			gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
@@ -224,10 +227,10 @@ walk:
 			    is_cpuid_PSE36())
 				gfn += pse36_gfn_delta(pte);
 
-			access |= write_fault | fetch_fault | user_fault;
+			ac = write_fault | fetch_fault | user_fault;
 
 			real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
-						      access);
+						      ac);
 			if (real_gpa == UNMAPPED_GVA)
 				return 0;
 
@@ -282,21 +285,18 @@ error:
 }
 
 static int FNAME(walk_addr)(struct guest_walker *walker,
-			    struct kvm_vcpu *vcpu, gva_t addr,
-			    int write_fault, int user_fault, int fetch_fault)
+			    struct kvm_vcpu *vcpu, gva_t addr, u32 access)
 {
 	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
-					write_fault, user_fault, fetch_fault);
+					access);
 }
 
 static int FNAME(walk_addr_nested)(struct guest_walker *walker,
 				   struct kvm_vcpu *vcpu, gva_t addr,
-				   int write_fault, int user_fault,
-				   int fetch_fault)
+				   u32 access)
 {
 	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
-					addr, write_fault, user_fault,
-					fetch_fault);
+					addr, access);
 }
 
 static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -532,7 +532,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 {
 	int write_fault = error_code & PFERR_WRITE_MASK;
 	int user_fault = error_code & PFERR_USER_MASK;
-	int fetch_fault = error_code & PFERR_FETCH_MASK;
 	struct guest_walker walker;
 	u64 *sptep;
 	int write_pt = 0;
@@ -550,8 +549,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	/*
 	 * Look up the guest pte for the faulting address.
 	 */
-	r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
-			     fetch_fault);
+	r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
 
 	/*
 	 * The page is not mapped by the guest.  Let the guest handle it.
@@ -669,10 +667,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
 	gpa_t gpa = UNMAPPED_GVA;
 	int r;
 
-	r = FNAME(walk_addr)(&walker, vcpu, vaddr,
-			     access & PFERR_WRITE_MASK,
-			     access & PFERR_USER_MASK,
-			     access & PFERR_FETCH_MASK);
+	r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);
 
 	if (r) {
 		gpa = gfn_to_gpa(walker.gfn);
@@ -690,10 +685,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
 	gpa_t gpa = UNMAPPED_GVA;
 	int r;
 
-	r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr,
-				    access & PFERR_WRITE_MASK,
-				    access & PFERR_USER_MASK,
-				    access & PFERR_FETCH_MASK);
+	r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
 
 	if (r) {
 		gpa = gfn_to_gpa(walker.gfn);
-- 
cgit v0.10.2


From 7ebaf15eefe7b019def72bd9d4420c7bc51ed69e Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 3 Oct 2010 18:51:39 +0200
Subject: KVM: MMU: Avoid sign extension in mmu_alloc_direct_roots() pae root
 address

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ba7e764..dc1b4fb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2374,7 +2374,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu_page *sp;
-	int i;
+	unsigned i;
 
 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
 		spin_lock(&vcpu->kvm->mmu_lock);
-- 
cgit v0.10.2


From 395c6b0a9d56fe7fdb7aeda12795d0eb02475d24 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 4 Oct 2010 12:55:49 +0200
Subject: KVM: Disable interrupts around get_kernel_ns()

get_kernel_ns() wants preemption disabled.  It doesn't make a lot of sense
during the get/set ioctls (no way to make them non-racy) but the callee wants
it.

Signed-off-by: Avi Kivity <avi@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ffcb906..e96038e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3469,8 +3469,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			goto out;
 
 		r = 0;
+		local_irq_disable();
 		now_ns = get_kernel_ns();
 		delta = user_ns.clock - now_ns;
+		local_irq_enable();
 		kvm->arch.kvmclock_offset = delta;
 		break;
 	}
@@ -3478,8 +3480,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		struct kvm_clock_data user_ns;
 		u64 now_ns;
 
+		local_irq_disable();
 		now_ns = get_kernel_ns();
 		user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
+		local_irq_enable();
 		user_ns.flags = 0;
 
 		r = -EFAULT;
-- 
cgit v0.10.2


From 9611c187774f0e20c258c23ced2599c44bd2fef4 Mon Sep 17 00:00:00 2001
From: Nicolas Kaiser <nikai@nikai.net>
Date: Wed, 6 Oct 2010 14:23:22 +0200
Subject: KVM: fix typo in copyright notice

Fix typo in copyright notice.

Signed-off-by: Nicolas Kaiser <nikai@nikai.net>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d0df25d..38b6e8d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -9,7 +9,7 @@
  * privileged instructions:
  *
  * Copyright (C) 2006 Qumranet
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  *   Avi Kivity <avi@qumranet.com>
  *   Yaniv Kamay <yaniv@qumranet.com>
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 2ad40a4..efad723 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -5,7 +5,7 @@
  * Copyright (c) 2006 Intel Corporation
  * Copyright (c) 2007 Keir Fraser, XenSource Inc
  * Copyright (c) 2008 Intel Corporation
- * Copyright 2009 Red Hat, Inc. and/or its affilates.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index dd54c5b..f628234 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 2003-2004 Fabrice Bellard
  * Copyright (c) 2007 Intel Corporation
- * Copyright 2009 Red Hat, Inc. and/or its affilates.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index f994da4..7e06ba1 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -1,7 +1,7 @@
 /*
  * irq.c: API for in kernel interrupt controller
  * Copyright (c) 2007, Intel Corporation.
- * Copyright 2009 Red Hat, Inc. and/or its affilates.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c6f2f15..8211808 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -5,7 +5,7 @@
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2007 Novell
  * Copyright (C) 2007 Intel
- * Copyright 2009 Red Hat, Inc. and/or its affilates.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Dor Laor <dor.laor@qumranet.com>
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index dc1b4fb..eb65b9c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,7 +7,7 @@
  * MMU support
  *
  * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Yaniv Kamay  <yaniv@qumranet.com>
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 4aee32c..ba2bcdd 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -4,7 +4,7 @@
  * Audit code for KVM MMU
  *
  * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Yaniv Kamay  <yaniv@qumranet.com>
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 9a5f7bb..cd7a833 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -7,7 +7,7 @@
  * MMU support
  *
  * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Yaniv Kamay  <yaniv@qumranet.com>
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c929d00..82e144a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4,7 +4,7 @@
  * AMD SVM support
  *
  * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Yaniv Kamay  <yaniv@qumranet.com>
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index e16a0db..fc7a101 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -6,7 +6,7 @@
  *
  * timer support
  *
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * This work is licensed under the terms of the GNU GPL, version 2.  See
  * the COPYING file in the top-level directory.
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 007be84..8da0e45 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5,7 +5,7 @@
  * machines without emulation or binary translation.
  *
  * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e96038e..dcee64e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6,7 +6,7 @@
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2008 Qumranet, Inc.
  * Copyright IBM Corporation, 2008
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 369e380..8edca91 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -17,7 +17,7 @@
  * Authors:
  *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
  *
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  */
 
 #include <linux/kvm_host.h>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b8499f5..1aeeb7f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5,7 +5,7 @@
  * machines without emulation or binary translation.
  *
  * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
-- 
cgit v0.10.2


From 5854dbca9b235f8cdd414a0961018763d2d5bf77 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Fri, 8 Oct 2010 16:24:14 +0800
Subject: KVM: MCE: Add MCG_SER_P into KVM_MCE_CAP_SUPPORTED

Now we have MCG_SER_P (and corresponding SRAO/SRAR MCE) support in
kernel and QEMU-KVM, the MCG_SER_P should be added into
KVM_MCE_CAP_SUPPORTED to make all these code really works.

Reported-by: Dean Nelson <dnelson@redhat.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dcee64e..2e09078 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -73,7 +73,7 @@
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
 
 #define KVM_MAX_MCE_BANKS 32
-#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
+#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
 
 /* EFER defaults:
  * - enable syscall per default because its emulated by KVM
-- 
cgit v0.10.2


From 77db5cbd29b7cb0e0fb4fd146e7f7ac2831a025a Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Fri, 8 Oct 2010 16:24:15 +0800
Subject: KVM: MCE: Send SRAR SIGBUS directly

Originally, SRAR SIGBUS is sent to QEMU-KVM via touching the poisoned
page. But commit 96054569190bdec375fe824e48ca1f4e3b53dd36 prevents the
signal from being sent. So now the signal is sent via
force_sig_info_fault directly.

[marcelo: use send_sig_info instead]

Reported-by: Dean Nelson <dnelson@redhat.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index eb65b9c..908ea54 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2251,22 +2251,24 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 	return pt_write;
 }
 
-static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
+static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
 {
-	char buf[1];
-	void __user *hva;
-	int r;
+	siginfo_t info;
+
+	info.si_signo	= SIGBUS;
+	info.si_errno	= 0;
+	info.si_code	= BUS_MCEERR_AR;
+	info.si_addr	= (void __user *)address;
+	info.si_addr_lsb = PAGE_SHIFT;
 
-	/* Touch the page, so send SIGBUS */
-	hva = (void __user *)gfn_to_hva(kvm, gfn);
-	r = copy_from_user(buf, hva, 1);
+	send_sig_info(SIGBUS, &info, tsk);
 }
 
 static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
 {
 	kvm_release_pfn_clean(pfn);
 	if (is_hwpoison_pfn(pfn)) {
-		kvm_send_hwpoison_signal(kvm, gfn);
+		kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
 		return 0;
 	} else if (is_fault_pfn(pfn))
 		return -EFAULT;
-- 
cgit v0.10.2


From d7a79b6c80fdbe4366484805ee07a4735fc427d8 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Thu, 14 Oct 2010 13:59:04 +0200
Subject: KVM: Fix signature of kvm_iommu_map_pages stub

Breaks otherwise if CONFIG_IOMMU_API is not set.

KVM-Stable-Tag.
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0b89d00..866ed30 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -483,8 +483,7 @@ int kvm_deassign_device(struct kvm *kvm,
 			struct kvm_assigned_dev_kernel *assigned_dev);
 #else /* CONFIG_IOMMU_API */
 static inline int kvm_iommu_map_pages(struct kvm *kvm,
-				      gfn_t base_gfn,
-				      unsigned long npages)
+				      struct kvm_memory_slot *slot)
 {
 	return 0;
 }
-- 
cgit v0.10.2


From 2a31339aa014c0d0b97c57d3ebc997732f8f47fc Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Mon, 18 Oct 2010 15:38:40 +0200
Subject: KVM: Drop CONFIG_DMAR dependency around kvm_iommu_map_pages

We also have to call kvm_iommu_map_pages for CONFIG_AMD_IOMMU. So drop
the dependency on Intel IOMMU, kvm_iommu_map_pages will be a nop anyway
if CONFIG_IOMMU_API is not defined.

KVM-Stable-Tag.
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1aeeb7f..ac326de 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -705,14 +705,12 @@ skip_lpage:
 	if (r)
 		goto out_free;
 
-#ifdef CONFIG_DMAR
 	/* map the pages in iommu page table */
 	if (npages) {
 		r = kvm_iommu_map_pages(kvm, &new);
 		if (r)
 			goto out_free;
 	}
-#endif
 
 	r = -ENOMEM;
 	slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
-- 
cgit v0.10.2