From cef12ee52b054282461a6d5fe7742755fa6e3bd3 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Thu, 7 Jun 2012 19:56:51 +0800 Subject: xen/mce: Add mcelog support for Xen platform When an MCA error occurs, it is handled by the Xen hypervisor first, and the error information is then sent to the initial domain for logging. This patch gets the error information from the Xen hypervisor and converts the Xen-format errors into Linux-format mcelog records. This logic is basically self-contained, not touching other kernel components. Using tools like mcelog, users can read specific error information just as they would under native Linux. To test, follow the directions outlined in Documentation/acpi/apei/einj.txt Acked-and-tested-by: Borislav Petkov Signed-off-by: Ke, Liping Signed-off-by: Jiang, Yunhong Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Liu, Jinsong Signed-off-by: Konrad Rzeszutek Wilk diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 5728852..59c226d 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -48,6 +48,7 @@ #include #include #include +#include /* * The hypercall asms have to meet several constraints: @@ -302,6 +303,13 @@ HYPERVISOR_set_timer_op(u64 timeout) } static inline int +HYPERVISOR_mca(struct xen_mc *mc_op) +{ + mc_op->interface_version = XEN_MCA_INTERFACE_VERSION; + return _hypercall1(int, mca, mc_op); +} + +static inline int HYPERVISOR_dom0_op(struct xen_platform_op *platform_op) { platform_op->interface_version = XENPF_INTERFACE_VERSION; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index da27c5d..aa75487 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -57,8 +57,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); int mce_disabled __read_mostly; -#define MISC_MCELOG_MINOR 227 - #define SPINUNIT 100 /* 100ns */ atomic_t mce_entry; @@ -2342,7 +2340,7 @@ static __init int mcheck_init_device(void) return err; } -device_initcall(mcheck_init_device); +device_initcall_sync(mcheck_init_device); /* * Old style boot options parsing. Only for compatibility. diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ff962d4..9a63468 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -341,9 +342,7 @@ static void __init xen_init_cpuid_mask(void) unsigned int xsave_mask; cpuid_leaf1_edx_mask = - ~((1 << X86_FEATURE_MCE) | /* disable MCE */ - (1 << X86_FEATURE_MCA) | /* disable MCA */ - (1 << X86_FEATURE_MTRR) | /* disable MTRR */ + ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */ (1 << X86_FEATURE_ACC)); /* thermal monitoring */ if (!xen_initial_domain()) diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 8d2501e..d4dffcd 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -196,4 +196,12 @@ config XEN_ACPI_PROCESSOR called xen_acpi_processor If you do not know what to choose, select M here. If the CPUFREQ drivers are built in, select Y here.
+config XEN_MCE_LOG + bool "Xen platform mcelog" + depends on XEN_DOM0 && X86_64 && X86_MCE + default n + help + Allow the kernel to fetch MCE errors from the Xen platform and + convert them into Linux mcelog format for mcelog tools + endmenu diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index fc34886..a787029 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_XEN_PVHVM) += platform-pci.o obj-$(CONFIG_XEN_TMEM) += tmem.o obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o obj-$(CONFIG_XEN_DOM0) += pci.o acpi.o +obj-$(CONFIG_XEN_MCE_LOG) += mcelog.o obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback/ obj-$(CONFIG_XEN_PRIVCMD) += xen-privcmd.o obj-$(CONFIG_XEN_ACPI_PROCESSOR) += xen-acpi-processor.o diff --git a/drivers/xen/mcelog.c b/drivers/xen/mcelog.c new file mode 100644 index 0000000..72e87d2 --- /dev/null +++ b/drivers/xen/mcelog.c @@ -0,0 +1,392 @@ +/****************************************************************************** + * mcelog.c + * Driver for receiving and transferring machine check error information + * + * Copyright (c) 2012 Intel Corporation + * Author: Liu, Jinsong + * Author: Jiang, Yunhong + * Author: Ke, Liping + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define XEN_MCELOG "xen_mcelog: " + +static struct mc_info g_mi; +static struct mcinfo_logical_cpu *g_physinfo; +static uint32_t ncpus; + +static DEFINE_SPINLOCK(mcelog_lock); + +static struct xen_mce_log xen_mcelog = { + .signature = XEN_MCE_LOG_SIGNATURE, + .len = XEN_MCE_LOG_LEN, + .recordlen = sizeof(struct xen_mce), +}; + +static DEFINE_SPINLOCK(xen_mce_chrdev_state_lock); +static int xen_mce_chrdev_open_count; /* #times opened */ +static int xen_mce_chrdev_open_exclu; /* already open exclusive?
*/ + +static int xen_mce_chrdev_open(struct inode *inode, struct file *file) +{ + spin_lock(&xen_mce_chrdev_state_lock); + + if (xen_mce_chrdev_open_exclu || + (xen_mce_chrdev_open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&xen_mce_chrdev_state_lock); + + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + xen_mce_chrdev_open_exclu = 1; + xen_mce_chrdev_open_count++; + + spin_unlock(&xen_mce_chrdev_state_lock); + + return nonseekable_open(inode, file); +} + +static int xen_mce_chrdev_release(struct inode *inode, struct file *file) +{ + spin_lock(&xen_mce_chrdev_state_lock); + + xen_mce_chrdev_open_count--; + xen_mce_chrdev_open_exclu = 0; + + spin_unlock(&xen_mce_chrdev_state_lock); + + return 0; +} + +static ssize_t xen_mce_chrdev_read(struct file *filp, char __user *ubuf, + size_t usize, loff_t *off) +{ + char __user *buf = ubuf; + unsigned num; + int i, err; + + spin_lock(&mcelog_lock); + + num = xen_mcelog.next; + + /* Only supports full reads right now */ + err = -EINVAL; + if (*off != 0 || usize < XEN_MCE_LOG_LEN*sizeof(struct xen_mce)) + goto out; + + err = 0; + for (i = 0; i < num; i++) { + struct xen_mce *m = &xen_mcelog.entry[i]; + + err |= copy_to_user(buf, m, sizeof(*m)); + buf += sizeof(*m); + } + + memset(xen_mcelog.entry, 0, num * sizeof(struct xen_mce)); + xen_mcelog.next = 0; + + if (err) + err = -EFAULT; + +out: + spin_unlock(&mcelog_lock); + + return err ? err : buf - ubuf; +} + +static long xen_mce_chrdev_ioctl(struct file *f, unsigned int cmd, + unsigned long arg) +{ + int __user *p = (int __user *)arg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct xen_mce), p); + case MCE_GET_LOG_LEN: + return put_user(XEN_MCE_LOG_LEN, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + + do { + flags = xen_mcelog.flags; + } while (cmpxchg(&xen_mcelog.flags, flags, 0) != flags); + + return put_user(flags, p); + } + default: + return -ENOTTY; + } +} + +static const struct file_operations xen_mce_chrdev_ops = { + .open = xen_mce_chrdev_open, + .release = xen_mce_chrdev_release, + .read = xen_mce_chrdev_read, + .unlocked_ioctl = xen_mce_chrdev_ioctl, + .llseek = no_llseek, +}; + +static struct miscdevice xen_mce_chrdev_device = { + MISC_MCELOG_MINOR, + "mcelog", + &xen_mce_chrdev_ops, +}; + +/* + * Caller should hold the mcelog_lock + */ +static void xen_mce_log(struct xen_mce *mce) +{ + unsigned entry; + + entry = xen_mcelog.next; + + /* + * When the buffer fills up discard new entries. 
+ * Assume that the earlier errors are the more + * interesting ones: + */ + if (entry >= XEN_MCE_LOG_LEN) { + set_bit(XEN_MCE_OVERFLOW, + (unsigned long *)&xen_mcelog.flags); + return; + } + + memcpy(xen_mcelog.entry + entry, mce, sizeof(struct xen_mce)); + + xen_mcelog.next++; +} + +static int convert_log(struct mc_info *mi) +{ + struct mcinfo_common *mic; + struct mcinfo_global *mc_global; + struct mcinfo_bank *mc_bank; + struct xen_mce m; + uint32_t i; + + mic = NULL; + x86_mcinfo_lookup(&mic, mi, MC_TYPE_GLOBAL); + if (unlikely(!mic)) { + pr_warning(XEN_MCELOG "Failed to find global error info\n"); + return -ENODEV; + } + + memset(&m, 0, sizeof(struct xen_mce)); + + mc_global = (struct mcinfo_global *)mic; + m.mcgstatus = mc_global->mc_gstatus; + m.apicid = mc_global->mc_apicid; + + for (i = 0; i < ncpus; i++) + if (g_physinfo[i].mc_apicid == m.apicid) + break; + if (unlikely(i == ncpus)) { + pr_warning(XEN_MCELOG "Failed to match cpu with apicid %d\n", + m.apicid); + return -ENODEV; + } + + m.socketid = g_physinfo[i].mc_chipid; + m.cpu = m.extcpu = g_physinfo[i].mc_cpunr; + m.cpuvendor = (__u8)g_physinfo[i].mc_vendor; + m.mcgcap = g_physinfo[i].mc_msrvalues[__MC_MSR_MCGCAP].value; + + mic = NULL; + x86_mcinfo_lookup(&mic, mi, MC_TYPE_BANK); + if (unlikely(!mic)) { + pr_warning(XEN_MCELOG "Failed to find bank error info\n"); + return -ENODEV; + } + + do { + if ((!mic) || (mic->size == 0) || + (mic->type != MC_TYPE_GLOBAL && + mic->type != MC_TYPE_BANK && + mic->type != MC_TYPE_EXTENDED && + mic->type != MC_TYPE_RECOVERY)) + break; + + if (mic->type == MC_TYPE_BANK) { + mc_bank = (struct mcinfo_bank *)mic; + m.misc = mc_bank->mc_misc; + m.status = mc_bank->mc_status; + m.addr = mc_bank->mc_addr; + m.tsc = mc_bank->mc_tsc; + m.bank = mc_bank->mc_bank; + m.finished = 1; + /* log this record */ + xen_mce_log(&m); + } + mic = x86_mcinfo_next(mic); + } while (1); + + return 0; +} + +static int mc_queue_handle(uint32_t flags) +{ + struct xen_mc mc_op; + int ret = 0; + + mc_op.cmd = XEN_MC_fetch; + mc_op.interface_version = XEN_MCA_INTERFACE_VERSION; + set_xen_guest_handle(mc_op.u.mc_fetch.data, &g_mi); + do { + mc_op.u.mc_fetch.flags = flags; + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err(XEN_MCELOG "Failed to fetch %s error log\n", + (flags == XEN_MC_URGENT) ?
+ "urgnet" : "nonurgent"); + break; + } + + if (mc_op.u.mc_fetch.flags & XEN_MC_NODATA || + mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED) + break; + else { + ret = convert_log(&g_mi); + if (ret) + pr_warning(XEN_MCELOG + "Failed to convert this error log, " + "continue acking it anyway\n"); + + mc_op.u.mc_fetch.flags = flags | XEN_MC_ACK; + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err(XEN_MCELOG + "Failed to ack previous error log\n"); + break; + } + } + } while (1); + + return ret; +} + +/* virq handler for machine check error info*/ +static irqreturn_t xen_mce_interrupt(int irq, void *dev_id) +{ + int err; + unsigned long tmp; + + spin_lock_irqsave(&mcelog_lock, tmp); + + /* urgent mc_info */ + err = mc_queue_handle(XEN_MC_URGENT); + if (err) + pr_err(XEN_MCELOG + "Failed to handle urgent mc_info queue, " + "continue handling nonurgent mc_info queue anyway.\n"); + + /* nonurgent mc_info */ + err = mc_queue_handle(XEN_MC_NONURGENT); + if (err) + pr_err(XEN_MCELOG + "Failed to handle nonurgent mc_info queue.\n"); + + spin_unlock_irqrestore(&mcelog_lock, tmp); + + return IRQ_HANDLED; +} + +static int bind_virq_for_mce(void) +{ + int ret; + struct xen_mc mc_op; + + memset(&mc_op, 0, sizeof(struct xen_mc)); + + /* Fetch physical CPU Numbers */ + mc_op.cmd = XEN_MC_physcpuinfo; + mc_op.interface_version = XEN_MCA_INTERFACE_VERSION; + set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err(XEN_MCELOG "Failed to get CPU numbers\n"); + return ret; + } + + /* Fetch each CPU Physical Info for later reference*/ + ncpus = mc_op.u.mc_physcpuinfo.ncpus; + g_physinfo = kcalloc(ncpus, sizeof(struct mcinfo_logical_cpu), + GFP_KERNEL); + if (!g_physinfo) + return -ENOMEM; + set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err(XEN_MCELOG "Failed to get CPU info\n"); + kfree(g_physinfo); + return ret; + } + + ret = bind_virq_to_irqhandler(VIRQ_MCA, 0, + xen_mce_interrupt, 0, "mce", NULL); + if (ret < 0) { + pr_err(XEN_MCELOG "Failed to bind virq\n"); + kfree(g_physinfo); + return ret; + } + + return 0; +} + +static int __init xen_late_init_mcelog(void) +{ + /* Only DOM0 is responsible for MCE logging */ + if (xen_initial_domain()) { + /* register character device /dev/mcelog for xen mcelog */ + if (misc_register(&xen_mce_chrdev_device)) + return -ENODEV; + return bind_virq_for_mce(); + } + + return -ENODEV; +} +device_initcall(xen_late_init_mcelog); diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index 0549d211..e0deeb2 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -35,6 +35,7 @@ #define MPT_MINOR 220 #define MPT2SAS_MINOR 221 #define UINPUT_MINOR 223 +#define MISC_MCELOG_MINOR 227 #define HPET_MINOR 228 #define FUSE_MINOR 229 #define KVM_MINOR 232 diff --git a/include/xen/interface/xen-mca.h b/include/xen/interface/xen-mca.h new file mode 100644 index 0000000..73a4ea7 --- /dev/null +++ b/include/xen/interface/xen-mca.h @@ -0,0 +1,385 @@ +/****************************************************************************** + * arch-x86/mca.h + * Guest OS machine check interface to x86 Xen. + * + * Contributed by Advanced Micro Devices, Inc. 
+ * Author: Christoph Egger + * + * Updated by Intel Corporation + * Author: Liu, Jinsong + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef __XEN_PUBLIC_ARCH_X86_MCA_H__ +#define __XEN_PUBLIC_ARCH_X86_MCA_H__ + +/* Hypercall */ +#define __HYPERVISOR_mca __HYPERVISOR_arch_0 + +#define XEN_MCA_INTERFACE_VERSION 0x01ecc003 + +/* IN: Dom0 calls hypercall to retrieve nonurgent error log entry */ +#define XEN_MC_NONURGENT 0x1 +/* IN: Dom0 calls hypercall to retrieve urgent error log entry */ +#define XEN_MC_URGENT 0x2 +/* IN: Dom0 acknowledges previously-fetched error log entry */ +#define XEN_MC_ACK 0x4 + +/* OUT: All is ok */ +#define XEN_MC_OK 0x0 +/* OUT: Domain could not fetch data. */ +#define XEN_MC_FETCHFAILED 0x1 +/* OUT: There was no machine check data to fetch. */ +#define XEN_MC_NODATA 0x2 + +#ifndef __ASSEMBLY__ +/* vIRQ injected to Dom0 */ +#define VIRQ_MCA VIRQ_ARCH_0 + +/* + * mc_info entry types + * mca machine check info is recorded in mc_info entries. + * when fetching mca info, MC_TYPE_... can be used to distinguish + * different mca info.
+ */ +#define MC_TYPE_GLOBAL 0 +#define MC_TYPE_BANK 1 +#define MC_TYPE_EXTENDED 2 +#define MC_TYPE_RECOVERY 3 + +struct mcinfo_common { + uint16_t type; /* structure type */ + uint16_t size; /* size of this struct in bytes */ +}; + +#define MC_FLAG_CORRECTABLE (1 << 0) +#define MC_FLAG_UNCORRECTABLE (1 << 1) +#define MC_FLAG_RECOVERABLE (1 << 2) +#define MC_FLAG_POLLED (1 << 3) +#define MC_FLAG_RESET (1 << 4) +#define MC_FLAG_CMCI (1 << 5) +#define MC_FLAG_MCE (1 << 6) + +/* contains x86 global mc information */ +struct mcinfo_global { + struct mcinfo_common common; + + uint16_t mc_domid; /* running domain at the time in error */ + uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */ + uint32_t mc_socketid; /* physical socket of the physical core */ + uint16_t mc_coreid; /* physical impacted core */ + uint16_t mc_core_threadid; /* core thread of physical core */ + uint32_t mc_apicid; + uint32_t mc_flags; + uint64_t mc_gstatus; /* global status */ +}; + +/* contains x86 bank mc information */ +struct mcinfo_bank { + struct mcinfo_common common; + + uint16_t mc_bank; /* bank nr */ + uint16_t mc_domid; /* domain referenced by mc_addr if valid */ + uint64_t mc_status; /* bank status */ + uint64_t mc_addr; /* bank address */ + uint64_t mc_misc; + uint64_t mc_ctrl2; + uint64_t mc_tsc; +}; + +struct mcinfo_msr { + uint64_t reg; /* MSR */ + uint64_t value; /* MSR value */ +}; + +/* contains mc information from other or additional mc MSRs */ +struct mcinfo_extended { + struct mcinfo_common common; + uint32_t mc_msrs; /* Number of msr with valid values. */ + /* + * Currently Intel extended MSR (32/64) include all gp registers + * and E(R)FLAGS, E(R)IP, E(R)MISC, up to 11/19 of them might be + * useful at present. So expand this array to 16/32 to leave room. + */ + struct mcinfo_msr mc_msr[sizeof(void *) * 4]; +}; + +/* Recovery Action flags. Giving recovery result information to DOM0 */ + +/* Xen takes successful recovery action, the error is recovered */ +#define REC_ACTION_RECOVERED (0x1 << 0) +/* No action is performed by XEN */ +#define REC_ACTION_NONE (0x1 << 1) +/* It's possible DOM0 might take action ownership in some case */ +#define REC_ACTION_NEED_RESET (0x1 << 2) + +/* + * Different Recovery Action types, if the action is performed successfully, + * REC_ACTION_RECOVERED flag will be returned. + */ + +/* Page Offline Action */ +#define MC_ACTION_PAGE_OFFLINE (0x1 << 0) +/* CPU offline Action */ +#define MC_ACTION_CPU_OFFLINE (0x1 << 1) +/* L3 cache disable Action */ +#define MC_ACTION_CACHE_SHRINK (0x1 << 2) + +/* + * Below interface used between XEN/DOM0 for passing XEN's recovery action + * information to DOM0. 
+ */ +struct page_offline_action { + /* Params for passing the offlined page number to DOM0 */ + uint64_t mfn; + uint64_t status; +}; + +struct cpu_offline_action { + /* Params for passing the identity of the offlined CPU to DOM0 */ + uint32_t mc_socketid; + uint16_t mc_coreid; + uint16_t mc_core_threadid; +}; + +#define MAX_UNION_SIZE 16 +struct mcinfo_recovery { + struct mcinfo_common common; + uint16_t mc_bank; /* bank nr */ + uint8_t action_flags; + uint8_t action_types; + union { + struct page_offline_action page_retire; + struct cpu_offline_action cpu_offline; + uint8_t pad[MAX_UNION_SIZE]; + } action_info; +}; + + +#define MCINFO_MAXSIZE 768 +struct mc_info { + /* Number of mcinfo_* entries in mi_data */ + uint32_t mi_nentries; + uint32_t flags; + uint64_t mi_data[(MCINFO_MAXSIZE - 1) / 8]; +}; +DEFINE_GUEST_HANDLE_STRUCT(mc_info); + +#define __MC_MSR_ARRAYSIZE 8 +#define __MC_MSR_MCGCAP 0 +#define __MC_NMSRS 1 +#define MC_NCAPS 7 +struct mcinfo_logical_cpu { + uint32_t mc_cpunr; + uint32_t mc_chipid; + uint16_t mc_coreid; + uint16_t mc_threadid; + uint32_t mc_apicid; + uint32_t mc_clusterid; + uint32_t mc_ncores; + uint32_t mc_ncores_active; + uint32_t mc_nthreads; + uint32_t mc_cpuid_level; + uint32_t mc_family; + uint32_t mc_vendor; + uint32_t mc_model; + uint32_t mc_step; + char mc_vendorid[16]; + char mc_brandid[64]; + uint32_t mc_cpu_caps[MC_NCAPS]; + uint32_t mc_cache_size; + uint32_t mc_cache_alignment; + uint32_t mc_nmsrvals; + struct mcinfo_msr mc_msrvalues[__MC_MSR_ARRAYSIZE]; +}; +DEFINE_GUEST_HANDLE_STRUCT(mcinfo_logical_cpu); + +/* + * Prototype: + * uint32_t x86_mcinfo_nentries(struct mc_info *mi); + */ +#define x86_mcinfo_nentries(_mi) \ + ((_mi)->mi_nentries) +/* + * Prototype: + * struct mcinfo_common *x86_mcinfo_first(struct mc_info *mi); + */ +#define x86_mcinfo_first(_mi) \ + ((struct mcinfo_common *)(_mi)->mi_data) +/* + * Prototype: + * struct mcinfo_common *x86_mcinfo_next(struct mcinfo_common *mic); + */ +#define x86_mcinfo_next(_mic) \ + ((struct mcinfo_common *)((uint8_t *)(_mic) + (_mic)->size)) + +/* + * Prototype: + * void x86_mcinfo_lookup(void *ret, struct mc_info *mi, uint16_t type); + */ +static inline void x86_mcinfo_lookup(struct mcinfo_common **ret, + struct mc_info *mi, uint16_t type) +{ + uint32_t i; + struct mcinfo_common *mic; + bool found = 0; + + if (!ret || !mi) + return; + + mic = x86_mcinfo_first(mi); + for (i = 0; i < x86_mcinfo_nentries(mi); i++) { + if (mic->type == type) { + found = 1; + break; + } + mic = x86_mcinfo_next(mic); + } + + *ret = found ? mic : NULL; +} + +/* + * Fetch machine check data from hypervisor. + */ +#define XEN_MC_fetch 1 +struct xen_mc_fetch { + /* + * IN: XEN_MC_NONURGENT, XEN_MC_URGENT, + * XEN_MC_ACK if ack'ing an earlier fetch + * OUT: XEN_MC_OK, XEN_MC_FETCHFAILED, XEN_MC_NODATA + */ + uint32_t flags; + uint32_t _pad0; + /* OUT: id for ack, IN: id we are ack'ing */ + uint64_t fetch_id; + + /* OUT variables.
*/ + GUEST_HANDLE(mc_info) data; +}; +DEFINE_GUEST_HANDLE_STRUCT(xen_mc_fetch); + + +/* + * This tells the hypervisor to notify a DomU about the machine check error + */ +#define XEN_MC_notifydomain 2 +struct xen_mc_notifydomain { + /* IN variables */ + uint16_t mc_domid; /* The unprivileged domain to notify */ + uint16_t mc_vcpuid; /* The vcpu in mc_domid to notify */ + + /* IN/OUT variables */ + uint32_t flags; +}; +DEFINE_GUEST_HANDLE_STRUCT(xen_mc_notifydomain); + +#define XEN_MC_physcpuinfo 3 +struct xen_mc_physcpuinfo { + /* IN/OUT */ + uint32_t ncpus; + uint32_t _pad0; + /* OUT */ + GUEST_HANDLE(mcinfo_logical_cpu) info; +}; + +#define XEN_MC_msrinject 4 +#define MC_MSRINJ_MAXMSRS 8 +struct xen_mc_msrinject { + /* IN */ + uint32_t mcinj_cpunr; /* target processor id */ + uint32_t mcinj_flags; /* see MC_MSRINJ_F_* below */ + uint32_t mcinj_count; /* 0 .. count-1 in array are valid */ + uint32_t _pad0; + struct mcinfo_msr mcinj_msr[MC_MSRINJ_MAXMSRS]; +}; + +/* Flags for mcinj_flags above; bits 16-31 are reserved */ +#define MC_MSRINJ_F_INTERPOSE 0x1 + +#define XEN_MC_mceinject 5 +struct xen_mc_mceinject { + unsigned int mceinj_cpunr; /* target processor id */ +}; + +struct xen_mc { + uint32_t cmd; + uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */ + union { + struct xen_mc_fetch mc_fetch; + struct xen_mc_notifydomain mc_notifydomain; + struct xen_mc_physcpuinfo mc_physcpuinfo; + struct xen_mc_msrinject mc_msrinject; + struct xen_mc_mceinject mc_mceinject; + } u; +}; +DEFINE_GUEST_HANDLE_STRUCT(xen_mc); + +/* Fields are zero when not available */ +struct xen_mce { + __u64 status; + __u64 misc; + __u64 addr; + __u64 mcgstatus; + __u64 ip; + __u64 tsc; /* cpu time stamp counter */ + __u64 time; /* wall time_t when error was detected */ + __u8 cpuvendor; /* cpu vendor as encoded in system.h */ + __u8 inject_flags; /* software inject flags */ + __u16 pad; + __u32 cpuid; /* CPUID 1 EAX */ + __u8 cs; /* code segment */ + __u8 bank; /* machine check bank */ + __u8 cpu; /* cpu number; obsolete; use extcpu now */ + __u8 finished; /* entry is valid */ + __u32 extcpu; /* linux cpu number that detected the error */ + __u32 socketid; /* CPU socket ID */ + __u32 apicid; /* CPU initial apic ID */ + __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ +}; + +/* + * This structure contains all data related to the MCE log. Also + * carries a signature to make it easier to find from external + * debugging tools. Each entry is only valid when its finished flag + * is set. + */ + +#define XEN_MCE_LOG_LEN 32 + +struct xen_mce_log { + char signature[12]; /* "MACHINECHECK" */ + unsigned len; /* = XEN_MCE_LOG_LEN */ + unsigned next; + unsigned flags; + unsigned recordlen; /* length of struct xen_mce */ + struct xen_mce entry[XEN_MCE_LOG_LEN]; +}; + +#define XEN_MCE_OVERFLOW 0 /* bit 0 in flags means overflow */ + +#define XEN_MCE_LOG_SIGNATURE "MACHINECHECK" + +#define MCE_GET_RECORD_LEN _IOR('M', 1, int) +#define MCE_GET_LOG_LEN _IOR('M', 2, int) +#define MCE_GETCLEAR_FLAGS _IOR('M', 3, int) + +#endif /* __ASSEMBLY__ */ +#endif /* __XEN_PUBLIC_ARCH_X86_MCA_H__ */ -- cgit v0.10.2 From a8fccdb0617386695a13ec742a61b5c935b63795 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Thu, 7 Jun 2012 19:58:50 +0800 Subject: x86, MCE, AMD: Adjust initcall sequence for xen there are 3 funcs which need to be _initcalled in a logic sequence: 1. xen_late_init_mcelog 2. mcheck_init_device 3. 
threshold_init_device xen_late_init_mcelog must register xen_mce_chrdev_device before native mce_chrdev_device registration if running under the xen platform; mcheck_init_device should be initialized before threshold_init_device to initialize mce_device, otherwise a NULL ptr dereference will cause panic. so we use the following _initcalls 1. device_initcall(xen_late_init_mcelog); 2. device_initcall_sync(mcheck_init_device); 3. late_initcall(threshold_init_device); when running under xen, the initcall order is 1,2,3; on baremetal, we skip 1 and we do only 2 and 3. Acked-and-tested-by: Borislav Petkov Suggested-by: Konrad Rzeszutek Wilk Signed-off-by: Liu, Jinsong Signed-off-by: Konrad Rzeszutek Wilk diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f4873a6..be52744 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -777,4 +777,24 @@ static __init int threshold_init_device(void) return 0; } -device_initcall(threshold_init_device); +/* + * there are 3 funcs which need to be _initcalled in a logic sequence: + * 1. xen_late_init_mcelog + * 2. mcheck_init_device + * 3. threshold_init_device + * + * xen_late_init_mcelog must register xen_mce_chrdev_device before + * native mce_chrdev_device registration if running under the xen platform; + * + * mcheck_init_device should be initialized before threshold_init_device to + * initialize mce_device, otherwise a NULL ptr dereference will cause panic. + * + * so we use the following _initcalls + * 1. device_initcall(xen_late_init_mcelog); + * 2. device_initcall_sync(mcheck_init_device); + * 3. late_initcall(threshold_init_device); + * + * when running under xen, the initcall order is 1,2,3; + * on baremetal, we skip 1 and we do only 2 and 3. + */ +late_initcall(threshold_init_device); -- cgit v0.10.2 From 05e36006adc3046f86f2be8652a22d5f77ebd6ea Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Thu, 7 Jun 2012 20:10:37 +0800 Subject: xen/mce: Register native mce handler as vMCE bounce back point When the Xen hypervisor injects a vMCE into a guest, use the native mce handler to handle it Signed-off-by: Ke, Liping Signed-off-by: Jiang, Yunhong Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Liu, Jinsong Signed-off-by: Konrad Rzeszutek Wilk diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9a63468..0c1ab43 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -626,8 +626,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, /* * Look for known traps using IST, and substitute them * appropriately. The debugger ones are the only ones we care - * about. Xen will handle faults like double_fault and - * machine_check, so we should never see them. Warn if + * about. Xen will handle faults like double_fault, + * so we should never see them. Warn if * there's an unexpected IST-using fault handler. */ if (addr == (unsigned long)debug) @@ -642,7 +642,11 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, return 0; #ifdef CONFIG_X86_MCE } else if (addr == (unsigned long)machine_check) { - return 0; + /* + * when the xen hypervisor injects a vMCE into a guest, + * use the native mce handler to handle it + */ + ; #endif } else { /* Some other trap using IST? */ -- cgit v0.10.2 From f65c9bb3fb725551d3e405f4d092caf24929cebe Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Mon, 11 Jun 2012 20:38:08 +0800 Subject: xen/pcpu: Xen physical cpus online/offline sys interface This patch provides a Xen physical cpu online/offline sysfs interface.
Users can use it for their own purposes, such as power saving: offlining some cpus under a light workload saves power considerably. The basic workflow is that the user onlines/offlines a cpu via the sysfs interface, a hypercall then asks xen to carry it out, and when done xen injects a virq back to dom0, which then syncs the cpu status. Signed-off-by: Jiang, Yunhong Signed-off-by: Liu, Jinsong Signed-off-by: Konrad Rzeszutek Wilk diff --git a/Documentation/ABI/testing/sysfs-devices-system-xen_cpu b/Documentation/ABI/testing/sysfs-devices-system-xen_cpu new file mode 100644 index 0000000..9ca02fb --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-system-xen_cpu @@ -0,0 +1,20 @@ +What: /sys/devices/system/xen_cpu/ +Date: May 2012 +Contact: Liu, Jinsong +Description: + A collection of global/individual Xen physical cpu attributes + + Individual physical cpu attributes are contained in + subdirectories named by Xen's logical cpu number, e.g.: + /sys/devices/system/xen_cpu/xen_cpu#/ + + +What: /sys/devices/system/xen_cpu/xen_cpu#/online +Date: May 2012 +Contact: Liu, Jinsong +Description: + Interface to online/offline Xen physical cpus + + When running under the Xen platform, it provides a user interface + to online/offline physical cpus, except cpu0 due to several + logic restrictions and assumptions. diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index a787029..d80bea5 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -17,6 +17,7 @@ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o obj-$(CONFIG_XEN_PVHVM) += platform-pci.o obj-$(CONFIG_XEN_TMEM) += tmem.o obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o +obj-$(CONFIG_XEN_DOM0) += pcpu.o obj-$(CONFIG_XEN_DOM0) += pci.o acpi.o obj-$(CONFIG_XEN_MCE_LOG) += mcelog.o obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback/ diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c new file mode 100644 index 0000000..067fcfa --- /dev/null +++ b/drivers/xen/pcpu.c @@ -0,0 +1,371 @@ +/****************************************************************************** + * pcpu.c + * Management of physical cpus in dom0: get pcpu info and provide a sys interface + * + * Copyright (c) 2012 Intel Corporation + * Author: Liu, Jinsong + * Author: Jiang, Yunhong + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define XEN_PCPU "xen_cpu: " + +/* + * @cpu_id: Xen physical cpu logic number + * @flags: Xen physical cpu status flag + * - XEN_PCPU_FLAGS_ONLINE: cpu is online + * - XEN_PCPU_FLAGS_INVALID: cpu is not present + */ +struct pcpu { + struct list_head list; + struct device dev; + uint32_t cpu_id; + uint32_t flags; +}; + +static struct bus_type xen_pcpu_subsys = { + .name = "xen_cpu", + .dev_name = "xen_cpu", +}; + +static DEFINE_MUTEX(xen_pcpu_lock); + +static LIST_HEAD(xen_pcpus); + +static int xen_pcpu_down(uint32_t cpu_id) +{ + struct xen_platform_op op = { + .cmd = XENPF_cpu_offline, + .interface_version = XENPF_INTERFACE_VERSION, + .u.cpu_ol.cpuid = cpu_id, + }; + + return HYPERVISOR_dom0_op(&op); +} + +static int xen_pcpu_up(uint32_t cpu_id) +{ + struct xen_platform_op op = { + .cmd = XENPF_cpu_online, + .interface_version = XENPF_INTERFACE_VERSION, + .u.cpu_ol.cpuid = cpu_id, + }; + + return HYPERVISOR_dom0_op(&op); +} + +static ssize_t show_online(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pcpu *cpu = container_of(dev, struct pcpu, dev); + + return sprintf(buf, "%u\n", !!(cpu->flags & XEN_PCPU_FLAGS_ONLINE)); +} + +static ssize_t __ref store_online(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pcpu *pcpu = container_of(dev, struct pcpu, dev); + unsigned long long val; + ssize_t ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (kstrtoull(buf, 0, &val) < 0) + return -EINVAL; + + switch (val) { + case 0: + ret = xen_pcpu_down(pcpu->cpu_id); + break; + case 1: + ret = xen_pcpu_up(pcpu->cpu_id); + break; + default: + ret = -EINVAL; + } + + if (ret >= 0) + ret = count; + return ret; +} +static DEVICE_ATTR(online, S_IRUGO | S_IWUSR, show_online, store_online); + +static bool xen_pcpu_online(uint32_t flags) +{ + return !!(flags & XEN_PCPU_FLAGS_ONLINE); +} + +static void pcpu_online_status(struct xenpf_pcpuinfo *info, + struct pcpu *pcpu) +{ + if (xen_pcpu_online(info->flags) && + !xen_pcpu_online(pcpu->flags)) { + /* the pcpu is onlined */ + pcpu->flags |= XEN_PCPU_FLAGS_ONLINE; + kobject_uevent(&pcpu->dev.kobj, KOBJ_ONLINE); + } else if (!xen_pcpu_online(info->flags) && + xen_pcpu_online(pcpu->flags)) { + /* The pcpu is offlined */ + pcpu->flags &= ~XEN_PCPU_FLAGS_ONLINE; + kobject_uevent(&pcpu->dev.kobj, KOBJ_OFFLINE); + } +} + +static struct pcpu *get_pcpu(uint32_t cpu_id) +{ + struct pcpu *pcpu; + + list_for_each_entry(pcpu, &xen_pcpus, list) { + if (pcpu->cpu_id == cpu_id) + return pcpu; + } + + return NULL; +} + +static void pcpu_release(struct device *dev) +{ + struct pcpu *pcpu = container_of(dev, struct pcpu, dev); + + list_del(&pcpu->list); + kfree(pcpu); +} + +static void unregister_and_remove_pcpu(struct pcpu *pcpu) +{ + struct device *dev; + + if (!pcpu) + return; + + dev = &pcpu->dev; + if (dev->id) + device_remove_file(dev, &dev_attr_online); + + /* pcpu remove would be implicitly done */ + device_unregister(dev); +} + +static int register_pcpu(struct pcpu *pcpu) +{ + struct device *dev; + int err = -EINVAL; + + if (!pcpu) + return err; + + dev = &pcpu->dev; + dev->bus = &xen_pcpu_subsys; 
+ dev->id = pcpu->cpu_id; + dev->release = pcpu_release; + + err = device_register(dev); + if (err) { + pcpu_release(dev); + return err; + } + + /* + * Xen never offlines cpu0 due to several restrictions + * and assumptions, so we basically don't add a sys control + * for it; the user cannot attempt to offline the BSP. + */ + if (dev->id) { + err = device_create_file(dev, &dev_attr_online); + if (err) { + device_unregister(dev); + return err; + } + } + + return 0; +} + +static struct pcpu *create_and_register_pcpu(struct xenpf_pcpuinfo *info) +{ + struct pcpu *pcpu; + int err; + + if (info->flags & XEN_PCPU_FLAGS_INVALID) + return ERR_PTR(-ENODEV); + + pcpu = kzalloc(sizeof(struct pcpu), GFP_KERNEL); + if (!pcpu) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&pcpu->list); + pcpu->cpu_id = info->xen_cpuid; + pcpu->flags = info->flags; + + /* Must hold xen_pcpu_lock before pcpu list manipulations */ + list_add_tail(&pcpu->list, &xen_pcpus); + + err = register_pcpu(pcpu); + if (err) { + pr_warning(XEN_PCPU "Failed to register pcpu%u\n", + info->xen_cpuid); + return ERR_PTR(-ENOENT); + } + + return pcpu; +} + +/* + * Caller should hold the xen_pcpu_lock + */ +static int sync_pcpu(uint32_t cpu, uint32_t *max_cpu) +{ + int ret; + struct pcpu *pcpu = NULL; + struct xenpf_pcpuinfo *info; + struct xen_platform_op op = { + .cmd = XENPF_get_cpuinfo, + .interface_version = XENPF_INTERFACE_VERSION, + .u.pcpu_info.xen_cpuid = cpu, + }; + + ret = HYPERVISOR_dom0_op(&op); + if (ret) + return ret; + + info = &op.u.pcpu_info; + if (max_cpu) + *max_cpu = info->max_present; + + pcpu = get_pcpu(cpu); + + /* + * Only cpus in the cpu present map have a sys interface. + */ + if (info->flags & XEN_PCPU_FLAGS_INVALID) { + if (pcpu) + unregister_and_remove_pcpu(pcpu); + return 0; + } + + if (!pcpu) { + pcpu = create_and_register_pcpu(info); + if (IS_ERR_OR_NULL(pcpu)) + return -ENODEV; + } else + pcpu_online_status(info, pcpu); + + return 0; +} + +/* + * Sync dom0's pcpu information with xen hypervisor's + */ +static int xen_sync_pcpus(void) +{ + /* + * The boot cpu always has cpu_id 0 in xen + */ + uint32_t cpu = 0, max_cpu = 0; + int err = 0; + struct pcpu *pcpu, *tmp; + + mutex_lock(&xen_pcpu_lock); + + while (!err && (cpu <= max_cpu)) { + err = sync_pcpu(cpu, &max_cpu); + cpu++; + } + + if (err) + list_for_each_entry_safe(pcpu, tmp, &xen_pcpus, list) + unregister_and_remove_pcpu(pcpu); + + mutex_unlock(&xen_pcpu_lock); + + return err; +} + +static void xen_pcpu_work_fn(struct work_struct *work) +{ + xen_sync_pcpus(); +} +static DECLARE_WORK(xen_pcpu_work, xen_pcpu_work_fn); + +static irqreturn_t xen_pcpu_interrupt(int irq, void *dev_id) +{ + schedule_work(&xen_pcpu_work); + return IRQ_HANDLED; +} + +static int __init xen_pcpu_init(void) +{ + int irq, ret; + + if (!xen_initial_domain()) + return -ENODEV; + + irq = bind_virq_to_irqhandler(VIRQ_PCPU_STATE, 0, + xen_pcpu_interrupt, 0, + "xen-pcpu", NULL); + if (irq < 0) { + pr_warning(XEN_PCPU "Failed to bind pcpu virq\n"); + return irq; + } + + ret = subsys_system_register(&xen_pcpu_subsys, NULL); + if (ret) { + pr_warning(XEN_PCPU "Failed to register pcpu subsys\n"); + goto err1; + } + + ret = xen_sync_pcpus(); + if (ret) { + pr_warning(XEN_PCPU "Failed to sync pcpu info\n"); + goto err2; + } + + return 0; + +err2: + bus_unregister(&xen_pcpu_subsys); +err1: + unbind_from_irqhandler(irq, NULL); + return ret; +} +arch_initcall(xen_pcpu_init); diff --git a/include/xen/interface/platform.h b/include/xen/interface/platform.h index 486653f..61fa661 100644 ---
a/include/xen/interface/platform.h +++ b/include/xen/interface/platform.h @@ -314,6 +314,13 @@ struct xenpf_pcpuinfo { }; DEFINE_GUEST_HANDLE_STRUCT(xenpf_pcpuinfo); +#define XENPF_cpu_online 56 +#define XENPF_cpu_offline 57 +struct xenpf_cpu_ol { + uint32_t cpuid; +}; +DEFINE_GUEST_HANDLE_STRUCT(xenpf_cpu_ol); + struct xen_platform_op { uint32_t cmd; uint32_t interface_version; /* XENPF_INTERFACE_VERSION */ @@ -330,6 +337,7 @@ struct xen_platform_op { struct xenpf_getidletime getidletime; struct xenpf_set_processor_pminfo set_pminfo; struct xenpf_pcpuinfo pcpu_info; + struct xenpf_cpu_ol cpu_ol; uint8_t pad[128]; } u; }; diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index a890804..0801468 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -80,6 +80,7 @@ #define VIRQ_CONSOLE 2 /* (DOM0) Bytes received on emergency console. */ #define VIRQ_DOM_EXC 3 /* (DOM0) Exceptional event for some domain. */ #define VIRQ_DEBUGGER 6 /* (DOM0) A domain has paused for debugging. */ +#define VIRQ_PCPU_STATE 9 /* (DOM0) PCPU state changed */ /* Architecture-specific VIRQ definitions. */ #define VIRQ_ARCH_0 16 -- cgit v0.10.2 From 1b2a05516e42149a5e9f0f5aeba2c7fa9574b3f4 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Tue, 12 Jun 2012 23:11:16 +0800 Subject: xen/mce: schedule a workqueue to avoid sleep in atomic context copy_to_user might sleep and print a stack trace if it is executed in an atomic spinlock context. Like this: (XEN) CMCI: send CMCI to DOM0 through virq BUG: sleeping function called from invalid context at /home/konradinux/kernel.h:199 in_atomic(): 1, irqs_disabled(): 0, pid: 4581, name: mcelog Pid: 4581, comm: mcelog Tainted: G O 3.5.0-rc1upstream-00003-g149000b-dirty #1 [] __might_sleep+0xda/0x100 [] xen_mce_chrdev_read+0xab/0x140 [] vfs_read+0xc5/0x190 [] sys_read+0x4c/0x90 [] system_call_fastpath+0x16 This patch schedules a workqueue from the IRQ handler to poll the data, and uses a mutex instead of a spinlock, so that copy_to_user no longer sleeps in atomic context. Reported-by: Konrad Rzeszutek Wilk Suggested-by: Konrad Rzeszutek Wilk Signed-off-by: Liu, Jinsong Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/xen/mcelog.c b/drivers/xen/mcelog.c index 72e87d2..804aa3c 100644 --- a/drivers/xen/mcelog.c +++ b/drivers/xen/mcelog.c @@ -55,7 +55,7 @@ static struct mc_info g_mi; static struct mcinfo_logical_cpu *g_physinfo; static uint32_t ncpus; -static DEFINE_SPINLOCK(mcelog_lock); +static DEFINE_MUTEX(mcelog_lock); static struct xen_mce_log xen_mcelog = { .signature = XEN_MCE_LOG_SIGNATURE, @@ -106,7 +106,7 @@ static ssize_t xen_mce_chrdev_read(struct file *filp, char __user *ubuf, unsigned num; int i, err; - spin_lock(&mcelog_lock); + mutex_lock(&mcelog_lock); num = xen_mcelog.next; @@ -130,7 +130,7 @@ static ssize_t xen_mce_chrdev_read(struct file *filp, char __user *ubuf, err = -EFAULT; out: - spin_unlock(&mcelog_lock); + mutex_unlock(&mcelog_lock); return err ?
err : buf - ubuf; } @@ -310,12 +310,11 @@ static int mc_queue_handle(uint32_t flags) } /* virq handler for machine check error info */ -static irqreturn_t xen_mce_interrupt(int irq, void *dev_id) +static void xen_mce_work_fn(struct work_struct *work) { int err; - unsigned long tmp; - spin_lock_irqsave(&mcelog_lock, tmp); + mutex_lock(&mcelog_lock); /* urgent mc_info */ err = mc_queue_handle(XEN_MC_URGENT); @@ -330,8 +329,13 @@ static irqreturn_t xen_mce_interrupt(int irq, void *dev_id) pr_err(XEN_MCELOG "Failed to handle nonurgent mc_info queue.\n"); - spin_unlock_irqrestore(&mcelog_lock, tmp); + mutex_unlock(&mcelog_lock); +} +static DECLARE_WORK(xen_mce_work, xen_mce_work_fn); +static irqreturn_t xen_mce_interrupt(int irq, void *dev_id) +{ + schedule_work(&xen_mce_work); return IRQ_HANDLED; } -- cgit v0.10.2 From a867e5d6b362fc2982f19c5223f1946292643e57 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Fri, 15 Jun 2012 09:03:39 +0800 Subject: xen/mce: add .poll method for mcelog device driver If a driver leaves its poll method NULL, the device is assumed to be both readable and writable without blocking. This patch adds a .poll method to the xen mcelog device driver, so that when mcelog uses system calls like ppoll or select, it blocks when no data is available and avoids spinning on the CPU. Reported-by: Konrad Rzeszutek Wilk Signed-off-by: Liu, Jinsong Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/xen/mcelog.c b/drivers/xen/mcelog.c index 804aa3c..8feee08 100644 --- a/drivers/xen/mcelog.c +++ b/drivers/xen/mcelog.c @@ -41,6 +41,8 @@ #include #include #include +#include +#include #include #include @@ -67,6 +69,8 @@ static DEFINE_SPINLOCK(xen_mce_chrdev_state_lock); static int xen_mce_chrdev_open_count; /* #times opened */ static int xen_mce_chrdev_open_exclu; /* already open exclusive? */ +static DECLARE_WAIT_QUEUE_HEAD(xen_mce_chrdev_wait); + static int xen_mce_chrdev_open(struct inode *inode, struct file *file) { spin_lock(&xen_mce_chrdev_state_lock); @@ -135,6 +139,16 @@ out: return err ? err : buf - ubuf; } +static unsigned int xen_mce_chrdev_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &xen_mce_chrdev_wait, wait); + + if (xen_mcelog.next) + return POLLIN | POLLRDNORM; + + return 0; +} + static long xen_mce_chrdev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) { @@ -166,6 +180,7 @@ static const struct file_operations xen_mce_chrdev_ops = { .open = xen_mce_chrdev_open, .release = xen_mce_chrdev_release, .read = xen_mce_chrdev_read, + .poll = xen_mce_chrdev_poll, .unlocked_ioctl = xen_mce_chrdev_ioctl, .llseek = no_llseek, }; @@ -329,6 +344,9 @@ static void xen_mce_work_fn(struct work_struct *work) pr_err(XEN_MCELOG "Failed to handle nonurgent mc_info queue.\n"); + /* wake processes polling /dev/mcelog */ + wake_up_interruptible(&xen_mce_chrdev_wait); + mutex_unlock(&mcelog_lock); } static DECLARE_WORK(xen_mce_work, xen_mce_work_fn); -- cgit v0.10.2 From 17f9b896b06d314da890174584278dea8da7e0ce Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 19 Jun 2012 14:39:31 -0400 Subject: xen/acpi: Fix potential memory leak. Coverity points out that in one case we do not free pr_backup - and sure enough we forgot.
Found by Coverity (CID 401970) Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c index 7ff2569..b590ee0 100644 --- a/drivers/xen/xen-acpi-processor.c +++ b/drivers/xen/xen-acpi-processor.c @@ -520,15 +520,18 @@ static int __init xen_acpi_processor_init(void) if (!pr_backup) { pr_backup = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); - memcpy(pr_backup, _pr, sizeof(struct acpi_processor)); + if (pr_backup) + memcpy(pr_backup, _pr, sizeof(struct acpi_processor)); } (void)upload_pm_data(_pr); } rc = check_acpi_ids(pr_backup); - if (rc) - goto err_unregister; kfree(pr_backup); + pr_backup = NULL; + + if (rc) + goto err_unregister; return 0; err_unregister: -- cgit v0.10.2 From 37a80bf560786d96c5e8370bff45d867e43fd5c3 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 26 Jun 2012 09:30:51 -0400 Subject: xen/hvc: Fix up checks when the info is allocated. Coverity would complain about this - even though it looks OK. CID 401957 Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c index 944eaeb..1e456dc 100644 --- a/drivers/tty/hvc/hvc_xen.c +++ b/drivers/tty/hvc/hvc_xen.c @@ -209,11 +209,10 @@ static int xen_hvm_console_init(void) info = kzalloc(sizeof(struct xencons_info), GFP_KERNEL | __GFP_ZERO); if (!info) return -ENOMEM; - } - - /* already configured */ - if (info->intf != NULL) + } else if (info->intf != NULL) { + /* already configured */ return 0; + } /* * If the toolstack (or the hypervisor) hasn't set these values, the * default value is 0. Even though mfn = 0 and evtchn = 0 are @@ -259,12 +258,10 @@ static int xen_pv_console_init(void) info = kzalloc(sizeof(struct xencons_info), GFP_KERNEL | __GFP_ZERO); if (!info) return -ENOMEM; - } - - /* already configured */ - if (info->intf != NULL) + } else if (info->intf != NULL) { + /* already configured */ return 0; - + } info->evtchn = xen_start_info->console.domU.evtchn; info->intf = mfn_to_virt(xen_start_info->console.domU.mfn); info->vtermno = HVC_COOKIE; -- cgit v0.10.2 From d095d43e78dd811d5c02c25e207c3364019b5a77 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 9 Jul 2012 11:39:05 +0100 Subject: xen/mm: do direct hypercall in xen_set_pte() if batching is unavailable In xen_set_pte() if batching is unavailable (because the caller is in an interrupt context such as handling a page fault) it would fall back to using native_set_pte() and trapping and emulating the PTE write. On 32-bit guests this requires two traps for each PTE write (one for each dword of the PTE). Instead, do one mmu_update hypercall directly. During construction of the initial page tables, continue to use native_set_pte() because most of the PTEs being set are in writable and unpinned pages (see phys_pmd_init() in arch/x86/mm/init_64.c) and using a hypercall for this is very expensive. This significantly improves page fault performance in 32-bit PV guests.
lmbench3 test   Before    After     Improvement
----------------------------------------------
lat_pagefault   3.18 us   2.32 us   27%
lat_proc fork   356 us    313.3 us  11%
Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3a73785..3f1783a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -308,8 +308,20 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) { - if (!xen_batched_set_pte(ptep, pteval)) - native_set_pte(ptep, pteval); + if (!xen_batched_set_pte(ptep, pteval)) { + /* + * Could call native_set_pte() here and trap and + * emulate the PTE write but with 32-bit guests this + * needs two traps (one for each of the two 32-bit + * words in the PTE) so do one hypercall directly + * instead. + */ + struct mmu_update u; + + u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; + u.val = pte_val_ma(pteval); + HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF); + } } static void xen_set_pte(pte_t *ptep, pte_t pteval) @@ -1416,13 +1428,21 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) } #endif /* CONFIG_X86_64 */ -/* Init-time set_pte while constructing initial pagetables, which - doesn't allow RO pagetable pages to be remapped RW */ +/* + * Init-time set_pte while constructing initial pagetables, which + * doesn't allow RO page table pages to be remapped RW. + * + * Many of these PTE updates are done on unpinned and writable pages + * and doing a hypercall for these is unnecessary and expensive. At + * this point it is not possible to tell if a page is pinned or not, + * so always write the PTE directly and rely on Xen trapping and + * emulating any updates as necessary. + */ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) { pte = mask_rw_pte(ptep, pte); - xen_set_pte(ptep, pte); + native_set_pte(ptep, pte); } static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) -- cgit v0.10.2 From 66a27dde9ae96e35278983f2e59bea04eb714cd0 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 9 Jul 2012 11:39:06 +0100 Subject: xen/mm: zero PTEs for non-present MFNs in the initial page table When constructing the initial page tables, if the MFN for a usable PFN is missing in the p2m then that frame is initially ballooned out. In this case, zero the PTE (as in decrease_reservation() in drivers/xen/balloon.c). This is obviously safe instead of having a valid PTE with an MFN of INVALID_P2M_ENTRY (~0). Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3f1783a..27336df 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1432,6 +1432,10 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) * Init-time set_pte while constructing initial pagetables, which * doesn't allow RO page table pages to be remapped RW. + * + * If there is no MFN for this PFN then this page is initially + * ballooned out so clear the PTE (as in decrease_reservation() in + * drivers/xen/balloon.c). + * * Many of these PTE updates are done on unpinned and writable pages + * and doing a hypercall for these is unnecessary and expensive.
At * this point it is not possible to tell if a page is pinned or not, @@ -1440,7 +1444,10 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) */ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) { - pte = mask_rw_pte(ptep, pte); + if (pte_mfn(pte) != INVALID_P2M_ENTRY) + pte = mask_rw_pte(ptep, pte); + else + pte = __pte_ma(0); native_set_pte(ptep, pte); } -- cgit v0.10.2 From 59290362da587ce2e6d2f4a7f85e362fa2d7fd39 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 9 Jul 2012 11:39:07 +0100 Subject: xen/x86: add desc_equal() to compare GDT descriptors Signed-off-by: David Vrabel [v1: Moving it to the Xen file] Signed-off-by: Konrad Rzeszutek Wilk diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0c1ab43..225ffdc 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -539,6 +539,12 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) BUG(); } +static inline bool desc_equal(const struct desc_struct *d1, + const struct desc_struct *d2) +{ + return d1->a == d2->a && d1->b == d2->b; +} + static void load_TLS_descriptor(struct thread_struct *t, unsigned int cpu, unsigned int i) { -- cgit v0.10.2 From 1c32cdc633c96a14cca35de9db19338354c25111 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 9 Jul 2012 11:39:08 +0100 Subject: xen/x86: avoid updating TLS descriptors if they haven't changed When switching tasks in a Xen PV guest, avoid updating the TLS descriptors if they haven't changed. This improves the speed of context switches by almost 10% as much of the time the descriptors are the same or only one is different. The descriptors written into the GDT by Xen are modified from the values passed in the update_descriptor hypercall so we keep shadow copies of the three TLS descriptors to compare against. lmbench3 test Before After Improvement -------------------------------------------- lat_ctx -s 32 24 7.19 6.52 9% lat_pipe 12.56 11.66 7% Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 225ffdc..9c80869 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -125,6 +125,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; */ static int have_vcpu_info_placement = 1; +struct tls_descs { + struct desc_struct desc[3]; +}; + +/* + * Updating the 3 TLS descriptors in the GDT on every task switch is + * surprisingly expensive so we avoid updating them if they haven't + * changed. Since Xen writes different descriptors than the one + * passed in the update_descriptor hypercall we keep shadow copies to + * compare against. 
+ */ +static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); + static void clamp_max_cpus(void) { #ifdef CONFIG_SMP @@ -548,9 +561,19 @@ static inline bool desc_equal(const struct desc_struct *d1, static void load_TLS_descriptor(struct thread_struct *t, unsigned int cpu, unsigned int i) { - struct desc_struct *gdt = get_cpu_gdt_table(cpu); - xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); - struct multicall_space mc = __xen_mc_entry(0); + struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i]; + struct desc_struct *gdt; + xmaddr_t maddr; + struct multicall_space mc; + + if (desc_equal(shadow, &t->tls_array[i])) + return; + + *shadow = t->tls_array[i]; + + gdt = get_cpu_gdt_table(cpu); + maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); + mc = __xen_mc_entry(0); MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); } -- cgit v0.10.2 From 254d1a3f02ebc10ccc6e4903394d8d3f484f715e Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Tue, 10 Jul 2012 14:50:03 +0200 Subject: xen/pv-on-hvm kexec: shutdown watches from old kernel Add an xs_reset_watches function to shut down watches from the old kernel after a kexec boot. The old kernel does not unregister all watches in the shutdown path. They are still active, and the double registration cannot be detected by the new kernel. When the watches fire, unexpected events will arrive and the xenwatch thread will crash (jumps to NULL). An orderly reboot of an hvm guest will destroy the entire guest with all its resources (including the watches) before it is rebuilt from scratch, so the missing unregister is not an issue in that case. With this change the xenstored is instructed to wipe all active watches for the guest. However, a patch for xenstored is required so that it accepts the XS_RESET_WATCHES request from a client (see changeset 23839:42a45baf037d in xen-unstable.hg). Without the patch for xenstored the registration of watches will fail and some features of a PVonHVM guest are not available. The guest is still able to boot, but repeated kexec boots will fail. Signed-off-by: Olaf Hering Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index d1c217b..bce15cf 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -618,6 +618,23 @@ static struct xenbus_watch *find_watch(const char *token) return NULL; } +static void xs_reset_watches(void) +{ + int err, supported = 0; + + if (!xen_hvm_domain()) + return; + + err = xenbus_scanf(XBT_NIL, "control", + "platform-feature-xs_reset_watches", "%d", &supported); + if (err != 1 || !supported) + return; + + err = xs_error(xs_single(XBT_NIL, XS_RESET_WATCHES, "", NULL)); + if (err && err != -EEXIST) + printk(KERN_WARNING "xs_reset_watches failed: %d\n", err); +} + /* Register callback to watch this node.
*/ int register_xenbus_watch(struct xenbus_watch *watch) { @@ -900,5 +917,8 @@ int xs_init(void) if (IS_ERR(task)) return PTR_ERR(task); + /* shutdown watches for kexec boot */ + xs_reset_watches(); + return 0; } diff --git a/include/xen/interface/io/xs_wire.h b/include/xen/interface/io/xs_wire.h index 7cdfca2..794deb0 100644 --- a/include/xen/interface/io/xs_wire.h +++ b/include/xen/interface/io/xs_wire.h @@ -29,7 +29,8 @@ enum xsd_sockmsg_type XS_IS_DOMAIN_INTRODUCED, XS_RESUME, XS_SET_TARGET, - XS_RESTRICT + XS_RESTRICT, + XS_RESET_WATCHES, }; #define XS_WRITE_NONE "NONE" -- cgit v0.10.2 From 38ad4f4b6cc713b3c42cb4252688ef5c296d7455 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Tue, 10 Jul 2012 15:31:39 +0200 Subject: xen: enable platform-pci only in a Xen guest While debugging kexec issues in a PVonHVM guest, I modified xen_hvm_platform() to return false to disable all PV drivers. This caused a crash in platform_pci_init() because it expects certain data structures to be initialized properly. To avoid such a crash, make sure the driver is initialized only if running in a Xen guest. Signed-off-by: Olaf Hering Signed-off-by: Konrad Rzeszutek Wilk diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index 2389e58..97ca359 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -109,6 +109,9 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, long mmio_addr, mmio_len; unsigned int max_nr_gframes; + if (!xen_domain()) + return -ENODEV; + i = pci_enable_device(pdev); if (i) return i; -- cgit v0.10.2 From 4648da7cb4079e263eaf4dcd3b10fdb2409d4ad6 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Tue, 17 Jul 2012 11:57:41 +0200 Subject: xen: remove cast from HYPERVISOR_shared_info assignment Both have type struct shared_info, so no cast is needed. Signed-off-by: Olaf Hering Signed-off-by: Konrad Rzeszutek Wilk diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9c80869..affa94b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -108,7 +108,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback); * Point at some empty memory to start with. We map the real shared_info * page as soon as fixmap is up and running. */ -struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; +struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info; /* * Flag to determine whether vcpu info placement is available on all -- cgit v0.10.2 From 4ff2d06255461390ad685843d0d7364aaa6642d2 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Tue, 17 Jul 2012 11:59:15 +0200 Subject: xen: simplify init_hvm_pv_info init_hvm_pv_info is called only in a PVonHVM context, so move it inside the #ifdef. init_hvm_pv_info does not fail, so make it a void function. Remove the arguments from init_hvm_pv_info because they are not used by the caller.
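As an aside, the environment probing that init_hvm_pv_info builds on can be demonstrated outside the kernel. The stand-alone sketch below is illustrative only: xen_cpuid_base_demo is a made-up counterpart of the kernel's xen_cpuid_base(), and the program assumes an x86 toolchain that provides GCC's cpuid.h. It scans the hypervisor CPUID leaves for Xen's "XenVMMXenVMM" signature and reads the version from leaf base + 1, the same information init_hvm_pv_info consumes.

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <cpuid.h>	/* GCC/Clang cpuid helpers */

/* Made-up demo counterpart of the kernel's xen_cpuid_base(). */
static uint32_t xen_cpuid_base_demo(void)
{
	uint32_t eax, ebx, ecx, edx, base;
	char sig[13];

	/* Hypervisor CPUID leaves start at 0x40000000, spaced 0x100 apart. */
	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
		__cpuid(base, eax, ebx, ecx, edx);
		memcpy(sig + 0, &ebx, 4);
		memcpy(sig + 4, &ecx, 4);
		memcpy(sig + 8, &edx, 4);
		sig[12] = '\0';
		/* Xen advertises "XenVMMXenVMM" and at least two more leaves. */
		if (!strcmp(sig, "XenVMMXenVMM") && eax >= base + 2)
			return base;
	}
	return 0;
}

int main(void)
{
	uint32_t eax, ebx, ecx, edx;
	uint32_t base = xen_cpuid_base_demo();

	if (!base) {
		puts("not running on Xen");
		return 1;
	}
	/* Leaf base + 1 carries the Xen major/minor version. */
	__cpuid(base + 1, eax, ebx, ecx, edx);
	printf("Xen version %u.%u\n", eax >> 16, eax & 0xffff);
	return 0;
}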
Signed-off-by: Olaf Hering Signed-off-by: Konrad Rzeszutek Wilk diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index affa94b..f1814fc 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1471,32 +1471,6 @@ asmlinkage void __init xen_start_kernel(void) #endif } -static int init_hvm_pv_info(int *major, int *minor) -{ - uint32_t eax, ebx, ecx, edx, pages, msr, base; - u64 pfn; - - base = xen_cpuid_base(); - cpuid(base + 1, &eax, &ebx, &ecx, &edx); - - *major = eax >> 16; - *minor = eax & 0xffff; - printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor); - - cpuid(base + 2, &pages, &msr, &ecx, &edx); - - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - - xen_setup_features(); - - pv_info.name = "Xen HVM"; - - xen_domain_type = XEN_HVM_DOMAIN; - - return 0; -} - void __ref xen_hvm_init_shared_info(void) { int cpu; @@ -1529,6 +1503,31 @@ void __ref xen_hvm_init_shared_info(void) } #ifdef CONFIG_XEN_PVHVM +static void __init init_hvm_pv_info(void) +{ + int major, minor; + uint32_t eax, ebx, ecx, edx, pages, msr, base; + u64 pfn; + + base = xen_cpuid_base(); + cpuid(base + 1, &eax, &ebx, &ecx, &edx); + + major = eax >> 16; + minor = eax & 0xffff; + printk(KERN_INFO "Xen version %d.%d.\n", major, minor); + + cpuid(base + 2, &pages, &msr, &ecx, &edx); + + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + + xen_setup_features(); + + pv_info.name = "Xen HVM"; + + xen_domain_type = XEN_HVM_DOMAIN; +} + static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -1551,12 +1550,7 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = { static void __init xen_hvm_guest_init(void) { - int r; - int major, minor; - - r = init_hvm_pv_info(&major, &minor); - if (r < 0) - return; + init_hvm_pv_info(); xen_hvm_init_shared_info(); -- cgit v0.10.2 From 00e37bdb0113a98408de42db85be002f21dbffd3 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Tue, 17 Jul 2012 17:43:35 +0200 Subject: xen PVonHVM: move shared_info to MMIO before kexec Currently kexec in a PVonHVM guest fails with a triple fault because the new kernel overwrites the shared info page. The exact failure depends on the size of the kernel image. This patch moves the pfn from RAM into MMIO space before the kexec boot. The pfn containing the shared_info is located somewhere in RAM. This will cause trouble if the current kernel is doing a kexec boot into a new kernel. The new kernel (and its startup code) cannot know where the pfn is, so it cannot reserve the page. The hypervisor will continue to update the pfn, and as a result memory corruption occurs in the new kernel. One way to work around this issue is to allocate a page in the xen-platform pci device's BAR memory range. But pci init is done very late and the shared_info page is already in use very early to read the pvclock. So moving the pfn from RAM to MMIO is racy because some code paths on other vcpus could access the pfn during the small window when the old pfn is moved to the new pfn. There is even a small window where the old pfn is not backed by a mfn, and during that time all reads return -1. Because it is not known upfront where the MMIO region is located, it cannot be used right from the start in xen_hvm_init_shared_info. To minimise trouble, the move of the pfn is done shortly before kexec. This does not eliminate the race because all vcpus are still online when the syscore_ops are called.
But hopefully there is no work pending at this point in time. Also, the syscore_op is run last, which reduces the risk further. Signed-off-by: Olaf Hering Signed-off-by: Konrad Rzeszutek Wilk diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f1814fc..a6f8acb 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -1471,38 +1472,130 @@ asmlinkage void __init xen_start_kernel(void) #endif } -void __ref xen_hvm_init_shared_info(void) +#ifdef CONFIG_XEN_PVHVM +/* + * The pfn containing the shared_info is located somewhere in RAM. This + * will cause trouble if the current kernel is doing a kexec boot into a + * new kernel. The new kernel (and its startup code) cannot know where + * the pfn is, so it cannot reserve the page. The hypervisor will + * continue to update the pfn, and as a result memory corruption occurs + * in the new kernel. + * + * One way to work around this issue is to allocate a page in the + * xen-platform pci device's BAR memory range. But pci init is done very + * late and the shared_info page is already in use very early to read + * the pvclock. So moving the pfn from RAM to MMIO is racy because some + * code paths on other vcpus could access the pfn during the small + * window when the old pfn is moved to the new pfn. There is even a + * small window where the old pfn is not backed by a mfn, and during that + * time all reads return -1. + * + * Because it is not known upfront where the MMIO region is located, it + * cannot be used right from the start in xen_hvm_init_shared_info. + * + * To minimise trouble, the move of the pfn is done shortly before kexec. + * This does not eliminate the race because all vcpus are still online + * when the syscore_ops are called. But hopefully there is no work + * pending at this point in time. Also, the syscore_op is run last, which + * reduces the risk further. + */ + +static struct shared_info *xen_hvm_shared_info; + +static void xen_hvm_connect_shared_info(unsigned long pfn) { - int cpu; struct xen_add_to_physmap xatp; - static struct shared_info *shared_info_page = 0; - if (!shared_info_page) - shared_info_page = (struct shared_info *) - extend_brk(PAGE_SIZE, PAGE_SIZE); xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; - xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; + xatp.gpfn = pfn; if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) BUG(); - HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; +} +static void xen_hvm_set_shared_info(struct shared_info *sip) +{ + int cpu; + + HYPERVISOR_shared_info = sip; /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info * page, we use it in the event channel upcall and in some pvclock * related functions. We don't need the vcpu_info placement * optimizations because we don't use any pv_mmu or pv_irq op on * HVM. - * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is - * online but xen_hvm_init_shared_info is run at resume time too and + * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is + * online but xen_hvm_set_shared_info is run at resume time too and + * in that case multiple vcpus might be online.
*/ for_each_online_cpu(cpu) { per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; } } -#ifdef CONFIG_XEN_PVHVM +/* Reconnect the shared_info pfn to a mfn */ +void xen_hvm_resume_shared_info(void) +{ + xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT); +} + +#ifdef CONFIG_KEXEC +static struct shared_info *xen_hvm_shared_info_kexec; +static unsigned long xen_hvm_shared_info_pfn_kexec; + +/* Remember a pfn in MMIO space for kexec reboot */ +void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn) +{ + xen_hvm_shared_info_kexec = sip; + xen_hvm_shared_info_pfn_kexec = pfn; +} + +static void xen_hvm_syscore_shutdown(void) +{ + struct xen_memory_reservation reservation = { + .domid = DOMID_SELF, + .nr_extents = 1, + }; + unsigned long prev_pfn; + int rc; + + if (!xen_hvm_shared_info_kexec) + return; + + prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT; + set_xen_guest_handle(reservation.extent_start, &prev_pfn); + + /* Move pfn to MMIO, disconnects previous pfn from mfn */ + xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec); + + /* Update pointers, following hypercall is also a memory barrier */ + xen_hvm_set_shared_info(xen_hvm_shared_info_kexec); + + /* Allocate new mfn for previous pfn */ + do { + rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); + if (rc == 0) + msleep(123); + } while (rc == 0); + + /* Make sure the previous pfn is really connected to a (new) mfn */ + BUG_ON(rc != 1); +} + +static struct syscore_ops xen_hvm_syscore_ops = { + .shutdown = xen_hvm_syscore_shutdown, +}; +#endif + +/* Use a pfn in RAM, may move to MMIO before kexec. */ +static void __init xen_hvm_init_shared_info(void) +{ + /* Remember pointer for resume */ + xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE); + xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT); + xen_hvm_set_shared_info(xen_hvm_shared_info); +} + static void __init init_hvm_pv_info(void) { int major, minor; @@ -1553,6 +1646,9 @@ static void __init xen_hvm_guest_init(void) init_hvm_pv_info(); xen_hvm_init_shared_info(); +#ifdef CONFIG_KEXEC + register_syscore_ops(&xen_hvm_syscore_ops); +#endif if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 45329c8..ae8a00c 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled) { #ifdef CONFIG_XEN_PVHVM int cpu; - xen_hvm_init_shared_info(); + xen_hvm_resume_shared_info(); xen_callback_vector(); xen_unplug_emulated_devices(); if (xen_feature(XENFEAT_hvm_safe_pvclock)) { diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 202d4c1..1e4329e 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -41,7 +41,7 @@ void xen_enable_syscall(void); void xen_vcpu_restore(void); void xen_callback_vector(void); -void xen_hvm_init_shared_info(void); +void xen_hvm_resume_shared_info(void); void xen_unplug_emulated_devices(void); void __init xen_build_dynamic_phys_to_machine(void); diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index 97ca359..d4c50d6 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -101,6 +101,19 @@ static int platform_pci_resume(struct pci_dev *pdev) return 0; } +static void __devinit prepare_shared_info(void) +{ +#ifdef CONFIG_KEXEC + unsigned long addr; + struct shared_info *hvm_shared_info; + + addr = alloc_xen_mmio(PAGE_SIZE); + hvm_shared_info = 
ioremap(addr, PAGE_SIZE); + memset(hvm_shared_info, 0, PAGE_SIZE); + xen_hvm_prepare_kexec(hvm_shared_info, addr >> PAGE_SHIFT); +#endif +} + static int __devinit platform_pci_init(struct pci_dev *pdev, const struct pci_device_id *ent) { @@ -138,6 +151,8 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, platform_mmio = mmio_addr; platform_mmiolen = mmio_len; + prepare_shared_info(); + if (!xen_have_vector_callback) { ret = xen_allocate_irq(pdev); if (ret) { diff --git a/include/xen/events.h b/include/xen/events.h index 04399b2..9c641de 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -58,6 +58,8 @@ void notify_remote_via_irq(int irq); void xen_irq_resume(void); +void xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn); + /* Clear an irq's pending state, in preparation for polling on it */ void xen_clear_irq_pending(int irq); void xen_set_irq_pending(int irq); -- cgit v0.10.2 From c3d93f880197953f86ab90d9da4744e926b38e33 Mon Sep 17 00:00:00 2001 From: "zhenzhong.duan" Date: Wed, 18 Jul 2012 13:06:39 +0800 Subject: xen: populate correct number of pages when across mem boundary (v2) When populating pages across a memory boundary at bootup, the number of pages populated isn't correct. This is because pages populated into a non-RAM region are ignored. The pfn range is also wrongly aligned when the memory boundary isn't page aligned. For a dom0 booted with dom_mem=3368952K(0xcd9ff000-4k) the dmesg diff is: [ 0.000000] Freeing 9e-100 pfn range: 98 pages freed [ 0.000000] 1-1 mapping on 9e->100 [ 0.000000] 1-1 mapping on cd9ff->100000 [ 0.000000] Released 98 pages of unused memory [ 0.000000] Set 206435 page(s) to 1-1 mapping -[ 0.000000] Populating cd9fe-cda00 pfn range: 1 pages added +[ 0.000000] Populating cd9fe-cd9ff pfn range: 1 pages added +[ 0.000000] Populating 100000-100061 pfn range: 97 pages added [ 0.000000] BIOS-provided physical RAM map: [ 0.000000] Xen: 0000000000000000 - 000000000009e000 (usable) [ 0.000000] Xen: 00000000000a0000 - 0000000000100000 (reserved) [ 0.000000] Xen: 0000000000100000 - 00000000cd9ff000 (usable) [ 0.000000] Xen: 00000000cd9ffc00 - 00000000cda53c00 (ACPI NVS) ... [ 0.000000] Xen: 0000000100000000 - 0000000100061000 (usable) [ 0.000000] Xen: 0000000100061000 - 000000012c000000 (unusable) ... [ 0.000000] MEMBLOCK configuration: ... -[ 0.000000] reserved[0x4] [0x000000cd9ff000-0x000000cd9ffbff], 0xc00 bytes -[ 0.000000] reserved[0x5] [0x00000100000000-0x00000100060fff], 0x61000 bytes Related xen memory layout: (XEN) Xen-e820 RAM map: (XEN) 0000000000000000 - 000000000009ec00 (usable) (XEN) 00000000000f0000 - 0000000000100000 (reserved) (XEN) 0000000000100000 - 00000000cd9ffc00 (usable) Signed-off-by: Zhenzhong Duan [v2: If xen_do_chunk fails to populate, abort this chunk and any others] Suggested by David, thanks.
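The alignment half of the fix is easiest to see with the numbers above. The RAM entry 0x100000 - 0xcd9ffc00 ends on a non-page-aligned address; rounding the end up (as the old code did with PFN_UP) includes a partial page that overlaps the reserved region, while rounding it down keeps only pfns fully contained in RAM. A small stand-alone sketch of the arithmetic, with PAGE_SHIFT and the PFN_UP/PFN_DOWN macros restated here for illustration (in the kernel they come from linux/pfn.h):

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
/* Same definitions as the kernel's PFN helpers. */
#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

int main(void)
{
	/* E820 RAM entry from the Xen-e820 map above: 0x100000 - 0xcd9ffc00 */
	unsigned long addr = 0x100000UL;
	unsigned long end  = 0xcd9ffc00UL;

	/* Old code: rounds outward, spilling into the reserved region. */
	printf("old: %lx-%lx\n", PFN_DOWN(addr), PFN_UP(end));	/* 100-cda00 */

	/* Fixed code: only pfns fully contained in the RAM entry. */
	printf("new: %lx-%lx\n", PFN_UP(addr), PFN_DOWN(end));	/* 100-cd9ff */
	return 0;
}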
Signed-off-by: Konrad Rzeszutek Wilk diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index a4790bf..ead8557 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -157,25 +157,24 @@ static unsigned long __init xen_populate_chunk( unsigned long dest_pfn; for (i = 0, entry = list; i < map_size; i++, entry++) { - unsigned long credits = credits_left; unsigned long s_pfn; unsigned long e_pfn; unsigned long pfns; long capacity; - if (credits <= 0) + if (credits_left <= 0) break; if (entry->type != E820_RAM) continue; - e_pfn = PFN_UP(entry->addr + entry->size); + e_pfn = PFN_DOWN(entry->addr + entry->size); /* We only care about E820 after the xen_start_info->nr_pages */ if (e_pfn <= max_pfn) continue; - s_pfn = PFN_DOWN(entry->addr); + s_pfn = PFN_UP(entry->addr); /* If the E820 falls within the nr_pages, we want to start * at the nr_pages PFN. * If that would mean going past the E820 entry, skip it @@ -184,23 +183,19 @@ static unsigned long __init xen_populate_chunk( capacity = e_pfn - max_pfn; dest_pfn = max_pfn; } else { - /* last_pfn MUST be within E820_RAM regions */ - if (*last_pfn && e_pfn >= *last_pfn) - s_pfn = *last_pfn; capacity = e_pfn - s_pfn; dest_pfn = s_pfn; } - /* If we had filled this E820_RAM entry, go to the next one. */ - if (capacity <= 0) - continue; - if (credits > capacity) - credits = capacity; + if (credits_left < capacity) + capacity = credits_left; - pfns = xen_do_chunk(dest_pfn, dest_pfn + credits, false); + pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false); done += pfns; - credits_left -= pfns; *last_pfn = (dest_pfn + pfns); + if (pfns < capacity) + break; + credits_left -= pfns; } return done; } -- cgit v0.10.2
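The v2 behaviour noted above (stop populating as soon as xen_do_chunk comes up short) can also be isolated from the E820 walk. The sketch below is a simplified model rather than kernel code: do_chunk_stub stands in for xen_do_chunk, and the per-entry capacities and credit total are fabricated for the demo. It mirrors the loop's credit accounting, clamping each chunk to the remaining credits and aborting all remaining entries on a short populate.

#include <stdio.h>

/* Stand-in for xen_do_chunk(): pretend the hypervisor can supply only
 * 'avail' more pages in total, however many are requested. */
static long avail = 98;

static long do_chunk_stub(long want)
{
	long got = want < avail ? want : avail;
	avail -= got;
	return got;
}

int main(void)
{
	/* Fabricated per-entry capacities: pages past nr_pages in each
	 * E820_RAM entry, e.g. a small entry below 4G and one above. */
	long capacities[] = { 1, 97, 500 };
	long credits_left = 200, done = 0;

	for (int i = 0; i < 3; i++) {
		long capacity = capacities[i];

		if (credits_left <= 0)
			break;
		if (credits_left < capacity)
			capacity = credits_left;

		long pfns = do_chunk_stub(capacity);
		done += pfns;
		/* v2 change: a short populate aborts this chunk and
		 * any remaining entries. */
		if (pfns < capacity)
			break;
		credits_left -= pfns;
	}
	printf("populated %ld page(s)\n", done);	/* prints 98 */
	return 0;
}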