summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig7
-rw-r--r--arch/x86/Makefile.um4
-rw-r--r--arch/x86/crypto/camellia_glue.c4
-rw-r--r--arch/x86/crypto/twofish_glue_3way.c4
-rw-r--r--arch/x86/ia32/ia32_aout.c1
-rw-r--r--arch/x86/include/asm/apic.h1
-rw-r--r--arch/x86/include/asm/auxvec.h7
-rw-r--r--arch/x86/include/asm/barrier.h116
-rw-r--r--arch/x86/include/asm/bug.h4
-rw-r--r--arch/x86/include/asm/cacheflush.h1
-rw-r--r--arch/x86/include/asm/debugreg.h67
-rw-r--r--arch/x86/include/asm/elf.h1
-rw-r--r--arch/x86/include/asm/exec.h1
-rw-r--r--arch/x86/include/asm/futex.h1
-rw-r--r--arch/x86/include/asm/i387.h1
-rw-r--r--arch/x86/include/asm/kgdb.h10
-rw-r--r--arch/x86/include/asm/kvm.h4
-rw-r--r--arch/x86/include/asm/kvm_emulate.h3
-rw-r--r--arch/x86/include/asm/kvm_host.h63
-rw-r--r--arch/x86/include/asm/local.h1
-rw-r--r--arch/x86/include/asm/mc146818rtc.h1
-rw-r--r--arch/x86/include/asm/page_types.h1
-rw-r--r--arch/x86/include/asm/paravirt.h1
-rw-r--r--arch/x86/include/asm/perf_event.h1
-rw-r--r--arch/x86/include/asm/processor.h94
-rw-r--r--arch/x86/include/asm/segment.h58
-rw-r--r--arch/x86/include/asm/special_insns.h199
-rw-r--r--arch/x86/include/asm/stackprotector.h1
-rw-r--r--arch/x86/include/asm/switch_to.h129
-rw-r--r--arch/x86/include/asm/system.h523
-rw-r--r--arch/x86/include/asm/tlbflush.h2
-rw-r--r--arch/x86/include/asm/tsc.h4
-rw-r--r--arch/x86/include/asm/vgtod.h17
-rw-r--r--arch/x86/include/asm/virtext.h1
-rw-r--r--arch/x86/include/asm/x86_init.h6
-rw-r--r--arch/x86/include/asm/xen/interface.h1
-rw-r--r--arch/x86/kernel/acpi/cstate.c1
-rw-r--r--arch/x86/kernel/apm_32.c1
-rw-r--r--arch/x86/kernel/cpu/common.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/p5.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/winchip.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c1
-rw-r--r--arch/x86/kernel/cpu/perf_event.c4
-rw-r--r--arch/x86/kernel/cpuid.c1
-rw-r--r--arch/x86/kernel/i8259.c1
-rw-r--r--arch/x86/kernel/irqinit.c7
-rw-r--r--arch/x86/kernel/kgdb.c7
-rw-r--r--arch/x86/kernel/kvmclock.c15
-rw-r--r--arch/x86/kernel/ldt.c1
-rw-r--r--arch/x86/kernel/machine_kexec_32.c1
-rw-r--r--arch/x86/kernel/mca_32.c1
-rw-r--r--arch/x86/kernel/module.c1
-rw-r--r--arch/x86/kernel/msr.c1
-rw-r--r--arch/x86/kernel/paravirt.c2
-rw-r--r--arch/x86/kernel/pci-calgary_64.c1
-rw-r--r--arch/x86/kernel/pci-dma.c5
-rw-r--r--arch/x86/kernel/process.c1
-rw-r--r--arch/x86/kernel/process_32.c2
-rw-r--r--arch/x86/kernel/process_64.c2
-rw-r--r--arch/x86/kernel/ptrace.c1
-rw-r--r--arch/x86/kernel/setup.c12
-rw-r--r--arch/x86/kernel/smpboot.c1
-rw-r--r--arch/x86/kernel/tce_64.c1
-rw-r--r--arch/x86/kernel/tls.c1
-rw-r--r--arch/x86/kernel/traps.c1
-rw-r--r--arch/x86/kernel/tsc.c14
-rw-r--r--arch/x86/kernel/vsyscall_64.c25
-rw-r--r--arch/x86/kernel/x86_init.c5
-rw-r--r--arch/x86/kvm/cpuid.c2
-rw-r--r--arch/x86/kvm/cpuid.h8
-rw-r--r--arch/x86/kvm/emulate.c112
-rw-r--r--arch/x86/kvm/i8259.c1
-rw-r--r--arch/x86/kvm/lapic.c4
-rw-r--r--arch/x86/kvm/mmu.c85
-rw-r--r--arch/x86/kvm/mmu_audit.c4
-rw-r--r--arch/x86/kvm/pmu.c10
-rw-r--r--arch/x86/kvm/svm.c119
-rw-r--r--arch/x86/kvm/vmx.c53
-rw-r--r--arch/x86/kvm/x86.c403
-rw-r--r--arch/x86/mm/init.c1
-rw-r--r--arch/x86/mm/init_32.c1
-rw-r--r--arch/x86/mm/init_64.c1
-rw-r--r--arch/x86/mm/kmemcheck/selftest.c1
-rw-r--r--arch/x86/mm/pgtable_32.c1
-rw-r--r--arch/x86/pci/acpi.c7
-rw-r--r--arch/x86/pci/fixup.c12
-rw-r--r--arch/x86/pci/i386.c85
-rw-r--r--arch/x86/pci/mrst.c40
-rw-r--r--arch/x86/pci/xen.c27
-rw-r--r--arch/x86/platform/ce4100/falconfalls.dts7
-rw-r--r--arch/x86/platform/geode/Makefile1
-rw-r--r--arch/x86/platform/geode/geos.c128
-rw-r--r--arch/x86/platform/mrst/mrst.c16
-rw-r--r--arch/x86/power/cpu.c4
-rw-r--r--arch/x86/power/hibernate_32.c1
-rw-r--r--arch/x86/um/Kconfig4
-rw-r--r--arch/x86/um/asm/processor.h10
-rw-r--r--arch/x86/um/asm/processor_32.h10
-rw-r--r--arch/x86/um/asm/processor_64.h10
-rw-r--r--arch/x86/um/bugs_32.c4
-rw-r--r--arch/x86/um/mem_32.c8
-rw-r--r--arch/x86/um/vdso/vma.c3
-rw-r--r--arch/x86/vdso/vclock_gettime.c135
-rw-r--r--arch/x86/vdso/vdso32-setup.c17
-rw-r--r--arch/x86/vdso/vma.c3
-rw-r--r--arch/x86/xen/enlighten.c99
-rw-r--r--arch/x86/xen/irq.c8
-rw-r--r--arch/x86/xen/mmu.c20
-rw-r--r--arch/x86/xen/multicalls.h2
-rw-r--r--arch/x86/xen/setup.c3
-rw-r--r--arch/x86/xen/smp.c8
112 files changed, 1873 insertions, 1066 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9019523..3ad653d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2125,6 +2125,13 @@ config NET5501
---help---
This option enables system support for the Soekris Engineering net5501.
+config GEOS
+ bool "Traverse Technologies GEOS System Support (LEDS, GPIO, etc)"
+ select GPIOLIB
+ depends on DMI
+ ---help---
+ This option enables system support for the Traverse Technologies GEOS.
+
endif # X86_32
config AMD_NB
diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um
index 36ddec6..4be406a 100644
--- a/arch/x86/Makefile.um
+++ b/arch/x86/Makefile.um
@@ -8,15 +8,11 @@ ELF_ARCH := i386
ELF_FORMAT := elf32-i386
CHECKFLAGS += -D__i386__
-ifeq ("$(origin SUBARCH)", "command line")
-ifneq ("$(shell uname -m | sed -e s/i.86/i386/)", "$(SUBARCH)")
KBUILD_CFLAGS += $(call cc-option,-m32)
KBUILD_AFLAGS += $(call cc-option,-m32)
LINK-y += $(call cc-option,-m32)
export LDFLAGS
-endif
-endif
# First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y.
include $(srctree)/arch/x86/Makefile_32.cpu
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 1ca36a9..3306dc0 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -1925,7 +1925,7 @@ static int force;
module_param(force, int, 0);
MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
-int __init init(void)
+static int __init init(void)
{
if (!force && is_blacklisted_cpu()) {
printk(KERN_INFO
@@ -1938,7 +1938,7 @@ int __init init(void)
return crypto_register_algs(camellia_algs, ARRAY_SIZE(camellia_algs));
}
-void __exit fini(void)
+static void __exit fini(void)
{
crypto_unregister_algs(camellia_algs, ARRAY_SIZE(camellia_algs));
}
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 408fc0c..922ab24 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -668,7 +668,7 @@ static int force;
module_param(force, int, 0);
MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
-int __init init(void)
+static int __init init(void)
{
if (!force && is_blacklisted_cpu()) {
printk(KERN_INFO
@@ -681,7 +681,7 @@ int __init init(void)
return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs));
}
-void __exit fini(void)
+static void __exit fini(void)
{
crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs));
}
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 4c2e59a..d511d95 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -26,7 +26,6 @@
#include <linux/init.h>
#include <linux/jiffies.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index d3eaac4..d854101 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -11,7 +11,6 @@
#include <linux/atomic.h>
#include <asm/fixmap.h>
#include <asm/mpspec.h>
-#include <asm/system.h>
#include <asm/msr.h>
#define ARCH_APICTIMER_STOPS_ON_C3 1
diff --git a/arch/x86/include/asm/auxvec.h b/arch/x86/include/asm/auxvec.h
index 1316b4c..77203ac 100644
--- a/arch/x86/include/asm/auxvec.h
+++ b/arch/x86/include/asm/auxvec.h
@@ -9,4 +9,11 @@
#endif
#define AT_SYSINFO_EHDR 33
+/* entries in ARCH_DLINFO: */
+#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64)
+# define AT_VECTOR_SIZE_ARCH 2
+#else /* else it's non-compat x86-64 */
+# define AT_VECTOR_SIZE_ARCH 1
+#endif
+
#endif /* _ASM_X86_AUXVEC_H */
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
new file mode 100644
index 0000000..c6cd358
--- /dev/null
+++ b/arch/x86/include/asm/barrier.h
@@ -0,0 +1,116 @@
+#ifndef _ASM_X86_BARRIER_H
+#define _ASM_X86_BARRIER_H
+
+#include <asm/alternative.h>
+#include <asm/nops.h>
+
+/*
+ * Force strict CPU ordering.
+ * And yes, this is required on UP too when we're talking
+ * to devices.
+ */
+
+#ifdef CONFIG_X86_32
+/*
+ * Some non-Intel clones support out of order store. wmb() ceases to be a
+ * nop for these.
+ */
+#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
+#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
+#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
+#else
+#define mb() asm volatile("mfence":::"memory")
+#define rmb() asm volatile("lfence":::"memory")
+#define wmb() asm volatile("sfence" ::: "memory")
+#endif
+
+/**
+ * read_barrier_depends - Flush all pending reads that subsequents reads
+ * depend on.
+ *
+ * No data-dependent reads from memory-like regions are ever reordered
+ * over this barrier. All reads preceding this primitive are guaranteed
+ * to access memory (but not necessarily other CPUs' caches) before any
+ * reads following this primitive that depend on the data return by
+ * any of the preceding reads. This primitive is much lighter weight than
+ * rmb() on most CPUs, and is never heavier weight than is
+ * rmb().
+ *
+ * These ordering constraints are respected by both the local CPU
+ * and the compiler.
+ *
+ * Ordering is not guaranteed by anything other than these primitives,
+ * not even by data dependencies. See the documentation for
+ * memory_barrier() for examples and URLs to more information.
+ *
+ * For example, the following code would force ordering (the initial
+ * value of "a" is zero, "b" is one, and "p" is "&a"):
+ *
+ * <programlisting>
+ * CPU 0 CPU 1
+ *
+ * b = 2;
+ * memory_barrier();
+ * p = &b; q = p;
+ * read_barrier_depends();
+ * d = *q;
+ * </programlisting>
+ *
+ * because the read of "*q" depends on the read of "p" and these
+ * two reads are separated by a read_barrier_depends(). However,
+ * the following code, with the same initial values for "a" and "b":
+ *
+ * <programlisting>
+ * CPU 0 CPU 1
+ *
+ * a = 2;
+ * memory_barrier();
+ * b = 3; y = b;
+ * read_barrier_depends();
+ * x = a;
+ * </programlisting>
+ *
+ * does not enforce ordering, since there is no data dependency between
+ * the read of "a" and the read of "b". Therefore, on some CPUs, such
+ * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
+ * in cases like this where there are no data dependencies.
+ **/
+
+#define read_barrier_depends() do { } while (0)
+
+#ifdef CONFIG_SMP
+#define smp_mb() mb()
+#ifdef CONFIG_X86_PPRO_FENCE
+# define smp_rmb() rmb()
+#else
+# define smp_rmb() barrier()
+#endif
+#ifdef CONFIG_X86_OOSTORE
+# define smp_wmb() wmb()
+#else
+# define smp_wmb() barrier()
+#endif
+#define smp_read_barrier_depends() read_barrier_depends()
+#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
+#else
+#define smp_mb() barrier()
+#define smp_rmb() barrier()
+#define smp_wmb() barrier()
+#define smp_read_barrier_depends() do { } while (0)
+#define set_mb(var, value) do { var = value; barrier(); } while (0)
+#endif
+
+/*
+ * Stop RDTSC speculation. This is needed when you need to use RDTSC
+ * (or get_cycles or vread that possibly accesses the TSC) in a defined
+ * code region.
+ *
+ * (Could use an alternative three way for this if there was one.)
+ */
+static __always_inline void rdtsc_barrier(void)
+{
+ alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
+ alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+}
+
+#endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index f654d1b..11e1152 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -36,4 +36,8 @@ do { \
#endif /* !CONFIG_BUG */
#include <asm-generic/bug.h>
+
+
+extern void show_regs_common(void);
+
#endif /* _ASM_X86_BUG_H */
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 4e12668..9863ee3 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -3,6 +3,7 @@
/* Caches aren't brain-dead on the intel. */
#include <asm-generic/cacheflush.h>
+#include <asm/special_insns.h>
#ifdef CONFIG_X86_PAT
/*
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index b903d5e..2d91580 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -78,8 +78,75 @@
*/
#ifdef __KERNEL__
+#include <linux/bug.h>
+
DECLARE_PER_CPU(unsigned long, cpu_dr7);
+#ifndef CONFIG_PARAVIRT
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, register) \
+ (var) = native_get_debugreg(register)
+#define set_debugreg(value, register) \
+ native_set_debugreg(register, value)
+#endif
+
+static inline unsigned long native_get_debugreg(int regno)
+{
+ unsigned long val = 0; /* Damn you, gcc! */
+
+ switch (regno) {
+ case 0:
+ asm("mov %%db0, %0" :"=r" (val));
+ break;
+ case 1:
+ asm("mov %%db1, %0" :"=r" (val));
+ break;
+ case 2:
+ asm("mov %%db2, %0" :"=r" (val));
+ break;
+ case 3:
+ asm("mov %%db3, %0" :"=r" (val));
+ break;
+ case 6:
+ asm("mov %%db6, %0" :"=r" (val));
+ break;
+ case 7:
+ asm("mov %%db7, %0" :"=r" (val));
+ break;
+ default:
+ BUG();
+ }
+ return val;
+}
+
+static inline void native_set_debugreg(int regno, unsigned long value)
+{
+ switch (regno) {
+ case 0:
+ asm("mov %0, %%db0" ::"r" (value));
+ break;
+ case 1:
+ asm("mov %0, %%db1" ::"r" (value));
+ break;
+ case 2:
+ asm("mov %0, %%db2" ::"r" (value));
+ break;
+ case 3:
+ asm("mov %0, %%db3" ::"r" (value));
+ break;
+ case 6:
+ asm("mov %0, %%db6" ::"r" (value));
+ break;
+ case 7:
+ asm("mov %0, %%db7" ::"r" (value));
+ break;
+ default:
+ BUG();
+ }
+}
+
static inline void hw_breakpoint_disable(void)
{
/* Zero the control register for HW Breakpoint */
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 5f962df..f27f79a 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -84,7 +84,6 @@ extern unsigned int vdso_enabled;
(((x)->e_machine == EM_386) || ((x)->e_machine == EM_486))
#include <asm/processor.h>
-#include <asm/system.h>
#ifdef CONFIG_X86_32
#include <asm/desc.h>
diff --git a/arch/x86/include/asm/exec.h b/arch/x86/include/asm/exec.h
new file mode 100644
index 0000000..54c2e1d
--- /dev/null
+++ b/arch/x86/include/asm/exec.h
@@ -0,0 +1 @@
+/* define arch_align_stack() here */
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index d09bb03..71ecbcb 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -9,7 +9,6 @@
#include <asm/asm.h>
#include <asm/errno.h>
#include <asm/processor.h>
-#include <asm/system.h>
#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
asm volatile("1:\t" insn "\n" \
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 7ce0798..257d9cc 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -14,7 +14,6 @@
#include <linux/sched.h>
#include <linux/hardirq.h>
-#include <asm/system.h>
struct pt_regs;
struct user_i387_struct;
diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
index 77e95f5..332f98c 100644
--- a/arch/x86/include/asm/kgdb.h
+++ b/arch/x86/include/asm/kgdb.h
@@ -64,11 +64,15 @@ enum regnames {
GDB_PS, /* 17 */
GDB_CS, /* 18 */
GDB_SS, /* 19 */
+ GDB_DS, /* 20 */
+ GDB_ES, /* 21 */
+ GDB_FS, /* 22 */
+ GDB_GS, /* 23 */
};
#define GDB_ORIG_AX 57
-#define DBG_MAX_REG_NUM 20
-/* 17 64 bit regs and 3 32 bit regs */
-#define NUMREGBYTES ((17 * 8) + (3 * 4))
+#define DBG_MAX_REG_NUM 24
+/* 17 64 bit regs and 5 32 bit regs */
+#define NUMREGBYTES ((17 * 8) + (5 * 4))
#endif /* ! CONFIG_X86_32 */
static inline void arch_kgdb_breakpoint(void)
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 4d8dcbd..e7d1c19 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -321,4 +321,8 @@ struct kvm_xcrs {
__u64 padding[16];
};
+/* definition of registers in kvm_run */
+struct kvm_sync_regs {
+};
+
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7b9cfc4..c222e1a 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -176,6 +176,7 @@ struct x86_emulate_ops {
void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
+ void (*set_rflags)(struct x86_emulate_ctxt *ctxt, ulong val);
int (*cpl)(struct x86_emulate_ctxt *ctxt);
int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
@@ -388,7 +389,7 @@ bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
#define EMULATION_INTERCEPTED 2
int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
- u16 tss_selector, int reason,
+ u16 tss_selector, int idt_index, int reason,
bool has_error_code, u32 error_code);
int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 52d6640..e216ba0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -29,7 +29,7 @@
#include <asm/msr-index.h>
#define KVM_MAX_VCPUS 254
-#define KVM_SOFT_MAX_VCPUS 64
+#define KVM_SOFT_MAX_VCPUS 160
#define KVM_MEMORY_SLOTS 32
/* memory slots that does not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 4
@@ -181,13 +181,6 @@ struct kvm_mmu_memory_cache {
void *objects[KVM_NR_MEM_OBJS];
};
-#define NR_PTE_CHAIN_ENTRIES 5
-
-struct kvm_pte_chain {
- u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
- struct hlist_node link;
-};
-
/*
* kvm_mmu_page_role, below, is defined as:
*
@@ -427,12 +420,16 @@ struct kvm_vcpu_arch {
u64 last_guest_tsc;
u64 last_kernel_ns;
- u64 last_tsc_nsec;
- u64 last_tsc_write;
- u32 virtual_tsc_khz;
+ u64 last_host_tsc;
+ u64 tsc_offset_adjustment;
+ u64 this_tsc_nsec;
+ u64 this_tsc_write;
+ u8 this_tsc_generation;
bool tsc_catchup;
- u32 tsc_catchup_mult;
- s8 tsc_catchup_shift;
+ bool tsc_always_catchup;
+ s8 virtual_tsc_shift;
+ u32 virtual_tsc_mult;
+ u32 virtual_tsc_khz;
atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
unsigned nmi_pending; /* NMI queued after currently running handler */
@@ -478,6 +475,21 @@ struct kvm_vcpu_arch {
u32 id;
bool send_user_only;
} apf;
+
+ /* OSVW MSRs (AMD only) */
+ struct {
+ u64 length;
+ u64 status;
+ } osvw;
+};
+
+struct kvm_lpage_info {
+ unsigned long rmap_pde;
+ int write_count;
+};
+
+struct kvm_arch_memory_slot {
+ struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
};
struct kvm_arch {
@@ -511,8 +523,12 @@ struct kvm_arch {
s64 kvmclock_offset;
raw_spinlock_t tsc_write_lock;
u64 last_tsc_nsec;
- u64 last_tsc_offset;
u64 last_tsc_write;
+ u32 last_tsc_khz;
+ u64 cur_tsc_nsec;
+ u64 cur_tsc_write;
+ u64 cur_tsc_offset;
+ u8 cur_tsc_generation;
struct kvm_xen_hvm_config xen_hvm_config;
@@ -644,7 +660,7 @@ struct kvm_x86_ops {
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
int (*get_lpage_level)(void);
bool (*rdtscp_supported)(void);
- void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment);
+ void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host);
void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -652,7 +668,7 @@ struct kvm_x86_ops {
bool (*has_wbinvd_exit)(void);
- void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz);
+ void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale);
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
@@ -674,6 +690,17 @@ struct kvm_arch_async_pf {
extern struct kvm_x86_ops *kvm_x86_ops;
+static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
+ s64 adjustment)
+{
+ kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, false);
+}
+
+static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
+{
+ kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, true);
+}
+
int kvm_mmu_module_init(void);
void kvm_mmu_module_exit(void);
@@ -741,8 +768,8 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
- bool has_error_code, u32 error_code);
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
+ int reason, bool has_error_code, u32 error_code);
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index 9cdae5d..c8bed0d 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -3,7 +3,6 @@
#include <linux/percpu.h>
-#include <asm/system.h>
#include <linux/atomic.h>
#include <asm/asm.h>
diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h
index 0e8e85b..d354fb7 100644
--- a/arch/x86/include/asm/mc146818rtc.h
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -5,7 +5,6 @@
#define _ASM_X86_MC146818RTC_H
#include <asm/io.h>
-#include <asm/system.h>
#include <asm/processor.h>
#include <linux/mc146818rtc.h>
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index bce688d..e21fdd1 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -55,7 +55,6 @@ extern unsigned long init_memory_mapping(unsigned long start,
unsigned long end);
extern void initmem_init(void);
-extern void free_initmem(void);
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index c0180fd..aa0f913 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -10,6 +10,7 @@
#include <asm/paravirt_types.h>
#ifndef __ASSEMBLY__
+#include <linux/bug.h>
#include <linux/types.h>
#include <linux/cpumask.h>
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index e8fb2c7..2291895 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -23,6 +23,7 @@
#define ARCH_PERFMON_EVENTSEL_USR (1ULL << 16)
#define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17)
#define ARCH_PERFMON_EVENTSEL_EDGE (1ULL << 18)
+#define ARCH_PERFMON_EVENTSEL_PIN_CONTROL (1ULL << 19)
#define ARCH_PERFMON_EVENTSEL_INT (1ULL << 20)
#define ARCH_PERFMON_EVENTSEL_ANY (1ULL << 21)
#define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 95da14f..a19542c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -14,13 +14,13 @@ struct mm_struct;
#include <asm/sigcontext.h>
#include <asm/current.h>
#include <asm/cpufeature.h>
-#include <asm/system.h>
#include <asm/page.h>
#include <asm/pgtable_types.h>
#include <asm/percpu.h>
#include <asm/msr.h>
#include <asm/desc_defs.h>
#include <asm/nops.h>
+#include <asm/special_insns.h>
#include <linux/personality.h>
#include <linux/cpumask.h>
@@ -29,6 +29,15 @@ struct mm_struct;
#include <linux/math64.h>
#include <linux/init.h>
#include <linux/err.h>
+#include <linux/irqflags.h>
+
+/*
+ * We handle most unaligned accesses in hardware. On the other hand
+ * unaligned DMA can be quite expensive on some Nehalem processors.
+ *
+ * Based on this we disable the IP header alignment in network drivers.
+ */
+#define NET_IP_ALIGN 0
#define HBP_NUM 4
/*
@@ -475,61 +484,6 @@ struct thread_struct {
unsigned io_bitmap_max;
};
-static inline unsigned long native_get_debugreg(int regno)
-{
- unsigned long val = 0; /* Damn you, gcc! */
-
- switch (regno) {
- case 0:
- asm("mov %%db0, %0" :"=r" (val));
- break;
- case 1:
- asm("mov %%db1, %0" :"=r" (val));
- break;
- case 2:
- asm("mov %%db2, %0" :"=r" (val));
- break;
- case 3:
- asm("mov %%db3, %0" :"=r" (val));
- break;
- case 6:
- asm("mov %%db6, %0" :"=r" (val));
- break;
- case 7:
- asm("mov %%db7, %0" :"=r" (val));
- break;
- default:
- BUG();
- }
- return val;
-}
-
-static inline void native_set_debugreg(int regno, unsigned long value)
-{
- switch (regno) {
- case 0:
- asm("mov %0, %%db0" ::"r" (value));
- break;
- case 1:
- asm("mov %0, %%db1" ::"r" (value));
- break;
- case 2:
- asm("mov %0, %%db2" ::"r" (value));
- break;
- case 3:
- asm("mov %0, %%db3" ::"r" (value));
- break;
- case 6:
- asm("mov %0, %%db6" ::"r" (value));
- break;
- case 7:
- asm("mov %0, %%db7" ::"r" (value));
- break;
- default:
- BUG();
- }
-}
-
/*
* Set IOPL bits in EFLAGS from given mask
*/
@@ -575,14 +529,6 @@ static inline void native_swapgs(void)
#define __cpuid native_cpuid
#define paravirt_enabled() 0
-/*
- * These special macros can be used to get or set a debugging register
- */
-#define get_debugreg(var, register) \
- (var) = native_get_debugreg(register)
-#define set_debugreg(value, register) \
- native_set_debugreg(register, value)
-
static inline void load_sp0(struct tss_struct *tss,
struct thread_struct *thread)
{
@@ -1022,4 +968,24 @@ extern bool cpu_has_amd_erratum(const int *);
#define cpu_has_amd_erratum(x) (false)
#endif /* CONFIG_CPU_SUP_AMD */
+#ifdef CONFIG_X86_32
+/*
+ * disable hlt during certain critical i/o operations
+ */
+#define HAVE_DISABLE_HLT
+#endif
+
+void disable_hlt(void);
+void enable_hlt(void);
+
+void cpu_idle_wait(void);
+
+extern unsigned long arch_align_stack(unsigned long sp);
+extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
+
+void default_idle(void);
+bool set_pm_idle_to_default(void);
+
+void stop_this_cpu(void *dummy);
+
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 5e64171..1654662 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -212,7 +212,61 @@
#ifdef __KERNEL__
#ifndef __ASSEMBLY__
extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10];
-#endif
-#endif
+
+/*
+ * Load a segment. Fall back on loading the zero
+ * segment if something goes wrong..
+ */
+#define loadsegment(seg, value) \
+do { \
+ unsigned short __val = (value); \
+ \
+ asm volatile(" \n" \
+ "1: movl %k0,%%" #seg " \n" \
+ \
+ ".section .fixup,\"ax\" \n" \
+ "2: xorl %k0,%k0 \n" \
+ " jmp 1b \n" \
+ ".previous \n" \
+ \
+ _ASM_EXTABLE(1b, 2b) \
+ \
+ : "+r" (__val) : : "memory"); \
+} while (0)
+
+/*
+ * Save a segment register away
+ */
+#define savesegment(seg, value) \
+ asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
+
+/*
+ * x86_32 user gs accessors.
+ */
+#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_32_LAZY_GS
+#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;})
+#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v))
+#define task_user_gs(tsk) ((tsk)->thread.gs)
+#define lazy_save_gs(v) savesegment(gs, (v))
+#define lazy_load_gs(v) loadsegment(gs, (v))
+#else /* X86_32_LAZY_GS */
+#define get_user_gs(regs) (u16)((regs)->gs)
+#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0)
+#define task_user_gs(tsk) (task_pt_regs(tsk)->gs)
+#define lazy_save_gs(v) do { } while (0)
+#define lazy_load_gs(v) do { } while (0)
+#endif /* X86_32_LAZY_GS */
+#endif /* X86_32 */
+
+static inline unsigned long get_limit(unsigned long segment)
+{
+ unsigned long __limit;
+ asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
+ return __limit + 1;
+}
+
+#endif /* !__ASSEMBLY__ */
+#endif /* __KERNEL__ */
#endif /* _ASM_X86_SEGMENT_H */
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
new file mode 100644
index 0000000..41fc93a
--- /dev/null
+++ b/arch/x86/include/asm/special_insns.h
@@ -0,0 +1,199 @@
+#ifndef _ASM_X86_SPECIAL_INSNS_H
+#define _ASM_X86_SPECIAL_INSNS_H
+
+
+#ifdef __KERNEL__
+
+static inline void native_clts(void)
+{
+ asm volatile("clts");
+}
+
+/*
+ * Volatile isn't enough to prevent the compiler from reordering the
+ * read/write functions for the control registers and messing everything up.
+ * A memory clobber would solve the problem, but would prevent reordering of
+ * all loads stores around it, which can hurt performance. Solution is to
+ * use a variable and mimic reads and writes to it to enforce serialization
+ */
+static unsigned long __force_order;
+
+static inline unsigned long native_read_cr0(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order));
+ return val;
+}
+
+static inline void native_write_cr0(unsigned long val)
+{
+ asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order));
+}
+
+static inline unsigned long native_read_cr2(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order));
+ return val;
+}
+
+static inline void native_write_cr2(unsigned long val)
+{
+ asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
+}
+
+static inline unsigned long native_read_cr3(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
+ return val;
+}
+
+static inline void native_write_cr3(unsigned long val)
+{
+ asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order));
+}
+
+static inline unsigned long native_read_cr4(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
+ return val;
+}
+
+static inline unsigned long native_read_cr4_safe(void)
+{
+ unsigned long val;
+ /* This could fault if %cr4 does not exist. In x86_64, a cr4 always
+ * exists, so it will never fail. */
+#ifdef CONFIG_X86_32
+ asm volatile("1: mov %%cr4, %0\n"
+ "2:\n"
+ _ASM_EXTABLE(1b, 2b)
+ : "=r" (val), "=m" (__force_order) : "0" (0));
+#else
+ val = native_read_cr4();
+#endif
+ return val;
+}
+
+static inline void native_write_cr4(unsigned long val)
+{
+ asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order));
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long native_read_cr8(void)
+{
+ unsigned long cr8;
+ asm volatile("movq %%cr8,%0" : "=r" (cr8));
+ return cr8;
+}
+
+static inline void native_write_cr8(unsigned long val)
+{
+ asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
+}
+#endif
+
+static inline void native_wbinvd(void)
+{
+ asm volatile("wbinvd": : :"memory");
+}
+
+extern void native_load_gs_index(unsigned);
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+
+static inline unsigned long read_cr0(void)
+{
+ return native_read_cr0();
+}
+
+static inline void write_cr0(unsigned long x)
+{
+ native_write_cr0(x);
+}
+
+static inline unsigned long read_cr2(void)
+{
+ return native_read_cr2();
+}
+
+static inline void write_cr2(unsigned long x)
+{
+ native_write_cr2(x);
+}
+
+static inline unsigned long read_cr3(void)
+{
+ return native_read_cr3();
+}
+
+static inline void write_cr3(unsigned long x)
+{
+ native_write_cr3(x);
+}
+
+static inline unsigned long read_cr4(void)
+{
+ return native_read_cr4();
+}
+
+static inline unsigned long read_cr4_safe(void)
+{
+ return native_read_cr4_safe();
+}
+
+static inline void write_cr4(unsigned long x)
+{
+ native_write_cr4(x);
+}
+
+static inline void wbinvd(void)
+{
+ native_wbinvd();
+}
+
+#ifdef CONFIG_X86_64
+
+static inline unsigned long read_cr8(void)
+{
+ return native_read_cr8();
+}
+
+static inline void write_cr8(unsigned long x)
+{
+ native_write_cr8(x);
+}
+
+static inline void load_gs_index(unsigned selector)
+{
+ native_load_gs_index(selector);
+}
+
+#endif
+
+/* Clear the 'TS' bit */
+static inline void clts(void)
+{
+ native_clts();
+}
+
+#endif/* CONFIG_PARAVIRT */
+
+#define stts() write_cr0(read_cr0() | X86_CR0_TS)
+
+static inline void clflush(volatile void *__p)
+{
+ asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
+}
+
+#define nop() asm volatile ("nop")
+
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_SPECIAL_INSNS_H */
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 1575177..b5d9533 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -38,7 +38,6 @@
#include <asm/tsc.h>
#include <asm/processor.h>
#include <asm/percpu.h>
-#include <asm/system.h>
#include <asm/desc.h>
#include <linux/random.h>
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
new file mode 100644
index 0000000..4ec45b3
--- /dev/null
+++ b/arch/x86/include/asm/switch_to.h
@@ -0,0 +1,129 @@
+#ifndef _ASM_X86_SWITCH_TO_H
+#define _ASM_X86_SWITCH_TO_H
+
+struct task_struct; /* one of the stranger aspects of C forward declarations */
+struct task_struct *__switch_to(struct task_struct *prev,
+ struct task_struct *next);
+struct tss_struct;
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
+ struct tss_struct *tss);
+
+#ifdef CONFIG_X86_32
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+#define __switch_canary \
+ "movl %P[task_canary](%[next]), %%ebx\n\t" \
+ "movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
+#define __switch_canary_oparam \
+ , [stack_canary] "=m" (stack_canary.canary)
+#define __switch_canary_iparam \
+ , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
+#else /* CC_STACKPROTECTOR */
+#define __switch_canary
+#define __switch_canary_oparam
+#define __switch_canary_iparam
+#endif /* CC_STACKPROTECTOR */
+
+/*
+ * Saving eflags is important. It switches not only IOPL between tasks,
+ * it also protects other tasks from NT leaking through sysenter etc.
+ */
+#define switch_to(prev, next, last) \
+do { \
+ /* \
+ * Context-switching clobbers all registers, so we clobber \
+ * them explicitly, via unused output variables. \
+ * (EAX and EBP is not listed because EBP is saved/restored \
+ * explicitly for wchan access and EAX is the return value of \
+ * __switch_to()) \
+ */ \
+ unsigned long ebx, ecx, edx, esi, edi; \
+ \
+ asm volatile("pushfl\n\t" /* save flags */ \
+ "pushl %%ebp\n\t" /* save EBP */ \
+ "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \
+ "movl %[next_sp],%%esp\n\t" /* restore ESP */ \
+ "movl $1f,%[prev_ip]\n\t" /* save EIP */ \
+ "pushl %[next_ip]\n\t" /* restore EIP */ \
+ __switch_canary \
+ "jmp __switch_to\n" /* regparm call */ \
+ "1:\t" \
+ "popl %%ebp\n\t" /* restore EBP */ \
+ "popfl\n" /* restore flags */ \
+ \
+ /* output parameters */ \
+ : [prev_sp] "=m" (prev->thread.sp), \
+ [prev_ip] "=m" (prev->thread.ip), \
+ "=a" (last), \
+ \
+ /* clobbered output registers: */ \
+ "=b" (ebx), "=c" (ecx), "=d" (edx), \
+ "=S" (esi), "=D" (edi) \
+ \
+ __switch_canary_oparam \
+ \
+ /* input parameters: */ \
+ : [next_sp] "m" (next->thread.sp), \
+ [next_ip] "m" (next->thread.ip), \
+ \
+ /* regparm parameters for __switch_to(): */ \
+ [prev] "a" (prev), \
+ [next] "d" (next) \
+ \
+ __switch_canary_iparam \
+ \
+ : /* reloaded segment registers */ \
+ "memory"); \
+} while (0)
+
+#else /* CONFIG_X86_32 */
+
+/* frame pointer must be last for get_wchan */
+#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
+#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
+
+#define __EXTRA_CLOBBER \
+ , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
+ "r12", "r13", "r14", "r15"
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+#define __switch_canary \
+ "movq %P[task_canary](%%rsi),%%r8\n\t" \
+ "movq %%r8,"__percpu_arg([gs_canary])"\n\t"
+#define __switch_canary_oparam \
+ , [gs_canary] "=m" (irq_stack_union.stack_canary)
+#define __switch_canary_iparam \
+ , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
+#else /* CC_STACKPROTECTOR */
+#define __switch_canary
+#define __switch_canary_oparam
+#define __switch_canary_iparam
+#endif /* CC_STACKPROTECTOR */
+
+/* Save restore flags to clear handle leaking NT */
+#define switch_to(prev, next, last) \
+ asm volatile(SAVE_CONTEXT \
+ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
+ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
+ "call __switch_to\n\t" \
+ "movq "__percpu_arg([current_task])",%%rsi\n\t" \
+ __switch_canary \
+ "movq %P[thread_info](%%rsi),%%r8\n\t" \
+ "movq %%rax,%%rdi\n\t" \
+ "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \
+ "jnz ret_from_fork\n\t" \
+ RESTORE_CONTEXT \
+ : "=a" (last) \
+ __switch_canary_oparam \
+ : [next] "S" (next), [prev] "D" (prev), \
+ [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
+ [ti_flags] "i" (offsetof(struct thread_info, flags)), \
+ [_tif_fork] "i" (_TIF_FORK), \
+ [thread_info] "i" (offsetof(struct task_struct, stack)), \
+ [current_task] "m" (current_task) \
+ __switch_canary_iparam \
+ : "memory", "cc" __EXTRA_CLOBBER)
+
+#endif /* CONFIG_X86_32 */
+
+#endif /* _ASM_X86_SWITCH_TO_H */
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
deleted file mode 100644
index 2d2f01c..0000000
--- a/arch/x86/include/asm/system.h
+++ /dev/null
@@ -1,523 +0,0 @@
-#ifndef _ASM_X86_SYSTEM_H
-#define _ASM_X86_SYSTEM_H
-
-#include <asm/asm.h>
-#include <asm/segment.h>
-#include <asm/cpufeature.h>
-#include <asm/cmpxchg.h>
-#include <asm/nops.h>
-
-#include <linux/kernel.h>
-#include <linux/irqflags.h>
-
-/* entries in ARCH_DLINFO: */
-#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64)
-# define AT_VECTOR_SIZE_ARCH 2
-#else /* else it's non-compat x86-64 */
-# define AT_VECTOR_SIZE_ARCH 1
-#endif
-
-struct task_struct; /* one of the stranger aspects of C forward declarations */
-struct task_struct *__switch_to(struct task_struct *prev,
- struct task_struct *next);
-struct tss_struct;
-void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
- struct tss_struct *tss);
-extern void show_regs_common(void);
-
-#ifdef CONFIG_X86_32
-
-#ifdef CONFIG_CC_STACKPROTECTOR
-#define __switch_canary \
- "movl %P[task_canary](%[next]), %%ebx\n\t" \
- "movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
-#define __switch_canary_oparam \
- , [stack_canary] "=m" (stack_canary.canary)
-#define __switch_canary_iparam \
- , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
-#else /* CC_STACKPROTECTOR */
-#define __switch_canary
-#define __switch_canary_oparam
-#define __switch_canary_iparam
-#endif /* CC_STACKPROTECTOR */
-
-/*
- * Saving eflags is important. It switches not only IOPL between tasks,
- * it also protects other tasks from NT leaking through sysenter etc.
- */
-#define switch_to(prev, next, last) \
-do { \
- /* \
- * Context-switching clobbers all registers, so we clobber \
- * them explicitly, via unused output variables. \
- * (EAX and EBP is not listed because EBP is saved/restored \
- * explicitly for wchan access and EAX is the return value of \
- * __switch_to()) \
- */ \
- unsigned long ebx, ecx, edx, esi, edi; \
- \
- asm volatile("pushfl\n\t" /* save flags */ \
- "pushl %%ebp\n\t" /* save EBP */ \
- "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \
- "movl %[next_sp],%%esp\n\t" /* restore ESP */ \
- "movl $1f,%[prev_ip]\n\t" /* save EIP */ \
- "pushl %[next_ip]\n\t" /* restore EIP */ \
- __switch_canary \
- "jmp __switch_to\n" /* regparm call */ \
- "1:\t" \
- "popl %%ebp\n\t" /* restore EBP */ \
- "popfl\n" /* restore flags */ \
- \
- /* output parameters */ \
- : [prev_sp] "=m" (prev->thread.sp), \
- [prev_ip] "=m" (prev->thread.ip), \
- "=a" (last), \
- \
- /* clobbered output registers: */ \
- "=b" (ebx), "=c" (ecx), "=d" (edx), \
- "=S" (esi), "=D" (edi) \
- \
- __switch_canary_oparam \
- \
- /* input parameters: */ \
- : [next_sp] "m" (next->thread.sp), \
- [next_ip] "m" (next->thread.ip), \
- \
- /* regparm parameters for __switch_to(): */ \
- [prev] "a" (prev), \
- [next] "d" (next) \
- \
- __switch_canary_iparam \
- \
- : /* reloaded segment registers */ \
- "memory"); \
-} while (0)
-
-/*
- * disable hlt during certain critical i/o operations
- */
-#define HAVE_DISABLE_HLT
-#else
-
-/* frame pointer must be last for get_wchan */
-#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
-#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
-
-#define __EXTRA_CLOBBER \
- , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
- "r12", "r13", "r14", "r15"
-
-#ifdef CONFIG_CC_STACKPROTECTOR
-#define __switch_canary \
- "movq %P[task_canary](%%rsi),%%r8\n\t" \
- "movq %%r8,"__percpu_arg([gs_canary])"\n\t"
-#define __switch_canary_oparam \
- , [gs_canary] "=m" (irq_stack_union.stack_canary)
-#define __switch_canary_iparam \
- , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
-#else /* CC_STACKPROTECTOR */
-#define __switch_canary
-#define __switch_canary_oparam
-#define __switch_canary_iparam
-#endif /* CC_STACKPROTECTOR */
-
-/* Save restore flags to clear handle leaking NT */
-#define switch_to(prev, next, last) \
- asm volatile(SAVE_CONTEXT \
- "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
- "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
- "call __switch_to\n\t" \
- "movq "__percpu_arg([current_task])",%%rsi\n\t" \
- __switch_canary \
- "movq %P[thread_info](%%rsi),%%r8\n\t" \
- "movq %%rax,%%rdi\n\t" \
- "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \
- "jnz ret_from_fork\n\t" \
- RESTORE_CONTEXT \
- : "=a" (last) \
- __switch_canary_oparam \
- : [next] "S" (next), [prev] "D" (prev), \
- [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
- [ti_flags] "i" (offsetof(struct thread_info, flags)), \
- [_tif_fork] "i" (_TIF_FORK), \
- [thread_info] "i" (offsetof(struct task_struct, stack)), \
- [current_task] "m" (current_task) \
- __switch_canary_iparam \
- : "memory", "cc" __EXTRA_CLOBBER)
-#endif
-
-#ifdef __KERNEL__
-
-extern void native_load_gs_index(unsigned);
-
-/*
- * Load a segment. Fall back on loading the zero
- * segment if something goes wrong..
- */
-#define loadsegment(seg, value) \
-do { \
- unsigned short __val = (value); \
- \
- asm volatile(" \n" \
- "1: movl %k0,%%" #seg " \n" \
- \
- ".section .fixup,\"ax\" \n" \
- "2: xorl %k0,%k0 \n" \
- " jmp 1b \n" \
- ".previous \n" \
- \
- _ASM_EXTABLE(1b, 2b) \
- \
- : "+r" (__val) : : "memory"); \
-} while (0)
-
-/*
- * Save a segment register away
- */
-#define savesegment(seg, value) \
- asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
-
-/*
- * x86_32 user gs accessors.
- */
-#ifdef CONFIG_X86_32
-#ifdef CONFIG_X86_32_LAZY_GS
-#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;})
-#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v))
-#define task_user_gs(tsk) ((tsk)->thread.gs)
-#define lazy_save_gs(v) savesegment(gs, (v))
-#define lazy_load_gs(v) loadsegment(gs, (v))
-#else /* X86_32_LAZY_GS */
-#define get_user_gs(regs) (u16)((regs)->gs)
-#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0)
-#define task_user_gs(tsk) (task_pt_regs(tsk)->gs)
-#define lazy_save_gs(v) do { } while (0)
-#define lazy_load_gs(v) do { } while (0)
-#endif /* X86_32_LAZY_GS */
-#endif /* X86_32 */
-
-static inline unsigned long get_limit(unsigned long segment)
-{
- unsigned long __limit;
- asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
- return __limit + 1;
-}
-
-static inline void native_clts(void)
-{
- asm volatile("clts");
-}
-
-/*
- * Volatile isn't enough to prevent the compiler from reordering the
- * read/write functions for the control registers and messing everything up.
- * A memory clobber would solve the problem, but would prevent reordering of
- * all loads stores around it, which can hurt performance. Solution is to
- * use a variable and mimic reads and writes to it to enforce serialization
- */
-static unsigned long __force_order;
-
-static inline unsigned long native_read_cr0(void)
-{
- unsigned long val;
- asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order));
- return val;
-}
-
-static inline void native_write_cr0(unsigned long val)
-{
- asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order));
-}
-
-static inline unsigned long native_read_cr2(void)
-{
- unsigned long val;
- asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order));
- return val;
-}
-
-static inline void native_write_cr2(unsigned long val)
-{
- asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
-}
-
-static inline unsigned long native_read_cr3(void)
-{
- unsigned long val;
- asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
- return val;
-}
-
-static inline void native_write_cr3(unsigned long val)
-{
- asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order));
-}
-
-static inline unsigned long native_read_cr4(void)
-{
- unsigned long val;
- asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
- return val;
-}
-
-static inline unsigned long native_read_cr4_safe(void)
-{
- unsigned long val;
- /* This could fault if %cr4 does not exist. In x86_64, a cr4 always
- * exists, so it will never fail. */
-#ifdef CONFIG_X86_32
- asm volatile("1: mov %%cr4, %0\n"
- "2:\n"
- _ASM_EXTABLE(1b, 2b)
- : "=r" (val), "=m" (__force_order) : "0" (0));
-#else
- val = native_read_cr4();
-#endif
- return val;
-}
-
-static inline void native_write_cr4(unsigned long val)
-{
- asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order));
-}
-
-#ifdef CONFIG_X86_64
-static inline unsigned long native_read_cr8(void)
-{
- unsigned long cr8;
- asm volatile("movq %%cr8,%0" : "=r" (cr8));
- return cr8;
-}
-
-static inline void native_write_cr8(unsigned long val)
-{
- asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
-}
-#endif
-
-static inline void native_wbinvd(void)
-{
- asm volatile("wbinvd": : :"memory");
-}
-
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-
-static inline unsigned long read_cr0(void)
-{
- return native_read_cr0();
-}
-
-static inline void write_cr0(unsigned long x)
-{
- native_write_cr0(x);
-}
-
-static inline unsigned long read_cr2(void)
-{
- return native_read_cr2();
-}
-
-static inline void write_cr2(unsigned long x)
-{
- native_write_cr2(x);
-}
-
-static inline unsigned long read_cr3(void)
-{
- return native_read_cr3();
-}
-
-static inline void write_cr3(unsigned long x)
-{
- native_write_cr3(x);
-}
-
-static inline unsigned long read_cr4(void)
-{
- return native_read_cr4();
-}
-
-static inline unsigned long read_cr4_safe(void)
-{
- return native_read_cr4_safe();
-}
-
-static inline void write_cr4(unsigned long x)
-{
- native_write_cr4(x);
-}
-
-static inline void wbinvd(void)
-{
- native_wbinvd();
-}
-
-#ifdef CONFIG_X86_64
-
-static inline unsigned long read_cr8(void)
-{
- return native_read_cr8();
-}
-
-static inline void write_cr8(unsigned long x)
-{
- native_write_cr8(x);
-}
-
-static inline void load_gs_index(unsigned selector)
-{
- native_load_gs_index(selector);
-}
-
-#endif
-
-/* Clear the 'TS' bit */
-static inline void clts(void)
-{
- native_clts();
-}
-
-#endif/* CONFIG_PARAVIRT */
-
-#define stts() write_cr0(read_cr0() | X86_CR0_TS)
-
-#endif /* __KERNEL__ */
-
-static inline void clflush(volatile void *__p)
-{
- asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
-}
-
-#define nop() asm volatile ("nop")
-
-void disable_hlt(void);
-void enable_hlt(void);
-
-void cpu_idle_wait(void);
-
-extern unsigned long arch_align_stack(unsigned long sp);
-extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
-
-void default_idle(void);
-bool set_pm_idle_to_default(void);
-
-void stop_this_cpu(void *dummy);
-
-/*
- * Force strict CPU ordering.
- * And yes, this is required on UP too when we're talking
- * to devices.
- */
-#ifdef CONFIG_X86_32
-/*
- * Some non-Intel clones support out of order store. wmb() ceases to be a
- * nop for these.
- */
-#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
-#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
-#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
-#else
-#define mb() asm volatile("mfence":::"memory")
-#define rmb() asm volatile("lfence":::"memory")
-#define wmb() asm volatile("sfence" ::: "memory")
-#endif
-
-/**
- * read_barrier_depends - Flush all pending reads that subsequents reads
- * depend on.
- *
- * No data-dependent reads from memory-like regions are ever reordered
- * over this barrier. All reads preceding this primitive are guaranteed
- * to access memory (but not necessarily other CPUs' caches) before any
- * reads following this primitive that depend on the data return by
- * any of the preceding reads. This primitive is much lighter weight than
- * rmb() on most CPUs, and is never heavier weight than is
- * rmb().
- *
- * These ordering constraints are respected by both the local CPU
- * and the compiler.
- *
- * Ordering is not guaranteed by anything other than these primitives,
- * not even by data dependencies. See the documentation for
- * memory_barrier() for examples and URLs to more information.
- *
- * For example, the following code would force ordering (the initial
- * value of "a" is zero, "b" is one, and "p" is "&a"):
- *
- * <programlisting>
- * CPU 0 CPU 1
- *
- * b = 2;
- * memory_barrier();
- * p = &b; q = p;
- * read_barrier_depends();
- * d = *q;
- * </programlisting>
- *
- * because the read of "*q" depends on the read of "p" and these
- * two reads are separated by a read_barrier_depends(). However,
- * the following code, with the same initial values for "a" and "b":
- *
- * <programlisting>
- * CPU 0 CPU 1
- *
- * a = 2;
- * memory_barrier();
- * b = 3; y = b;
- * read_barrier_depends();
- * x = a;
- * </programlisting>
- *
- * does not enforce ordering, since there is no data dependency between
- * the read of "a" and the read of "b". Therefore, on some CPUs, such
- * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
- * in cases like this where there are no data dependencies.
- **/
-
-#define read_barrier_depends() do { } while (0)
-
-#ifdef CONFIG_SMP
-#define smp_mb() mb()
-#ifdef CONFIG_X86_PPRO_FENCE
-# define smp_rmb() rmb()
-#else
-# define smp_rmb() barrier()
-#endif
-#ifdef CONFIG_X86_OOSTORE
-# define smp_wmb() wmb()
-#else
-# define smp_wmb() barrier()
-#endif
-#define smp_read_barrier_depends() read_barrier_depends()
-#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
-#else
-#define smp_mb() barrier()
-#define smp_rmb() barrier()
-#define smp_wmb() barrier()
-#define smp_read_barrier_depends() do { } while (0)
-#define set_mb(var, value) do { var = value; barrier(); } while (0)
-#endif
-
-/*
- * Stop RDTSC speculation. This is needed when you need to use RDTSC
- * (or get_cycles or vread that possibly accesses the TSC) in a defined
- * code region.
- *
- * (Could use an alternative three way for this if there was one.)
- */
-static __always_inline void rdtsc_barrier(void)
-{
- alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
- alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
-}
-
-/*
- * We handle most unaligned accesses in hardware. On the other hand
- * unaligned DMA can be quite expensive on some Nehalem processors.
- *
- * Based on this we disable the IP header alignment in network drivers.
- */
-#define NET_IP_ALIGN 0
-#endif /* _ASM_X86_SYSTEM_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 169be89..c0e108e 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -5,7 +5,7 @@
#include <linux/sched.h>
#include <asm/processor.h>
-#include <asm/system.h>
+#include <asm/special_insns.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 15d9915..c91e8b9 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -61,7 +61,7 @@ extern void check_tsc_sync_source(int cpu);
extern void check_tsc_sync_target(void);
extern int notsc_setup(char *);
-extern void save_sched_clock_state(void);
-extern void restore_sched_clock_state(void);
+extern void tsc_save_sched_clock_state(void);
+extern void tsc_restore_sched_clock_state(void);
#endif /* _ASM_X86_TSC_H */
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 815285b..8b38be2 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -5,13 +5,8 @@
#include <linux/clocksource.h>
struct vsyscall_gtod_data {
- seqlock_t lock;
+ seqcount_t seq;
- /* open coded 'struct timespec' */
- time_t wall_time_sec;
- u32 wall_time_nsec;
-
- struct timezone sys_tz;
struct { /* extract of a clocksource struct */
int vclock_mode;
cycle_t cycle_last;
@@ -19,8 +14,16 @@ struct vsyscall_gtod_data {
u32 mult;
u32 shift;
} clock;
- struct timespec wall_to_monotonic;
+
+ /* open coded 'struct timespec' */
+ time_t wall_time_sec;
+ u32 wall_time_nsec;
+ u32 monotonic_time_nsec;
+ time_t monotonic_time_sec;
+
+ struct timezone sys_tz;
struct timespec wall_time_coarse;
+ struct timespec monotonic_time_coarse;
};
extern struct vsyscall_gtod_data vsyscall_gtod_data;
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index e0f9aa1..5da71c2 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -16,7 +16,6 @@
#define _ASM_X86_VIRTEX_H
#include <asm/processor.h>
-#include <asm/system.h>
#include <asm/vmx.h>
#include <asm/svm.h>
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 517d476..baaca8d 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -145,9 +145,11 @@ struct x86_init_ops {
/**
* struct x86_cpuinit_ops - platform specific cpu hotplug setups
* @setup_percpu_clockev: set up the per cpu clock event device
+ * @early_percpu_clock_init: early init of the per cpu clock event device
*/
struct x86_cpuinit_ops {
void (*setup_percpu_clockev)(void);
+ void (*early_percpu_clock_init)(void);
void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node);
};
@@ -160,6 +162,8 @@ struct x86_cpuinit_ops {
* @is_untracked_pat_range exclude from PAT logic
* @nmi_init enable NMI on cpus
* @i8042_detect pre-detect if i8042 controller exists
+ * @save_sched_clock_state: save state for sched_clock() on suspend
+ * @restore_sched_clock_state: restore state for sched_clock() on resume
*/
struct x86_platform_ops {
unsigned long (*calibrate_tsc)(void);
@@ -171,6 +175,8 @@ struct x86_platform_ops {
void (*nmi_init)(void);
unsigned char (*get_nmi_reason)(void);
int (*i8042_detect)(void);
+ void (*save_sched_clock_state)(void);
+ void (*restore_sched_clock_state)(void);
};
struct pci_dev;
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index a1f2db5..cbf0c9d 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -56,6 +56,7 @@ DEFINE_GUEST_HANDLE(int);
DEFINE_GUEST_HANDLE(long);
DEFINE_GUEST_HANDLE(void);
DEFINE_GUEST_HANDLE(uint64_t);
+DEFINE_GUEST_HANDLE(uint32_t);
#endif
#ifndef HYPERVISOR_VIRT_START
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index f50e7fb..d2b7f27 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -14,6 +14,7 @@
#include <acpi/processor.h>
#include <asm/acpi.h>
#include <asm/mwait.h>
+#include <asm/special_insns.h>
/*
* Initialize bm_flags based on the CPU cache properties
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 5d56931..459e78c 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -231,7 +231,6 @@
#include <linux/syscore_ops.h>
#include <linux/i8253.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/desc.h>
#include <asm/olpc.h>
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b240323..67e2583 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -18,6 +18,7 @@
#include <asm/archrandom.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
+#include <asm/debugreg.h>
#include <asm/sections.h>
#include <linux/topology.h>
#include <linux/cpumask.h>
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 5c0e653..2d5454c 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -9,7 +9,6 @@
#include <linux/smp.h>
#include <asm/processor.h>
-#include <asm/system.h>
#include <asm/mce.h>
#include <asm/msr.h>
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 67bb17a..47a1870 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -25,7 +25,6 @@
#include <linux/cpu.h>
#include <asm/processor.h>
-#include <asm/system.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/mce.h>
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 54060f5..2d7998f 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -8,7 +8,6 @@
#include <linux/init.h>
#include <asm/processor.h>
-#include <asm/system.h>
#include <asm/mce.h>
#include <asm/msr.h>
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 97b2635..75772ae 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -12,7 +12,6 @@
#include <asm/processor-flags.h>
#include <asm/cpufeature.h>
#include <asm/tlbflush.h>
-#include <asm/system.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
#include <asm/pat.h>
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 0a18d16..fa2900c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -643,14 +643,14 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
/* Prefer fixed purpose counters */
if (x86_pmu.num_counters_fixed) {
idx = X86_PMC_IDX_FIXED;
- for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) {
+ for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
if (!__test_and_set_bit(idx, sched->state.used))
goto done;
}
}
/* Grab the first unused counter starting with idx */
idx = sched->state.counter;
- for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) {
+ for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) {
if (!__test_and_set_bit(idx, sched->state.used))
goto done;
}
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index a524353..39472dd 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -43,7 +43,6 @@
#include <asm/processor.h>
#include <asm/msr.h>
-#include <asm/system.h>
static struct class *cpuid_class;
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 6104852..36d1853 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -15,7 +15,6 @@
#include <linux/delay.h>
#include <linux/atomic.h>
-#include <asm/system.h>
#include <asm/timer.h>
#include <asm/hw_irq.h>
#include <asm/pgtable.h>
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 313fb5c..6d5fc8c 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -16,7 +16,6 @@
#include <linux/delay.h>
#include <linux/atomic.h>
-#include <asm/system.h>
#include <asm/timer.h>
#include <asm/hw_irq.h>
#include <asm/pgtable.h>
@@ -306,10 +305,10 @@ void __init native_init_IRQ(void)
* us. (some of these will be overridden and become
* 'special' SMP interrupts)
*/
- for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+ i = FIRST_EXTERNAL_VECTOR;
+ for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {
/* IA32_SYSCALL_VECTOR could be used in trap_init already. */
- if (!test_bit(i, used_vectors))
- set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+ set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
}
if (!acpi_ioapic && !of_ioapic)
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index faba577..db6720e 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -46,7 +46,6 @@
#include <asm/debugreg.h>
#include <asm/apicdef.h>
-#include <asm/system.h>
#include <asm/apic.h>
#include <asm/nmi.h>
@@ -67,8 +66,6 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
{ "ss", 4, offsetof(struct pt_regs, ss) },
{ "ds", 4, offsetof(struct pt_regs, ds) },
{ "es", 4, offsetof(struct pt_regs, es) },
- { "fs", 4, -1 },
- { "gs", 4, -1 },
#else
{ "ax", 8, offsetof(struct pt_regs, ax) },
{ "bx", 8, offsetof(struct pt_regs, bx) },
@@ -90,7 +87,11 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
{ "flags", 4, offsetof(struct pt_regs, flags) },
{ "cs", 4, offsetof(struct pt_regs, cs) },
{ "ss", 4, offsetof(struct pt_regs, ss) },
+ { "ds", 4, -1 },
+ { "es", 4, -1 },
#endif
+ { "fs", 4, -1 },
+ { "gs", 4, -1 },
};
int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 44842d7..f8492da6 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -136,6 +136,15 @@ int kvm_register_clock(char *txt)
return ret;
}
+static void kvm_save_sched_clock_state(void)
+{
+}
+
+static void kvm_restore_sched_clock_state(void)
+{
+ kvm_register_clock("primary cpu clock, resume");
+}
+
#ifdef CONFIG_X86_LOCAL_APIC
static void __cpuinit kvm_setup_secondary_clock(void)
{
@@ -144,8 +153,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
* we shouldn't fail.
*/
WARN_ON(kvm_register_clock("secondary cpu clock"));
- /* ok, done with our trickery, call native */
- setup_secondary_APIC_clock();
}
#endif
@@ -194,9 +201,11 @@ void __init kvmclock_init(void)
x86_platform.get_wallclock = kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
- x86_cpuinit.setup_percpu_clockev =
+ x86_cpuinit.early_percpu_clock_init =
kvm_setup_secondary_clock;
#endif
+ x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
+ x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC
machine_ops.crash_shutdown = kvm_crash_shutdown;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ea69726..ebc9873 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -15,7 +15,6 @@
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
-#include <asm/system.h>
#include <asm/ldt.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index a3fa43b..5b19e4d 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -23,7 +23,6 @@
#include <asm/apic.h>
#include <asm/cpufeature.h>
#include <asm/desc.h>
-#include <asm/system.h>
#include <asm/cacheflush.h>
#include <asm/debugreg.h>
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 177183c..7eb1e2b 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -43,7 +43,6 @@
#include <linux/mca.h>
#include <linux/kprobes.h>
#include <linux/slab.h>
-#include <asm/system.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/mman.h>
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 925179f..f21fd94 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -26,7 +26,6 @@
#include <linux/gfp.h>
#include <linux/jump_label.h>
-#include <asm/system.h>
#include <asm/page.h>
#include <asm/pgtable.h>
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 9635676..eb11369 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -40,7 +40,6 @@
#include <asm/processor.h>
#include <asm/msr.h>
-#include <asm/system.h>
static struct class *msr_class;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index ada2f99..ab13760 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -26,6 +26,7 @@
#include <asm/bug.h>
#include <asm/paravirt.h>
+#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/pgtable.h>
@@ -37,6 +38,7 @@
#include <asm/apic.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
+#include <asm/special_insns.h>
/* nop stub */
void _paravirt_nop(void)
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 726494b..6ac5782 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -42,7 +42,6 @@
#include <asm/calgary.h>
#include <asm/tce.h>
#include <asm/pci-direct.h>
-#include <asm/system.h>
#include <asm/dma.h>
#include <asm/rio.h>
#include <asm/bios_ebda.h>
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 1c4d769..28e5e06 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -262,10 +262,11 @@ rootfs_initcall(pci_iommu_init);
static __devinit void via_no_dac(struct pci_dev *dev)
{
- if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
+ if (forbid_dac == 0) {
dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
forbid_dac = 1;
}
}
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID,
+ PCI_CLASS_BRIDGE_PCI, 8, via_no_dac);
#endif
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 29309c4..a33afaa 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -18,7 +18,6 @@
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
-#include <asm/system.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
#include <asm/idle.h>
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ea207c2..ae68473 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -38,7 +38,6 @@
#include <linux/kdebug.h>
#include <asm/pgtable.h>
-#include <asm/system.h>
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/i387.h>
@@ -55,6 +54,7 @@
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
+#include <asm/switch_to.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ce5e34f..2b154da 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,7 +37,6 @@
#include <linux/ftrace.h>
#include <asm/pgtable.h>
-#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
@@ -49,6 +48,7 @@
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
+#include <asm/switch_to.h>
asmlinkage extern void ret_from_fork(void);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 78f05e4..8a634c8 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -24,7 +24,6 @@
#include <asm/uaccess.h>
#include <asm/pgtable.h>
-#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 8863888..1a29015 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -90,7 +90,6 @@
#include <asm/processor.h>
#include <asm/bugs.h>
-#include <asm/system.h>
#include <asm/vsyscall.h>
#include <asm/cpu.h>
#include <asm/desc.h>
@@ -509,15 +508,6 @@ static void __init memblock_x86_reserve_range_setup_data(void)
#ifdef CONFIG_KEXEC
-static inline unsigned long long get_total_mem(void)
-{
- unsigned long long total;
-
- total = max_pfn - min_low_pfn;
-
- return total << PAGE_SHIFT;
-}
-
/*
* Keep the crash kernel below this limit. On 32 bits earlier kernels
* would limit the kernel to the low 512 MiB due to mapping restrictions.
@@ -536,7 +526,7 @@ static void __init reserve_crashkernel(void)
unsigned long long crash_size, crash_base;
int ret;
- total_mem = get_total_mem();
+ total_mem = memblock_phys_mem_size();
ret = parse_crashkernel(boot_command_line, total_mem,
&crash_size, &crash_base);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index e578a79..5104a2b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -255,6 +255,7 @@ notrace static void __cpuinit start_secondary(void *unused)
* most necessary things.
*/
cpu_init();
+ x86_cpuinit.early_percpu_clock_init();
preempt_disable();
smp_callin();
diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c
index 9e540fe..ab40954 100644
--- a/arch/x86/kernel/tce_64.c
+++ b/arch/x86/kernel/tce_64.c
@@ -34,6 +34,7 @@
#include <asm/tce.h>
#include <asm/calgary.h>
#include <asm/proto.h>
+#include <asm/cacheflush.h>
/* flush a tce at 'tceaddr' to main memory */
static inline void flush_tce(void* tceaddr)
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index bcfec2d..9d9d2f9 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -6,7 +6,6 @@
#include <asm/uaccess.h>
#include <asm/desc.h>
-#include <asm/system.h>
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/proto.h>
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ec61d4c..860f126 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -50,7 +50,6 @@
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <linux/atomic.h>
-#include <asm/system.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/i387.h>
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 183c592..fc0a147 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -630,7 +630,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
static unsigned long long cyc2ns_suspend;
-void save_sched_clock_state(void)
+void tsc_save_sched_clock_state(void)
{
if (!sched_clock_stable)
return;
@@ -646,7 +646,7 @@ void save_sched_clock_state(void)
* that sched_clock() continues from the point where it was left off during
* suspend.
*/
-void restore_sched_clock_state(void)
+void tsc_restore_sched_clock_state(void)
{
unsigned long long offset;
unsigned long flags;
@@ -933,6 +933,16 @@ static int __init init_tsc_clocksource(void)
clocksource_tsc.rating = 0;
clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
}
+
+ /*
+ * Trust the results of the earlier calibration on systems
+ * exporting a reliable TSC.
+ */
+ if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
+ clocksource_register_khz(&clocksource_tsc, tsc_khz);
+ return 0;
+ }
+
schedule_delayed_work(&tsc_irqwork, 0);
return 0;
}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index b07ba93..d5c6986 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -52,10 +52,7 @@
#include "vsyscall_trace.h"
DEFINE_VVAR(int, vgetcpu_mode);
-DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
-{
- .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
-};
+DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
@@ -80,20 +77,15 @@ early_param("vsyscall", vsyscall_setup);
void update_vsyscall_tz(void)
{
- unsigned long flags;
-
- write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
- /* sys_tz has changed */
vsyscall_gtod_data.sys_tz = sys_tz;
- write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
struct clocksource *clock, u32 mult)
{
- unsigned long flags;
+ struct timespec monotonic;
- write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+ write_seqcount_begin(&vsyscall_gtod_data.seq);
/* copy vsyscall data */
vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode;
@@ -101,12 +93,19 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
vsyscall_gtod_data.clock.mask = clock->mask;
vsyscall_gtod_data.clock.mult = mult;
vsyscall_gtod_data.clock.shift = clock->shift;
+
vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
- vsyscall_gtod_data.wall_to_monotonic = *wtm;
+
+ monotonic = timespec_add(*wall_time, *wtm);
+ vsyscall_gtod_data.monotonic_time_sec = monotonic.tv_sec;
+ vsyscall_gtod_data.monotonic_time_nsec = monotonic.tv_nsec;
+
vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
+ vsyscall_gtod_data.monotonic_time_coarse =
+ timespec_add(vsyscall_gtod_data.wall_time_coarse, *wtm);
- write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
+ write_seqcount_end(&vsyscall_gtod_data.seq);
}
static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 947a06c..e9f265f 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -91,6 +91,7 @@ struct x86_init_ops x86_init __initdata = {
};
struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
+ .early_percpu_clock_init = x86_init_noop,
.setup_percpu_clockev = setup_secondary_APIC_clock,
.fixup_cpu_id = x86_default_fixup_cpu_id,
};
@@ -107,7 +108,9 @@ struct x86_platform_ops x86_platform = {
.is_untracked_pat_range = is_ISA_range,
.nmi_init = default_nmi_init,
.get_nmi_reason = default_get_nmi_reason,
- .i8042_detect = default_i8042_detect
+ .i8042_detect = default_i8042_detect,
+ .save_sched_clock_state = tsc_save_sched_clock_state,
+ .restore_sched_clock_state = tsc_restore_sched_clock_state,
};
EXPORT_SYMBOL_GPL(x86_platform);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 89b02bf..9fed5be 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -236,7 +236,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
const u32 kvm_supported_word6_x86_features =
F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
- F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
+ F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
/* cpuid 0xC0000001.edx */
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 5b97e17..26d1fb4 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -43,4 +43,12 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
}
+static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+ return best && (best->ecx & bit(X86_FEATURE_OSVW));
+}
+
#endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0982507..8375622 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -57,6 +57,7 @@
#define OpDS 23ull /* DS */
#define OpFS 24ull /* FS */
#define OpGS 25ull /* GS */
+#define OpMem8 26ull /* 8-bit zero extended memory operand */
#define OpBits 5 /* Width of operand field */
#define OpMask ((1ull << OpBits) - 1)
@@ -101,6 +102,7 @@
#define SrcAcc (OpAcc << SrcShift)
#define SrcImmU16 (OpImmU16 << SrcShift)
#define SrcDX (OpDX << SrcShift)
+#define SrcMem8 (OpMem8 << SrcShift)
#define SrcMask (OpMask << SrcShift)
#define BitOp (1<<11)
#define MemAbs (1<<12) /* Memory operand is absolute displacement */
@@ -858,8 +860,7 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
}
static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
- struct operand *op,
- int inhibit_bytereg)
+ struct operand *op)
{
unsigned reg = ctxt->modrm_reg;
int highbyte_regs = ctxt->rex_prefix == 0;
@@ -876,7 +877,7 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
}
op->type = OP_REG;
- if ((ctxt->d & ByteOp) && !inhibit_bytereg) {
+ if (ctxt->d & ByteOp) {
op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs);
op->bytes = 1;
} else {
@@ -1151,6 +1152,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
return 1;
}
+static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 index, struct desc_struct *desc)
+{
+ struct desc_ptr dt;
+ ulong addr;
+
+ ctxt->ops->get_idt(ctxt, &dt);
+
+ if (dt.size < index * 8 + 7)
+ return emulate_gp(ctxt, index << 3 | 0x2);
+
+ addr = dt.address + index * 8;
+ return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
+ &ctxt->exception);
+}
+
static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
u16 selector, struct desc_ptr *dt)
{
@@ -1227,6 +1244,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
seg_desc.type = 3;
seg_desc.p = 1;
seg_desc.s = 1;
+ if (ctxt->mode == X86EMUL_MODE_VM86)
+ seg_desc.dpl = 3;
goto load;
}
@@ -1891,6 +1910,17 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
ss->p = 1;
}
+static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax, ebx, ecx, edx;
+
+ eax = ecx = 0;
+ return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)
+ && ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
+ && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
+ && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
+}
+
static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
{
struct x86_emulate_ops *ops = ctxt->ops;
@@ -2007,6 +2037,14 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
if (ctxt->mode == X86EMUL_MODE_REAL)
return emulate_gp(ctxt, 0);
+ /*
+ * Not recognized on AMD in compat mode (but is recognized in legacy
+ * mode).
+ */
+ if ((ctxt->mode == X86EMUL_MODE_PROT32) && (efer & EFER_LMA)
+ && !vendor_intel(ctxt))
+ return emulate_ud(ctxt);
+
/* XXX sysenter/sysexit have not been tested in 64bit mode.
* Therefore, we inject an #UD.
*/
@@ -2306,6 +2344,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
return emulate_gp(ctxt, 0);
ctxt->_eip = tss->eip;
ctxt->eflags = tss->eflags | 2;
+
+ /* General purpose registers */
ctxt->regs[VCPU_REGS_RAX] = tss->eax;
ctxt->regs[VCPU_REGS_RCX] = tss->ecx;
ctxt->regs[VCPU_REGS_RDX] = tss->edx;
@@ -2328,6 +2368,24 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS);
/*
+ * If we're switching between Protected Mode and VM86, we need to make
+ * sure to update the mode before loading the segment descriptors so
+ * that the selectors are interpreted correctly.
+ *
+ * Need to get rflags to the vcpu struct immediately because it
+ * influences the CPL which is checked at least when loading the segment
+ * descriptors and when pushing an error code to the new kernel stack.
+ *
+ * TODO Introduce a separate ctxt->ops->set_cpl callback
+ */
+ if (ctxt->eflags & X86_EFLAGS_VM)
+ ctxt->mode = X86EMUL_MODE_VM86;
+ else
+ ctxt->mode = X86EMUL_MODE_PROT32;
+
+ ctxt->ops->set_rflags(ctxt, ctxt->eflags);
+
+ /*
* Now load segment descriptors. If fault happenes at this stage
* it is handled in a context of new task
*/
@@ -2401,7 +2459,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
}
static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
- u16 tss_selector, int reason,
+ u16 tss_selector, int idt_index, int reason,
bool has_error_code, u32 error_code)
{
struct x86_emulate_ops *ops = ctxt->ops;
@@ -2423,12 +2481,35 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
/* FIXME: check that next_tss_desc is tss */
- if (reason != TASK_SWITCH_IRET) {
- if ((tss_selector & 3) > next_tss_desc.dpl ||
- ops->cpl(ctxt) > next_tss_desc.dpl)
- return emulate_gp(ctxt, 0);
+ /*
+ * Check privileges. The three cases are task switch caused by...
+ *
+ * 1. jmp/call/int to task gate: Check against DPL of the task gate
+ * 2. Exception/IRQ/iret: No check is performed
+ * 3. jmp/call to TSS: Check agains DPL of the TSS
+ */
+ if (reason == TASK_SWITCH_GATE) {
+ if (idt_index != -1) {
+ /* Software interrupts */
+ struct desc_struct task_gate_desc;
+ int dpl;
+
+ ret = read_interrupt_descriptor(ctxt, idt_index,
+ &task_gate_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ dpl = task_gate_desc.dpl;
+ if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
+ return emulate_gp(ctxt, (idt_index << 3) | 0x2);
+ }
+ } else if (reason != TASK_SWITCH_IRET) {
+ int dpl = next_tss_desc.dpl;
+ if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
+ return emulate_gp(ctxt, tss_selector);
}
+
desc_limit = desc_limit_scaled(&next_tss_desc);
if (!next_tss_desc.p ||
((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
@@ -2481,7 +2562,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
}
int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
- u16 tss_selector, int reason,
+ u16 tss_selector, int idt_index, int reason,
bool has_error_code, u32 error_code)
{
int rc;
@@ -2489,7 +2570,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
ctxt->_eip = ctxt->eip;
ctxt->dst.type = OP_NONE;
- rc = emulator_do_task_switch(ctxt, tss_selector, reason,
+ rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason,
has_error_code, error_code);
if (rc == X86EMUL_CONTINUE)
@@ -3514,13 +3595,13 @@ static struct opcode twobyte_table[256] = {
I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
- D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+ D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
/* 0xB8 - 0xBF */
N, N,
G(BitOp, group8),
I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
- D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+ D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
/* 0xC0 - 0xCF */
D2bv(DstMem | SrcReg | ModRM | Lock),
N, D(DstMem | SrcReg | ModRM | Mov),
@@ -3602,9 +3683,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
switch (d) {
case OpReg:
- decode_register_operand(ctxt, op,
- op == &ctxt->dst &&
- ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
+ decode_register_operand(ctxt, op);
break;
case OpImmUByte:
rc = decode_imm(ctxt, op, 1, false);
@@ -3656,6 +3735,9 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
case OpImm:
rc = decode_imm(ctxt, op, imm_size(ctxt), true);
break;
+ case OpMem8:
+ ctxt->memop.bytes = 1;
+ goto mem_common;
case OpMem16:
ctxt->memop.bytes = 2;
goto mem_common;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index b6a7353..81cf4fa 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -307,6 +307,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
if (val & 0x10) {
s->init4 = val & 1;
s->last_irr = 0;
+ s->irr &= s->elcr;
s->imr = 0;
s->priority_add = 0;
s->special_mask = 0;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 31bfc69..8584322 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -433,7 +433,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
break;
case APIC_DM_INIT:
- if (level) {
+ if (!trig_mode || level) {
result = 1;
vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -731,7 +731,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
u64 ns = 0;
struct kvm_vcpu *vcpu = apic->vcpu;
- unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu);
+ unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
unsigned long flags;
if (unlikely(!tscdeadline || !this_tsc_khz))
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 224b02c..4cb1642 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -688,9 +688,8 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
{
unsigned long idx;
- idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
- (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
- return &slot->lpage_info[level - 2][idx];
+ idx = gfn_to_index(gfn, slot->base_gfn, level);
+ return &slot->arch.lpage_info[level - 2][idx];
}
static void account_shadowed(struct kvm *kvm, gfn_t gfn)
@@ -946,7 +945,7 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
}
}
-static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level,
+static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
struct kvm_memory_slot *slot)
{
struct kvm_lpage_info *linfo;
@@ -966,7 +965,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
struct kvm_memory_slot *slot;
slot = gfn_to_memslot(kvm, gfn);
- return __gfn_to_rmap(kvm, gfn, level, slot);
+ return __gfn_to_rmap(gfn, level, slot);
}
static bool rmap_can_add(struct kvm_vcpu *vcpu)
@@ -988,7 +987,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
return pte_list_add(vcpu, spte, rmapp);
}
-static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
+static u64 *rmap_next(unsigned long *rmapp, u64 *spte)
{
return pte_list_next(rmapp, spte);
}
@@ -1018,8 +1017,8 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
u64 *spte;
int i, write_protected = 0;
- rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot);
- spte = rmap_next(kvm, rmapp, NULL);
+ rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot);
+ spte = rmap_next(rmapp, NULL);
while (spte) {
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
@@ -1027,14 +1026,14 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
write_protected = 1;
}
- spte = rmap_next(kvm, rmapp, spte);
+ spte = rmap_next(rmapp, spte);
}
/* check for huge page mappings */
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- rmapp = __gfn_to_rmap(kvm, gfn, i, slot);
- spte = rmap_next(kvm, rmapp, NULL);
+ rmapp = __gfn_to_rmap(gfn, i, slot);
+ spte = rmap_next(rmapp, NULL);
while (spte) {
BUG_ON(!(*spte & PT_PRESENT_MASK));
BUG_ON(!is_large_pte(*spte));
@@ -1045,7 +1044,7 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
spte = NULL;
write_protected = 1;
}
- spte = rmap_next(kvm, rmapp, spte);
+ spte = rmap_next(rmapp, spte);
}
}
@@ -1066,7 +1065,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
u64 *spte;
int need_tlb_flush = 0;
- while ((spte = rmap_next(kvm, rmapp, NULL))) {
+ while ((spte = rmap_next(rmapp, NULL))) {
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
drop_spte(kvm, spte);
@@ -1085,14 +1084,14 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
WARN_ON(pte_huge(*ptep));
new_pfn = pte_pfn(*ptep);
- spte = rmap_next(kvm, rmapp, NULL);
+ spte = rmap_next(rmapp, NULL);
while (spte) {
BUG_ON(!is_shadow_present_pte(*spte));
rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
need_flush = 1;
if (pte_write(*ptep)) {
drop_spte(kvm, spte);
- spte = rmap_next(kvm, rmapp, NULL);
+ spte = rmap_next(rmapp, NULL);
} else {
new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
new_spte |= (u64)new_pfn << PAGE_SHIFT;
@@ -1102,7 +1101,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
new_spte &= ~shadow_accessed_mask;
mmu_spte_clear_track_bits(spte);
mmu_spte_set(spte, new_spte);
- spte = rmap_next(kvm, rmapp, spte);
+ spte = rmap_next(rmapp, spte);
}
}
if (need_flush)
@@ -1176,7 +1175,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
if (!shadow_accessed_mask)
return kvm_unmap_rmapp(kvm, rmapp, data);
- spte = rmap_next(kvm, rmapp, NULL);
+ spte = rmap_next(rmapp, NULL);
while (spte) {
int _young;
u64 _spte = *spte;
@@ -1186,7 +1185,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
young = 1;
clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
}
- spte = rmap_next(kvm, rmapp, spte);
+ spte = rmap_next(rmapp, spte);
}
return young;
}
@@ -1205,7 +1204,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
if (!shadow_accessed_mask)
goto out;
- spte = rmap_next(kvm, rmapp, NULL);
+ spte = rmap_next(rmapp, NULL);
while (spte) {
u64 _spte = *spte;
BUG_ON(!(_spte & PT_PRESENT_MASK));
@@ -1214,7 +1213,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
young = 1;
break;
}
- spte = rmap_next(kvm, rmapp, spte);
+ spte = rmap_next(rmapp, spte);
}
out:
return young;
@@ -1391,11 +1390,6 @@ struct kvm_mmu_pages {
unsigned int nr;
};
-#define for_each_unsync_children(bitmap, idx) \
- for (idx = find_first_bit(bitmap, 512); \
- idx < 512; \
- idx = find_next_bit(bitmap, 512, idx+1))
-
static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
int idx)
{
@@ -1417,7 +1411,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
{
int i, ret, nr_unsync_leaf = 0;
- for_each_unsync_children(sp->unsync_child_bitmap, i) {
+ for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
struct kvm_mmu_page *child;
u64 ent = sp->spt[i];
@@ -1803,6 +1797,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
{
if (is_large_pte(*sptep)) {
drop_spte(vcpu->kvm, sptep);
+ --vcpu->kvm->stat.lpages;
kvm_flush_remote_tlbs(vcpu->kvm);
}
}
@@ -3190,15 +3185,14 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
#undef PTTYPE
static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context,
- int level)
+ struct kvm_mmu *context)
{
int maxphyaddr = cpuid_maxphyaddr(vcpu);
u64 exb_bit_rsvd = 0;
if (!context->nx)
exb_bit_rsvd = rsvd_bits(63, 63);
- switch (level) {
+ switch (context->root_level) {
case PT32_ROOT_LEVEL:
/* no rsvd bits for 2 level 4K page table entries */
context->rsvd_bits_mask[0][1] = 0;
@@ -3256,8 +3250,9 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
int level)
{
context->nx = is_nx(vcpu);
+ context->root_level = level;
- reset_rsvds_bits_mask(vcpu, context, level);
+ reset_rsvds_bits_mask(vcpu, context);
ASSERT(is_pae(vcpu));
context->new_cr3 = paging_new_cr3;
@@ -3267,7 +3262,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
context->invlpg = paging64_invlpg;
context->update_pte = paging64_update_pte;
context->free = paging_free;
- context->root_level = level;
context->shadow_root_level = level;
context->root_hpa = INVALID_PAGE;
context->direct_map = false;
@@ -3284,8 +3278,9 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
struct kvm_mmu *context)
{
context->nx = false;
+ context->root_level = PT32_ROOT_LEVEL;
- reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
+ reset_rsvds_bits_mask(vcpu, context);
context->new_cr3 = paging_new_cr3;
context->page_fault = paging32_page_fault;
@@ -3294,7 +3289,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
context->sync_page = paging32_sync_page;
context->invlpg = paging32_invlpg;
context->update_pte = paging32_update_pte;
- context->root_level = PT32_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
context->direct_map = false;
@@ -3325,7 +3319,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->get_cr3 = get_cr3;
context->get_pdptr = kvm_pdptr_read;
context->inject_page_fault = kvm_inject_page_fault;
- context->nx = is_nx(vcpu);
if (!is_paging(vcpu)) {
context->nx = false;
@@ -3333,19 +3326,19 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->root_level = 0;
} else if (is_long_mode(vcpu)) {
context->nx = is_nx(vcpu);
- reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
- context->gva_to_gpa = paging64_gva_to_gpa;
context->root_level = PT64_ROOT_LEVEL;
+ reset_rsvds_bits_mask(vcpu, context);
+ context->gva_to_gpa = paging64_gva_to_gpa;
} else if (is_pae(vcpu)) {
context->nx = is_nx(vcpu);
- reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
- context->gva_to_gpa = paging64_gva_to_gpa;
context->root_level = PT32E_ROOT_LEVEL;
+ reset_rsvds_bits_mask(vcpu, context);
+ context->gva_to_gpa = paging64_gva_to_gpa;
} else {
context->nx = false;
- reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
- context->gva_to_gpa = paging32_gva_to_gpa;
context->root_level = PT32_ROOT_LEVEL;
+ reset_rsvds_bits_mask(vcpu, context);
+ context->gva_to_gpa = paging32_gva_to_gpa;
}
return 0;
@@ -3408,18 +3401,18 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
} else if (is_long_mode(vcpu)) {
g_context->nx = is_nx(vcpu);
- reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
g_context->root_level = PT64_ROOT_LEVEL;
+ reset_rsvds_bits_mask(vcpu, g_context);
g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
} else if (is_pae(vcpu)) {
g_context->nx = is_nx(vcpu);
- reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
g_context->root_level = PT32E_ROOT_LEVEL;
+ reset_rsvds_bits_mask(vcpu, g_context);
g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
} else {
g_context->nx = false;
- reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
g_context->root_level = PT32_ROOT_LEVEL;
+ reset_rsvds_bits_mask(vcpu, g_context);
g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
}
@@ -3555,7 +3548,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
* If we're seeing too many writes to a page, it may no longer be a page table,
* or we may be forking, in which case it is better to unmap the page.
*/
-static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte)
+static bool detect_write_flooding(struct kvm_mmu_page *sp)
{
/*
* Skip write-flooding detected for the sp whose level is 1, because
@@ -3664,10 +3657,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
- spte = get_written_sptes(sp, gpa, &npte);
-
if (detect_write_misaligned(sp, gpa, bytes) ||
- detect_write_flooding(sp, spte)) {
+ detect_write_flooding(sp)) {
zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
&invalid_list);
++vcpu->kvm->stat.mmu_flooded;
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index ea7b4fd..715da5a 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -200,13 +200,13 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
slot = gfn_to_memslot(kvm, sp->gfn);
rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
- spte = rmap_next(kvm, rmapp, NULL);
+ spte = rmap_next(rmapp, NULL);
while (spte) {
if (is_writable_pte(*spte))
audit_printk(kvm, "shadow page has writable "
"mappings: gfn %llx role %x\n",
sp->gfn, sp->role.word);
- spte = rmap_next(kvm, rmapp, spte);
+ spte = rmap_next(rmapp, spte);
}
}
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 7aad544..a73f0c1 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -33,10 +33,11 @@ static struct kvm_arch_event_perf_mapping {
[4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
[5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
[6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+ [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
};
/* mapping between fixed pmc index and arch_events array */
-int fixed_pmc_events[] = {1, 0, 2};
+int fixed_pmc_events[] = {1, 0, 7};
static bool pmc_is_gp(struct kvm_pmc *pmc)
{
@@ -210,6 +211,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
unsigned config, type = PERF_TYPE_RAW;
u8 event_select, unit_mask;
+ if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
+ printk_once("kvm pmu: pin control bit is ignored\n");
+
pmc->eventsel = eventsel;
stop_counter(pmc);
@@ -220,7 +224,7 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
- if (!(event_select & (ARCH_PERFMON_EVENTSEL_EDGE |
+ if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
ARCH_PERFMON_EVENTSEL_INV |
ARCH_PERFMON_EVENTSEL_CMASK))) {
config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
@@ -413,7 +417,7 @@ int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
struct kvm_pmc *counters;
u64 ctr;
- pmc &= (3u << 30) - 1;
+ pmc &= ~(3u << 30);
if (!fixed && pmc >= pmu->nr_arch_gp_counters)
return 1;
if (fixed && pmc >= pmu->nr_arch_fixed_counters)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e385214..e334389 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -111,6 +111,12 @@ struct nested_state {
#define MSRPM_OFFSETS 16
static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
+/*
+ * Set osvw_len to higher value when updated Revision Guides
+ * are published and we know what the new status bits are
+ */
+static uint64_t osvw_len = 4, osvw_status;
+
struct vcpu_svm {
struct kvm_vcpu vcpu;
struct vmcb *vmcb;
@@ -177,11 +183,13 @@ static bool npt_enabled = true;
#else
static bool npt_enabled;
#endif
-static int npt = 1;
+/* allow nested paging (virtualized MMU) for all guests */
+static int npt = true;
module_param(npt, int, S_IRUGO);
-static int nested = 1;
+/* allow nested virtualization in KVM/SVM */
+static int nested = true;
module_param(nested, int, S_IRUGO);
static void svm_flush_tlb(struct kvm_vcpu *vcpu);
@@ -557,6 +565,27 @@ static void svm_init_erratum_383(void)
erratum_383_found = true;
}
+static void svm_init_osvw(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Guests should see errata 400 and 415 as fixed (assuming that
+ * HLT and IO instructions are intercepted).
+ */
+ vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
+ vcpu->arch.osvw.status = osvw_status & ~(6ULL);
+
+ /*
+ * By increasing VCPU's osvw.length to 3 we are telling the guest that
+ * all osvw.status bits inside that length, including bit 0 (which is
+ * reserved for erratum 298), are valid. However, if host processor's
+ * osvw_len is 0 then osvw_status[0] carries no information. We need to
+ * be conservative here and therefore we tell the guest that erratum 298
+ * is present (because we really don't know).
+ */
+ if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
+ vcpu->arch.osvw.status |= 1;
+}
+
static int has_svm(void)
{
const char *msg;
@@ -623,6 +652,36 @@ static int svm_hardware_enable(void *garbage)
__get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT;
}
+
+ /*
+ * Get OSVW bits.
+ *
+ * Note that it is possible to have a system with mixed processor
+ * revisions and therefore different OSVW bits. If bits are not the same
+ * on different processors then choose the worst case (i.e. if erratum
+ * is present on one processor and not on another then assume that the
+ * erratum is present everywhere).
+ */
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
+ uint64_t len, status = 0;
+ int err;
+
+ len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
+ if (!err)
+ status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
+ &err);
+
+ if (err)
+ osvw_status = osvw_len = 0;
+ else {
+ if (len < osvw_len)
+ osvw_len = len;
+ osvw_status |= status;
+ osvw_status &= (1ULL << osvw_len) - 1;
+ }
+ } else
+ osvw_status = osvw_len = 0;
+
svm_init_erratum_383();
amd_pmu_enable_virt();
@@ -910,20 +969,25 @@ static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
return _tsc;
}
-static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
{
struct vcpu_svm *svm = to_svm(vcpu);
u64 ratio;
u64 khz;
- /* TSC scaling supported? */
- if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR))
+ /* Guest TSC same frequency as host TSC? */
+ if (!scale) {
+ svm->tsc_ratio = TSC_RATIO_DEFAULT;
return;
+ }
- /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */
- if (user_tsc_khz == 0) {
- vcpu->arch.virtual_tsc_khz = 0;
- svm->tsc_ratio = TSC_RATIO_DEFAULT;
+ /* TSC scaling supported? */
+ if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ if (user_tsc_khz > tsc_khz) {
+ vcpu->arch.tsc_catchup = 1;
+ vcpu->arch.tsc_always_catchup = 1;
+ } else
+ WARN(1, "user requested TSC rate below hardware speed\n");
return;
}
@@ -938,7 +1002,6 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
user_tsc_khz);
return;
}
- vcpu->arch.virtual_tsc_khz = user_tsc_khz;
svm->tsc_ratio = ratio;
}
@@ -958,10 +1021,14 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
-static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ WARN_ON(adjustment < 0);
+ if (host)
+ adjustment = svm_scale_tsc(vcpu, adjustment);
+
svm->vmcb->control.tsc_offset += adjustment;
if (is_guest_mode(vcpu))
svm->nested.hsave->control.tsc_offset += adjustment;
@@ -1191,6 +1258,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
if (kvm_vcpu_is_bsp(&svm->vcpu))
svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ svm_init_osvw(&svm->vcpu);
+
return &svm->vcpu;
free_page4:
@@ -1268,6 +1337,21 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
}
+static void svm_update_cpl(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int cpl;
+
+ if (!is_protmode(vcpu))
+ cpl = 0;
+ else if (svm->vmcb->save.rflags & X86_EFLAGS_VM)
+ cpl = 3;
+ else
+ cpl = svm->vmcb->save.cs.selector & 0x3;
+
+ svm->vmcb->save.cpl = cpl;
+}
+
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
{
return to_svm(vcpu)->vmcb->save.rflags;
@@ -1275,7 +1359,11 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
+ unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags;
+
to_svm(vcpu)->vmcb->save.rflags = rflags;
+ if ((old_rflags ^ rflags) & X86_EFLAGS_VM)
+ svm_update_cpl(vcpu);
}
static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
@@ -1543,9 +1631,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
}
if (seg == VCPU_SREG_CS)
- svm->vmcb->save.cpl
- = (svm->vmcb->save.cs.attrib
- >> SVM_SELECTOR_DPL_SHIFT) & 3;
+ svm_update_cpl(vcpu);
mark_dirty(svm->vmcb, VMCB_SEG);
}
@@ -2735,7 +2821,10 @@ static int task_switch_interception(struct vcpu_svm *svm)
(int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
skip_emulated_instruction(&svm->vcpu);
- if (kvm_task_switch(&svm->vcpu, tss_selector, reason,
+ if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
+ int_vec = -1;
+
+ if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
has_error_code, error_code) == EMULATE_FAIL) {
svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 246490f..280751c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -70,9 +70,6 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
static bool __read_mostly vmm_exclusive = 1;
module_param(vmm_exclusive, bool, S_IRUGO);
-static bool __read_mostly yield_on_hlt = 1;
-module_param(yield_on_hlt, bool, S_IRUGO);
-
static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);
@@ -1655,17 +1652,6 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
vmx_set_interrupt_shadow(vcpu, 0);
}
-static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
-{
- /* Ensure that we clear the HLT state in the VMCS. We don't need to
- * explicitly skip the instruction because if the HLT state is set, then
- * the instruction is already executing and RIP has already been
- * advanced. */
- if (!yield_on_hlt &&
- vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
- vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
-}
-
/*
* KVM wants to inject page-faults which it got to the guest. This function
* checks whether in a nested guest, we need to inject them to L1 or L2.
@@ -1678,7 +1664,7 @@ static int nested_pf_handled(struct kvm_vcpu *vcpu)
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
/* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
- if (!(vmcs12->exception_bitmap & PF_VECTOR))
+ if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR)))
return 0;
nested_vmx_vmexit(vcpu);
@@ -1718,7 +1704,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
intr_info |= INTR_TYPE_HARD_EXCEPTION;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
- vmx_clear_hlt(vcpu);
}
static bool vmx_rdtscp_supported(void)
@@ -1817,13 +1802,19 @@ u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu)
}
/*
- * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
- * ioctl. In this case the call-back should update internal vmx state to make
- * the changes effective.
+ * Engage any workarounds for mis-matched TSC rates. Currently limited to
+ * software catchup for faster rates on slower CPUs.
*/
-static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
{
- /* Nothing to do here */
+ if (!scale)
+ return;
+
+ if (user_tsc_khz > tsc_khz) {
+ vcpu->arch.tsc_catchup = 1;
+ vcpu->arch.tsc_always_catchup = 1;
+ } else
+ WARN(1, "user requested TSC rate below hardware speed\n");
}
/*
@@ -1850,7 +1841,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
}
}
-static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
{
u64 offset = vmcs_read64(TSC_OFFSET);
vmcs_write64(TSC_OFFSET, offset + adjustment);
@@ -2219,6 +2210,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
msr = find_msr_entry(vmx, msr_index);
if (msr) {
msr->data = data;
+ if (msr - vmx->guest_msrs < vmx->save_nmsrs)
+ kvm_set_shared_msr(msr->index, msr->data,
+ msr->mask);
break;
}
ret = kvm_set_msr_common(vcpu, msr_index, data);
@@ -2399,7 +2393,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
&_pin_based_exec_control) < 0)
return -EIO;
- min =
+ min = CPU_BASED_HLT_EXITING |
#ifdef CONFIG_X86_64
CPU_BASED_CR8_LOAD_EXITING |
CPU_BASED_CR8_STORE_EXITING |
@@ -2414,9 +2408,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
CPU_BASED_INVLPG_EXITING |
CPU_BASED_RDPMC_EXITING;
- if (yield_on_hlt)
- min |= CPU_BASED_HLT_EXITING;
-
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -4003,7 +3994,6 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
} else
intr |= INTR_TYPE_EXT_INTR;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
- vmx_clear_hlt(vcpu);
}
static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -4035,7 +4025,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
}
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
- vmx_clear_hlt(vcpu);
}
static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -4672,9 +4661,10 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
bool has_error_code = false;
u32 error_code = 0;
u16 tss_selector;
- int reason, type, idt_v;
+ int reason, type, idt_v, idt_index;
idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+ idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4712,8 +4702,9 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
type != INTR_TYPE_NMI_INTR))
skip_emulated_instruction(vcpu);
- if (kvm_task_switch(vcpu, tss_selector, reason,
- has_error_code, error_code) == EMULATE_FAIL) {
+ if (kvm_task_switch(vcpu, tss_selector,
+ type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
+ has_error_code, error_code) == EMULATE_FAIL) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 54696b5..4044ce0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -97,6 +97,10 @@ EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
u32 kvm_max_guest_tsc_khz;
EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
+/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
+static u32 tsc_tolerance_ppm = 250;
+module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+
#define KVM_NR_SHARED_MSRS 16
struct kvm_shared_msrs_global {
@@ -969,50 +973,51 @@ static inline u64 get_kernel_ns(void)
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
unsigned long max_tsc_khz;
-static inline int kvm_tsc_changes_freq(void)
+static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
- int cpu = get_cpu();
- int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
- cpufreq_quick_get(cpu) != 0;
- put_cpu();
- return ret;
+ return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
+ vcpu->arch.virtual_tsc_shift);
}
-u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
+static u32 adjust_tsc_khz(u32 khz, s32 ppm)
{
- if (vcpu->arch.virtual_tsc_khz)
- return vcpu->arch.virtual_tsc_khz;
- else
- return __this_cpu_read(cpu_tsc_khz);
+ u64 v = (u64)khz * (1000000 + ppm);
+ do_div(v, 1000000);
+ return v;
}
-static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
+static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
{
- u64 ret;
-
- WARN_ON(preemptible());
- if (kvm_tsc_changes_freq())
- printk_once(KERN_WARNING
- "kvm: unreliable cycle conversion on adjustable rate TSC\n");
- ret = nsec * vcpu_tsc_khz(vcpu);
- do_div(ret, USEC_PER_SEC);
- return ret;
-}
+ u32 thresh_lo, thresh_hi;
+ int use_scaling = 0;
-static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
-{
/* Compute a scale to convert nanoseconds in TSC cycles */
kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
- &vcpu->arch.tsc_catchup_shift,
- &vcpu->arch.tsc_catchup_mult);
+ &vcpu->arch.virtual_tsc_shift,
+ &vcpu->arch.virtual_tsc_mult);
+ vcpu->arch.virtual_tsc_khz = this_tsc_khz;
+
+ /*
+ * Compute the variation in TSC rate which is acceptable
+ * within the range of tolerance and decide if the
+ * rate being applied is within that bounds of the hardware
+ * rate. If so, no scaling or compensation need be done.
+ */
+ thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
+ thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
+ if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
+ pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
+ use_scaling = 1;
+ }
+ kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
}
static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
{
- u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
- vcpu->arch.tsc_catchup_mult,
- vcpu->arch.tsc_catchup_shift);
- tsc += vcpu->arch.last_tsc_write;
+ u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
+ vcpu->arch.virtual_tsc_mult,
+ vcpu->arch.virtual_tsc_shift);
+ tsc += vcpu->arch.this_tsc_write;
return tsc;
}
@@ -1021,48 +1026,88 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
struct kvm *kvm = vcpu->kvm;
u64 offset, ns, elapsed;
unsigned long flags;
- s64 sdiff;
+ s64 usdiff;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
ns = get_kernel_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
- sdiff = data - kvm->arch.last_tsc_write;
- if (sdiff < 0)
- sdiff = -sdiff;
+
+ /* n.b - signed multiplication and division required */
+ usdiff = data - kvm->arch.last_tsc_write;
+#ifdef CONFIG_X86_64
+ usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
+#else
+ /* do_div() only does unsigned */
+ asm("idivl %2; xor %%edx, %%edx"
+ : "=A"(usdiff)
+ : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
+#endif
+ do_div(elapsed, 1000);
+ usdiff -= elapsed;
+ if (usdiff < 0)
+ usdiff = -usdiff;
/*
- * Special case: close write to TSC within 5 seconds of
- * another CPU is interpreted as an attempt to synchronize
- * The 5 seconds is to accommodate host load / swapping as
- * well as any reset of TSC during the boot process.
- *
- * In that case, for a reliable TSC, we can match TSC offsets,
- * or make a best guest using elapsed value.
- */
- if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) &&
- elapsed < 5ULL * NSEC_PER_SEC) {
+ * Special case: TSC write with a small delta (1 second) of virtual
+ * cycle time against real time is interpreted as an attempt to
+ * synchronize the CPU.
+ *
+ * For a reliable TSC, we can match TSC offsets, and for an unstable
+ * TSC, we add elapsed time in this computation. We could let the
+ * compensation code attempt to catch up if we fall behind, but
+ * it's better to try to match offsets from the beginning.
+ */
+ if (usdiff < USEC_PER_SEC &&
+ vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
if (!check_tsc_unstable()) {
- offset = kvm->arch.last_tsc_offset;
+ offset = kvm->arch.cur_tsc_offset;
pr_debug("kvm: matched tsc offset for %llu\n", data);
} else {
u64 delta = nsec_to_cycles(vcpu, elapsed);
- offset += delta;
+ data += delta;
+ offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
}
- ns = kvm->arch.last_tsc_nsec;
+ } else {
+ /*
+ * We split periods of matched TSC writes into generations.
+ * For each generation, we track the original measured
+ * nanosecond time, offset, and write, so if TSCs are in
+ * sync, we can match exact offset, and if not, we can match
+ * exact software computaion in compute_guest_tsc()
+ *
+ * These values are tracked in kvm->arch.cur_xxx variables.
+ */
+ kvm->arch.cur_tsc_generation++;
+ kvm->arch.cur_tsc_nsec = ns;
+ kvm->arch.cur_tsc_write = data;
+ kvm->arch.cur_tsc_offset = offset;
+ pr_debug("kvm: new tsc generation %u, clock %llu\n",
+ kvm->arch.cur_tsc_generation, data);
}
+
+ /*
+ * We also track th most recent recorded KHZ, write and time to
+ * allow the matching interval to be extended at each write.
+ */
kvm->arch.last_tsc_nsec = ns;
kvm->arch.last_tsc_write = data;
- kvm->arch.last_tsc_offset = offset;
- kvm_x86_ops->write_tsc_offset(vcpu, offset);
- raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+ kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
/* Reset of TSC must disable overshoot protection below */
vcpu->arch.hv_clock.tsc_timestamp = 0;
- vcpu->arch.last_tsc_write = data;
- vcpu->arch.last_tsc_nsec = ns;
+ vcpu->arch.last_guest_tsc = data;
+
+ /* Keep track of which generation this VCPU has synchronized to */
+ vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
+ vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
+ vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+
+ kvm_x86_ops->write_tsc_offset(vcpu, offset);
+ raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
}
+
EXPORT_SYMBOL_GPL(kvm_write_tsc);
static int kvm_guest_time_update(struct kvm_vcpu *v)
@@ -1078,7 +1123,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
local_irq_save(flags);
tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
kernel_ns = get_kernel_ns();
- this_tsc_khz = vcpu_tsc_khz(v);
+ this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
if (unlikely(this_tsc_khz == 0)) {
local_irq_restore(flags);
kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
@@ -1098,7 +1143,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
if (vcpu->tsc_catchup) {
u64 tsc = compute_guest_tsc(v, kernel_ns);
if (tsc > tsc_timestamp) {
- kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
+ adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
tsc_timestamp = tsc;
}
}
@@ -1130,7 +1175,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
* observed by the guest and ensure the new system time is greater.
*/
max_kernel_ns = 0;
- if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
+ if (vcpu->hv_clock.tsc_timestamp) {
max_kernel_ns = vcpu->last_guest_tsc -
vcpu->hv_clock.tsc_timestamp;
max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
@@ -1504,6 +1549,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
case MSR_K7_HWCR:
data &= ~(u64)0x40; /* ignore flush filter disable */
data &= ~(u64)0x100; /* ignore ignne emulation enable */
+ data &= ~(u64)0x8; /* ignore TLB cache disable */
if (data != 0) {
pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
data);
@@ -1676,6 +1722,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
*/
pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
break;
+ case MSR_AMD64_OSVW_ID_LENGTH:
+ if (!guest_cpuid_has_osvw(vcpu))
+ return 1;
+ vcpu->arch.osvw.length = data;
+ break;
+ case MSR_AMD64_OSVW_STATUS:
+ if (!guest_cpuid_has_osvw(vcpu))
+ return 1;
+ vcpu->arch.osvw.status = data;
+ break;
default:
if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
return xen_hvm_config(vcpu, data);
@@ -1960,6 +2016,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
*/
data = 0xbe702111;
break;
+ case MSR_AMD64_OSVW_ID_LENGTH:
+ if (!guest_cpuid_has_osvw(vcpu))
+ return 1;
+ data = vcpu->arch.osvw.length;
+ break;
+ case MSR_AMD64_OSVW_STATUS:
+ if (!guest_cpuid_has_osvw(vcpu))
+ return 1;
+ data = vcpu->arch.osvw.status;
+ break;
default:
if (kvm_pmu_msr(vcpu, msr))
return kvm_pmu_get_msr(vcpu, msr, pdata);
@@ -2080,6 +2146,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_XSAVE:
case KVM_CAP_ASYNC_PF:
case KVM_CAP_GET_TSC_KHZ:
+ case KVM_CAP_PCI_2_3:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -2214,19 +2281,23 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
}
kvm_x86_ops->vcpu_load(vcpu, cpu);
- if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
- /* Make sure TSC doesn't go backwards */
- s64 tsc_delta;
- u64 tsc;
- tsc = kvm_x86_ops->read_l1_tsc(vcpu);
- tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
- tsc - vcpu->arch.last_guest_tsc;
+ /* Apply any externally detected TSC adjustments (due to suspend) */
+ if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+ adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
+ vcpu->arch.tsc_offset_adjustment = 0;
+ set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+ }
+ if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
+ s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
+ native_read_tsc() - vcpu->arch.last_host_tsc;
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
if (check_tsc_unstable()) {
- kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
+ u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
+ vcpu->arch.last_guest_tsc);
+ kvm_x86_ops->write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_catchup = 1;
}
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -2243,7 +2314,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
kvm_x86_ops->vcpu_put(vcpu);
kvm_put_guest_fpu(vcpu);
- vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+ vcpu->arch.last_host_tsc = native_read_tsc();
}
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -2785,26 +2856,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
u32 user_tsc_khz;
r = -EINVAL;
- if (!kvm_has_tsc_control)
- break;
-
user_tsc_khz = (u32)arg;
if (user_tsc_khz >= kvm_max_guest_tsc_khz)
goto out;
- kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz);
+ if (user_tsc_khz == 0)
+ user_tsc_khz = tsc_khz;
+
+ kvm_set_tsc_khz(vcpu, user_tsc_khz);
r = 0;
goto out;
}
case KVM_GET_TSC_KHZ: {
- r = -EIO;
- if (check_tsc_unstable())
- goto out;
-
- r = vcpu_tsc_khz(vcpu);
-
+ r = vcpu->arch.virtual_tsc_khz;
goto out;
}
default:
@@ -2815,6 +2881,11 @@ out:
return r;
}
+int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+ return VM_FAULT_SIGBUS;
+}
+
static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
{
int ret;
@@ -2998,6 +3069,8 @@ static void write_protect_slot(struct kvm *kvm,
unsigned long *dirty_bitmap,
unsigned long nr_dirty_pages)
{
+ spin_lock(&kvm->mmu_lock);
+
/* Not many dirty pages compared to # of shadow pages. */
if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
unsigned long gfn_offset;
@@ -3005,16 +3078,13 @@ static void write_protect_slot(struct kvm *kvm,
for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
unsigned long gfn = memslot->base_gfn + gfn_offset;
- spin_lock(&kvm->mmu_lock);
kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
- spin_unlock(&kvm->mmu_lock);
}
kvm_flush_remote_tlbs(kvm);
- } else {
- spin_lock(&kvm->mmu_lock);
+ } else
kvm_mmu_slot_remove_write_access(kvm, memslot->id);
- spin_unlock(&kvm->mmu_lock);
- }
+
+ spin_unlock(&kvm->mmu_lock);
}
/*
@@ -3133,6 +3203,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = -EEXIST;
if (kvm->arch.vpic)
goto create_irqchip_unlock;
+ r = -EINVAL;
+ if (atomic_read(&kvm->online_vcpus))
+ goto create_irqchip_unlock;
r = -ENOMEM;
vpic = kvm_create_pic(kvm);
if (vpic) {
@@ -4063,6 +4136,11 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
return res;
}
+static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
+{
+ kvm_set_rflags(emul_to_vcpu(ctxt), val);
+}
+
static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
{
return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
@@ -4244,6 +4322,7 @@ static struct x86_emulate_ops emulate_ops = {
.set_idt = emulator_set_idt,
.get_cr = emulator_get_cr,
.set_cr = emulator_set_cr,
+ .set_rflags = emulator_set_rflags,
.cpl = emulator_get_cpl,
.get_dr = emulator_get_dr,
.set_dr = emulator_set_dr,
@@ -5288,6 +5367,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
profile_hit(KVM_PROFILING, (void *)rip);
}
+ if (unlikely(vcpu->arch.tsc_always_catchup))
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
kvm_lapic_sync_from_vapic(vcpu);
@@ -5587,15 +5668,15 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
return 0;
}
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
- bool has_error_code, u32 error_code)
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
+ int reason, bool has_error_code, u32 error_code)
{
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
int ret;
init_emulate_ctxt(vcpu);
- ret = emulator_task_switch(ctxt, tss_selector, reason,
+ ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
has_error_code, error_code);
if (ret)
@@ -5928,13 +6009,88 @@ int kvm_arch_hardware_enable(void *garbage)
struct kvm *kvm;
struct kvm_vcpu *vcpu;
int i;
+ int ret;
+ u64 local_tsc;
+ u64 max_tsc = 0;
+ bool stable, backwards_tsc = false;
kvm_shared_msr_cpu_online();
- list_for_each_entry(kvm, &vm_list, vm_list)
- kvm_for_each_vcpu(i, vcpu, kvm)
- if (vcpu->cpu == smp_processor_id())
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
- return kvm_x86_ops->hardware_enable(garbage);
+ ret = kvm_x86_ops->hardware_enable(garbage);
+ if (ret != 0)
+ return ret;
+
+ local_tsc = native_read_tsc();
+ stable = !check_tsc_unstable();
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!stable && vcpu->cpu == smp_processor_id())
+ set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+ if (stable && vcpu->arch.last_host_tsc > local_tsc) {
+ backwards_tsc = true;
+ if (vcpu->arch.last_host_tsc > max_tsc)
+ max_tsc = vcpu->arch.last_host_tsc;
+ }
+ }
+ }
+
+ /*
+ * Sometimes, even reliable TSCs go backwards. This happens on
+ * platforms that reset TSC during suspend or hibernate actions, but
+ * maintain synchronization. We must compensate. Fortunately, we can
+ * detect that condition here, which happens early in CPU bringup,
+ * before any KVM threads can be running. Unfortunately, we can't
+ * bring the TSCs fully up to date with real time, as we aren't yet far
+ * enough into CPU bringup that we know how much real time has actually
+ * elapsed; our helper function, get_kernel_ns() will be using boot
+ * variables that haven't been updated yet.
+ *
+ * So we simply find the maximum observed TSC above, then record the
+ * adjustment to TSC in each VCPU. When the VCPU later gets loaded,
+ * the adjustment will be applied. Note that we accumulate
+ * adjustments, in case multiple suspend cycles happen before some VCPU
+ * gets a chance to run again. In the event that no KVM threads get a
+ * chance to run, we will miss the entire elapsed period, as we'll have
+ * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
+ * loose cycle time. This isn't too big a deal, since the loss will be
+ * uniform across all VCPUs (not to mention the scenario is extremely
+ * unlikely). It is possible that a second hibernate recovery happens
+ * much faster than a first, causing the observed TSC here to be
+ * smaller; this would require additional padding adjustment, which is
+ * why we set last_host_tsc to the local tsc observed here.
+ *
+ * N.B. - this code below runs only on platforms with reliable TSC,
+ * as that is the only way backwards_tsc is set above. Also note
+ * that this runs for ALL vcpus, which is not a bug; all VCPUs should
+ * have the same delta_cyc adjustment applied if backwards_tsc
+ * is detected. Note further, this adjustment is only done once,
+ * as we reset last_host_tsc on all VCPUs to stop this from being
+ * called multiple times (one for each physical CPU bringup).
+ *
+ * Platforms with unnreliable TSCs don't have to deal with this, they
+ * will be compensated by the logic in vcpu_load, which sets the TSC to
+ * catchup mode. This will catchup all VCPUs to real time, but cannot
+ * guarantee that they stay in perfect synchronization.
+ */
+ if (backwards_tsc) {
+ u64 delta_cyc = max_tsc - local_tsc;
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ vcpu->arch.tsc_offset_adjustment += delta_cyc;
+ vcpu->arch.last_host_tsc = local_tsc;
+ }
+
+ /*
+ * We have to disable TSC offset matching.. if you were
+ * booting a VM while issuing an S4 host suspend....
+ * you may have some problem. Solving this issue is
+ * left as an exercise to the reader.
+ */
+ kvm->arch.last_tsc_nsec = 0;
+ kvm->arch.last_tsc_write = 0;
+ }
+
+ }
+ return 0;
}
void kvm_arch_hardware_disable(void *garbage)
@@ -5958,6 +6114,11 @@ void kvm_arch_check_processor_compat(void *rtn)
kvm_x86_ops->check_processor_compatibility(rtn);
}
+bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
+{
+ return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
+}
+
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
struct page *page;
@@ -5980,7 +6141,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
}
vcpu->arch.pio_data = page_address(page);
- kvm_init_tsc_catchup(vcpu, max_tsc_khz);
+ kvm_set_tsc_khz(vcpu, max_tsc_khz);
r = kvm_mmu_create(vcpu);
if (r < 0)
@@ -6032,8 +6193,11 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
free_page((unsigned long)vcpu->arch.pio_data);
}
-int kvm_arch_init_vm(struct kvm *kvm)
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
+ if (type)
+ return -EINVAL;
+
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
@@ -6093,6 +6257,65 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
put_page(kvm->arch.ept_identity_pagetable);
}
+void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont)
+{
+ int i;
+
+ for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+ if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
+ vfree(free->arch.lpage_info[i]);
+ free->arch.lpage_info[i] = NULL;
+ }
+ }
+}
+
+int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+{
+ int i;
+
+ for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+ unsigned long ugfn;
+ int lpages;
+ int level = i + 2;
+
+ lpages = gfn_to_index(slot->base_gfn + npages - 1,
+ slot->base_gfn, level) + 1;
+
+ slot->arch.lpage_info[i] =
+ vzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
+ if (!slot->arch.lpage_info[i])
+ goto out_free;
+
+ if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
+ slot->arch.lpage_info[i][0].write_count = 1;
+ if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
+ slot->arch.lpage_info[i][lpages - 1].write_count = 1;
+ ugfn = slot->userspace_addr >> PAGE_SHIFT;
+ /*
+ * If the gfn and userspace address are not aligned wrt each
+ * other, or if explicitly asked to, disable large page
+ * support for this slot
+ */
+ if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
+ !kvm_largepages_enabled()) {
+ unsigned long j;
+
+ for (j = 0; j < lpages; ++j)
+ slot->arch.lpage_info[i][j].write_count = 1;
+ }
+ }
+
+ return 0;
+
+out_free:
+ for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+ vfree(slot->arch.lpage_info[i]);
+ slot->arch.lpage_info[i] = NULL;
+ }
+ return -ENOMEM;
+}
+
int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *memslot,
struct kvm_memory_slot old,
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 6cabf65..4f0cec7 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -12,7 +12,6 @@
#include <asm/page_types.h>
#include <asm/sections.h>
#include <asm/setup.h>
-#include <asm/system.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/proto.h>
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 8663f6c..575d86f 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -35,7 +35,6 @@
#include <asm/asm.h>
#include <asm/bios_ebda.h>
#include <asm/processor.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/dma.h>
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 436a030..fc18be0 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -35,7 +35,6 @@
#include <asm/processor.h>
#include <asm/bios_ebda.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
index 036efbe..aef7140 100644
--- a/arch/x86/mm/kmemcheck/selftest.c
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -1,3 +1,4 @@
+#include <linux/bug.h>
#include <linux/kernel.h>
#include "opcode.h"
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index cac7184..a69bcb8 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -10,7 +10,6 @@
#include <linux/spinlock.h>
#include <linux/module.h>
-#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 49a5cb5..ed2835e 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -416,7 +416,12 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
kfree(sd);
} else {
get_current_resources(device, busnum, domain, &resources);
- if (list_empty(&resources))
+
+ /*
+ * _CRS with no apertures is normal, so only fall back to
+ * defaults or native bridge info if we're ignoring _CRS.
+ */
+ if (!pci_use_crs)
x86_pci_root_bus_resources(busnum, &resources);
bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd,
&resources);
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 6dd8955..d0e6e40 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -164,11 +164,11 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8367_0, pci_fixup_
*/
static void __devinit pci_fixup_transparent_bridge(struct pci_dev *dev)
{
- if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI &&
- (dev->device & 0xff00) == 0x2400)
+ if ((dev->device & 0xff00) == 0x2400)
dev->transparent = 1;
}
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_fixup_transparent_bridge);
+DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID,
+ PCI_CLASS_BRIDGE_PCI, 8, pci_fixup_transparent_bridge);
/*
* Fixup for C1 Halt Disconnect problem on nForce2 systems.
@@ -322,9 +322,6 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev)
struct pci_bus *bus;
u16 config;
- if ((pdev->class >> 8) != PCI_CLASS_DISPLAY_VGA)
- return;
-
/* Is VGA routed to us? */
bus = pdev->bus;
while (bus) {
@@ -353,7 +350,8 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev)
dev_printk(KERN_DEBUG, &pdev->dev, "Boot video device\n");
}
}
-DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID,
+ PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video);
static const struct dmi_system_id __devinitconst msi_k8t_dmi_table[] = {
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 91821a1..831971e 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -39,6 +39,87 @@
#include <asm/io_apic.h>
+/*
+ * This list of dynamic mappings is for temporarily maintaining
+ * original BIOS BAR addresses for possible reinstatement.
+ */
+struct pcibios_fwaddrmap {
+ struct list_head list;
+ struct pci_dev *dev;
+ resource_size_t fw_addr[DEVICE_COUNT_RESOURCE];
+};
+
+static LIST_HEAD(pcibios_fwaddrmappings);
+static DEFINE_SPINLOCK(pcibios_fwaddrmap_lock);
+
+/* Must be called with 'pcibios_fwaddrmap_lock' lock held. */
+static struct pcibios_fwaddrmap *pcibios_fwaddrmap_lookup(struct pci_dev *dev)
+{
+ struct pcibios_fwaddrmap *map;
+
+ WARN_ON(!spin_is_locked(&pcibios_fwaddrmap_lock));
+
+ list_for_each_entry(map, &pcibios_fwaddrmappings, list)
+ if (map->dev == dev)
+ return map;
+
+ return NULL;
+}
+
+static void
+pcibios_save_fw_addr(struct pci_dev *dev, int idx, resource_size_t fw_addr)
+{
+ unsigned long flags;
+ struct pcibios_fwaddrmap *map;
+
+ spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags);
+ map = pcibios_fwaddrmap_lookup(dev);
+ if (!map) {
+ spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags);
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
+ if (!map)
+ return;
+
+ map->dev = pci_dev_get(dev);
+ map->fw_addr[idx] = fw_addr;
+ INIT_LIST_HEAD(&map->list);
+
+ spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags);
+ list_add_tail(&map->list, &pcibios_fwaddrmappings);
+ } else
+ map->fw_addr[idx] = fw_addr;
+ spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags);
+}
+
+resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx)
+{
+ unsigned long flags;
+ struct pcibios_fwaddrmap *map;
+ resource_size_t fw_addr = 0;
+
+ spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags);
+ map = pcibios_fwaddrmap_lookup(dev);
+ if (map)
+ fw_addr = map->fw_addr[idx];
+ spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags);
+
+ return fw_addr;
+}
+
+static void pcibios_fw_addr_list_del(void)
+{
+ unsigned long flags;
+ struct pcibios_fwaddrmap *entry, *next;
+
+ spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags);
+ list_for_each_entry_safe(entry, next, &pcibios_fwaddrmappings, list) {
+ list_del(&entry->list);
+ pci_dev_put(entry->dev);
+ kfree(entry);
+ }
+ spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags);
+}
+
static int
skip_isa_ioresource_align(struct pci_dev *dev) {
@@ -182,7 +263,8 @@ static void __init pcibios_allocate_resources(int pass)
idx, r, disabled, pass);
if (pci_claim_resource(dev, idx) < 0) {
/* We'll assign a new address later */
- dev->fw_addr[idx] = r->start;
+ pcibios_save_fw_addr(dev,
+ idx, r->start);
r->end -= r->start;
r->start = 0;
}
@@ -228,6 +310,7 @@ static int __init pcibios_assign_resources(void)
}
pci_assign_unassigned_resources();
+ pcibios_fw_addr_list_del();
return 0;
}
diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
index cb29191..140942f 100644
--- a/arch/x86/pci/mrst.c
+++ b/arch/x86/pci/mrst.c
@@ -43,6 +43,8 @@
#define PCI_FIXED_BAR_4_SIZE 0x14
#define PCI_FIXED_BAR_5_SIZE 0x1c
+static int pci_soc_mode = 0;
+
/**
* fixed_bar_cap - return the offset of the fixed BAR cap if found
* @bus: PCI bus
@@ -148,7 +150,9 @@ static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg)
*/
if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE)
return 0;
- if (bus == 0 && (devfn == PCI_DEVFN(2, 0) || devfn == PCI_DEVFN(0, 0)))
+ if (bus == 0 && (devfn == PCI_DEVFN(2, 0)
+ || devfn == PCI_DEVFN(0, 0)
+ || devfn == PCI_DEVFN(3, 0)))
return 1;
return 0; /* langwell on others */
}
@@ -231,14 +235,43 @@ struct pci_ops pci_mrst_ops = {
*/
int __init pci_mrst_init(void)
{
- printk(KERN_INFO "Moorestown platform detected, using MRST PCI ops\n");
+ printk(KERN_INFO "Intel MID platform detected, using MID PCI ops\n");
pci_mmcfg_late_init();
pcibios_enable_irq = mrst_pci_irq_enable;
pci_root_ops = pci_mrst_ops;
+ pci_soc_mode = 1;
/* Continue with standard init */
return 1;
}
+/* Langwell devices are not true pci devices, they are not subject to 10 ms
+ * d3 to d0 delay required by pci spec.
+ */
+static void __devinit pci_d3delay_fixup(struct pci_dev *dev)
+{
+ /* PCI fixups are effectively decided compile time. If we have a dual
+ SoC/non-SoC kernel we don't want to mangle d3 on non SoC devices */
+ if (!pci_soc_mode)
+ return;
+ /* true pci devices in lincroft should allow type 1 access, the rest
+ * are langwell fake pci devices.
+ */
+ if (type1_access_ok(dev->bus->number, dev->devfn, PCI_DEVICE_ID))
+ return;
+ dev->d3_delay = 0;
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup);
+
+static void __devinit mrst_power_off_unused_dev(struct pci_dev *dev)
+{
+ pci_set_power_state(dev, PCI_D3cold);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0801, mrst_power_off_unused_dev);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0809, mrst_power_off_unused_dev);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x080C, mrst_power_off_unused_dev);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0812, mrst_power_off_unused_dev);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0815, mrst_power_off_unused_dev);
+
/*
* Langwell devices reside at fixed offsets, don't try to move them.
*/
@@ -248,6 +281,9 @@ static void __devinit pci_fixed_bar_fixup(struct pci_dev *dev)
u32 size;
int i;
+ if (!pci_soc_mode)
+ return;
+
/* Must have extended configuration space */
if (dev->cfg_size < PCIE_CAP_OFFSET + 4)
return;
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index d99346e..7415aa9 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -324,6 +324,32 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
out:
return ret;
}
+
+static void xen_initdom_restore_msi_irqs(struct pci_dev *dev, int irq)
+{
+ int ret = 0;
+
+ if (pci_seg_supported) {
+ struct physdev_pci_device restore_ext;
+
+ restore_ext.seg = pci_domain_nr(dev->bus);
+ restore_ext.bus = dev->bus->number;
+ restore_ext.devfn = dev->devfn;
+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi_ext,
+ &restore_ext);
+ if (ret == -ENOSYS)
+ pci_seg_supported = false;
+ WARN(ret && ret != -ENOSYS, "restore_msi_ext -> %d\n", ret);
+ }
+ if (!pci_seg_supported) {
+ struct physdev_restore_msi restore;
+
+ restore.bus = dev->bus->number;
+ restore.devfn = dev->devfn;
+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &restore);
+ WARN(ret && ret != -ENOSYS, "restore_msi -> %d\n", ret);
+ }
+}
#endif
static void xen_teardown_msi_irqs(struct pci_dev *dev)
@@ -446,6 +472,7 @@ int __init pci_xen_initial_domain(void)
#ifdef CONFIG_PCI_MSI
x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs;
x86_msi.teardown_msi_irq = xen_teardown_msi_irq;
+ x86_msi.restore_msi_irqs = xen_initdom_restore_msi_irqs;
#endif
xen_setup_acpi_sci();
__acpi_register_gsi = acpi_register_gsi_xen;
diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts
index e70be38..ce874f8 100644
--- a/arch/x86/platform/ce4100/falconfalls.dts
+++ b/arch/x86/platform/ce4100/falconfalls.dts
@@ -208,16 +208,19 @@
interrupts = <14 1>;
};
- gpio@b,1 {
+ pcigpio: gpio@b,1 {
+ #gpio-cells = <2>;
+ #interrupt-cells = <2>;
compatible = "pci8086,2e67.2",
"pci8086,2e67",
"pciclassff0000",
"pciclassff00";
- #gpio-cells = <2>;
reg = <0x15900 0x0 0x0 0x0 0x0>;
interrupts = <15 1>;
+ interrupt-controller;
gpio-controller;
+ intel,muxctl = <0>;
};
i2c-controller@b,2 {
diff --git a/arch/x86/platform/geode/Makefile b/arch/x86/platform/geode/Makefile
index 246b788..5b51194 100644
--- a/arch/x86/platform/geode/Makefile
+++ b/arch/x86/platform/geode/Makefile
@@ -1,2 +1,3 @@
obj-$(CONFIG_ALIX) += alix.o
obj-$(CONFIG_NET5501) += net5501.o
+obj-$(CONFIG_GEOS) += geos.o
diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c
new file mode 100644
index 0000000..c2e6d53
--- /dev/null
+++ b/arch/x86/platform/geode/geos.c
@@ -0,0 +1,128 @@
+/*
+ * System Specific setup for Traverse Technologies GEOS.
+ * At the moment this means setup of GPIO control of LEDs.
+ *
+ * Copyright (C) 2008 Constantin Baranov <const@mimas.ru>
+ * Copyright (C) 2011 Ed Wildgoose <kernel@wildgooses.com>
+ * and Philip Prindeville <philipp@redfish-solutions.com>
+ *
+ * TODO: There are large similarities with leds-net5501.c
+ * by Alessandro Zummo <a.zummo@towertech.it>
+ * In the future leds-net5501.c should be migrated over to platform
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/leds.h>
+#include <linux/platform_device.h>
+#include <linux/gpio.h>
+#include <linux/input.h>
+#include <linux/gpio_keys.h>
+#include <linux/dmi.h>
+
+#include <asm/geode.h>
+
+static struct gpio_keys_button geos_gpio_buttons[] = {
+ {
+ .code = KEY_RESTART,
+ .gpio = 3,
+ .active_low = 1,
+ .desc = "Reset button",
+ .type = EV_KEY,
+ .wakeup = 0,
+ .debounce_interval = 100,
+ .can_disable = 0,
+ }
+};
+static struct gpio_keys_platform_data geos_buttons_data = {
+ .buttons = geos_gpio_buttons,
+ .nbuttons = ARRAY_SIZE(geos_gpio_buttons),
+ .poll_interval = 20,
+};
+
+static struct platform_device geos_buttons_dev = {
+ .name = "gpio-keys-polled",
+ .id = 1,
+ .dev = {
+ .platform_data = &geos_buttons_data,
+ }
+};
+
+static struct gpio_led geos_leds[] = {
+ {
+ .name = "geos:1",
+ .gpio = 6,
+ .default_trigger = "default-on",
+ .active_low = 1,
+ },
+ {
+ .name = "geos:2",
+ .gpio = 25,
+ .default_trigger = "default-off",
+ .active_low = 1,
+ },
+ {
+ .name = "geos:3",
+ .gpio = 27,
+ .default_trigger = "default-off",
+ .active_low = 1,
+ },
+};
+
+static struct gpio_led_platform_data geos_leds_data = {
+ .num_leds = ARRAY_SIZE(geos_leds),
+ .leds = geos_leds,
+};
+
+static struct platform_device geos_leds_dev = {
+ .name = "leds-gpio",
+ .id = -1,
+ .dev.platform_data = &geos_leds_data,
+};
+
+static struct __initdata platform_device *geos_devs[] = {
+ &geos_buttons_dev,
+ &geos_leds_dev,
+};
+
+static void __init register_geos(void)
+{
+ /* Setup LED control through leds-gpio driver */
+ platform_add_devices(geos_devs, ARRAY_SIZE(geos_devs));
+}
+
+static int __init geos_init(void)
+{
+ const char *vendor, *product;
+
+ if (!is_geode())
+ return 0;
+
+ vendor = dmi_get_system_info(DMI_SYS_VENDOR);
+ if (!vendor || strcmp(vendor, "Traverse Technologies"))
+ return 0;
+
+ product = dmi_get_system_info(DMI_PRODUCT_NAME);
+ if (!product || strcmp(product, "Geos"))
+ return 0;
+
+ printk(KERN_INFO "%s: system is recognized as \"%s %s\"\n",
+ KBUILD_MODNAME, vendor, product);
+
+ register_geos();
+
+ return 0;
+}
+
+module_init(geos_init);
+
+MODULE_AUTHOR("Philip Prindeville <philipp@redfish-solutions.com>");
+MODULE_DESCRIPTION("Traverse Technologies Geos System Setup");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 721e652..e0a3723 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -28,6 +28,8 @@
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/mfd/intel_msic.h>
+#include <linux/gpio.h>
+#include <linux/i2c/tc35876x.h>
#include <asm/setup.h>
#include <asm/mpspec_def.h>
@@ -675,6 +677,19 @@ static void *msic_thermal_platform_data(void *info)
return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_THERMAL);
}
+/* tc35876x DSI-LVDS bridge chip and panel platform data */
+static void *tc35876x_platform_data(void *data)
+{
+ static struct tc35876x_platform_data pdata;
+
+ /* gpio pins set to -1 will not be used by the driver */
+ pdata.gpio_bridge_reset = get_gpio_by_name("LCMB_RXEN");
+ pdata.gpio_panel_bl_en = get_gpio_by_name("6S6P_BL_EN");
+ pdata.gpio_panel_vadd = get_gpio_by_name("EN_VREG_LCD_V3P3");
+
+ return &pdata;
+}
+
static const struct devs_id __initconst device_ids[] = {
{"bma023", SFI_DEV_TYPE_I2C, 1, &no_platform_data},
{"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data},
@@ -687,6 +702,7 @@ static const struct devs_id __initconst device_ids[] = {
{"i2c_accel", SFI_DEV_TYPE_I2C, 0, &lis331dl_platform_data},
{"pmic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data},
{"mpu3050", SFI_DEV_TYPE_I2C, 1, &mpu3050_platform_data},
+ {"i2c_disp_brig", SFI_DEV_TYPE_I2C, 0, &tc35876x_platform_data},
/* MSIC subdevices */
{"msic_battery", SFI_DEV_TYPE_IPC, 1, &msic_battery_platform_data},
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 4889655..4793683 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -115,7 +115,7 @@ static void __save_processor_state(struct saved_context *ctxt)
void save_processor_state(void)
{
__save_processor_state(&saved_context);
- save_sched_clock_state();
+ x86_platform.save_sched_clock_state();
}
#ifdef CONFIG_X86_32
EXPORT_SYMBOL(save_processor_state);
@@ -231,8 +231,8 @@ static void __restore_processor_state(struct saved_context *ctxt)
/* Needed by apm.c */
void restore_processor_state(void)
{
+ x86_platform.restore_sched_clock_state();
__restore_processor_state(&saved_context);
- restore_sched_clock_state();
}
#ifdef CONFIG_X86_32
EXPORT_SYMBOL(restore_processor_state);
diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c
index 3769079..74202c1 100644
--- a/arch/x86/power/hibernate_32.c
+++ b/arch/x86/power/hibernate_32.c
@@ -10,7 +10,6 @@
#include <linux/suspend.h>
#include <linux/bootmem.h>
-#include <asm/system.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/mmzone.h>
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index b2b54d2..9926e11 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -15,8 +15,8 @@ config UML_X86
select GENERIC_FIND_FIRST_BIT
config 64BIT
- bool
- default SUBARCH = "x86_64"
+ bool "64-bit kernel" if SUBARCH = "x86"
+ default SUBARCH != "i386"
config X86_32
def_bool !64BIT
diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h
index 2c32df6..04f82e0 100644
--- a/arch/x86/um/asm/processor.h
+++ b/arch/x86/um/asm/processor.h
@@ -17,6 +17,16 @@
#define ARCH_IS_STACKGROW(address) \
(address + 65536 + 32 * sizeof(unsigned long) >= UPT_SP(&current->thread.regs.regs))
+#include <asm/user.h>
+
+/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
+static inline void rep_nop(void)
+{
+ __asm__ __volatile__("rep;nop": : :"memory");
+}
+
+#define cpu_relax() rep_nop()
+
#include <asm/processor-generic.h>
#endif
diff --git a/arch/x86/um/asm/processor_32.h b/arch/x86/um/asm/processor_32.h
index 018f732..6c6689e 100644
--- a/arch/x86/um/asm/processor_32.h
+++ b/arch/x86/um/asm/processor_32.h
@@ -45,16 +45,6 @@ static inline void arch_copy_thread(struct arch_thread *from,
memcpy(&to->tls_array, &from->tls_array, sizeof(from->tls_array));
}
-#include <asm/user.h>
-
-/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
-static inline void rep_nop(void)
-{
- __asm__ __volatile__("rep;nop": : :"memory");
-}
-
-#define cpu_relax() rep_nop()
-
/*
* Default implementation of macro that returns current
* instruction pointer ("program counter"). Stolen
diff --git a/arch/x86/um/asm/processor_64.h b/arch/x86/um/asm/processor_64.h
index 61de92d..4b02a84 100644
--- a/arch/x86/um/asm/processor_64.h
+++ b/arch/x86/um/asm/processor_64.h
@@ -14,14 +14,6 @@ struct arch_thread {
struct faultinfo faultinfo;
};
-/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
-static inline void rep_nop(void)
-{
- __asm__ __volatile__("rep;nop": : :"memory");
-}
-
-#define cpu_relax() rep_nop()
-
#define INIT_ARCH_THREAD { .debugregs = { [ 0 ... 7 ] = 0 }, \
.debugregs_seq = 0, \
.fs = 0, \
@@ -37,8 +29,6 @@ static inline void arch_copy_thread(struct arch_thread *from,
to->fs = from->fs;
}
-#include <asm/user.h>
-
#define current_text_addr() \
({ void *pc; __asm__("movq $1f,%0\n1:":"=g" (pc)); pc; })
diff --git a/arch/x86/um/bugs_32.c b/arch/x86/um/bugs_32.c
index a1fba5f..17d88cf 100644
--- a/arch/x86/um/bugs_32.c
+++ b/arch/x86/um/bugs_32.c
@@ -13,8 +13,6 @@
static int host_has_cmov = 1;
static jmp_buf cmov_test_return;
-#define TASK_PID(task) *((int *) &(((char *) (task))[HOST_TASK_PID]))
-
static void cmov_sigill_test_handler(int sig)
{
host_has_cmov = 0;
@@ -51,7 +49,7 @@ void arch_examine_signal(int sig, struct uml_pt_regs *regs)
* This is testing for a cmov (0x0f 0x4x) instruction causing a
* SIGILL in init.
*/
- if ((sig != SIGILL) || (TASK_PID(get_current()) != 1))
+ if ((sig != SIGILL) || (get_current_pid() != 1))
return;
if (copy_from_user_proc(tmp, (void *) UPT_IP(regs), 2)) {
diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c
index 639900a..f40281e 100644
--- a/arch/x86/um/mem_32.c
+++ b/arch/x86/um/mem_32.c
@@ -23,14 +23,6 @@ static int __init gate_vma_init(void)
gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
gate_vma.vm_page_prot = __P101;
- /*
- * Make sure the vDSO gets into every core dump.
- * Dumping its contents makes post-mortem fully interpretable later
- * without matching up the same kernel and hardware config to see
- * what PC values meant.
- */
- gate_vma.vm_flags |= VM_ALWAYSDUMP;
-
return 0;
}
__initcall(gate_vma_init);
diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c
index 91f4ec9..af91901 100644
--- a/arch/x86/um/vdso/vma.c
+++ b/arch/x86/um/vdso/vma.c
@@ -64,8 +64,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
err = install_special_mapping(mm, um_vdso_addr, PAGE_SIZE,
VM_READ|VM_EXEC|
- VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
- VM_ALWAYSDUMP,
+ VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
vdsop);
up_write(&mm->mmap_sem);
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 6bc0e72..885eff4 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -70,100 +70,98 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
return ret;
}
+notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
+{
+ long ret;
+
+ asm("syscall" : "=a" (ret) :
+ "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
+ return ret;
+}
+
+
notrace static inline long vgetns(void)
{
long v;
cycles_t cycles;
if (gtod->clock.vclock_mode == VCLOCK_TSC)
cycles = vread_tsc();
- else
+ else if (gtod->clock.vclock_mode == VCLOCK_HPET)
cycles = vread_hpet();
+ else
+ return 0;
v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask;
return (v * gtod->clock.mult) >> gtod->clock.shift;
}
-notrace static noinline int do_realtime(struct timespec *ts)
+/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */
+notrace static int __always_inline do_realtime(struct timespec *ts)
{
unsigned long seq, ns;
+ int mode;
+
do {
- seq = read_seqbegin(&gtod->lock);
+ seq = read_seqcount_begin(&gtod->seq);
+ mode = gtod->clock.vclock_mode;
ts->tv_sec = gtod->wall_time_sec;
ts->tv_nsec = gtod->wall_time_nsec;
ns = vgetns();
- } while (unlikely(read_seqretry(&gtod->lock, seq)));
+ } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+
timespec_add_ns(ts, ns);
- return 0;
+ return mode;
}
-notrace static noinline int do_monotonic(struct timespec *ts)
+notrace static int do_monotonic(struct timespec *ts)
{
- unsigned long seq, ns, secs;
+ unsigned long seq, ns;
+ int mode;
+
do {
- seq = read_seqbegin(&gtod->lock);
- secs = gtod->wall_time_sec;
- ns = gtod->wall_time_nsec + vgetns();
- secs += gtod->wall_to_monotonic.tv_sec;
- ns += gtod->wall_to_monotonic.tv_nsec;
- } while (unlikely(read_seqretry(&gtod->lock, seq)));
-
- /* wall_time_nsec, vgetns(), and wall_to_monotonic.tv_nsec
- * are all guaranteed to be nonnegative.
- */
- while (ns >= NSEC_PER_SEC) {
- ns -= NSEC_PER_SEC;
- ++secs;
- }
- ts->tv_sec = secs;
- ts->tv_nsec = ns;
+ seq = read_seqcount_begin(&gtod->seq);
+ mode = gtod->clock.vclock_mode;
+ ts->tv_sec = gtod->monotonic_time_sec;
+ ts->tv_nsec = gtod->monotonic_time_nsec;
+ ns = vgetns();
+ } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+ timespec_add_ns(ts, ns);
- return 0;
+ return mode;
}
-notrace static noinline int do_realtime_coarse(struct timespec *ts)
+notrace static int do_realtime_coarse(struct timespec *ts)
{
unsigned long seq;
do {
- seq = read_seqbegin(&gtod->lock);
+ seq = read_seqcount_begin(&gtod->seq);
ts->tv_sec = gtod->wall_time_coarse.tv_sec;
ts->tv_nsec = gtod->wall_time_coarse.tv_nsec;
- } while (unlikely(read_seqretry(&gtod->lock, seq)));
+ } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
return 0;
}
-notrace static noinline int do_monotonic_coarse(struct timespec *ts)
+notrace static int do_monotonic_coarse(struct timespec *ts)
{
- unsigned long seq, ns, secs;
+ unsigned long seq;
do {
- seq = read_seqbegin(&gtod->lock);
- secs = gtod->wall_time_coarse.tv_sec;
- ns = gtod->wall_time_coarse.tv_nsec;
- secs += gtod->wall_to_monotonic.tv_sec;
- ns += gtod->wall_to_monotonic.tv_nsec;
- } while (unlikely(read_seqretry(&gtod->lock, seq)));
-
- /* wall_time_nsec and wall_to_monotonic.tv_nsec are
- * guaranteed to be between 0 and NSEC_PER_SEC.
- */
- if (ns >= NSEC_PER_SEC) {
- ns -= NSEC_PER_SEC;
- ++secs;
- }
- ts->tv_sec = secs;
- ts->tv_nsec = ns;
+ seq = read_seqcount_begin(&gtod->seq);
+ ts->tv_sec = gtod->monotonic_time_coarse.tv_sec;
+ ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec;
+ } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
return 0;
}
notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
{
+ int ret = VCLOCK_NONE;
+
switch (clock) {
case CLOCK_REALTIME:
- if (likely(gtod->clock.vclock_mode != VCLOCK_NONE))
- return do_realtime(ts);
+ ret = do_realtime(ts);
break;
case CLOCK_MONOTONIC:
- if (likely(gtod->clock.vclock_mode != VCLOCK_NONE))
- return do_monotonic(ts);
+ ret = do_monotonic(ts);
break;
case CLOCK_REALTIME_COARSE:
return do_realtime_coarse(ts);
@@ -171,32 +169,33 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
return do_monotonic_coarse(ts);
}
- return vdso_fallback_gettime(clock, ts);
+ if (ret == VCLOCK_NONE)
+ return vdso_fallback_gettime(clock, ts);
+ return 0;
}
int clock_gettime(clockid_t, struct timespec *)
__attribute__((weak, alias("__vdso_clock_gettime")));
notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
{
- long ret;
- if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) {
- if (likely(tv != NULL)) {
- BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
- offsetof(struct timespec, tv_nsec) ||
- sizeof(*tv) != sizeof(struct timespec));
- do_realtime((struct timespec *)tv);
- tv->tv_usec /= 1000;
- }
- if (unlikely(tz != NULL)) {
- /* Avoid memcpy. Some old compilers fail to inline it */
- tz->tz_minuteswest = gtod->sys_tz.tz_minuteswest;
- tz->tz_dsttime = gtod->sys_tz.tz_dsttime;
- }
- return 0;
+ long ret = VCLOCK_NONE;
+
+ if (likely(tv != NULL)) {
+ BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
+ offsetof(struct timespec, tv_nsec) ||
+ sizeof(*tv) != sizeof(struct timespec));
+ ret = do_realtime((struct timespec *)tv);
+ tv->tv_usec /= 1000;
}
- asm("syscall" : "=a" (ret) :
- "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
- return ret;
+ if (unlikely(tz != NULL)) {
+ /* Avoid memcpy. Some old compilers fail to inline it */
+ tz->tz_minuteswest = gtod->sys_tz.tz_minuteswest;
+ tz->tz_dsttime = gtod->sys_tz.tz_dsttime;
+ }
+
+ if (ret == VCLOCK_NONE)
+ return vdso_fallback_gtod(tv, tz);
+ return 0;
}
int gettimeofday(struct timeval *, struct timezone *)
__attribute__((weak, alias("__vdso_gettimeofday")));
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 468d591..a944020 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -250,13 +250,7 @@ static int __init gate_vma_init(void)
gate_vma.vm_end = FIXADDR_USER_END;
gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
gate_vma.vm_page_prot = __P101;
- /*
- * Make sure the vDSO gets into every core dump.
- * Dumping its contents makes post-mortem fully interpretable later
- * without matching up the same kernel and hardware config to see
- * what PC values meant.
- */
- gate_vma.vm_flags |= VM_ALWAYSDUMP;
+
return 0;
}
@@ -343,17 +337,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
if (compat_uses_vma || !compat) {
/*
* MAYWRITE to allow gdb to COW and set breakpoints
- *
- * Make sure the vDSO gets into every core dump.
- * Dumping its contents makes post-mortem fully
- * interpretable later without matching up the same
- * kernel and hardware config to see what PC values
- * meant.
*/
ret = install_special_mapping(mm, addr, PAGE_SIZE,
VM_READ|VM_EXEC|
- VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
- VM_ALWAYSDUMP,
+ VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
vdso32_pages);
if (ret)
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 153407c..17e1827 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -124,8 +124,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
ret = install_special_mapping(mm, addr, vdso_size,
VM_READ|VM_EXEC|
- VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
- VM_ALWAYSDUMP,
+ VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
vdso_pages);
if (ret) {
current->mm->context.vdso = NULL;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 4172af8..b132ade 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -62,6 +62,15 @@
#include <asm/reboot.h>
#include <asm/stackprotector.h>
#include <asm/hypervisor.h>
+#include <asm/mwait.h>
+
+#ifdef CONFIG_ACPI
+#include <linux/acpi.h>
+#include <asm/acpi.h>
+#include <acpi/pdc_intel.h>
+#include <acpi/processor.h>
+#include <xen/interface/platform.h>
+#endif
#include "xen-ops.h"
#include "mmu.h"
@@ -200,13 +209,17 @@ static void __init xen_banner(void)
static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
+static __read_mostly unsigned int cpuid_leaf1_ecx_set_mask;
+static __read_mostly unsigned int cpuid_leaf5_ecx_val;
+static __read_mostly unsigned int cpuid_leaf5_edx_val;
+
static void xen_cpuid(unsigned int *ax, unsigned int *bx,
unsigned int *cx, unsigned int *dx)
{
unsigned maskebx = ~0;
unsigned maskecx = ~0;
unsigned maskedx = ~0;
-
+ unsigned setecx = 0;
/*
* Mask out inconvenient features, to try and disable as many
* unsupported kernel subsystems as possible.
@@ -214,9 +227,18 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
switch (*ax) {
case 1:
maskecx = cpuid_leaf1_ecx_mask;
+ setecx = cpuid_leaf1_ecx_set_mask;
maskedx = cpuid_leaf1_edx_mask;
break;
+ case CPUID_MWAIT_LEAF:
+ /* Synthesize the values.. */
+ *ax = 0;
+ *bx = 0;
+ *cx = cpuid_leaf5_ecx_val;
+ *dx = cpuid_leaf5_edx_val;
+ return;
+
case 0xb:
/* Suppress extended topology stuff */
maskebx = 0;
@@ -232,9 +254,75 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
*bx &= maskebx;
*cx &= maskecx;
+ *cx |= setecx;
*dx &= maskedx;
+
}
+static bool __init xen_check_mwait(void)
+{
+#ifdef CONFIG_ACPI
+ struct xen_platform_op op = {
+ .cmd = XENPF_set_processor_pminfo,
+ .u.set_pminfo.id = -1,
+ .u.set_pminfo.type = XEN_PM_PDC,
+ };
+ uint32_t buf[3];
+ unsigned int ax, bx, cx, dx;
+ unsigned int mwait_mask;
+
+ /* We need to determine whether it is OK to expose the MWAIT
+ * capability to the kernel to harvest deeper than C3 states from ACPI
+ * _CST using the processor_harvest_xen.c module. For this to work, we
+ * need to gather the MWAIT_LEAF values (which the cstate.c code
+ * checks against). The hypervisor won't expose the MWAIT flag because
+ * it would break backwards compatibility; so we will find out directly
+ * from the hardware and hypercall.
+ */
+ if (!xen_initial_domain())
+ return false;
+
+ ax = 1;
+ cx = 0;
+
+ native_cpuid(&ax, &bx, &cx, &dx);
+
+ mwait_mask = (1 << (X86_FEATURE_EST % 32)) |
+ (1 << (X86_FEATURE_MWAIT % 32));
+
+ if ((cx & mwait_mask) != mwait_mask)
+ return false;
+
+ /* We need to emulate the MWAIT_LEAF and for that we need both
+ * ecx and edx. The hypercall provides only partial information.
+ */
+
+ ax = CPUID_MWAIT_LEAF;
+ bx = 0;
+ cx = 0;
+ dx = 0;
+
+ native_cpuid(&ax, &bx, &cx, &dx);
+
+ /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
+ * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
+ */
+ buf[0] = ACPI_PDC_REVISION_ID;
+ buf[1] = 1;
+ buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP);
+
+ set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
+
+ if ((HYPERVISOR_dom0_op(&op) == 0) &&
+ (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
+ cpuid_leaf5_ecx_val = cx;
+ cpuid_leaf5_edx_val = dx;
+ }
+ return true;
+#else
+ return false;
+#endif
+}
static void __init xen_init_cpuid_mask(void)
{
unsigned int ax, bx, cx, dx;
@@ -261,6 +349,9 @@ static void __init xen_init_cpuid_mask(void)
/* Xen will set CR4.OSXSAVE if supported and not disabled by force */
if ((cx & xsave_mask) != xsave_mask)
cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
+
+ if (xen_check_mwait())
+ cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32));
}
static void xen_set_debugreg(int reg, unsigned long val)
@@ -777,11 +868,11 @@ static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
static unsigned long xen_read_cr0(void)
{
- unsigned long cr0 = percpu_read(xen_cr0_value);
+ unsigned long cr0 = this_cpu_read(xen_cr0_value);
if (unlikely(cr0 == 0)) {
cr0 = native_read_cr0();
- percpu_write(xen_cr0_value, cr0);
+ this_cpu_write(xen_cr0_value, cr0);
}
return cr0;
@@ -791,7 +882,7 @@ static void xen_write_cr0(unsigned long cr0)
{
struct multicall_space mcs;
- percpu_write(xen_cr0_value, cr0);
+ this_cpu_write(xen_cr0_value, cr0);
/* Only pay attention to cr0.TS; everything else is
ignored. */
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 8bbb465..1573376 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -26,7 +26,7 @@ static unsigned long xen_save_fl(void)
struct vcpu_info *vcpu;
unsigned long flags;
- vcpu = percpu_read(xen_vcpu);
+ vcpu = this_cpu_read(xen_vcpu);
/* flag has opposite sense of mask */
flags = !vcpu->evtchn_upcall_mask;
@@ -50,7 +50,7 @@ static void xen_restore_fl(unsigned long flags)
make sure we're don't switch CPUs between getting the vcpu
pointer and updating the mask. */
preempt_disable();
- vcpu = percpu_read(xen_vcpu);
+ vcpu = this_cpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = flags;
preempt_enable_no_resched();
@@ -72,7 +72,7 @@ static void xen_irq_disable(void)
make sure we're don't switch CPUs between getting the vcpu
pointer and updating the mask. */
preempt_disable();
- percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
+ this_cpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
preempt_enable_no_resched();
}
PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
@@ -86,7 +86,7 @@ static void xen_irq_enable(void)
the caller is confused and is trying to re-enable interrupts
on an indeterminate processor. */
- vcpu = percpu_read(xen_vcpu);
+ vcpu = this_cpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = 0;
/* Doesn't matter if we get preempted here, because any
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 95c1cf6..988828b 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1071,14 +1071,14 @@ static void drop_other_mm_ref(void *info)
struct mm_struct *mm = info;
struct mm_struct *active_mm;
- active_mm = percpu_read(cpu_tlbstate.active_mm);
+ active_mm = this_cpu_read(cpu_tlbstate.active_mm);
- if (active_mm == mm && percpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
+ if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
leave_mm(smp_processor_id());
/* If this cpu still has a stale cr3 reference, then make sure
it has been flushed. */
- if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
+ if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
load_cr3(swapper_pg_dir);
}
@@ -1185,17 +1185,17 @@ static void __init xen_pagetable_setup_done(pgd_t *base)
static void xen_write_cr2(unsigned long cr2)
{
- percpu_read(xen_vcpu)->arch.cr2 = cr2;
+ this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
}
static unsigned long xen_read_cr2(void)
{
- return percpu_read(xen_vcpu)->arch.cr2;
+ return this_cpu_read(xen_vcpu)->arch.cr2;
}
unsigned long xen_read_cr2_direct(void)
{
- return percpu_read(xen_vcpu_info.arch.cr2);
+ return this_cpu_read(xen_vcpu_info.arch.cr2);
}
static void xen_flush_tlb(void)
@@ -1278,12 +1278,12 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
static unsigned long xen_read_cr3(void)
{
- return percpu_read(xen_cr3);
+ return this_cpu_read(xen_cr3);
}
static void set_current_cr3(void *v)
{
- percpu_write(xen_current_cr3, (unsigned long)v);
+ this_cpu_write(xen_current_cr3, (unsigned long)v);
}
static void __xen_write_cr3(bool kernel, unsigned long cr3)
@@ -1306,7 +1306,7 @@ static void __xen_write_cr3(bool kernel, unsigned long cr3)
xen_extend_mmuext_op(&op);
if (kernel) {
- percpu_write(xen_cr3, cr3);
+ this_cpu_write(xen_cr3, cr3);
/* Update xen_current_cr3 once the batch has actually
been submitted. */
@@ -1322,7 +1322,7 @@ static void xen_write_cr3(unsigned long cr3)
/* Update while interrupts are disabled, so its atomic with
respect to ipis */
- percpu_write(xen_cr3, cr3);
+ this_cpu_write(xen_cr3, cr3);
__xen_write_cr3(true, cr3);
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index dee79b7..9c2e74f 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -47,7 +47,7 @@ static inline void xen_mc_issue(unsigned mode)
xen_mc_flush();
/* restore flags saved in xen_mc_batch */
- local_irq_restore(percpu_read(xen_mc_irq_flags));
+ local_irq_restore(this_cpu_read(xen_mc_irq_flags));
}
/* Set up a callback to be called when the current batch is flushed */
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index e03c636..1ba8dff 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -10,6 +10,7 @@
#include <linux/pm.h>
#include <linux/memblock.h>
#include <linux/cpuidle.h>
+#include <linux/cpufreq.h>
#include <asm/elf.h>
#include <asm/vdso.h>
@@ -420,7 +421,7 @@ void __init xen_arch_setup(void)
boot_cpu_data.hlt_works_ok = 1;
#endif
disable_cpuidle();
- boot_option_idle_override = IDLE_HALT;
+ disable_cpufreq();
WARN_ON(set_pm_idle_to_default());
fiddle_vdso();
}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 501d4e0..02900e8 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -75,8 +75,14 @@ static void __cpuinit cpu_bringup(void)
xen_setup_cpu_clockevents();
+ notify_cpu_starting(cpu);
+
+ ipi_call_lock();
set_cpu_online(cpu, true);
- percpu_write(cpu_state, CPU_ONLINE);
+ ipi_call_unlock();
+
+ this_cpu_write(cpu_state, CPU_ONLINE);
+
wmb();
/* We can take interrupts now: we're officially "up". */