summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kbuild3
-rw-r--r--arch/x86/Kconfig30
-rw-r--r--arch/x86/Kconfig.debug18
-rw-r--r--arch/x86/boot/tools/build.c1
-rw-r--r--arch/x86/configs/i386_defconfig3
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c11
-rw-r--r--arch/x86/crypto/camellia_glue.c10
-rw-r--r--arch/x86/crypto/cast6_avx_glue.c10
-rw-r--r--arch/x86/crypto/serpent_avx_glue.c11
-rw-r--r--arch/x86/crypto/serpent_sse2_glue.c11
-rw-r--r--arch/x86/crypto/sha-mb/sha1_mb.c35
-rw-r--r--arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S2
-rw-r--r--arch/x86/crypto/twofish_glue_3way.c10
-rw-r--r--arch/x86/entry/common.c58
-rw-r--r--arch/x86/entry/entry_32.S265
-rw-r--r--arch/x86/entry/entry_64.S10
-rw-r--r--arch/x86/entry/entry_64_compat.S103
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl2
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl2
-rw-r--r--arch/x86/entry/vdso/vdso2c.h2
-rw-r--r--arch/x86/events/Makefile13
-rw-r--r--arch/x86/events/amd/core.c (renamed from arch/x86/kernel/cpu/perf_event_amd.c)2
-rw-r--r--arch/x86/events/amd/ibs.c (renamed from arch/x86/kernel/cpu/perf_event_amd_ibs.c)12
-rw-r--r--arch/x86/events/amd/iommu.c (renamed from arch/x86/kernel/cpu/perf_event_amd_iommu.c)4
-rw-r--r--arch/x86/events/amd/iommu.h (renamed from arch/x86/kernel/cpu/perf_event_amd_iommu.h)0
-rw-r--r--arch/x86/events/amd/uncore.c (renamed from arch/x86/kernel/cpu/perf_event_amd_uncore.c)4
-rw-r--r--arch/x86/events/core.c (renamed from arch/x86/kernel/cpu/perf_event.c)26
-rw-r--r--arch/x86/events/intel/bts.c (renamed from arch/x86/kernel/cpu/perf_event_intel_bts.c)2
-rw-r--r--arch/x86/events/intel/core.c (renamed from arch/x86/kernel/cpu/perf_event_intel.c)31
-rw-r--r--arch/x86/events/intel/cqm.c (renamed from arch/x86/kernel/cpu/perf_event_intel_cqm.c)34
-rw-r--r--arch/x86/events/intel/cstate.c (renamed from arch/x86/kernel/cpu/perf_event_intel_cstate.c)2
-rw-r--r--arch/x86/events/intel/ds.c (renamed from arch/x86/kernel/cpu/perf_event_intel_ds.c)56
-rw-r--r--arch/x86/events/intel/knc.c (renamed from arch/x86/kernel/cpu/perf_event_knc.c)6
-rw-r--r--arch/x86/events/intel/lbr.c (renamed from arch/x86/kernel/cpu/perf_event_intel_lbr.c)2
-rw-r--r--arch/x86/events/intel/p4.c (renamed from arch/x86/kernel/cpu/perf_event_p4.c)2
-rw-r--r--arch/x86/events/intel/p6.c (renamed from arch/x86/kernel/cpu/perf_event_p6.c)2
-rw-r--r--arch/x86/events/intel/pt.c (renamed from arch/x86/kernel/cpu/perf_event_intel_pt.c)4
-rw-r--r--arch/x86/events/intel/pt.h (renamed from arch/x86/kernel/cpu/intel_pt.h)0
-rw-r--r--arch/x86/events/intel/rapl.c (renamed from arch/x86/kernel/cpu/perf_event_intel_rapl.c)412
-rw-r--r--arch/x86/events/intel/uncore.c (renamed from arch/x86/kernel/cpu/perf_event_intel_uncore.c)677
-rw-r--r--arch/x86/events/intel/uncore.h (renamed from arch/x86/kernel/cpu/perf_event_intel_uncore.h)55
-rw-r--r--arch/x86/events/intel/uncore_nhmex.c (renamed from arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c)8
-rw-r--r--arch/x86/events/intel/uncore_snb.c (renamed from arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c)16
-rw-r--r--arch/x86/events/intel/uncore_snbep.c (renamed from arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c)21
-rw-r--r--arch/x86/events/msr.c (renamed from arch/x86/kernel/cpu/perf_event_msr.c)0
-rw-r--r--arch/x86/events/perf_event.h (renamed from arch/x86/kernel/cpu/perf_event.h)5
-rw-r--r--arch/x86/include/asm/amd_nb.h26
-rw-r--r--arch/x86/include/asm/barrier.h15
-rw-r--r--arch/x86/include/asm/cacheflush.h6
-rw-r--r--arch/x86/include/asm/checksum_32.h9
-rw-r--r--arch/x86/include/asm/checksum_64.h10
-rw-r--r--arch/x86/include/asm/cpufeatures.h13
-rw-r--r--arch/x86/include/asm/dmi.h2
-rw-r--r--arch/x86/include/asm/efi.h2
-rw-r--r--arch/x86/include/asm/elf.h2
-rw-r--r--arch/x86/include/asm/fixmap.h2
-rw-r--r--arch/x86/include/asm/fpu/internal.h3
-rw-r--r--arch/x86/include/asm/fpu/xstate.h9
-rw-r--r--arch/x86/include/asm/gpio.h4
-rw-r--r--arch/x86/include/asm/imr.h2
-rw-r--r--arch/x86/include/asm/ipi.h58
-rw-r--r--arch/x86/include/asm/kvm_host.h31
-rw-r--r--arch/x86/include/asm/kvm_page_track.h61
-rw-r--r--arch/x86/include/asm/kvm_para.h7
-rw-r--r--arch/x86/include/asm/livepatch.h4
-rw-r--r--arch/x86/include/asm/mce.h69
-rw-r--r--arch/x86/include/asm/microcode.h26
-rw-r--r--arch/x86/include/asm/microcode_intel.h1
-rw-r--r--arch/x86/include/asm/msr-index.h12
-rw-r--r--arch/x86/include/asm/pci.h18
-rw-r--r--arch/x86/include/asm/pci_x86.h2
-rw-r--r--arch/x86/include/asm/perf_event.h1
-rw-r--r--arch/x86/include/asm/pmem.h5
-rw-r--r--arch/x86/include/asm/processor.h7
-rw-r--r--arch/x86/include/asm/proto.h15
-rw-r--r--arch/x86/include/asm/sections.h2
-rw-r--r--arch/x86/include/asm/stacktrace.h2
-rw-r--r--arch/x86/include/asm/string_64.h13
-rw-r--r--arch/x86/include/asm/thread_info.h7
-rw-r--r--arch/x86/include/asm/tlbflush.h57
-rw-r--r--arch/x86/include/asm/topology.h11
-rw-r--r--arch/x86/include/asm/tsc.h2
-rw-r--r--arch/x86/include/uapi/asm/hyperv.h4
-rw-r--r--arch/x86/kernel/acpi/sleep.c7
-rw-r--r--arch/x86/kernel/aperture_64.c12
-rw-r--r--arch/x86/kernel/apic/apic.c14
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c2
-rw-r--r--arch/x86/kernel/apic/ipi.c60
-rw-r--r--arch/x86/kernel/asm-offsets.c1
-rw-r--r--arch/x86/kernel/asm-offsets_32.c5
-rw-r--r--arch/x86/kernel/cpu/Makefile24
-rw-r--r--arch/x86/kernel/cpu/amd.c23
-rw-r--r--arch/x86/kernel/cpu/bugs_64.c2
-rw-r--r--arch/x86/kernel/cpu/centaur.c10
-rw-r--r--arch/x86/kernel/cpu/common.c85
-rw-r--r--arch/x86/kernel/cpu/cyrix.c10
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c2
-rw-r--r--arch/x86/kernel/cpu/intel.c23
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c15
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c120
-rw-r--r--arch/x86/kernel/cpu/mcheck/p5.c18
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c15
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/winchip.c5
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c17
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c19
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c285
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_lib.c58
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c8
-rw-r--r--arch/x86/kernel/cpu/mtrr/centaur.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c44
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c23
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c20
-rw-r--r--arch/x86/kernel/cpu/rdrand.c2
-rw-r--r--arch/x86/kernel/cpu/topology.c4
-rw-r--r--arch/x86/kernel/cpu/transmeta.c8
-rw-r--r--arch/x86/kernel/cpu/vmware.c5
-rw-r--r--arch/x86/kernel/crash.c41
-rw-r--r--arch/x86/kernel/dumpstack.c11
-rw-r--r--arch/x86/kernel/e820.c38
-rw-r--r--arch/x86/kernel/fpu/core.c56
-rw-r--r--arch/x86/kernel/fpu/init.c35
-rw-r--r--arch/x86/kernel/fpu/xstate.c3
-rw-r--r--arch/x86/kernel/ftrace.c6
-rw-r--r--arch/x86/kernel/head64.c14
-rw-r--r--arch/x86/kernel/head_32.S6
-rw-r--r--arch/x86/kernel/head_64.S4
-rw-r--r--arch/x86/kernel/kgdb.c8
-rw-r--r--arch/x86/kernel/mpparse.c2
-rw-r--r--arch/x86/kernel/nmi.c3
-rw-r--r--arch/x86/kernel/pmem.c4
-rw-r--r--arch/x86/kernel/process.c7
-rw-r--r--arch/x86/kernel/setup.c6
-rw-r--r--arch/x86/kernel/smpboot.c102
-rw-r--r--arch/x86/kernel/stacktrace.c18
-rw-r--r--arch/x86/kernel/tboot.c2
-rw-r--r--arch/x86/kernel/test_nx.c2
-rw-r--r--arch/x86/kernel/test_rodata.c2
-rw-r--r--arch/x86/kernel/traps.c94
-rw-r--r--arch/x86/kernel/tsc.c67
-rw-r--r--arch/x86/kernel/vmlinux.lds.S33
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c2
-rw-r--r--arch/x86/kvm/Makefile3
-rw-r--r--arch/x86/kvm/assigned-dev.c14
-rw-r--r--arch/x86/kvm/cpuid.c14
-rw-r--r--arch/x86/kvm/cpuid.h9
-rw-r--r--arch/x86/kvm/emulate.c4
-rw-r--r--arch/x86/kvm/hyperv.c50
-rw-r--r--arch/x86/kvm/i8254.c350
-rw-r--r--arch/x86/kvm/i8254.h17
-rw-r--r--arch/x86/kvm/ioapic.c30
-rw-r--r--arch/x86/kvm/ioapic.h17
-rw-r--r--arch/x86/kvm/irq.c9
-rw-r--r--arch/x86/kvm/irq.h8
-rw-r--r--arch/x86/kvm/irq_comm.c27
-rw-r--r--arch/x86/kvm/lapic.c162
-rw-r--r--arch/x86/kvm/lapic.h17
-rw-r--r--arch/x86/kvm/mmu.c506
-rw-r--r--arch/x86/kvm/mmu.h5
-rw-r--r--arch/x86/kvm/page_track.c222
-rw-r--r--arch/x86/kvm/paging_tmpl.h37
-rw-r--r--arch/x86/kvm/pmu.c2
-rw-r--r--arch/x86/kvm/svm.c3
-rw-r--r--arch/x86/kvm/trace.h12
-rw-r--r--arch/x86/kvm/vmx.c142
-rw-r--r--arch/x86/kvm/x86.c166
-rw-r--r--arch/x86/kvm/x86.h16
-rw-r--r--arch/x86/lguest/boot.c8
-rw-r--r--arch/x86/lib/cmdline.c60
-rw-r--r--arch/x86/lib/csum-wrappers_64.c2
-rw-r--r--arch/x86/lib/delay.c2
-rw-r--r--arch/x86/lib/memcpy_64.S117
-rw-r--r--arch/x86/mm/dump_pagetables.c11
-rw-r--r--arch/x86/mm/gup.c2
-rw-r--r--arch/x86/mm/init.c36
-rw-r--r--arch/x86/mm/init_32.c6
-rw-r--r--arch/x86/mm/init_64.c27
-rw-r--r--arch/x86/mm/kasan_init_64.c17
-rw-r--r--arch/x86/mm/kmmio.c88
-rw-r--r--arch/x86/mm/mmap.c20
-rw-r--r--arch/x86/mm/mpx.c2
-rw-r--r--arch/x86/mm/numa.c67
-rw-r--r--arch/x86/mm/pageattr.c34
-rw-r--r--arch/x86/mm/setup_nx.c5
-rw-r--r--arch/x86/oprofile/backtrace.c3
-rw-r--r--arch/x86/pci/common.c27
-rw-r--r--arch/x86/pci/fixup.c28
-rw-r--r--arch/x86/pci/intel_mid_pci.c9
-rw-r--r--arch/x86/pci/irq.c23
-rw-r--r--arch/x86/pci/vmd.c35
-rw-r--r--arch/x86/platform/efi/quirks.c79
-rw-r--r--arch/x86/platform/geode/alix.c14
-rw-r--r--arch/x86/platform/geode/geos.c8
-rw-r--r--arch/x86/platform/geode/net5501.c8
-rw-r--r--arch/x86/platform/intel-mid/mfld.c5
-rw-r--r--arch/x86/platform/intel-mid/mrfl.c5
-rw-r--r--arch/x86/platform/intel-quark/imr.c61
-rw-r--r--arch/x86/platform/intel-quark/imr_selftest.c30
-rw-r--r--arch/x86/um/asm/checksum.h9
-rw-r--r--arch/x86/um/asm/checksum_32.h2
-rw-r--r--arch/x86/um/os-Linux/task_size.c4
-rw-r--r--arch/x86/xen/enlighten.c2
-rw-r--r--arch/x86/xen/pmu.c2
-rw-r--r--arch/x86/xen/smp.c2
206 files changed, 4117 insertions, 2619 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 1538562..eb3abf8 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -1,6 +1,7 @@
-
obj-y += entry/
+obj-$(CONFIG_PERF_EVENTS) += events/
+
obj-$(CONFIG_KVM) += kvm/
# Xen paravirtualization support
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index adc5a6d..d07cca6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -304,6 +304,9 @@ config ARCH_SUPPORTS_UPROBES
config FIX_EARLYCON_MEM
def_bool y
+config DEBUG_RODATA
+ def_bool y
+
config PGTABLE_LEVELS
int
default 4 if X86_64
@@ -1161,22 +1164,23 @@ config MICROCODE
bool "CPU microcode loading support"
default y
depends on CPU_SUP_AMD || CPU_SUP_INTEL
- depends on BLK_DEV_INITRD
select FW_LOADER
---help---
-
If you say Y here, you will be able to update the microcode on
- certain Intel and AMD processors. The Intel support is for the
- IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4,
- Xeon etc. The AMD support is for families 0x10 and later. You will
- obviously need the actual microcode binary data itself which is not
- shipped with the Linux kernel.
+ Intel and AMD processors. The Intel support is for the IA32 family,
+ e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, Xeon etc. The
+ AMD support is for families 0x10 and later. You will obviously need
+ the actual microcode binary data itself which is not shipped with
+ the Linux kernel.
- This option selects the general module only, you need to select
- at least one vendor specific module as well.
+ The preferred method to load microcode from a detached initrd is described
+ in Documentation/x86/early-microcode.txt. For that you need to enable
+ CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the
+ initrd for microcode blobs.
- To compile this driver as a module, choose M here: the module
- will be called microcode.
+ In addition, you can build-in the microcode into the kernel. For that you
+ need to enable FIRMWARE_IN_KERNEL and add the vendor-supplied microcode
+ to the CONFIG_EXTRA_FIRMWARE config option.
config MICROCODE_INTEL
bool "Intel microcode loading support"
@@ -2432,8 +2436,6 @@ config PCI_CNB20LE_QUIRK
You should say N unless you know you need this.
-source "drivers/pci/pcie/Kconfig"
-
source "drivers/pci/Kconfig"
# x86_64 have no ISA slots, but can have ISA-style DMA.
@@ -2589,8 +2591,6 @@ config AMD_NB
source "drivers/pcmcia/Kconfig"
-source "drivers/pci/hotplug/Kconfig"
-
config RAPIDIO
tristate "RapidIO support"
depends on PCI
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 68a2d1f..67eec55 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -74,28 +74,16 @@ config EFI_PGT_DUMP
issues with the mapping of the EFI runtime regions into that
table.
-config DEBUG_RODATA
- bool "Write protect kernel read-only data structures"
- default y
- depends on DEBUG_KERNEL
- ---help---
- Mark the kernel read-only data as write-protected in the pagetables,
- in order to catch accidental (and incorrect) writes to such const
- data. This is recommended so that we can catch kernel bugs sooner.
- If in doubt, say "Y".
-
config DEBUG_RODATA_TEST
- bool "Testcase for the DEBUG_RODATA feature"
- depends on DEBUG_RODATA
+ bool "Testcase for the marking rodata read-only"
default y
---help---
- This option enables a testcase for the DEBUG_RODATA
- feature as well as for the change_page_attr() infrastructure.
+ This option enables a testcase for the setting rodata read-only
+ as well as for the change_page_attr() infrastructure.
If in doubt, say "N"
config DEBUG_WX
bool "Warn on W+X mappings at boot"
- depends on DEBUG_RODATA
select X86_PTDUMP_CORE
---help---
Generate a warning if any W+X mappings are found at boot.
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index a7661c4..0702d25 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -49,7 +49,6 @@ typedef unsigned int u32;
/* This must be large enough to hold the entire setup */
u8 buf[SETUP_SECT_MAX*512];
-int is_big_kernel;
#define PECOFF_RELOC_RESERVE 0x20
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 028be48..265901a 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -288,7 +288,7 @@ CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_UTF8=y
CONFIG_PRINTK_TIME=y
# CONFIG_ENABLE_WARN_DEPRECATED is not set
-CONFIG_FRAME_WARN=2048
+CONFIG_FRAME_WARN=1024
CONFIG_MAGIC_SYSRQ=y
# CONFIG_UNUSED_SYMBOLS is not set
CONFIG_DEBUG_KERNEL=y
@@ -303,7 +303,6 @@ CONFIG_DEBUG_STACKOVERFLOW=y
# CONFIG_DEBUG_RODATA_TEST is not set
CONFIG_DEBUG_BOOT_PARAMS=y
CONFIG_OPTIMIZE_INLINING=y
-CONFIG_KEYS_DEBUG_PROC_KEYS=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_SELINUX=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index cb5b3ab..4f404a6 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -300,7 +300,6 @@ CONFIG_DEBUG_STACKOVERFLOW=y
# CONFIG_DEBUG_RODATA_TEST is not set
CONFIG_DEBUG_BOOT_PARAMS=y
CONFIG_OPTIMIZE_INLINING=y
-CONFIG_KEYS_DEBUG_PROC_KEYS=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_SELINUX=y
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 3633ad6..064c7e2 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -639,16 +639,11 @@ static int xts_aesni_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen)
{
struct aesni_xts_ctx *ctx = crypto_tfm_ctx(tfm);
- u32 *flags = &tfm->crt_flags;
int err;
- /* key consists of keys of equal size concatenated, therefore
- * the length must be even
- */
- if (keylen % 2) {
- *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
- }
+ err = xts_check_key(tfm, key, keylen);
+ if (err)
+ return err;
/* first half of xts-key is for crypt */
err = aes_set_key_common(tfm, ctx->raw_crypt_ctx, key, keylen / 2);
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 5c8b626..aa76cad 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -1503,13 +1503,9 @@ int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
u32 *flags = &tfm->crt_flags;
int err;
- /* key consists of keys of equal size concatenated, therefore
- * the length must be even
- */
- if (keylen % 2) {
- *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
- }
+ err = xts_check_key(tfm, key, keylen);
+ if (err)
+ return err;
/* first half of xts-key is for crypt */
err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index fca4595..50e6847 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -329,13 +329,9 @@ static int xts_cast6_setkey(struct crypto_tfm *tfm, const u8 *key,
u32 *flags = &tfm->crt_flags;
int err;
- /* key consists of keys of equal size concatenated, therefore
- * the length must be even
- */
- if (keylen % 2) {
- *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
- }
+ err = xts_check_key(tfm, key, keylen);
+ if (err)
+ return err;
/* first half of xts-key is for crypt */
err = __cast6_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 5dc3702..6f778d3 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -332,16 +332,11 @@ int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen)
{
struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
- u32 *flags = &tfm->crt_flags;
int err;
- /* key consists of keys of equal size concatenated, therefore
- * the length must be even
- */
- if (keylen % 2) {
- *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
- }
+ err = xts_check_key(tfm, key, keylen);
+ if (err)
+ return err;
/* first half of xts-key is for crypt */
err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 3643dd5..8943407 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -309,16 +309,11 @@ static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen)
{
struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
- u32 *flags = &tfm->crt_flags;
int err;
- /* key consists of keys of equal size concatenated, therefore
- * the length must be even
- */
- if (keylen % 2) {
- *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
- }
+ err = xts_check_key(tfm, key, keylen);
+ if (err)
+ return err;
/* first half of xts-key is for crypt */
err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);
diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c
index a841e97..a8a0224 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb.c
@@ -762,6 +762,38 @@ static int sha1_mb_async_digest(struct ahash_request *req)
return crypto_ahash_digest(mcryptd_req);
}
+static int sha1_mb_async_export(struct ahash_request *req, void *out)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_export(mcryptd_req, out);
+}
+
+static int sha1_mb_async_import(struct ahash_request *req, const void *in)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+ struct crypto_shash *child = mcryptd_ahash_child(mcryptd_tfm);
+ struct mcryptd_hash_request_ctx *rctx;
+ struct shash_desc *desc;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ rctx = ahash_request_ctx(mcryptd_req);
+ desc = &rctx->desc;
+ desc->tfm = child;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ return crypto_ahash_import(mcryptd_req, in);
+}
+
static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm)
{
struct mcryptd_ahash *mcryptd_tfm;
@@ -796,8 +828,11 @@ static struct ahash_alg sha1_mb_async_alg = {
.final = sha1_mb_async_final,
.finup = sha1_mb_async_finup,
.digest = sha1_mb_async_digest,
+ .export = sha1_mb_async_export,
+ .import = sha1_mb_async_import,
.halg = {
.digestsize = SHA1_DIGEST_SIZE,
+ .statesize = sizeof(struct sha1_hash_ctx),
.base = {
.cra_name = "sha1",
.cra_driver_name = "sha1_mb",
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S
index 1435acf..63a0d9c 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S
+++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S
@@ -186,7 +186,7 @@ len_is_0:
vpinsrd $1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0
vpinsrd $2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0
vpinsrd $3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0
- movl 4*32(state, idx, 4), DWORD_tmp
+ movl _args_digest+4*32(state, idx, 4), DWORD_tmp
vmovdqu %xmm0, _result_digest(job_rax)
movl DWORD_tmp, _result_digest+1*16(job_rax)
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 56d8a08..2ebb5e9 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -277,13 +277,9 @@ int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
u32 *flags = &tfm->crt_flags;
int err;
- /* key consists of keys of equal size concatenated, therefore
- * the length must be even
- */
- if (keylen % 2) {
- *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
- }
+ err = xts_check_key(tfm, key, keylen);
+ if (err)
+ return err;
/* first half of xts-key is for crypt */
err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 1a000f5..e79d93d 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -45,6 +45,8 @@ __visible void enter_from_user_mode(void)
CT_WARN_ON(ct_state() != CONTEXT_USER);
user_exit();
}
+#else
+static inline void enter_from_user_mode(void) {}
#endif
static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
@@ -85,17 +87,6 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
-#ifdef CONFIG_CONTEXT_TRACKING
- /*
- * If TIF_NOHZ is set, we are required to call user_exit() before
- * doing anything that could touch RCU.
- */
- if (work & _TIF_NOHZ) {
- enter_from_user_mode();
- work &= ~_TIF_NOHZ;
- }
-#endif
-
#ifdef CONFIG_SECCOMP
/*
* Do seccomp first -- it should minimize exposure of other
@@ -172,16 +163,6 @@ long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
BUG_ON(regs != task_pt_regs(current));
- /*
- * If we stepped into a sysenter/syscall insn, it trapped in
- * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
- * If user-mode had set TF itself, then it's still clear from
- * do_debug() and we need to set it again to restore the user
- * state. If we entered on the slow path, TF was already set.
- */
- if (work & _TIF_SINGLESTEP)
- regs->flags |= X86_EFLAGS_TF;
-
#ifdef CONFIG_SECCOMP
/*
* Call seccomp_phase2 before running the other hooks so that
@@ -354,6 +335,7 @@ __visible void do_syscall_64(struct pt_regs *regs)
struct thread_info *ti = pt_regs_to_thread_info(regs);
unsigned long nr = regs->orig_ax;
+ enter_from_user_mode();
local_irq_enable();
if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
@@ -376,19 +358,12 @@ __visible void do_syscall_64(struct pt_regs *regs)
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
/*
- * Does a 32-bit syscall. Called with IRQs on and does all entry and
- * exit work and returns with IRQs off. This function is extremely hot
- * in workloads that use it, and it's usually called from
+ * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does
+ * all entry and exit work and returns with IRQs off. This function is
+ * extremely hot in workloads that use it, and it's usually called from
* do_fast_syscall_32, so forcibly inline it to improve performance.
*/
-#ifdef CONFIG_X86_32
-/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */
-__visible
-#else
-/* 64-bit kernels use do_syscall_32_irqs_off() instead. */
-static
-#endif
-__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
{
struct thread_info *ti = pt_regs_to_thread_info(regs);
unsigned int nr = (unsigned int)regs->orig_ax;
@@ -423,14 +398,13 @@ __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
syscall_return_slowpath(regs);
}
-#ifdef CONFIG_X86_64
-/* Handles INT80 on 64-bit kernels */
-__visible void do_syscall_32_irqs_off(struct pt_regs *regs)
+/* Handles int $0x80 */
+__visible void do_int80_syscall_32(struct pt_regs *regs)
{
+ enter_from_user_mode();
local_irq_enable();
do_syscall_32_irqs_on(regs);
}
-#endif
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible long do_fast_syscall_32(struct pt_regs *regs)
@@ -450,12 +424,11 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
*/
regs->ip = landing_pad;
- /*
- * Fetch EBP from where the vDSO stashed it.
- *
- * WARNING: We are in CONTEXT_USER and RCU isn't paying attention!
- */
+ enter_from_user_mode();
+
local_irq_enable();
+
+ /* Fetch EBP from where the vDSO stashed it. */
if (
#ifdef CONFIG_X86_64
/*
@@ -473,9 +446,6 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
/* User code screwed up. */
local_irq_disable();
regs->ax = -EFAULT;
-#ifdef CONFIG_CONTEXT_TRACKING
- enter_from_user_mode();
-#endif
prepare_exit_to_usermode(regs);
return 0; /* Keep it simple: use IRET. */
}
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 4c52283..10868aa 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -287,7 +287,58 @@ need_resched:
END(resume_kernel)
#endif
- # SYSENTER call handler stub
+GLOBAL(__begin_SYSENTER_singlestep_region)
+/*
+ * All code from here through __end_SYSENTER_singlestep_region is subject
+ * to being single-stepped if a user program sets TF and executes SYSENTER.
+ * There is absolutely nothing that we can do to prevent this from happening
+ * (thanks Intel!). To keep our handling of this situation as simple as
+ * possible, we handle TF just like AC and NT, except that our #DB handler
+ * will ignore all of the single-step traps generated in this range.
+ */
+
+#ifdef CONFIG_XEN
+/*
+ * Xen doesn't set %esp to be precisely what the normal SYSENTER
+ * entry point expects, so fix it up before using the normal path.
+ */
+ENTRY(xen_sysenter_target)
+ addl $5*4, %esp /* remove xen-provided frame */
+ jmp sysenter_past_esp
+#endif
+
+/*
+ * 32-bit SYSENTER entry.
+ *
+ * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
+ * if X86_FEATURE_SEP is available. This is the preferred system call
+ * entry on 32-bit systems.
+ *
+ * The SYSENTER instruction, in principle, should *only* occur in the
+ * vDSO. In practice, a small number of Android devices were shipped
+ * with a copy of Bionic that inlined a SYSENTER instruction. This
+ * never happened in any of Google's Bionic versions -- it only happened
+ * in a narrow range of Intel-provided versions.
+ *
+ * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs.
+ * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
+ * SYSENTER does not save anything on the stack,
+ * and does not save old EIP (!!!), ESP, or EFLAGS.
+ *
+ * To avoid losing track of EFLAGS.VM (and thus potentially corrupting
+ * user and/or vm86 state), we explicitly disable the SYSENTER
+ * instruction in vm86 mode by reprogramming the MSRs.
+ *
+ * Arguments:
+ * eax system call number
+ * ebx arg1
+ * ecx arg2
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * ebp user stack
+ * 0(%ebp) arg6
+ */
ENTRY(entry_SYSENTER_32)
movl TSS_sysenter_sp0(%esp), %esp
sysenter_past_esp:
@@ -301,6 +352,29 @@ sysenter_past_esp:
SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
/*
+ * SYSENTER doesn't filter flags, so we need to clear NT, AC
+ * and TF ourselves. To save a few cycles, we can check whether
+ * either was set instead of doing an unconditional popfq.
+ * This needs to happen before enabling interrupts so that
+ * we don't get preempted with NT set.
+ *
+ * If TF is set, we will single-step all the way to here -- do_debug
+ * will ignore all the traps. (Yes, this is slow, but so is
+ * single-stepping in general. This allows us to avoid having
+ * a more complicated code to handle the case where a user program
+ * forces us to single-step through the SYSENTER entry code.)
+ *
+ * NB.: .Lsysenter_fix_flags is a label with the code under it moved
+ * out-of-line as an optimization: NT is unlikely to be set in the
+ * majority of the cases and instead of polluting the I$ unnecessarily,
+ * we're keeping that code behind a branch which will predict as
+ * not-taken and therefore its instructions won't be fetched.
+ */
+ testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp)
+ jnz .Lsysenter_fix_flags
+.Lsysenter_flags_fixed:
+
+ /*
* User mode is traced as though IRQs are on, and SYSENTER
* turned them off.
*/
@@ -326,6 +400,15 @@ sysenter_past_esp:
popl %eax /* pt_regs->ax */
/*
+ * Restore all flags except IF. (We restore IF separately because
+ * STI gives a one-instruction window in which we won't be interrupted,
+ * whereas POPF does not.)
+ */
+ addl $PT_EFLAGS-PT_DS, %esp /* point esp at pt_regs->flags */
+ btr $X86_EFLAGS_IF_BIT, (%esp)
+ popfl
+
+ /*
* Return back to the vDSO, which will pop ecx and edx.
* Don't bother with DS and ES (they already contain __USER_DS).
*/
@@ -338,28 +421,63 @@ sysenter_past_esp:
.popsection
_ASM_EXTABLE(1b, 2b)
PTGS_TO_GS_EX
+
+.Lsysenter_fix_flags:
+ pushl $X86_EFLAGS_FIXED
+ popfl
+ jmp .Lsysenter_flags_fixed
+GLOBAL(__end_SYSENTER_singlestep_region)
ENDPROC(entry_SYSENTER_32)
- # system call handler stub
+/*
+ * 32-bit legacy system call entry.
+ *
+ * 32-bit x86 Linux system calls traditionally used the INT $0x80
+ * instruction. INT $0x80 lands here.
+ *
+ * This entry point can be used by any 32-bit perform system calls.
+ * Instances of INT $0x80 can be found inline in various programs and
+ * libraries. It is also used by the vDSO's __kernel_vsyscall
+ * fallback for hardware that doesn't support a faster entry method.
+ * Restarted 32-bit system calls also fall back to INT $0x80
+ * regardless of what instruction was originally used to do the system
+ * call. (64-bit programs can use INT $0x80 as well, but they can
+ * only run on 64-bit kernels and therefore land in
+ * entry_INT80_compat.)
+ *
+ * This is considered a slow path. It is not used by most libc
+ * implementations on modern hardware except during process startup.
+ *
+ * Arguments:
+ * eax system call number
+ * ebx arg1
+ * ecx arg2
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * ebp arg6
+ */
ENTRY(entry_INT80_32)
ASM_CLAC
pushl %eax /* pt_regs->orig_ax */
SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
/*
- * User mode is traced as though IRQs are on. Unlike the 64-bit
- * case, INT80 is a trap gate on 32-bit kernels, so interrupts
- * are already on (unless user code is messing around with iopl).
+ * User mode is traced as though IRQs are on, and the interrupt gate
+ * turned them off.
*/
+ TRACE_IRQS_OFF
movl %esp, %eax
- call do_syscall_32_irqs_on
+ call do_int80_syscall_32
.Lsyscall_32_done:
restore_all:
TRACE_IRQS_IRET
restore_all_notrace:
#ifdef CONFIG_X86_ESPFIX32
+ ALTERNATIVE "jmp restore_nocheck", "", X86_BUG_ESPFIX
+
movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
/*
* Warning: PT_OLDSS(%esp) contains the wrong/random values if we
@@ -386,19 +504,6 @@ ENTRY(iret_exc )
#ifdef CONFIG_X86_ESPFIX32
ldt_ss:
-#ifdef CONFIG_PARAVIRT
- /*
- * The kernel can't run on a non-flat stack if paravirt mode
- * is active. Rather than try to fixup the high bits of
- * ESP, bypass this code entirely. This may break DOSemu
- * and/or Wine support in a paravirt VM, although the option
- * is still available to implement the setting of the high
- * 16-bits in the INTERRUPT_RETURN paravirt-op.
- */
- cmpl $0, pv_info+PARAVIRT_enabled
- jne restore_nocheck
-#endif
-
/*
* Setup and switch to ESPFIX stack
*
@@ -631,14 +736,6 @@ ENTRY(spurious_interrupt_bug)
END(spurious_interrupt_bug)
#ifdef CONFIG_XEN
-/*
- * Xen doesn't set %esp to be precisely what the normal SYSENTER
- * entry point expects, so fix it up before using the normal path.
- */
-ENTRY(xen_sysenter_target)
- addl $5*4, %esp /* remove xen-provided frame */
- jmp sysenter_past_esp
-
ENTRY(xen_hypervisor_callback)
pushl $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
@@ -938,51 +1035,48 @@ error_code:
jmp ret_from_exception
END(page_fault)
-/*
- * Debug traps and NMI can happen at the one SYSENTER instruction
- * that sets up the real kernel stack. Check here, since we can't
- * allow the wrong stack to be used.
- *
- * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
- * already pushed 3 words if it hits on the sysenter instruction:
- * eflags, cs and eip.
- *
- * We just load the right stack, and push the three (known) values
- * by hand onto the new stack - while updating the return eip past
- * the instruction that would have done it for sysenter.
- */
-.macro FIX_STACK offset ok label
- cmpw $__KERNEL_CS, 4(%esp)
- jne \ok
-\label:
- movl TSS_sysenter_sp0 + \offset(%esp), %esp
- pushfl
- pushl $__KERNEL_CS
- pushl $sysenter_past_esp
-.endm
-
ENTRY(debug)
+ /*
+ * #DB can happen at the first instruction of
+ * entry_SYSENTER_32 or in Xen's SYSENTER prologue. If this
+ * happens, then we will be running on a very small stack. We
+ * need to detect this condition and switch to the thread
+ * stack before calling any C code at all.
+ *
+ * If you edit this code, keep in mind that NMIs can happen in here.
+ */
ASM_CLAC
- cmpl $entry_SYSENTER_32, (%esp)
- jne debug_stack_correct
- FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
-debug_stack_correct:
pushl $-1 # mark this as an int
SAVE_ALL
- TRACE_IRQS_OFF
xorl %edx, %edx # error code 0
movl %esp, %eax # pt_regs pointer
+
+ /* Are we currently on the SYSENTER stack? */
+ PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
+ subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
+ cmpl $SIZEOF_SYSENTER_stack, %ecx
+ jb .Ldebug_from_sysenter_stack
+
+ TRACE_IRQS_OFF
+ call do_debug
+ jmp ret_from_exception
+
+.Ldebug_from_sysenter_stack:
+ /* We're on the SYSENTER stack. Switch off. */
+ movl %esp, %ebp
+ movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
+ TRACE_IRQS_OFF
call do_debug
+ movl %ebp, %esp
jmp ret_from_exception
END(debug)
/*
- * NMI is doubly nasty. It can happen _while_ we're handling
- * a debug fault, and the debug fault hasn't yet been able to
- * clear up the stack. So we first check whether we got an
- * NMI on the sysenter entry path, but after that we need to
- * check whether we got an NMI on the debug path where the debug
- * fault happened on the sysenter path.
+ * NMI is doubly nasty. It can happen on the first instruction of
+ * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
+ * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32
+ * switched stacks. We handle both conditions by simply checking whether we
+ * interrupted kernel code running on the SYSENTER stack.
*/
ENTRY(nmi)
ASM_CLAC
@@ -993,41 +1087,32 @@ ENTRY(nmi)
popl %eax
je nmi_espfix_stack
#endif
- cmpl $entry_SYSENTER_32, (%esp)
- je nmi_stack_fixup
- pushl %eax
- movl %esp, %eax
- /*
- * Do not access memory above the end of our stack page,
- * it might not exist.
- */
- andl $(THREAD_SIZE-1), %eax
- cmpl $(THREAD_SIZE-20), %eax
- popl %eax
- jae nmi_stack_correct
- cmpl $entry_SYSENTER_32, 12(%esp)
- je nmi_debug_stack_check
-nmi_stack_correct:
- pushl %eax
+
+ pushl %eax # pt_regs->orig_ax
SAVE_ALL
xorl %edx, %edx # zero error code
movl %esp, %eax # pt_regs pointer
+
+ /* Are we currently on the SYSENTER stack? */
+ PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
+ subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
+ cmpl $SIZEOF_SYSENTER_stack, %ecx
+ jb .Lnmi_from_sysenter_stack
+
+ /* Not on SYSENTER stack. */
call do_nmi
jmp restore_all_notrace
-nmi_stack_fixup:
- FIX_STACK 12, nmi_stack_correct, 1
- jmp nmi_stack_correct
-
-nmi_debug_stack_check:
- cmpw $__KERNEL_CS, 16(%esp)
- jne nmi_stack_correct
- cmpl $debug, (%esp)
- jb nmi_stack_correct
- cmpl $debug_esp_fix_insn, (%esp)
- ja nmi_stack_correct
- FIX_STACK 24, nmi_stack_correct, 1
- jmp nmi_stack_correct
+.Lnmi_from_sysenter_stack:
+ /*
+ * We're on the SYSENTER stack. Switch off. No one (not even debug)
+ * is using the thread stack right now, so it's safe for us to use it.
+ */
+ movl %esp, %ebp
+ movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
+ call do_nmi
+ movl %ebp, %esp
+ jmp restore_all_notrace
#ifdef CONFIG_X86_ESPFIX32
nmi_espfix_stack:
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 70eadb0..858b555 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -103,6 +103,16 @@ ENDPROC(native_usergs_sysret64)
/*
* 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
+ * This is the only entry point used for 64-bit system calls. The
+ * hardware interface is reasonably well designed and the register to
+ * argument mapping Linux uses fits well with the registers that are
+ * available when SYSCALL is used.
+ *
+ * SYSCALL instructions can be found inlined in libc implementations as
+ * well as some other programs and libraries. There are also a handful
+ * of SYSCALL instructions in the vDSO used, for example, as a
+ * clock_gettimeofday fallback.
+ *
* 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
* then loads new ss, cs, and rip from previously programmed MSRs.
* rflags gets masked by a value from another MSR (so CLD and CLAC
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index ff1c6d6..847f2f0 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -19,12 +19,21 @@
.section .entry.text, "ax"
/*
- * 32-bit SYSENTER instruction entry.
+ * 32-bit SYSENTER entry.
*
- * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
- * IF and VM in rflags are cleared (IOW: interrupts are off).
+ * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
+ * on 64-bit kernels running on Intel CPUs.
+ *
+ * The SYSENTER instruction, in principle, should *only* occur in the
+ * vDSO. In practice, a small number of Android devices were shipped
+ * with a copy of Bionic that inlined a SYSENTER instruction. This
+ * never happened in any of Google's Bionic versions -- it only happened
+ * in a narrow range of Intel-provided versions.
+ *
+ * SYSENTER loads SS, RSP, CS, and RIP from previously programmed MSRs.
+ * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
* SYSENTER does not save anything on the stack,
- * and does not save old rip (!!!) and rflags.
+ * and does not save old RIP (!!!), RSP, or RFLAGS.
*
* Arguments:
* eax system call number
@@ -35,10 +44,6 @@
* edi arg5
* ebp user stack
* 0(%ebp) arg6
- *
- * This is purely a fast path. For anything complicated we use the int 0x80
- * path below. We set up a complete hardware stack frame to share code
- * with the int 0x80 path.
*/
ENTRY(entry_SYSENTER_compat)
/* Interrupts are off on entry. */
@@ -66,8 +71,6 @@ ENTRY(entry_SYSENTER_compat)
*/
pushfq /* pt_regs->flags (except IF = 0) */
orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */
- ASM_CLAC /* Clear AC after saving FLAGS */
-
pushq $__USER32_CS /* pt_regs->cs */
xorq %r8,%r8
pushq %r8 /* pt_regs->ip = 0 (placeholder) */
@@ -90,19 +93,25 @@ ENTRY(entry_SYSENTER_compat)
cld
/*
- * Sysenter doesn't filter flags, so we need to clear NT
+ * SYSENTER doesn't filter flags, so we need to clear NT and AC
* ourselves. To save a few cycles, we can check whether
- * NT was set instead of doing an unconditional popfq.
+ * either was set instead of doing an unconditional popfq.
* This needs to happen before enabling interrupts so that
* we don't get preempted with NT set.
*
+ * If TF is set, we will single-step all the way to here -- do_debug
+ * will ignore all the traps. (Yes, this is slow, but so is
+ * single-stepping in general. This allows us to avoid having
+ * a more complicated code to handle the case where a user program
+ * forces us to single-step through the SYSENTER entry code.)
+ *
* NB.: .Lsysenter_fix_flags is a label with the code under it moved
* out-of-line as an optimization: NT is unlikely to be set in the
* majority of the cases and instead of polluting the I$ unnecessarily,
* we're keeping that code behind a branch which will predict as
* not-taken and therefore its instructions won't be fetched.
*/
- testl $X86_EFLAGS_NT, EFLAGS(%rsp)
+ testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp)
jnz .Lsysenter_fix_flags
.Lsysenter_flags_fixed:
@@ -123,20 +132,42 @@ ENTRY(entry_SYSENTER_compat)
pushq $X86_EFLAGS_FIXED
popfq
jmp .Lsysenter_flags_fixed
+GLOBAL(__end_entry_SYSENTER_compat)
ENDPROC(entry_SYSENTER_compat)
/*
- * 32-bit SYSCALL instruction entry.
+ * 32-bit SYSCALL entry.
+ *
+ * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
+ * on 64-bit kernels running on AMD CPUs.
+ *
+ * The SYSCALL instruction, in principle, should *only* occur in the
+ * vDSO. In practice, it appears that this really is the case.
+ * As evidence:
+ *
+ * - The calling convention for SYSCALL has changed several times without
+ * anyone noticing.
*
- * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
- * then loads new ss, cs, and rip from previously programmed MSRs.
- * rflags gets masked by a value from another MSR (so CLD and CLAC
- * are not needed). SYSCALL does not save anything on the stack
- * and does not change rsp.
+ * - Prior to the in-kernel X86_BUG_SYSRET_SS_ATTRS fixup, anything
+ * user task that did SYSCALL without immediately reloading SS
+ * would randomly crash.
*
- * Note: rflags saving+masking-with-MSR happens only in Long mode
+ * - Most programmers do not directly target AMD CPUs, and the 32-bit
+ * SYSCALL instruction does not exist on Intel CPUs. Even on AMD
+ * CPUs, Linux disables the SYSCALL instruction on 32-bit kernels
+ * because the SYSCALL instruction in legacy/native 32-bit mode (as
+ * opposed to compat mode) is sufficiently poorly designed as to be
+ * essentially unusable.
+ *
+ * 32-bit SYSCALL saves RIP to RCX, clears RFLAGS.RF, then saves
+ * RFLAGS to R11, then loads new SS, CS, and RIP from previously
+ * programmed MSRs. RFLAGS gets masked by a value from another MSR
+ * (so CLD and CLAC are not needed). SYSCALL does not save anything on
+ * the stack and does not change RSP.
+ *
+ * Note: RFLAGS saving+masking-with-MSR happens only in Long mode
* (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
- * Don't get confused: rflags saving+masking depends on Long Mode Active bit
+ * Don't get confused: RFLAGS saving+masking depends on Long Mode Active bit
* (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
* or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
*
@@ -236,7 +267,21 @@ sysret32_from_system_call:
END(entry_SYSCALL_compat)
/*
- * Emulated IA32 system calls via int 0x80.
+ * 32-bit legacy system call entry.
+ *
+ * 32-bit x86 Linux system calls traditionally used the INT $0x80
+ * instruction. INT $0x80 lands here.
+ *
+ * This entry point can be used by 32-bit and 64-bit programs to perform
+ * 32-bit system calls. Instances of INT $0x80 can be found inline in
+ * various programs and libraries. It is also used by the vDSO's
+ * __kernel_vsyscall fallback for hardware that doesn't support a faster
+ * entry method. Restarted 32-bit system calls also fall back to INT
+ * $0x80 regardless of what instruction was originally used to do the
+ * system call.
+ *
+ * This is considered a slow path. It is not used by most libc
+ * implementations on modern hardware except during process startup.
*
* Arguments:
* eax system call number
@@ -245,22 +290,14 @@ END(entry_SYSCALL_compat)
* edx arg3
* esi arg4
* edi arg5
- * ebp arg6 (note: not saved in the stack frame, should not be touched)
- *
- * Notes:
- * Uses the same stack frame as the x86-64 version.
- * All registers except eax must be saved (but ptrace may violate that).
- * Arguments are zero extended. For system calls that want sign extension and
- * take long arguments a wrapper is needed. Most calls can just be called
- * directly.
- * Assumes it is only called from user space and entered with interrupts off.
+ * ebp arg6
*/
-
ENTRY(entry_INT80_compat)
/*
* Interrupts are off on entry.
*/
PARAVIRT_ADJUST_EXCEPTION_FRAME
+ ASM_CLAC /* Do this early to minimize exposure */
SWAPGS
/*
@@ -299,7 +336,7 @@ ENTRY(entry_INT80_compat)
TRACE_IRQS_OFF
movq %rsp, %rdi
- call do_syscall_32_irqs_off
+ call do_int80_syscall_32
.Lsyscall_32_done:
/* Go back to user mode. */
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index cb713df..b30dd81 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -384,3 +384,5 @@
375 i386 membarrier sys_membarrier
376 i386 mlock2 sys_mlock2
377 i386 copy_file_range sys_copy_file_range
+378 i386 preadv2 sys_preadv2
+379 i386 pwritev2 sys_pwritev2
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 2e5b565..cac6d17 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -333,6 +333,8 @@
324 common membarrier sys_membarrier
325 common mlock2 sys_mlock2
326 common copy_file_range sys_copy_file_range
+327 64 preadv2 sys_preadv2
+328 64 pwritev2 sys_pwritev2
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
index abe961c..63a03bb 100644
--- a/arch/x86/entry/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -140,7 +140,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
fprintf(outfile, "#include <asm/vdso.h>\n");
fprintf(outfile, "\n");
fprintf(outfile,
- "static unsigned char raw_data[%lu] __page_aligned_data = {",
+ "static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {",
mapping_size);
for (j = 0; j < stripped_len; j++) {
if (j % 10 == 0)
diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile
new file mode 100644
index 0000000..fdfea15
--- /dev/null
+++ b/arch/x86/events/Makefile
@@ -0,0 +1,13 @@
+obj-y += core.o
+
+obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o amd/uncore.o
+obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o msr.o
+ifdef CONFIG_AMD_IOMMU
+obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o
+endif
+obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o
+obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/knc.o
+obj-$(CONFIG_CPU_SUP_INTEL) += intel/lbr.o intel/p4.o intel/p6.o intel/pt.o
+obj-$(CONFIG_CPU_SUP_INTEL) += intel/rapl.o msr.o
+obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o
+obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o intel/uncore_snbep.o
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/events/amd/core.c
index 5861053..049ada8d 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/events/amd/core.c
@@ -5,7 +5,7 @@
#include <linux/slab.h>
#include <asm/apicdef.h>
-#include "perf_event.h"
+#include "../perf_event.h"
static __initconst const u64 amd_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/events/amd/ibs.c
index 989d3c2..51087c2 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -14,7 +14,7 @@
#include <asm/apic.h>
-#include "perf_event.h"
+#include "../perf_event.h"
static u32 ibs_caps;
@@ -670,7 +670,7 @@ static __init int perf_event_ibs_init(void)
perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
- printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
+ pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps);
return 0;
}
@@ -774,14 +774,14 @@ static int setup_ibs_ctl(int ibs_eilvt_off)
pci_read_config_dword(cpu_cfg, IBSCTL, &value);
if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
pci_dev_put(cpu_cfg);
- printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
- "IBSCTL = 0x%08x\n", value);
+ pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n",
+ value);
return -EINVAL;
}
} while (1);
if (!nodes) {
- printk(KERN_DEBUG "No CPU node configured for IBS\n");
+ pr_debug("No CPU node configured for IBS\n");
return -ENODEV;
}
@@ -810,7 +810,7 @@ static void force_ibs_eilvt_setup(void)
preempt_enable();
if (offset == APIC_EILVT_NR_MAX) {
- printk(KERN_DEBUG "No EILVT entry available\n");
+ pr_debug("No EILVT entry available\n");
return;
}
diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/events/amd/iommu.c
index 97242a9..635e5eb 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_iommu.c
+++ b/arch/x86/events/amd/iommu.c
@@ -16,8 +16,8 @@
#include <linux/cpumask.h>
#include <linux/slab.h>
-#include "perf_event.h"
-#include "perf_event_amd_iommu.h"
+#include "../perf_event.h"
+#include "iommu.h"
#define COUNTER_SHIFT 16
diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.h b/arch/x86/events/amd/iommu.h
index 845d173..845d173 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_iommu.h
+++ b/arch/x86/events/amd/iommu.h
diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/events/amd/uncore.c
index 8836fc9..3db9569 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -538,7 +538,7 @@ static int __init amd_uncore_init(void)
if (ret)
goto fail_nb;
- printk(KERN_INFO "perf: AMD NB counters detected\n");
+ pr_info("perf: AMD NB counters detected\n");
ret = 0;
}
@@ -552,7 +552,7 @@ static int __init amd_uncore_init(void)
if (ret)
goto fail_l2;
- printk(KERN_INFO "perf: AMD L2I counters detected\n");
+ pr_info("perf: AMD L2I counters detected\n");
ret = 0;
}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/events/core.c
index 1b443db..9b6ad08 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/events/core.c
@@ -254,15 +254,16 @@ static bool check_hw_exists(void)
* We still allow the PMU driver to operate:
*/
if (bios_fail) {
- printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
- printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail);
+ pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
+ pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
+ reg_fail, val_fail);
}
return true;
msr_fail:
- printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
- printk("%sFailed to access perfctr msr (MSR %x is %Lx)\n",
+ pr_cont("Broken PMU hardware detected, using software events only.\n");
+ pr_info("%sFailed to access perfctr msr (MSR %x is %Lx)\n",
boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR,
reg, val_new);
@@ -596,6 +597,19 @@ void x86_pmu_disable_all(void)
}
}
+/*
+ * There may be PMI landing after enabled=0. The PMI hitting could be before or
+ * after disable_all.
+ *
+ * If PMI hits before disable_all, the PMU will be disabled in the NMI handler.
+ * It will not be re-enabled in the NMI handler again, because enabled=0. After
+ * handling the NMI, disable_all will be called, which will not change the
+ * state either. If PMI hits after disable_all, the PMU is already disabled
+ * before entering NMI handler. The NMI handler will not change the state
+ * either.
+ *
+ * So either situation is harmless.
+ */
static void x86_pmu_disable(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -2180,11 +2194,11 @@ static int backtrace_stack(void *data, char *name)
return 0;
}
-static void backtrace_address(void *data, unsigned long addr, int reliable)
+static int backtrace_address(void *data, unsigned long addr, int reliable)
{
struct perf_callchain_entry *entry = data;
- perf_callchain_store(entry, addr);
+ return perf_callchain_store(entry, addr);
}
static const struct stacktrace_ops backtrace_ops = {
diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/events/intel/bts.c
index 2cad71d..b99dc92 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -26,7 +26,7 @@
#include <asm-generic/sizes.h>
#include <asm/perf_event.h>
-#include "perf_event.h"
+#include "../perf_event.h"
struct bts_ctx {
struct perf_output_handle handle;
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/events/intel/core.c
index fed2ab1..68fa55b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/events/intel/core.c
@@ -18,7 +18,7 @@
#include <asm/hardirq.h>
#include <asm/apic.h>
-#include "perf_event.h"
+#include "../perf_event.h"
/*
* Intel PerfMon, used on Core and later.
@@ -1502,7 +1502,15 @@ static __initconst const u64 knl_hw_cache_extra_regs
};
/*
- * Use from PMIs where the LBRs are already disabled.
+ * Used from PMIs where the LBRs are already disabled.
+ *
+ * This function could be called consecutively. It is required to remain in
+ * disabled state if called consecutively.
+ *
+ * During consecutive calls, the same disable value will be written to related
+ * registers, so the PMU state remains unchanged. hw.state in
+ * intel_bts_disable_local will remain PERF_HES_STOPPED too in consecutive
+ * calls.
*/
static void __intel_pmu_disable_all(void)
{
@@ -1884,6 +1892,16 @@ again:
if (__test_and_clear_bit(62, (unsigned long *)&status)) {
handled++;
x86_pmu.drain_pebs(regs);
+ /*
+ * There are cases where, even though, the PEBS ovfl bit is set
+ * in GLOBAL_OVF_STATUS, the PEBS events may also have their
+ * overflow bits set for their counters. We must clear them
+ * here because they have been processed as exact samples in
+ * the drain_pebs() routine. They must not be processed again
+ * in the for_each_bit_set() loop for regular samples below.
+ */
+ status &= ~cpuc->pebs_enabled;
+ status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
}
/*
@@ -1929,7 +1947,10 @@ again:
goto again;
done:
- __intel_pmu_enable_all(0, true);
+ /* Only restore PMU state when it's active. See x86_pmu_disable(). */
+ if (cpuc->enabled)
+ __intel_pmu_enable_all(0, true);
+
/*
* Only unmask the NMI after the overflow counters
* have been reset. This avoids spurious NMIs on
@@ -3396,6 +3417,7 @@ __init int intel_pmu_init(void)
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
+ intel_pmu_pebs_data_source_nhm();
x86_add_quirk(intel_nehalem_quirk);
pr_cont("Nehalem events, ");
@@ -3459,6 +3481,7 @@ __init int intel_pmu_init(void)
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
+ intel_pmu_pebs_data_source_nhm();
pr_cont("Westmere events, ");
break;
@@ -3581,7 +3604,7 @@ __init int intel_pmu_init(void)
intel_pmu_lbr_init_hsw();
x86_pmu.event_constraints = intel_bdw_event_constraints;
- x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+ x86_pmu.pebs_constraints = intel_bdw_pebs_event_constraints;
x86_pmu.extra_regs = intel_snbep_extra_regs;
x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
x86_pmu.pebs_prec_dist = true;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/events/intel/cqm.c
index a316ca9..93cb412 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -7,7 +7,7 @@
#include <linux/perf_event.h>
#include <linux/slab.h>
#include <asm/cpu_device_id.h>
-#include "perf_event.h"
+#include "../perf_event.h"
#define MSR_IA32_PQR_ASSOC 0x0c8f
#define MSR_IA32_QM_CTR 0x0c8e
@@ -1244,15 +1244,12 @@ static struct pmu intel_cqm_pmu = {
static inline void cqm_pick_event_reader(int cpu)
{
- int phys_id = topology_physical_package_id(cpu);
- int i;
+ int reader;
- for_each_cpu(i, &cqm_cpumask) {
- if (phys_id == topology_physical_package_id(i))
- return; /* already got reader for this socket */
- }
-
- cpumask_set_cpu(cpu, &cqm_cpumask);
+ /* First online cpu in package becomes the reader */
+ reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu));
+ if (reader >= nr_cpu_ids)
+ cpumask_set_cpu(cpu, &cqm_cpumask);
}
static void intel_cqm_cpu_starting(unsigned int cpu)
@@ -1270,24 +1267,17 @@ static void intel_cqm_cpu_starting(unsigned int cpu)
static void intel_cqm_cpu_exit(unsigned int cpu)
{
- int phys_id = topology_physical_package_id(cpu);
- int i;
+ int target;
- /*
- * Is @cpu a designated cqm reader?
- */
+ /* Is @cpu the current cqm reader for this package ? */
if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
return;
- for_each_online_cpu(i) {
- if (i == cpu)
- continue;
+ /* Find another online reader in this package */
+ target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
- if (phys_id == topology_physical_package_id(i)) {
- cpumask_set_cpu(i, &cqm_cpumask);
- break;
- }
- }
+ if (target < nr_cpu_ids)
+ cpumask_set_cpu(target, &cqm_cpumask);
}
static int intel_cqm_cpu_notifier(struct notifier_block *nb,
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cstate.c b/arch/x86/events/intel/cstate.c
index 75a38b5..7946c42 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -89,7 +89,7 @@
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <asm/cpu_device_id.h>
-#include "perf_event.h"
+#include "../perf_event.h"
#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \
static ssize_t __cstate_##_var##_show(struct kobject *kobj, \
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/events/intel/ds.c
index 10602f0..ce7211a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -5,7 +5,7 @@
#include <asm/perf_event.h>
#include <asm/insn.h>
-#include "perf_event.h"
+#include "../perf_event.h"
/* The size of a BTS record in bytes: */
#define BTS_RECORD_SIZE 24
@@ -51,7 +51,8 @@ union intel_x86_pebs_dse {
#define OP_LH (P(OP, LOAD) | P(LVL, HIT))
#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
-static const u64 pebs_data_source[] = {
+/* Version for Sandy Bridge and later */
+static u64 pebs_data_source[] = {
P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */
OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
@@ -70,6 +71,14 @@ static const u64 pebs_data_source[] = {
OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */
};
+/* Patch up minor differences in the bits */
+void __init intel_pmu_pebs_data_source_nhm(void)
+{
+ pebs_data_source[0x05] = OP_LH | P(LVL, L3) | P(SNOOP, HIT);
+ pebs_data_source[0x06] = OP_LH | P(LVL, L3) | P(SNOOP, HITM);
+ pebs_data_source[0x07] = OP_LH | P(LVL, L3) | P(SNOOP, HITM);
+}
+
static u64 precise_store_data(u64 status)
{
union intel_x86_pebs_dse dse;
@@ -269,7 +278,7 @@ static int alloc_pebs_buffer(int cpu)
if (!x86_pmu.pebs)
return 0;
- buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node);
+ buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
if (unlikely(!buffer))
return -ENOMEM;
@@ -286,7 +295,7 @@ static int alloc_pebs_buffer(int cpu)
per_cpu(insn_buffer, cpu) = ibuffer;
}
- max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
+ max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
ds->pebs_buffer_base = (u64)(unsigned long)buffer;
ds->pebs_index = ds->pebs_buffer_base;
@@ -722,6 +731,30 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_bdw_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+ INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */
+ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+ EVENT_CONSTRAINT_END
+};
+
+
struct event_constraint intel_skl_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */
/* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
@@ -1319,19 +1352,28 @@ void __init intel_ds_init(void)
x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
+ x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE;
if (x86_pmu.pebs) {
char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-';
int format = x86_pmu.intel_cap.pebs_format;
switch (format) {
case 0:
- printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
+ pr_cont("PEBS fmt0%c, ", pebs_type);
x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
+ /*
+ * Using >PAGE_SIZE buffers makes the WRMSR to
+ * PERF_GLOBAL_CTRL in intel_pmu_enable_all()
+ * mysteriously hang on Core2.
+ *
+ * As a workaround, we don't do this.
+ */
+ x86_pmu.pebs_buffer_size = PAGE_SIZE;
x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
break;
case 1:
- printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
+ pr_cont("PEBS fmt1%c, ", pebs_type);
x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
break;
@@ -1351,7 +1393,7 @@ void __init intel_ds_init(void)
break;
default:
- printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
+ pr_cont("no PEBS fmt%d%c, ", format, pebs_type);
x86_pmu.pebs = 0;
}
}
diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/events/intel/knc.c
index 5b0c232..548d5f7 100644
--- a/arch/x86/kernel/cpu/perf_event_knc.c
+++ b/arch/x86/events/intel/knc.c
@@ -5,7 +5,7 @@
#include <asm/hardirq.h>
-#include "perf_event.h"
+#include "../perf_event.h"
static const u64 knc_perfmon_event_map[] =
{
@@ -263,7 +263,9 @@ again:
goto again;
done:
- knc_pmu_enable_all(0);
+ /* Only restore PMU state when it's active. See x86_pmu_disable(). */
+ if (cpuc->enabled)
+ knc_pmu_enable_all(0);
return handled;
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/events/intel/lbr.c
index 653f88d..69dd118 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -5,7 +5,7 @@
#include <asm/msr.h>
#include <asm/insn.h>
-#include "perf_event.h"
+#include "../perf_event.h"
enum {
LBR_FORMAT_32 = 0x00,
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/events/intel/p4.c
index f2e5678..0a5ede1 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/events/intel/p4.c
@@ -13,7 +13,7 @@
#include <asm/hardirq.h>
#include <asm/apic.h>
-#include "perf_event.h"
+#include "../perf_event.h"
#define P4_CNTR_LIMIT 3
/*
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/events/intel/p6.c
index 7c1a0c0..1f5c47a 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/events/intel/p6.c
@@ -1,7 +1,7 @@
#include <linux/perf_event.h>
#include <linux/types.h>
-#include "perf_event.h"
+#include "../perf_event.h"
/*
* Not sure about some of these
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/events/intel/pt.c
index c0bbd10..6af7cf7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -29,8 +29,8 @@
#include <asm/io.h>
#include <asm/intel_pt.h>
-#include "perf_event.h"
-#include "intel_pt.h"
+#include "../perf_event.h"
+#include "pt.h"
static DEFINE_PER_CPU(struct pt, pt_ctx);
diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/events/intel/pt.h
index 336878a..336878a 100644
--- a/arch/x86/kernel/cpu/intel_pt.h
+++ b/arch/x86/events/intel/pt.h
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/events/intel/rapl.c
index 24a351a..b834a3f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -44,11 +44,14 @@
* the duration of the measurement. Tools may use a function such as
* ldexp(raw_count, -32);
*/
+
+#define pr_fmt(fmt) "RAPL PMU: " fmt
+
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <asm/cpu_device_id.h>
-#include "perf_event.h"
+#include "../perf_event.h"
/*
* RAPL energy status counters
@@ -107,7 +110,7 @@ static ssize_t __rapl_##_var##_show(struct kobject *kobj, \
static struct kobj_attribute format_attr_##_var = \
__ATTR(_name, 0444, __rapl_##_var##_show, NULL)
-#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
+#define RAPL_CNTR_WIDTH 32
#define RAPL_EVENT_ATTR_STR(_name, v, str) \
static struct perf_pmu_events_attr event_attr_##v = { \
@@ -117,23 +120,33 @@ static struct perf_pmu_events_attr event_attr_##v = { \
};
struct rapl_pmu {
- spinlock_t lock;
- int n_active; /* number of active events */
- struct list_head active_list;
- struct pmu *pmu; /* pointer to rapl_pmu_class */
- ktime_t timer_interval; /* in ktime_t unit */
- struct hrtimer hrtimer;
+ raw_spinlock_t lock;
+ int n_active;
+ int cpu;
+ struct list_head active_list;
+ struct pmu *pmu;
+ ktime_t timer_interval;
+ struct hrtimer hrtimer;
};
-static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */
-static struct pmu rapl_pmu_class;
+struct rapl_pmus {
+ struct pmu pmu;
+ unsigned int maxpkg;
+ struct rapl_pmu *pmus[];
+};
+
+ /* 1/2^hw_unit Joule */
+static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
+static struct rapl_pmus *rapl_pmus;
static cpumask_t rapl_cpu_mask;
-static int rapl_cntr_mask;
+static unsigned int rapl_cntr_mask;
+static u64 rapl_timer_ms;
-static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
-static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
+static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
+{
+ return rapl_pmus->pmus[topology_logical_package_id(cpu)];
+}
-static struct x86_pmu_quirk *rapl_quirks;
static inline u64 rapl_read_counter(struct perf_event *event)
{
u64 raw;
@@ -141,19 +154,10 @@ static inline u64 rapl_read_counter(struct perf_event *event)
return raw;
}
-#define rapl_add_quirk(func_) \
-do { \
- static struct x86_pmu_quirk __quirk __initdata = { \
- .func = func_, \
- }; \
- __quirk.next = rapl_quirks; \
- rapl_quirks = &__quirk; \
-} while (0)
-
static inline u64 rapl_scale(u64 v, int cfg)
{
if (cfg > NR_RAPL_DOMAINS) {
- pr_warn("invalid domain %d, failed to scale data\n", cfg);
+ pr_warn("Invalid domain %d, failed to scale data\n", cfg);
return v;
}
/*
@@ -206,27 +210,21 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu)
HRTIMER_MODE_REL_PINNED);
}
-static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
-{
- hrtimer_cancel(&pmu->hrtimer);
-}
-
static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
- struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
+ struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
struct perf_event *event;
unsigned long flags;
if (!pmu->n_active)
return HRTIMER_NORESTART;
- spin_lock_irqsave(&pmu->lock, flags);
+ raw_spin_lock_irqsave(&pmu->lock, flags);
- list_for_each_entry(event, &pmu->active_list, active_entry) {
+ list_for_each_entry(event, &pmu->active_list, active_entry)
rapl_event_update(event);
- }
- spin_unlock_irqrestore(&pmu->lock, flags);
+ raw_spin_unlock_irqrestore(&pmu->lock, flags);
hrtimer_forward_now(hrtimer, pmu->timer_interval);
@@ -260,28 +258,28 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
- struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
+ struct rapl_pmu *pmu = event->pmu_private;
unsigned long flags;
- spin_lock_irqsave(&pmu->lock, flags);
+ raw_spin_lock_irqsave(&pmu->lock, flags);
__rapl_pmu_event_start(pmu, event);
- spin_unlock_irqrestore(&pmu->lock, flags);
+ raw_spin_unlock_irqrestore(&pmu->lock, flags);
}
static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
- struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
+ struct rapl_pmu *pmu = event->pmu_private;
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;
- spin_lock_irqsave(&pmu->lock, flags);
+ raw_spin_lock_irqsave(&pmu->lock, flags);
/* mark event as deactivated and stopped */
if (!(hwc->state & PERF_HES_STOPPED)) {
WARN_ON_ONCE(pmu->n_active <= 0);
pmu->n_active--;
if (pmu->n_active == 0)
- rapl_stop_hrtimer(pmu);
+ hrtimer_cancel(&pmu->hrtimer);
list_del(&event->active_entry);
@@ -299,23 +297,23 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode)
hwc->state |= PERF_HES_UPTODATE;
}
- spin_unlock_irqrestore(&pmu->lock, flags);
+ raw_spin_unlock_irqrestore(&pmu->lock, flags);
}
static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
- struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
+ struct rapl_pmu *pmu = event->pmu_private;
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;
- spin_lock_irqsave(&pmu->lock, flags);
+ raw_spin_lock_irqsave(&pmu->lock, flags);
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
if (mode & PERF_EF_START)
__rapl_pmu_event_start(pmu, event);
- spin_unlock_irqrestore(&pmu->lock, flags);
+ raw_spin_unlock_irqrestore(&pmu->lock, flags);
return 0;
}
@@ -329,15 +327,19 @@ static int rapl_pmu_event_init(struct perf_event *event)
{
u64 cfg = event->attr.config & RAPL_EVENT_MASK;
int bit, msr, ret = 0;
+ struct rapl_pmu *pmu;
/* only look at RAPL events */
- if (event->attr.type != rapl_pmu_class.type)
+ if (event->attr.type != rapl_pmus->pmu.type)
return -ENOENT;
/* check only supported bits are set */
if (event->attr.config & ~RAPL_EVENT_MASK)
return -EINVAL;
+ if (event->cpu < 0)
+ return -EINVAL;
+
/*
* check event is known (determines counter)
*/
@@ -376,6 +378,9 @@ static int rapl_pmu_event_init(struct perf_event *event)
return -EINVAL;
/* must be done before validate_group */
+ pmu = cpu_to_rapl_pmu(event->cpu);
+ event->cpu = pmu->cpu;
+ event->pmu_private = pmu;
event->hw.event_base = msr;
event->hw.config = cfg;
event->hw.idx = bit;
@@ -506,139 +511,62 @@ const struct attribute_group *rapl_attr_groups[] = {
NULL,
};
-static struct pmu rapl_pmu_class = {
- .attr_groups = rapl_attr_groups,
- .task_ctx_nr = perf_invalid_context, /* system-wide only */
- .event_init = rapl_pmu_event_init,
- .add = rapl_pmu_event_add, /* must have */
- .del = rapl_pmu_event_del, /* must have */
- .start = rapl_pmu_event_start,
- .stop = rapl_pmu_event_stop,
- .read = rapl_pmu_event_read,
-};
-
static void rapl_cpu_exit(int cpu)
{
- struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
- int i, phys_id = topology_physical_package_id(cpu);
- int target = -1;
+ struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
+ int target;
- /* find a new cpu on same package */
- for_each_online_cpu(i) {
- if (i == cpu)
- continue;
- if (phys_id == topology_physical_package_id(i)) {
- target = i;
- break;
- }
- }
- /*
- * clear cpu from cpumask
- * if was set in cpumask and still some cpu on package,
- * then move to new cpu
- */
- if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
- cpumask_set_cpu(target, &rapl_cpu_mask);
+ /* Check if exiting cpu is used for collecting rapl events */
+ if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
+ return;
- WARN_ON(cpumask_empty(&rapl_cpu_mask));
- /*
- * migrate events and context to new cpu
- */
- if (target >= 0)
- perf_pmu_migrate_context(pmu->pmu, cpu, target);
+ pmu->cpu = -1;
+ /* Find a new cpu to collect rapl events */
+ target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
- /* cancel overflow polling timer for CPU */
- rapl_stop_hrtimer(pmu);
+ /* Migrate rapl events to the new target */
+ if (target < nr_cpu_ids) {
+ cpumask_set_cpu(target, &rapl_cpu_mask);
+ pmu->cpu = target;
+ perf_pmu_migrate_context(pmu->pmu, cpu, target);
+ }
}
static void rapl_cpu_init(int cpu)
{
- int i, phys_id = topology_physical_package_id(cpu);
+ struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
+ int target;
- /* check if phys_is is already covered */
- for_each_cpu(i, &rapl_cpu_mask) {
- if (phys_id == topology_physical_package_id(i))
- return;
- }
- /* was not found, so add it */
- cpumask_set_cpu(cpu, &rapl_cpu_mask);
-}
-
-static __init void rapl_hsw_server_quirk(void)
-{
/*
- * DRAM domain on HSW server has fixed energy unit which can be
- * different than the unit from power unit MSR.
- * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
- * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
+ * Check if there is an online cpu in the package which collects rapl
+ * events already.
*/
- rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
+ target = cpumask_any_and(&rapl_cpu_mask, topology_core_cpumask(cpu));
+ if (target < nr_cpu_ids)
+ return;
+
+ cpumask_set_cpu(cpu, &rapl_cpu_mask);
+ pmu->cpu = cpu;
}
static int rapl_cpu_prepare(int cpu)
{
- struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
- int phys_id = topology_physical_package_id(cpu);
- u64 ms;
+ struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
if (pmu)
return 0;
- if (phys_id < 0)
- return -1;
-
pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
if (!pmu)
- return -1;
- spin_lock_init(&pmu->lock);
+ return -ENOMEM;
+ raw_spin_lock_init(&pmu->lock);
INIT_LIST_HEAD(&pmu->active_list);
-
- pmu->pmu = &rapl_pmu_class;
-
- /*
- * use reference of 200W for scaling the timeout
- * to avoid missing counter overflows.
- * 200W = 200 Joules/sec
- * divide interval by 2 to avoid lockstep (2 * 100)
- * if hw unit is 32, then we use 2 ms 1/200/2
- */
- if (rapl_hw_unit[0] < 32)
- ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1));
- else
- ms = 2;
-
- pmu->timer_interval = ms_to_ktime(ms);
-
+ pmu->pmu = &rapl_pmus->pmu;
+ pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
+ pmu->cpu = -1;
rapl_hrtimer_init(pmu);
-
- /* set RAPL pmu for this cpu for now */
- per_cpu(rapl_pmu, cpu) = pmu;
- per_cpu(rapl_pmu_to_free, cpu) = NULL;
-
- return 0;
-}
-
-static void rapl_cpu_kfree(int cpu)
-{
- struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);
-
- kfree(pmu);
-
- per_cpu(rapl_pmu_to_free, cpu) = NULL;
-}
-
-static int rapl_cpu_dying(int cpu)
-{
- struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
-
- if (!pmu)
- return 0;
-
- per_cpu(rapl_pmu, cpu) = NULL;
-
- per_cpu(rapl_pmu_to_free, cpu) = pmu;
-
+ rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu;
return 0;
}
@@ -651,28 +579,20 @@ static int rapl_cpu_notifier(struct notifier_block *self,
case CPU_UP_PREPARE:
rapl_cpu_prepare(cpu);
break;
- case CPU_STARTING:
- rapl_cpu_init(cpu);
- break;
- case CPU_UP_CANCELED:
- case CPU_DYING:
- rapl_cpu_dying(cpu);
- break;
+
+ case CPU_DOWN_FAILED:
case CPU_ONLINE:
- case CPU_DEAD:
- rapl_cpu_kfree(cpu);
+ rapl_cpu_init(cpu);
break;
+
case CPU_DOWN_PREPARE:
rapl_cpu_exit(cpu);
break;
- default:
- break;
}
-
return NOTIFY_OK;
}
-static int rapl_check_hw_unit(void)
+static int rapl_check_hw_unit(bool apply_quirk)
{
u64 msr_rapl_power_unit_bits;
int i;
@@ -683,28 +603,107 @@ static int rapl_check_hw_unit(void)
for (i = 0; i < NR_RAPL_DOMAINS; i++)
rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
+ /*
+ * DRAM domain on HSW server and KNL has fixed energy unit which can be
+ * different than the unit from power unit MSR. See
+ * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
+ * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
+ */
+ if (apply_quirk)
+ rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
+
+ /*
+ * Calculate the timer rate:
+ * Use reference of 200W for scaling the timeout to avoid counter
+ * overflows. 200W = 200 Joules/sec
+ * Divide interval by 2 to avoid lockstep (2 * 100)
+ * if hw unit is 32, then we use 2 ms 1/200/2
+ */
+ rapl_timer_ms = 2;
+ if (rapl_hw_unit[0] < 32) {
+ rapl_timer_ms = (1000 / (2 * 100));
+ rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
+ }
+ return 0;
+}
+
+static void __init rapl_advertise(void)
+{
+ int i;
+
+ pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
+ hweight32(rapl_cntr_mask), rapl_timer_ms);
+
+ for (i = 0; i < NR_RAPL_DOMAINS; i++) {
+ if (rapl_cntr_mask & (1 << i)) {
+ pr_info("hw unit of domain %s 2^-%d Joules\n",
+ rapl_domain_names[i], rapl_hw_unit[i]);
+ }
+ }
+}
+
+static int __init rapl_prepare_cpus(void)
+{
+ unsigned int cpu, pkg;
+ int ret;
+
+ for_each_online_cpu(cpu) {
+ pkg = topology_logical_package_id(cpu);
+ if (rapl_pmus->pmus[pkg])
+ continue;
+
+ ret = rapl_cpu_prepare(cpu);
+ if (ret)
+ return ret;
+ rapl_cpu_init(cpu);
+ }
+ return 0;
+}
+
+static void __init cleanup_rapl_pmus(void)
+{
+ int i;
+
+ for (i = 0; i < rapl_pmus->maxpkg; i++)
+ kfree(rapl_pmus->pmus + i);
+ kfree(rapl_pmus);
+}
+
+static int __init init_rapl_pmus(void)
+{
+ int maxpkg = topology_max_packages();
+ size_t size;
+
+ size = sizeof(*rapl_pmus) + maxpkg * sizeof(struct rapl_pmu *);
+ rapl_pmus = kzalloc(size, GFP_KERNEL);
+ if (!rapl_pmus)
+ return -ENOMEM;
+
+ rapl_pmus->maxpkg = maxpkg;
+ rapl_pmus->pmu.attr_groups = rapl_attr_groups;
+ rapl_pmus->pmu.task_ctx_nr = perf_invalid_context;
+ rapl_pmus->pmu.event_init = rapl_pmu_event_init;
+ rapl_pmus->pmu.add = rapl_pmu_event_add;
+ rapl_pmus->pmu.del = rapl_pmu_event_del;
+ rapl_pmus->pmu.start = rapl_pmu_event_start;
+ rapl_pmus->pmu.stop = rapl_pmu_event_stop;
+ rapl_pmus->pmu.read = rapl_pmu_event_read;
return 0;
}
-static const struct x86_cpu_id rapl_cpu_match[] = {
+static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
[0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
[1] = {},
};
static int __init rapl_pmu_init(void)
{
- struct rapl_pmu *pmu;
- int cpu, ret;
- struct x86_pmu_quirk *quirk;
- int i;
+ bool apply_quirk = false;
+ int ret;
- /*
- * check for Intel processor family 6
- */
if (!x86_match_cpu(rapl_cpu_match))
- return 0;
+ return -ENODEV;
- /* check supported CPU */
switch (boot_cpu_data.x86_model) {
case 42: /* Sandy Bridge */
case 58: /* Ivy Bridge */
@@ -712,7 +711,7 @@ static int __init rapl_pmu_init(void)
rapl_pmu_events_group.attrs = rapl_events_cln_attr;
break;
case 63: /* Haswell-Server */
- rapl_add_quirk(rapl_hsw_server_quirk);
+ apply_quirk = true;
rapl_cntr_mask = RAPL_IDX_SRV;
rapl_pmu_events_group.attrs = rapl_events_srv_attr;
break;
@@ -728,56 +727,41 @@ static int __init rapl_pmu_init(void)
rapl_pmu_events_group.attrs = rapl_events_srv_attr;
break;
case 87: /* Knights Landing */
- rapl_add_quirk(rapl_hsw_server_quirk);
+ apply_quirk = true;
rapl_cntr_mask = RAPL_IDX_KNL;
rapl_pmu_events_group.attrs = rapl_events_knl_attr;
-
+ break;
default:
- /* unsupported */
- return 0;
+ return -ENODEV;
}
- ret = rapl_check_hw_unit();
+
+ ret = rapl_check_hw_unit(apply_quirk);
if (ret)
return ret;
- /* run cpu model quirks */
- for (quirk = rapl_quirks; quirk; quirk = quirk->next)
- quirk->func();
- cpu_notifier_register_begin();
+ ret = init_rapl_pmus();
+ if (ret)
+ return ret;
- for_each_online_cpu(cpu) {
- ret = rapl_cpu_prepare(cpu);
- if (ret)
- goto out;
- rapl_cpu_init(cpu);
- }
+ cpu_notifier_register_begin();
- __perf_cpu_notifier(rapl_cpu_notifier);
+ ret = rapl_prepare_cpus();
+ if (ret)
+ goto out;
- ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
- if (WARN_ON(ret)) {
- pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
- cpu_notifier_register_done();
- return -1;
- }
+ ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
+ if (ret)
+ goto out;
- pmu = __this_cpu_read(rapl_pmu);
+ __perf_cpu_notifier(rapl_cpu_notifier);
+ cpu_notifier_register_done();
+ rapl_advertise();
+ return 0;
- pr_info("RAPL PMU detected,"
- " API unit is 2^-32 Joules,"
- " %d fixed counters"
- " %llu ms ovfl timer\n",
- hweight32(rapl_cntr_mask),
- ktime_to_ms(pmu->timer_interval));
- for (i = 0; i < NR_RAPL_DOMAINS; i++) {
- if (rapl_cntr_mask & (1 << i)) {
- pr_info("hw unit of domain %s 2^-%d Joules\n",
- rapl_domain_names[i], rapl_hw_unit[i]);
- }
- }
out:
+ pr_warn("Initialization failed (%d), disabled\n", ret);
+ cleanup_rapl_pmus();
cpu_notifier_register_done();
-
- return 0;
+ return ret;
}
device_initcall(rapl_pmu_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/events/intel/uncore.c
index 3bf41d4..7012d18 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1,4 +1,4 @@
-#include "perf_event_intel_uncore.h"
+#include "uncore.h"
static struct intel_uncore_type *empty_uncore[] = { NULL, };
struct intel_uncore_type **uncore_msr_uncores = empty_uncore;
@@ -9,9 +9,9 @@ struct pci_driver *uncore_pci_driver;
/* pci bus to socket mapping */
DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
-struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
+struct pci_extra_dev *uncore_extra_pci_dev;
+static int max_packages;
-static DEFINE_RAW_SPINLOCK(uncore_box_lock);
/* mask of cpus that collect uncore events */
static cpumask_t uncore_cpu_mask;
@@ -21,7 +21,7 @@ static struct event_constraint uncore_constraint_fixed =
struct event_constraint uncore_constraint_empty =
EVENT_CONSTRAINT(0, 0, 0);
-int uncore_pcibus_to_physid(struct pci_bus *bus)
+static int uncore_pcibus_to_physid(struct pci_bus *bus)
{
struct pci2phy_map *map;
int phys_id = -1;
@@ -38,6 +38,16 @@ int uncore_pcibus_to_physid(struct pci_bus *bus)
return phys_id;
}
+static void uncore_free_pcibus_map(void)
+{
+ struct pci2phy_map *map, *tmp;
+
+ list_for_each_entry_safe(map, tmp, &pci2phy_map_head, list) {
+ list_del(&map->list);
+ kfree(map);
+ }
+}
+
struct pci2phy_map *__find_pci2phy_map(int segment)
{
struct pci2phy_map *map, *alloc = NULL;
@@ -82,43 +92,9 @@ ssize_t uncore_event_show(struct kobject *kobj,
return sprintf(buf, "%s", event->config);
}
-struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
-{
- return container_of(event->pmu, struct intel_uncore_pmu, pmu);
-}
-
struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
{
- struct intel_uncore_box *box;
-
- box = *per_cpu_ptr(pmu->box, cpu);
- if (box)
- return box;
-
- raw_spin_lock(&uncore_box_lock);
- /* Recheck in lock to handle races. */
- if (*per_cpu_ptr(pmu->box, cpu))
- goto out;
- list_for_each_entry(box, &pmu->box_list, list) {
- if (box->phys_id == topology_physical_package_id(cpu)) {
- atomic_inc(&box->refcnt);
- *per_cpu_ptr(pmu->box, cpu) = box;
- break;
- }
- }
-out:
- raw_spin_unlock(&uncore_box_lock);
-
- return *per_cpu_ptr(pmu->box, cpu);
-}
-
-struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
-{
- /*
- * perf core schedules event on the basis of cpu, uncore events are
- * collected by one of the cpus inside a physical package.
- */
- return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
+ return pmu->boxes[topology_logical_package_id(cpu)];
}
u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
@@ -207,7 +183,8 @@ u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx)
return config;
}
-static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx)
+static void uncore_assign_hw_event(struct intel_uncore_box *box,
+ struct perf_event *event, int idx)
{
struct hw_perf_event *hwc = &event->hw;
@@ -302,24 +279,25 @@ static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box)
box->hrtimer.function = uncore_pmu_hrtimer;
}
-static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int node)
+static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
+ int node)
{
+ int i, size, numshared = type->num_shared_regs ;
struct intel_uncore_box *box;
- int i, size;
- size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg);
+ size = sizeof(*box) + numshared * sizeof(struct intel_uncore_extra_reg);
box = kzalloc_node(size, GFP_KERNEL, node);
if (!box)
return NULL;
- for (i = 0; i < type->num_shared_regs; i++)
+ for (i = 0; i < numshared; i++)
raw_spin_lock_init(&box->shared_regs[i].lock);
uncore_pmu_init_hrtimer(box);
- atomic_set(&box->refcnt, 1);
box->cpu = -1;
- box->phys_id = -1;
+ box->pci_phys_id = -1;
+ box->pkgid = -1;
/* set default hrtimer timeout */
box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL;
@@ -341,7 +319,8 @@ static bool is_uncore_event(struct perf_event *event)
}
static int
-uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp)
+uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader,
+ bool dogrp)
{
struct perf_event *event;
int n, max_count;
@@ -402,7 +381,8 @@ uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *eve
return &type->unconstrainted;
}
-static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
+static void uncore_put_event_constraint(struct intel_uncore_box *box,
+ struct perf_event *event)
{
if (box->pmu->type->ops->put_constraint)
box->pmu->type->ops->put_constraint(box, event);
@@ -582,7 +562,7 @@ static void uncore_pmu_event_del(struct perf_event *event, int flags)
if (event == box->event_list[i]) {
uncore_put_event_constraint(box, event);
- while (++i < box->n_events)
+ for (++i; i < box->n_events; i++)
box->event_list[i - 1] = box->event_list[i];
--box->n_events;
@@ -676,6 +656,7 @@ static int uncore_pmu_event_init(struct perf_event *event)
if (!box || box->cpu < 0)
return -EINVAL;
event->cpu = box->cpu;
+ event->pmu_private = box;
event->hw.idx = -1;
event->hw.last_tag = ~0ULL;
@@ -760,64 +741,110 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
}
ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
+ if (!ret)
+ pmu->registered = true;
return ret;
}
+static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu)
+{
+ if (!pmu->registered)
+ return;
+ perf_pmu_unregister(&pmu->pmu);
+ pmu->registered = false;
+}
+
+static void __init __uncore_exit_boxes(struct intel_uncore_type *type, int cpu)
+{
+ struct intel_uncore_pmu *pmu = type->pmus;
+ struct intel_uncore_box *box;
+ int i, pkg;
+
+ if (pmu) {
+ pkg = topology_physical_package_id(cpu);
+ for (i = 0; i < type->num_boxes; i++, pmu++) {
+ box = pmu->boxes[pkg];
+ if (box)
+ uncore_box_exit(box);
+ }
+ }
+}
+
+static void __init uncore_exit_boxes(void *dummy)
+{
+ struct intel_uncore_type **types;
+
+ for (types = uncore_msr_uncores; *types; types++)
+ __uncore_exit_boxes(*types++, smp_processor_id());
+}
+
+static void uncore_free_boxes(struct intel_uncore_pmu *pmu)
+{
+ int pkg;
+
+ for (pkg = 0; pkg < max_packages; pkg++)
+ kfree(pmu->boxes[pkg]);
+ kfree(pmu->boxes);
+}
+
static void __init uncore_type_exit(struct intel_uncore_type *type)
{
+ struct intel_uncore_pmu *pmu = type->pmus;
int i;
- for (i = 0; i < type->num_boxes; i++)
- free_percpu(type->pmus[i].box);
- kfree(type->pmus);
- type->pmus = NULL;
+ if (pmu) {
+ for (i = 0; i < type->num_boxes; i++, pmu++) {
+ uncore_pmu_unregister(pmu);
+ uncore_free_boxes(pmu);
+ }
+ kfree(type->pmus);
+ type->pmus = NULL;
+ }
kfree(type->events_group);
type->events_group = NULL;
}
static void __init uncore_types_exit(struct intel_uncore_type **types)
{
- int i;
- for (i = 0; types[i]; i++)
- uncore_type_exit(types[i]);
+ for (; *types; types++)
+ uncore_type_exit(*types);
}
-static int __init uncore_type_init(struct intel_uncore_type *type)
+static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
{
struct intel_uncore_pmu *pmus;
struct attribute_group *attr_group;
struct attribute **attrs;
+ size_t size;
int i, j;
pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL);
if (!pmus)
return -ENOMEM;
- type->pmus = pmus;
+ size = max_packages * sizeof(struct intel_uncore_box *);
+ for (i = 0; i < type->num_boxes; i++) {
+ pmus[i].func_id = setid ? i : -1;
+ pmus[i].pmu_idx = i;
+ pmus[i].type = type;
+ pmus[i].boxes = kzalloc(size, GFP_KERNEL);
+ if (!pmus[i].boxes)
+ return -ENOMEM;
+ }
+
+ type->pmus = pmus;
type->unconstrainted = (struct event_constraint)
__EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
0, type->num_counters, 0, 0);
- for (i = 0; i < type->num_boxes; i++) {
- pmus[i].func_id = -1;
- pmus[i].pmu_idx = i;
- pmus[i].type = type;
- INIT_LIST_HEAD(&pmus[i].box_list);
- pmus[i].box = alloc_percpu(struct intel_uncore_box *);
- if (!pmus[i].box)
- goto fail;
- }
-
if (type->event_descs) {
- i = 0;
- while (type->event_descs[i].attr.attr.name)
- i++;
+ for (i = 0; type->event_descs[i].attr.attr.name; i++);
attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) +
sizeof(*attr_group), GFP_KERNEL);
if (!attr_group)
- goto fail;
+ return -ENOMEM;
attrs = (struct attribute **)(attr_group + 1);
attr_group->name = "events";
@@ -831,25 +858,19 @@ static int __init uncore_type_init(struct intel_uncore_type *type)
type->pmu_group = &uncore_pmu_attr_group;
return 0;
-fail:
- uncore_type_exit(type);
- return -ENOMEM;
}
-static int __init uncore_types_init(struct intel_uncore_type **types)
+static int __init
+uncore_types_init(struct intel_uncore_type **types, bool setid)
{
- int i, ret;
+ int ret;
- for (i = 0; types[i]; i++) {
- ret = uncore_type_init(types[i]);
+ for (; *types; types++) {
+ ret = uncore_type_init(*types, setid);
if (ret)
- goto fail;
+ return ret;
}
return 0;
-fail:
- while (--i >= 0)
- uncore_type_exit(types[i]);
- return ret;
}
/*
@@ -857,28 +878,28 @@ fail:
*/
static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
+ struct intel_uncore_type *type;
struct intel_uncore_pmu *pmu;
struct intel_uncore_box *box;
- struct intel_uncore_type *type;
- int phys_id;
- bool first_box = false;
+ int phys_id, pkg, ret;
phys_id = uncore_pcibus_to_physid(pdev->bus);
if (phys_id < 0)
return -ENODEV;
+ pkg = topology_phys_to_logical_pkg(phys_id);
+ if (WARN_ON_ONCE(pkg < 0))
+ return -EINVAL;
+
if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) {
int idx = UNCORE_PCI_DEV_IDX(id->driver_data);
- uncore_extra_pci_dev[phys_id][idx] = pdev;
+
+ uncore_extra_pci_dev[pkg].dev[idx] = pdev;
pci_set_drvdata(pdev, NULL);
return 0;
}
type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
- box = uncore_alloc_box(type, NUMA_NO_NODE);
- if (!box)
- return -ENOMEM;
-
/*
* for performance monitoring unit with multiple boxes,
* each box has a different function id.
@@ -890,44 +911,60 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
* some device types. Hence PCI device idx would be 0 for all devices.
* So increment pmu pointer to point to an unused array element.
*/
- if (boot_cpu_data.x86_model == 87)
+ if (boot_cpu_data.x86_model == 87) {
while (pmu->func_id >= 0)
pmu++;
+ }
+
+ if (WARN_ON_ONCE(pmu->boxes[pkg] != NULL))
+ return -EINVAL;
+
+ box = uncore_alloc_box(type, NUMA_NO_NODE);
+ if (!box)
+ return -ENOMEM;
+
if (pmu->func_id < 0)
pmu->func_id = pdev->devfn;
else
WARN_ON_ONCE(pmu->func_id != pdev->devfn);
- box->phys_id = phys_id;
+ atomic_inc(&box->refcnt);
+ box->pci_phys_id = phys_id;
+ box->pkgid = pkg;
box->pci_dev = pdev;
box->pmu = pmu;
uncore_box_init(box);
pci_set_drvdata(pdev, box);
- raw_spin_lock(&uncore_box_lock);
- if (list_empty(&pmu->box_list))
- first_box = true;
- list_add_tail(&box->list, &pmu->box_list);
- raw_spin_unlock(&uncore_box_lock);
+ pmu->boxes[pkg] = box;
+ if (atomic_inc_return(&pmu->activeboxes) > 1)
+ return 0;
- if (first_box)
- uncore_pmu_register(pmu);
- return 0;
+ /* First active box registers the pmu */
+ ret = uncore_pmu_register(pmu);
+ if (ret) {
+ pci_set_drvdata(pdev, NULL);
+ pmu->boxes[pkg] = NULL;
+ uncore_box_exit(box);
+ kfree(box);
+ }
+ return ret;
}
static void uncore_pci_remove(struct pci_dev *pdev)
{
struct intel_uncore_box *box = pci_get_drvdata(pdev);
struct intel_uncore_pmu *pmu;
- int i, cpu, phys_id;
- bool last_box = false;
+ int i, phys_id, pkg;
phys_id = uncore_pcibus_to_physid(pdev->bus);
+ pkg = topology_phys_to_logical_pkg(phys_id);
+
box = pci_get_drvdata(pdev);
if (!box) {
for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
- if (uncore_extra_pci_dev[phys_id][i] == pdev) {
- uncore_extra_pci_dev[phys_id][i] = NULL;
+ if (uncore_extra_pci_dev[pkg].dev[i] == pdev) {
+ uncore_extra_pci_dev[pkg].dev[i] = NULL;
break;
}
}
@@ -936,33 +973,20 @@ static void uncore_pci_remove(struct pci_dev *pdev)
}
pmu = box->pmu;
- if (WARN_ON_ONCE(phys_id != box->phys_id))
+ if (WARN_ON_ONCE(phys_id != box->pci_phys_id))
return;
pci_set_drvdata(pdev, NULL);
-
- raw_spin_lock(&uncore_box_lock);
- list_del(&box->list);
- if (list_empty(&pmu->box_list))
- last_box = true;
- raw_spin_unlock(&uncore_box_lock);
-
- for_each_possible_cpu(cpu) {
- if (*per_cpu_ptr(pmu->box, cpu) == box) {
- *per_cpu_ptr(pmu->box, cpu) = NULL;
- atomic_dec(&box->refcnt);
- }
- }
-
- WARN_ON_ONCE(atomic_read(&box->refcnt) != 1);
+ pmu->boxes[pkg] = NULL;
+ if (atomic_dec_return(&pmu->activeboxes) == 0)
+ uncore_pmu_unregister(pmu);
+ uncore_box_exit(box);
kfree(box);
-
- if (last_box)
- perf_pmu_unregister(&pmu->pmu);
}
static int __init uncore_pci_init(void)
{
+ size_t size;
int ret;
switch (boot_cpu_data.x86_model) {
@@ -999,25 +1023,40 @@ static int __init uncore_pci_init(void)
ret = skl_uncore_pci_init();
break;
default:
- return 0;
+ return -ENODEV;
}
if (ret)
return ret;
- ret = uncore_types_init(uncore_pci_uncores);
+ size = max_packages * sizeof(struct pci_extra_dev);
+ uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL);
+ if (!uncore_extra_pci_dev) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = uncore_types_init(uncore_pci_uncores, false);
if (ret)
- return ret;
+ goto errtype;
uncore_pci_driver->probe = uncore_pci_probe;
uncore_pci_driver->remove = uncore_pci_remove;
ret = pci_register_driver(uncore_pci_driver);
- if (ret == 0)
- pcidrv_registered = true;
- else
- uncore_types_exit(uncore_pci_uncores);
+ if (ret)
+ goto errtype;
+
+ pcidrv_registered = true;
+ return 0;
+errtype:
+ uncore_types_exit(uncore_pci_uncores);
+ kfree(uncore_extra_pci_dev);
+ uncore_extra_pci_dev = NULL;
+ uncore_free_pcibus_map();
+err:
+ uncore_pci_uncores = empty_uncore;
return ret;
}
@@ -1027,173 +1066,139 @@ static void __init uncore_pci_exit(void)
pcidrv_registered = false;
pci_unregister_driver(uncore_pci_driver);
uncore_types_exit(uncore_pci_uncores);
- }
-}
-
-/* CPU hot plug/unplug are serialized by cpu_add_remove_lock mutex */
-static LIST_HEAD(boxes_to_free);
-
-static void uncore_kfree_boxes(void)
-{
- struct intel_uncore_box *box;
-
- while (!list_empty(&boxes_to_free)) {
- box = list_entry(boxes_to_free.next,
- struct intel_uncore_box, list);
- list_del(&box->list);
- kfree(box);
+ kfree(uncore_extra_pci_dev);
+ uncore_free_pcibus_map();
}
}
static void uncore_cpu_dying(int cpu)
{
- struct intel_uncore_type *type;
+ struct intel_uncore_type *type, **types = uncore_msr_uncores;
struct intel_uncore_pmu *pmu;
struct intel_uncore_box *box;
- int i, j;
-
- for (i = 0; uncore_msr_uncores[i]; i++) {
- type = uncore_msr_uncores[i];
- for (j = 0; j < type->num_boxes; j++) {
- pmu = &type->pmus[j];
- box = *per_cpu_ptr(pmu->box, cpu);
- *per_cpu_ptr(pmu->box, cpu) = NULL;
- if (box && atomic_dec_and_test(&box->refcnt))
- list_add(&box->list, &boxes_to_free);
+ int i, pkg;
+
+ pkg = topology_logical_package_id(cpu);
+ for (; *types; types++) {
+ type = *types;
+ pmu = type->pmus;
+ for (i = 0; i < type->num_boxes; i++, pmu++) {
+ box = pmu->boxes[pkg];
+ if (box && atomic_dec_return(&box->refcnt) == 0)
+ uncore_box_exit(box);
}
}
}
-static int uncore_cpu_starting(int cpu)
+static void uncore_cpu_starting(int cpu, bool init)
{
- struct intel_uncore_type *type;
+ struct intel_uncore_type *type, **types = uncore_msr_uncores;
struct intel_uncore_pmu *pmu;
- struct intel_uncore_box *box, *exist;
- int i, j, k, phys_id;
-
- phys_id = topology_physical_package_id(cpu);
-
- for (i = 0; uncore_msr_uncores[i]; i++) {
- type = uncore_msr_uncores[i];
- for (j = 0; j < type->num_boxes; j++) {
- pmu = &type->pmus[j];
- box = *per_cpu_ptr(pmu->box, cpu);
- /* called by uncore_cpu_init? */
- if (box && box->phys_id >= 0) {
- uncore_box_init(box);
- continue;
- }
+ struct intel_uncore_box *box;
+ int i, pkg, ncpus = 1;
- for_each_online_cpu(k) {
- exist = *per_cpu_ptr(pmu->box, k);
- if (exist && exist->phys_id == phys_id) {
- atomic_inc(&exist->refcnt);
- *per_cpu_ptr(pmu->box, cpu) = exist;
- if (box) {
- list_add(&box->list,
- &boxes_to_free);
- box = NULL;
- }
- break;
- }
- }
+ if (init) {
+ /*
+ * On init we get the number of online cpus in the package
+ * and set refcount for all of them.
+ */
+ ncpus = cpumask_weight(topology_core_cpumask(cpu));
+ }
- if (box) {
- box->phys_id = phys_id;
+ pkg = topology_logical_package_id(cpu);
+ for (; *types; types++) {
+ type = *types;
+ pmu = type->pmus;
+ for (i = 0; i < type->num_boxes; i++, pmu++) {
+ box = pmu->boxes[pkg];
+ if (!box)
+ continue;
+ /* The first cpu on a package activates the box */
+ if (atomic_add_return(ncpus, &box->refcnt) == ncpus)
uncore_box_init(box);
- }
}
}
- return 0;
}
-static int uncore_cpu_prepare(int cpu, int phys_id)
+static int uncore_cpu_prepare(int cpu)
{
- struct intel_uncore_type *type;
+ struct intel_uncore_type *type, **types = uncore_msr_uncores;
struct intel_uncore_pmu *pmu;
struct intel_uncore_box *box;
- int i, j;
-
- for (i = 0; uncore_msr_uncores[i]; i++) {
- type = uncore_msr_uncores[i];
- for (j = 0; j < type->num_boxes; j++) {
- pmu = &type->pmus[j];
- if (pmu->func_id < 0)
- pmu->func_id = j;
-
+ int i, pkg;
+
+ pkg = topology_logical_package_id(cpu);
+ for (; *types; types++) {
+ type = *types;
+ pmu = type->pmus;
+ for (i = 0; i < type->num_boxes; i++, pmu++) {
+ if (pmu->boxes[pkg])
+ continue;
+ /* First cpu of a package allocates the box */
box = uncore_alloc_box(type, cpu_to_node(cpu));
if (!box)
return -ENOMEM;
-
box->pmu = pmu;
- box->phys_id = phys_id;
- *per_cpu_ptr(pmu->box, cpu) = box;
+ box->pkgid = pkg;
+ pmu->boxes[pkg] = box;
}
}
return 0;
}
-static void
-uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu)
+static void uncore_change_type_ctx(struct intel_uncore_type *type, int old_cpu,
+ int new_cpu)
{
- struct intel_uncore_type *type;
- struct intel_uncore_pmu *pmu;
+ struct intel_uncore_pmu *pmu = type->pmus;
struct intel_uncore_box *box;
- int i, j;
+ int i, pkg;
- for (i = 0; uncores[i]; i++) {
- type = uncores[i];
- for (j = 0; j < type->num_boxes; j++) {
- pmu = &type->pmus[j];
- if (old_cpu < 0)
- box = uncore_pmu_to_box(pmu, new_cpu);
- else
- box = uncore_pmu_to_box(pmu, old_cpu);
- if (!box)
- continue;
-
- if (old_cpu < 0) {
- WARN_ON_ONCE(box->cpu != -1);
- box->cpu = new_cpu;
- continue;
- }
+ pkg = topology_logical_package_id(old_cpu < 0 ? new_cpu : old_cpu);
+ for (i = 0; i < type->num_boxes; i++, pmu++) {
+ box = pmu->boxes[pkg];
+ if (!box)
+ continue;
- WARN_ON_ONCE(box->cpu != old_cpu);
- if (new_cpu >= 0) {
- uncore_pmu_cancel_hrtimer(box);
- perf_pmu_migrate_context(&pmu->pmu,
- old_cpu, new_cpu);
- box->cpu = new_cpu;
- } else {
- box->cpu = -1;
- }
+ if (old_cpu < 0) {
+ WARN_ON_ONCE(box->cpu != -1);
+ box->cpu = new_cpu;
+ continue;
}
+
+ WARN_ON_ONCE(box->cpu != old_cpu);
+ box->cpu = -1;
+ if (new_cpu < 0)
+ continue;
+
+ uncore_pmu_cancel_hrtimer(box);
+ perf_pmu_migrate_context(&pmu->pmu, old_cpu, new_cpu);
+ box->cpu = new_cpu;
}
}
+static void uncore_change_context(struct intel_uncore_type **uncores,
+ int old_cpu, int new_cpu)
+{
+ for (; *uncores; uncores++)
+ uncore_change_type_ctx(*uncores, old_cpu, new_cpu);
+}
+
static void uncore_event_exit_cpu(int cpu)
{
- int i, phys_id, target;
+ int target;
- /* if exiting cpu is used for collecting uncore events */
+ /* Check if exiting cpu is used for collecting uncore events */
if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
return;
- /* find a new cpu to collect uncore events */
- phys_id = topology_physical_package_id(cpu);
- target = -1;
- for_each_online_cpu(i) {
- if (i == cpu)
- continue;
- if (phys_id == topology_physical_package_id(i)) {
- target = i;
- break;
- }
- }
+ /* Find a new cpu to collect uncore events */
+ target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
- /* migrate uncore events to the new cpu */
- if (target >= 0)
+ /* Migrate uncore events to the new target */
+ if (target < nr_cpu_ids)
cpumask_set_cpu(target, &uncore_cpu_mask);
+ else
+ target = -1;
uncore_change_context(uncore_msr_uncores, cpu, target);
uncore_change_context(uncore_pci_uncores, cpu, target);
@@ -1201,13 +1206,15 @@ static void uncore_event_exit_cpu(int cpu)
static void uncore_event_init_cpu(int cpu)
{
- int i, phys_id;
+ int target;
- phys_id = topology_physical_package_id(cpu);
- for_each_cpu(i, &uncore_cpu_mask) {
- if (phys_id == topology_physical_package_id(i))
- return;
- }
+ /*
+ * Check if there is an online cpu in the package
+ * which collects uncore events already.
+ */
+ target = cpumask_any_and(&uncore_cpu_mask, topology_core_cpumask(cpu));
+ if (target < nr_cpu_ids)
+ return;
cpumask_set_cpu(cpu, &uncore_cpu_mask);
@@ -1220,39 +1227,25 @@ static int uncore_cpu_notifier(struct notifier_block *self,
{
unsigned int cpu = (long)hcpu;
- /* allocate/free data structure for uncore box */
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- uncore_cpu_prepare(cpu, -1);
- break;
+ return notifier_from_errno(uncore_cpu_prepare(cpu));
+
case CPU_STARTING:
- uncore_cpu_starting(cpu);
+ uncore_cpu_starting(cpu, false);
+ case CPU_DOWN_FAILED:
+ uncore_event_init_cpu(cpu);
break;
+
case CPU_UP_CANCELED:
case CPU_DYING:
uncore_cpu_dying(cpu);
break;
- case CPU_ONLINE:
- case CPU_DEAD:
- uncore_kfree_boxes();
- break;
- default:
- break;
- }
- /* select the cpu that collects uncore events */
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DOWN_FAILED:
- case CPU_STARTING:
- uncore_event_init_cpu(cpu);
- break;
case CPU_DOWN_PREPARE:
uncore_event_exit_cpu(cpu);
break;
- default:
- break;
}
-
return NOTIFY_OK;
}
@@ -1265,9 +1258,29 @@ static struct notifier_block uncore_cpu_nb = {
.priority = CPU_PRI_PERF + 1,
};
-static void __init uncore_cpu_setup(void *dummy)
+static int __init type_pmu_register(struct intel_uncore_type *type)
{
- uncore_cpu_starting(smp_processor_id());
+ int i, ret;
+
+ for (i = 0; i < type->num_boxes; i++) {
+ ret = uncore_pmu_register(&type->pmus[i]);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+static int __init uncore_msr_pmus_register(void)
+{
+ struct intel_uncore_type **types = uncore_msr_uncores;
+ int ret;
+
+ for (; *types; types++) {
+ ret = type_pmu_register(*types);
+ if (ret)
+ return ret;
+ }
+ return 0;
}
static int __init uncore_cpu_init(void)
@@ -1311,71 +1324,61 @@ static int __init uncore_cpu_init(void)
knl_uncore_cpu_init();
break;
default:
- return 0;
+ return -ENODEV;
}
- ret = uncore_types_init(uncore_msr_uncores);
+ ret = uncore_types_init(uncore_msr_uncores, true);
if (ret)
- return ret;
+ goto err;
+ ret = uncore_msr_pmus_register();
+ if (ret)
+ goto err;
return 0;
+err:
+ uncore_types_exit(uncore_msr_uncores);
+ uncore_msr_uncores = empty_uncore;
+ return ret;
}
-static int __init uncore_pmus_register(void)
+static void __init uncore_cpu_setup(void *dummy)
{
- struct intel_uncore_pmu *pmu;
- struct intel_uncore_type *type;
- int i, j;
-
- for (i = 0; uncore_msr_uncores[i]; i++) {
- type = uncore_msr_uncores[i];
- for (j = 0; j < type->num_boxes; j++) {
- pmu = &type->pmus[j];
- uncore_pmu_register(pmu);
- }
- }
-
- return 0;
+ uncore_cpu_starting(smp_processor_id(), true);
}
-static void __init uncore_cpumask_init(void)
-{
- int cpu;
-
- /*
- * ony invoke once from msr or pci init code
- */
- if (!cpumask_empty(&uncore_cpu_mask))
- return;
+/* Lazy to avoid allocation of a few bytes for the normal case */
+static __initdata DECLARE_BITMAP(packages, MAX_LOCAL_APIC);
- cpu_notifier_register_begin();
+static int __init uncore_cpumask_init(bool msr)
+{
+ unsigned int cpu;
for_each_online_cpu(cpu) {
- int i, phys_id = topology_physical_package_id(cpu);
+ unsigned int pkg = topology_logical_package_id(cpu);
+ int ret;
- for_each_cpu(i, &uncore_cpu_mask) {
- if (phys_id == topology_physical_package_id(i)) {
- phys_id = -1;
- break;
- }
- }
- if (phys_id < 0)
+ if (test_and_set_bit(pkg, packages))
continue;
-
- uncore_cpu_prepare(cpu, phys_id);
+ /*
+ * The first online cpu of each package allocates and takes
+ * the refcounts for all other online cpus in that package.
+ * If msrs are not enabled no allocation is required.
+ */
+ if (msr) {
+ ret = uncore_cpu_prepare(cpu);
+ if (ret)
+ return ret;
+ }
uncore_event_init_cpu(cpu);
+ smp_call_function_single(cpu, uncore_cpu_setup, NULL, 1);
}
- on_each_cpu(uncore_cpu_setup, NULL, 1);
-
__register_cpu_notifier(&uncore_cpu_nb);
-
- cpu_notifier_register_done();
+ return 0;
}
-
static int __init intel_uncore_init(void)
{
- int ret;
+ int pret, cret, ret;
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return -ENODEV;
@@ -1383,19 +1386,27 @@ static int __init intel_uncore_init(void)
if (cpu_has_hypervisor)
return -ENODEV;
- ret = uncore_pci_init();
- if (ret)
- goto fail;
- ret = uncore_cpu_init();
- if (ret) {
- uncore_pci_exit();
- goto fail;
- }
- uncore_cpumask_init();
+ max_packages = topology_max_packages();
+
+ pret = uncore_pci_init();
+ cret = uncore_cpu_init();
- uncore_pmus_register();
+ if (cret && pret)
+ return -ENODEV;
+
+ cpu_notifier_register_begin();
+ ret = uncore_cpumask_init(!cret);
+ if (ret)
+ goto err;
+ cpu_notifier_register_done();
return 0;
-fail:
+
+err:
+ /* Undo box->init_box() */
+ on_each_cpu_mask(&uncore_cpu_mask, uncore_exit_boxes, NULL, 1);
+ uncore_types_exit(uncore_msr_uncores);
+ uncore_pci_exit();
+ cpu_notifier_register_done();
return ret;
}
device_initcall(intel_uncore_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/events/intel/uncore.h
index a7086b8..79766b9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -1,8 +1,10 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/pci.h>
+#include <asm/apicdef.h>
+
#include <linux/perf_event.h>
-#include "perf_event.h"
+#include "../perf_event.h"
#define UNCORE_PMU_NAME_LEN 32
#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC)
@@ -19,11 +21,12 @@
#define UNCORE_EXTRA_PCI_DEV 0xff
#define UNCORE_EXTRA_PCI_DEV_MAX 3
-/* support up to 8 sockets */
-#define UNCORE_SOCKET_MAX 8
-
#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
+struct pci_extra_dev {
+ struct pci_dev *dev[UNCORE_EXTRA_PCI_DEV_MAX];
+};
+
struct intel_uncore_ops;
struct intel_uncore_pmu;
struct intel_uncore_box;
@@ -61,6 +64,7 @@ struct intel_uncore_type {
struct intel_uncore_ops {
void (*init_box)(struct intel_uncore_box *);
+ void (*exit_box)(struct intel_uncore_box *);
void (*disable_box)(struct intel_uncore_box *);
void (*enable_box)(struct intel_uncore_box *);
void (*disable_event)(struct intel_uncore_box *, struct perf_event *);
@@ -73,13 +77,14 @@ struct intel_uncore_ops {
};
struct intel_uncore_pmu {
- struct pmu pmu;
- char name[UNCORE_PMU_NAME_LEN];
- int pmu_idx;
- int func_id;
- struct intel_uncore_type *type;
- struct intel_uncore_box ** __percpu box;
- struct list_head box_list;
+ struct pmu pmu;
+ char name[UNCORE_PMU_NAME_LEN];
+ int pmu_idx;
+ int func_id;
+ bool registered;
+ atomic_t activeboxes;
+ struct intel_uncore_type *type;
+ struct intel_uncore_box **boxes;
};
struct intel_uncore_extra_reg {
@@ -89,7 +94,8 @@ struct intel_uncore_extra_reg {
};
struct intel_uncore_box {
- int phys_id;
+ int pci_phys_id;
+ int pkgid;
int n_active; /* number of active events */
int n_events;
int cpu; /* cpu to collect events */
@@ -123,7 +129,6 @@ struct pci2phy_map {
int pbus_to_physid[256];
};
-int uncore_pcibus_to_physid(struct pci_bus *bus);
struct pci2phy_map *__find_pci2phy_map(int segment);
ssize_t uncore_event_show(struct kobject *kobj,
@@ -305,14 +310,30 @@ static inline void uncore_box_init(struct intel_uncore_box *box)
}
}
+static inline void uncore_box_exit(struct intel_uncore_box *box)
+{
+ if (test_and_clear_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
+ if (box->pmu->type->ops->exit_box)
+ box->pmu->type->ops->exit_box(box);
+ }
+}
+
static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
{
- return (box->phys_id < 0);
+ return (box->pkgid < 0);
+}
+
+static inline struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
+{
+ return container_of(event->pmu, struct intel_uncore_pmu, pmu);
+}
+
+static inline struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
+{
+ return event->pmu_private;
}
-struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event);
struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu);
-struct intel_uncore_box *uncore_event_to_box(struct perf_event *event);
u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event);
void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
@@ -328,7 +349,7 @@ extern struct intel_uncore_type **uncore_pci_uncores;
extern struct pci_driver *uncore_pci_driver;
extern raw_spinlock_t pci2phy_map_lock;
extern struct list_head pci2phy_map_head;
-extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
+extern struct pci_extra_dev *uncore_extra_pci_dev;
extern struct event_constraint uncore_constraint_empty;
/* perf_event_intel_uncore_snb.c */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c
index 2749965..cda5693 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c
+++ b/arch/x86/events/intel/uncore_nhmex.c
@@ -1,5 +1,5 @@
/* Nehalem-EX/Westmere-EX uncore support */
-#include "perf_event_intel_uncore.h"
+#include "uncore.h"
/* NHM-EX event control */
#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff
@@ -201,6 +201,11 @@ static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box)
wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL);
}
+static void nhmex_uncore_msr_exit_box(struct intel_uncore_box *box)
+{
+ wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, 0);
+}
+
static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box)
{
unsigned msr = uncore_msr_box_ctl(box);
@@ -250,6 +255,7 @@ static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct p
#define NHMEX_UNCORE_OPS_COMMON_INIT() \
.init_box = nhmex_uncore_msr_init_box, \
+ .exit_box = nhmex_uncore_msr_exit_box, \
.disable_box = nhmex_uncore_msr_disable_box, \
.enable_box = nhmex_uncore_msr_enable_box, \
.disable_event = nhmex_uncore_msr_disable_event, \
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 2bd030d..96531d2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -1,5 +1,5 @@
/* Nehalem/SandBridge/Haswell uncore support */
-#include "perf_event_intel_uncore.h"
+#include "uncore.h"
/* Uncore IMC PCI IDs */
#define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100
@@ -95,6 +95,12 @@ static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
}
}
+static void snb_uncore_msr_exit_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0)
+ wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, 0);
+}
+
static struct uncore_event_desc snb_uncore_events[] = {
INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
{ /* end: all zeroes */ },
@@ -116,6 +122,7 @@ static struct attribute_group snb_uncore_format_group = {
static struct intel_uncore_ops snb_uncore_msr_ops = {
.init_box = snb_uncore_msr_init_box,
+ .exit_box = snb_uncore_msr_exit_box,
.disable_event = snb_uncore_msr_disable_event,
.enable_event = snb_uncore_msr_enable_event,
.read_counter = uncore_msr_read_counter,
@@ -231,6 +238,11 @@ static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
}
+static void snb_uncore_imc_exit_box(struct intel_uncore_box *box)
+{
+ iounmap(box->io_addr);
+}
+
static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
{}
@@ -301,6 +313,7 @@ static int snb_uncore_imc_event_init(struct perf_event *event)
return -EINVAL;
event->cpu = box->cpu;
+ event->pmu_private = box;
event->hw.idx = -1;
event->hw.last_tag = ~0ULL;
@@ -458,6 +471,7 @@ static struct pmu snb_uncore_imc_pmu = {
static struct intel_uncore_ops snb_uncore_imc_ops = {
.init_box = snb_uncore_imc_init_box,
+ .exit_box = snb_uncore_imc_exit_box,
.enable_box = snb_uncore_imc_enable_box,
.disable_box = snb_uncore_imc_disable_box,
.disable_event = snb_uncore_imc_disable_event,
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 33acb88..93f6bd9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1,6 +1,5 @@
/* SandyBridge-EP/IvyTown uncore support */
-#include "perf_event_intel_uncore.h"
-
+#include "uncore.h"
/* SNB-EP Box level control */
#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0)
@@ -987,7 +986,9 @@ static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_eve
if (reg1->idx != EXTRA_REG_NONE) {
int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER;
- struct pci_dev *filter_pdev = uncore_extra_pci_dev[box->phys_id][idx];
+ int pkg = topology_phys_to_logical_pkg(box->pci_phys_id);
+ struct pci_dev *filter_pdev = uncore_extra_pci_dev[pkg].dev[idx];
+
if (filter_pdev) {
pci_write_config_dword(filter_pdev, reg1->reg,
(u32)reg1->config);
@@ -2521,14 +2522,16 @@ static struct intel_uncore_type *hswep_msr_uncores[] = {
void hswep_uncore_cpu_init(void)
{
+ int pkg = topology_phys_to_logical_pkg(0);
+
if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
/* Detect 6-8 core systems with only two SBOXes */
- if (uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3]) {
+ if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) {
u32 capid4;
- pci_read_config_dword(uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3],
+ pci_read_config_dword(uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3],
0x94, &capid4);
if (((capid4 >> 6) & 0x3) == 0)
hswep_uncore_sbox.num_boxes = 2;
@@ -2875,11 +2878,13 @@ static struct intel_uncore_type bdx_uncore_sbox = {
.format_group = &hswep_uncore_sbox_format_group,
};
+#define BDX_MSR_UNCORE_SBOX 3
+
static struct intel_uncore_type *bdx_msr_uncores[] = {
&bdx_uncore_ubox,
&bdx_uncore_cbox,
- &bdx_uncore_sbox,
&hswep_uncore_pcu,
+ &bdx_uncore_sbox,
NULL,
};
@@ -2888,6 +2893,10 @@ void bdx_uncore_cpu_init(void)
if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
uncore_msr_uncores = bdx_msr_uncores;
+
+ /* BDX-DE doesn't have SBOX */
+ if (boot_cpu_data.x86_model == 86)
+ uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
}
static struct intel_uncore_type bdx_uncore_ha = {
diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/events/msr.c
index ec863b9..ec863b9 100644
--- a/arch/x86/kernel/cpu/perf_event_msr.c
+++ b/arch/x86/events/msr.c
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/events/perf_event.h
index 7bb61e3..68155ca 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -586,6 +586,7 @@ struct x86_pmu {
pebs_broken :1,
pebs_prec_dist :1;
int pebs_record_size;
+ int pebs_buffer_size;
void (*drain_pebs)(struct pt_regs *regs);
struct event_constraint *pebs_constraints;
void (*pebs_aliases)(struct perf_event *event);
@@ -860,6 +861,8 @@ extern struct event_constraint intel_ivb_pebs_event_constraints[];
extern struct event_constraint intel_hsw_pebs_event_constraints[];
+extern struct event_constraint intel_bdw_pebs_event_constraints[];
+
extern struct event_constraint intel_skl_pebs_event_constraints[];
struct event_constraint *intel_pebs_constraints(struct perf_event *event);
@@ -904,6 +907,8 @@ void intel_pmu_lbr_init_skl(void);
void intel_pmu_lbr_init_knl(void);
+void intel_pmu_pebs_data_source_nhm(void);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 3c56ef1..5e828da 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -27,15 +27,23 @@ struct amd_l3_cache {
};
struct threshold_block {
- unsigned int block;
- unsigned int bank;
- unsigned int cpu;
- u32 address;
- u16 interrupt_enable;
- bool interrupt_capable;
- u16 threshold_limit;
- struct kobject kobj;
- struct list_head miscj;
+ unsigned int block; /* Number within bank */
+ unsigned int bank; /* MCA bank the block belongs to */
+ unsigned int cpu; /* CPU which controls MCA bank */
+ u32 address; /* MSR address for the block */
+ u16 interrupt_enable; /* Enable/Disable APIC interrupt */
+ bool interrupt_capable; /* Bank can generate an interrupt. */
+
+ u16 threshold_limit; /*
+ * Value upon which threshold
+ * interrupt is generated.
+ */
+
+ struct kobject kobj; /* sysfs object */
+ struct list_head miscj; /*
+ * List of threshold blocks
+ * within a bank.
+ */
};
struct threshold_bank {
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index a584e1c..bfb28ca 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -6,18 +6,17 @@
/*
* Force strict CPU ordering.
- * And yes, this is required on UP too when we're talking
+ * And yes, this might be required on UP too when we're talking
* to devices.
*/
#ifdef CONFIG_X86_32
-/*
- * Some non-Intel clones support out of order store. wmb() ceases to be a
- * nop for these.
- */
-#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
-#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
-#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
+#define mb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "mfence", \
+ X86_FEATURE_XMM2) ::: "memory", "cc")
+#define rmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "lfence", \
+ X86_FEATURE_XMM2) ::: "memory", "cc")
+#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "sfence", \
+ X86_FEATURE_XMM2) ::: "memory", "cc")
#else
#define mb() asm volatile("mfence":::"memory")
#define rmb() asm volatile("lfence":::"memory")
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index e63aa38..61518cf 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -91,16 +91,10 @@ void clflush_cache_range(void *addr, unsigned int size);
#define mmio_flush_range(addr, size) clflush_cache_range(addr, size)
-#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
extern const int rodata_test_data;
extern int kernel_set_to_readonly;
void set_kernel_text_rw(void);
void set_kernel_text_ro(void);
-#else
-static inline void set_kernel_text_rw(void) { }
-static inline void set_kernel_text_ro(void) { }
-#endif
#ifdef CONFIG_DEBUG_RODATA_TEST
int rodata_test(void);
diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h
index f50de69..532f85e 100644
--- a/arch/x86/include/asm/checksum_32.h
+++ b/arch/x86/include/asm/checksum_32.h
@@ -112,8 +112,7 @@ static inline __sum16 csum_fold(__wsum sum)
}
static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
- unsigned short len,
- unsigned short proto,
+ __u32 len, __u8 proto,
__wsum sum)
{
asm("addl %1, %0 ;\n"
@@ -131,8 +130,7 @@ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
* returns a 16-bit checksum, already complemented
*/
static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
- unsigned short len,
- unsigned short proto,
+ __u32 len, __u8 proto,
__wsum sum)
{
return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
@@ -151,8 +149,7 @@ static inline __sum16 ip_compute_csum(const void *buff, int len)
#define _HAVE_ARCH_IPV6_CSUM
static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
const struct in6_addr *daddr,
- __u32 len, unsigned short proto,
- __wsum sum)
+ __u32 len, __u8 proto, __wsum sum)
{
asm("addl 0(%1), %0 ;\n"
"adcl 4(%1), %0 ;\n"
diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index cd00e17..c020ee7 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -84,8 +84,8 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
* 32bit unfolded.
*/
static inline __wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
- unsigned short proto, __wsum sum)
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+ __u8 proto, __wsum sum)
{
asm(" addl %1, %0\n"
" adcl %2, %0\n"
@@ -110,8 +110,8 @@ csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
* complemented and ready to be filled in.
*/
static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
- unsigned short len,
- unsigned short proto, __wsum sum)
+ __u32 len, __u8 proto,
+ __wsum sum)
{
return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
}
@@ -177,7 +177,7 @@ struct in6_addr;
#define _HAVE_ARCH_IPV6_CSUM 1
extern __sum16
csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
- __u32 len, unsigned short proto, __wsum sum);
+ __u32 len, __u8 proto, __wsum sum);
static inline unsigned add32_with_carry(unsigned a, unsigned b)
{
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 6663fae..074b760 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -85,7 +85,7 @@
#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */
-/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */
+#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */
#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */
#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */
@@ -220,6 +220,7 @@
#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
+#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
@@ -230,6 +231,8 @@
#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */
#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
+#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */
+#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */
/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */
#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */
@@ -286,4 +289,12 @@
#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
+#ifdef CONFIG_X86_32
+/*
+ * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional
+ * to avoid confusion.
+ */
+#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
+#endif
+
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index 535192f..3c69fed 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -15,7 +15,7 @@ static __always_inline __init void *dmi_alloc(unsigned len)
/* Use early IO mappings for DMI because it's initialized early */
#define dmi_early_remap early_ioremap
#define dmi_early_unmap early_iounmap
-#define dmi_remap ioremap
+#define dmi_remap ioremap_cache
#define dmi_unmap iounmap
#endif /* _ASM_X86_DMI_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 0010c78..08b1f2f 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -25,6 +25,8 @@
#define EFI32_LOADER_SIGNATURE "EL32"
#define EFI64_LOADER_SIGNATURE "EL64"
+#define MAX_CMDLINE_ADDRESS UINT_MAX
+
#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 1514753..15340e3 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -256,7 +256,7 @@ extern int force_personality32;
instruction set this CPU supports. This could be done in user space,
but it's not easy, and we've already done it here. */
-#define ELF_HWCAP (boot_cpu_data.x86_capability[0])
+#define ELF_HWCAP (boot_cpu_data.x86_capability[CPUID_1_EDX])
/* This yields a string that ld.so will use to load implementation
specific libraries for optimization. This is more specific in
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 6d7d0e5..8554f96 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -138,7 +138,7 @@ extern void reserve_top_address(unsigned long reserve);
extern int fixmaps_set;
extern pte_t *kmap_pte;
-extern pgprot_t kmap_prot;
+#define kmap_prot PAGE_KERNEL
extern pte_t *pkmap_page_table;
void __native_set_fixmap(enum fixed_addresses idx, pte_t pte);
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index c2e46eb..a212434 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -590,7 +590,8 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu)
* If the task has used the math, pre-load the FPU on xsave processors
* or if the past 5 consecutive context-switches used math.
*/
- fpu.preload = new_fpu->fpstate_active &&
+ fpu.preload = static_cpu_has(X86_FEATURE_FPU) &&
+ new_fpu->fpstate_active &&
(use_eager_fpu() || new_fpu->counter > 5);
if (old_fpu->fpregs_active) {
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index af30fde..f23cd8c 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -20,16 +20,15 @@
/* Supported features which support lazy state saving */
#define XFEATURE_MASK_LAZY (XFEATURE_MASK_FP | \
- XFEATURE_MASK_SSE)
-
-/* Supported features which require eager state saving */
-#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | \
- XFEATURE_MASK_BNDCSR | \
+ XFEATURE_MASK_SSE | \
XFEATURE_MASK_YMM | \
XFEATURE_MASK_OPMASK | \
XFEATURE_MASK_ZMM_Hi256 | \
XFEATURE_MASK_Hi16_ZMM)
+/* Supported features which require eager state saving */
+#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)
+
/* All currently supported features */
#define XCNTXT_MASK (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER)
diff --git a/arch/x86/include/asm/gpio.h b/arch/x86/include/asm/gpio.h
deleted file mode 100644
index b3799d8..0000000
--- a/arch/x86/include/asm/gpio.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef __LINUX_GPIO_H
-#warning Include linux/gpio.h instead of asm/gpio.h
-#include <linux/gpio.h>
-#endif
diff --git a/arch/x86/include/asm/imr.h b/arch/x86/include/asm/imr.h
index cd2ce40..ebea2c9 100644
--- a/arch/x86/include/asm/imr.h
+++ b/arch/x86/include/asm/imr.h
@@ -53,7 +53,7 @@
#define IMR_MASK (IMR_ALIGN - 1)
int imr_add_range(phys_addr_t base, size_t size,
- unsigned int rmask, unsigned int wmask, bool lock);
+ unsigned int rmask, unsigned int wmask);
int imr_remove_range(phys_addr_t base, size_t size);
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
index cfc9a0d..a4fe16e 100644
--- a/arch/x86/include/asm/ipi.h
+++ b/arch/x86/include/asm/ipi.h
@@ -57,67 +57,13 @@ static inline void __xapic_wait_icr_idle(void)
cpu_relax();
}
-static inline void
-__default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
-{
- /*
- * Subtle. In the case of the 'never do double writes' workaround
- * we have to lock out interrupts to be safe. As we don't care
- * of the value read we use an atomic rmw access to avoid costly
- * cli/sti. Otherwise we use an even cheaper single atomic write
- * to the APIC.
- */
- unsigned int cfg;
-
- /*
- * Wait for idle.
- */
- __xapic_wait_icr_idle();
-
- /*
- * No need to touch the target chip field
- */
- cfg = __prepare_ICR(shortcut, vector, dest);
-
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- native_apic_mem_write(APIC_ICR, cfg);
-}
+void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest);
/*
* This is used to send an IPI with no shorthand notation (the destination is
* specified in bits 56 to 63 of the ICR).
*/
-static inline void
- __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest)
-{
- unsigned long cfg;
-
- /*
- * Wait for idle.
- */
- if (unlikely(vector == NMI_VECTOR))
- safe_apic_wait_icr_idle();
- else
- __xapic_wait_icr_idle();
-
- /*
- * prepare target chip field
- */
- cfg = __prepare_ICR2(mask);
- native_apic_mem_write(APIC_ICR2, cfg);
-
- /*
- * program the ICR
- */
- cfg = __prepare_ICR(0, vector, dest);
-
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- native_apic_mem_write(APIC_ICR, cfg);
-}
+void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest);
extern void default_send_IPI_single(int cpu, int vector);
extern void default_send_IPI_single_phys(int cpu, int vector);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 44adbb8..01c8b50 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -32,6 +32,7 @@
#include <asm/mtrr.h>
#include <asm/msr-index.h>
#include <asm/asm.h>
+#include <asm/kvm_page_track.h>
#define KVM_MAX_VCPUS 255
#define KVM_SOFT_MAX_VCPUS 160
@@ -214,6 +215,14 @@ struct kvm_mmu_memory_cache {
void *objects[KVM_NR_MEM_OBJS];
};
+/*
+ * the pages used as guest page table on soft mmu are tracked by
+ * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
+ * by indirect shadow page can not be more than 15 bits.
+ *
+ * Currently, we used 14 bits that are @level, @cr4_pae, @quadrant, @access,
+ * @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp.
+ */
union kvm_mmu_page_role {
unsigned word;
struct {
@@ -276,7 +285,7 @@ struct kvm_mmu_page {
#endif
/* Number of writes since the last time traversal visited this page. */
- int write_flooding_count;
+ atomic_t write_flooding_count;
};
struct kvm_pio_request {
@@ -338,12 +347,8 @@ struct kvm_mmu {
struct rsvd_bits_validate guest_rsvd_check;
- /*
- * Bitmap: bit set = last pte in walk
- * index[0:1]: level (zero-based)
- * index[2]: pte.ps
- */
- u8 last_pte_bitmap;
+ /* Can have large pages at levels 2..last_nonleaf_level-1. */
+ u8 last_nonleaf_level;
bool nx;
@@ -498,7 +503,6 @@ struct kvm_vcpu_arch {
struct kvm_mmu_memory_cache mmu_page_header_cache;
struct fpu guest_fpu;
- bool eager_fpu;
u64 xcr0;
u64 guest_supported_xcr0;
u32 guest_xstate_size;
@@ -644,12 +648,13 @@ struct kvm_vcpu_arch {
};
struct kvm_lpage_info {
- int write_count;
+ int disallow_lpage;
};
struct kvm_arch_memory_slot {
struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES];
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
+ unsigned short *gfn_track[KVM_PAGE_TRACK_MAX];
};
/*
@@ -694,6 +699,8 @@ struct kvm_arch {
*/
struct list_head active_mmu_pages;
struct list_head zapped_obsolete_pages;
+ struct kvm_page_track_notifier_node mmu_sp_tracker;
+ struct kvm_page_track_notifier_head track_notifier_head;
struct list_head assigned_dev_head;
struct iommu_domain *iommu_domain;
@@ -754,6 +761,8 @@ struct kvm_arch {
bool irqchip_split;
u8 nr_reserved_ioapic_pins;
+
+ bool disabled_lapic_found;
};
struct kvm_vm_stat {
@@ -988,6 +997,8 @@ void kvm_mmu_module_exit(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
void kvm_mmu_setup(struct kvm_vcpu *vcpu);
+void kvm_mmu_init_vm(struct kvm *kvm);
+void kvm_mmu_uninit_vm(struct kvm *kvm);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask);
@@ -1127,8 +1138,6 @@ void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
void kvm_inject_nmi(struct kvm_vcpu *vcpu);
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes);
int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
new file mode 100644
index 0000000..c2b8d24
--- /dev/null
+++ b/arch/x86/include/asm/kvm_page_track.h
@@ -0,0 +1,61 @@
+#ifndef _ASM_X86_KVM_PAGE_TRACK_H
+#define _ASM_X86_KVM_PAGE_TRACK_H
+
+enum kvm_page_track_mode {
+ KVM_PAGE_TRACK_WRITE,
+ KVM_PAGE_TRACK_MAX,
+};
+
+/*
+ * The notifier represented by @kvm_page_track_notifier_node is linked into
+ * the head which will be notified when guest is triggering the track event.
+ *
+ * Write access on the head is protected by kvm->mmu_lock, read access
+ * is protected by track_srcu.
+ */
+struct kvm_page_track_notifier_head {
+ struct srcu_struct track_srcu;
+ struct hlist_head track_notifier_list;
+};
+
+struct kvm_page_track_notifier_node {
+ struct hlist_node node;
+
+ /*
+ * It is called when guest is writing the write-tracked page
+ * and write emulation is finished at that time.
+ *
+ * @vcpu: the vcpu where the write access happened.
+ * @gpa: the physical address written by guest.
+ * @new: the data was written to the address.
+ * @bytes: the written length.
+ */
+ void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+ int bytes);
+};
+
+void kvm_page_track_init(struct kvm *kvm);
+
+void kvm_page_track_free_memslot(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont);
+int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
+ unsigned long npages);
+
+void kvm_slot_page_track_add_page(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ enum kvm_page_track_mode mode);
+void kvm_slot_page_track_remove_page(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ enum kvm_page_track_mode mode);
+bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
+ enum kvm_page_track_mode mode);
+
+void
+kvm_page_track_register_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n);
+void
+kvm_page_track_unregister_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n);
+void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+ int bytes);
+#endif
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index c1adf33..bc62e7c 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -17,15 +17,8 @@ static inline bool kvm_check_and_clear_guest_paused(void)
}
#endif /* CONFIG_KVM_GUEST */
-#ifdef CONFIG_DEBUG_RODATA
#define KVM_HYPERCALL \
ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL)
-#else
-/* On AMD processors, vmcall will generate a trap that we will
- * then rewrite to the appropriate instruction.
- */
-#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
-#endif
/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
* instruction. The hypervisor may replace it with something else but only the
diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h
index e795f52..7e68f95 100644
--- a/arch/x86/include/asm/livepatch.h
+++ b/arch/x86/include/asm/livepatch.h
@@ -25,7 +25,6 @@
#include <linux/module.h>
#include <linux/ftrace.h>
-#ifdef CONFIG_LIVEPATCH
static inline int klp_check_compiler_support(void)
{
#ifndef CC_USING_FENTRY
@@ -40,8 +39,5 @@ static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
{
regs->ip = ip;
}
-#else
-#error Include linux/livepatch.h, not asm/livepatch.h
-#endif
#endif /* _ASM_X86_LIVEPATCH_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 18d2ba9..92b6f65 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -40,8 +40,20 @@
#define MCI_STATUS_AR (1ULL<<55) /* Action required */
/* AMD-specific bits */
-#define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */
+#define MCI_STATUS_DEFERRED (1ULL<<44) /* uncorrected error, deferred exception */
#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
+#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */
+
+/*
+ * McaX field if set indicates a given bank supports MCA extensions:
+ * - Deferred error interrupt type is specifiable by bank.
+ * - MCx_MISC0[BlkPtr] field indicates presence of extended MISC registers,
+ * But should not be used to determine MSR numbers.
+ * - TCC bit is present in MCx_STATUS.
+ */
+#define MCI_CONFIG_MCAX 0x1
+#define MCI_IPID_MCATYPE 0xFFFF0000
+#define MCI_IPID_HWID 0xFFF
/*
* Note that the full MCACOD field of IA32_MCi_STATUS MSR is
@@ -91,6 +103,16 @@
#define MCE_LOG_LEN 32
#define MCE_LOG_SIGNATURE "MACHINECHECK"
+/* AMD Scalable MCA */
+#define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003
+#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004
+#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005
+#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a
+#define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))
+
/*
* This structure contains all data related to the MCE log. Also
* carries a signature to make it easier to find from external
@@ -288,4 +310,49 @@ struct cper_sec_mem_err;
extern void apei_mce_report_mem_error(int corrected,
struct cper_sec_mem_err *mem_err);
+/*
+ * Enumerate new IP types and HWID values in AMD processors which support
+ * Scalable MCA.
+ */
+#ifdef CONFIG_X86_MCE_AMD
+enum amd_ip_types {
+ SMCA_F17H_CORE = 0, /* Core errors */
+ SMCA_DF, /* Data Fabric */
+ SMCA_UMC, /* Unified Memory Controller */
+ SMCA_PB, /* Parameter Block */
+ SMCA_PSP, /* Platform Security Processor */
+ SMCA_SMU, /* System Management Unit */
+ N_AMD_IP_TYPES
+};
+
+struct amd_hwid {
+ const char *name;
+ unsigned int hwid;
+};
+
+extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES];
+
+enum amd_core_mca_blocks {
+ SMCA_LS = 0, /* Load Store */
+ SMCA_IF, /* Instruction Fetch */
+ SMCA_L2_CACHE, /* L2 cache */
+ SMCA_DE, /* Decoder unit */
+ RES, /* Reserved */
+ SMCA_EX, /* Execution unit */
+ SMCA_FP, /* Floating Point */
+ SMCA_L3_CACHE, /* L3 cache */
+ N_CORE_MCA_BLOCKS
+};
+
+extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS];
+
+enum amd_df_mca_blocks {
+ SMCA_CS = 0, /* Coherent Slave */
+ SMCA_PIE, /* Power management, Interrupts, etc */
+ N_DF_BLOCKS
+};
+
+extern const char * const amd_df_mcablock_names[N_DF_BLOCKS];
+#endif
+
#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 1e1b07a..9d3a96c 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -3,6 +3,7 @@
#include <asm/cpu.h>
#include <linux/earlycpio.h>
+#include <linux/initrd.h>
#define native_rdmsr(msr, val1, val2) \
do { \
@@ -143,4 +144,29 @@ static inline void reload_early_microcode(void) { }
static inline bool
get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; }
#endif
+
+static inline unsigned long get_initrd_start(void)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+ return initrd_start;
+#else
+ return 0;
+#endif
+}
+
+static inline unsigned long get_initrd_start_addr(void)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+#ifdef CONFIG_X86_32
+ unsigned long *initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start);
+
+ return (unsigned long)__pa_nodebug(*initrd_start_p);
+#else
+ return get_initrd_start();
+#endif
+#else /* CONFIG_BLK_DEV_INITRD */
+ return 0;
+#endif
+}
+
#endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
index 8559b01..603417f 100644
--- a/arch/x86/include/asm/microcode_intel.h
+++ b/arch/x86/include/asm/microcode_intel.h
@@ -40,7 +40,6 @@ struct extended_sigtable {
#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
-#define DWSIZE (sizeof(u32))
#define get_totalsize(mc) \
(((struct microcode_intel *)mc)->hdr.datasize ? \
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 75a5bb6..2da46ac 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -235,10 +235,10 @@
#define HWP_PACKAGE_LEVEL_REQUEST_BIT (1<<11)
/* IA32_HWP_CAPABILITIES */
-#define HWP_HIGHEST_PERF(x) (x & 0xff)
-#define HWP_GUARANTEED_PERF(x) ((x & (0xff << 8)) >>8)
-#define HWP_MOSTEFFICIENT_PERF(x) ((x & (0xff << 16)) >>16)
-#define HWP_LOWEST_PERF(x) ((x & (0xff << 24)) >>24)
+#define HWP_HIGHEST_PERF(x) (((x) >> 0) & 0xff)
+#define HWP_GUARANTEED_PERF(x) (((x) >> 8) & 0xff)
+#define HWP_MOSTEFFICIENT_PERF(x) (((x) >> 16) & 0xff)
+#define HWP_LOWEST_PERF(x) (((x) >> 24) & 0xff)
/* IA32_HWP_REQUEST */
#define HWP_MIN_PERF(x) (x & 0xff)
@@ -269,10 +269,6 @@
#define MSR_IA32_MC0_CTL2 0x00000280
#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x))
-/* 'SMCA': AMD64 Scalable MCA */
-#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004
-#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
-
#define MSR_P6_PERFCTR0 0x000000c1
#define MSR_P6_PERFCTR1 0x000000c2
#define MSR_P6_EVNTSEL0 0x00000186
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 4625943..9ab7507 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -20,6 +20,9 @@ struct pci_sysdata {
#ifdef CONFIG_X86_64
void *iommu; /* IOMMU private data */
#endif
+#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
+ void *fwnode; /* IRQ domain for MSI assignment */
+#endif
};
extern int pci_routeirq;
@@ -32,6 +35,7 @@ extern int noioapicreroute;
static inline int pci_domain_nr(struct pci_bus *bus)
{
struct pci_sysdata *sd = bus->sysdata;
+
return sd->domain;
}
@@ -41,6 +45,17 @@ static inline int pci_proc_domain(struct pci_bus *bus)
}
#endif
+#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
+static inline void *_pci_root_bus_fwnode(struct pci_bus *bus)
+{
+ struct pci_sysdata *sd = bus->sysdata;
+
+ return sd->fwnode;
+}
+
+#define pci_root_bus_fwnode _pci_root_bus_fwnode
+#endif
+
/* Can be used to override the logic in pci_scan_bus for skipping
already-configured bus numbers - to be used for buggy BIOSes
or architectures with incomplete PCI setup by the loader */
@@ -105,9 +120,6 @@ void native_restore_msi_irqs(struct pci_dev *dev);
#include <asm/pci_64.h>
#endif
-/* implement the pci_ DMA API in terms of the generic device dma_ one */
-#include <asm-generic/pci-dma-compat.h>
-
/* generic pci stuff */
#include <asm-generic/pci.h>
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index 46873fb..d08eacd2 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -93,6 +93,8 @@ extern raw_spinlock_t pci_config_lock;
extern int (*pcibios_enable_irq)(struct pci_dev *dev);
extern void (*pcibios_disable_irq)(struct pci_dev *dev);
+extern bool mp_should_keep_irq(struct device *dev);
+
struct pci_raw_ops {
int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn,
int reg, int len, u32 *val);
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 7bcb861..5a2ed3e 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -165,6 +165,7 @@ struct x86_pmu_capability {
#define GLOBAL_STATUS_ASIF BIT_ULL(60)
#define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59)
#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(58)
+#define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(55)
/*
* IBS cpuid feature detection
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
index c57fd1e..bf8b35d 100644
--- a/arch/x86/include/asm/pmem.h
+++ b/arch/x86/include/asm/pmem.h
@@ -137,6 +137,11 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size)
arch_wb_cache_pmem(addr, size);
}
+static inline void arch_invalidate_pmem(void __pmem *addr, size_t size)
+{
+ clflush_cache_range((void __force *) addr, size);
+}
+
static inline bool __arch_has_wmb_pmem(void)
{
/*
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ecb4103..983738a 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -128,6 +128,8 @@ struct cpuinfo_x86 {
u16 booted_cores;
/* Physical processor id: */
u16 phys_proc_id;
+ /* Logical processor id: */
+ u16 logical_proc_id;
/* Core id: */
u16 cpu_core_id;
/* Compute unit id */
@@ -297,10 +299,13 @@ struct tss_struct {
*/
unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
+#ifdef CONFIG_X86_32
/*
- * Space for the temporary SYSENTER stack:
+ * Space for the temporary SYSENTER stack.
*/
+ unsigned long SYSENTER_stack_canary;
unsigned long SYSENTER_stack[64];
+#endif
} ____cacheline_aligned;
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index a4a7728..9b9b30b 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -7,12 +7,23 @@
void syscall_init(void);
+#ifdef CONFIG_X86_64
void entry_SYSCALL_64(void);
-void entry_SYSCALL_compat(void);
+#endif
+
+#ifdef CONFIG_X86_32
void entry_INT80_32(void);
-void entry_INT80_compat(void);
void entry_SYSENTER_32(void);
+void __begin_SYSENTER_singlestep_region(void);
+void __end_SYSENTER_singlestep_region(void);
+#endif
+
+#ifdef CONFIG_IA32_EMULATION
void entry_SYSENTER_compat(void);
+void __end_entry_SYSENTER_compat(void);
+void entry_SYSCALL_compat(void);
+void entry_INT80_compat(void);
+#endif
void x86_configure_nx(void);
void x86_report_nx(void);
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 0a52424..13b6cdd 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -7,7 +7,7 @@
extern char __brk_base[], __brk_limit[];
extern struct exception_table_entry __stop___ex_table[];
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64)
extern char __end_rodata_hpage_align[];
#endif
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 70bbe39..7c247e7 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -37,7 +37,7 @@ print_context_stack_bp(struct thread_info *tinfo,
/* Generic stack tracer with callbacks */
struct stacktrace_ops {
- void (*address)(void *data, unsigned long address, int reliable);
+ int (*address)(void *data, unsigned long address, int reliable);
/* On negative return stop dumping */
int (*stack)(void *data, char *name);
walk_stack_t walk_stack;
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index ff8b9a1..ca6ba36 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -78,6 +78,19 @@ int strcmp(const char *cs, const char *ct);
#define memset(s, c, n) __memset(s, c, n)
#endif
+/**
+ * memcpy_mcsafe - copy memory with indication if a machine check happened
+ *
+ * @dst: destination address
+ * @src: source address
+ * @cnt: number of bytes to copy
+ *
+ * Low level memory copy function that catches machine checks
+ *
+ * Return true for success, false for fail
+ */
+bool memcpy_mcsafe(void *dst, const void *src, size_t cnt);
+
#endif /* __KERNEL__ */
#endif /* _ASM_X86_STRING_64_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index c0778fc..8286669 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -134,10 +134,13 @@ struct thread_info {
#define _TIF_ADDR32 (1 << TIF_ADDR32)
#define _TIF_X32 (1 << TIF_X32)
-/* work to do in syscall_trace_enter() */
+/*
+ * work to do in syscall_trace_enter(). Also includes TIF_NOHZ for
+ * enter_from_user_mode()
+ */
#define _TIF_WORK_SYSCALL_ENTRY \
(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \
- _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \
+ _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
_TIF_NOHZ)
/* work to do on any return to user space */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 0bb31cb..c24b422 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -8,6 +8,54 @@
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
+static inline void __invpcid(unsigned long pcid, unsigned long addr,
+ unsigned long type)
+{
+ struct { u64 d[2]; } desc = { { pcid, addr } };
+
+ /*
+ * The memory clobber is because the whole point is to invalidate
+ * stale TLB entries and, especially if we're flushing global
+ * mappings, we don't want the compiler to reorder any subsequent
+ * memory accesses before the TLB flush.
+ *
+ * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
+ * invpcid (%rcx), %rax in long mode.
+ */
+ asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
+ : : "m" (desc), "a" (type), "c" (&desc) : "memory");
+}
+
+#define INVPCID_TYPE_INDIV_ADDR 0
+#define INVPCID_TYPE_SINGLE_CTXT 1
+#define INVPCID_TYPE_ALL_INCL_GLOBAL 2
+#define INVPCID_TYPE_ALL_NON_GLOBAL 3
+
+/* Flush all mappings for a given pcid and addr, not including globals. */
+static inline void invpcid_flush_one(unsigned long pcid,
+ unsigned long addr)
+{
+ __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invpcid_flush_single_context(unsigned long pcid)
+{
+ __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invpcid_flush_all(void)
+{
+ __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invpcid_flush_all_nonglobals(void)
+{
+ __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
+}
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
@@ -105,6 +153,15 @@ static inline void __native_flush_tlb_global(void)
{
unsigned long flags;
+ if (static_cpu_has(X86_FEATURE_INVPCID)) {
+ /*
+ * Using INVPCID is considerably faster than a pair of writes
+ * to CR4 sandwiched inside an IRQ flag save/restore.
+ */
+ invpcid_flush_all();
+ return;
+ }
+
/*
* Read-modify-write to CR4 - protect it from preemption and
* from interrupts. (Use the raw variant because this code can
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 0fb4648..7f991bd5 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -119,12 +119,23 @@ static inline void setup_node_to_cpumask_map(void) { }
extern const struct cpumask *cpu_coregroup_mask(int cpu);
+#define topology_logical_package_id(cpu) (cpu_data(cpu).logical_proc_id)
#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
#ifdef ENABLE_TOPO_DEFINES
#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
#define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))
+
+extern unsigned int __max_logical_packages;
+#define topology_max_packages() (__max_logical_packages)
+int topology_update_package_map(unsigned int apicid, unsigned int cpu);
+extern int topology_phys_to_logical_pkg(unsigned int pkg);
+#else
+#define topology_max_packages() (1)
+static inline int
+topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; }
+static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
#endif
static inline void arch_fix_phys_package_id(int num, u32 slot)
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 6d7c547..174c421 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -29,6 +29,8 @@ static inline cycles_t get_cycles(void)
return rdtsc();
}
+extern struct system_counterval_t convert_art_to_tsc(cycle_t art);
+
extern void tsc_init(void);
extern void mark_tsc_unstable(char *reason);
extern int unsynchronized_tsc(void);
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index 7956412..9b1a918 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -226,7 +226,9 @@
(~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
/* Declare the various hypercall operations. */
-#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008
+#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008
+#define HVCALL_POST_MESSAGE 0x005c
+#define HVCALL_SIGNAL_EVENT 0x005d
#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001
#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index d1daead..adb3eaf 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -16,6 +16,7 @@
#include <asm/cacheflush.h>
#include <asm/realmode.h>
+#include <linux/ftrace.h>
#include "../../realmode/rm/wakeup.h"
#include "sleep.h"
@@ -107,7 +108,13 @@ int x86_acpi_suspend_lowlevel(void)
saved_magic = 0x123456789abcdef0L;
#endif /* CONFIG_64BIT */
+ /*
+ * Pause/unpause graph tracing around do_suspend_lowlevel as it has
+ * inconsistent call/return info after it jumps to the wakeup vector.
+ */
+ pause_graph_tracing();
do_suspend_lowlevel();
+ unpause_graph_tracing();
return 0;
}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 6e85f71..0a2bb1f 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -227,19 +227,11 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
return 0;
}
-static int gart_fix_e820 __initdata = 1;
+static bool gart_fix_e820 __initdata = true;
static int __init parse_gart_mem(char *p)
{
- if (!p)
- return -EINVAL;
-
- if (!strncmp(p, "off", 3))
- gart_fix_e820 = 0;
- else if (!strncmp(p, "on", 2))
- gart_fix_e820 = 1;
-
- return 0;
+ return kstrtobool(p, &gart_fix_e820);
}
early_param("gart_fix_e820", parse_gart_mem);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 8a5cdda..531b961 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2078,6 +2078,20 @@ int generic_processor_info(int apicid, int version)
cpu = cpumask_next_zero(-1, cpu_present_mask);
/*
+ * This can happen on physical hotplug. The sanity check at boot time
+ * is done from native_smp_prepare_cpus() after num_possible_cpus() is
+ * established.
+ */
+ if (topology_update_package_map(apicid, cpu) < 0) {
+ int thiscpu = max + disabled_cpus;
+
+ pr_warning("ACPI: Package limit reached. Processor %d/0x%x ignored.\n",
+ thiscpu, apicid);
+ disabled_cpus++;
+ return -ENOSPC;
+ }
+
+ /*
* Validate version
*/
if (version == 0x0) {
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 9968f30..76f89e2 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -53,7 +53,7 @@ void flat_init_apic_ldr(void)
apic_write(APIC_LDR, val);
}
-static inline void _flat_send_IPI_mask(unsigned long mask, int vector)
+static void _flat_send_IPI_mask(unsigned long mask, int vector)
{
unsigned long flags;
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index eb45fc9..28bde88 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -18,6 +18,66 @@
#include <asm/proto.h>
#include <asm/ipi.h>
+void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
+{
+ /*
+ * Subtle. In the case of the 'never do double writes' workaround
+ * we have to lock out interrupts to be safe. As we don't care
+ * of the value read we use an atomic rmw access to avoid costly
+ * cli/sti. Otherwise we use an even cheaper single atomic write
+ * to the APIC.
+ */
+ unsigned int cfg;
+
+ /*
+ * Wait for idle.
+ */
+ __xapic_wait_icr_idle();
+
+ /*
+ * No need to touch the target chip field
+ */
+ cfg = __prepare_ICR(shortcut, vector, dest);
+
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ native_apic_mem_write(APIC_ICR, cfg);
+}
+
+/*
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
+ */
+void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest)
+{
+ unsigned long cfg;
+
+ /*
+ * Wait for idle.
+ */
+ if (unlikely(vector == NMI_VECTOR))
+ safe_apic_wait_icr_idle();
+ else
+ __xapic_wait_icr_idle();
+
+ /*
+ * prepare target chip field
+ */
+ cfg = __prepare_ICR2(mask);
+ native_apic_mem_write(APIC_ICR2, cfg);
+
+ /*
+ * program the ICR
+ */
+ cfg = __prepare_ICR(0, vector, dest);
+
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ native_apic_mem_write(APIC_ICR, cfg);
+}
+
void default_send_IPI_single_phys(int cpu, int vector)
{
unsigned long flags;
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 84a7524..5c04246 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -59,7 +59,6 @@ void common(void) {
#ifdef CONFIG_PARAVIRT
BLANK();
- OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index fdeb0ce..ecdc1d2 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -52,6 +52,11 @@ void foo(void)
DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
offsetofend(struct tss_struct, SYSENTER_stack));
+ /* Offset from cpu_tss to SYSENTER_stack */
+ OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
+ /* Size of SYSENTER_stack */
+ DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
+
#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
BLANK();
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index faa7b52..0d373d7 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -30,33 +30,11 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
-obj-$(CONFIG_PERF_EVENTS) += perf_event.o
-
-ifdef CONFIG_PERF_EVENTS
-obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o perf_event_amd_uncore.o
-ifdef CONFIG_AMD_IOMMU
-obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o
-endif
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_cstate.o
-
-obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
- perf_event_intel_uncore_snb.o \
- perf_event_intel_uncore_snbep.o \
- perf_event_intel_uncore_nhmex.o
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o
-obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o
-endif
-
-
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
obj-$(CONFIG_MICROCODE) += microcode/
-obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o
+obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index fe2f089..5026a13 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -120,7 +120,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
void (*f_vide)(void);
u64 d, d2;
- printk(KERN_INFO "AMD K6 stepping B detected - ");
+ pr_info("AMD K6 stepping B detected - ");
/*
* It looks like AMD fixed the 2.6.2 bug and improved indirect
@@ -136,10 +136,9 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
d = d2-d;
if (d > 20*K6_BUG_LOOP)
- printk(KERN_CONT
- "system stability may be impaired when more than 32 MB are used.\n");
+ pr_cont("system stability may be impaired when more than 32 MB are used.\n");
else
- printk(KERN_CONT "probably OK (after B9730xxxx).\n");
+ pr_cont("probably OK (after B9730xxxx).\n");
}
/* K6 with old style WHCR */
@@ -157,7 +156,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
wbinvd();
wrmsr(MSR_K6_WHCR, l, h);
local_irq_restore(flags);
- printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
+ pr_info("Enabling old style K6 write allocation for %d Mb\n",
mbytes);
}
return;
@@ -178,7 +177,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
wbinvd();
wrmsr(MSR_K6_WHCR, l, h);
local_irq_restore(flags);
- printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
+ pr_info("Enabling new style K6 write allocation for %d Mb\n",
mbytes);
}
@@ -205,7 +204,7 @@ static void init_amd_k7(struct cpuinfo_x86 *c)
*/
if (c->x86_model >= 6 && c->x86_model <= 10) {
if (!cpu_has(c, X86_FEATURE_XMM)) {
- printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
+ pr_info("Enabling disabled K7/SSE Support.\n");
msr_clear_bit(MSR_K7_HWCR, 15);
set_cpu_cap(c, X86_FEATURE_XMM);
}
@@ -219,9 +218,8 @@ static void init_amd_k7(struct cpuinfo_x86 *c)
if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
rdmsr(MSR_K7_CLK_CTL, l, h);
if ((l & 0xfff00000) != 0x20000000) {
- printk(KERN_INFO
- "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
- l, ((l & 0x000fffff)|0x20000000));
+ pr_info("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
+ l, ((l & 0x000fffff)|0x20000000));
wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
}
}
@@ -488,7 +486,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
unsigned long pfn = tseg >> PAGE_SHIFT;
- printk(KERN_DEBUG "tseg: %010llx\n", tseg);
+ pr_debug("tseg: %010llx\n", tseg);
if (pfn_range_is_mapped(pfn, pfn + 1))
set_memory_4k((unsigned long)__va(tseg), 1);
}
@@ -503,8 +501,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
rdmsrl(MSR_K7_HWCR, val);
if (!(val & BIT(24)))
- printk(KERN_WARNING FW_BUG "TSC doesn't count "
- "with P0 frequency!\n");
+ pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n");
}
}
diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c
index 04f0fe5..a972ac4 100644
--- a/arch/x86/kernel/cpu/bugs_64.c
+++ b/arch/x86/kernel/cpu/bugs_64.c
@@ -15,7 +15,7 @@ void __init check_bugs(void)
{
identify_boot_cpu();
#if !defined(CONFIG_SMP)
- printk(KERN_INFO "CPU: ");
+ pr_info("CPU: ");
print_cpu_info(&boot_cpu_data);
#endif
alternative_instructions();
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 6608c03..1661d8e 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -29,7 +29,7 @@ static void init_c3(struct cpuinfo_x86 *c)
rdmsr(MSR_VIA_FCR, lo, hi);
lo |= ACE_FCR; /* enable ACE unit */
wrmsr(MSR_VIA_FCR, lo, hi);
- printk(KERN_INFO "CPU: Enabled ACE h/w crypto\n");
+ pr_info("CPU: Enabled ACE h/w crypto\n");
}
/* enable RNG unit, if present and disabled */
@@ -37,7 +37,7 @@ static void init_c3(struct cpuinfo_x86 *c)
rdmsr(MSR_VIA_RNG, lo, hi);
lo |= RNG_ENABLE; /* enable RNG unit */
wrmsr(MSR_VIA_RNG, lo, hi);
- printk(KERN_INFO "CPU: Enabled h/w RNG\n");
+ pr_info("CPU: Enabled h/w RNG\n");
}
/* store Centaur Extended Feature Flags as
@@ -130,7 +130,7 @@ static void init_centaur(struct cpuinfo_x86 *c)
name = "C6";
fcr_set = ECX8|DSMC|EDCTLB|EMMX|ERETSTK;
fcr_clr = DPDC;
- printk(KERN_NOTICE "Disabling bugged TSC.\n");
+ pr_notice("Disabling bugged TSC.\n");
clear_cpu_cap(c, X86_FEATURE_TSC);
break;
case 8:
@@ -163,11 +163,11 @@ static void init_centaur(struct cpuinfo_x86 *c)
newlo = (lo|fcr_set) & (~fcr_clr);
if (newlo != lo) {
- printk(KERN_INFO "Centaur FCR was 0x%X now 0x%X\n",
+ pr_info("Centaur FCR was 0x%X now 0x%X\n",
lo, newlo);
wrmsr(MSR_IDT_FCR1, newlo, hi);
} else {
- printk(KERN_INFO "Centaur FCR is 0x%X\n", lo);
+ pr_info("Centaur FCR is 0x%X\n", lo);
}
/* Emulate MTRRs using Centaur's MCR. */
set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 079d83f..249461f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -162,6 +162,22 @@ static int __init x86_mpx_setup(char *s)
}
__setup("nompx", x86_mpx_setup);
+static int __init x86_noinvpcid_setup(char *s)
+{
+ /* noinvpcid doesn't accept parameters */
+ if (s)
+ return -EINVAL;
+
+ /* do not emit a message if the feature is not present */
+ if (!boot_cpu_has(X86_FEATURE_INVPCID))
+ return 0;
+
+ setup_clear_cpu_cap(X86_FEATURE_INVPCID);
+ pr_info("noinvpcid: INVPCID feature disabled\n");
+ return 0;
+}
+early_param("noinvpcid", x86_noinvpcid_setup);
+
#ifdef CONFIG_X86_32
static int cachesize_override = -1;
static int disable_x86_serial_nr = 1;
@@ -228,7 +244,7 @@ static void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
lo |= 0x200000;
wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
- printk(KERN_NOTICE "CPU serial number disabled.\n");
+ pr_notice("CPU serial number disabled.\n");
clear_cpu_cap(c, X86_FEATURE_PN);
/* Disabling the serial number may affect the cpuid level */
@@ -329,9 +345,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
if (!warn)
continue;
- printk(KERN_WARNING
- "CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n",
- x86_cap_flag(df->feature), df->level);
+ pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n",
+ x86_cap_flag(df->feature), df->level);
}
}
@@ -510,7 +525,7 @@ void detect_ht(struct cpuinfo_x86 *c)
smp_num_siblings = (ebx & 0xff0000) >> 16;
if (smp_num_siblings == 1) {
- printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n");
+ pr_info_once("CPU0: Hyper-Threading is disabled\n");
goto out;
}
@@ -531,10 +546,10 @@ void detect_ht(struct cpuinfo_x86 *c)
out:
if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) {
- printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
- c->phys_proc_id);
- printk(KERN_INFO "CPU: Processor Core ID: %d\n",
- c->cpu_core_id);
+ pr_info("CPU: Physical Processor ID: %d\n",
+ c->phys_proc_id);
+ pr_info("CPU: Processor Core ID: %d\n",
+ c->cpu_core_id);
printed = 1;
}
#endif
@@ -559,9 +574,8 @@ static void get_cpu_vendor(struct cpuinfo_x86 *c)
}
}
- printk_once(KERN_ERR
- "CPU: vendor_id '%s' unknown, using generic init.\n" \
- "CPU: Your system may be unstable.\n", v);
+ pr_err_once("CPU: vendor_id '%s' unknown, using generic init.\n" \
+ "CPU: Your system may be unstable.\n", v);
c->x86_vendor = X86_VENDOR_UNKNOWN;
this_cpu = &default_cpu;
@@ -760,7 +774,7 @@ void __init early_cpu_init(void)
int count = 0;
#ifdef CONFIG_PROCESSOR_SELECT
- printk(KERN_INFO "KERNEL supported cpus:\n");
+ pr_info("KERNEL supported cpus:\n");
#endif
for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
@@ -778,7 +792,7 @@ void __init early_cpu_init(void)
for (j = 0; j < 2; j++) {
if (!cpudev->c_ident[j])
continue;
- printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
+ pr_info(" %s %s\n", cpudev->c_vendor,
cpudev->c_ident[j]);
}
}
@@ -803,6 +817,31 @@ static void detect_nopl(struct cpuinfo_x86 *c)
#else
set_cpu_cap(c, X86_FEATURE_NOPL);
#endif
+
+ /*
+ * ESPFIX is a strange bug. All real CPUs have it. Paravirt
+ * systems that run Linux at CPL > 0 may or may not have the
+ * issue, but, even if they have the issue, there's absolutely
+ * nothing we can do about it because we can't use the real IRET
+ * instruction.
+ *
+ * NB: For the time being, only 32-bit kernels support
+ * X86_BUG_ESPFIX as such. 64-bit kernels directly choose
+ * whether to apply espfix using paravirt hooks. If any
+ * non-paravirt system ever shows up that does *not* have the
+ * ESPFIX issue, we can change this.
+ */
+#ifdef CONFIG_X86_32
+#ifdef CONFIG_PARAVIRT
+ do {
+ extern void native_iret(void);
+ if (pv_cpu_ops.iret == native_iret)
+ set_cpu_bug(c, X86_BUG_ESPFIX);
+ } while (0);
+#else
+ set_cpu_bug(c, X86_BUG_ESPFIX);
+#endif
+#endif
}
static void generic_identify(struct cpuinfo_x86 *c)
@@ -977,6 +1016,8 @@ static void identify_cpu(struct cpuinfo_x86 *c)
#ifdef CONFIG_NUMA
numa_add_cpu(smp_processor_id());
#endif
+ /* The boot/hotplug time assigment got cleared, restore it */
+ c->logical_proc_id = topology_phys_to_logical_pkg(c->phys_proc_id);
}
/*
@@ -1061,7 +1102,7 @@ static void __print_cpu_msr(void)
for (index = index_min; index < index_max; index++) {
if (rdmsrl_safe(index, &val))
continue;
- printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
+ pr_info(" MSR%08x: %016llx\n", index, val);
}
}
}
@@ -1100,19 +1141,19 @@ void print_cpu_info(struct cpuinfo_x86 *c)
}
if (vendor && !strstr(c->x86_model_id, vendor))
- printk(KERN_CONT "%s ", vendor);
+ pr_cont("%s ", vendor);
if (c->x86_model_id[0])
- printk(KERN_CONT "%s", c->x86_model_id);
+ pr_cont("%s", c->x86_model_id);
else
- printk(KERN_CONT "%d86", c->x86);
+ pr_cont("%d86", c->x86);
- printk(KERN_CONT " (family: 0x%x, model: 0x%x", c->x86, c->x86_model);
+ pr_cont(" (family: 0x%x, model: 0x%x", c->x86, c->x86_model);
if (c->x86_mask || c->cpuid_level >= 0)
- printk(KERN_CONT ", stepping: 0x%x)\n", c->x86_mask);
+ pr_cont(", stepping: 0x%x)\n", c->x86_mask);
else
- printk(KERN_CONT ")\n");
+ pr_cont(")\n");
print_cpu_msr(c);
}
@@ -1438,7 +1479,7 @@ void cpu_init(void)
show_ucode_info_early();
- printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+ pr_info("Initializing CPU#%d\n", cpu);
if (cpu_feature_enabled(X86_FEATURE_VME) ||
cpu_has_tsc ||
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 15e47c1..6adef9c 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -104,7 +104,7 @@ static void check_cx686_slop(struct cpuinfo_x86 *c)
local_irq_restore(flags);
if (ccr5 & 2) { /* possible wrong calibration done */
- printk(KERN_INFO "Recalibrating delay loop with SLOP bit reset\n");
+ pr_info("Recalibrating delay loop with SLOP bit reset\n");
calibrate_delay();
c->loops_per_jiffy = loops_per_jiffy;
}
@@ -116,7 +116,7 @@ static void set_cx86_reorder(void)
{
u8 ccr3;
- printk(KERN_INFO "Enable Memory access reorder on Cyrix/NSC processor.\n");
+ pr_info("Enable Memory access reorder on Cyrix/NSC processor.\n");
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
@@ -129,7 +129,7 @@ static void set_cx86_reorder(void)
static void set_cx86_memwb(void)
{
- printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
+ pr_info("Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
/* CCR2 bit 2: unlock NW bit */
setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04);
@@ -269,7 +269,7 @@ static void init_cyrix(struct cpuinfo_x86 *c)
* VSA1 we work around however.
*/
- printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n");
+ pr_info("Working around Cyrix MediaGX virtual DMA bugs.\n");
isa_dma_bridge_buggy = 2;
/* We do this before the PCI layer is running. However we
@@ -427,7 +427,7 @@ static void cyrix_identify(struct cpuinfo_x86 *c)
if (dir0 == 5 || dir0 == 3) {
unsigned char ccr3;
unsigned long flags;
- printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n");
+ pr_info("Enabling CPUID on Cyrix processor.\n");
local_irq_save(flags);
ccr3 = getCx86(CX86_CCR3);
/* enable MAPEN */
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index d820d8e..73d391a 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -56,7 +56,7 @@ detect_hypervisor_vendor(void)
}
if (max_pri)
- printk(KERN_INFO "Hypervisor detected: %s\n", x86_hyper->name);
+ pr_info("Hypervisor detected: %s\n", x86_hyper->name);
}
void init_hypervisor(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 9299e3b..1f7fdb9 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -61,7 +61,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
*/
if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 &&
c->microcode < 0x20e) {
- printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n");
+ pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n");
clear_cpu_cap(c, X86_FEATURE_PSE);
}
@@ -140,7 +140,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
- printk(KERN_INFO "Disabled fast string operations\n");
+ pr_info("Disabled fast string operations\n");
setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
setup_clear_cpu_cap(X86_FEATURE_ERMS);
}
@@ -160,6 +160,19 @@ static void early_init_intel(struct cpuinfo_x86 *c)
pr_info("Disabling PGE capability bit\n");
setup_clear_cpu_cap(X86_FEATURE_PGE);
}
+
+ if (c->cpuid_level >= 0x00000001) {
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
+ /*
+ * If HTT (EDX[28]) is set EBX[16:23] contain the number of
+ * apicids which are reserved per package. Store the resulting
+ * shift value for the package management code.
+ */
+ if (edx & (1U << 28))
+ c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
+ }
}
#ifdef CONFIG_X86_32
@@ -176,7 +189,7 @@ int ppro_with_ram_bug(void)
boot_cpu_data.x86 == 6 &&
boot_cpu_data.x86_model == 1 &&
boot_cpu_data.x86_mask < 8) {
- printk(KERN_INFO "Pentium Pro with Errata#50 detected. Taking evasive action.\n");
+ pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n");
return 1;
}
return 0;
@@ -225,7 +238,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
set_cpu_bug(c, X86_BUG_F00F);
if (!f00f_workaround_enabled) {
- printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
+ pr_notice("Intel Pentium with F0 0F bug - workaround enabled.\n");
f00f_workaround_enabled = 1;
}
}
@@ -244,7 +257,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
* Forcefully enable PAE if kernel parameter "forcepae" is present.
*/
if (forcepae) {
- printk(KERN_WARNING "PAE forced!\n");
+ pr_warn("PAE forced!\n");
set_cpu_cap(c, X86_FEATURE_PAE);
add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
}
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 341449c..de6626c 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -444,7 +444,7 @@ static ssize_t store_cache_disable(struct cacheinfo *this_leaf,
err = amd_set_l3_disable_slot(nb, cpu, slot, val);
if (err) {
if (err == -EEXIST)
- pr_warning("L3 slot %d in use/index already disabled!\n",
+ pr_warn("L3 slot %d in use/index already disabled!\n",
slot);
return err;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 4cfba43..517619e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -115,7 +115,7 @@ static int raise_local(void)
int cpu = m->extcpu;
if (m->inject_flags & MCJ_EXCEPTION) {
- printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
+ pr_info("Triggering MCE exception on CPU %d\n", cpu);
switch (context) {
case MCJ_CTX_IRQ:
/*
@@ -128,15 +128,15 @@ static int raise_local(void)
raise_exception(m, NULL);
break;
default:
- printk(KERN_INFO "Invalid MCE context\n");
+ pr_info("Invalid MCE context\n");
ret = -EINVAL;
}
- printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
+ pr_info("MCE exception done on CPU %d\n", cpu);
} else if (m->status) {
- printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
+ pr_info("Starting machine check poll CPU %d\n", cpu);
raise_poll(m);
mce_notify_irq();
- printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu);
+ pr_info("Machine check poll done on CPU %d\n", cpu);
} else
m->finished = 0;
@@ -183,8 +183,7 @@ static void raise_mce(struct mce *m)
start = jiffies;
while (!cpumask_empty(mce_inject_cpumask)) {
if (!time_before(jiffies, start + 2*HZ)) {
- printk(KERN_ERR
- "Timeout waiting for mce inject %lx\n",
+ pr_err("Timeout waiting for mce inject %lx\n",
*cpumask_bits(mce_inject_cpumask));
break;
}
@@ -241,7 +240,7 @@ static int inject_init(void)
{
if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
return -ENOMEM;
- printk(KERN_INFO "Machine check injector initialized\n");
+ pr_info("Machine check injector initialized\n");
register_mce_write_callback(mce_write);
register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0,
"mce_notify");
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 88de27b..9d656fd 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -1,5 +1,5 @@
/*
- * (c) 2005-2015 Advanced Micro Devices, Inc.
+ * (c) 2005-2016 Advanced Micro Devices, Inc.
* Your use of this code is subject to the terms and conditions of the
* GNU general public license version 2. See "COPYING" or
* http://www.gnu.org/licenses/gpl.html
@@ -71,6 +71,35 @@ static const char * const th_names[] = {
"execution_unit",
};
+/* Define HWID to IP type mappings for Scalable MCA */
+struct amd_hwid amd_hwids[] = {
+ [SMCA_F17H_CORE] = { "f17h_core", 0xB0 },
+ [SMCA_DF] = { "data_fabric", 0x2E },
+ [SMCA_UMC] = { "umc", 0x96 },
+ [SMCA_PB] = { "param_block", 0x5 },
+ [SMCA_PSP] = { "psp", 0xFF },
+ [SMCA_SMU] = { "smu", 0x1 },
+};
+EXPORT_SYMBOL_GPL(amd_hwids);
+
+const char * const amd_core_mcablock_names[] = {
+ [SMCA_LS] = "load_store",
+ [SMCA_IF] = "insn_fetch",
+ [SMCA_L2_CACHE] = "l2_cache",
+ [SMCA_DE] = "decode_unit",
+ [RES] = "",
+ [SMCA_EX] = "execution_unit",
+ [SMCA_FP] = "floating_point",
+ [SMCA_L3_CACHE] = "l3_cache",
+};
+EXPORT_SYMBOL_GPL(amd_core_mcablock_names);
+
+const char * const amd_df_mcablock_names[] = {
+ [SMCA_CS] = "coherent_slave",
+ [SMCA_PIE] = "pie",
+};
+EXPORT_SYMBOL_GPL(amd_df_mcablock_names);
+
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
@@ -172,10 +201,7 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
return 1;
};
-/*
- * Called via smp_call_function_single(), must be called with correct
- * cpu affinity.
- */
+/* Reprogram MCx_MISC MSR behind this threshold bank. */
static void threshold_restart_bank(void *_tr)
{
struct thresh_restart *tr = _tr;
@@ -275,6 +301,51 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
wrmsr(MSR_CU_DEF_ERR, low, high);
}
+static u32 get_block_address(u32 current_addr, u32 low, u32 high,
+ unsigned int bank, unsigned int block)
+{
+ u32 addr = 0, offset = 0;
+
+ if (mce_flags.smca) {
+ if (!block) {
+ addr = MSR_AMD64_SMCA_MCx_MISC(bank);
+ } else {
+ /*
+ * For SMCA enabled processors, BLKPTR field of the
+ * first MISC register (MCx_MISC0) indicates presence of
+ * additional MISC register set (MISC1-4).
+ */
+ u32 low, high;
+
+ if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
+ return addr;
+
+ if (!(low & MCI_CONFIG_MCAX))
+ return addr;
+
+ if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
+ (low & MASK_BLKPTR_LO))
+ addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
+ }
+ return addr;
+ }
+
+ /* Fall back to method we used for older processors: */
+ switch (block) {
+ case 0:
+ addr = MSR_IA32_MCx_MISC(bank);
+ break;
+ case 1:
+ offset = ((low & MASK_BLKPTR_LO) >> 21);
+ if (offset)
+ addr = MCG_XBLK_ADDR + offset;
+ break;
+ default:
+ addr = ++current_addr;
+ }
+ return addr;
+}
+
static int
prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
int offset, u32 misc_high)
@@ -337,16 +408,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
for (bank = 0; bank < mca_cfg.banks; ++bank) {
for (block = 0; block < NR_BLOCKS; ++block) {
- if (block == 0)
- address = MSR_IA32_MCx_MISC(bank);
- else if (block == 1) {
- address = (low & MASK_BLKPTR_LO) >> 21;
- if (!address)
- break;
-
- address += MCG_XBLK_ADDR;
- } else
- ++address;
+ address = get_block_address(address, low, high, bank, block);
+ if (!address)
+ break;
if (rdmsr_safe(address, &low, &high))
break;
@@ -451,16 +515,9 @@ static void amd_threshold_interrupt(void)
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
for (block = 0; block < NR_BLOCKS; ++block) {
- if (block == 0) {
- address = MSR_IA32_MCx_MISC(bank);
- } else if (block == 1) {
- address = (low & MASK_BLKPTR_LO) >> 21;
- if (!address)
- break;
- address += MCG_XBLK_ADDR;
- } else {
- ++address;
- }
+ address = get_block_address(address, low, high, bank, block);
+ if (!address)
+ break;
if (rdmsr_safe(address, &low, &high))
break;
@@ -680,16 +737,11 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
if (err)
goto out_free;
recurse:
- if (!block) {
- address = (low & MASK_BLKPTR_LO) >> 21;
- if (!address)
- return 0;
- address += MCG_XBLK_ADDR;
- } else {
- ++address;
- }
+ address = get_block_address(address, low, high, bank, ++block);
+ if (!address)
+ return 0;
- err = allocate_threshold_blocks(cpu, bank, ++block, address);
+ err = allocate_threshold_blocks(cpu, bank, block, address);
if (err)
goto out_free;
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 12402e1..2a0717b 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -26,14 +26,12 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
- printk(KERN_EMERG
- "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
- smp_processor_id(), loaddr, lotype);
+ pr_emerg("CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
+ smp_processor_id(), loaddr, lotype);
if (lotype & (1<<5)) {
- printk(KERN_EMERG
- "CPU#%d: Possible thermal failure (CPU on fire ?).\n",
- smp_processor_id());
+ pr_emerg("CPU#%d: Possible thermal failure (CPU on fire ?).\n",
+ smp_processor_id());
}
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
@@ -61,12 +59,10 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
/* Read registers before enabling: */
rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
- printk(KERN_INFO
- "Intel old style machine check architecture supported.\n");
+ pr_info("Intel old style machine check architecture supported.\n");
/* Enable MCE: */
cr4_set_bits(X86_CR4_MCE);
- printk(KERN_INFO
- "Intel old style machine check reporting enabled on CPU#%d.\n",
- smp_processor_id());
+ pr_info("Intel old style machine check reporting enabled on CPU#%d.\n",
+ smp_processor_id());
}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 2c5aaf8..0b445c2 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -190,7 +190,7 @@ static int therm_throt_process(bool new_event, int event, int level)
/* if we just entered the thermal event */
if (new_event) {
if (event == THERMAL_THROTTLING_EVENT)
- printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
+ pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
this_cpu,
level == CORE_LEVEL ? "Core" : "Package",
state->count);
@@ -198,8 +198,7 @@ static int therm_throt_process(bool new_event, int event, int level)
}
if (old_event) {
if (event == THERMAL_THROTTLING_EVENT)
- printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
- this_cpu,
+ pr_info("CPU%d: %s temperature/speed normal\n", this_cpu,
level == CORE_LEVEL ? "Core" : "Package");
return 1;
}
@@ -417,8 +416,8 @@ static void intel_thermal_interrupt(void)
static void unexpected_thermal_interrupt(void)
{
- printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
- smp_processor_id());
+ pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
+ smp_processor_id());
}
static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
@@ -499,7 +498,7 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
if (system_state == SYSTEM_BOOTING)
- printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu);
+ pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu);
return;
}
@@ -557,8 +556,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
l = apic_read(APIC_LVTTHMR);
apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
- printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n",
- tm2 ? "TM2" : "TM1");
+ pr_info_once("CPU0: Thermal monitoring enabled (%s)\n",
+ tm2 ? "TM2" : "TM1");
/* enable thermal throttle processing */
atomic_set(&therm_throt_en, 1);
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index 7245980..fcf9ae9 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -12,8 +12,8 @@
static void default_threshold_interrupt(void)
{
- printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
- THRESHOLD_APIC_VECTOR);
+ pr_err("Unexpected threshold interrupt at vector %x\n",
+ THRESHOLD_APIC_VECTOR);
}
void (*mce_threshold_vector)(void) = default_threshold_interrupt;
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 01dd870..c6a722e 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -17,7 +17,7 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code)
{
ist_enter(regs);
- printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
+ pr_emerg("CPU0: Machine Check Exception.\n");
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
ist_exit(regs);
@@ -39,6 +39,5 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c)
cr4_set_bits(X86_CR4_MCE);
- printk(KERN_INFO
- "Winchip machine check reporting enabled on CPU#0.\n");
+ pr_info("Winchip machine check reporting enabled on CPU#0.\n");
}
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 2233f8a..8581963 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -431,10 +431,6 @@ int __init save_microcode_in_initrd_amd(void)
else
container = cont_va;
- if (ucode_new_rev)
- pr_info("microcode: updated early to new patch_level=0x%08x\n",
- ucode_new_rev);
-
eax = cpuid_eax(0x00000001);
eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
@@ -469,8 +465,7 @@ void reload_ucode_amd(void)
if (mc && rev < mc->hdr.patch_id) {
if (!__apply_microcode_amd(mc)) {
ucode_new_rev = mc->hdr.patch_id;
- pr_info("microcode: reload patch_level=0x%08x\n",
- ucode_new_rev);
+ pr_info("reload patch_level=0x%08x\n", ucode_new_rev);
}
}
}
@@ -793,15 +788,13 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover)
return -EINVAL;
}
- patch->data = kzalloc(patch_size, GFP_KERNEL);
+ patch->data = kmemdup(fw + SECTION_HDR_SIZE, patch_size, GFP_KERNEL);
if (!patch->data) {
pr_err("Patch data allocation failure.\n");
kfree(patch);
return -EINVAL;
}
- /* All looks ok, copy patch... */
- memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size);
INIT_LIST_HEAD(&patch->plist);
patch->patch_id = mc_hdr->patch_id;
patch->equiv_cpu = proc_id;
@@ -953,10 +946,14 @@ struct microcode_ops * __init init_amd_microcode(void)
struct cpuinfo_x86 *c = &boot_cpu_data;
if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
- pr_warning("AMD CPU family 0x%x not supported\n", c->x86);
+ pr_warn("AMD CPU family 0x%x not supported\n", c->x86);
return NULL;
}
+ if (ucode_new_rev)
+ pr_info_once("microcode updated early to new patch_level=0x%08x\n",
+ ucode_new_rev);
+
return &microcode_amd_ops;
}
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index faec712..ac360bf 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -43,16 +43,8 @@
#define MICROCODE_VERSION "2.01"
static struct microcode_ops *microcode_ops;
-
static bool dis_ucode_ldr;
-static int __init disable_loader(char *str)
-{
- dis_ucode_ldr = true;
- return 1;
-}
-__setup("dis_ucode_ldr", disable_loader);
-
/*
* Synchronization.
*
@@ -81,15 +73,16 @@ struct cpu_info_ctx {
static bool __init check_loader_disabled_bsp(void)
{
+ static const char *__dis_opt_str = "dis_ucode_ldr";
+
#ifdef CONFIG_X86_32
const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
- const char *opt = "dis_ucode_ldr";
- const char *option = (const char *)__pa_nodebug(opt);
+ const char *option = (const char *)__pa_nodebug(__dis_opt_str);
bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr);
#else /* CONFIG_X86_64 */
const char *cmdline = boot_command_line;
- const char *option = "dis_ucode_ldr";
+ const char *option = __dis_opt_str;
bool *res = &dis_ucode_ldr;
#endif
@@ -479,7 +472,7 @@ static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
enum ucode_state ustate;
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- if (uci && uci->valid)
+ if (uci->valid)
return UCODE_OK;
if (collect_cpu_info(cpu))
@@ -630,7 +623,7 @@ int __init microcode_init(void)
struct cpuinfo_x86 *c = &boot_cpu_data;
int error;
- if (paravirt_enabled() || dis_ucode_ldr)
+ if (dis_ucode_ldr)
return -EINVAL;
if (c->x86_vendor == X86_VENDOR_INTEL)
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index ee81c54..cbb3cf0 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -39,9 +39,15 @@
#include <asm/setup.h>
#include <asm/msr.h>
-static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
+/*
+ * Temporary microcode blobs pointers storage. We note here the pointers to
+ * microcode blobs we've got from whatever storage (detached initrd, builtin).
+ * Later on, we put those into final storage mc_saved_data.mc_saved.
+ */
+static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT];
+
static struct mc_saved_data {
- unsigned int mc_saved_count;
+ unsigned int num_saved;
struct microcode_intel **mc_saved;
} mc_saved_data;
@@ -78,53 +84,50 @@ load_microcode_early(struct microcode_intel **saved,
}
static inline void
-copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd,
- unsigned long off, int num_saved)
+copy_ptrs(struct microcode_intel **mc_saved, unsigned long *mc_ptrs,
+ unsigned long off, int num_saved)
{
int i;
for (i = 0; i < num_saved; i++)
- mc_saved[i] = (struct microcode_intel *)(initrd[i] + off);
+ mc_saved[i] = (struct microcode_intel *)(mc_ptrs[i] + off);
}
#ifdef CONFIG_X86_32
static void
-microcode_phys(struct microcode_intel **mc_saved_tmp,
- struct mc_saved_data *mc_saved_data)
+microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs)
{
int i;
struct microcode_intel ***mc_saved;
- mc_saved = (struct microcode_intel ***)
- __pa_nodebug(&mc_saved_data->mc_saved);
- for (i = 0; i < mc_saved_data->mc_saved_count; i++) {
+ mc_saved = (struct microcode_intel ***)__pa_nodebug(&mcs->mc_saved);
+
+ for (i = 0; i < mcs->num_saved; i++) {
struct microcode_intel *p;
- p = *(struct microcode_intel **)
- __pa_nodebug(mc_saved_data->mc_saved + i);
+ p = *(struct microcode_intel **)__pa_nodebug(mcs->mc_saved + i);
mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p);
}
}
#endif
static enum ucode_state
-load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
- unsigned long initrd_start, struct ucode_cpu_info *uci)
+load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
+ unsigned long offset, struct ucode_cpu_info *uci)
{
struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
- unsigned int count = mc_saved_data->mc_saved_count;
+ unsigned int count = mcs->num_saved;
- if (!mc_saved_data->mc_saved) {
- copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count);
+ if (!mcs->mc_saved) {
+ copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count);
return load_microcode_early(mc_saved_tmp, count, uci);
} else {
#ifdef CONFIG_X86_32
- microcode_phys(mc_saved_tmp, mc_saved_data);
+ microcode_phys(mc_saved_tmp, mcs);
return load_microcode_early(mc_saved_tmp, count, uci);
#else
- return load_microcode_early(mc_saved_data->mc_saved,
- count, uci);
+ return load_microcode_early(mcs->mc_saved, count, uci);
#endif
}
}
@@ -175,25 +178,25 @@ matching_model_microcode(struct microcode_header_intel *mc_header,
}
static int
-save_microcode(struct mc_saved_data *mc_saved_data,
+save_microcode(struct mc_saved_data *mcs,
struct microcode_intel **mc_saved_src,
- unsigned int mc_saved_count)
+ unsigned int num_saved)
{
int i, j;
struct microcode_intel **saved_ptr;
int ret;
- if (!mc_saved_count)
+ if (!num_saved)
return -EINVAL;
/*
* Copy new microcode data.
*/
- saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL);
+ saved_ptr = kcalloc(num_saved, sizeof(struct microcode_intel *), GFP_KERNEL);
if (!saved_ptr)
return -ENOMEM;
- for (i = 0; i < mc_saved_count; i++) {
+ for (i = 0; i < num_saved; i++) {
struct microcode_header_intel *mc_hdr;
struct microcode_intel *mc;
unsigned long size;
@@ -207,20 +210,18 @@ save_microcode(struct mc_saved_data *mc_saved_data,
mc_hdr = &mc->hdr;
size = get_totalsize(mc_hdr);
- saved_ptr[i] = kmalloc(size, GFP_KERNEL);
+ saved_ptr[i] = kmemdup(mc, size, GFP_KERNEL);
if (!saved_ptr[i]) {
ret = -ENOMEM;
goto err;
}
-
- memcpy(saved_ptr[i], mc, size);
}
/*
* Point to newly saved microcode.
*/
- mc_saved_data->mc_saved = saved_ptr;
- mc_saved_data->mc_saved_count = mc_saved_count;
+ mcs->mc_saved = saved_ptr;
+ mcs->num_saved = num_saved;
return 0;
@@ -284,22 +285,20 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved,
* BSP can stay in the platform.
*/
static enum ucode_state __init
-get_matching_model_microcode(int cpu, unsigned long start,
- void *data, size_t size,
- struct mc_saved_data *mc_saved_data,
- unsigned long *mc_saved_in_initrd,
+get_matching_model_microcode(unsigned long start, void *data, size_t size,
+ struct mc_saved_data *mcs, unsigned long *mc_ptrs,
struct ucode_cpu_info *uci)
{
- u8 *ucode_ptr = data;
- unsigned int leftover = size;
+ struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+ struct microcode_header_intel *mc_header;
+ unsigned int num_saved = mcs->num_saved;
enum ucode_state state = UCODE_OK;
+ unsigned int leftover = size;
+ u8 *ucode_ptr = data;
unsigned int mc_size;
- struct microcode_header_intel *mc_header;
- struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
- unsigned int mc_saved_count = mc_saved_data->mc_saved_count;
int i;
- while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) {
+ while (leftover && num_saved < ARRAY_SIZE(mc_saved_tmp)) {
if (leftover < sizeof(mc_header))
break;
@@ -318,32 +317,31 @@ get_matching_model_microcode(int cpu, unsigned long start,
* the platform, we need to find and save microcode patches
* with the same family and model as the BSP.
*/
- if (matching_model_microcode(mc_header, uci->cpu_sig.sig) !=
- UCODE_OK) {
+ if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != UCODE_OK) {
ucode_ptr += mc_size;
continue;
}
- mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count);
+ num_saved = _save_mc(mc_saved_tmp, ucode_ptr, num_saved);
ucode_ptr += mc_size;
}
if (leftover) {
state = UCODE_ERROR;
- goto out;
+ return state;
}
- if (mc_saved_count == 0) {
+ if (!num_saved) {
state = UCODE_NFOUND;
- goto out;
+ return state;
}
- for (i = 0; i < mc_saved_count; i++)
- mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start;
+ for (i = 0; i < num_saved; i++)
+ mc_ptrs[i] = (unsigned long)mc_saved_tmp[i] - start;
+
+ mcs->num_saved = num_saved;
- mc_saved_data->mc_saved_count = mc_saved_count;
-out:
return state;
}
@@ -373,7 +371,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
csig.pf = 1 << ((val[1] >> 18) & 7);
}
- native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+ native_wrmsrl(MSR_IA32_UCODE_REV, 0);
/* As documented in the SDM: Do a CPUID 1 here */
sync_core();
@@ -396,11 +394,11 @@ static void show_saved_mc(void)
unsigned int sig, pf, rev, total_size, data_size, date;
struct ucode_cpu_info uci;
- if (mc_saved_data.mc_saved_count == 0) {
+ if (!mc_saved_data.num_saved) {
pr_debug("no microcode data saved.\n");
return;
}
- pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count);
+ pr_debug("Total microcode saved: %d\n", mc_saved_data.num_saved);
collect_cpu_info_early(&uci);
@@ -409,7 +407,7 @@ static void show_saved_mc(void)
rev = uci.cpu_sig.rev;
pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
- for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
+ for (i = 0; i < mc_saved_data.num_saved; i++) {
struct microcode_header_intel *mc_saved_header;
struct extended_sigtable *ext_header;
int ext_sigcount;
@@ -465,7 +463,7 @@ int save_mc_for_early(u8 *mc)
{
struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
unsigned int mc_saved_count_init;
- unsigned int mc_saved_count;
+ unsigned int num_saved;
struct microcode_intel **mc_saved;
int ret = 0;
int i;
@@ -476,23 +474,23 @@ int save_mc_for_early(u8 *mc)
*/
mutex_lock(&x86_cpu_microcode_mutex);
- mc_saved_count_init = mc_saved_data.mc_saved_count;
- mc_saved_count = mc_saved_data.mc_saved_count;
+ mc_saved_count_init = mc_saved_data.num_saved;
+ num_saved = mc_saved_data.num_saved;
mc_saved = mc_saved_data.mc_saved;
- if (mc_saved && mc_saved_count)
+ if (mc_saved && num_saved)
memcpy(mc_saved_tmp, mc_saved,
- mc_saved_count * sizeof(struct microcode_intel *));
+ num_saved * sizeof(struct microcode_intel *));
/*
* Save the microcode patch mc in mc_save_tmp structure if it's a newer
* version.
*/
- mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count);
+ num_saved = _save_mc(mc_saved_tmp, mc, num_saved);
/*
* Save the mc_save_tmp in global mc_saved_data.
*/
- ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count);
+ ret = save_microcode(&mc_saved_data, mc_saved_tmp, num_saved);
if (ret) {
pr_err("Cannot save microcode patch.\n");
goto out;
@@ -536,7 +534,7 @@ static bool __init load_builtin_intel_microcode(struct cpio_data *cp)
static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
static __init enum ucode_state
-scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
+scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
unsigned long start, unsigned long size,
struct ucode_cpu_info *uci)
{
@@ -551,14 +549,18 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
cd.data = NULL;
cd.size = 0;
- cd = find_cpio_data(p, (void *)start, size, &offset);
- if (!cd.data) {
+ /* try built-in microcode if no initrd */
+ if (!size) {
if (!load_builtin_intel_microcode(&cd))
return UCODE_ERROR;
+ } else {
+ cd = find_cpio_data(p, (void *)start, size, &offset);
+ if (!cd.data)
+ return UCODE_ERROR;
}
- return get_matching_model_microcode(0, start, cd.data, cd.size,
- mc_saved_data, initrd, uci);
+ return get_matching_model_microcode(start, cd.data, cd.size,
+ mcs, mc_ptrs, uci);
}
/*
@@ -567,14 +569,11 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
static void
print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
{
- int cpu = smp_processor_id();
-
- pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
- cpu,
- uci->cpu_sig.rev,
- date & 0xffff,
- date >> 24,
- (date >> 16) & 0xff);
+ pr_info_once("microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
+ uci->cpu_sig.rev,
+ date & 0xffff,
+ date >> 24,
+ (date >> 16) & 0xff);
}
#ifdef CONFIG_X86_32
@@ -603,19 +602,19 @@ void show_ucode_info_early(void)
*/
static void print_ucode(struct ucode_cpu_info *uci)
{
- struct microcode_intel *mc_intel;
+ struct microcode_intel *mc;
int *delay_ucode_info_p;
int *current_mc_date_p;
- mc_intel = uci->mc;
- if (mc_intel == NULL)
+ mc = uci->mc;
+ if (!mc)
return;
delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info);
current_mc_date_p = (int *)__pa_nodebug(&current_mc_date);
*delay_ucode_info_p = 1;
- *current_mc_date_p = mc_intel->hdr.date;
+ *current_mc_date_p = mc->hdr.date;
}
#else
@@ -630,37 +629,35 @@ static inline void flush_tlb_early(void)
static inline void print_ucode(struct ucode_cpu_info *uci)
{
- struct microcode_intel *mc_intel;
+ struct microcode_intel *mc;
- mc_intel = uci->mc;
- if (mc_intel == NULL)
+ mc = uci->mc;
+ if (!mc)
return;
- print_ucode_info(uci, mc_intel->hdr.date);
+ print_ucode_info(uci, mc->hdr.date);
}
#endif
static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
{
- struct microcode_intel *mc_intel;
+ struct microcode_intel *mc;
unsigned int val[2];
- mc_intel = uci->mc;
- if (mc_intel == NULL)
+ mc = uci->mc;
+ if (!mc)
return 0;
/* write microcode via MSR 0x79 */
- native_wrmsr(MSR_IA32_UCODE_WRITE,
- (unsigned long) mc_intel->bits,
- (unsigned long) mc_intel->bits >> 16 >> 16);
- native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+ native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
+ native_wrmsrl(MSR_IA32_UCODE_REV, 0);
/* As documented in the SDM: Do a CPUID 1 here */
sync_core();
/* get the current revision from MSR 0x8B */
native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
- if (val[1] != mc_intel->hdr.rev)
+ if (val[1] != mc->hdr.rev)
return -1;
#ifdef CONFIG_X86_64
@@ -672,25 +669,26 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
if (early)
print_ucode(uci);
else
- print_ucode_info(uci, mc_intel->hdr.date);
+ print_ucode_info(uci, mc->hdr.date);
return 0;
}
/*
* This function converts microcode patch offsets previously stored in
- * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data.
+ * mc_tmp_ptrs to pointers and stores the pointers in mc_saved_data.
*/
int __init save_microcode_in_initrd_intel(void)
{
- unsigned int count = mc_saved_data.mc_saved_count;
+ unsigned int count = mc_saved_data.num_saved;
struct microcode_intel *mc_saved[MAX_UCODE_COUNT];
int ret = 0;
- if (count == 0)
+ if (!count)
return ret;
- copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count);
+ copy_ptrs(mc_saved, mc_tmp_ptrs, get_initrd_start(), count);
+
ret = save_microcode(&mc_saved_data, mc_saved, count);
if (ret)
pr_err("Cannot save microcode patches from initrd.\n");
@@ -701,8 +699,7 @@ int __init save_microcode_in_initrd_intel(void)
}
static void __init
-_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
- unsigned long *initrd,
+_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
unsigned long start, unsigned long size)
{
struct ucode_cpu_info uci;
@@ -710,11 +707,11 @@ _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
collect_cpu_info_early(&uci);
- ret = scan_microcode(mc_saved_data, initrd, start, size, &uci);
+ ret = scan_microcode(mcs, mc_ptrs, start, size, &uci);
if (ret != UCODE_OK)
return;
- ret = load_microcode(mc_saved_data, initrd, start, &uci);
+ ret = load_microcode(mcs, mc_ptrs, start, &uci);
if (ret != UCODE_OK)
return;
@@ -728,53 +725,49 @@ void __init load_ucode_intel_bsp(void)
struct boot_params *p;
p = (struct boot_params *)__pa_nodebug(&boot_params);
- start = p->hdr.ramdisk_image;
size = p->hdr.ramdisk_size;
- _load_ucode_intel_bsp(
- (struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
- (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
- start, size);
+ /*
+ * Set start only if we have an initrd image. We cannot use initrd_start
+ * because it is not set that early yet.
+ */
+ start = (size ? p->hdr.ramdisk_image : 0);
+
+ _load_ucode_intel_bsp((struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
+ (unsigned long *)__pa_nodebug(&mc_tmp_ptrs),
+ start, size);
#else
- start = boot_params.hdr.ramdisk_image + PAGE_OFFSET;
size = boot_params.hdr.ramdisk_size;
+ start = (size ? boot_params.hdr.ramdisk_image + PAGE_OFFSET : 0);
- _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size);
+ _load_ucode_intel_bsp(&mc_saved_data, mc_tmp_ptrs, start, size);
#endif
}
void load_ucode_intel_ap(void)
{
- struct mc_saved_data *mc_saved_data_p;
+ unsigned long *mcs_tmp_p;
+ struct mc_saved_data *mcs_p;
struct ucode_cpu_info uci;
- unsigned long *mc_saved_in_initrd_p;
- unsigned long initrd_start_addr;
enum ucode_state ret;
#ifdef CONFIG_X86_32
- unsigned long *initrd_start_p;
- mc_saved_in_initrd_p =
- (unsigned long *)__pa_nodebug(mc_saved_in_initrd);
- mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
- initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start);
- initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p);
+ mcs_tmp_p = (unsigned long *)__pa_nodebug(mc_tmp_ptrs);
+ mcs_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
#else
- mc_saved_data_p = &mc_saved_data;
- mc_saved_in_initrd_p = mc_saved_in_initrd;
- initrd_start_addr = initrd_start;
+ mcs_tmp_p = mc_tmp_ptrs;
+ mcs_p = &mc_saved_data;
#endif
/*
* If there is no valid ucode previously saved in memory, no need to
* update ucode on this AP.
*/
- if (mc_saved_data_p->mc_saved_count == 0)
+ if (!mcs_p->num_saved)
return;
collect_cpu_info_early(&uci);
- ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
- initrd_start_addr, &uci);
-
+ ret = load_microcode(mcs_p, mcs_tmp_p, get_initrd_start_addr(), &uci);
if (ret != UCODE_OK)
return;
@@ -786,13 +779,13 @@ void reload_ucode_intel(void)
struct ucode_cpu_info uci;
enum ucode_state ret;
- if (!mc_saved_data.mc_saved_count)
+ if (!mc_saved_data.num_saved)
return;
collect_cpu_info_early(&uci);
ret = load_microcode_early(mc_saved_data.mc_saved,
- mc_saved_data.mc_saved_count, &uci);
+ mc_saved_data.num_saved, &uci);
if (ret != UCODE_OK)
return;
@@ -825,7 +818,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
* return 0 - no update found
* return 1 - found update
*/
-static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
+static int get_matching_mc(struct microcode_intel *mc, int cpu)
{
struct cpu_signature cpu_sig;
unsigned int csig, cpf, crev;
@@ -836,39 +829,36 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
cpf = cpu_sig.pf;
crev = cpu_sig.rev;
- return has_newer_microcode(mc_intel, csig, cpf, crev);
+ return has_newer_microcode(mc, csig, cpf, crev);
}
static int apply_microcode_intel(int cpu)
{
- struct microcode_intel *mc_intel;
+ struct microcode_intel *mc;
struct ucode_cpu_info *uci;
+ struct cpuinfo_x86 *c;
unsigned int val[2];
- int cpu_num = raw_smp_processor_id();
- struct cpuinfo_x86 *c = &cpu_data(cpu_num);
-
- uci = ucode_cpu_info + cpu;
- mc_intel = uci->mc;
/* We should bind the task to the CPU */
- BUG_ON(cpu_num != cpu);
+ if (WARN_ON(raw_smp_processor_id() != cpu))
+ return -1;
- if (mc_intel == NULL)
+ uci = ucode_cpu_info + cpu;
+ mc = uci->mc;
+ if (!mc)
return 0;
/*
* Microcode on this CPU could be updated earlier. Only apply the
- * microcode patch in mc_intel when it is newer than the one on this
+ * microcode patch in mc when it is newer than the one on this
* CPU.
*/
- if (get_matching_mc(mc_intel, cpu) == 0)
+ if (!get_matching_mc(mc, cpu))
return 0;
/* write microcode via MSR 0x79 */
- wrmsr(MSR_IA32_UCODE_WRITE,
- (unsigned long) mc_intel->bits,
- (unsigned long) mc_intel->bits >> 16 >> 16);
- wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+ wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
+ wrmsrl(MSR_IA32_UCODE_REV, 0);
/* As documented in the SDM: Do a CPUID 1 here */
sync_core();
@@ -876,16 +866,19 @@ static int apply_microcode_intel(int cpu)
/* get the current revision from MSR 0x8B */
rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
- if (val[1] != mc_intel->hdr.rev) {
+ if (val[1] != mc->hdr.rev) {
pr_err("CPU%d update to revision 0x%x failed\n",
- cpu_num, mc_intel->hdr.rev);
+ cpu, mc->hdr.rev);
return -1;
}
+
pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n",
- cpu_num, val[1],
- mc_intel->hdr.date & 0xffff,
- mc_intel->hdr.date >> 24,
- (mc_intel->hdr.date >> 16) & 0xff);
+ cpu, val[1],
+ mc->hdr.date & 0xffff,
+ mc->hdr.date >> 24,
+ (mc->hdr.date >> 16) & 0xff);
+
+ c = &cpu_data(cpu);
uci->cpu_sig.rev = val[1];
c->microcode = val[1];
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
index b96896b..2ce1a7d 100644
--- a/arch/x86/kernel/cpu/microcode/intel_lib.c
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
@@ -49,7 +49,7 @@ int microcode_sanity_check(void *mc, int print_err)
unsigned long total_size, data_size, ext_table_size;
struct microcode_header_intel *mc_header = mc;
struct extended_sigtable *ext_header = NULL;
- int sum, orig_sum, ext_sigcount = 0, i;
+ u32 sum, orig_sum, ext_sigcount = 0, i;
struct extended_signature *ext_sig;
total_size = get_totalsize(mc_header);
@@ -57,69 +57,85 @@ int microcode_sanity_check(void *mc, int print_err)
if (data_size + MC_HEADER_SIZE > total_size) {
if (print_err)
- pr_err("error! Bad data size in microcode data file\n");
+ pr_err("Error: bad microcode data file size.\n");
return -EINVAL;
}
if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
if (print_err)
- pr_err("error! Unknown microcode update format\n");
+ pr_err("Error: invalid/unknown microcode update format.\n");
return -EINVAL;
}
+
ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
if (ext_table_size) {
+ u32 ext_table_sum = 0;
+ u32 *ext_tablep;
+
if ((ext_table_size < EXT_HEADER_SIZE)
|| ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
if (print_err)
- pr_err("error! Small exttable size in microcode data file\n");
+ pr_err("Error: truncated extended signature table.\n");
return -EINVAL;
}
+
ext_header = mc + MC_HEADER_SIZE + data_size;
if (ext_table_size != exttable_size(ext_header)) {
if (print_err)
- pr_err("error! Bad exttable size in microcode data file\n");
+ pr_err("Error: extended signature table size mismatch.\n");
return -EFAULT;
}
+
ext_sigcount = ext_header->count;
- }
- /* check extended table checksum */
- if (ext_table_size) {
- int ext_table_sum = 0;
- int *ext_tablep = (int *)ext_header;
+ /*
+ * Check extended table checksum: the sum of all dwords that
+ * comprise a valid table must be 0.
+ */
+ ext_tablep = (u32 *)ext_header;
- i = ext_table_size / DWSIZE;
+ i = ext_table_size / sizeof(u32);
while (i--)
ext_table_sum += ext_tablep[i];
+
if (ext_table_sum) {
if (print_err)
- pr_warn("aborting, bad extended signature table checksum\n");
+ pr_warn("Bad extended signature table checksum, aborting.\n");
return -EINVAL;
}
}
- /* calculate the checksum */
+ /*
+ * Calculate the checksum of update data and header. The checksum of
+ * valid update data and header including the extended signature table
+ * must be 0.
+ */
orig_sum = 0;
- i = (MC_HEADER_SIZE + data_size) / DWSIZE;
+ i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
while (i--)
- orig_sum += ((int *)mc)[i];
+ orig_sum += ((u32 *)mc)[i];
+
if (orig_sum) {
if (print_err)
- pr_err("aborting, bad checksum\n");
+ pr_err("Bad microcode data checksum, aborting.\n");
return -EINVAL;
}
+
if (!ext_table_size)
return 0;
- /* check extended signature checksum */
+
+ /*
+ * Check extended signature checksum: 0 => valid.
+ */
for (i = 0; i < ext_sigcount; i++) {
ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
EXT_SIGNATURE_SIZE * i;
- sum = orig_sum
- - (mc_header->sig + mc_header->pf + mc_header->cksum)
- + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+
+ sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
if (sum) {
if (print_err)
- pr_err("aborting, bad checksum\n");
+ pr_err("Bad extended signature checksum, aborting.\n");
return -EINVAL;
}
}
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 20e242e..4e7c693 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -161,8 +161,8 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
- printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
- ms_hyperv.features, ms_hyperv.hints);
+ pr_info("HyperV: features 0x%x, hints 0x%x\n",
+ ms_hyperv.features, ms_hyperv.hints);
#ifdef CONFIG_X86_LOCAL_APIC
if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) {
@@ -174,8 +174,8 @@ static void __init ms_hyperv_init_platform(void)
rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
lapic_timer_frequency = hv_lapic_frequency;
- printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n",
- lapic_timer_frequency);
+ pr_info("HyperV: LAPIC Timer Frequency: %#x\n",
+ lapic_timer_frequency);
}
#endif
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
index 316fe3e..3d68993 100644
--- a/arch/x86/kernel/cpu/mtrr/centaur.c
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -103,7 +103,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t
*/
if (type != MTRR_TYPE_WRCOMB &&
(centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) {
- pr_warning("mtrr: only write-combining%s supported\n",
+ pr_warn("mtrr: only write-combining%s supported\n",
centaur_mcr_type ? " and uncacheable are" : " is");
return -EINVAL;
}
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 0d98503..31e951c 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -57,9 +57,9 @@ static int __initdata nr_range;
static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
static int __initdata debug_print;
-#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
+#define Dprintk(x...) do { if (debug_print) pr_debug(x); } while (0)
-#define BIOS_BUG_MSG KERN_WARNING \
+#define BIOS_BUG_MSG \
"WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
static int __init
@@ -81,9 +81,9 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
base, base + size);
}
if (debug_print) {
- printk(KERN_DEBUG "After WB checking\n");
+ pr_debug("After WB checking\n");
for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+ pr_debug("MTRR MAP PFN: %016llx - %016llx\n",
range[i].start, range[i].end);
}
@@ -101,7 +101,7 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
(mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
/* Var MTRR contains UC entry below 1M? Skip it: */
- printk(BIOS_BUG_MSG, i);
+ pr_warn(BIOS_BUG_MSG, i);
if (base + size <= (1<<(20-PAGE_SHIFT)))
continue;
size -= (1<<(20-PAGE_SHIFT)) - base;
@@ -114,11 +114,11 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
extra_remove_base + extra_remove_size);
if (debug_print) {
- printk(KERN_DEBUG "After UC checking\n");
+ pr_debug("After UC checking\n");
for (i = 0; i < RANGE_NUM; i++) {
if (!range[i].end)
continue;
- printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+ pr_debug("MTRR MAP PFN: %016llx - %016llx\n",
range[i].start, range[i].end);
}
}
@@ -126,9 +126,9 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
/* sort the ranges */
nr_range = clean_sort_range(range, RANGE_NUM);
if (debug_print) {
- printk(KERN_DEBUG "After sorting\n");
+ pr_debug("After sorting\n");
for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+ pr_debug("MTRR MAP PFN: %016llx - %016llx\n",
range[i].start, range[i].end);
}
@@ -544,7 +544,7 @@ static void __init print_out_mtrr_range_state(void)
start_base = to_size_factor(start_base, &start_factor),
type = range_state[i].type;
- printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+ pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
i, start_base, start_factor,
size_base, size_factor,
(type == MTRR_TYPE_UNCACHABLE) ? "UC" :
@@ -713,7 +713,7 @@ int __init mtrr_cleanup(unsigned address_bits)
return 0;
/* Print original var MTRRs at first, for debugging: */
- printk(KERN_DEBUG "original variable MTRRs\n");
+ pr_debug("original variable MTRRs\n");
print_out_mtrr_range_state();
memset(range, 0, sizeof(range));
@@ -733,7 +733,7 @@ int __init mtrr_cleanup(unsigned address_bits)
x_remove_base, x_remove_size);
range_sums = sum_ranges(range, nr_range);
- printk(KERN_INFO "total RAM covered: %ldM\n",
+ pr_info("total RAM covered: %ldM\n",
range_sums >> (20 - PAGE_SHIFT));
if (mtrr_chunk_size && mtrr_gran_size) {
@@ -745,12 +745,11 @@ int __init mtrr_cleanup(unsigned address_bits)
if (!result[i].bad) {
set_var_mtrr_all(address_bits);
- printk(KERN_DEBUG "New variable MTRRs\n");
+ pr_debug("New variable MTRRs\n");
print_out_mtrr_range_state();
return 1;
}
- printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
- "will find optimal one\n");
+ pr_info("invalid mtrr_gran_size or mtrr_chunk_size, will find optimal one\n");
}
i = 0;
@@ -768,7 +767,7 @@ int __init mtrr_cleanup(unsigned address_bits)
x_remove_base, x_remove_size, i);
if (debug_print) {
mtrr_print_out_one_result(i);
- printk(KERN_INFO "\n");
+ pr_info("\n");
}
i++;
@@ -779,7 +778,7 @@ int __init mtrr_cleanup(unsigned address_bits)
index_good = mtrr_search_optimal_index();
if (index_good != -1) {
- printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
+ pr_info("Found optimal setting for mtrr clean up\n");
i = index_good;
mtrr_print_out_one_result(i);
@@ -790,7 +789,7 @@ int __init mtrr_cleanup(unsigned address_bits)
gran_size <<= 10;
x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
set_var_mtrr_all(address_bits);
- printk(KERN_DEBUG "New variable MTRRs\n");
+ pr_debug("New variable MTRRs\n");
print_out_mtrr_range_state();
return 1;
} else {
@@ -799,8 +798,8 @@ int __init mtrr_cleanup(unsigned address_bits)
mtrr_print_out_one_result(i);
}
- printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
- printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
+ pr_info("mtrr_cleanup: can not find optimal value\n");
+ pr_info("please specify mtrr_gran_size/mtrr_chunk_size\n");
return 0;
}
@@ -918,7 +917,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
/* kvm/qemu doesn't have mtrr set right, don't trim them all: */
if (!highest_pfn) {
- printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
+ pr_info("CPU MTRRs all blank - virtualized system.\n");
return 0;
}
@@ -973,7 +972,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
end_pfn);
if (total_trim_size) {
- pr_warning("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", total_trim_size >> 20);
+ pr_warn("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n",
+ total_trim_size >> 20);
if (!changed_by_mtrr_cleanup)
WARN_ON(1);
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index c870af1..fcbcb2f 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -55,7 +55,7 @@ static inline void k8_check_syscfg_dram_mod_en(void)
rdmsr(MSR_K8_SYSCFG, lo, hi);
if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
- printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
+ pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
" not cleared by BIOS, clearing this bit\n",
smp_processor_id());
lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
@@ -501,14 +501,14 @@ void __init mtrr_state_warn(void)
if (!mask)
return;
if (mask & MTRR_CHANGE_MASK_FIXED)
- pr_warning("mtrr: your CPUs had inconsistent fixed MTRR settings\n");
+ pr_warn("mtrr: your CPUs had inconsistent fixed MTRR settings\n");
if (mask & MTRR_CHANGE_MASK_VARIABLE)
- pr_warning("mtrr: your CPUs had inconsistent variable MTRR settings\n");
+ pr_warn("mtrr: your CPUs had inconsistent variable MTRR settings\n");
if (mask & MTRR_CHANGE_MASK_DEFTYPE)
- pr_warning("mtrr: your CPUs had inconsistent MTRRdefType settings\n");
+ pr_warn("mtrr: your CPUs had inconsistent MTRRdefType settings\n");
- printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n");
- printk(KERN_INFO "mtrr: corrected configuration.\n");
+ pr_info("mtrr: probably your BIOS does not setup all CPUs.\n");
+ pr_info("mtrr: corrected configuration.\n");
}
/*
@@ -519,8 +519,7 @@ void __init mtrr_state_warn(void)
void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
{
if (wrmsr_safe(msr, a, b) < 0) {
- printk(KERN_ERR
- "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
+ pr_err("MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
smp_processor_id(), msr, a, b);
}
}
@@ -607,7 +606,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
tmp |= ~((1ULL<<(hi - 1)) - 1);
if (tmp != mask) {
- printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
+ pr_warn("mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
mask = tmp;
}
@@ -858,13 +857,13 @@ int generic_validate_add_page(unsigned long base, unsigned long size,
boot_cpu_data.x86_model == 1 &&
boot_cpu_data.x86_mask <= 7) {
if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
- pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
+ pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
return -EINVAL;
}
if (!(base + size < 0x70000 || base > 0x7003F) &&
(type == MTRR_TYPE_WRCOMB
|| type == MTRR_TYPE_WRBACK)) {
- pr_warning("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
+ pr_warn("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
return -EINVAL;
}
}
@@ -878,7 +877,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size,
lbase = lbase >> 1, last = last >> 1)
;
if (lbase != last) {
- pr_warning("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size);
+ pr_warn("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size);
return -EINVAL;
}
return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 74f1d90..10f8d47 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -300,24 +300,24 @@ int mtrr_add_page(unsigned long base, unsigned long size,
return error;
if (type >= MTRR_NUM_TYPES) {
- pr_warning("mtrr: type: %u invalid\n", type);
+ pr_warn("mtrr: type: %u invalid\n", type);
return -EINVAL;
}
/* If the type is WC, check that this processor supports it */
if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
- pr_warning("mtrr: your processor doesn't support write-combining\n");
+ pr_warn("mtrr: your processor doesn't support write-combining\n");
return -ENOSYS;
}
if (!size) {
- pr_warning("mtrr: zero sized request\n");
+ pr_warn("mtrr: zero sized request\n");
return -EINVAL;
}
if ((base | (base + size - 1)) >>
(boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) {
- pr_warning("mtrr: base or size exceeds the MTRR width\n");
+ pr_warn("mtrr: base or size exceeds the MTRR width\n");
return -EINVAL;
}
@@ -348,7 +348,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
} else if (types_compatible(type, ltype))
continue;
}
- pr_warning("mtrr: 0x%lx000,0x%lx000 overlaps existing"
+ pr_warn("mtrr: 0x%lx000,0x%lx000 overlaps existing"
" 0x%lx000,0x%lx000\n", base, size, lbase,
lsize);
goto out;
@@ -357,7 +357,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
if (ltype != type) {
if (types_compatible(type, ltype))
continue;
- pr_warning("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
+ pr_warn("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
base, size, mtrr_attrib_to_str(ltype),
mtrr_attrib_to_str(type));
goto out;
@@ -395,7 +395,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
static int mtrr_check(unsigned long base, unsigned long size)
{
if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
- pr_warning("mtrr: size and base must be multiples of 4 kiB\n");
+ pr_warn("mtrr: size and base must be multiples of 4 kiB\n");
pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base);
dump_stack();
return -1;
@@ -493,16 +493,16 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
}
}
if (reg >= max) {
- pr_warning("mtrr: register: %d too big\n", reg);
+ pr_warn("mtrr: register: %d too big\n", reg);
goto out;
}
mtrr_if->get(reg, &lbase, &lsize, &ltype);
if (lsize < 1) {
- pr_warning("mtrr: MTRR %d not used\n", reg);
+ pr_warn("mtrr: MTRR %d not used\n", reg);
goto out;
}
if (mtrr_usage_table[reg] < 1) {
- pr_warning("mtrr: reg: %d has count=0\n", reg);
+ pr_warn("mtrr: reg: %d has count=0\n", reg);
goto out;
}
if (--mtrr_usage_table[reg] < 1)
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index 819d949..f6f50c4 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -51,7 +51,7 @@ void x86_init_rdrand(struct cpuinfo_x86 *c)
for (i = 0; i < SANITY_CHECK_LOOPS; i++) {
if (!rdrand_long(&tmp)) {
clear_cpu_cap(c, X86_FEATURE_RDRAND);
- printk_once(KERN_WARNING "rdrand: disabled\n");
+ pr_warn_once("rdrand: disabled\n");
return;
}
}
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 4c60eaf..cd53135 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -87,10 +87,10 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
c->x86_max_cores = (core_level_siblings / smp_num_siblings);
if (!printed) {
- printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
+ pr_info("CPU: Physical Processor ID: %d\n",
c->phys_proc_id);
if (c->x86_max_cores > 1)
- printk(KERN_INFO "CPU: Processor Core ID: %d\n",
+ pr_info("CPU: Processor Core ID: %d\n",
c->cpu_core_id);
printed = 1;
}
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index a19a663..3417856 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -33,7 +33,7 @@ static void init_transmeta(struct cpuinfo_x86 *c)
if (max >= 0x80860001) {
cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags);
if (cpu_rev != 0x02000000) {
- printk(KERN_INFO "CPU: Processor revision %u.%u.%u.%u, %u MHz\n",
+ pr_info("CPU: Processor revision %u.%u.%u.%u, %u MHz\n",
(cpu_rev >> 24) & 0xff,
(cpu_rev >> 16) & 0xff,
(cpu_rev >> 8) & 0xff,
@@ -44,10 +44,10 @@ static void init_transmeta(struct cpuinfo_x86 *c)
if (max >= 0x80860002) {
cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy);
if (cpu_rev == 0x02000000) {
- printk(KERN_INFO "CPU: Processor revision %08X, %u MHz\n",
+ pr_info("CPU: Processor revision %08X, %u MHz\n",
new_cpu_rev, cpu_freq);
}
- printk(KERN_INFO "CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n",
+ pr_info("CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n",
(cms_rev1 >> 24) & 0xff,
(cms_rev1 >> 16) & 0xff,
(cms_rev1 >> 8) & 0xff,
@@ -76,7 +76,7 @@ static void init_transmeta(struct cpuinfo_x86 *c)
(void *)&cpu_info[56],
(void *)&cpu_info[60]);
cpu_info[64] = '\0';
- printk(KERN_INFO "CPU: %s\n", cpu_info);
+ pr_info("CPU: %s\n", cpu_info);
}
/* Unhide possibly hidden capability flags */
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 628a059..364e583 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -62,7 +62,7 @@ static unsigned long vmware_get_tsc_khz(void)
tsc_hz = eax | (((uint64_t)ebx) << 32);
do_div(tsc_hz, 1000);
BUG_ON(tsc_hz >> 32);
- printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
+ pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
(unsigned long) tsc_hz / 1000,
(unsigned long) tsc_hz % 1000);
@@ -84,8 +84,7 @@ static void __init vmware_platform_setup(void)
if (ebx != UINT_MAX)
x86_platform.calibrate_tsc = vmware_get_tsc_khz;
else
- printk(KERN_WARNING
- "Failed to get TSC freq from the hypervisor\n");
+ pr_warn("Failed to get TSC freq from the hypervisor\n");
}
/*
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 58f3431..9ef978d 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -57,10 +57,9 @@ struct crash_elf_data {
struct kimage *image;
/*
* Total number of ram ranges we have after various adjustments for
- * GART, crash reserved region etc.
+ * crash reserved region, etc.
*/
unsigned int max_nr_ranges;
- unsigned long gart_start, gart_end;
/* Pointer to elf header */
void *ehdr;
@@ -201,17 +200,6 @@ static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg)
return 0;
}
-static int get_gart_ranges_callback(u64 start, u64 end, void *arg)
-{
- struct crash_elf_data *ced = arg;
-
- ced->gart_start = start;
- ced->gart_end = end;
-
- /* Not expecting more than 1 gart aperture */
- return 1;
-}
-
/* Gather all the required information to prepare elf headers for ram regions */
static void fill_up_crash_elf_data(struct crash_elf_data *ced,
@@ -226,22 +214,6 @@ static void fill_up_crash_elf_data(struct crash_elf_data *ced,
ced->max_nr_ranges = nr_ranges;
- /*
- * We don't create ELF headers for GART aperture as an attempt
- * to dump this memory in second kernel leads to hang/crash.
- * If gart aperture is present, one needs to exclude that region
- * and that could lead to need of extra phdr.
- */
- walk_iomem_res("GART", IORESOURCE_MEM, 0, -1,
- ced, get_gart_ranges_callback);
-
- /*
- * If we have gart region, excluding that could potentially split
- * a memory range, resulting in extra header. Account for that.
- */
- if (ced->gart_end)
- ced->max_nr_ranges++;
-
/* Exclusion of crash region could split memory ranges */
ced->max_nr_ranges++;
@@ -350,13 +322,6 @@ static int elf_header_exclude_ranges(struct crash_elf_data *ced,
return ret;
}
- /* Exclude GART region */
- if (ced->gart_end) {
- ret = exclude_mem_range(cmem, ced->gart_start, ced->gart_end);
- if (ret)
- return ret;
- }
-
return ret;
}
@@ -599,12 +564,12 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
/* Add ACPI tables */
cmd.type = E820_ACPI;
flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd,
+ walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd,
memmap_entry_callback);
/* Add ACPI Non-volatile Storage */
cmd.type = E820_NVS;
- walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd,
+ walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd,
memmap_entry_callback);
/* Add crashk_low_res region */
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 9c30acf..8efa57a 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -135,7 +135,8 @@ print_context_stack_bp(struct thread_info *tinfo,
if (!__kernel_text_address(addr))
break;
- ops->address(data, addr, 1);
+ if (ops->address(data, addr, 1))
+ break;
frame = frame->next_frame;
ret_addr = &frame->return_address;
print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
@@ -154,10 +155,11 @@ static int print_trace_stack(void *data, char *name)
/*
* Print one address/symbol entries per line.
*/
-static void print_trace_address(void *data, unsigned long addr, int reliable)
+static int print_trace_address(void *data, unsigned long addr, int reliable)
{
touch_nmi_watchdog();
printk_stack_address(addr, reliable, data);
+ return 0;
}
static const struct stacktrace_ops print_trace_ops = {
@@ -265,9 +267,8 @@ int __die(const char *str, struct pt_regs *regs, long err)
#ifdef CONFIG_SMP
printk("SMP ");
#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk("DEBUG_PAGEALLOC ");
-#endif
+ if (debug_pagealloc_enabled())
+ printk("DEBUG_PAGEALLOC ");
#ifdef CONFIG_KASAN
printk("KASAN");
#endif
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index b3c2a69..621b501 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -926,6 +926,41 @@ static const char *e820_type_to_string(int e820_type)
}
}
+static unsigned long e820_type_to_iomem_type(int e820_type)
+{
+ switch (e820_type) {
+ case E820_RESERVED_KERN:
+ case E820_RAM:
+ return IORESOURCE_SYSTEM_RAM;
+ case E820_ACPI:
+ case E820_NVS:
+ case E820_UNUSABLE:
+ case E820_PRAM:
+ case E820_PMEM:
+ default:
+ return IORESOURCE_MEM;
+ }
+}
+
+static unsigned long e820_type_to_iores_desc(int e820_type)
+{
+ switch (e820_type) {
+ case E820_ACPI:
+ return IORES_DESC_ACPI_TABLES;
+ case E820_NVS:
+ return IORES_DESC_ACPI_NV_STORAGE;
+ case E820_PMEM:
+ return IORES_DESC_PERSISTENT_MEMORY;
+ case E820_PRAM:
+ return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
+ case E820_RESERVED_KERN:
+ case E820_RAM:
+ case E820_UNUSABLE:
+ default:
+ return IORES_DESC_NONE;
+ }
+}
+
static bool do_mark_busy(u32 type, struct resource *res)
{
/* this is the legacy bios/dos rom-shadow + mmio region */
@@ -968,7 +1003,8 @@ void __init e820_reserve_resources(void)
res->start = e820.map[i].addr;
res->end = end;
- res->flags = IORESOURCE_MEM;
+ res->flags = e820_type_to_iomem_type(e820.map[i].type);
+ res->desc = e820_type_to_iores_desc(e820.map[i].type);
/*
* don't register the region that could be conflicted with
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index d25097c..0b1b9ab 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -114,6 +114,10 @@ void __kernel_fpu_begin(void)
kernel_fpu_disable();
if (fpu->fpregs_active) {
+ /*
+ * Ignore return value -- we don't care if reg state
+ * is clobbered.
+ */
copy_fpregs_to_fpstate(fpu);
} else {
this_cpu_write(fpu_fpregs_owner_ctx, NULL);
@@ -189,8 +193,12 @@ void fpu__save(struct fpu *fpu)
preempt_disable();
if (fpu->fpregs_active) {
- if (!copy_fpregs_to_fpstate(fpu))
- fpregs_deactivate(fpu);
+ if (!copy_fpregs_to_fpstate(fpu)) {
+ if (use_eager_fpu())
+ copy_kernel_to_fpregs(&fpu->state);
+ else
+ fpregs_deactivate(fpu);
+ }
}
preempt_enable();
}
@@ -223,14 +231,15 @@ void fpstate_init(union fpregs_state *state)
}
EXPORT_SYMBOL_GPL(fpstate_init);
-/*
- * Copy the current task's FPU state to a new task's FPU context.
- *
- * In both the 'eager' and the 'lazy' case we save hardware registers
- * directly to the destination buffer.
- */
-static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu)
+int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
{
+ dst_fpu->counter = 0;
+ dst_fpu->fpregs_active = 0;
+ dst_fpu->last_cpu = -1;
+
+ if (!src_fpu->fpstate_active || !cpu_has_fpu)
+ return 0;
+
WARN_ON_FPU(src_fpu != &current->thread.fpu);
/*
@@ -243,10 +252,9 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu)
/*
* Save current FPU registers directly into the child
* FPU context, without any memory-to-memory copying.
- *
- * If the FPU context got destroyed in the process (FNSAVE
- * done on old CPUs) then copy it back into the source
- * context and mark the current task for lazy restore.
+ * In lazy mode, if the FPU context isn't loaded into
+ * fpregs, CR0.TS will be set and do_device_not_available
+ * will load the FPU context.
*
* We have to do all this with preemption disabled,
* mostly because of the FNSAVE case, because in that
@@ -259,19 +267,13 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu)
preempt_disable();
if (!copy_fpregs_to_fpstate(dst_fpu)) {
memcpy(&src_fpu->state, &dst_fpu->state, xstate_size);
- fpregs_deactivate(src_fpu);
+
+ if (use_eager_fpu())
+ copy_kernel_to_fpregs(&src_fpu->state);
+ else
+ fpregs_deactivate(src_fpu);
}
preempt_enable();
-}
-
-int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
-{
- dst_fpu->counter = 0;
- dst_fpu->fpregs_active = 0;
- dst_fpu->last_cpu = -1;
-
- if (src_fpu->fpstate_active && cpu_has_fpu)
- fpu_copy(dst_fpu, src_fpu);
return 0;
}
@@ -409,8 +411,10 @@ static inline void copy_init_fpstate_to_fpregs(void)
{
if (use_xsave())
copy_kernel_to_xregs(&init_fpstate.xsave, -1);
- else
+ else if (static_cpu_has(X86_FEATURE_FXSR))
copy_kernel_to_fxregs(&init_fpstate.fxsave);
+ else
+ copy_kernel_to_fregs(&init_fpstate.fsave);
}
/*
@@ -423,7 +427,7 @@ void fpu__clear(struct fpu *fpu)
{
WARN_ON_FPU(fpu != &current->thread.fpu); /* Almost certainly an anomaly */
- if (!use_eager_fpu()) {
+ if (!use_eager_fpu() || !static_cpu_has(X86_FEATURE_FPU)) {
/* FPU state will be reallocated lazily at the first use. */
fpu__drop(fpu);
} else {
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 6d9f0a7..54c86ff 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -78,13 +78,15 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
cr0 &= ~(X86_CR0_TS | X86_CR0_EM);
write_cr0(cr0);
- asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
- : "+m" (fsw), "+m" (fcw));
+ if (!test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) {
+ asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
+ : "+m" (fsw), "+m" (fcw));
- if (fsw == 0 && (fcw & 0x103f) == 0x003f)
- set_cpu_cap(c, X86_FEATURE_FPU);
- else
- clear_cpu_cap(c, X86_FEATURE_FPU);
+ if (fsw == 0 && (fcw & 0x103f) == 0x003f)
+ set_cpu_cap(c, X86_FEATURE_FPU);
+ else
+ clear_cpu_cap(c, X86_FEATURE_FPU);
+ }
#ifndef CONFIG_MATH_EMULATION
if (!cpu_has_fpu) {
@@ -132,7 +134,7 @@ static void __init fpu__init_system_generic(void)
* Set up the legacy init FPU context. (xstate init might overwrite this
* with a more modern format, if the CPU supports it.)
*/
- fpstate_init_fxstate(&init_fpstate.fxsave);
+ fpstate_init(&init_fpstate);
fpu__init_system_mxcsr();
}
@@ -260,7 +262,10 @@ static void __init fpu__init_system_xstate_size_legacy(void)
* not only saved the restores along the way, but we also have the
* FPU ready to be used for the original task.
*
- * 'eager' switching is used on modern CPUs, there we switch the FPU
+ * 'lazy' is deprecated because it's almost never a performance win
+ * and it's much more complicated than 'eager'.
+ *
+ * 'eager' switching is by default on all CPUs, there we switch the FPU
* state during every context switch, regardless of whether the task
* has used FPU instructions in that time slice or not. This is done
* because modern FPU context saving instructions are able to optimize
@@ -271,7 +276,7 @@ static void __init fpu__init_system_xstate_size_legacy(void)
* to use 'eager' restores, if we detect that a task is using the FPU
* frequently. See the fpu->counter logic in fpu/internal.h for that. ]
*/
-static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
+static enum { ENABLE, DISABLE } eagerfpu = ENABLE;
/*
* Find supported xfeatures based on cpu features and command-line input.
@@ -300,12 +305,6 @@ u64 __init fpu__get_supported_xfeatures_mask(void)
static void __init fpu__clear_eager_fpu_features(void)
{
setup_clear_cpu_cap(X86_FEATURE_MPX);
- setup_clear_cpu_cap(X86_FEATURE_AVX);
- setup_clear_cpu_cap(X86_FEATURE_AVX2);
- setup_clear_cpu_cap(X86_FEATURE_AVX512F);
- setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
- setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
- setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
}
/*
@@ -348,15 +347,9 @@ static void __init fpu__init_system_ctx_switch(void)
*/
static void __init fpu__init_parse_early_param(void)
{
- /*
- * No need to check "eagerfpu=auto" again, since it is the
- * initial default.
- */
if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) {
eagerfpu = DISABLE;
fpu__clear_eager_fpu_features();
- } else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on")) {
- eagerfpu = ENABLE;
}
if (cmdline_find_option_bool(boot_command_line, "no387"))
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index d425cda5..6e8354f 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -51,6 +51,9 @@ void fpu__xstate_clear_all_cpu_caps(void)
setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512DQ);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512BW);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
setup_clear_cpu_cap(X86_FEATURE_MPX);
setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 04f9641e..702547c 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -81,9 +81,9 @@ within(unsigned long addr, unsigned long start, unsigned long end)
static unsigned long text_ip_addr(unsigned long ip)
{
/*
- * On x86_64, kernel text mappings are mapped read-only with
- * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
- * of the kernel text mapping to modify the kernel text.
+ * On x86_64, kernel text mappings are mapped read-only, so we use
+ * the kernel identity mapping instead of the kernel text mapping
+ * to modify the kernel text.
*
* For 32bit kernels, these mappings are same and we can use
* kernel identity mapping to modify code.
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 2c0f340..1f4422d 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -40,13 +40,8 @@ pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
/* Wipe all early page tables except for the kernel symbol map */
static void __init reset_early_page_tables(void)
{
- unsigned long i;
-
- for (i = 0; i < PTRS_PER_PGD-1; i++)
- early_level4_pgt[i].pgd = 0;
-
+ memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
next_early_pgt = 0;
-
write_cr3(__pa_nodebug(early_level4_pgt));
}
@@ -54,7 +49,6 @@ static void __init reset_early_page_tables(void)
int __init early_make_pgtable(unsigned long address)
{
unsigned long physaddr = address - __PAGE_OFFSET;
- unsigned long i;
pgdval_t pgd, *pgd_p;
pudval_t pud, *pud_p;
pmdval_t pmd, *pmd_p;
@@ -81,8 +75,7 @@ again:
}
pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
- for (i = 0; i < PTRS_PER_PUD; i++)
- pud_p[i] = 0;
+ memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
*pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
}
pud_p += pud_index(address);
@@ -97,8 +90,7 @@ again:
}
pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
- for (i = 0; i < PTRS_PER_PMD; i++)
- pmd_p[i] = 0;
+ memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
*pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
}
pmd = (physaddr & PMD_MASK) + early_pmd_flags;
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index af11129..54cdbd2 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -389,6 +389,12 @@ default_entry:
/* Make changes effective */
wrmsr
+ /*
+ * And make sure that all the mappings we set up have NX set from
+ * the beginning.
+ */
+ orl $(1 << (_PAGE_BIT_NX - 32)), pa(__supported_pte_mask + 4)
+
enable_paging:
/*
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 2e97468..22fbf9d 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -75,9 +75,7 @@ startup_64:
subq $_text - __START_KERNEL_map, %rbp
/* Is the address not 2M aligned? */
- movq %rbp, %rax
- andl $~PMD_PAGE_MASK, %eax
- testl %eax, %eax
+ testl $~PMD_PAGE_MASK, %ebp
jnz bad_address
/*
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 44256a6..ed15cd48 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -750,9 +750,7 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
{
int err;
-#ifdef CONFIG_DEBUG_RODATA
char opc[BREAK_INSTR_SIZE];
-#endif /* CONFIG_DEBUG_RODATA */
bpt->type = BP_BREAKPOINT;
err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
@@ -761,7 +759,6 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
return err;
err = probe_kernel_write((char *)bpt->bpt_addr,
arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
-#ifdef CONFIG_DEBUG_RODATA
if (!err)
return err;
/*
@@ -778,13 +775,12 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE))
return -EINVAL;
bpt->type = BP_POKE_BREAKPOINT;
-#endif /* CONFIG_DEBUG_RODATA */
+
return err;
}
int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
{
-#ifdef CONFIG_DEBUG_RODATA
int err;
char opc[BREAK_INSTR_SIZE];
@@ -801,8 +797,8 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
goto knl_write;
return err;
+
knl_write:
-#endif /* CONFIG_DEBUG_RODATA */
return probe_kernel_write((char *)bpt->bpt_addr,
(char *)bpt->saved_instr, BREAK_INSTR_SIZE);
}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 30ca760..97340f2 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -408,7 +408,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
processor.cpuflag = CPU_ENABLED;
processor.cpufeature = (boot_cpu_data.x86 << 8) |
(boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
- processor.featureflag = boot_cpu_data.x86_capability[0];
+ processor.featureflag = boot_cpu_data.x86_capability[CPUID_1_EDX];
processor.reserved[0] = 0;
processor.reserved[1] = 0;
for (i = 0; i < 2; i++) {
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 8a2cdd7..04b132a 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -30,6 +30,7 @@
#include <asm/nmi.h>
#include <asm/x86_init.h>
#include <asm/reboot.h>
+#include <asm/cache.h>
#define CREATE_TRACE_POINTS
#include <trace/events/nmi.h>
@@ -69,7 +70,7 @@ struct nmi_stats {
static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
-static int ignore_nmis;
+static int ignore_nmis __read_mostly;
int unknown_nmi_panic;
/*
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
index 14415af..92f7014 100644
--- a/arch/x86/kernel/pmem.c
+++ b/arch/x86/kernel/pmem.c
@@ -13,11 +13,11 @@ static int found(u64 start, u64 end, void *data)
static __init int register_e820_pmem(void)
{
- char *pmem = "Persistent Memory (legacy)";
struct platform_device *pdev;
int rc;
- rc = walk_iomem_res(pmem, IORESOURCE_MEM, 0, -1, NULL, found);
+ rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY,
+ IORESOURCE_MEM, 0, -1, NULL, found);
if (rc <= 0)
return 0;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9f7c21c..2915d54 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -57,6 +57,9 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
*/
.io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
+#ifdef CONFIG_X86_32
+ .SYSENTER_stack_canary = STACK_END_MAGIC,
+#endif
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);
@@ -418,9 +421,9 @@ static void mwait_idle(void)
if (!current_set_polling_and_test()) {
trace_cpu_idle_rcuidle(1, smp_processor_id());
if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
- smp_mb(); /* quirk */
+ mb(); /* quirk */
clflush((void *)&current_thread_info()->flags);
- smp_mb(); /* quirk */
+ mb(); /* quirk */
}
__monitor((void *)&current_thread_info()->flags, 0, 0);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d3d80e6..aa52c10 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -152,21 +152,21 @@ static struct resource data_resource = {
.name = "Kernel data",
.start = 0,
.end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
};
static struct resource code_resource = {
.name = "Kernel code",
.start = 0,
.end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
};
static struct resource bss_resource = {
.name = "Kernel bss",
.start = 0,
.end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
};
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 24d57f7..643dbdc 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -97,6 +97,14 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
EXPORT_PER_CPU_SYMBOL(cpu_info);
+/* Logical package management. We might want to allocate that dynamically */
+static int *physical_to_logical_pkg __read_mostly;
+static unsigned long *physical_package_map __read_mostly;;
+static unsigned long *logical_package_map __read_mostly;
+static unsigned int max_physical_pkg_id __read_mostly;
+unsigned int __max_logical_packages __read_mostly;
+EXPORT_SYMBOL(__max_logical_packages);
+
static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
{
unsigned long flags;
@@ -248,7 +256,98 @@ static void notrace start_secondary(void *unused)
x86_cpuinit.setup_percpu_clockev();
wmb();
- cpu_startup_entry(CPUHP_ONLINE);
+ cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
+}
+
+int topology_update_package_map(unsigned int apicid, unsigned int cpu)
+{
+ unsigned int new, pkg = apicid >> boot_cpu_data.x86_coreid_bits;
+
+ /* Called from early boot ? */
+ if (!physical_package_map)
+ return 0;
+
+ if (pkg >= max_physical_pkg_id)
+ return -EINVAL;
+
+ /* Set the logical package id */
+ if (test_and_set_bit(pkg, physical_package_map))
+ goto found;
+
+ if (pkg < __max_logical_packages) {
+ set_bit(pkg, logical_package_map);
+ physical_to_logical_pkg[pkg] = pkg;
+ goto found;
+ }
+ new = find_first_zero_bit(logical_package_map, __max_logical_packages);
+ if (new >= __max_logical_packages) {
+ physical_to_logical_pkg[pkg] = -1;
+ pr_warn("APIC(%x) Package %u exceeds logical package map\n",
+ apicid, pkg);
+ return -ENOSPC;
+ }
+ set_bit(new, logical_package_map);
+ pr_info("APIC(%x) Converting physical %u to logical package %u\n",
+ apicid, pkg, new);
+ physical_to_logical_pkg[pkg] = new;
+
+found:
+ cpu_data(cpu).logical_proc_id = physical_to_logical_pkg[pkg];
+ return 0;
+}
+
+/**
+ * topology_phys_to_logical_pkg - Map a physical package id to a logical
+ *
+ * Returns logical package id or -1 if not found
+ */
+int topology_phys_to_logical_pkg(unsigned int phys_pkg)
+{
+ if (phys_pkg >= max_physical_pkg_id)
+ return -1;
+ return physical_to_logical_pkg[phys_pkg];
+}
+EXPORT_SYMBOL(topology_phys_to_logical_pkg);
+
+static void __init smp_init_package_map(void)
+{
+ unsigned int ncpus, cpu;
+ size_t size;
+
+ /*
+ * Today neither Intel nor AMD support heterogenous systems. That
+ * might change in the future....
+ */
+ ncpus = boot_cpu_data.x86_max_cores * smp_num_siblings;
+ __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus);
+
+ /*
+ * Possibly larger than what we need as the number of apic ids per
+ * package can be smaller than the actual used apic ids.
+ */
+ max_physical_pkg_id = DIV_ROUND_UP(MAX_LOCAL_APIC, ncpus);
+ size = max_physical_pkg_id * sizeof(unsigned int);
+ physical_to_logical_pkg = kmalloc(size, GFP_KERNEL);
+ memset(physical_to_logical_pkg, 0xff, size);
+ size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long);
+ physical_package_map = kzalloc(size, GFP_KERNEL);
+ size = BITS_TO_LONGS(__max_logical_packages) * sizeof(unsigned long);
+ logical_package_map = kzalloc(size, GFP_KERNEL);
+
+ pr_info("Max logical packages: %u\n", __max_logical_packages);
+
+ for_each_present_cpu(cpu) {
+ unsigned int apicid = apic->cpu_present_to_apicid(cpu);
+
+ if (apicid == BAD_APICID || !apic->apic_id_valid(apicid))
+ continue;
+ if (!topology_update_package_map(apicid, cpu))
+ continue;
+ pr_warn("CPU %u APICId %x disabled\n", cpu, apicid);
+ per_cpu(x86_bios_cpu_apicid, cpu) = BAD_APICID;
+ set_cpu_possible(cpu, false);
+ set_cpu_present(cpu, false);
+ }
}
void __init smp_store_boot_cpu_info(void)
@@ -258,6 +357,7 @@ void __init smp_store_boot_cpu_info(void)
*c = boot_cpu_data;
c->cpu_index = id;
+ smp_init_package_map();
}
/*
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index fdd0c64..9ee98ee 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -14,30 +14,34 @@ static int save_stack_stack(void *data, char *name)
return 0;
}
-static void
+static int
__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched)
{
struct stack_trace *trace = data;
#ifdef CONFIG_FRAME_POINTER
if (!reliable)
- return;
+ return 0;
#endif
if (nosched && in_sched_functions(addr))
- return;
+ return 0;
if (trace->skip > 0) {
trace->skip--;
- return;
+ return 0;
}
- if (trace->nr_entries < trace->max_entries)
+ if (trace->nr_entries < trace->max_entries) {
trace->entries[trace->nr_entries++] = addr;
+ return 0;
+ } else {
+ return -1; /* no more room, stop walking the stack */
+ }
}
-static void save_stack_address(void *data, unsigned long addr, int reliable)
+static int save_stack_address(void *data, unsigned long addr, int reliable)
{
return __save_stack_address(data, addr, reliable, false);
}
-static void
+static int
save_stack_address_nosched(void *data, unsigned long addr, int reliable)
{
return __save_stack_address(data, addr, reliable, true);
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 91a4496..e72a07f 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -135,7 +135,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
pmd = pmd_alloc(&tboot_mm, pud, vaddr);
if (!pmd)
return -1;
- pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr);
+ pte = pte_alloc_map(&tboot_mm, pmd, vaddr);
if (!pte)
return -1;
set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c
index 3f92ce0..27538f1 100644
--- a/arch/x86/kernel/test_nx.c
+++ b/arch/x86/kernel/test_nx.c
@@ -142,7 +142,6 @@ static int test_NX(void)
* by the error message
*/
-#ifdef CONFIG_DEBUG_RODATA
/* Test 3: Check if the .rodata section is executable */
if (rodata_test_data != 0xC3) {
printk(KERN_ERR "test_nx: .rodata marker has invalid value\n");
@@ -151,7 +150,6 @@ static int test_NX(void)
printk(KERN_ERR "test_nx: .rodata section is executable\n");
ret = -ENODEV;
}
-#endif
#if 0
/* Test 4: Check if the .data section of a module is executable */
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
index 5ecbfe5..cb4a01b 100644
--- a/arch/x86/kernel/test_rodata.c
+++ b/arch/x86/kernel/test_rodata.c
@@ -76,5 +76,5 @@ int rodata_test(void)
}
MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure");
+MODULE_DESCRIPTION("Testcase for marking rodata as read-only");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b0a3915..06cbe25 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -248,7 +248,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = trapnr;
-#ifdef CONFIG_X86_64
if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
printk_ratelimit()) {
pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
@@ -257,7 +256,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
print_vma_addr(" in ", regs->ip);
pr_cont("\n");
}
-#endif
force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk);
}
@@ -559,6 +557,29 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
NOKPROBE_SYMBOL(fixup_bad_iret);
#endif
+static bool is_sysenter_singlestep(struct pt_regs *regs)
+{
+ /*
+ * We don't try for precision here. If we're anywhere in the region of
+ * code that can be single-stepped in the SYSENTER entry path, then
+ * assume that this is a useless single-step trap due to SYSENTER
+ * being invoked with TF set. (We don't know in advance exactly
+ * which instructions will be hit because BTF could plausibly
+ * be set.)
+ */
+#ifdef CONFIG_X86_32
+ return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) <
+ (unsigned long)__end_SYSENTER_singlestep_region -
+ (unsigned long)__begin_SYSENTER_singlestep_region;
+#elif defined(CONFIG_IA32_EMULATION)
+ return (regs->ip - (unsigned long)entry_SYSENTER_compat) <
+ (unsigned long)__end_entry_SYSENTER_compat -
+ (unsigned long)entry_SYSENTER_compat;
+#else
+ return false;
+#endif
+}
+
/*
* Our handling of the processor debug registers is non-trivial.
* We do not clear them on entry and exit from the kernel. Therefore
@@ -593,11 +614,42 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
ist_enter(regs);
get_debugreg(dr6, 6);
+ /*
+ * The Intel SDM says:
+ *
+ * Certain debug exceptions may clear bits 0-3. The remaining
+ * contents of the DR6 register are never cleared by the
+ * processor. To avoid confusion in identifying debug
+ * exceptions, debug handlers should clear the register before
+ * returning to the interrupted task.
+ *
+ * Keep it simple: clear DR6 immediately.
+ */
+ set_debugreg(0, 6);
/* Filter out all the reserved bits which are preset to 1 */
dr6 &= ~DR6_RESERVED;
/*
+ * The SDM says "The processor clears the BTF flag when it
+ * generates a debug exception." Clear TIF_BLOCKSTEP to keep
+ * TIF_BLOCKSTEP in sync with the hardware BTF flag.
+ */
+ clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
+
+ if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) &&
+ is_sysenter_singlestep(regs))) {
+ dr6 &= ~DR_STEP;
+ if (!dr6)
+ goto exit;
+ /*
+ * else we might have gotten a single-step trap and hit a
+ * watchpoint at the same time, in which case we should fall
+ * through and handle the watchpoint.
+ */
+ }
+
+ /*
* If dr6 has no reason to give us about the origin of this trap,
* then it's very likely the result of an icebp/int01 trap.
* User wants a sigtrap for that.
@@ -605,18 +657,10 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
if (!dr6 && user_mode(regs))
user_icebp = 1;
- /* Catch kmemcheck conditions first of all! */
+ /* Catch kmemcheck conditions! */
if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
goto exit;
- /* DR6 may or may not be cleared by the CPU */
- set_debugreg(0, 6);
-
- /*
- * The processor cleared BTF, so don't mark that we need it set.
- */
- clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
-
/* Store the virtualized DR6 value */
tsk->thread.debugreg6 = dr6;
@@ -648,14 +692,13 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
goto exit;
}
- /*
- * Single-stepping through system calls: ignore any exceptions in
- * kernel space, but re-enable TF when returning to user mode.
- *
- * We already checked v86 mode above, so we can check for kernel mode
- * by just checking the CPL of CS.
- */
- if ((dr6 & DR_STEP) && !user_mode(regs)) {
+ if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) {
+ /*
+ * Historical junk that used to handle SYSENTER single-stepping.
+ * This should be unreachable now. If we survive for a while
+ * without anyone hitting this warning, we'll turn this into
+ * an oops.
+ */
tsk->thread.debugreg6 &= ~DR_STEP;
set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
regs->flags &= ~X86_EFLAGS_TF;
@@ -668,6 +711,14 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
debug_stack_usage_dec();
exit:
+#if defined(CONFIG_X86_32)
+ /*
+ * This is the most likely code path that involves non-trivial use
+ * of the SYSENTER stack. Check that we haven't overrun it.
+ */
+ WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
+ "Overran or corrupted SYSENTER stack\n");
+#endif
ist_exit(regs);
}
NOKPROBE_SYMBOL(do_debug);
@@ -741,10 +792,9 @@ dotraplinkage void
do_device_not_available(struct pt_regs *regs, long error_code)
{
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
- BUG_ON(use_eager_fpu());
#ifdef CONFIG_MATH_EMULATION
- if (read_cr0() & X86_CR0_EM) {
+ if (!boot_cpu_has(X86_FEATURE_FPU) && (read_cr0() & X86_CR0_EM)) {
struct math_emu_info info = { };
cond_local_irq_enable(regs);
@@ -859,7 +909,7 @@ void __init trap_init(void)
#endif
#ifdef CONFIG_X86_32
- set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
+ set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
set_bit(IA32_SYSCALL_VECTOR, used_vectors);
#endif
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 3d743da..5638044 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -43,6 +43,11 @@ static DEFINE_STATIC_KEY_FALSE(__use_tsc);
int tsc_clocksource_reliable;
+static u32 art_to_tsc_numerator;
+static u32 art_to_tsc_denominator;
+static u64 art_to_tsc_offset;
+struct clocksource *art_related_clocksource;
+
/*
* Use a ring-buffer like data structure, where a writer advances the head by
* writing a new data entry and a reader advances the tail when it observes a
@@ -964,6 +969,37 @@ core_initcall(cpufreq_tsc);
#endif /* CONFIG_CPU_FREQ */
+#define ART_CPUID_LEAF (0x15)
+#define ART_MIN_DENOMINATOR (1)
+
+
+/*
+ * If ART is present detect the numerator:denominator to convert to TSC
+ */
+static void detect_art(void)
+{
+ unsigned int unused[2];
+
+ if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF)
+ return;
+
+ cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
+ &art_to_tsc_numerator, unused, unused+1);
+
+ /* Don't enable ART in a VM, non-stop TSC required */
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR) ||
+ !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
+ art_to_tsc_denominator < ART_MIN_DENOMINATOR)
+ return;
+
+ if (rdmsrl_safe(MSR_IA32_TSC_ADJUST, &art_to_tsc_offset))
+ return;
+
+ /* Make this sticky over multiple CPU init calls */
+ setup_force_cpu_cap(X86_FEATURE_ART);
+}
+
+
/* clocksource code */
static struct clocksource clocksource_tsc;
@@ -1071,6 +1107,25 @@ int unsynchronized_tsc(void)
return 0;
}
+/*
+ * Convert ART to TSC given numerator/denominator found in detect_art()
+ */
+struct system_counterval_t convert_art_to_tsc(cycle_t art)
+{
+ u64 tmp, res, rem;
+
+ rem = do_div(art, art_to_tsc_denominator);
+
+ res = art * art_to_tsc_numerator;
+ tmp = rem * art_to_tsc_numerator;
+
+ do_div(tmp, art_to_tsc_denominator);
+ res += tmp + art_to_tsc_offset;
+
+ return (struct system_counterval_t) {.cs = art_related_clocksource,
+ .cycles = res};
+}
+EXPORT_SYMBOL(convert_art_to_tsc);
static void tsc_refine_calibration_work(struct work_struct *work);
static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
@@ -1142,6 +1197,8 @@ static void tsc_refine_calibration_work(struct work_struct *work)
(unsigned long)tsc_khz % 1000);
out:
+ if (boot_cpu_has(X86_FEATURE_ART))
+ art_related_clocksource = &clocksource_tsc;
clocksource_register_khz(&clocksource_tsc, tsc_khz);
}
@@ -1235,6 +1292,8 @@ void __init tsc_init(void)
mark_tsc_unstable("TSCs unsynchronized");
check_system_tsc_reliable();
+
+ detect_art();
}
#ifdef CONFIG_SMP
@@ -1246,14 +1305,14 @@ void __init tsc_init(void)
*/
unsigned long calibrate_delay_is_known(void)
{
- int i, cpu = smp_processor_id();
+ int sibling, cpu = smp_processor_id();
if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC))
return 0;
- for_each_online_cpu(i)
- if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id)
- return cpu_data(i).loops_per_jiffy;
+ sibling = cpumask_any_but(topology_core_cpumask(cpu), cpu);
+ if (sibling < nr_cpu_ids)
+ return cpu_data(sibling).loops_per_jiffy;
return 0;
}
#endif
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 13fa0ad..73de260 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -41,29 +41,28 @@ ENTRY(phys_startup_64)
jiffies_64 = jiffies;
#endif
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64)
/*
- * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
- * we retain large page mappings for boundaries spanning kernel text, rodata
- * and data sections.
+ * On 64-bit, align RODATA to 2MB so we retain large page mappings for
+ * boundaries spanning kernel text, rodata and data sections.
*
* However, kernel identity mappings will have different RWX permissions
* to the pages mapping to text and to the pages padding (which are freed) the
* text section. Hence kernel identity mappings will be broken to smaller
* pages. For 64-bit, kernel text and kernel identity mappings are different,
- * so we can enable protection checks that come with CONFIG_DEBUG_RODATA,
- * as well as retain 2MB large page mappings for kernel text.
+ * so we can enable protection checks as well as retain 2MB large page
+ * mappings for kernel text.
*/
-#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
+#define X64_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
-#define X64_ALIGN_DEBUG_RODATA_END \
+#define X64_ALIGN_RODATA_END \
. = ALIGN(HPAGE_SIZE); \
__end_rodata_hpage_align = .;
#else
-#define X64_ALIGN_DEBUG_RODATA_BEGIN
-#define X64_ALIGN_DEBUG_RODATA_END
+#define X64_ALIGN_RODATA_BEGIN
+#define X64_ALIGN_RODATA_END
#endif
@@ -82,11 +81,11 @@ PHDRS {
SECTIONS
{
#ifdef CONFIG_X86_32
- . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
- phys_startup_32 = startup_32 - LOAD_OFFSET;
+ . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
+ phys_startup_32 = ABSOLUTE(startup_32 - LOAD_OFFSET);
#else
- . = __START_KERNEL;
- phys_startup_64 = startup_64 - LOAD_OFFSET;
+ . = __START_KERNEL;
+ phys_startup_64 = ABSOLUTE(startup_64 - LOAD_OFFSET);
#endif
/* Text and read-only data */
@@ -112,13 +111,11 @@ SECTIONS
EXCEPTION_TABLE(16) :text = 0x9090
-#if defined(CONFIG_DEBUG_RODATA)
/* .text should occupy whole number of pages */
. = ALIGN(PAGE_SIZE);
-#endif
- X64_ALIGN_DEBUG_RODATA_BEGIN
+ X64_ALIGN_RODATA_BEGIN
RO_DATA(PAGE_SIZE)
- X64_ALIGN_DEBUG_RODATA_END
+ X64_ALIGN_RODATA_END
/* Data */
.data : AT(ADDR(.data) - LOAD_OFFSET) {
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index a0695be..cd05942 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -37,6 +37,8 @@ EXPORT_SYMBOL(__copy_user_nocache);
EXPORT_SYMBOL(_copy_from_user);
EXPORT_SYMBOL(_copy_to_user);
+EXPORT_SYMBOL_GPL(memcpy_mcsafe);
+
EXPORT_SYMBOL(copy_page);
EXPORT_SYMBOL(clear_page);
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index a1ff508..464fa47 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -13,9 +13,10 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
- hyperv.o
+ hyperv.o page_track.o
kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o
+
kvm-intel-y += vmx.o pmu_intel.o
kvm-amd-y += svm.o pmu_amd.o
diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
index 9dc091a..308b859 100644
--- a/arch/x86/kvm/assigned-dev.c
+++ b/arch/x86/kvm/assigned-dev.c
@@ -51,11 +51,9 @@ struct kvm_assigned_dev_kernel {
static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
int assigned_dev_id)
{
- struct list_head *ptr;
struct kvm_assigned_dev_kernel *match;
- list_for_each(ptr, head) {
- match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
+ list_for_each_entry(match, head, list) {
if (match->assigned_dev_id == assigned_dev_id)
return match;
}
@@ -373,14 +371,10 @@ static void kvm_free_assigned_device(struct kvm *kvm,
void kvm_free_all_assigned_devices(struct kvm *kvm)
{
- struct list_head *ptr, *ptr2;
- struct kvm_assigned_dev_kernel *assigned_dev;
-
- list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
- assigned_dev = list_entry(ptr,
- struct kvm_assigned_dev_kernel,
- list);
+ struct kvm_assigned_dev_kernel *assigned_dev, *tmp;
+ list_for_each_entry_safe(assigned_dev, tmp,
+ &kvm->arch.assigned_dev_head, list) {
kvm_free_assigned_device(kvm, assigned_dev);
}
}
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 6525e92..0029644 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -46,11 +46,18 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
return ret;
}
+bool kvm_mpx_supported(void)
+{
+ return ((host_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR))
+ && kvm_x86_ops->mpx_supported());
+}
+EXPORT_SYMBOL_GPL(kvm_mpx_supported);
+
u64 kvm_supported_xcr0(void)
{
u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0;
- if (!kvm_x86_ops->mpx_supported())
+ if (!kvm_mpx_supported())
xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
return xcr0;
@@ -97,8 +104,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
- vcpu->arch.eager_fpu = use_eager_fpu() || guest_cpuid_has_mpx(vcpu);
- if (vcpu->arch.eager_fpu)
+ if (use_eager_fpu())
kvm_x86_ops->fpu_activate(vcpu);
/*
@@ -295,7 +301,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
#endif
unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
- unsigned f_mpx = kvm_x86_ops->mpx_supported() ? F(MPX) : 0;
+ unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0;
unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
/* cpuid 1.edx */
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index c8eda14..66a6581 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -5,6 +5,7 @@
#include <asm/cpu.h>
int kvm_update_cpuid(struct kvm_vcpu *vcpu);
+bool kvm_mpx_supported(void);
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
u32 function, u32 index);
int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
@@ -135,14 +136,6 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu)
return best && (best->ebx & bit(X86_FEATURE_RTM));
}
-static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 7, 0);
- return best && (best->ebx & bit(X86_FEATURE_MPX));
-}
-
static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 80363eb..0f62943 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -663,10 +663,10 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
u16 sel;
la = seg_base(ctxt, addr.seg) + addr.ea;
- *linear = la;
*max_size = 0;
switch (mode) {
case X86EMUL_MODE_PROT64:
+ *linear = la;
if (is_noncanonical_address(la))
goto bad;
@@ -675,6 +675,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
goto bad;
break;
default:
+ *linear = la = (u32)la;
usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL,
addr.seg);
if (!usable)
@@ -702,7 +703,6 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
if (size > *max_size)
goto bad;
}
- la &= (u32)-1;
break;
}
if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index c58ba67..5ff3485 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1043,6 +1043,27 @@ bool kvm_hv_hypercall_enabled(struct kvm *kvm)
return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
}
+static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
+{
+ bool longmode;
+
+ longmode = is_64_bit_mode(vcpu);
+ if (longmode)
+ kvm_register_write(vcpu, VCPU_REGS_RAX, result);
+ else {
+ kvm_register_write(vcpu, VCPU_REGS_RDX, result >> 32);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, result & 0xffffffff);
+ }
+}
+
+static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+
+ kvm_hv_hypercall_set_result(vcpu, run->hyperv.u.hcall.result);
+ return 1;
+}
+
int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
{
u64 param, ingpa, outgpa, ret;
@@ -1055,7 +1076,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
*/
if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
kvm_queue_exception(vcpu, UD_VECTOR);
- return 0;
+ return 1;
}
longmode = is_64_bit_mode(vcpu);
@@ -1083,22 +1104,33 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
+ /* Hypercall continuation is not supported yet */
+ if (rep_cnt || rep_idx) {
+ res = HV_STATUS_INVALID_HYPERCALL_CODE;
+ goto set_result;
+ }
+
switch (code) {
- case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
+ case HVCALL_NOTIFY_LONG_SPIN_WAIT:
kvm_vcpu_on_spin(vcpu);
break;
+ case HVCALL_POST_MESSAGE:
+ case HVCALL_SIGNAL_EVENT:
+ vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+ vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
+ vcpu->run->hyperv.u.hcall.input = param;
+ vcpu->run->hyperv.u.hcall.params[0] = ingpa;
+ vcpu->run->hyperv.u.hcall.params[1] = outgpa;
+ vcpu->arch.complete_userspace_io =
+ kvm_hv_hypercall_complete_userspace;
+ return 0;
default:
res = HV_STATUS_INVALID_HYPERCALL_CODE;
break;
}
+set_result:
ret = res | (((u64)rep_done & 0xfff) << 32);
- if (longmode) {
- kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
- } else {
- kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
- kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
- }
-
+ kvm_hv_hypercall_set_result(vcpu, ret);
return 1;
}
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index b0ea42b..a4bf5b4 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -51,32 +51,9 @@
#define RW_STATE_WORD0 3
#define RW_STATE_WORD1 4
-/* Compute with 96 bit intermediate result: (a*b)/c */
-static u64 muldiv64(u64 a, u32 b, u32 c)
+static void pit_set_gate(struct kvm_pit *pit, int channel, u32 val)
{
- union {
- u64 ll;
- struct {
- u32 low, high;
- } l;
- } u, res;
- u64 rl, rh;
-
- u.ll = a;
- rl = (u64)u.l.low * (u64)b;
- rh = (u64)u.l.high * (u64)b;
- rh += (rl >> 32);
- res.l.high = div64_u64(rh, c);
- res.l.low = div64_u64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c);
- return res.ll;
-}
-
-static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
-{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
-
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
switch (c->mode) {
default:
@@ -97,18 +74,16 @@ static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
c->gate = val;
}
-static int pit_get_gate(struct kvm *kvm, int channel)
+static int pit_get_gate(struct kvm_pit *pit, int channel)
{
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
- return kvm->arch.vpit->pit_state.channels[channel].gate;
+ return pit->pit_state.channels[channel].gate;
}
-static s64 __kpit_elapsed(struct kvm *kvm)
+static s64 __kpit_elapsed(struct kvm_pit *pit)
{
s64 elapsed;
ktime_t remaining;
- struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+ struct kvm_kpit_state *ps = &pit->pit_state;
if (!ps->period)
return 0;
@@ -128,26 +103,23 @@ static s64 __kpit_elapsed(struct kvm *kvm)
return elapsed;
}
-static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c,
+static s64 kpit_elapsed(struct kvm_pit *pit, struct kvm_kpit_channel_state *c,
int channel)
{
if (channel == 0)
- return __kpit_elapsed(kvm);
+ return __kpit_elapsed(pit);
return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
}
-static int pit_get_count(struct kvm *kvm, int channel)
+static int pit_get_count(struct kvm_pit *pit, int channel)
{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
s64 d, t;
int counter;
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
- t = kpit_elapsed(kvm, c, channel);
- d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+ t = kpit_elapsed(pit, c, channel);
+ d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC);
switch (c->mode) {
case 0:
@@ -167,17 +139,14 @@ static int pit_get_count(struct kvm *kvm, int channel)
return counter;
}
-static int pit_get_out(struct kvm *kvm, int channel)
+static int pit_get_out(struct kvm_pit *pit, int channel)
{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
s64 d, t;
int out;
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
- t = kpit_elapsed(kvm, c, channel);
- d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+ t = kpit_elapsed(pit, c, channel);
+ d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC);
switch (c->mode) {
default:
@@ -202,29 +171,23 @@ static int pit_get_out(struct kvm *kvm, int channel)
return out;
}
-static void pit_latch_count(struct kvm *kvm, int channel)
+static void pit_latch_count(struct kvm_pit *pit, int channel)
{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
-
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
if (!c->count_latched) {
- c->latched_count = pit_get_count(kvm, channel);
+ c->latched_count = pit_get_count(pit, channel);
c->count_latched = c->rw_mode;
}
}
-static void pit_latch_status(struct kvm *kvm, int channel)
+static void pit_latch_status(struct kvm_pit *pit, int channel)
{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
-
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
if (!c->status_latched) {
/* TODO: Return NULL COUNT (bit 6). */
- c->status = ((pit_get_out(kvm, channel) << 7) |
+ c->status = ((pit_get_out(pit, channel) << 7) |
(c->rw_mode << 4) |
(c->mode << 1) |
c->bcd);
@@ -232,26 +195,24 @@ static void pit_latch_status(struct kvm *kvm, int channel)
}
}
+static inline struct kvm_pit *pit_state_to_pit(struct kvm_kpit_state *ps)
+{
+ return container_of(ps, struct kvm_pit, pit_state);
+}
+
static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
{
struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
irq_ack_notifier);
- int value;
-
- spin_lock(&ps->inject_lock);
- value = atomic_dec_return(&ps->pending);
- if (value < 0)
- /* spurious acks can be generated if, for example, the
- * PIC is being reset. Handle it gracefully here
- */
- atomic_inc(&ps->pending);
- else if (value > 0)
- /* in this case, we had multiple outstanding pit interrupts
- * that we needed to inject. Reinject
- */
- queue_kthread_work(&ps->pit->worker, &ps->pit->expired);
- ps->irq_ack = 1;
- spin_unlock(&ps->inject_lock);
+ struct kvm_pit *pit = pit_state_to_pit(ps);
+
+ atomic_set(&ps->irq_ack, 1);
+ /* irq_ack should be set before pending is read. Order accesses with
+ * inc(pending) in pit_timer_fn and xchg(irq_ack, 0) in pit_do_work.
+ */
+ smp_mb();
+ if (atomic_dec_if_positive(&ps->pending) > 0)
+ queue_kthread_work(&pit->worker, &pit->expired);
}
void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -282,45 +243,36 @@ static void pit_do_work(struct kthread_work *work)
struct kvm_vcpu *vcpu;
int i;
struct kvm_kpit_state *ps = &pit->pit_state;
- int inject = 0;
- /* Try to inject pending interrupts when
- * last one has been acked.
+ if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0))
+ return;
+
+ kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false);
+ kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false);
+
+ /*
+ * Provides NMI watchdog support via Virtual Wire mode.
+ * The route is: PIT -> LVT0 in NMI mode.
+ *
+ * Note: Our Virtual Wire implementation does not follow
+ * the MP specification. We propagate a PIT interrupt to all
+ * VCPUs and only when LVT0 is in NMI mode. The interrupt can
+ * also be simultaneously delivered through PIC and IOAPIC.
*/
- spin_lock(&ps->inject_lock);
- if (ps->irq_ack) {
- ps->irq_ack = 0;
- inject = 1;
- }
- spin_unlock(&ps->inject_lock);
- if (inject) {
- kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1, false);
- kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0, false);
-
- /*
- * Provides NMI watchdog support via Virtual Wire mode.
- * The route is: PIT -> PIC -> LVT0 in NMI mode.
- *
- * Note: Our Virtual Wire implementation is simplified, only
- * propagating PIT interrupts to all VCPUs when they have set
- * LVT0 to NMI delivery. Other PIC interrupts are just sent to
- * VCPU0, and only if its LVT0 is in EXTINT mode.
- */
- if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0)
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_apic_nmi_wd_deliver(vcpu);
- }
+ if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_apic_nmi_wd_deliver(vcpu);
}
static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
{
struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer);
- struct kvm_pit *pt = ps->kvm->arch.vpit;
+ struct kvm_pit *pt = pit_state_to_pit(ps);
- if (ps->reinject || !atomic_read(&ps->pending)) {
+ if (atomic_read(&ps->reinject))
atomic_inc(&ps->pending);
- queue_kthread_work(&pt->worker, &pt->expired);
- }
+
+ queue_kthread_work(&pt->worker, &pt->expired);
if (ps->is_periodic) {
hrtimer_add_expires_ns(&ps->timer, ps->period);
@@ -329,30 +281,54 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
return HRTIMER_NORESTART;
}
-static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
+static inline void kvm_pit_reset_reinject(struct kvm_pit *pit)
{
- struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+ atomic_set(&pit->pit_state.pending, 0);
+ atomic_set(&pit->pit_state.irq_ack, 1);
+}
+
+void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
+{
+ struct kvm_kpit_state *ps = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+
+ if (atomic_read(&ps->reinject) == reinject)
+ return;
+
+ if (reinject) {
+ /* The initial state is preserved while ps->reinject == 0. */
+ kvm_pit_reset_reinject(pit);
+ kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
+ kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ } else {
+ kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
+ kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ }
+
+ atomic_set(&ps->reinject, reinject);
+}
+
+static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period)
+{
+ struct kvm_kpit_state *ps = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
s64 interval;
if (!ioapic_in_kernel(kvm) ||
ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
return;
- interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
+ interval = mul_u64_u32_div(val, NSEC_PER_SEC, KVM_PIT_FREQ);
pr_debug("create pit timer, interval is %llu nsec\n", interval);
/* TODO The new value only affected after the retriggered */
hrtimer_cancel(&ps->timer);
- flush_kthread_work(&ps->pit->expired);
+ flush_kthread_work(&pit->expired);
ps->period = interval;
ps->is_periodic = is_period;
- ps->timer.function = pit_timer_fn;
- ps->kvm = ps->pit->kvm;
-
- atomic_set(&ps->pending, 0);
- ps->irq_ack = 1;
+ kvm_pit_reset_reinject(pit);
/*
* Do not allow the guest to program periodic timers with small
@@ -375,11 +351,9 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
HRTIMER_MODE_ABS);
}
-static void pit_load_count(struct kvm *kvm, int channel, u32 val)
+static void pit_load_count(struct kvm_pit *pit, int channel, u32 val)
{
- struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
-
- WARN_ON(!mutex_is_locked(&ps->lock));
+ struct kvm_kpit_state *ps = &pit->pit_state;
pr_debug("load_count val is %d, channel is %d\n", val, channel);
@@ -404,29 +378,33 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
case 1:
/* FIXME: enhance mode 4 precision */
case 4:
- create_pit_timer(kvm, val, 0);
+ create_pit_timer(pit, val, 0);
break;
case 2:
case 3:
- create_pit_timer(kvm, val, 1);
+ create_pit_timer(pit, val, 1);
break;
default:
- destroy_pit_timer(kvm->arch.vpit);
+ destroy_pit_timer(pit);
}
}
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start)
+void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
+ int hpet_legacy_start)
{
u8 saved_mode;
+
+ WARN_ON_ONCE(!mutex_is_locked(&pit->pit_state.lock));
+
if (hpet_legacy_start) {
/* save existing mode for later reenablement */
WARN_ON(channel != 0);
- saved_mode = kvm->arch.vpit->pit_state.channels[0].mode;
- kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */
- pit_load_count(kvm, channel, val);
- kvm->arch.vpit->pit_state.channels[0].mode = saved_mode;
+ saved_mode = pit->pit_state.channels[0].mode;
+ pit->pit_state.channels[0].mode = 0xff; /* disable timer */
+ pit_load_count(pit, channel, val);
+ pit->pit_state.channels[0].mode = saved_mode;
} else {
- pit_load_count(kvm, channel, val);
+ pit_load_count(pit, channel, val);
}
}
@@ -452,7 +430,6 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu,
{
struct kvm_pit *pit = dev_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
- struct kvm *kvm = pit->kvm;
int channel, access;
struct kvm_kpit_channel_state *s;
u32 val = *(u32 *) data;
@@ -476,9 +453,9 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu,
s = &pit_state->channels[channel];
if (val & (2 << channel)) {
if (!(val & 0x20))
- pit_latch_count(kvm, channel);
+ pit_latch_count(pit, channel);
if (!(val & 0x10))
- pit_latch_status(kvm, channel);
+ pit_latch_status(pit, channel);
}
}
} else {
@@ -486,7 +463,7 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu,
s = &pit_state->channels[channel];
access = (val >> 4) & KVM_PIT_CHANNEL_MASK;
if (access == 0) {
- pit_latch_count(kvm, channel);
+ pit_latch_count(pit, channel);
} else {
s->rw_mode = access;
s->read_state = access;
@@ -503,17 +480,17 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu,
switch (s->write_state) {
default:
case RW_STATE_LSB:
- pit_load_count(kvm, addr, val);
+ pit_load_count(pit, addr, val);
break;
case RW_STATE_MSB:
- pit_load_count(kvm, addr, val << 8);
+ pit_load_count(pit, addr, val << 8);
break;
case RW_STATE_WORD0:
s->write_latch = val;
s->write_state = RW_STATE_WORD1;
break;
case RW_STATE_WORD1:
- pit_load_count(kvm, addr, s->write_latch | (val << 8));
+ pit_load_count(pit, addr, s->write_latch | (val << 8));
s->write_state = RW_STATE_WORD0;
break;
}
@@ -529,7 +506,6 @@ static int pit_ioport_read(struct kvm_vcpu *vcpu,
{
struct kvm_pit *pit = dev_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
- struct kvm *kvm = pit->kvm;
int ret, count;
struct kvm_kpit_channel_state *s;
if (!pit_in_range(addr))
@@ -566,20 +542,20 @@ static int pit_ioport_read(struct kvm_vcpu *vcpu,
switch (s->read_state) {
default:
case RW_STATE_LSB:
- count = pit_get_count(kvm, addr);
+ count = pit_get_count(pit, addr);
ret = count & 0xff;
break;
case RW_STATE_MSB:
- count = pit_get_count(kvm, addr);
+ count = pit_get_count(pit, addr);
ret = (count >> 8) & 0xff;
break;
case RW_STATE_WORD0:
- count = pit_get_count(kvm, addr);
+ count = pit_get_count(pit, addr);
ret = count & 0xff;
s->read_state = RW_STATE_WORD1;
break;
case RW_STATE_WORD1:
- count = pit_get_count(kvm, addr);
+ count = pit_get_count(pit, addr);
ret = (count >> 8) & 0xff;
s->read_state = RW_STATE_WORD0;
break;
@@ -600,14 +576,13 @@ static int speaker_ioport_write(struct kvm_vcpu *vcpu,
{
struct kvm_pit *pit = speaker_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
- struct kvm *kvm = pit->kvm;
u32 val = *(u32 *) data;
if (addr != KVM_SPEAKER_BASE_ADDRESS)
return -EOPNOTSUPP;
mutex_lock(&pit_state->lock);
pit_state->speaker_data_on = (val >> 1) & 1;
- pit_set_gate(kvm, 2, val & 1);
+ pit_set_gate(pit, 2, val & 1);
mutex_unlock(&pit_state->lock);
return 0;
}
@@ -618,7 +593,6 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu,
{
struct kvm_pit *pit = speaker_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
- struct kvm *kvm = pit->kvm;
unsigned int refresh_clock;
int ret;
if (addr != KVM_SPEAKER_BASE_ADDRESS)
@@ -628,8 +602,8 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu,
refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
mutex_lock(&pit_state->lock);
- ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(kvm, 2) |
- (pit_get_out(kvm, 2) << 5) | (refresh_clock << 4));
+ ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(pit, 2) |
+ (pit_get_out(pit, 2) << 5) | (refresh_clock << 4));
if (len > sizeof(ret))
len = sizeof(ret);
memcpy(data, (char *)&ret, len);
@@ -637,33 +611,28 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu,
return 0;
}
-void kvm_pit_reset(struct kvm_pit *pit)
+static void kvm_pit_reset(struct kvm_pit *pit)
{
int i;
struct kvm_kpit_channel_state *c;
- mutex_lock(&pit->pit_state.lock);
pit->pit_state.flags = 0;
for (i = 0; i < 3; i++) {
c = &pit->pit_state.channels[i];
c->mode = 0xff;
c->gate = (i != 2);
- pit_load_count(pit->kvm, i, 0);
+ pit_load_count(pit, i, 0);
}
- mutex_unlock(&pit->pit_state.lock);
- atomic_set(&pit->pit_state.pending, 0);
- pit->pit_state.irq_ack = 1;
+ kvm_pit_reset_reinject(pit);
}
static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
{
struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
- if (!mask) {
- atomic_set(&pit->pit_state.pending, 0);
- pit->pit_state.irq_ack = 1;
- }
+ if (!mask)
+ kvm_pit_reset_reinject(pit);
}
static const struct kvm_io_device_ops pit_dev_ops = {
@@ -690,14 +659,10 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
return NULL;
pit->irq_source_id = kvm_request_irq_source_id(kvm);
- if (pit->irq_source_id < 0) {
- kfree(pit);
- return NULL;
- }
+ if (pit->irq_source_id < 0)
+ goto fail_request;
mutex_init(&pit->pit_state.lock);
- mutex_lock(&pit->pit_state.lock);
- spin_lock_init(&pit->pit_state.inject_lock);
pid = get_pid(task_tgid(current));
pid_nr = pid_vnr(pid);
@@ -706,36 +671,30 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
init_kthread_worker(&pit->worker);
pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker,
"kvm-pit/%d", pid_nr);
- if (IS_ERR(pit->worker_task)) {
- mutex_unlock(&pit->pit_state.lock);
- kvm_free_irq_source_id(kvm, pit->irq_source_id);
- kfree(pit);
- return NULL;
- }
+ if (IS_ERR(pit->worker_task))
+ goto fail_kthread;
+
init_kthread_work(&pit->expired, pit_do_work);
- kvm->arch.vpit = pit;
pit->kvm = kvm;
pit_state = &pit->pit_state;
- pit_state->pit = pit;
hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ pit_state->timer.function = pit_timer_fn;
+
pit_state->irq_ack_notifier.gsi = 0;
pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
- kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
- pit_state->reinject = true;
- mutex_unlock(&pit->pit_state.lock);
+ pit->mask_notifier.func = pit_mask_notifer;
kvm_pit_reset(pit);
- pit->mask_notifier.func = pit_mask_notifer;
- kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ kvm_pit_set_reinject(pit, true);
kvm_iodevice_init(&pit->dev, &pit_dev_ops);
ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
KVM_PIT_MEM_LENGTH, &pit->dev);
if (ret < 0)
- goto fail;
+ goto fail_register_pit;
if (flags & KVM_PIT_SPEAKER_DUMMY) {
kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
@@ -743,42 +702,35 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
KVM_SPEAKER_BASE_ADDRESS, 4,
&pit->speaker_dev);
if (ret < 0)
- goto fail_unregister;
+ goto fail_register_speaker;
}
return pit;
-fail_unregister:
+fail_register_speaker:
kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
-
-fail:
- kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
- kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
- kvm_free_irq_source_id(kvm, pit->irq_source_id);
+fail_register_pit:
+ kvm_pit_set_reinject(pit, false);
kthread_stop(pit->worker_task);
+fail_kthread:
+ kvm_free_irq_source_id(kvm, pit->irq_source_id);
+fail_request:
kfree(pit);
return NULL;
}
void kvm_free_pit(struct kvm *kvm)
{
- struct hrtimer *timer;
-
- if (kvm->arch.vpit) {
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev);
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
- &kvm->arch.vpit->speaker_dev);
- kvm_unregister_irq_mask_notifier(kvm, 0,
- &kvm->arch.vpit->mask_notifier);
- kvm_unregister_irq_ack_notifier(kvm,
- &kvm->arch.vpit->pit_state.irq_ack_notifier);
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- timer = &kvm->arch.vpit->pit_state.timer;
- hrtimer_cancel(timer);
- flush_kthread_work(&kvm->arch.vpit->expired);
- kthread_stop(kvm->arch.vpit->worker_task);
- kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- kfree(kvm->arch.vpit);
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ if (pit) {
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev);
+ kvm_pit_set_reinject(pit, false);
+ hrtimer_cancel(&pit->pit_state.timer);
+ flush_kthread_work(&pit->expired);
+ kthread_stop(pit->worker_task);
+ kvm_free_irq_source_id(kvm, pit->irq_source_id);
+ kfree(pit);
}
}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index c84990b..2f5af07 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -22,19 +22,18 @@ struct kvm_kpit_channel_state {
};
struct kvm_kpit_state {
+ /* All members before "struct mutex lock" are protected by the lock. */
struct kvm_kpit_channel_state channels[3];
u32 flags;
bool is_periodic;
s64 period; /* unit: ns */
struct hrtimer timer;
- atomic_t pending; /* accumulated triggered timers */
- bool reinject;
- struct kvm *kvm;
u32 speaker_data_on;
+
struct mutex lock;
- struct kvm_pit *pit;
- spinlock_t inject_lock;
- unsigned long irq_ack;
+ atomic_t reinject;
+ atomic_t pending; /* accumulated triggered timers */
+ atomic_t irq_ack;
struct kvm_irq_ack_notifier irq_ack_notifier;
};
@@ -57,9 +56,11 @@ struct kvm_pit {
#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
#define KVM_PIT_CHANNEL_MASK 0x3
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start);
struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
void kvm_free_pit(struct kvm *kvm);
-void kvm_pit_reset(struct kvm_pit *pit);
+
+void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
+ int hpet_legacy_start);
+void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject);
#endif
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 1facfd6..9db4709 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -94,7 +94,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic)
{
ioapic->rtc_status.pending_eoi = 0;
- bitmap_zero(ioapic->rtc_status.dest_map, KVM_MAX_VCPUS);
+ bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPUS);
}
static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic);
@@ -117,16 +117,16 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
return;
new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector);
- old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+ old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map);
if (new_val == old_val)
return;
if (new_val) {
- __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+ __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map);
ioapic->rtc_status.pending_eoi++;
} else {
- __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+ __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map);
ioapic->rtc_status.pending_eoi--;
rtc_status_pending_eoi_check_valid(ioapic);
}
@@ -156,7 +156,8 @@ static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic)
static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu)
{
- if (test_and_clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map)) {
+ if (test_and_clear_bit(vcpu->vcpu_id,
+ ioapic->rtc_status.dest_map.map)) {
--ioapic->rtc_status.pending_eoi;
rtc_status_pending_eoi_check_valid(ioapic);
}
@@ -236,10 +237,17 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)
{
struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
union kvm_ioapic_redirect_entry *e;
int index;
spin_lock(&ioapic->lock);
+
+ /* Make sure we see any missing RTC EOI */
+ if (test_bit(vcpu->vcpu_id, dest_map->map))
+ __set_bit(dest_map->vectors[vcpu->vcpu_id],
+ ioapic_handled_vectors);
+
for (index = 0; index < IOAPIC_NUM_PINS; index++) {
e = &ioapic->redirtbl[index];
if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
@@ -346,7 +354,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
*/
BUG_ON(ioapic->rtc_status.pending_eoi != 0);
ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
- ioapic->rtc_status.dest_map);
+ &ioapic->rtc_status.dest_map);
ioapic->rtc_status.pending_eoi = (ret < 0 ? 0 : ret);
} else
ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL);
@@ -407,8 +415,14 @@ static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
struct kvm_ioapic *ioapic, int vector, int trigger_mode)
{
- int i;
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
struct kvm_lapic *apic = vcpu->arch.apic;
+ int i;
+
+ /* RTC special handling */
+ if (test_bit(vcpu->vcpu_id, dest_map->map) &&
+ vector == dest_map->vectors[vcpu->vcpu_id])
+ rtc_irq_eoi(ioapic, vcpu);
for (i = 0; i < IOAPIC_NUM_PINS; i++) {
union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
@@ -416,8 +430,6 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
if (ent->fields.vector != vector)
continue;
- if (i == RTC_GSI)
- rtc_irq_eoi(ioapic, vcpu);
/*
* We are dropping lock while calling ack notifiers because ack
* notifier callbacks for assigned devices call into IOAPIC
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 2d16dc2..7d2692a 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -40,9 +40,21 @@ struct kvm_vcpu;
#define RTC_GSI -1U
#endif
+struct dest_map {
+ /* vcpu bitmap where IRQ has been sent */
+ DECLARE_BITMAP(map, KVM_MAX_VCPUS);
+
+ /*
+ * Vector sent to a given vcpu, only valid when
+ * the vcpu's bit in map is set
+ */
+ u8 vectors[KVM_MAX_VCPUS];
+};
+
+
struct rtc_status {
int pending_eoi;
- DECLARE_BITMAP(dest_map, KVM_MAX_VCPUS);
+ struct dest_map dest_map;
};
union kvm_ioapic_redirect_entry {
@@ -118,7 +130,8 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
int level, bool line_status);
void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
- struct kvm_lapic_irq *irq, unsigned long *dest_map);
+ struct kvm_lapic_irq *irq,
+ struct dest_map *dest_map);
int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 3982b47..95fcc7b 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -33,7 +33,10 @@
*/
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
{
- return apic_has_pending_timer(vcpu);
+ if (lapic_in_kernel(vcpu))
+ return apic_has_pending_timer(vcpu);
+
+ return 0;
}
EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
@@ -137,8 +140,8 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
- kvm_inject_apic_timer_irqs(vcpu);
- /* TODO: PIT, RTC etc. */
+ if (lapic_in_kernel(vcpu))
+ kvm_inject_apic_timer_irqs(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index ae5c78f..61ebdc1 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -109,14 +109,6 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
return ret;
}
-static inline int lapic_in_kernel(struct kvm_vcpu *vcpu)
-{
- /* Same as irqchip_in_kernel(vcpu->kvm), but with less
- * pointer chasing and no unnecessary memory barriers.
- */
- return vcpu->arch.apic != NULL;
-}
-
void kvm_pic_reset(struct kvm_kpic_state *s);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 8fc89ef..54ead79 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -34,6 +34,7 @@
#include "lapic.h"
#include "hyperv.h"
+#include "x86.h"
static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level,
@@ -53,10 +54,12 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
}
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
- struct kvm_lapic_irq *irq, unsigned long *dest_map)
+ struct kvm_lapic_irq *irq, struct dest_map *dest_map)
{
int i, r = -1;
struct kvm_vcpu *vcpu, *lowest = NULL;
+ unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+ unsigned int dest_vcpus = 0;
if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
kvm_lowest_prio_delivery(irq)) {
@@ -67,6 +70,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
return r;
+ memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
+
kvm_for_each_vcpu(i, vcpu, kvm) {
if (!kvm_apic_present(vcpu))
continue;
@@ -80,13 +85,25 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
r = 0;
r += kvm_apic_set_irq(vcpu, irq, dest_map);
} else if (kvm_lapic_enabled(vcpu)) {
- if (!lowest)
- lowest = vcpu;
- else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
- lowest = vcpu;
+ if (!kvm_vector_hashing_enabled()) {
+ if (!lowest)
+ lowest = vcpu;
+ else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
+ lowest = vcpu;
+ } else {
+ __set_bit(i, dest_vcpu_bitmap);
+ dest_vcpus++;
+ }
}
}
+ if (dest_vcpus != 0) {
+ int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+ dest_vcpu_bitmap, KVM_MAX_VCPUS);
+
+ lowest = kvm_get_vcpu(kvm, idx);
+ }
+
if (lowest)
r = kvm_apic_set_irq(lowest, irq, dest_map);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 36591fa..443d2a5 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -281,7 +281,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
struct kvm_cpuid_entry2 *feat;
u32 v = APIC_VERSION;
- if (!kvm_vcpu_has_lapic(vcpu))
+ if (!lapic_in_kernel(vcpu))
return;
feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
@@ -475,26 +475,20 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
{
- int highest_irr;
-
/* This may race with setting of irr in __apic_accept_irq() and
* value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq
* will cause vmexit immediately and the value will be recalculated
* on the next vmentry.
*/
- if (!kvm_vcpu_has_lapic(vcpu))
- return 0;
- highest_irr = apic_find_highest_irr(vcpu->arch.apic);
-
- return highest_irr;
+ return apic_find_highest_irr(vcpu->arch.apic);
}
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
int vector, int level, int trig_mode,
- unsigned long *dest_map);
+ struct dest_map *dest_map);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
- unsigned long *dest_map)
+ struct dest_map *dest_map)
{
struct kvm_lapic *apic = vcpu->arch.apic;
@@ -675,8 +669,33 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
}
}
+int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
+ const unsigned long *bitmap, u32 bitmap_size)
+{
+ u32 mod;
+ int i, idx = -1;
+
+ mod = vector % dest_vcpus;
+
+ for (i = 0; i <= mod; i++) {
+ idx = find_next_bit(bitmap, bitmap_size, idx + 1);
+ BUG_ON(idx == bitmap_size);
+ }
+
+ return idx;
+}
+
+static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
+{
+ if (!kvm->arch.disabled_lapic_found) {
+ kvm->arch.disabled_lapic_found = true;
+ printk(KERN_INFO
+ "Disabled LAPIC found during irq injection\n");
+ }
+}
+
bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
- struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map)
+ struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
{
struct kvm_apic_map *map;
unsigned long bitmap = 1;
@@ -727,21 +746,42 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
dst = map->logical_map[cid];
- if (kvm_lowest_prio_delivery(irq)) {
+ if (!kvm_lowest_prio_delivery(irq))
+ goto set_irq;
+
+ if (!kvm_vector_hashing_enabled()) {
int l = -1;
for_each_set_bit(i, &bitmap, 16) {
if (!dst[i])
continue;
if (l < 0)
l = i;
- else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0)
+ else if (kvm_apic_compare_prio(dst[i]->vcpu,
+ dst[l]->vcpu) < 0)
l = i;
}
-
bitmap = (l >= 0) ? 1 << l : 0;
+ } else {
+ int idx;
+ unsigned int dest_vcpus;
+
+ dest_vcpus = hweight16(bitmap);
+ if (dest_vcpus == 0)
+ goto out;
+
+ idx = kvm_vector_to_index(irq->vector,
+ dest_vcpus, &bitmap, 16);
+
+ if (!dst[idx]) {
+ kvm_apic_disabled_lapic_found(kvm);
+ goto out;
+ }
+
+ bitmap = (idx >= 0) ? 1 << idx : 0;
}
}
+set_irq:
for_each_set_bit(i, &bitmap, 16) {
if (!dst[i])
continue;
@@ -754,6 +794,20 @@ out:
return ret;
}
+/*
+ * This routine tries to handler interrupts in posted mode, here is how
+ * it deals with different cases:
+ * - For single-destination interrupts, handle it in posted mode
+ * - Else if vector hashing is enabled and it is a lowest-priority
+ * interrupt, handle it in posted mode and use the following mechanism
+ * to find the destinaiton vCPU.
+ * 1. For lowest-priority interrupts, store all the possible
+ * destination vCPUs in an array.
+ * 2. Use "guest vector % max number of destination vCPUs" to find
+ * the right destination vCPU in the array for the lowest-priority
+ * interrupt.
+ * - Otherwise, use remapped mode to inject the interrupt.
+ */
bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu)
{
@@ -795,16 +849,37 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
if (cid >= ARRAY_SIZE(map->logical_map))
goto out;
- for_each_set_bit(i, &bitmap, 16) {
- dst = map->logical_map[cid][i];
- if (++r == 2)
+ if (kvm_vector_hashing_enabled() &&
+ kvm_lowest_prio_delivery(irq)) {
+ int idx;
+ unsigned int dest_vcpus;
+
+ dest_vcpus = hweight16(bitmap);
+ if (dest_vcpus == 0)
goto out;
- }
- if (dst && kvm_apic_present(dst->vcpu))
+ idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+ &bitmap, 16);
+
+ dst = map->logical_map[cid][idx];
+ if (!dst) {
+ kvm_apic_disabled_lapic_found(kvm);
+ goto out;
+ }
+
*dest_vcpu = dst->vcpu;
- else
- goto out;
+ } else {
+ for_each_set_bit(i, &bitmap, 16) {
+ dst = map->logical_map[cid][i];
+ if (++r == 2)
+ goto out;
+ }
+
+ if (dst && kvm_apic_present(dst->vcpu))
+ *dest_vcpu = dst->vcpu;
+ else
+ goto out;
+ }
}
ret = true;
@@ -819,7 +894,7 @@ out:
*/
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
int vector, int level, int trig_mode,
- unsigned long *dest_map)
+ struct dest_map *dest_map)
{
int result = 0;
struct kvm_vcpu *vcpu = apic->vcpu;
@@ -839,8 +914,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
result = 1;
- if (dest_map)
- __set_bit(vcpu->vcpu_id, dest_map);
+ if (dest_map) {
+ __set_bit(vcpu->vcpu_id, dest_map->map);
+ dest_map->vectors[vcpu->vcpu_id] = vector;
+ }
if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
if (trig_mode)
@@ -1195,7 +1272,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
static void apic_timer_expired(struct kvm_lapic *apic)
{
struct kvm_vcpu *vcpu = apic->vcpu;
- wait_queue_head_t *q = &vcpu->wq;
+ struct swait_queue_head *q = &vcpu->wq;
struct kvm_timer *ktimer = &apic->lapic_timer;
if (atomic_read(&apic->lapic_timer.pending))
@@ -1204,8 +1281,8 @@ static void apic_timer_expired(struct kvm_lapic *apic)
atomic_inc(&apic->lapic_timer.pending);
kvm_set_pending_timer(vcpu);
- if (waitqueue_active(q))
- wake_up_interruptible(q);
+ if (swait_active(q))
+ swake_up(q);
if (apic_lvtt_tscdeadline(apic))
ktimer->expired_tscdeadline = ktimer->tscdeadline;
@@ -1239,7 +1316,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
struct kvm_lapic *apic = vcpu->arch.apic;
u64 guest_tsc, tsc_deadline;
- if (!kvm_vcpu_has_lapic(vcpu))
+ if (!lapic_in_kernel(vcpu))
return;
if (apic->lapic_timer.expired_tscdeadline == 0)
@@ -1515,8 +1592,7 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
{
- if (kvm_vcpu_has_lapic(vcpu))
- apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
+ apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
}
EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
@@ -1566,7 +1642,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+ if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
apic_lvtt_period(apic))
return 0;
@@ -1577,7 +1653,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+ if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
apic_lvtt_period(apic))
return;
@@ -1590,9 +1666,6 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!kvm_vcpu_has_lapic(vcpu))
- return;
-
apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
| (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4));
}
@@ -1601,9 +1674,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
{
u64 tpr;
- if (!kvm_vcpu_has_lapic(vcpu))
- return 0;
-
tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
return (tpr & 0xf0) >> 4;
@@ -1728,8 +1798,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) &&
- apic_lvt_enabled(apic, APIC_LVTT))
+ if (apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT))
return atomic_read(&apic->lapic_timer.pending);
return 0;
@@ -1826,7 +1895,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
struct kvm_lapic *apic = vcpu->arch.apic;
int highest_irr;
- if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic))
+ if (!apic_enabled(apic))
return -1;
apic_update_ppr(apic);
@@ -1854,9 +1923,6 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!kvm_vcpu_has_lapic(vcpu))
- return;
-
if (atomic_read(&apic->lapic_timer.pending) > 0) {
kvm_apic_local_deliver(apic, APIC_LVTT);
if (apic_lvtt_tscdeadline(apic))
@@ -1932,7 +1998,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
{
struct hrtimer *timer;
- if (!kvm_vcpu_has_lapic(vcpu))
+ if (!lapic_in_kernel(vcpu))
return;
timer = &vcpu->arch.apic->lapic_timer.timer;
@@ -2105,7 +2171,7 @@ int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!kvm_vcpu_has_lapic(vcpu))
+ if (!lapic_in_kernel(vcpu))
return 1;
/* if this is ICR write vector before command */
@@ -2119,7 +2185,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
struct kvm_lapic *apic = vcpu->arch.apic;
u32 low, high = 0;
- if (!kvm_vcpu_has_lapic(vcpu))
+ if (!lapic_in_kernel(vcpu))
return 1;
if (apic_reg_read(apic, reg, 4, &low))
@@ -2151,7 +2217,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
u8 sipi_vector;
unsigned long pe;
- if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events)
+ if (!lapic_in_kernel(vcpu) || !apic->pending_events)
return;
/*
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 41bdb35..f71183e 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -42,6 +42,9 @@ struct kvm_lapic {
unsigned long pending_events;
unsigned int sipi_vector;
};
+
+struct dest_map;
+
int kvm_create_lapic(struct kvm_vcpu *vcpu);
void kvm_free_lapic(struct kvm_vcpu *vcpu);
@@ -60,11 +63,11 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu);
void __kvm_apic_update_irr(u32 *pir, void *regs);
void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
- unsigned long *dest_map);
+ struct dest_map *dest_map);
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
- struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map);
+ struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map);
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
@@ -103,7 +106,7 @@ static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off)
extern struct static_key kvm_no_apic_vcpu;
-static inline bool kvm_vcpu_has_lapic(struct kvm_vcpu *vcpu)
+static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu)
{
if (static_key_false(&kvm_no_apic_vcpu))
return vcpu->arch.apic;
@@ -130,7 +133,7 @@ static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic)
static inline bool kvm_apic_present(struct kvm_vcpu *vcpu)
{
- return kvm_vcpu_has_lapic(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic);
+ return lapic_in_kernel(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic);
}
static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
@@ -150,7 +153,7 @@ static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu)
static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
{
- return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events;
+ return lapic_in_kernel(vcpu) && vcpu->arch.apic->pending_events;
}
static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
@@ -161,7 +164,7 @@ static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
{
- return kvm_vcpu_has_lapic(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
}
static inline int kvm_apic_id(struct kvm_lapic *apic)
@@ -175,4 +178,6 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu);
bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
+int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
+ const unsigned long *bitmap, u32 bitmap_size);
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 95a955d..c512f09 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -41,6 +41,7 @@
#include <asm/cmpxchg.h>
#include <asm/io.h>
#include <asm/vmx.h>
+#include <asm/kvm_page_track.h>
/*
* When setting this variable to true it enables Two-Dimensional-Paging
@@ -776,62 +777,85 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
return &slot->arch.lpage_info[level - 2][idx];
}
+static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
+ gfn_t gfn, int count)
+{
+ struct kvm_lpage_info *linfo;
+ int i;
+
+ for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
+ linfo = lpage_info_slot(gfn, slot, i);
+ linfo->disallow_lpage += count;
+ WARN_ON(linfo->disallow_lpage < 0);
+ }
+}
+
+void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ update_gfn_disallow_lpage_count(slot, gfn, 1);
+}
+
+void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ update_gfn_disallow_lpage_count(slot, gfn, -1);
+}
+
static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
- struct kvm_lpage_info *linfo;
gfn_t gfn;
- int i;
+ kvm->arch.indirect_shadow_pages++;
gfn = sp->gfn;
slots = kvm_memslots_for_spte_role(kvm, sp->role);
slot = __gfn_to_memslot(slots, gfn);
- for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
- linfo = lpage_info_slot(gfn, slot, i);
- linfo->write_count += 1;
- }
- kvm->arch.indirect_shadow_pages++;
+
+ /* the non-leaf shadow pages are keeping readonly. */
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return kvm_slot_page_track_add_page(kvm, slot, gfn,
+ KVM_PAGE_TRACK_WRITE);
+
+ kvm_mmu_gfn_disallow_lpage(slot, gfn);
}
static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
- struct kvm_lpage_info *linfo;
gfn_t gfn;
- int i;
+ kvm->arch.indirect_shadow_pages--;
gfn = sp->gfn;
slots = kvm_memslots_for_spte_role(kvm, sp->role);
slot = __gfn_to_memslot(slots, gfn);
- for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
- linfo = lpage_info_slot(gfn, slot, i);
- linfo->write_count -= 1;
- WARN_ON(linfo->write_count < 0);
- }
- kvm->arch.indirect_shadow_pages--;
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return kvm_slot_page_track_remove_page(kvm, slot, gfn,
+ KVM_PAGE_TRACK_WRITE);
+
+ kvm_mmu_gfn_allow_lpage(slot, gfn);
}
-static int __has_wrprotected_page(gfn_t gfn, int level,
- struct kvm_memory_slot *slot)
+static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
+ struct kvm_memory_slot *slot)
{
struct kvm_lpage_info *linfo;
if (slot) {
linfo = lpage_info_slot(gfn, slot, level);
- return linfo->write_count;
+ return !!linfo->disallow_lpage;
}
- return 1;
+ return true;
}
-static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
+static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
+ int level)
{
struct kvm_memory_slot *slot;
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- return __has_wrprotected_page(gfn, level, slot);
+ return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
}
static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
@@ -897,7 +921,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
- if (__has_wrprotected_page(large_gfn, level, slot))
+ if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
break;
return level - 1;
@@ -1323,23 +1347,29 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
-static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+ struct kvm_memory_slot *slot, u64 gfn)
{
- struct kvm_memory_slot *slot;
struct kvm_rmap_head *rmap_head;
int i;
bool write_protected = false;
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
-
for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
rmap_head = __gfn_to_rmap(gfn, i, slot);
- write_protected |= __rmap_write_protect(vcpu->kvm, rmap_head, true);
+ write_protected |= __rmap_write_protect(kvm, rmap_head, true);
}
return write_protected;
}
+static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
+}
+
static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
u64 *sptep;
@@ -1754,7 +1784,7 @@ static void mark_unsync(u64 *spte)
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
- return 1;
+ return 0;
}
static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
@@ -1840,13 +1870,16 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
return nr_unsync_leaf;
}
+#define INVALID_INDEX (-1)
+
static int mmu_unsync_walk(struct kvm_mmu_page *sp,
struct kvm_mmu_pages *pvec)
{
+ pvec->nr = 0;
if (!sp->unsync_children)
return 0;
- mmu_pages_add(pvec, sp, 0);
+ mmu_pages_add(pvec, sp, INVALID_INDEX);
return __mmu_unsync_walk(sp, pvec);
}
@@ -1883,37 +1916,35 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
if ((_sp)->role.direct || (_sp)->role.invalid) {} else
/* @sp->gfn should be write-protected at the call site */
-static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- struct list_head *invalid_list, bool clear_unsync)
+static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
{
if (sp->role.cr4_pae != !!is_pae(vcpu)) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
- return 1;
+ return false;
}
- if (clear_unsync)
- kvm_unlink_unsync_page(vcpu->kvm, sp);
-
- if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
+ if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
- return 1;
+ return false;
}
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- return 0;
+ return true;
}
-static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
+static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
+ struct list_head *invalid_list,
+ bool remote_flush, bool local_flush)
{
- LIST_HEAD(invalid_list);
- int ret;
-
- ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
- if (ret)
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ if (!list_empty(invalid_list)) {
+ kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list);
+ return;
+ }
- return ret;
+ if (remote_flush)
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ else if (local_flush)
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
#ifdef CONFIG_KVM_MMU_AUDIT
@@ -1923,46 +1954,38 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
static void mmu_audit_disable(void) { }
#endif
-static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
- return __kvm_sync_page(vcpu, sp, invalid_list, true);
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
+ return __kvm_sync_page(vcpu, sp, invalid_list);
}
/* @gfn should be write-protected at the call site */
-static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
+static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
+ struct list_head *invalid_list)
{
struct kvm_mmu_page *s;
- LIST_HEAD(invalid_list);
- bool flush = false;
+ bool ret = false;
for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
if (!s->unsync)
continue;
WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
- kvm_unlink_unsync_page(vcpu->kvm, s);
- if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
- (vcpu->arch.mmu.sync_page(vcpu, s))) {
- kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
- continue;
- }
- flush = true;
+ ret |= kvm_sync_page(vcpu, s, invalid_list);
}
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- if (flush)
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ return ret;
}
struct mmu_page_path {
- struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
- unsigned int idx[PT64_ROOT_LEVEL-1];
+ struct kvm_mmu_page *parent[PT64_ROOT_LEVEL];
+ unsigned int idx[PT64_ROOT_LEVEL];
};
#define for_each_sp(pvec, sp, parents, i) \
- for (i = mmu_pages_next(&pvec, &parents, -1), \
- sp = pvec.page[i].sp; \
+ for (i = mmu_pages_first(&pvec, &parents); \
i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
i = mmu_pages_next(&pvec, &parents, i))
@@ -1974,19 +1997,43 @@ static int mmu_pages_next(struct kvm_mmu_pages *pvec,
for (n = i+1; n < pvec->nr; n++) {
struct kvm_mmu_page *sp = pvec->page[n].sp;
+ unsigned idx = pvec->page[n].idx;
+ int level = sp->role.level;
- if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
- parents->idx[0] = pvec->page[n].idx;
- return n;
- }
+ parents->idx[level-1] = idx;
+ if (level == PT_PAGE_TABLE_LEVEL)
+ break;
- parents->parent[sp->role.level-2] = sp;
- parents->idx[sp->role.level-1] = pvec->page[n].idx;
+ parents->parent[level-2] = sp;
}
return n;
}
+static int mmu_pages_first(struct kvm_mmu_pages *pvec,
+ struct mmu_page_path *parents)
+{
+ struct kvm_mmu_page *sp;
+ int level;
+
+ if (pvec->nr == 0)
+ return 0;
+
+ WARN_ON(pvec->page[0].idx != INVALID_INDEX);
+
+ sp = pvec->page[0].sp;
+ level = sp->role.level;
+ WARN_ON(level == PT_PAGE_TABLE_LEVEL);
+
+ parents->parent[level-2] = sp;
+
+ /* Also set up a sentinel. Further entries in pvec are all
+ * children of sp, so this element is never overwritten.
+ */
+ parents->parent[level-1] = NULL;
+ return mmu_pages_next(pvec, parents, 0);
+}
+
static void mmu_pages_clear_parents(struct mmu_page_path *parents)
{
struct kvm_mmu_page *sp;
@@ -1994,22 +2041,14 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
do {
unsigned int idx = parents->idx[level];
-
sp = parents->parent[level];
if (!sp)
return;
+ WARN_ON(idx == INVALID_INDEX);
clear_unsync_child_bit(sp, idx);
level++;
- } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
-}
-
-static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
- struct mmu_page_path *parents,
- struct kvm_mmu_pages *pvec)
-{
- parents->parent[parent->role.level-1] = NULL;
- pvec->nr = 0;
+ } while (!sp->unsync_children);
}
static void mmu_sync_children(struct kvm_vcpu *vcpu,
@@ -2020,30 +2059,36 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
struct mmu_page_path parents;
struct kvm_mmu_pages pages;
LIST_HEAD(invalid_list);
+ bool flush = false;
- kvm_mmu_pages_init(parent, &parents, &pages);
while (mmu_unsync_walk(parent, &pages)) {
bool protected = false;
for_each_sp(pages, sp, parents, i)
protected |= rmap_write_protect(vcpu, sp->gfn);
- if (protected)
+ if (protected) {
kvm_flush_remote_tlbs(vcpu->kvm);
+ flush = false;
+ }
for_each_sp(pages, sp, parents, i) {
- kvm_sync_page(vcpu, sp, &invalid_list);
+ flush |= kvm_sync_page(vcpu, sp, &invalid_list);
mmu_pages_clear_parents(&parents);
}
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- cond_resched_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_pages_init(parent, &parents, &pages);
+ if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
+ kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+ cond_resched_lock(&vcpu->kvm->mmu_lock);
+ flush = false;
+ }
}
+
+ kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
}
static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
{
- sp->write_flooding_count = 0;
+ atomic_set(&sp->write_flooding_count, 0);
}
static void clear_sp_write_flooding_count(u64 *spte)
@@ -2069,6 +2114,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
unsigned quadrant;
struct kvm_mmu_page *sp;
bool need_sync = false;
+ bool flush = false;
+ LIST_HEAD(invalid_list);
role = vcpu->arch.mmu.base_role;
role.level = level;
@@ -2092,8 +2139,16 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (sp->role.word != role.word)
continue;
- if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
- break;
+ if (sp->unsync) {
+ /* The page is good, but __kvm_sync_page might still end
+ * up zapping it. If so, break in order to rebuild it.
+ */
+ if (!__kvm_sync_page(vcpu, sp, &invalid_list))
+ break;
+
+ WARN_ON(!list_empty(&invalid_list));
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ }
if (sp->unsync_children)
kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
@@ -2112,16 +2167,24 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
hlist_add_head(&sp->hash_link,
&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
if (!direct) {
- if (rmap_write_protect(vcpu, gfn))
+ /*
+ * we should do write protection before syncing pages
+ * otherwise the content of the synced shadow page may
+ * be inconsistent with guest page table.
+ */
+ account_shadowed(vcpu->kvm, sp);
+ if (level == PT_PAGE_TABLE_LEVEL &&
+ rmap_write_protect(vcpu, gfn))
kvm_flush_remote_tlbs(vcpu->kvm);
- if (level > PT_PAGE_TABLE_LEVEL && need_sync)
- kvm_sync_pages(vcpu, gfn);
- account_shadowed(vcpu->kvm, sp);
+ if (level > PT_PAGE_TABLE_LEVEL && need_sync)
+ flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
}
sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
clear_page(sp->spt);
trace_kvm_mmu_get_page(sp, true);
+
+ kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
return sp;
}
@@ -2269,7 +2332,6 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
if (parent->role.level == PT_PAGE_TABLE_LEVEL)
return 0;
- kvm_mmu_pages_init(parent, &parents, &pages);
while (mmu_unsync_walk(parent, &pages)) {
struct kvm_mmu_page *sp;
@@ -2278,7 +2340,6 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
mmu_pages_clear_parents(&parents);
zapped++;
}
- kvm_mmu_pages_init(parent, &parents, &pages);
}
return zapped;
@@ -2354,8 +2415,8 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
if (list_empty(&kvm->arch.active_mmu_pages))
return false;
- sp = list_entry(kvm->arch.active_mmu_pages.prev,
- struct kvm_mmu_page, link);
+ sp = list_last_entry(&kvm->arch.active_mmu_pages,
+ struct kvm_mmu_page, link);
kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
return true;
@@ -2408,7 +2469,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
-static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
trace_kvm_mmu_unsync_page(sp);
++vcpu->kvm->stat.mmu_unsync;
@@ -2417,37 +2478,26 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
kvm_mmu_mark_parents_unsync(sp);
}
-static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
+static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+ bool can_unsync)
{
- struct kvm_mmu_page *s;
-
- for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
- if (s->unsync)
- continue;
- WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
- __kvm_unsync_page(vcpu, s);
- }
-}
+ struct kvm_mmu_page *sp;
-static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool can_unsync)
-{
- struct kvm_mmu_page *s;
- bool need_unsync = false;
+ if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+ return true;
- for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
if (!can_unsync)
- return 1;
+ return true;
- if (s->role.level != PT_PAGE_TABLE_LEVEL)
- return 1;
+ if (sp->unsync)
+ continue;
- if (!s->unsync)
- need_unsync = true;
+ WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
+ kvm_unsync_page(vcpu, sp);
}
- if (need_unsync)
- kvm_unsync_pages(vcpu, gfn);
- return 0;
+
+ return false;
}
static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
@@ -2503,7 +2553,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* be fixed if guest refault.
*/
if (level > PT_PAGE_TABLE_LEVEL &&
- has_wrprotected_page(vcpu, gfn, level))
+ mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
goto done;
spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
@@ -2768,7 +2818,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
level == PT_PAGE_TABLE_LEVEL &&
PageTransCompound(pfn_to_page(pfn)) &&
- !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
+ !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
unsigned long mask;
/*
* mmu_notifier_retry was successful and we hold the
@@ -2796,20 +2846,16 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
kvm_pfn_t pfn, unsigned access, int *ret_val)
{
- bool ret = true;
-
/* The pfn is invalid, report the error! */
if (unlikely(is_error_pfn(pfn))) {
*ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
- goto exit;
+ return true;
}
if (unlikely(is_noslot_pfn(pfn)))
vcpu_cache_mmio_info(vcpu, gva, gfn, access);
- ret = false;
-exit:
- return ret;
+ return false;
}
static bool page_fault_can_be_fast(u32 error_code)
@@ -3273,7 +3319,7 @@ static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
}
-static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
if (direct)
return vcpu_match_mmio_gpa(vcpu, addr);
@@ -3332,7 +3378,7 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
u64 spte;
bool reserved;
- if (quickly_check_mmio_pf(vcpu, addr, direct))
+ if (mmio_info_in_cache(vcpu, addr, direct))
return RET_MMIO_PF_EMULATE;
reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
@@ -3362,20 +3408,53 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
}
EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
+static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
+ u32 error_code, gfn_t gfn)
+{
+ if (unlikely(error_code & PFERR_RSVD_MASK))
+ return false;
+
+ if (!(error_code & PFERR_PRESENT_MASK) ||
+ !(error_code & PFERR_WRITE_MASK))
+ return false;
+
+ /*
+ * guest is writing the page which is write tracked which can
+ * not be fixed by page fault handler.
+ */
+ if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+ return true;
+
+ return false;
+}
+
+static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ u64 spte;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+
+ walk_shadow_page_lockless_begin(vcpu);
+ for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+ clear_sp_write_flooding_count(iterator.sptep);
+ if (!is_shadow_present_pte(spte))
+ break;
+ }
+ walk_shadow_page_lockless_end(vcpu);
+}
+
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code, bool prefault)
{
- gfn_t gfn;
+ gfn_t gfn = gva >> PAGE_SHIFT;
int r;
pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
- if (unlikely(error_code & PFERR_RSVD_MASK)) {
- r = handle_mmio_page_fault(vcpu, gva, true);
-
- if (likely(r != RET_MMIO_PF_INVALID))
- return r;
- }
+ if (page_fault_handle_page_track(vcpu, error_code, gfn))
+ return 1;
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -3383,7 +3462,6 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
- gfn = gva >> PAGE_SHIFT;
return nonpaging_map(vcpu, gva & PAGE_MASK,
error_code, gfn, prefault);
@@ -3460,12 +3538,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
- if (unlikely(error_code & PFERR_RSVD_MASK)) {
- r = handle_mmio_page_fault(vcpu, gpa, true);
-
- if (likely(r != RET_MMIO_PF_INVALID))
- return r;
- }
+ if (page_fault_handle_page_track(vcpu, error_code, gfn))
+ return 1;
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -3558,13 +3632,24 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
return false;
}
-static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
+static inline bool is_last_gpte(struct kvm_mmu *mmu,
+ unsigned level, unsigned gpte)
{
- unsigned index;
+ /*
+ * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set
+ * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
+ * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
+ */
+ gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
- index = level - 1;
- index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2);
- return mmu->last_pte_bitmap & (1 << index);
+ /*
+ * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
+ * If it is clear, there are no large pages at this level, so clear
+ * PT_PAGE_SIZE_MASK in gpte if that is the case.
+ */
+ gpte &= level - mmu->last_nonleaf_level;
+
+ return gpte & PT_PAGE_SIZE_MASK;
}
#define PTTYPE_EPT 18 /* arbitrary */
@@ -3721,13 +3806,15 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
{
+ bool uses_nx = context->nx || context->base_role.smep_andnot_wp;
+
/*
* Passing "true" to the last argument is okay; it adds a check
* on bit 8 of the SPTEs which KVM doesn't use anyway.
*/
__reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check,
boot_cpu_data.x86_phys_bits,
- context->shadow_root_level, context->nx,
+ context->shadow_root_level, uses_nx,
guest_cpuid_has_gbpages(vcpu), is_pse(vcpu),
true);
}
@@ -3836,22 +3923,13 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
}
}
-static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
{
- u8 map;
- unsigned level, root_level = mmu->root_level;
- const unsigned ps_set_index = 1 << 2; /* bit 2 of index: ps */
-
- if (root_level == PT32E_ROOT_LEVEL)
- --root_level;
- /* PT_PAGE_TABLE_LEVEL always terminates */
- map = 1 | (1 << ps_set_index);
- for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) {
- if (level <= PT_PDPE_LEVEL
- && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu)))
- map |= 1 << (ps_set_index | (level - 1));
- }
- mmu->last_pte_bitmap = map;
+ unsigned root_level = mmu->root_level;
+
+ mmu->last_nonleaf_level = root_level;
+ if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
+ mmu->last_nonleaf_level++;
}
static void paging64_init_context_common(struct kvm_vcpu *vcpu,
@@ -3863,7 +3941,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
reset_rsvds_bits_mask(vcpu, context);
update_permission_bitmask(vcpu, context, false);
- update_last_pte_bitmap(vcpu, context);
+ update_last_nonleaf_level(vcpu, context);
MMU_WARN_ON(!is_pae(vcpu));
context->page_fault = paging64_page_fault;
@@ -3890,7 +3968,7 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
reset_rsvds_bits_mask(vcpu, context);
update_permission_bitmask(vcpu, context, false);
- update_last_pte_bitmap(vcpu, context);
+ update_last_nonleaf_level(vcpu, context);
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
@@ -3948,7 +4026,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
}
update_permission_bitmask(vcpu, context, false);
- update_last_pte_bitmap(vcpu, context);
+ update_last_nonleaf_level(vcpu, context);
reset_tdp_shadow_zero_bits_mask(vcpu, context);
}
@@ -4054,7 +4132,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
}
update_permission_bitmask(vcpu, g_context, false);
- update_last_pte_bitmap(vcpu, g_context);
+ update_last_nonleaf_level(vcpu, g_context);
}
static void init_kvm_mmu(struct kvm_vcpu *vcpu)
@@ -4125,18 +4203,6 @@ static bool need_remote_flush(u64 old, u64 new)
return (old & ~new & PT64_PERM_MASK) != 0;
}
-static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
- bool remote_flush, bool local_flush)
-{
- if (zap_page)
- return;
-
- if (remote_flush)
- kvm_flush_remote_tlbs(vcpu->kvm);
- else if (local_flush)
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-}
-
static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
const u8 *new, int *bytes)
{
@@ -4186,7 +4252,8 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp)
if (sp->role.level == PT_PAGE_TABLE_LEVEL)
return false;
- return ++sp->write_flooding_count >= 3;
+ atomic_inc(&sp->write_flooding_count);
+ return atomic_read(&sp->write_flooding_count) >= 3;
}
/*
@@ -4248,15 +4315,15 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
return spte;
}
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes)
+static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const u8 *new, int bytes)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
struct kvm_mmu_page *sp;
LIST_HEAD(invalid_list);
u64 entry, gentry, *spte;
int npte;
- bool remote_flush, local_flush, zap_page;
+ bool remote_flush, local_flush;
union kvm_mmu_page_role mask = { };
mask.cr0_wp = 1;
@@ -4273,7 +4340,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
return;
- zap_page = remote_flush = local_flush = false;
+ remote_flush = local_flush = false;
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
@@ -4293,8 +4360,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
if (detect_write_misaligned(sp, gpa, bytes) ||
detect_write_flooding(sp)) {
- zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
- &invalid_list);
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
++vcpu->kvm->stat.mmu_flooded;
continue;
}
@@ -4316,8 +4382,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
++spte;
}
}
- mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
spin_unlock(&vcpu->kvm->mmu_lock);
}
@@ -4354,32 +4419,34 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
}
-static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
-{
- if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu))
- return vcpu_match_mmio_gpa(vcpu, addr);
-
- return vcpu_match_mmio_gva(vcpu, addr);
-}
-
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
void *insn, int insn_len)
{
int r, emulation_type = EMULTYPE_RETRY;
enum emulation_result er;
+ bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu);
+
+ if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ r = handle_mmio_page_fault(vcpu, cr2, direct);
+ if (r == RET_MMIO_PF_EMULATE) {
+ emulation_type = 0;
+ goto emulate;
+ }
+ if (r == RET_MMIO_PF_RETRY)
+ return 1;
+ if (r < 0)
+ return r;
+ }
r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
if (r < 0)
- goto out;
-
- if (!r) {
- r = 1;
- goto out;
- }
+ return r;
+ if (!r)
+ return 1;
- if (is_mmio_page_fault(vcpu, cr2))
+ if (mmio_info_in_cache(vcpu, cr2, direct))
emulation_type = 0;
-
+emulate:
er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
switch (er) {
@@ -4393,8 +4460,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
default:
BUG();
}
-out:
- return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
@@ -4463,6 +4528,21 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu)
init_kvm_mmu(vcpu);
}
+void kvm_mmu_init_vm(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+
+ node->track_write = kvm_mmu_pte_write;
+ kvm_page_track_register_notifier(kvm, node);
+}
+
+void kvm_mmu_uninit_vm(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+
+ kvm_page_track_unregister_notifier(kvm, node);
+}
+
/* The return value indicates if tlb flush on all vcpus is needed. */
typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 55ffb7b..58fe98a 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -174,4 +174,9 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
+
+void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
+bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+ struct kvm_memory_slot *slot, u64 gfn);
#endif
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
new file mode 100644
index 0000000..11f7643
--- /dev/null
+++ b/arch/x86/kvm/page_track.c
@@ -0,0 +1,222 @@
+/*
+ * Support KVM gust page tracking
+ *
+ * This feature allows us to track page access in guest. Currently, only
+ * write access is tracked.
+ *
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Author:
+ * Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_host.h>
+#include <asm/kvm_page_track.h>
+
+#include "mmu.h"
+
+void kvm_page_track_free_memslot(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont)
+{
+ int i;
+
+ for (i = 0; i < KVM_PAGE_TRACK_MAX; i++)
+ if (!dont || free->arch.gfn_track[i] !=
+ dont->arch.gfn_track[i]) {
+ kvfree(free->arch.gfn_track[i]);
+ free->arch.gfn_track[i] = NULL;
+ }
+}
+
+int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
+ unsigned long npages)
+{
+ int i;
+
+ for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
+ slot->arch.gfn_track[i] = kvm_kvzalloc(npages *
+ sizeof(*slot->arch.gfn_track[i]));
+ if (!slot->arch.gfn_track[i])
+ goto track_free;
+ }
+
+ return 0;
+
+track_free:
+ kvm_page_track_free_memslot(slot, NULL);
+ return -ENOMEM;
+}
+
+static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode)
+{
+ if (mode < 0 || mode >= KVM_PAGE_TRACK_MAX)
+ return false;
+
+ return true;
+}
+
+static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn,
+ enum kvm_page_track_mode mode, short count)
+{
+ int index, val;
+
+ index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
+
+ val = slot->arch.gfn_track[mode][index];
+
+ if (WARN_ON(val + count < 0 || val + count > USHRT_MAX))
+ return;
+
+ slot->arch.gfn_track[mode][index] += count;
+}
+
+/*
+ * add guest page to the tracking pool so that corresponding access on that
+ * page will be intercepted.
+ *
+ * It should be called under the protection both of mmu-lock and kvm->srcu
+ * or kvm->slots_lock.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @slot: the @gfn belongs to.
+ * @gfn: the guest page.
+ * @mode: tracking mode, currently only write track is supported.
+ */
+void kvm_slot_page_track_add_page(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ enum kvm_page_track_mode mode)
+{
+
+ if (WARN_ON(!page_track_mode_is_valid(mode)))
+ return;
+
+ update_gfn_track(slot, gfn, mode, 1);
+
+ /*
+ * new track stops large page mapping for the
+ * tracked page.
+ */
+ kvm_mmu_gfn_disallow_lpage(slot, gfn);
+
+ if (mode == KVM_PAGE_TRACK_WRITE)
+ if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn))
+ kvm_flush_remote_tlbs(kvm);
+}
+
+/*
+ * remove the guest page from the tracking pool which stops the interception
+ * of corresponding access on that page. It is the opposed operation of
+ * kvm_slot_page_track_add_page().
+ *
+ * It should be called under the protection both of mmu-lock and kvm->srcu
+ * or kvm->slots_lock.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @slot: the @gfn belongs to.
+ * @gfn: the guest page.
+ * @mode: tracking mode, currently only write track is supported.
+ */
+void kvm_slot_page_track_remove_page(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ enum kvm_page_track_mode mode)
+{
+ if (WARN_ON(!page_track_mode_is_valid(mode)))
+ return;
+
+ update_gfn_track(slot, gfn, mode, -1);
+
+ /*
+ * allow large page mapping for the tracked page
+ * after the tracker is gone.
+ */
+ kvm_mmu_gfn_allow_lpage(slot, gfn);
+}
+
+/*
+ * check if the corresponding access on the specified guest page is tracked.
+ */
+bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
+ enum kvm_page_track_mode mode)
+{
+ struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ int index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
+
+ if (WARN_ON(!page_track_mode_is_valid(mode)))
+ return false;
+
+ return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]);
+}
+
+void kvm_page_track_init(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+ init_srcu_struct(&head->track_srcu);
+ INIT_HLIST_HEAD(&head->track_notifier_list);
+}
+
+/*
+ * register the notifier so that event interception for the tracked guest
+ * pages can be received.
+ */
+void
+kvm_page_track_register_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+
+ spin_lock(&kvm->mmu_lock);
+ hlist_add_head_rcu(&n->node, &head->track_notifier_list);
+ spin_unlock(&kvm->mmu_lock);
+}
+
+/*
+ * stop receiving the event interception. It is the opposed operation of
+ * kvm_page_track_register_notifier().
+ */
+void
+kvm_page_track_unregister_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+
+ spin_lock(&kvm->mmu_lock);
+ hlist_del_rcu(&n->node);
+ spin_unlock(&kvm->mmu_lock);
+ synchronize_srcu(&head->track_srcu);
+}
+
+/*
+ * Notify the node that write access is intercepted and write emulation is
+ * finished at this time.
+ *
+ * The node should figure out if the written page is the one that node is
+ * interested in by itself.
+ */
+void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+ int bytes)
+{
+ struct kvm_page_track_notifier_head *head;
+ struct kvm_page_track_notifier_node *n;
+ int idx;
+
+ head = &vcpu->kvm->arch.track_notifier_head;
+
+ if (hlist_empty(&head->track_notifier_list))
+ return;
+
+ idx = srcu_read_lock(&head->track_srcu);
+ hlist_for_each_entry_rcu(n, &head->track_notifier_list, node)
+ if (n->track_write)
+ n->track_write(vcpu, gpa, new, bytes);
+ srcu_read_unlock(&head->track_srcu, idx);
+}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6c9fed9..e159a81 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -189,8 +189,11 @@ static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
ACC_USER_MASK;
#else
- access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
- access &= ~(gpte >> PT64_NX_SHIFT);
+ BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
+ BUILD_BUG_ON(ACC_EXEC_MASK != 1);
+ access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK);
+ /* Combine NX with P (which is set here) to get ACC_EXEC_MASK. */
+ access ^= (gpte >> PT64_NX_SHIFT);
#endif
return access;
@@ -249,7 +252,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
return ret;
kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
- walker->ptes[level] = pte;
+ walker->ptes[level - 1] = pte;
}
return 0;
}
@@ -702,24 +705,17 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
- if (unlikely(error_code & PFERR_RSVD_MASK)) {
- r = handle_mmio_page_fault(vcpu, addr, mmu_is_nested(vcpu));
- if (likely(r != RET_MMIO_PF_INVALID))
- return r;
-
- /*
- * page fault with PFEC.RSVD = 1 is caused by shadow
- * page fault, should not be used to walk guest page
- * table.
- */
- error_code &= ~PFERR_RSVD_MASK;
- };
-
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
/*
+ * If PFEC.RSVD is set, this is a shadow page fault.
+ * The bit needs to be cleared before walking guest page tables.
+ */
+ error_code &= ~PFERR_RSVD_MASK;
+
+ /*
* Look up the guest pte for the faulting address.
*/
r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
@@ -735,6 +731,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
return 0;
}
+ if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
+ shadow_page_table_clear_flood(vcpu, addr);
+ return 1;
+ }
+
vcpu->arch.write_fault_to_shadow_pgtable = false;
is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
@@ -945,7 +946,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
sizeof(pt_element_t)))
- return -EINVAL;
+ return 0;
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
vcpu->kvm->tlbs_dirty++;
@@ -977,7 +978,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
host_writable);
}
- return !nr_present;
+ return nr_present;
}
#undef pt_element_t
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 31aa2c8..06ce377 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -257,7 +257,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
- if (vcpu->arch.apic)
+ if (lapic_in_kernel(vcpu))
kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c13a64b..9507038 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1858,8 +1858,7 @@ static int halt_interception(struct vcpu_svm *svm)
static int vmmcall_interception(struct vcpu_svm *svm)
{
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- kvm_emulate_hypercall(&svm->vcpu);
- return 1;
+ return kvm_emulate_hypercall(&svm->vcpu);
}
static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index ad9f6a2..2f1ea2f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -996,11 +996,13 @@ TRACE_EVENT(kvm_enter_smm,
* Tracepoint for VT-d posted-interrupts.
*/
TRACE_EVENT(kvm_pi_irte_update,
- TP_PROTO(unsigned int vcpu_id, unsigned int gsi,
- unsigned int gvec, u64 pi_desc_addr, bool set),
- TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set),
+ TP_PROTO(unsigned int host_irq, unsigned int vcpu_id,
+ unsigned int gsi, unsigned int gvec,
+ u64 pi_desc_addr, bool set),
+ TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set),
TP_STRUCT__entry(
+ __field( unsigned int, host_irq )
__field( unsigned int, vcpu_id )
__field( unsigned int, gsi )
__field( unsigned int, gvec )
@@ -1009,6 +1011,7 @@ TRACE_EVENT(kvm_pi_irte_update,
),
TP_fast_assign(
+ __entry->host_irq = host_irq;
__entry->vcpu_id = vcpu_id;
__entry->gsi = gsi;
__entry->gvec = gvec;
@@ -1016,9 +1019,10 @@ TRACE_EVENT(kvm_pi_irte_update,
__entry->set = set;
),
- TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, "
+ TP_printk("VT-d PI is %s for irq %u, vcpu %u, gsi: 0x%x, "
"gvec: 0x%x, pi_desc_addr: 0x%llx",
__entry->set ? "enabled and being updated" : "disabled",
+ __entry->host_irq,
__entry->vcpu_id,
__entry->gsi,
__entry->gvec,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e153522..75173ef 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -596,6 +596,8 @@ struct vcpu_vmx {
/* Support for PML */
#define PML_ENTITY_NUM 512
struct page *pml_pg;
+
+ u64 current_tsc_ratio;
};
enum segment_cache_field {
@@ -861,7 +863,6 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
static u64 construct_eptp(unsigned long root_hpa);
static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
-static bool vmx_mpx_supported(void);
static bool vmx_xsaves_supported(void);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
@@ -961,25 +962,36 @@ static const u32 vmx_msr_index[] = {
MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};
-static inline bool is_page_fault(u32 intr_info)
+static inline bool is_exception_n(u32 intr_info, u8 vector)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
+ (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
+}
+
+static inline bool is_debug(u32 intr_info)
+{
+ return is_exception_n(intr_info, DB_VECTOR);
+}
+
+static inline bool is_breakpoint(u32 intr_info)
+{
+ return is_exception_n(intr_info, BP_VECTOR);
+}
+
+static inline bool is_page_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, PF_VECTOR);
}
static inline bool is_no_device(u32 intr_info)
{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
+ return is_exception_n(intr_info, NM_VECTOR);
}
static inline bool is_invalid_opcode(u32 intr_info)
{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+ return is_exception_n(intr_info, UD_VECTOR);
}
static inline bool is_external_interrupt(u32 intr_info)
@@ -1811,6 +1823,13 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
return;
}
break;
+ case MSR_IA32_PEBS_ENABLE:
+ /* PEBS needs a quiescent period after being disabled (to write
+ * a record). Disabling PEBS through VMX MSR swapping doesn't
+ * provide that period, so a CPU could write host's record into
+ * guest's memory.
+ */
+ wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
}
for (i = 0; i < m->nr; ++i)
@@ -1848,26 +1867,31 @@ static void reload_tss(void)
static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
{
- u64 guest_efer;
- u64 ignore_bits;
+ u64 guest_efer = vmx->vcpu.arch.efer;
+ u64 ignore_bits = 0;
- guest_efer = vmx->vcpu.arch.efer;
+ if (!enable_ept) {
+ /*
+ * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing
+ * host CPUID is more efficient than testing guest CPUID
+ * or CR4. Host SMEP is anyway a requirement for guest SMEP.
+ */
+ if (boot_cpu_has(X86_FEATURE_SMEP))
+ guest_efer |= EFER_NX;
+ else if (!(guest_efer & EFER_NX))
+ ignore_bits |= EFER_NX;
+ }
/*
- * NX is emulated; LMA and LME handled by hardware; SCE meaningless
- * outside long mode
+ * LMA and LME handled by hardware; SCE meaningless outside long mode.
*/
- ignore_bits = EFER_NX | EFER_SCE;
+ ignore_bits |= EFER_SCE;
#ifdef CONFIG_X86_64
ignore_bits |= EFER_LMA | EFER_LME;
/* SCE is meaningful only in long mode on Intel */
if (guest_efer & EFER_LMA)
ignore_bits &= ~(u64)EFER_SCE;
#endif
- guest_efer &= ~ignore_bits;
- guest_efer |= host_efer & ignore_bits;
- vmx->guest_msrs[efer_offset].data = guest_efer;
- vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
clear_atomic_switch_msr(vmx, MSR_EFER);
@@ -1878,16 +1902,21 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
*/
if (cpu_has_load_ia32_efer ||
(enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
- guest_efer = vmx->vcpu.arch.efer;
if (!(guest_efer & EFER_LMA))
guest_efer &= ~EFER_LME;
if (guest_efer != host_efer)
add_atomic_switch_msr(vmx, MSR_EFER,
guest_efer, host_efer);
return false;
- }
+ } else {
+ guest_efer &= ~ignore_bits;
+ guest_efer |= host_efer & ignore_bits;
- return true;
+ vmx->guest_msrs[efer_offset].data = guest_efer;
+ vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+
+ return true;
+ }
}
static unsigned long segment_base(u16 selector)
@@ -2127,14 +2156,16 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
- /* Setup TSC multiplier */
- if (cpu_has_vmx_tsc_scaling())
- vmcs_write64(TSC_MULTIPLIER,
- vcpu->arch.tsc_scaling_ratio);
-
vmx->loaded_vmcs->cpu = cpu;
}
+ /* Setup TSC multiplier */
+ if (kvm_has_tsc_control &&
+ vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) {
+ vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio;
+ vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
+ }
+
vmx_vcpu_pi_load(vcpu, cpu);
}
@@ -2584,7 +2615,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
- if (vmx_mpx_supported())
+ if (kvm_mpx_supported())
vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
/* We support free control of debug control saving. */
@@ -2605,7 +2636,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
VM_ENTRY_LOAD_IA32_PAT;
vmx->nested.nested_vmx_entry_ctls_high |=
(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
- if (vmx_mpx_supported())
+ if (kvm_mpx_supported())
vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
/* We support free control of debug control loading. */
@@ -2849,7 +2880,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
case MSR_IA32_BNDCFGS:
- if (!vmx_mpx_supported())
+ if (!kvm_mpx_supported())
return 1;
msr_info->data = vmcs_read64(GUEST_BNDCFGS);
break;
@@ -2926,7 +2957,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
case MSR_IA32_BNDCFGS:
- if (!vmx_mpx_supported())
+ if (!kvm_mpx_supported())
return 1;
vmcs_write64(GUEST_BNDCFGS, data);
break;
@@ -3399,7 +3430,7 @@ static void init_vmcs_shadow_fields(void)
for (i = j = 0; i < max_shadow_read_write_fields; i++) {
switch (shadow_read_write_fields[i]) {
case GUEST_BNDCFGS:
- if (!vmx_mpx_supported())
+ if (!kvm_mpx_supported())
continue;
break;
default:
@@ -5608,11 +5639,8 @@ static int handle_dr(struct kvm_vcpu *vcpu)
}
if (vcpu->guest_debug == 0) {
- u32 cpu_based_vm_exec_control;
-
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+ vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_MOV_DR_EXITING);
/*
* No more DR vmexits; force a reload of the debug registers
@@ -5649,8 +5677,6 @@ static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
- u32 cpu_based_vm_exec_control;
-
get_debugreg(vcpu->arch.db[0], 0);
get_debugreg(vcpu->arch.db[1], 1);
get_debugreg(vcpu->arch.db[2], 2);
@@ -5659,10 +5685,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
-
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+ vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
}
static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
@@ -5747,8 +5770,7 @@ static int handle_halt(struct kvm_vcpu *vcpu)
static int handle_vmcall(struct kvm_vcpu *vcpu)
{
- kvm_emulate_hypercall(vcpu);
- return 1;
+ return kvm_emulate_hypercall(vcpu);
}
static int handle_invd(struct kvm_vcpu *vcpu)
@@ -6435,8 +6457,8 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
/* Recycle the least recently used VMCS. */
- item = list_entry(vmx->nested.vmcs02_pool.prev,
- struct vmcs02_list, list);
+ item = list_last_entry(&vmx->nested.vmcs02_pool,
+ struct vmcs02_list, list);
item->vmptr = vmx->nested.current_vmptr;
list_move(&item->list, &vmx->nested.vmcs02_pool);
return &item->vmcs02;
@@ -7752,6 +7774,13 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
else if (is_no_device(intr_info) &&
!(vmcs12->guest_cr0 & X86_CR0_TS))
return false;
+ else if (is_debug(intr_info) &&
+ vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+ return false;
+ else if (is_breakpoint(intr_info) &&
+ vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ return false;
return vmcs12->exception_bitmap &
(1u << (intr_info & INTR_INFO_VECTOR_MASK));
case EXIT_REASON_EXTERNAL_INTERRUPT:
@@ -10258,7 +10287,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
- if (vmx_mpx_supported())
+ if (kvm_mpx_supported())
vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
if (nested_cpu_has_xsaves(vmcs12))
vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
@@ -10766,13 +10795,26 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
*/
kvm_set_msi_irq(e, &irq);
- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu))
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+ /*
+ * Make sure the IRTE is in remapped mode if
+ * we don't handle it in posted mode.
+ */
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+ if (ret < 0) {
+ printk(KERN_INFO
+ "failed to back to remapped mode, irq: %u\n",
+ host_irq);
+ goto out;
+ }
+
continue;
+ }
vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
vcpu_info.vector = irq.vector;
- trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi,
+ trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
vcpu_info.vector, vcpu_info.pi_desc_addr, set);
if (set)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4244c2b..7236bd3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -123,6 +123,9 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
unsigned int __read_mostly lapic_timer_advance_ns = 0;
module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
+static bool __read_mostly vector_hashing = true;
+module_param(vector_hashing, bool, S_IRUGO);
+
static bool __read_mostly backwards_tsc_observed = false;
#define KVM_NR_SHARED_MSRS 16
@@ -1196,17 +1199,11 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
- uint32_t quotient, remainder;
-
- /* Don't try to replace with do_div(), this one calculates
- * "(dividend << 32) / divisor" */
- __asm__ ( "divl %4"
- : "=a" (quotient), "=d" (remainder)
- : "0" (0), "1" (dividend), "r" (divisor) );
- return quotient;
+ do_shl32_div32(dividend, divisor);
+ return dividend;
}
-static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
+static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
s8 *pshift, u32 *pmultiplier)
{
uint64_t scaled64;
@@ -1214,8 +1211,8 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
uint64_t tps64;
uint32_t tps32;
- tps64 = base_khz * 1000LL;
- scaled64 = scaled_khz * 1000LL;
+ tps64 = base_hz;
+ scaled64 = scaled_hz;
while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
tps64 >>= 1;
shift--;
@@ -1233,8 +1230,8 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
*pshift = shift;
*pmultiplier = div_frac(scaled64, tps32);
- pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
- __func__, base_khz, scaled_khz, shift, *pmultiplier);
+ pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
+ __func__, base_hz, scaled_hz, shift, *pmultiplier);
}
#ifdef CONFIG_X86_64
@@ -1293,23 +1290,23 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
return 0;
}
-static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
+static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
{
u32 thresh_lo, thresh_hi;
int use_scaling = 0;
/* tsc_khz can be zero if TSC calibration fails */
- if (this_tsc_khz == 0) {
+ if (user_tsc_khz == 0) {
/* set tsc_scaling_ratio to a safe value */
vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
return -1;
}
/* Compute a scale to convert nanoseconds in TSC cycles */
- kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
+ kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
&vcpu->arch.virtual_tsc_shift,
&vcpu->arch.virtual_tsc_mult);
- vcpu->arch.virtual_tsc_khz = this_tsc_khz;
+ vcpu->arch.virtual_tsc_khz = user_tsc_khz;
/*
* Compute the variation in TSC rate which is acceptable
@@ -1319,11 +1316,11 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
*/
thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
- if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
- pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
+ if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
+ pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
use_scaling = 1;
}
- return set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
+ return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
}
static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
@@ -1716,7 +1713,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
- unsigned long flags, this_tsc_khz, tgt_tsc_khz;
+ unsigned long flags, tgt_tsc_khz;
struct kvm_vcpu_arch *vcpu = &v->arch;
struct kvm_arch *ka = &v->kvm->arch;
s64 kernel_ns;
@@ -1742,8 +1739,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
- this_tsc_khz = __this_cpu_read(cpu_tsc_khz);
- if (unlikely(this_tsc_khz == 0)) {
+ tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
+ if (unlikely(tgt_tsc_khz == 0)) {
local_irq_restore(flags);
kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
return 1;
@@ -1778,13 +1775,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
if (!vcpu->pv_time_enabled)
return 0;
- if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
- tgt_tsc_khz = kvm_has_tsc_control ?
- vcpu->virtual_tsc_khz : this_tsc_khz;
- kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz,
+ if (kvm_has_tsc_control)
+ tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
+
+ if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
+ kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
&vcpu->hv_clock.tsc_shift,
&vcpu->hv_clock.tsc_to_system_mul);
- vcpu->hw_tsc_khz = this_tsc_khz;
+ vcpu->hw_tsc_khz = tgt_tsc_khz;
}
/* With all the info we got, fill in the values */
@@ -2987,7 +2985,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
- kvm_vcpu_has_lapic(vcpu))
+ lapic_in_kernel(vcpu))
vcpu->arch.apic->sipi_vector = events->sipi_vector;
if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
@@ -3000,7 +2998,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
else
vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
- if (kvm_vcpu_has_lapic(vcpu)) {
+ if (lapic_in_kernel(vcpu)) {
if (events->smi.latched_init)
set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
else
@@ -3240,7 +3238,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
switch (ioctl) {
case KVM_GET_LAPIC: {
r = -EINVAL;
- if (!vcpu->arch.apic)
+ if (!lapic_in_kernel(vcpu))
goto out;
u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
@@ -3258,7 +3256,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
case KVM_SET_LAPIC: {
r = -EINVAL;
- if (!vcpu->arch.apic)
+ if (!lapic_in_kernel(vcpu))
goto out;
u.lapic = memdup_user(argp, sizeof(*u.lapic));
if (IS_ERR(u.lapic))
@@ -3605,20 +3603,26 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
+
+ BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
+
+ mutex_lock(&kps->lock);
+ memcpy(ps, &kps->channels, sizeof(*ps));
+ mutex_unlock(&kps->lock);
return 0;
}
static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
int i;
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
for (i = 0; i < 3; i++)
- kvm_pit_load_count(kvm, i, ps->channels[i].count, 0);
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
+ mutex_unlock(&pit->pit_state.lock);
return 0;
}
@@ -3638,29 +3642,39 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
int start = 0;
int i;
u32 prev_legacy, cur_legacy;
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
if (!prev_legacy && cur_legacy)
start = 1;
- memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
- sizeof(kvm->arch.vpit->pit_state.channels));
- kvm->arch.vpit->pit_state.flags = ps->flags;
+ memcpy(&pit->pit_state.channels, &ps->channels,
+ sizeof(pit->pit_state.channels));
+ pit->pit_state.flags = ps->flags;
for (i = 0; i < 3; i++)
- kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count,
+ kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
start && i == 0);
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ mutex_unlock(&pit->pit_state.lock);
return 0;
}
static int kvm_vm_ioctl_reinject(struct kvm *kvm,
struct kvm_reinject_control *control)
{
- if (!kvm->arch.vpit)
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ if (!pit)
return -ENXIO;
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+
+ /* pit->pit_state.lock was overloaded to prevent userspace from getting
+ * an inconsistent state after running multiple KVM_REINJECT_CONTROL
+ * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
+ */
+ mutex_lock(&pit->pit_state.lock);
+ kvm_pit_set_reinject(pit, control->pit_reinject);
+ mutex_unlock(&pit->pit_state.lock);
+
return 0;
}
@@ -4093,7 +4107,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
do {
n = min(len, 8);
- if (!(vcpu->arch.apic &&
+ if (!(lapic_in_kernel(vcpu) &&
!kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
&& kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
break;
@@ -4113,7 +4127,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
do {
n = min(len, 8);
- if (!(vcpu->arch.apic &&
+ if (!(lapic_in_kernel(vcpu) &&
!kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
addr, n, v))
&& kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
@@ -4346,7 +4360,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
if (ret < 0)
return 0;
- kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+ kvm_page_track_write(vcpu, gpa, val, bytes);
return 1;
}
@@ -4604,7 +4618,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CMPXCHG_FAILED;
kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
- kvm_mmu_pte_write(vcpu, gpa, new, bytes);
+ kvm_page_track_write(vcpu, gpa, new, bytes);
return X86EMUL_CONTINUE;
@@ -6010,7 +6024,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
if (!kvm_x86_ops->update_cr8_intercept)
return;
- if (!vcpu->arch.apic)
+ if (!lapic_in_kernel(vcpu))
return;
if (vcpu->arch.apicv_active)
@@ -6618,12 +6632,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* KVM_DEBUGREG_WONT_EXIT again.
*/
if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
- int i;
-
WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
kvm_x86_ops->sync_dirty_debug_regs(vcpu);
- for (i = 0; i < KVM_NR_DB_REGS; i++)
- vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+ kvm_update_dr0123(vcpu);
+ kvm_update_dr6(vcpu);
+ kvm_update_dr7(vcpu);
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
}
/*
@@ -7038,7 +7052,7 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
- if (!kvm_vcpu_has_lapic(vcpu) &&
+ if (!lapic_in_kernel(vcpu) &&
mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
return -EINVAL;
@@ -7314,7 +7328,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
* Every 255 times fpu_counter rolls over to 0; a guest that uses
* the FPU in bursts will revert to loading it on demand.
*/
- if (!vcpu->arch.eager_fpu) {
+ if (!use_eager_fpu()) {
if (++vcpu->fpu_counter < 5)
kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
}
@@ -7593,6 +7607,7 @@ bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
}
struct static_key kvm_no_apic_vcpu __read_mostly;
+EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
@@ -7724,6 +7739,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
+ kvm_page_track_init(kvm);
+ kvm_mmu_init_vm(kvm);
+
return 0;
}
@@ -7850,6 +7868,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kfree(kvm->arch.vioapic);
kvm_free_vcpus(kvm);
kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+ kvm_mmu_uninit_vm(kvm);
}
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
@@ -7871,6 +7890,8 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
free->arch.lpage_info[i - 1] = NULL;
}
}
+
+ kvm_page_track_free_memslot(free, dont);
}
int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
@@ -7879,6 +7900,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
int i;
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ struct kvm_lpage_info *linfo;
unsigned long ugfn;
int lpages;
int level = i + 1;
@@ -7893,15 +7915,16 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
if (i == 0)
continue;
- slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
- sizeof(*slot->arch.lpage_info[i - 1]));
- if (!slot->arch.lpage_info[i - 1])
+ linfo = kvm_kvzalloc(lpages * sizeof(*linfo));
+ if (!linfo)
goto out_free;
+ slot->arch.lpage_info[i - 1] = linfo;
+
if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
- slot->arch.lpage_info[i - 1][0].write_count = 1;
+ linfo[0].disallow_lpage = 1;
if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
- slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
+ linfo[lpages - 1].disallow_lpage = 1;
ugfn = slot->userspace_addr >> PAGE_SHIFT;
/*
* If the gfn and userspace address are not aligned wrt each
@@ -7913,10 +7936,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
unsigned long j;
for (j = 0; j < lpages; ++j)
- slot->arch.lpage_info[i - 1][j].write_count = 1;
+ linfo[j].disallow_lpage = 1;
}
}
+ if (kvm_page_track_create_memslot(slot, npages))
+ goto out_free;
+
return 0;
out_free:
@@ -8370,6 +8396,12 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
}
+bool kvm_vector_hashing_enabled(void)
+{
+ return vector_hashing;
+}
+EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f2afa5f..007940f 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -179,6 +179,7 @@ int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
int page_num);
+bool kvm_vector_hashing_enabled(void);
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
@@ -192,4 +193,19 @@ extern unsigned int min_timer_period_us;
extern unsigned int lapic_timer_advance_ns;
extern struct static_key kvm_no_apic_vcpu;
+
+/* Same "calling convention" as do_div:
+ * - divide (n << 32) by base
+ * - put result in n
+ * - return remainder
+ */
+#define do_shl32_div32(n, base) \
+ ({ \
+ u32 __quot, __rem; \
+ asm("divl %2" : "=a" (__quot), "=d" (__rem) \
+ : "rm" (base), "0" (0), "1" ((u32) n)); \
+ n = __quot; \
+ __rem; \
+ })
+
#endif
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 4ba229a..fd57d3a 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1520,12 +1520,6 @@ __init void lguest_init(void)
*/
reserve_top_address(lguest_data.reserve_mem);
- /*
- * If we don't initialize the lock dependency checker now, it crashes
- * atomic_notifier_chain_register, then paravirt_disable_iospace.
- */
- lockdep_init();
-
/* Hook in our special panic hypercall code. */
atomic_notifier_chain_register(&panic_notifier_list, &paniced);
@@ -1535,7 +1529,7 @@ __init void lguest_init(void)
*/
cpu_detect(&new_cpu_data);
/* head.S usually sets up the first capability word, so do it here. */
- new_cpu_data.x86_capability[0] = cpuid_edx(1);
+ new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
/* Math is always hard! */
set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
index 422db00..5cc78bf 100644
--- a/arch/x86/lib/cmdline.c
+++ b/arch/x86/lib/cmdline.c
@@ -21,12 +21,16 @@ static inline int myisspace(u8 c)
* @option: option string to look for
*
* Returns the position of that @option (starts counting with 1)
- * or 0 on not found.
+ * or 0 on not found. @option will only be found if it is found
+ * as an entire word in @cmdline. For instance, if @option="car"
+ * then a cmdline which contains "cart" will not match.
*/
-int cmdline_find_option_bool(const char *cmdline, const char *option)
+static int
+__cmdline_find_option_bool(const char *cmdline, int max_cmdline_size,
+ const char *option)
{
char c;
- int len, pos = 0, wstart = 0;
+ int pos = 0, wstart = 0;
const char *opptr = NULL;
enum {
st_wordstart = 0, /* Start of word/after whitespace */
@@ -37,11 +41,11 @@ int cmdline_find_option_bool(const char *cmdline, const char *option)
if (!cmdline)
return -1; /* No command line */
- len = min_t(int, strlen(cmdline), COMMAND_LINE_SIZE);
- if (!len)
- return 0;
-
- while (len--) {
+ /*
+ * This 'pos' check ensures we do not overrun
+ * a non-NULL-terminated 'cmdline'
+ */
+ while (pos < max_cmdline_size) {
c = *(char *)cmdline++;
pos++;
@@ -58,18 +62,35 @@ int cmdline_find_option_bool(const char *cmdline, const char *option)
/* fall through */
case st_wordcmp:
- if (!*opptr)
+ if (!*opptr) {
+ /*
+ * We matched all the way to the end of the
+ * option we were looking for. If the
+ * command-line has a space _or_ ends, then
+ * we matched!
+ */
if (!c || myisspace(c))
return wstart;
- else
- state = st_wordskip;
- else if (!c)
+ /*
+ * We hit the end of the option, but _not_
+ * the end of a word on the cmdline. Not
+ * a match.
+ */
+ } else if (!c) {
+ /*
+ * Hit the NULL terminator on the end of
+ * cmdline.
+ */
return 0;
- else if (c != *opptr++)
- state = st_wordskip;
- else if (!len) /* last word and is matching */
- return wstart;
- break;
+ } else if (c == *opptr++) {
+ /*
+ * We are currently matching, so continue
+ * to the next character on the cmdline.
+ */
+ break;
+ }
+ state = st_wordskip;
+ /* fall through */
case st_wordskip:
if (!c)
@@ -82,3 +103,8 @@ int cmdline_find_option_bool(const char *cmdline, const char *option)
return 0; /* Buffer overrun */
}
+
+int cmdline_find_option_bool(const char *cmdline, const char *option)
+{
+ return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
+}
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index 1318f75..28a6654 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -135,7 +135,7 @@ EXPORT_SYMBOL(csum_partial_copy_nocheck);
__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
const struct in6_addr *daddr,
- __u32 len, unsigned short proto, __wsum sum)
+ __u32 len, __u8 proto, __wsum sum)
{
__u64 rest, sum64;
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index e912b2f..2f07c29 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -102,7 +102,7 @@ static void delay_mwaitx(unsigned long __loops)
* Use cpu_tss as a cacheline-aligned, seldomly
* accessed per-cpu variable as the monitor target.
*/
- __monitorx(this_cpu_ptr(&cpu_tss), 0, 0);
+ __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0);
/*
* AMD, like Intel, supports the EAX hint and EAX=0xf
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index a0de849..cbb8ee5 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -177,3 +177,120 @@ ENTRY(memcpy_orig)
.Lend:
retq
ENDPROC(memcpy_orig)
+
+#ifndef CONFIG_UML
+/*
+ * memcpy_mcsafe - memory copy with machine check exception handling
+ * Note that we only catch machine checks when reading the source addresses.
+ * Writes to target are posted and don't generate machine checks.
+ */
+ENTRY(memcpy_mcsafe)
+ cmpl $8, %edx
+ /* Less than 8 bytes? Go to byte copy loop */
+ jb .L_no_whole_words
+
+ /* Check for bad alignment of source */
+ testl $7, %esi
+ /* Already aligned */
+ jz .L_8byte_aligned
+
+ /* Copy one byte at a time until source is 8-byte aligned */
+ movl %esi, %ecx
+ andl $7, %ecx
+ subl $8, %ecx
+ negl %ecx
+ subl %ecx, %edx
+.L_copy_leading_bytes:
+ movb (%rsi), %al
+ movb %al, (%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz .L_copy_leading_bytes
+
+.L_8byte_aligned:
+ /* Figure out how many whole cache lines (64-bytes) to copy */
+ movl %edx, %ecx
+ andl $63, %edx
+ shrl $6, %ecx
+ jz .L_no_whole_cache_lines
+
+ /* Loop copying whole cache lines */
+.L_cache_w0: movq (%rsi), %r8
+.L_cache_w1: movq 1*8(%rsi), %r9
+.L_cache_w2: movq 2*8(%rsi), %r10
+.L_cache_w3: movq 3*8(%rsi), %r11
+ movq %r8, (%rdi)
+ movq %r9, 1*8(%rdi)
+ movq %r10, 2*8(%rdi)
+ movq %r11, 3*8(%rdi)
+.L_cache_w4: movq 4*8(%rsi), %r8
+.L_cache_w5: movq 5*8(%rsi), %r9
+.L_cache_w6: movq 6*8(%rsi), %r10
+.L_cache_w7: movq 7*8(%rsi), %r11
+ movq %r8, 4*8(%rdi)
+ movq %r9, 5*8(%rdi)
+ movq %r10, 6*8(%rdi)
+ movq %r11, 7*8(%rdi)
+ leaq 64(%rsi), %rsi
+ leaq 64(%rdi), %rdi
+ decl %ecx
+ jnz .L_cache_w0
+
+ /* Are there any trailing 8-byte words? */
+.L_no_whole_cache_lines:
+ movl %edx, %ecx
+ andl $7, %edx
+ shrl $3, %ecx
+ jz .L_no_whole_words
+
+ /* Copy trailing words */
+.L_copy_trailing_words:
+ movq (%rsi), %r8
+ mov %r8, (%rdi)
+ leaq 8(%rsi), %rsi
+ leaq 8(%rdi), %rdi
+ decl %ecx
+ jnz .L_copy_trailing_words
+
+ /* Any trailing bytes? */
+.L_no_whole_words:
+ andl %edx, %edx
+ jz .L_done_memcpy_trap
+
+ /* Copy trailing bytes */
+ movl %edx, %ecx
+.L_copy_trailing_bytes:
+ movb (%rsi), %al
+ movb %al, (%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz .L_copy_trailing_bytes
+
+ /* Copy successful. Return true */
+.L_done_memcpy_trap:
+ xorq %rax, %rax
+ ret
+ENDPROC(memcpy_mcsafe)
+
+ .section .fixup, "ax"
+ /* Return false for any failure */
+.L_memcpy_mcsafe_fail:
+ mov $1, %rax
+ ret
+
+ .previous
+
+ _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
+ _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
+ _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
+ _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
+ _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
+ _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
+ _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
+ _ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
+ _ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
+ _ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
+ _ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
+#endif
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 4a6f1d9..99bfb19 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -358,20 +358,19 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
#define pgd_none(a) pud_none(__pud(pgd_val(a)))
#endif
-#ifdef CONFIG_X86_64
static inline bool is_hypervisor_range(int idx)
{
+#ifdef CONFIG_X86_64
/*
* ffff800000000000 - ffff87ffffffffff is reserved for
* the hypervisor.
*/
- return paravirt_enabled() &&
- (idx >= pgd_index(__PAGE_OFFSET) - 16) &&
- (idx < pgd_index(__PAGE_OFFSET));
-}
+ return (idx >= pgd_index(__PAGE_OFFSET) - 16) &&
+ (idx < pgd_index(__PAGE_OFFSET));
#else
-static inline bool is_hypervisor_range(int idx) { return false; }
+ return false;
#endif
+}
static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
bool checkwx)
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index d8a798d..f8d0b5e 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -131,7 +131,7 @@ static inline void get_head_page_multiple(struct page *page, int nr)
{
VM_BUG_ON_PAGE(page != compound_head(page), page);
VM_BUG_ON_PAGE(page_count(page) == 0, page);
- atomic_add(nr, &page->_count);
+ page_ref_add(page, nr);
SetPageReferenced(page);
}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 493f541..9d56f27 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -150,13 +150,14 @@ static int page_size_mask;
static void __init probe_page_size_mask(void)
{
-#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
+#if !defined(CONFIG_KMEMCHECK)
/*
- * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+ * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will
+ * use small pages.
* This will simplify cpa(), which otherwise needs to support splitting
* large pages into small in interrupt context, etc.
*/
- if (cpu_has_pse)
+ if (cpu_has_pse && !debug_pagealloc_enabled())
page_size_mask |= 1 << PG_LEVEL_2M;
#endif
@@ -666,21 +667,22 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
* mark them not present - any buggy init-section access will
* create a kernel page fault:
*/
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n",
- begin, end - 1);
- set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
-#else
- /*
- * We just marked the kernel text read only above, now that
- * we are going to free part of that, we need to make that
- * writeable and non-executable first.
- */
- set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
- set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
+ if (debug_pagealloc_enabled()) {
+ pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n",
+ begin, end - 1);
+ set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+ } else {
+ /*
+ * We just marked the kernel text read only above, now that
+ * we are going to free part of that, we need to make that
+ * writeable and non-executable first.
+ */
+ set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
+ set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
- free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what);
-#endif
+ free_reserved_area((void *)begin, (void *)end,
+ POISON_FREE_INITMEM, what);
+ }
}
void free_initmem(void)
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index cb4ef3d..bd7a9b9 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -388,7 +388,6 @@ repeat:
}
pte_t *kmap_pte;
-pgprot_t kmap_prot;
static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
{
@@ -405,8 +404,6 @@ static void __init kmap_init(void)
*/
kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
-
- kmap_prot = PAGE_KERNEL;
}
#ifdef CONFIG_HIGHMEM
@@ -871,7 +868,6 @@ static noinline int do_test_wp_bit(void)
return flag;
}
-#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);
@@ -960,5 +956,3 @@ void mark_rodata_ro(void)
if (__supported_pte_mask & _PAGE_NX)
debug_checkwx();
}
-#endif
-
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5488d21..214afda 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -53,6 +53,7 @@
#include <asm/numa.h>
#include <asm/cacheflush.h>
#include <asm/init.h>
+#include <asm/uv/uv.h>
#include <asm/setup.h>
#include "mm_internal.h"
@@ -1074,7 +1075,6 @@ void __init mem_init(void)
mem_init_print_info(NULL);
}
-#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);
@@ -1166,8 +1166,6 @@ void mark_rodata_ro(void)
debug_checkwx();
}
-#endif
-
int kern_addr_valid(unsigned long addr)
{
unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
@@ -1206,26 +1204,13 @@ int kern_addr_valid(unsigned long addr)
static unsigned long probe_memory_block_size(void)
{
- /* start from 2g */
- unsigned long bz = 1UL<<31;
-
- if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) {
- pr_info("Using 2GB memory block size for large-memory system\n");
- return 2UL * 1024 * 1024 * 1024;
- }
-
- /* less than 64g installed */
- if ((max_pfn << PAGE_SHIFT) < (16UL << 32))
- return MIN_MEMORY_BLOCK_SIZE;
+ unsigned long bz = MIN_MEMORY_BLOCK_SIZE;
- /* get the tail size */
- while (bz > MIN_MEMORY_BLOCK_SIZE) {
- if (!((max_pfn << PAGE_SHIFT) & (bz - 1)))
- break;
- bz >>= 1;
- }
+ /* if system is UV or has 64GB of RAM or more, use large blocks */
+ if (is_uv_system() || ((max_pfn << PAGE_SHIFT) >= (64UL << 30)))
+ bz = 2UL << 30; /* 2GB */
- printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20);
+ pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20);
return bz;
}
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index d470cf2..1b1110f 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -120,11 +120,22 @@ void __init kasan_init(void)
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
(void *)KASAN_SHADOW_END);
- memset(kasan_zero_page, 0, PAGE_SIZE);
-
load_cr3(init_level4_pgt);
__flush_tlb_all();
- init_task.kasan_depth = 0;
+ /*
+ * kasan_zero_page has been used as early shadow memory, thus it may
+ * contain some garbage. Now we can clear and write protect it, since
+ * after the TLB flush no one should write to it.
+ */
+ memset(kasan_zero_page, 0, PAGE_SIZE);
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO);
+ set_pte(&kasan_zero_pte[i], pte);
+ }
+ /* Flush TLBs again to be sure that write protection applied. */
+ __flush_tlb_all();
+
+ init_task.kasan_depth = 0;
pr_info("KernelAddressSanitizer initialized\n");
}
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 637ab34..ddb2244 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -33,7 +33,7 @@
struct kmmio_fault_page {
struct list_head list;
struct kmmio_fault_page *release_next;
- unsigned long page; /* location of the fault page */
+ unsigned long addr; /* the requested address */
pteval_t old_presence; /* page presence prior to arming */
bool armed;
@@ -70,9 +70,16 @@ unsigned int kmmio_count;
static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
static LIST_HEAD(kmmio_probes);
-static struct list_head *kmmio_page_list(unsigned long page)
+static struct list_head *kmmio_page_list(unsigned long addr)
{
- return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
+ unsigned int l;
+ pte_t *pte = lookup_address(addr, &l);
+
+ if (!pte)
+ return NULL;
+ addr &= page_level_mask(l);
+
+ return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)];
}
/* Accessed per-cpu */
@@ -98,15 +105,19 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
}
/* You must be holding RCU read lock. */
-static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
+static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr)
{
struct list_head *head;
struct kmmio_fault_page *f;
+ unsigned int l;
+ pte_t *pte = lookup_address(addr, &l);
- page &= PAGE_MASK;
- head = kmmio_page_list(page);
+ if (!pte)
+ return NULL;
+ addr &= page_level_mask(l);
+ head = kmmio_page_list(addr);
list_for_each_entry_rcu(f, head, list) {
- if (f->page == page)
+ if (f->addr == addr)
return f;
}
return NULL;
@@ -137,10 +148,10 @@ static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
{
unsigned int level;
- pte_t *pte = lookup_address(f->page, &level);
+ pte_t *pte = lookup_address(f->addr, &level);
if (!pte) {
- pr_err("no pte for page 0x%08lx\n", f->page);
+ pr_err("no pte for addr 0x%08lx\n", f->addr);
return -1;
}
@@ -156,7 +167,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
return -1;
}
- __flush_tlb_one(f->page);
+ __flush_tlb_one(f->addr);
return 0;
}
@@ -176,12 +187,12 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
int ret;
WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
if (f->armed) {
- pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n",
- f->page, f->count, !!f->old_presence);
+ pr_warning("double-arm: addr 0x%08lx, ref %d, old %d\n",
+ f->addr, f->count, !!f->old_presence);
}
ret = clear_page_presence(f, true);
- WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"),
- f->page);
+ WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"),
+ f->addr);
f->armed = true;
return ret;
}
@@ -191,7 +202,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
{
int ret = clear_page_presence(f, false);
WARN_ONCE(ret < 0,
- KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
+ KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr);
f->armed = false;
}
@@ -215,6 +226,12 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
struct kmmio_context *ctx;
struct kmmio_fault_page *faultpage;
int ret = 0; /* default to fault not handled */
+ unsigned long page_base = addr;
+ unsigned int l;
+ pte_t *pte = lookup_address(addr, &l);
+ if (!pte)
+ return -EINVAL;
+ page_base &= page_level_mask(l);
/*
* Preemption is now disabled to prevent process switch during
@@ -227,7 +244,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
preempt_disable();
rcu_read_lock();
- faultpage = get_kmmio_fault_page(addr);
+ faultpage = get_kmmio_fault_page(page_base);
if (!faultpage) {
/*
* Either this page fault is not caused by kmmio, or
@@ -239,7 +256,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
ctx = &get_cpu_var(kmmio_ctx);
if (ctx->active) {
- if (addr == ctx->addr) {
+ if (page_base == ctx->addr) {
/*
* A second fault on the same page means some other
* condition needs handling by do_page_fault(), the
@@ -267,9 +284,9 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
ctx->active++;
ctx->fpage = faultpage;
- ctx->probe = get_kmmio_probe(addr);
+ ctx->probe = get_kmmio_probe(page_base);
ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
- ctx->addr = addr;
+ ctx->addr = page_base;
if (ctx->probe && ctx->probe->pre_handler)
ctx->probe->pre_handler(ctx->probe, regs, addr);
@@ -354,12 +371,11 @@ out:
}
/* You must be holding kmmio_lock. */
-static int add_kmmio_fault_page(unsigned long page)
+static int add_kmmio_fault_page(unsigned long addr)
{
struct kmmio_fault_page *f;
- page &= PAGE_MASK;
- f = get_kmmio_fault_page(page);
+ f = get_kmmio_fault_page(addr);
if (f) {
if (!f->count)
arm_kmmio_fault_page(f);
@@ -372,26 +388,25 @@ static int add_kmmio_fault_page(unsigned long page)
return -1;
f->count = 1;
- f->page = page;
+ f->addr = addr;
if (arm_kmmio_fault_page(f)) {
kfree(f);
return -1;
}
- list_add_rcu(&f->list, kmmio_page_list(f->page));
+ list_add_rcu(&f->list, kmmio_page_list(f->addr));
return 0;
}
/* You must be holding kmmio_lock. */
-static void release_kmmio_fault_page(unsigned long page,
+static void release_kmmio_fault_page(unsigned long addr,
struct kmmio_fault_page **release_list)
{
struct kmmio_fault_page *f;
- page &= PAGE_MASK;
- f = get_kmmio_fault_page(page);
+ f = get_kmmio_fault_page(addr);
if (!f)
return;
@@ -420,18 +435,27 @@ int register_kmmio_probe(struct kmmio_probe *p)
int ret = 0;
unsigned long size = 0;
const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
+ unsigned int l;
+ pte_t *pte;
spin_lock_irqsave(&kmmio_lock, flags);
if (get_kmmio_probe(p->addr)) {
ret = -EEXIST;
goto out;
}
+
+ pte = lookup_address(p->addr, &l);
+ if (!pte) {
+ ret = -EINVAL;
+ goto out;
+ }
+
kmmio_count++;
list_add_rcu(&p->list, &kmmio_probes);
while (size < size_lim) {
if (add_kmmio_fault_page(p->addr + size))
pr_err("Unable to set page fault.\n");
- size += PAGE_SIZE;
+ size += page_level_size(l);
}
out:
spin_unlock_irqrestore(&kmmio_lock, flags);
@@ -506,11 +530,17 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
struct kmmio_fault_page *release_list = NULL;
struct kmmio_delayed_release *drelease;
+ unsigned int l;
+ pte_t *pte;
+
+ pte = lookup_address(p->addr, &l);
+ if (!pte)
+ return;
spin_lock_irqsave(&kmmio_lock, flags);
while (size < size_lim) {
release_kmmio_fault_page(p->addr + size, &release_list);
- size += PAGE_SIZE;
+ size += page_level_size(l);
}
list_del_rcu(&p->list);
kmmio_count--;
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 96bd1e2..d2dc043 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -71,12 +71,12 @@ unsigned long arch_mmap_rnd(void)
if (mmap_is_ia32())
#ifdef CONFIG_COMPAT
- rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_compat_bits) - 1);
+ rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
#else
- rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
+ rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
#endif
else
- rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
+ rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
return rnd << PAGE_SHIFT;
}
@@ -94,18 +94,6 @@ static unsigned long mmap_base(unsigned long rnd)
}
/*
- * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
- * does, but not when emulating X86_32
- */
-static unsigned long mmap_legacy_base(unsigned long rnd)
-{
- if (mmap_is_ia32())
- return TASK_UNMAPPED_BASE;
- else
- return TASK_UNMAPPED_BASE + rnd;
-}
-
-/*
* This function, called very early during the creation of a new
* process VM image, sets up which VM layout function to use:
*/
@@ -116,7 +104,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
if (current->flags & PF_RANDOMIZE)
random_factor = arch_mmap_rnd();
- mm->mmap_legacy_base = mmap_legacy_base(random_factor);
+ mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor;
if (mmap_is_legacy()) {
mm->mmap_base = mm->mmap_legacy_base;
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index b2fd67d..ef05755 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -123,7 +123,7 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
break;
}
- if (regno > nr_registers) {
+ if (regno >= nr_registers) {
WARN_ONCE(1, "decoded an instruction with an invalid register");
return -EINVAL;
}
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index d04f809..f70c1ff 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -465,46 +465,67 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
return true;
}
+/*
+ * Mark all currently memblock-reserved physical memory (which covers the
+ * kernel's own memory ranges) as hot-unswappable.
+ */
static void __init numa_clear_kernel_node_hotplug(void)
{
- int i, nid;
- nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
- phys_addr_t start, end;
- struct memblock_region *r;
+ nodemask_t reserved_nodemask = NODE_MASK_NONE;
+ struct memblock_region *mb_region;
+ int i;
/*
+ * We have to do some preprocessing of memblock regions, to
+ * make them suitable for reservation.
+ *
* At this time, all memory regions reserved by memblock are
- * used by the kernel. Set the nid in memblock.reserved will
- * mark out all the nodes the kernel resides in.
+ * used by the kernel, but those regions are not split up
+ * along node boundaries yet, and don't necessarily have their
+ * node ID set yet either.
+ *
+ * So iterate over all memory known to the x86 architecture,
+ * and use those ranges to set the nid in memblock.reserved.
+ * This will split up the memblock regions along node
+ * boundaries and will set the node IDs as well.
*/
for (i = 0; i < numa_meminfo.nr_blks; i++) {
- struct numa_memblk *mb = &numa_meminfo.blk[i];
+ struct numa_memblk *mb = numa_meminfo.blk + i;
+ int ret;
- memblock_set_node(mb->start, mb->end - mb->start,
- &memblock.reserved, mb->nid);
+ ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid);
+ WARN_ON_ONCE(ret);
}
/*
- * Mark all kernel nodes.
+ * Now go over all reserved memblock regions, to construct a
+ * node mask of all kernel reserved memory areas.
*
- * When booting with mem=nn[kMG] or in a kdump kernel, numa_meminfo
- * may not include all the memblock.reserved memory ranges because
- * trim_snb_memory() reserves specific pages for Sandy Bridge graphics.
+ * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
+ * numa_meminfo might not include all memblock.reserved
+ * memory ranges, because quirks such as trim_snb_memory()
+ * reserve specific pages for Sandy Bridge graphics. ]
*/
- for_each_memblock(reserved, r)
- if (r->nid != MAX_NUMNODES)
- node_set(r->nid, numa_kernel_nodes);
+ for_each_memblock(reserved, mb_region) {
+ if (mb_region->nid != MAX_NUMNODES)
+ node_set(mb_region->nid, reserved_nodemask);
+ }
- /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
+ /*
+ * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
+ * belonging to the reserved node mask.
+ *
+ * Note that this will include memory regions that reside
+ * on nodes that contain kernel memory - entire nodes
+ * become hot-unpluggable:
+ */
for (i = 0; i < numa_meminfo.nr_blks; i++) {
- nid = numa_meminfo.blk[i].nid;
- if (!node_isset(nid, numa_kernel_nodes))
- continue;
+ struct numa_memblk *mb = numa_meminfo.blk + i;
- start = numa_meminfo.blk[i].start;
- end = numa_meminfo.blk[i].end;
+ if (!node_isset(mb->nid, reserved_nodemask))
+ continue;
- memblock_clear_hotplug(start, end - start);
+ memblock_clear_hotplug(mb->start, mb->end - mb->start);
}
}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 2440814..4d0b262 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -106,12 +106,6 @@ static inline unsigned long highmap_end_pfn(void)
#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-# define debug_pagealloc 1
-#else
-# define debug_pagealloc 0
-#endif
-
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
@@ -283,7 +277,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
__pa_symbol(__end_rodata) >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_RW;
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64)
/*
* Once the kernel maps the text as RO (kernel_set_to_readonly is set),
* kernel text mappings for the large page aligned text, rodata sections
@@ -419,24 +413,30 @@ pmd_t *lookup_pmd_address(unsigned long address)
phys_addr_t slow_virt_to_phys(void *__virt_addr)
{
unsigned long virt_addr = (unsigned long)__virt_addr;
- unsigned long phys_addr, offset;
+ phys_addr_t phys_addr;
+ unsigned long offset;
enum pg_level level;
pte_t *pte;
pte = lookup_address(virt_addr, &level);
BUG_ON(!pte);
+ /*
+ * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t
+ * before being left-shifted PAGE_SHIFT bits -- this trick is to
+ * make 32-PAE kernel work correctly.
+ */
switch (level) {
case PG_LEVEL_1G:
- phys_addr = pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
+ phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
offset = virt_addr & ~PUD_PAGE_MASK;
break;
case PG_LEVEL_2M:
- phys_addr = pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
+ phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
offset = virt_addr & ~PMD_PAGE_MASK;
break;
default:
- phys_addr = pte_pfn(*pte) << PAGE_SHIFT;
+ phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
offset = virt_addr & ~PAGE_MASK;
}
@@ -708,10 +708,10 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
{
struct page *base;
- if (!debug_pagealloc)
+ if (!debug_pagealloc_enabled())
spin_unlock(&cpa_lock);
base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
- if (!debug_pagealloc)
+ if (!debug_pagealloc_enabled())
spin_lock(&cpa_lock);
if (!base)
return -ENOMEM;
@@ -1122,8 +1122,10 @@ static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
/*
* Ignore all non primary paths.
*/
- if (!primary)
+ if (!primary) {
+ cpa->numpages = 1;
return 0;
+ }
/*
* Ignore the NULL PTE for kernel identity mapping, as it is expected
@@ -1331,10 +1333,10 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
cpa->numpages = 1;
- if (!debug_pagealloc)
+ if (!debug_pagealloc_enabled())
spin_lock(&cpa_lock);
ret = __change_page_attr(cpa, checkalias);
- if (!debug_pagealloc)
+ if (!debug_pagealloc_enabled())
spin_unlock(&cpa_lock);
if (ret)
return ret;
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index f65a33f..8bea847 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -32,9 +32,8 @@ early_param("noexec", noexec_setup);
void x86_configure_nx(void)
{
- if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx)
- __supported_pte_mask |= _PAGE_NX;
- else
+ /* If disable_nx is set, clear NX on all new mappings going forward. */
+ if (disable_nx)
__supported_pte_mask &= ~_PAGE_NX;
}
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 4e664bd..cb31a44 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -23,12 +23,13 @@ static int backtrace_stack(void *data, char *name)
return 0;
}
-static void backtrace_address(void *data, unsigned long addr, int reliable)
+static int backtrace_address(void *data, unsigned long addr, int reliable)
{
unsigned int *depth = data;
if ((*depth)--)
oprofile_add_trace(addr);
+ return 0;
}
static struct stacktrace_ops backtrace_ops = {
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 2879efc..381a43c 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -12,7 +12,6 @@
#include <linux/dmi.h>
#include <linux/slab.h>
-#include <asm-generic/pci-bridge.h>
#include <asm/acpi.h>
#include <asm/segment.h>
#include <asm/io.h>
@@ -711,28 +710,22 @@ int pcibios_add_device(struct pci_dev *dev)
return 0;
}
-int pcibios_alloc_irq(struct pci_dev *dev)
+int pcibios_enable_device(struct pci_dev *dev, int mask)
{
- /*
- * If the PCI device was already claimed by core code and has
- * MSI enabled, probing of the pcibios IRQ will overwrite
- * dev->irq. So bail out if MSI is already enabled.
- */
- if (pci_dev_msi_enabled(dev))
- return -EBUSY;
+ int err;
- return pcibios_enable_irq(dev);
-}
+ if ((err = pci_enable_resources(dev, mask)) < 0)
+ return err;
-void pcibios_free_irq(struct pci_dev *dev)
-{
- if (pcibios_disable_irq)
- pcibios_disable_irq(dev);
+ if (!pci_dev_msi_enabled(dev))
+ return pcibios_enable_irq(dev);
+ return 0;
}
-int pcibios_enable_device(struct pci_dev *dev, int mask)
+void pcibios_disable_device (struct pci_dev *dev)
{
- return pci_enable_resources(dev, mask);
+ if (!pci_dev_msi_enabled(dev) && pcibios_disable_irq)
+ pcibios_disable_irq(dev);
}
int pci_ext_cfg_avail(void)
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index e585655..b7de192 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -297,14 +297,14 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_r
*
* The standard boot ROM sequence for an x86 machine uses the BIOS
* to select an initial video card for boot display. This boot video
- * card will have it's BIOS copied to C0000 in system RAM.
+ * card will have its BIOS copied to 0xC0000 in system RAM.
* IORESOURCE_ROM_SHADOW is used to associate the boot video
* card with this copy. On laptops this copy has to be used since
* the main ROM may be compressed or combined with another image.
* See pci_map_rom() for use of this flag. Before marking the device
* with IORESOURCE_ROM_SHADOW check if a vga_default_device is already set
- * by either arch cde or vga-arbitration, if so only apply the fixup to this
- * already determined primary video card.
+ * by either arch code or vga-arbitration; if so only apply the fixup to this
+ * already-determined primary video card.
*/
static void pci_fixup_video(struct pci_dev *pdev)
@@ -312,6 +312,7 @@ static void pci_fixup_video(struct pci_dev *pdev)
struct pci_dev *bridge;
struct pci_bus *bus;
u16 config;
+ struct resource *res;
/* Is VGA routed to us? */
bus = pdev->bus;
@@ -336,8 +337,18 @@ static void pci_fixup_video(struct pci_dev *pdev)
if (!vga_default_device() || pdev == vga_default_device()) {
pci_read_config_word(pdev, PCI_COMMAND, &config);
if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) {
- pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW;
- dev_printk(KERN_DEBUG, &pdev->dev, "Video device with shadowed ROM\n");
+ res = &pdev->resource[PCI_ROM_RESOURCE];
+
+ pci_disable_rom(pdev);
+ if (res->parent)
+ release_resource(res);
+
+ res->start = 0xC0000;
+ res->end = res->start + 0x20000 - 1;
+ res->flags = IORESOURCE_MEM | IORESOURCE_ROM_SHADOW |
+ IORESOURCE_PCI_FIXED;
+ dev_info(&pdev->dev, "Video device with shadowed ROM at %pR\n",
+ res);
}
}
}
@@ -540,3 +551,10 @@ static void twinhead_reserve_killing_zone(struct pci_dev *dev)
}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_zone);
+
+static void pci_bdwep_bar(struct pci_dev *dev)
+{
+ dev->non_compliant_bars = 1;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_bdwep_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_bdwep_bar);
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index 0d24e7c..8b93e63 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -215,7 +215,7 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
int polarity;
int ret;
- if (pci_has_managed_irq(dev))
+ if (dev->irq_managed && dev->irq > 0)
return 0;
switch (intel_mid_identify_cpu()) {
@@ -256,13 +256,10 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
static void intel_mid_pci_irq_disable(struct pci_dev *dev)
{
- if (pci_has_managed_irq(dev)) {
+ if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed &&
+ dev->irq > 0) {
mp_unmap_irq(dev->irq);
dev->irq_managed = 0;
- /*
- * Don't reset dev->irq here, otherwise
- * intel_mid_pci_irq_enable() will fail on next call.
- */
}
}
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 32e7034..9bd1154 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -1202,7 +1202,7 @@ static int pirq_enable_irq(struct pci_dev *dev)
struct pci_dev *temp_dev;
int irq;
- if (pci_has_managed_irq(dev))
+ if (dev->irq_managed && dev->irq > 0)
return 0;
irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
@@ -1230,7 +1230,8 @@ static int pirq_enable_irq(struct pci_dev *dev)
}
dev = temp_dev;
if (irq >= 0) {
- pci_set_managed_irq(dev, irq);
+ dev->irq_managed = 1;
+ dev->irq = irq;
dev_info(&dev->dev, "PCI->APIC IRQ transform: "
"INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
return 0;
@@ -1256,10 +1257,24 @@ static int pirq_enable_irq(struct pci_dev *dev)
return 0;
}
+bool mp_should_keep_irq(struct device *dev)
+{
+ if (dev->power.is_prepared)
+ return true;
+#ifdef CONFIG_PM
+ if (dev->power.runtime_status == RPM_SUSPENDING)
+ return true;
+#endif
+
+ return false;
+}
+
static void pirq_disable_irq(struct pci_dev *dev)
{
- if (io_apic_assign_pci_irqs && pci_has_managed_irq(dev)) {
+ if (io_apic_assign_pci_irqs && !mp_should_keep_irq(&dev->dev) &&
+ dev->irq_managed && dev->irq) {
mp_unmap_irq(dev->irq);
- pci_reset_managed_irq(dev);
+ dev->irq = 0;
+ dev->irq_managed = 0;
}
}
diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c
index d57e480..7792aba 100644
--- a/arch/x86/pci/vmd.c
+++ b/arch/x86/pci/vmd.c
@@ -503,6 +503,18 @@ static struct pci_ops vmd_ops = {
.write = vmd_pci_write,
};
+static void vmd_attach_resources(struct vmd_dev *vmd)
+{
+ vmd->dev->resource[VMD_MEMBAR1].child = &vmd->resources[1];
+ vmd->dev->resource[VMD_MEMBAR2].child = &vmd->resources[2];
+}
+
+static void vmd_detach_resources(struct vmd_dev *vmd)
+{
+ vmd->dev->resource[VMD_MEMBAR1].child = NULL;
+ vmd->dev->resource[VMD_MEMBAR2].child = NULL;
+}
+
/*
* VMD domains start at 0x1000 to not clash with ACPI _SEG domains.
*/
@@ -527,11 +539,28 @@ static int vmd_enable_domain(struct vmd_dev *vmd)
res = &vmd->dev->resource[VMD_CFGBAR];
vmd->resources[0] = (struct resource) {
.name = "VMD CFGBAR",
- .start = res->start,
+ .start = 0,
.end = (resource_size(res) >> 20) - 1,
.flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED,
};
+ /*
+ * If the window is below 4GB, clear IORESOURCE_MEM_64 so we can
+ * put 32-bit resources in the window.
+ *
+ * There's no hardware reason why a 64-bit window *couldn't*
+ * contain a 32-bit resource, but pbus_size_mem() computes the
+ * bridge window size assuming a 64-bit window will contain no
+ * 32-bit resources. __pci_assign_resource() enforces that
+ * artificial restriction to make sure everything will fit.
+ *
+ * The only way we could use a 64-bit non-prefechable MEMBAR is
+ * if its address is <4GB so that we can convert it to a 32-bit
+ * resource. To be visible to the host OS, all VMD endpoints must
+ * be initially configured by platform BIOS, which includes setting
+ * up these resources. We can assume the device is configured
+ * according to the platform needs.
+ */
res = &vmd->dev->resource[VMD_MEMBAR1];
upper_bits = upper_32_bits(res->end);
flags = res->flags & ~IORESOURCE_SIZEALIGN;
@@ -542,6 +571,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd)
.start = res->start,
.end = res->end,
.flags = flags,
+ .parent = res,
};
res = &vmd->dev->resource[VMD_MEMBAR2];
@@ -554,6 +584,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd)
.start = res->start + 0x2000,
.end = res->end,
.flags = flags,
+ .parent = res,
};
sd->domain = vmd_find_free_domain();
@@ -578,6 +609,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd)
return -ENODEV;
}
+ vmd_attach_resources(vmd);
vmd_setup_dma_ops(vmd);
dev_set_msi_domain(&vmd->bus->dev, vmd->irq_domain);
pci_rescan_bus(vmd->bus);
@@ -674,6 +706,7 @@ static void vmd_remove(struct pci_dev *dev)
{
struct vmd_dev *vmd = pci_get_drvdata(dev);
+ vmd_detach_resources(vmd);
pci_set_drvdata(dev, NULL);
sysfs_remove_link(&vmd->dev->dev.kobj, "domain");
pci_stop_root_bus(vmd->bus);
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 2d66db8..ed30e79 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -131,6 +131,27 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size)
EXPORT_SYMBOL_GPL(efi_query_variable_store);
/*
+ * Helper function for efi_reserve_boot_services() to figure out if we
+ * can free regions in efi_free_boot_services().
+ *
+ * Use this function to ensure we do not free regions owned by somebody
+ * else. We must only reserve (and then free) regions:
+ *
+ * - Not within any part of the kernel
+ * - Not the BIOS reserved area (E820_RESERVED, E820_NVS, etc)
+ */
+static bool can_free_region(u64 start, u64 size)
+{
+ if (start + size > __pa_symbol(_text) && start <= __pa_symbol(_end))
+ return false;
+
+ if (!e820_all_mapped(start, start+size, E820_RAM))
+ return false;
+
+ return true;
+}
+
+/*
* The UEFI specification makes it clear that the operating system is free to do
* whatever it wants with boot services code after ExitBootServices() has been
* called. Ignoring this recommendation a significant bunch of EFI implementations
@@ -147,26 +168,50 @@ void __init efi_reserve_boot_services(void)
efi_memory_desc_t *md = p;
u64 start = md->phys_addr;
u64 size = md->num_pages << EFI_PAGE_SHIFT;
+ bool already_reserved;
if (md->type != EFI_BOOT_SERVICES_CODE &&
md->type != EFI_BOOT_SERVICES_DATA)
continue;
- /* Only reserve where possible:
- * - Not within any already allocated areas
- * - Not over any memory area (really needed, if above?)
- * - Not within any part of the kernel
- * - Not the bios reserved area
- */
- if ((start + size > __pa_symbol(_text)
- && start <= __pa_symbol(_end)) ||
- !e820_all_mapped(start, start+size, E820_RAM) ||
- memblock_is_region_reserved(start, size)) {
- /* Could not reserve, skip it */
- md->num_pages = 0;
- memblock_dbg("Could not reserve boot range [0x%010llx-0x%010llx]\n",
- start, start+size-1);
- } else
+
+ already_reserved = memblock_is_region_reserved(start, size);
+
+ /*
+ * Because the following memblock_reserve() is paired
+ * with free_bootmem_late() for this region in
+ * efi_free_boot_services(), we must be extremely
+ * careful not to reserve, and subsequently free,
+ * critical regions of memory (like the kernel image) or
+ * those regions that somebody else has already
+ * reserved.
+ *
+ * A good example of a critical region that must not be
+ * freed is page zero (first 4Kb of memory), which may
+ * contain boot services code/data but is marked
+ * E820_RESERVED by trim_bios_range().
+ */
+ if (!already_reserved) {
memblock_reserve(start, size);
+
+ /*
+ * If we are the first to reserve the region, no
+ * one else cares about it. We own it and can
+ * free it later.
+ */
+ if (can_free_region(start, size))
+ continue;
+ }
+
+ /*
+ * We don't own the region. We must not free it.
+ *
+ * Setting this bit for a boot services region really
+ * doesn't make sense as far as the firmware is
+ * concerned, but it does provide us with a way to tag
+ * those regions that must not be paired with
+ * free_bootmem_late().
+ */
+ md->attribute |= EFI_MEMORY_RUNTIME;
}
}
@@ -183,8 +228,8 @@ void __init efi_free_boot_services(void)
md->type != EFI_BOOT_SERVICES_DATA)
continue;
- /* Could not reserve boot area */
- if (!size)
+ /* Do not free, someone else owns it: */
+ if (md->attribute & EFI_MEMORY_RUNTIME)
continue;
free_bootmem_late(start, size);
diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c
index 76b6632..1865c19 100644
--- a/arch/x86/platform/geode/alix.c
+++ b/arch/x86/platform/geode/alix.c
@@ -21,7 +21,7 @@
#include <linux/init.h>
#include <linux/io.h>
#include <linux/string.h>
-#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/leds.h>
#include <linux/platform_device.h>
#include <linux/gpio.h>
@@ -35,6 +35,11 @@
#define BIOS_SIGNATURE_COREBOOT 0x500
#define BIOS_REGION_SIZE 0x10000
+/*
+ * This driver is not modular, but to keep back compatibility
+ * with existing use cases, continuing with module_param is
+ * the easiest way forward.
+ */
static bool force = 0;
module_param(force, bool, 0444);
/* FIXME: Award bios is not automatically detected as Alix platform */
@@ -192,9 +197,4 @@ static int __init alix_init(void)
return 0;
}
-
-module_init(alix_init);
-
-MODULE_AUTHOR("Ed Wildgoose <kernel@wildgooses.com>");
-MODULE_DESCRIPTION("PCEngines ALIX System Setup");
-MODULE_LICENSE("GPL");
+device_initcall(alix_init);
diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c
index aa733fb..4fcdb91 100644
--- a/arch/x86/platform/geode/geos.c
+++ b/arch/x86/platform/geode/geos.c
@@ -19,7 +19,6 @@
#include <linux/init.h>
#include <linux/io.h>
#include <linux/string.h>
-#include <linux/module.h>
#include <linux/leds.h>
#include <linux/platform_device.h>
#include <linux/gpio.h>
@@ -120,9 +119,4 @@ static int __init geos_init(void)
return 0;
}
-
-module_init(geos_init);
-
-MODULE_AUTHOR("Philip Prindeville <philipp@redfish-solutions.com>");
-MODULE_DESCRIPTION("Traverse Technologies Geos System Setup");
-MODULE_LICENSE("GPL");
+device_initcall(geos_init);
diff --git a/arch/x86/platform/geode/net5501.c b/arch/x86/platform/geode/net5501.c
index 927e38c..a2f6b98 100644
--- a/arch/x86/platform/geode/net5501.c
+++ b/arch/x86/platform/geode/net5501.c
@@ -20,7 +20,6 @@
#include <linux/init.h>
#include <linux/io.h>
#include <linux/string.h>
-#include <linux/module.h>
#include <linux/leds.h>
#include <linux/platform_device.h>
#include <linux/gpio.h>
@@ -146,9 +145,4 @@ static int __init net5501_init(void)
return 0;
}
-
-module_init(net5501_init);
-
-MODULE_AUTHOR("Philip Prindeville <philipp@redfish-solutions.com>");
-MODULE_DESCRIPTION("Soekris net5501 System Setup");
-MODULE_LICENSE("GPL");
+device_initcall(net5501_init);
diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c
index 23381d2..1eb47b6 100644
--- a/arch/x86/platform/intel-mid/mfld.c
+++ b/arch/x86/platform/intel-mid/mfld.c
@@ -52,10 +52,7 @@ static unsigned long __init mfld_calibrate_tsc(void)
/* mark tsc clocksource as reliable */
set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
- if (fast_calibrate)
- return fast_calibrate;
-
- return 0;
+ return fast_calibrate;
}
static void __init penwell_arch_setup(void)
diff --git a/arch/x86/platform/intel-mid/mrfl.c b/arch/x86/platform/intel-mid/mrfl.c
index aaca917..bd1adc6 100644
--- a/arch/x86/platform/intel-mid/mrfl.c
+++ b/arch/x86/platform/intel-mid/mrfl.c
@@ -81,10 +81,7 @@ static unsigned long __init tangier_calibrate_tsc(void)
/* mark tsc clocksource as reliable */
set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
- if (fast_calibrate)
- return fast_calibrate;
-
- return 0;
+ return fast_calibrate;
}
static void __init tangier_arch_setup(void)
diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c
index c61b6c3..17d6d22 100644
--- a/arch/x86/platform/intel-quark/imr.c
+++ b/arch/x86/platform/intel-quark/imr.c
@@ -1,5 +1,5 @@
/**
- * imr.c
+ * imr.c -- Intel Isolated Memory Region driver
*
* Copyright(c) 2013 Intel Corporation.
* Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie>
@@ -31,7 +31,6 @@
#include <linux/debugfs.h>
#include <linux/init.h>
#include <linux/mm.h>
-#include <linux/module.h>
#include <linux/types.h>
struct imr_device {
@@ -135,11 +134,9 @@ static int imr_read(struct imr_device *idev, u32 imr_id, struct imr_regs *imr)
* @idev: pointer to imr_device structure.
* @imr_id: IMR entry to write.
* @imr: IMR structure representing address and access masks.
- * @lock: indicates if the IMR lock bit should be applied.
* @return: 0 on success or error code passed from mbi_iosf on failure.
*/
-static int imr_write(struct imr_device *idev, u32 imr_id,
- struct imr_regs *imr, bool lock)
+static int imr_write(struct imr_device *idev, u32 imr_id, struct imr_regs *imr)
{
unsigned long flags;
u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base;
@@ -163,15 +160,6 @@ static int imr_write(struct imr_device *idev, u32 imr_id,
if (ret)
goto failed;
- /* Lock bit must be set separately to addr_lo address bits. */
- if (lock) {
- imr->addr_lo |= IMR_LOCK;
- ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE,
- reg - IMR_NUM_REGS, imr->addr_lo);
- if (ret)
- goto failed;
- }
-
local_irq_restore(flags);
return 0;
failed:
@@ -270,17 +258,6 @@ static int imr_debugfs_register(struct imr_device *idev)
}
/**
- * imr_debugfs_unregister - unregister debugfs hooks.
- *
- * @idev: pointer to imr_device structure.
- * @return:
- */
-static void imr_debugfs_unregister(struct imr_device *idev)
-{
- debugfs_remove(idev->file);
-}
-
-/**
* imr_check_params - check passed address range IMR alignment and non-zero size
*
* @base: base address of intended IMR.
@@ -334,11 +311,10 @@ static inline int imr_address_overlap(phys_addr_t addr, struct imr_regs *imr)
* @size: physical size of region in bytes must be aligned to 1KiB.
* @read_mask: read access mask.
* @write_mask: write access mask.
- * @lock: indicates whether or not to permanently lock this region.
* @return: zero on success or negative value indicating error.
*/
int imr_add_range(phys_addr_t base, size_t size,
- unsigned int rmask, unsigned int wmask, bool lock)
+ unsigned int rmask, unsigned int wmask)
{
phys_addr_t end;
unsigned int i;
@@ -411,7 +387,7 @@ int imr_add_range(phys_addr_t base, size_t size,
imr.rmask = rmask;
imr.wmask = wmask;
- ret = imr_write(idev, reg, &imr, lock);
+ ret = imr_write(idev, reg, &imr);
if (ret < 0) {
/*
* In the highly unlikely event iosf_mbi_write failed
@@ -422,7 +398,7 @@ int imr_add_range(phys_addr_t base, size_t size,
imr.addr_hi = 0;
imr.rmask = IMR_READ_ACCESS_ALL;
imr.wmask = IMR_WRITE_ACCESS_ALL;
- imr_write(idev, reg, &imr, false);
+ imr_write(idev, reg, &imr);
}
failed:
mutex_unlock(&idev->lock);
@@ -518,7 +494,7 @@ static int __imr_remove_range(int reg, phys_addr_t base, size_t size)
imr.rmask = IMR_READ_ACCESS_ALL;
imr.wmask = IMR_WRITE_ACCESS_ALL;
- ret = imr_write(idev, reg, &imr, false);
+ ret = imr_write(idev, reg, &imr);
failed:
mutex_unlock(&idev->lock);
@@ -592,14 +568,14 @@ static void __init imr_fixup_memmap(struct imr_device *idev)
end = (unsigned long)__end_rodata - 1;
/*
- * Setup a locked IMR around the physical extent of the kernel
+ * Setup an unlocked IMR around the physical extent of the kernel
* from the beginning of the .text secton to the end of the
* .rodata section as one physically contiguous block.
*
* We don't round up @size since it is already PAGE_SIZE aligned.
* See vmlinux.lds.S for details.
*/
- ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, true);
+ ret = imr_add_range(base, size, IMR_CPU, IMR_CPU);
if (ret < 0) {
pr_err("unable to setup IMR for kernel: %zu KiB (%lx - %lx)\n",
size / 1024, start, end);
@@ -614,7 +590,6 @@ static const struct x86_cpu_id imr_ids[] __initconst = {
{ X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */
{}
};
-MODULE_DEVICE_TABLE(x86cpu, imr_ids);
/**
* imr_init - entry point for IMR driver.
@@ -640,22 +615,4 @@ static int __init imr_init(void)
imr_fixup_memmap(idev);
return 0;
}
-
-/**
- * imr_exit - exit point for IMR code.
- *
- * Deregisters debugfs, leave IMR state as-is.
- *
- * return:
- */
-static void __exit imr_exit(void)
-{
- imr_debugfs_unregister(&imr_dev);
-}
-
-module_init(imr_init);
-module_exit(imr_exit);
-
-MODULE_AUTHOR("Bryan O'Donoghue <pure.logic@nexus-software.ie>");
-MODULE_DESCRIPTION("Intel Isolated Memory Region driver");
-MODULE_LICENSE("Dual BSD/GPL");
+device_initcall(imr_init);
diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c
index 278e4da..f5bad40 100644
--- a/arch/x86/platform/intel-quark/imr_selftest.c
+++ b/arch/x86/platform/intel-quark/imr_selftest.c
@@ -1,5 +1,5 @@
/**
- * imr_selftest.c
+ * imr_selftest.c -- Intel Isolated Memory Region self-test driver
*
* Copyright(c) 2013 Intel Corporation.
* Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie>
@@ -15,7 +15,6 @@
#include <asm/imr.h>
#include <linux/init.h>
#include <linux/mm.h>
-#include <linux/module.h>
#include <linux/types.h>
#define SELFTEST KBUILD_MODNAME ": "
@@ -61,30 +60,30 @@ static void __init imr_self_test(void)
int ret;
/* Test zero zero. */
- ret = imr_add_range(0, 0, 0, 0, false);
+ ret = imr_add_range(0, 0, 0, 0);
imr_self_test_result(ret < 0, "zero sized IMR\n");
/* Test exact overlap. */
- ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false);
+ ret = imr_add_range(base, size, IMR_CPU, IMR_CPU);
imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size));
/* Test overlap with base inside of existing. */
base += size - IMR_ALIGN;
- ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false);
+ ret = imr_add_range(base, size, IMR_CPU, IMR_CPU);
imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size));
/* Test overlap with end inside of existing. */
base -= size + IMR_ALIGN * 2;
- ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false);
+ ret = imr_add_range(base, size, IMR_CPU, IMR_CPU);
imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size));
/* Test that a 1 KiB IMR @ zero with read/write all will bomb out. */
ret = imr_add_range(0, IMR_ALIGN, IMR_READ_ACCESS_ALL,
- IMR_WRITE_ACCESS_ALL, false);
+ IMR_WRITE_ACCESS_ALL);
imr_self_test_result(ret < 0, "1KiB IMR @ 0x00000000 - access-all\n");
/* Test that a 1 KiB IMR @ zero with CPU only will work. */
- ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU, false);
+ ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU);
imr_self_test_result(ret >= 0, "1KiB IMR @ 0x00000000 - cpu-access\n");
if (ret >= 0) {
ret = imr_remove_range(0, IMR_ALIGN);
@@ -93,8 +92,7 @@ static void __init imr_self_test(void)
/* Test 2 KiB works. */
size = IMR_ALIGN * 2;
- ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL,
- IMR_WRITE_ACCESS_ALL, false);
+ ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL, IMR_WRITE_ACCESS_ALL);
imr_self_test_result(ret >= 0, "2KiB IMR @ 0x00000000\n");
if (ret >= 0) {
ret = imr_remove_range(0, size);
@@ -106,7 +104,6 @@ static const struct x86_cpu_id imr_ids[] __initconst = {
{ X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */
{}
};
-MODULE_DEVICE_TABLE(x86cpu, imr_ids);
/**
* imr_self_test_init - entry point for IMR driver.
@@ -125,13 +122,4 @@ static int __init imr_self_test_init(void)
*
* return:
*/
-static void __exit imr_self_test_exit(void)
-{
-}
-
-module_init(imr_self_test_init);
-module_exit(imr_self_test_exit);
-
-MODULE_AUTHOR("Bryan O'Donoghue <pure.logic@nexus-software.ie>");
-MODULE_DESCRIPTION("Intel Isolated Memory Region self-test driver");
-MODULE_LICENSE("Dual BSD/GPL");
+device_initcall(imr_self_test_init);
diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h
index ee94018..54d96f1 100644
--- a/arch/x86/um/asm/checksum.h
+++ b/arch/x86/um/asm/checksum.h
@@ -87,8 +87,8 @@ static inline __sum16 csum_fold(__wsum sum)
* 32bit unfolded.
*/
static inline __wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
- unsigned short proto, __wsum sum)
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+ __u8 proto, __wsum sum)
{
asm(" addl %1, %0\n"
" adcl %2, %0\n"
@@ -104,9 +104,8 @@ csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
* returns a 16-bit checksum, already complemented
*/
static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
- unsigned short len,
- unsigned short proto,
- __wsum sum)
+ __u32 len, __u8 proto,
+ __wsum sum)
{
return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum));
}
diff --git a/arch/x86/um/asm/checksum_32.h b/arch/x86/um/asm/checksum_32.h
index ab77b6f..83a75f8 100644
--- a/arch/x86/um/asm/checksum_32.h
+++ b/arch/x86/um/asm/checksum_32.h
@@ -13,7 +13,7 @@ static inline __sum16 ip_compute_csum(const void *buff, int len)
#define _HAVE_ARCH_IPV6_CSUM
static __inline__ __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
const struct in6_addr *daddr,
- __u32 len, unsigned short proto,
+ __u32 len, __u8 proto,
__wsum sum)
{
__asm__(
diff --git a/arch/x86/um/os-Linux/task_size.c b/arch/x86/um/os-Linux/task_size.c
index 8502ad3..5adb6a2 100644
--- a/arch/x86/um/os-Linux/task_size.c
+++ b/arch/x86/um/os-Linux/task_size.c
@@ -109,7 +109,7 @@ unsigned long os_get_top_address(void)
exit(1);
}
- printf("0x%x\n", bottom << UM_KERN_PAGE_SHIFT);
+ printf("0x%lx\n", bottom << UM_KERN_PAGE_SHIFT);
printf("Locating the top of the address space ... ");
fflush(stdout);
@@ -134,7 +134,7 @@ out:
exit(1);
}
top <<= UM_KERN_PAGE_SHIFT;
- printf("0x%x\n", top);
+ printf("0x%lx\n", top);
return top;
}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5c45a69..2379a5a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1655,7 +1655,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
cpu_detect(&new_cpu_data);
set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
new_cpu_data.wp_works_ok = 1;
- new_cpu_data.x86_capability[0] = cpuid_edx(1);
+ new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
#endif
if (xen_start_info->mod_start) {
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
index 724a087..9466354 100644
--- a/arch/x86/xen/pmu.c
+++ b/arch/x86/xen/pmu.c
@@ -11,7 +11,7 @@
#include "pmu.h"
/* x86_pmu.handle_irq definition */
-#include "../kernel/cpu/perf_event.h"
+#include "../events/perf_event.h"
#define XENPMU_IRQ_PROCESSING 1
struct xenpmu {
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 3f4ebf0..3c6d17f 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -112,7 +112,7 @@ asmlinkage __visible void cpu_bringup_and_idle(int cpu)
xen_pvh_secondary_vcpu_init(cpu);
#endif
cpu_bringup();
- cpu_startup_entry(CPUHP_ONLINE);
+ cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
static void xen_smp_intr_free(unsigned int cpu)