summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig8
-rw-r--r--arch/x86/entry/calling.h15
-rw-r--r--arch/x86/entry/common.c6
-rw-r--r--arch/x86/entry/entry_32.S15
-rw-r--r--arch/x86/entry/entry_64.S8
-rw-r--r--arch/x86/entry/entry_64_compat.S40
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c151
-rw-r--r--arch/x86/entry/vdso/vdso-layout.lds.S3
-rw-r--r--arch/x86/entry/vdso/vdso2c.c3
-rw-r--r--arch/x86/entry/vdso/vdso32/system_call.S54
-rw-r--r--arch/x86/entry/vdso/vma.c14
-rw-r--r--arch/x86/include/asm/apic.h6
-rw-r--r--arch/x86/include/asm/atomic.h1
-rw-r--r--arch/x86/include/asm/atomic64_32.h1
-rw-r--r--arch/x86/include/asm/cpufeature.h1
-rw-r--r--arch/x86/include/asm/fixmap.h5
-rw-r--r--arch/x86/include/asm/intel_pt.h10
-rw-r--r--arch/x86/include/asm/ipi.h2
-rw-r--r--arch/x86/include/asm/jump_label.h63
-rw-r--r--arch/x86/include/asm/msi.h6
-rw-r--r--arch/x86/include/asm/msr-trace.h57
-rw-r--r--arch/x86/include/asm/msr.h31
-rw-r--r--arch/x86/include/asm/paravirt.h18
-rw-r--r--arch/x86/include/asm/paravirt_types.h22
-rw-r--r--arch/x86/include/asm/processor.h1
-rw-r--r--arch/x86/include/asm/pvclock.h14
-rw-r--r--arch/x86/include/asm/qspinlock_paravirt.h59
-rw-r--r--arch/x86/include/asm/reboot.h1
-rw-r--r--arch/x86/include/asm/uaccess.h9
-rw-r--r--arch/x86/include/asm/vdso.h1
-rw-r--r--arch/x86/include/uapi/asm/mce.h2
-rw-r--r--arch/x86/kernel/apic/apic.c45
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c19
-rw-r--r--arch/x86/kernel/apic/apic_noop.c2
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c7
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c10
-rw-r--r--arch/x86/kernel/apic/ipi.c18
-rw-r--r--arch/x86/kernel/apic/msi.c8
-rw-r--r--arch/x86/kernel/apic/probe_32.c1
-rw-r--r--arch/x86/kernel/apic/vector.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c9
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c9
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c1
-rw-r--r--arch/x86/kernel/asm-offsets.c3
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c93
-rw-r--r--arch/x86/kernel/cpu/perf_event.c36
-rw-r--r--arch/x86/kernel/cpu/perf_event.h21
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c115
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c39
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c42
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_pt.c9
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_rapl.c25
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c17
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.h3
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c635
-rw-r--r--arch/x86/kernel/crash.c11
-rw-r--r--arch/x86/kernel/fpu/init.c13
-rw-r--r--arch/x86/kernel/kvmclock.c11
-rw-r--r--arch/x86/kernel/nmi.c32
-rw-r--r--arch/x86/kernel/paravirt.c12
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c2
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c3
-rw-r--r--arch/x86/kernel/pvclock.c24
-rw-r--r--arch/x86/kernel/reboot.c30
-rw-r--r--arch/x86/kernel/rtc.c3
-rw-r--r--arch/x86/kernel/smp.c4
-rw-r--r--arch/x86/kvm/cpuid.h8
-rw-r--r--arch/x86/kvm/i8254.c1
-rw-r--r--arch/x86/kvm/mtrr.c25
-rw-r--r--arch/x86/kvm/svm.c4
-rw-r--r--arch/x86/kvm/vmx.c7
-rw-r--r--arch/x86/kvm/x86.c13
-rw-r--r--arch/x86/lguest/boot.c1
-rw-r--r--arch/x86/lib/msr.c26
-rw-r--r--arch/x86/platform/uv/uv_nmi.c1
-rw-r--r--arch/x86/um/signal.c2
-rw-r--r--arch/x86/xen/enlighten.c11
-rw-r--r--arch/x86/xen/suspend.c1
-rw-r--r--arch/x86/xen/xen-asm_32.S14
-rw-r--r--arch/x86/xen/xen-asm_64.S19
-rw-r--r--arch/x86/xen/xen-ops.h3
84 files changed, 1653 insertions, 429 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a2abc2f..258965d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -698,6 +698,14 @@ config PARAVIRT_SPINLOCKS
If you are unsure how to answer this question, answer Y.
+config QUEUED_LOCK_STAT
+ bool "Paravirt queued spinlock statistics"
+ depends on PARAVIRT_SPINLOCKS && DEBUG_FS && QUEUED_SPINLOCKS
+ ---help---
+ Enable the collection of statistical data on the slowpath
+ behavior of paravirtualized queued spinlocks and report
+ them on debugfs.
+
source "arch/x86/xen/Kconfig"
config KVM_GUEST
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 3c71dd9..e32206e 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -1,3 +1,5 @@
+#include <linux/jump_label.h>
+
/*
x86 function call convention, 64-bit:
@@ -232,3 +234,16 @@ For 32-bit we have the following conventions - kernel is built with
#endif /* CONFIG_X86_64 */
+/*
+ * This does 'call enter_from_user_mode' unless we can avoid it based on
+ * kernel config or using the static jump infrastructure.
+ */
+.macro CALL_enter_from_user_mode
+#ifdef CONFIG_CONTEXT_TRACKING
+#ifdef HAVE_JUMP_LABEL
+ STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0
+#endif
+ call enter_from_user_mode
+.Lafter_call_\@:
+#endif
+.endm
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index a89fdbc..0366374 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -421,7 +421,7 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
regs->ip = landing_pad;
/*
- * Fetch ECX from where the vDSO stashed it.
+ * Fetch EBP from where the vDSO stashed it.
*
* WARNING: We are in CONTEXT_USER and RCU isn't paying attention!
*/
@@ -432,10 +432,10 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
* Micro-optimization: the pointer we're following is explicitly
* 32 bits, so it can't be out of range.
*/
- __get_user(*(u32 *)&regs->cx,
+ __get_user(*(u32 *)&regs->bp,
(u32 __user __force *)(unsigned long)(u32)regs->sp)
#else
- get_user(*(u32 *)&regs->cx,
+ get_user(*(u32 *)&regs->bp,
(u32 __user __force *)(unsigned long)(u32)regs->sp)
#endif
) {
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 3eb572e..77d8c51 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -292,7 +292,7 @@ ENTRY(entry_SYSENTER_32)
movl TSS_sysenter_sp0(%esp), %esp
sysenter_past_esp:
pushl $__USER_DS /* pt_regs->ss */
- pushl %ecx /* pt_regs->cx */
+ pushl %ebp /* pt_regs->sp (stashed in bp) */
pushfl /* pt_regs->flags (except IF = 0) */
orl $X86_EFLAGS_IF, (%esp) /* Fix IF */
pushl $__USER_CS /* pt_regs->cs */
@@ -308,8 +308,9 @@ sysenter_past_esp:
movl %esp, %eax
call do_fast_syscall_32
- testl %eax, %eax
- jz .Lsyscall_32_done
+ /* XEN PV guests always use IRET path */
+ ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
+ "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
/* Opportunistic SYSEXIT */
TRACE_IRQS_ON /* User mode traces as IRQs on. */
@@ -328,7 +329,8 @@ sysenter_past_esp:
* Return back to the vDSO, which will pop ecx and edx.
* Don't bother with DS and ES (they already contain __USER_DS).
*/
- ENABLE_INTERRUPTS_SYSEXIT
+ sti
+ sysexit
.pushsection .fixup, "ax"
2: movl $0, PT_FS(%esp)
@@ -551,11 +553,6 @@ ENTRY(native_iret)
iret
_ASM_EXTABLE(native_iret, iret_exc)
END(native_iret)
-
-ENTRY(native_irq_enable_sysexit)
- sti
- sysexit
-END(native_irq_enable_sysexit)
#endif
ENTRY(overflow)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index a55697d..9d34d3c 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -520,9 +520,7 @@ END(irq_entries_start)
*/
TRACE_IRQS_OFF
-#ifdef CONFIG_CONTEXT_TRACKING
- call enter_from_user_mode
-#endif
+ CALL_enter_from_user_mode
1:
/*
@@ -1066,9 +1064,7 @@ ENTRY(error_entry)
* (which can take locks).
*/
TRACE_IRQS_OFF
-#ifdef CONFIG_CONTEXT_TRACKING
- call enter_from_user_mode
-#endif
+ CALL_enter_from_user_mode
ret
.Lerror_entry_done:
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index c320183..ff1c6d6 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -18,13 +18,6 @@
.section .entry.text, "ax"
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_usergs_sysret32)
- swapgs
- sysretl
-ENDPROC(native_usergs_sysret32)
-#endif
-
/*
* 32-bit SYSENTER instruction entry.
*
@@ -63,7 +56,7 @@ ENTRY(entry_SYSENTER_compat)
/* Construct struct pt_regs on stack */
pushq $__USER32_DS /* pt_regs->ss */
- pushq %rcx /* pt_regs->sp */
+ pushq %rbp /* pt_regs->sp (stashed in bp) */
/*
* Push flags. This is nasty. First, interrupts are currently
@@ -82,14 +75,14 @@ ENTRY(entry_SYSENTER_compat)
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
- pushq %rcx /* pt_regs->cx (will be overwritten) */
+ pushq %rcx /* pt_regs->cx */
pushq $-ENOSYS /* pt_regs->ax */
pushq %r8 /* pt_regs->r8 = 0 */
pushq %r8 /* pt_regs->r9 = 0 */
pushq %r8 /* pt_regs->r10 = 0 */
pushq %r8 /* pt_regs->r11 = 0 */
pushq %rbx /* pt_regs->rbx */
- pushq %rbp /* pt_regs->rbp */
+ pushq %rbp /* pt_regs->rbp (will be overwritten) */
pushq %r8 /* pt_regs->r12 = 0 */
pushq %r8 /* pt_regs->r13 = 0 */
pushq %r8 /* pt_regs->r14 = 0 */
@@ -103,15 +96,15 @@ ENTRY(entry_SYSENTER_compat)
* This needs to happen before enabling interrupts so that
* we don't get preempted with NT set.
*
- * NB.: sysenter_fix_flags is a label with the code under it moved
+ * NB.: .Lsysenter_fix_flags is a label with the code under it moved
* out-of-line as an optimization: NT is unlikely to be set in the
* majority of the cases and instead of polluting the I$ unnecessarily,
* we're keeping that code behind a branch which will predict as
* not-taken and therefore its instructions won't be fetched.
*/
testl $X86_EFLAGS_NT, EFLAGS(%rsp)
- jnz sysenter_fix_flags
-sysenter_flags_fixed:
+ jnz .Lsysenter_fix_flags
+.Lsysenter_flags_fixed:
/*
* User mode is traced as though IRQs are on, and SYSENTER
@@ -121,14 +114,15 @@ sysenter_flags_fixed:
movq %rsp, %rdi
call do_fast_syscall_32
- testl %eax, %eax
- jz .Lsyscall_32_done
+ /* XEN PV guests always use IRET path */
+ ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
+ "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
jmp sysret32_from_system_call
-sysenter_fix_flags:
+.Lsysenter_fix_flags:
pushq $X86_EFLAGS_FIXED
popfq
- jmp sysenter_flags_fixed
+ jmp .Lsysenter_flags_fixed
ENDPROC(entry_SYSENTER_compat)
/*
@@ -178,7 +172,7 @@ ENTRY(entry_SYSCALL_compat)
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
- pushq %rcx /* pt_regs->cx (will be overwritten) */
+ pushq %rbp /* pt_regs->cx (stashed in bp) */
pushq $-ENOSYS /* pt_regs->ax */
xorq %r8,%r8
pushq %r8 /* pt_regs->r8 = 0 */
@@ -186,7 +180,7 @@ ENTRY(entry_SYSCALL_compat)
pushq %r8 /* pt_regs->r10 = 0 */
pushq %r8 /* pt_regs->r11 = 0 */
pushq %rbx /* pt_regs->rbx */
- pushq %rbp /* pt_regs->rbp */
+ pushq %rbp /* pt_regs->rbp (will be overwritten) */
pushq %r8 /* pt_regs->r12 = 0 */
pushq %r8 /* pt_regs->r13 = 0 */
pushq %r8 /* pt_regs->r14 = 0 */
@@ -200,8 +194,9 @@ ENTRY(entry_SYSCALL_compat)
movq %rsp, %rdi
call do_fast_syscall_32
- testl %eax, %eax
- jz .Lsyscall_32_done
+ /* XEN PV guests always use IRET path */
+ ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
+ "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
/* Opportunistic SYSRET */
sysret32_from_system_call:
@@ -236,7 +231,8 @@ sysret32_from_system_call:
xorq %r9, %r9
xorq %r10, %r10
movq RSP-ORIG_RAX(%rsp), %rsp
- USERGS_SYSRET32
+ swapgs
+ sysretl
END(entry_SYSCALL_compat)
/*
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index ca94fa6..8602f06 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -17,8 +17,10 @@
#include <asm/vvar.h>
#include <asm/unistd.h>
#include <asm/msr.h>
+#include <asm/pvclock.h>
#include <linux/math64.h>
#include <linux/time.h>
+#include <linux/kernel.h>
#define gtod (&VVAR(vsyscall_gtod_data))
@@ -36,12 +38,12 @@ static notrace cycle_t vread_hpet(void)
}
#endif
-#ifndef BUILD_VDSO32
+#ifdef CONFIG_PARAVIRT_CLOCK
+extern u8 pvclock_page
+ __attribute__((visibility("hidden")));
+#endif
-#include <linux/kernel.h>
-#include <asm/vsyscall.h>
-#include <asm/fixmap.h>
-#include <asm/pvclock.h>
+#ifndef BUILD_VDSO32
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
@@ -60,75 +62,6 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
return ret;
}
-#ifdef CONFIG_PARAVIRT_CLOCK
-
-static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
-{
- const struct pvclock_vsyscall_time_info *pvti_base;
- int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
- int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
-
- BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
-
- pvti_base = (struct pvclock_vsyscall_time_info *)
- __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
-
- return &pvti_base[offset];
-}
-
-static notrace cycle_t vread_pvclock(int *mode)
-{
- const struct pvclock_vsyscall_time_info *pvti;
- cycle_t ret;
- u64 last;
- u32 version;
- u8 flags;
- unsigned cpu, cpu1;
-
-
- /*
- * Note: hypervisor must guarantee that:
- * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
- * 2. that per-CPU pvclock time info is updated if the
- * underlying CPU changes.
- * 3. that version is increased whenever underlying CPU
- * changes.
- *
- */
- do {
- cpu = __getcpu() & VGETCPU_CPU_MASK;
- /* TODO: We can put vcpu id into higher bits of pvti.version.
- * This will save a couple of cycles by getting rid of
- * __getcpu() calls (Gleb).
- */
-
- pvti = get_pvti(cpu);
-
- version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
-
- /*
- * Test we're still on the cpu as well as the version.
- * We could have been migrated just after the first
- * vgetcpu but before fetching the version, so we
- * wouldn't notice a version change.
- */
- cpu1 = __getcpu() & VGETCPU_CPU_MASK;
- } while (unlikely(cpu != cpu1 ||
- (pvti->pvti.version & 1) ||
- pvti->pvti.version != version));
-
- if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
- *mode = VCLOCK_NONE;
-
- /* refer to tsc.c read_tsc() comment for rationale */
- last = gtod->cycle_last;
-
- if (likely(ret >= last))
- return ret;
-
- return last;
-}
-#endif
#else
@@ -162,15 +95,77 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
return ret;
}
+#endif
+
#ifdef CONFIG_PARAVIRT_CLOCK
+static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void)
+{
+ return (const struct pvclock_vsyscall_time_info *)&pvclock_page;
+}
static notrace cycle_t vread_pvclock(int *mode)
{
- *mode = VCLOCK_NONE;
- return 0;
-}
-#endif
+ const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti;
+ cycle_t ret;
+ u64 tsc, pvti_tsc;
+ u64 last, delta, pvti_system_time;
+ u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift;
+
+ /*
+ * Note: The kernel and hypervisor must guarantee that cpu ID
+ * number maps 1:1 to per-CPU pvclock time info.
+ *
+ * Because the hypervisor is entirely unaware of guest userspace
+ * preemption, it cannot guarantee that per-CPU pvclock time
+ * info is updated if the underlying CPU changes or that that
+ * version is increased whenever underlying CPU changes.
+ *
+ * On KVM, we are guaranteed that pvti updates for any vCPU are
+ * atomic as seen by *all* vCPUs. This is an even stronger
+ * guarantee than we get with a normal seqlock.
+ *
+ * On Xen, we don't appear to have that guarantee, but Xen still
+ * supplies a valid seqlock using the version field.
+
+ * We only do pvclock vdso timing at all if
+ * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
+ * mean that all vCPUs have matching pvti and that the TSC is
+ * synced, so we can just look at vCPU 0's pvti.
+ */
+ if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
+ *mode = VCLOCK_NONE;
+ return 0;
+ }
+
+ do {
+ version = pvti->version;
+
+ smp_rmb();
+
+ tsc = rdtsc_ordered();
+ pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
+ pvti_tsc_shift = pvti->tsc_shift;
+ pvti_system_time = pvti->system_time;
+ pvti_tsc = pvti->tsc_timestamp;
+
+ /* Make sure that the version double-check is last. */
+ smp_rmb();
+ } while (unlikely((version & 1) || version != pvti->version));
+
+ delta = tsc - pvti_tsc;
+ ret = pvti_system_time +
+ pvclock_scale_delta(delta, pvti_tsc_to_system_mul,
+ pvti_tsc_shift);
+
+ /* refer to vread_tsc() comment for rationale */
+ last = gtod->cycle_last;
+
+ if (likely(ret >= last))
+ return ret;
+
+ return last;
+}
#endif
notrace static cycle_t vread_tsc(void)
diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
index de2c921..4158acc 100644
--- a/arch/x86/entry/vdso/vdso-layout.lds.S
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
@@ -25,7 +25,7 @@ SECTIONS
* segment.
*/
- vvar_start = . - 2 * PAGE_SIZE;
+ vvar_start = . - 3 * PAGE_SIZE;
vvar_page = vvar_start;
/* Place all vvars at the offsets in asm/vvar.h. */
@@ -36,6 +36,7 @@ SECTIONS
#undef EMIT_VVAR
hpet_page = vvar_start + PAGE_SIZE;
+ pvclock_page = vvar_start + 2 * PAGE_SIZE;
. = SIZEOF_HEADERS;
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 785d992..491020b 100644
--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -73,6 +73,7 @@ enum {
sym_vvar_start,
sym_vvar_page,
sym_hpet_page,
+ sym_pvclock_page,
sym_VDSO_FAKE_SECTION_TABLE_START,
sym_VDSO_FAKE_SECTION_TABLE_END,
};
@@ -80,6 +81,7 @@ enum {
const int special_pages[] = {
sym_vvar_page,
sym_hpet_page,
+ sym_pvclock_page,
};
struct vdso_sym {
@@ -91,6 +93,7 @@ struct vdso_sym required_syms[] = {
[sym_vvar_start] = {"vvar_start", true},
[sym_vvar_page] = {"vvar_page", true},
[sym_hpet_page] = {"hpet_page", true},
+ [sym_pvclock_page] = {"pvclock_page", true},
[sym_VDSO_FAKE_SECTION_TABLE_START] = {
"VDSO_FAKE_SECTION_TABLE_START", false
},
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
index 93bd845..3a1d929 100644
--- a/arch/x86/entry/vdso/vdso32/system_call.S
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -1,5 +1,5 @@
/*
- * Code for the vDSO. This version uses the old int $0x80 method.
+ * AT_SYSINFO entry point
*/
#include <asm/dwarf2.h>
@@ -21,35 +21,67 @@ __kernel_vsyscall:
/*
* Reshuffle regs so that all of any of the entry instructions
* will preserve enough state.
+ *
+ * A really nice entry sequence would be:
+ * pushl %edx
+ * pushl %ecx
+ * movl %esp, %ecx
+ *
+ * Unfortunately, naughty Android versions between July and December
+ * 2015 actually hardcode the traditional Linux SYSENTER entry
+ * sequence. That is severely broken for a number of reasons (ask
+ * anyone with an AMD CPU, for example). Nonetheless, we try to keep
+ * it working approximately as well as it ever worked.
+ *
+ * This link may eludicate some of the history:
+ * https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7
+ * personally, I find it hard to understand what's going on there.
+ *
+ * Note to future user developers: DO NOT USE SYSENTER IN YOUR CODE.
+ * Execute an indirect call to the address in the AT_SYSINFO auxv
+ * entry. That is the ONLY correct way to make a fast 32-bit system
+ * call on Linux. (Open-coding int $0x80 is also fine, but it's
+ * slow.)
*/
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx, 0
pushl %edx
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET edx, 0
- pushl %ecx
+ pushl %ebp
CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx, 0
- movl %esp, %ecx
+ CFI_REL_OFFSET ebp, 0
+
+ #define SYSENTER_SEQUENCE "movl %esp, %ebp; sysenter"
+ #define SYSCALL_SEQUENCE "movl %ecx, %ebp; syscall"
#ifdef CONFIG_X86_64
/* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */
- ALTERNATIVE_2 "", "sysenter", X86_FEATURE_SYSENTER32, \
- "syscall", X86_FEATURE_SYSCALL32
+ ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSENTER32, \
+ SYSCALL_SEQUENCE, X86_FEATURE_SYSCALL32
#else
- ALTERNATIVE "", "sysenter", X86_FEATURE_SEP
+ ALTERNATIVE "", SYSENTER_SEQUENCE, X86_FEATURE_SEP
#endif
/* Enter using int $0x80 */
- movl (%esp), %ecx
int $0x80
GLOBAL(int80_landing_pad)
- /* Restore ECX and EDX in case they were clobbered. */
- popl %ecx
- CFI_RESTORE ecx
+ /*
+ * Restore EDX and ECX in case they were clobbered. EBP is not
+ * clobbered (the kernel restores it), but it's cleaner and
+ * probably faster to pop it than to adjust ESP using addl.
+ */
+ popl %ebp
+ CFI_RESTORE ebp
CFI_ADJUST_CFA_OFFSET -4
popl %edx
CFI_RESTORE edx
CFI_ADJUST_CFA_OFFSET -4
+ popl %ecx
+ CFI_RESTORE ecx
+ CFI_ADJUST_CFA_OFFSET -4
ret
CFI_ENDPROC
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 64df471..b8f69e2 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -12,6 +12,7 @@
#include <linux/random.h>
#include <linux/elf.h>
#include <linux/cpu.h>
+#include <asm/pvclock.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
#include <asm/vdso.h>
@@ -100,6 +101,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
.name = "[vvar]",
.pages = no_pages,
};
+ struct pvclock_vsyscall_time_info *pvti;
if (calculate_addr) {
addr = vdso_addr(current->mm->start_stack,
@@ -169,6 +171,18 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
}
#endif
+ pvti = pvclock_pvti_cpu0_va();
+ if (pvti && image->sym_pvclock_page) {
+ ret = remap_pfn_range(vma,
+ text_start + image->sym_pvclock_page,
+ __pa(pvti) >> PAGE_SHIFT,
+ PAGE_SIZE,
+ PAGE_READONLY);
+
+ if (ret)
+ goto up_fail;
+ }
+
up_fail:
if (ret)
current->mm->context.vdso = NULL;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index a30316b..c80f6b6 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -23,6 +23,11 @@
#define APIC_VERBOSE 1
#define APIC_DEBUG 2
+/* Macros for apic_extnmi which controls external NMI masking */
+#define APIC_EXTNMI_BSP 0 /* Default */
+#define APIC_EXTNMI_ALL 1
+#define APIC_EXTNMI_NONE 2
+
/*
* Define the default level of output to be very little
* This can be turned up by using apic=verbose for more
@@ -303,6 +308,7 @@ struct apic {
unsigned int *apicid);
/* ipi */
+ void (*send_IPI)(int cpu, int vector);
void (*send_IPI_mask)(const struct cpumask *mask, int vector);
void (*send_IPI_mask_allbutself)(const struct cpumask *mask,
int vector);
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index ae5fb83..3e86742 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -3,7 +3,6 @@
#include <linux/compiler.h>
#include <linux/types.h>
-#include <asm/processor.h>
#include <asm/alternative.h>
#include <asm/cmpxchg.h>
#include <asm/rmwcc.h>
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index a11c30b..a984111 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -3,7 +3,6 @@
#include <linux/compiler.h>
#include <linux/types.h>
-#include <asm/processor.h>
//#include <asm/cmpxchg.h>
/* An 64bit atomic type */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 43e1444..7ad8c94 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -202,6 +202,7 @@
#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */
+#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index f80d700..6d7d0e5 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -19,7 +19,6 @@
#include <asm/acpi.h>
#include <asm/apicdef.h>
#include <asm/page.h>
-#include <asm/pvclock.h>
#ifdef CONFIG_X86_32
#include <linux/threads.h>
#include <asm/kmap_types.h>
@@ -72,10 +71,6 @@ enum fixed_addresses {
#ifdef CONFIG_X86_VSYSCALL_EMULATION
VSYSCALL_PAGE = (FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT,
#endif
-#ifdef CONFIG_PARAVIRT_CLOCK
- PVCLOCK_FIXMAP_BEGIN,
- PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1,
-#endif
#endif
FIX_DBGP_BASE,
FIX_EARLYCON_MEM_BASE,
diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
new file mode 100644
index 0000000..e1a4117
--- /dev/null
+++ b/arch/x86/include/asm/intel_pt.h
@@ -0,0 +1,10 @@
+#ifndef _ASM_X86_INTEL_PT_H
+#define _ASM_X86_INTEL_PT_H
+
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
+void cpu_emergency_stop_pt(void);
+#else
+static inline void cpu_emergency_stop_pt(void) {}
+#endif
+
+#endif /* _ASM_X86_INTEL_PT_H */
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
index 615fa90..cfc9a0d 100644
--- a/arch/x86/include/asm/ipi.h
+++ b/arch/x86/include/asm/ipi.h
@@ -119,6 +119,8 @@ static inline void
native_apic_mem_write(APIC_ICR, cfg);
}
+extern void default_send_IPI_single(int cpu, int vector);
+extern void default_send_IPI_single_phys(int cpu, int vector);
extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask,
int vector);
extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 5daeca3..adc54c1 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -1,12 +1,18 @@
#ifndef _ASM_X86_JUMP_LABEL_H
#define _ASM_X86_JUMP_LABEL_H
-#ifndef __ASSEMBLY__
-
-#include <linux/stringify.h>
-#include <linux/types.h>
-#include <asm/nops.h>
-#include <asm/asm.h>
+#ifndef HAVE_JUMP_LABEL
+/*
+ * For better or for worse, if jump labels (the gcc extension) are missing,
+ * then the entire static branch patching infrastructure is compiled out.
+ * If that happens, the code in here will malfunction. Raise a compiler
+ * error instead.
+ *
+ * In theory, jump labels and the static branch patching infrastructure
+ * could be decoupled to fix this.
+ */
+#error asm/jump_label.h included on a non-jump-label kernel
+#endif
#define JUMP_LABEL_NOP_SIZE 5
@@ -16,6 +22,14 @@
# define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC
#endif
+#include <asm/asm.h>
+#include <asm/nops.h>
+
+#ifndef __ASSEMBLY__
+
+#include <linux/stringify.h>
+#include <linux/types.h>
+
static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
asm_volatile_goto("1:"
@@ -59,5 +73,40 @@ struct jump_entry {
jump_label_t key;
};
-#endif /* __ASSEMBLY__ */
+#else /* __ASSEMBLY__ */
+
+.macro STATIC_JUMP_IF_TRUE target, key, def
+.Lstatic_jump_\@:
+ .if \def
+ /* Equivalent to "jmp.d32 \target" */
+ .byte 0xe9
+ .long \target - .Lstatic_jump_after_\@
+.Lstatic_jump_after_\@:
+ .else
+ .byte STATIC_KEY_INIT_NOP
+ .endif
+ .pushsection __jump_table, "aw"
+ _ASM_ALIGN
+ _ASM_PTR .Lstatic_jump_\@, \target, \key
+ .popsection
+.endm
+
+.macro STATIC_JUMP_IF_FALSE target, key, def
+.Lstatic_jump_\@:
+ .if \def
+ .byte STATIC_KEY_INIT_NOP
+ .else
+ /* Equivalent to "jmp.d32 \target" */
+ .byte 0xe9
+ .long \target - .Lstatic_jump_after_\@
+.Lstatic_jump_after_\@:
+ .endif
+ .pushsection __jump_table, "aw"
+ _ASM_ALIGN
+ _ASM_PTR .Lstatic_jump_\@, \target, \key + 1
+ .popsection
+.endm
+
+#endif /* __ASSEMBLY__ */
+
#endif
diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h
index 93724cc..eb4b09b 100644
--- a/arch/x86/include/asm/msi.h
+++ b/arch/x86/include/asm/msi.h
@@ -1,7 +1,13 @@
#ifndef _ASM_X86_MSI_H
#define _ASM_X86_MSI_H
#include <asm/hw_irq.h>
+#include <asm/irqdomain.h>
typedef struct irq_alloc_info msi_alloc_info_t;
+int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
+ msi_alloc_info_t *arg);
+
+void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc);
+
#endif /* _ASM_X86_MSI_H */
diff --git a/arch/x86/include/asm/msr-trace.h b/arch/x86/include/asm/msr-trace.h
new file mode 100644
index 0000000..7567225
--- /dev/null
+++ b/arch/x86/include/asm/msr-trace.h
@@ -0,0 +1,57 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM msr
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE msr-trace
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH asm/
+
+#if !defined(_TRACE_MSR_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MSR_H
+
+#include <linux/tracepoint.h>
+
+/*
+ * Tracing for x86 model specific registers. Directly maps to the
+ * RDMSR/WRMSR instructions.
+ */
+
+DECLARE_EVENT_CLASS(msr_trace_class,
+ TP_PROTO(unsigned msr, u64 val, int failed),
+ TP_ARGS(msr, val, failed),
+ TP_STRUCT__entry(
+ __field( unsigned, msr )
+ __field( u64, val )
+ __field( int, failed )
+ ),
+ TP_fast_assign(
+ __entry->msr = msr;
+ __entry->val = val;
+ __entry->failed = failed;
+ ),
+ TP_printk("%x, value %llx%s",
+ __entry->msr,
+ __entry->val,
+ __entry->failed ? " #GP" : "")
+);
+
+DEFINE_EVENT(msr_trace_class, read_msr,
+ TP_PROTO(unsigned msr, u64 val, int failed),
+ TP_ARGS(msr, val, failed)
+);
+
+DEFINE_EVENT(msr_trace_class, write_msr,
+ TP_PROTO(unsigned msr, u64 val, int failed),
+ TP_ARGS(msr, val, failed)
+);
+
+DEFINE_EVENT(msr_trace_class, rdpmc,
+ TP_PROTO(unsigned msr, u64 val, int failed),
+ TP_ARGS(msr, val, failed)
+);
+
+#endif /* _TRACE_MSR_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 77d8b28..fedd6e6 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -57,11 +57,34 @@ static inline unsigned long long native_read_tscp(unsigned int *aux)
#define EAX_EDX_RET(val, low, high) "=A" (val)
#endif
+#ifdef CONFIG_TRACEPOINTS
+/*
+ * Be very careful with includes. This header is prone to include loops.
+ */
+#include <asm/atomic.h>
+#include <linux/tracepoint-defs.h>
+
+extern struct tracepoint __tracepoint_read_msr;
+extern struct tracepoint __tracepoint_write_msr;
+extern struct tracepoint __tracepoint_rdpmc;
+#define msr_tracepoint_active(t) static_key_false(&(t).key)
+extern void do_trace_write_msr(unsigned msr, u64 val, int failed);
+extern void do_trace_read_msr(unsigned msr, u64 val, int failed);
+extern void do_trace_rdpmc(unsigned msr, u64 val, int failed);
+#else
+#define msr_tracepoint_active(t) false
+static inline void do_trace_write_msr(unsigned msr, u64 val, int failed) {}
+static inline void do_trace_read_msr(unsigned msr, u64 val, int failed) {}
+static inline void do_trace_rdpmc(unsigned msr, u64 val, int failed) {}
+#endif
+
static inline unsigned long long native_read_msr(unsigned int msr)
{
DECLARE_ARGS(val, low, high);
asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr));
+ if (msr_tracepoint_active(__tracepoint_read_msr))
+ do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), 0);
return EAX_EDX_VAL(val, low, high);
}
@@ -78,6 +101,8 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr,
_ASM_EXTABLE(2b, 3b)
: [err] "=r" (*err), EAX_EDX_RET(val, low, high)
: "c" (msr), [fault] "i" (-EIO));
+ if (msr_tracepoint_active(__tracepoint_read_msr))
+ do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), *err);
return EAX_EDX_VAL(val, low, high);
}
@@ -85,6 +110,8 @@ static inline void native_write_msr(unsigned int msr,
unsigned low, unsigned high)
{
asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
+ if (msr_tracepoint_active(__tracepoint_read_msr))
+ do_trace_write_msr(msr, ((u64)high << 32 | low), 0);
}
/* Can be uninlined because referenced by paravirt */
@@ -102,6 +129,8 @@ notrace static inline int native_write_msr_safe(unsigned int msr,
: "c" (msr), "0" (low), "d" (high),
[fault] "i" (-EIO)
: "memory");
+ if (msr_tracepoint_active(__tracepoint_read_msr))
+ do_trace_write_msr(msr, ((u64)high << 32 | low), err);
return err;
}
@@ -160,6 +189,8 @@ static inline unsigned long long native_read_pmc(int counter)
DECLARE_ARGS(val, low, high);
asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
+ if (msr_tracepoint_active(__tracepoint_rdpmc))
+ do_trace_rdpmc(counter, EAX_EDX_VAL(val, low, high), 0);
return EAX_EDX_VAL(val, low, high);
}
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index cbbf41c..f619250 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -19,6 +19,12 @@ static inline int paravirt_enabled(void)
return pv_info.paravirt_enabled;
}
+static inline int paravirt_has_feature(unsigned int feature)
+{
+ WARN_ON_ONCE(!pv_info.paravirt_enabled);
+ return (pv_info.features & feature);
+}
+
static inline void load_sp0(struct tss_struct *tss,
struct thread_struct *thread)
{
@@ -896,23 +902,11 @@ extern void default_banner(void);
call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \
PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
-#define USERGS_SYSRET32 \
- PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \
- CLBR_NONE, \
- jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret32))
-
#ifdef CONFIG_X86_32
#define GET_CR0_INTO_EAX \
push %ecx; push %edx; \
call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \
pop %edx; pop %ecx
-
-#define ENABLE_INTERRUPTS_SYSEXIT \
- PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \
- CLBR_NONE, \
- jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
-
-
#else /* !CONFIG_X86_32 */
/*
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 0451503..77db561 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -70,9 +70,14 @@ struct pv_info {
#endif
int paravirt_enabled;
+ unsigned int features; /* valid only if paravirt_enabled is set */
const char *name;
};
+#define paravirt_has(x) paravirt_has_feature(PV_SUPPORTED_##x)
+/* Supported features */
+#define PV_SUPPORTED_RTC (1<<0)
+
struct pv_init_ops {
/*
* Patch may replace one of the defined code sequences with
@@ -157,15 +162,6 @@ struct pv_cpu_ops {
u64 (*read_pmc)(int counter);
-#ifdef CONFIG_X86_32
- /*
- * Atomically enable interrupts and return to userspace. This
- * is only used in 32-bit kernels. 64-bit kernels use
- * usergs_sysret32 instead.
- */
- void (*irq_enable_sysexit)(void);
-#endif
-
/*
* Switch to usermode gs and return to 64-bit usermode using
* sysret. Only used in 64-bit kernels to return to 64-bit
@@ -174,14 +170,6 @@ struct pv_cpu_ops {
*/
void (*usergs_sysret64)(void);
- /*
- * Switch to usermode gs and return to 32-bit usermode using
- * sysret. Used to return to 32-on-64 compat processes.
- * Other usermode register state, including %esp, must already
- * be restored.
- */
- void (*usergs_sysret32)(void);
-
/* Normal iret. Jump to this with the standard iret stack
frame set up. */
void (*iret)(void);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 6752225..2d5a50c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -472,6 +472,7 @@ static inline unsigned long current_top_of_stack(void)
#else
#define __cpuid native_cpuid
#define paravirt_enabled() 0
+#define paravirt_has(x) 0
static inline void load_sp0(struct tss_struct *tss,
struct thread_struct *thread)
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 7a6bed5..fdcc040 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -4,6 +4,15 @@
#include <linux/clocksource.h>
#include <asm/pvclock-abi.h>
+#ifdef CONFIG_KVM_GUEST
+extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void);
+#else
+static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
+{
+ return NULL;
+}
+#endif
+
/* some helper functions for xen and kvm pv clock sources */
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
@@ -91,10 +100,5 @@ struct pvclock_vsyscall_time_info {
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
-#define PVCLOCK_VSYSCALL_NR_PAGES (((NR_CPUS-1)/(PAGE_SIZE/PVTI_SIZE))+1)
-
-int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
- int size);
-struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu);
#endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index b002e71..9f92c18 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -1,6 +1,65 @@
#ifndef __ASM_QSPINLOCK_PARAVIRT_H
#define __ASM_QSPINLOCK_PARAVIRT_H
+/*
+ * For x86-64, PV_CALLEE_SAVE_REGS_THUNK() saves and restores 8 64-bit
+ * registers. For i386, however, only 1 32-bit register needs to be saved
+ * and restored. So an optimized version of __pv_queued_spin_unlock() is
+ * hand-coded for 64-bit, but it isn't worthwhile to do it for 32-bit.
+ */
+#ifdef CONFIG_64BIT
+
+PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath);
+#define __pv_queued_spin_unlock __pv_queued_spin_unlock
+#define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock"
+#define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath"
+
+/*
+ * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock
+ * which combines the registers saving trunk and the body of the following
+ * C code:
+ *
+ * void __pv_queued_spin_unlock(struct qspinlock *lock)
+ * {
+ * struct __qspinlock *l = (void *)lock;
+ * u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ *
+ * if (likely(lockval == _Q_LOCKED_VAL))
+ * return;
+ * pv_queued_spin_unlock_slowpath(lock, lockval);
+ * }
+ *
+ * For x86-64,
+ * rdi = lock (first argument)
+ * rsi = lockval (second argument)
+ * rdx = internal variable (set to 0)
+ */
+asm (".pushsection .text;"
+ ".globl " PV_UNLOCK ";"
+ ".align 4,0x90;"
+ PV_UNLOCK ": "
+ "push %rdx;"
+ "mov $0x1,%eax;"
+ "xor %edx,%edx;"
+ "lock cmpxchg %dl,(%rdi);"
+ "cmp $0x1,%al;"
+ "jne .slowpath;"
+ "pop %rdx;"
+ "ret;"
+ ".slowpath: "
+ "push %rsi;"
+ "movzbl %al,%esi;"
+ "call " PV_UNLOCK_SLOWPATH ";"
+ "pop %rsi;"
+ "pop %rdx;"
+ "ret;"
+ ".size " PV_UNLOCK ", .-" PV_UNLOCK ";"
+ ".popsection");
+
+#else /* CONFIG_64BIT */
+
+extern void __pv_queued_spin_unlock(struct qspinlock *lock);
PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock);
+#endif /* CONFIG_64BIT */
#endif
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index a82c4f1..2cb1cc2 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -25,5 +25,6 @@ void __noreturn machine_real_restart(unsigned int type);
typedef void (*nmi_shootdown_cb)(int, struct pt_regs*);
void nmi_shootdown_cpus(nmi_shootdown_cb callback);
+void run_crash_ipi_callback(struct pt_regs *regs);
#endif /* _ASM_X86_REBOOT_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 09b1b0a..660458a 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -745,5 +745,14 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
#undef __copy_from_user_overflow
#undef __copy_to_user_overflow
+/*
+ * We rely on the nested NMI work to allow atomic faults from the NMI path; the
+ * nested NMI paths are careful to preserve CR2.
+ *
+ * Caller must use pagefault_enable/disable, or run in interrupt context,
+ * and also do a uaccess_ok() check
+ */
+#define __copy_from_user_nmi __copy_from_user_inatomic
+
#endif /* _ASM_X86_UACCESS_H */
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 756de91..deabaf9 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -22,6 +22,7 @@ struct vdso_image {
long sym_vvar_page;
long sym_hpet_page;
+ long sym_pvclock_page;
long sym_VDSO32_NOTE_MASK;
long sym___kernel_sigreturn;
long sym___kernel_rt_sigreturn;
diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h
index 03429da..2184943 100644
--- a/arch/x86/include/uapi/asm/mce.h
+++ b/arch/x86/include/uapi/asm/mce.h
@@ -16,7 +16,7 @@ struct mce {
__u8 cpuvendor; /* cpu vendor as encoded in system.h */
__u8 inject_flags; /* software inject flags */
__u8 severity;
- __u8 usable_addr;
+ __u8 pad;
__u32 cpuid; /* CPUID 1 EAX */
__u8 cs; /* code segment */
__u8 bank; /* machine check bank */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 2f69e3b..8a5cdda 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -82,6 +82,12 @@ physid_mask_t phys_cpu_present_map;
static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID;
/*
+ * This variable controls which CPUs receive external NMIs. By default,
+ * external NMIs are delivered only to the BSP.
+ */
+static int apic_extnmi = APIC_EXTNMI_BSP;
+
+/*
* Map cpu index to physical APIC ID
*/
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
@@ -1161,6 +1167,8 @@ void __init init_bsp_APIC(void)
value = APIC_DM_NMI;
if (!lapic_is_integrated()) /* 82489DX */
value |= APIC_LVT_LEVEL_TRIGGER;
+ if (apic_extnmi == APIC_EXTNMI_NONE)
+ value |= APIC_LVT_MASKED;
apic_write(APIC_LVT1, value);
}
@@ -1378,9 +1386,11 @@ void setup_local_APIC(void)
apic_write(APIC_LVT0, value);
/*
- * only the BP should see the LINT1 NMI signal, obviously.
+ * Only the BSP sees the LINT1 NMI signal by default. This can be
+ * modified by apic_extnmi= boot option.
*/
- if (!cpu)
+ if ((!cpu && apic_extnmi != APIC_EXTNMI_NONE) ||
+ apic_extnmi == APIC_EXTNMI_ALL)
value = APIC_DM_NMI;
else
value = APIC_DM_NMI | APIC_LVT_MASKED;
@@ -2270,6 +2280,7 @@ static struct {
unsigned int apic_tmict;
unsigned int apic_tdcr;
unsigned int apic_thmr;
+ unsigned int apic_cmci;
} apic_pm_state;
static int lapic_suspend(void)
@@ -2299,6 +2310,10 @@ static int lapic_suspend(void)
if (maxlvt >= 5)
apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
#endif
+#ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6)
+ apic_pm_state.apic_cmci = apic_read(APIC_LVTCMCI);
+#endif
local_irq_save(flags);
disable_local_APIC();
@@ -2355,10 +2370,14 @@ static void lapic_resume(void)
apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-#if defined(CONFIG_X86_MCE_INTEL)
+#ifdef CONFIG_X86_THERMAL_VECTOR
if (maxlvt >= 5)
apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
#endif
+#ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6)
+ apic_write(APIC_LVTCMCI, apic_pm_state.apic_cmci);
+#endif
if (maxlvt >= 4)
apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
@@ -2548,3 +2567,23 @@ static int __init apic_set_disabled_cpu_apicid(char *arg)
return 0;
}
early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid);
+
+static int __init apic_set_extnmi(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strncmp("all", arg, 3))
+ apic_extnmi = APIC_EXTNMI_ALL;
+ else if (!strncmp("none", arg, 4))
+ apic_extnmi = APIC_EXTNMI_NONE;
+ else if (!strncmp("bsp", arg, 3))
+ apic_extnmi = APIC_EXTNMI_BSP;
+ else {
+ pr_warn("Unknown external NMI delivery mode `%s' ignored\n", arg);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+early_param("apic_extnmi", apic_set_extnmi);
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index f92ab36..9968f30 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -185,6 +185,7 @@ static struct apic apic_flat = {
.cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
+ .send_IPI = default_send_IPI_single,
.send_IPI_mask = flat_send_IPI_mask,
.send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
.send_IPI_allbutself = flat_send_IPI_allbutself,
@@ -230,17 +231,6 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 0;
}
-static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
-{
- default_send_IPI_mask_sequence_phys(cpumask, vector);
-}
-
-static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
- int vector)
-{
- default_send_IPI_mask_allbutself_phys(cpumask, vector);
-}
-
static void physflat_send_IPI_allbutself(int vector)
{
default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
@@ -248,7 +238,7 @@ static void physflat_send_IPI_allbutself(int vector)
static void physflat_send_IPI_all(int vector)
{
- physflat_send_IPI_mask(cpu_online_mask, vector);
+ default_send_IPI_mask_sequence_phys(cpu_online_mask, vector);
}
static int physflat_probe(void)
@@ -292,8 +282,9 @@ static struct apic apic_physflat = {
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
- .send_IPI_mask = physflat_send_IPI_mask,
- .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
+ .send_IPI = default_send_IPI_single_phys,
+ .send_IPI_mask = default_send_IPI_mask_sequence_phys,
+ .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_phys,
.send_IPI_allbutself = physflat_send_IPI_allbutself,
.send_IPI_all = physflat_send_IPI_all,
.send_IPI_self = apic_send_IPI_self,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 0d96749..331a7a0 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -30,6 +30,7 @@
#include <asm/e820.h>
static void noop_init_apic_ldr(void) { }
+static void noop_send_IPI(int cpu, int vector) { }
static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
static void noop_send_IPI_allbutself(int vector) { }
@@ -144,6 +145,7 @@ struct apic apic_noop = {
.cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
+ .send_IPI = noop_send_IPI,
.send_IPI_mask = noop_send_IPI_mask,
.send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
.send_IPI_allbutself = noop_send_IPI_allbutself,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 38dd5ef..c80c02c 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -193,20 +193,17 @@ static int __init numachip_system_init(void)
case 1:
init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
numachip_apic_icr_write = numachip1_apic_icr_write;
- x86_init.pci.arch_init = pci_numachip_init;
break;
case 2:
init_extra_mapping_uc(NUMACHIP2_LCSR_BASE, NUMACHIP2_LCSR_SIZE);
numachip_apic_icr_write = numachip2_apic_icr_write;
-
- /* Use MCFG config cycles rather than locked CF8 cycles */
- raw_pci_ops = &pci_mmcfg;
break;
default:
return 0;
}
x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
+ x86_init.pci.arch_init = pci_numachip_init;
return 0;
}
@@ -276,6 +273,7 @@ static const struct apic apic_numachip1 __refconst = {
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .send_IPI = numachip_send_IPI_one,
.send_IPI_mask = numachip_send_IPI_mask,
.send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
.send_IPI_allbutself = numachip_send_IPI_allbutself,
@@ -327,6 +325,7 @@ static const struct apic apic_numachip2 __refconst = {
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .send_IPI = numachip_send_IPI_one,
.send_IPI_mask = numachip_send_IPI_mask,
.send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
.send_IPI_allbutself = numachip_send_IPI_allbutself,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 971cf88..cf9bd89 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -96,11 +96,6 @@ static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
return cpuid_apic >> index_msb;
}
-static inline void bigsmp_send_IPI_mask(const struct cpumask *mask, int vector)
-{
- default_send_IPI_mask_sequence_phys(mask, vector);
-}
-
static void bigsmp_send_IPI_allbutself(int vector)
{
default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
@@ -108,7 +103,7 @@ static void bigsmp_send_IPI_allbutself(int vector)
static void bigsmp_send_IPI_all(int vector)
{
- bigsmp_send_IPI_mask(cpu_online_mask, vector);
+ default_send_IPI_mask_sequence_phys(cpu_online_mask, vector);
}
static int dmi_bigsmp; /* can be set by dmi scanners */
@@ -180,7 +175,8 @@ static struct apic apic_bigsmp = {
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
- .send_IPI_mask = bigsmp_send_IPI_mask,
+ .send_IPI = default_send_IPI_single_phys,
+ .send_IPI_mask = default_send_IPI_mask_sequence_phys,
.send_IPI_mask_allbutself = NULL,
.send_IPI_allbutself = bigsmp_send_IPI_allbutself,
.send_IPI_all = bigsmp_send_IPI_all,
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 6207156..eb45fc9 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -18,6 +18,16 @@
#include <asm/proto.h>
#include <asm/ipi.h>
+void default_send_IPI_single_phys(int cpu, int vector)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, cpu),
+ vector, APIC_DEST_PHYSICAL);
+ local_irq_restore(flags);
+}
+
void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector)
{
unsigned long query_cpu;
@@ -55,6 +65,14 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
local_irq_restore(flags);
}
+/*
+ * Helper function for APICs which insist on cpumasks
+ */
+void default_send_IPI_single(int cpu, int vector)
+{
+ apic->send_IPI_mask(cpumask_of(cpu), vector);
+}
+
#ifdef CONFIG_X86_32
void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 5f1feb6..ade2532 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -96,8 +96,8 @@ static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info,
return arg->msi_hwirq;
}
-static int pci_msi_prepare(struct irq_domain *domain, struct device *dev,
- int nvec, msi_alloc_info_t *arg)
+int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
+ msi_alloc_info_t *arg)
{
struct pci_dev *pdev = to_pci_dev(dev);
struct msi_desc *desc = first_pci_msi_entry(pdev);
@@ -113,11 +113,13 @@ static int pci_msi_prepare(struct irq_domain *domain, struct device *dev,
return 0;
}
+EXPORT_SYMBOL_GPL(pci_msi_prepare);
-static void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
+void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
{
arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc);
}
+EXPORT_SYMBOL_GPL(pci_msi_set_desc);
static struct msi_domain_ops pci_msi_domain_ops = {
.get_hwirq = pci_msi_get_hwirq,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 7694ae6..f316e34 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -105,6 +105,7 @@ static struct apic apic_default = {
.cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
+ .send_IPI = default_send_IPI_single,
.send_IPI_mask = default_send_IPI_mask_logical,
.send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical,
.send_IPI_allbutself = default_send_IPI_allbutself,
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 861bc59..908cb37 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -29,6 +29,7 @@ struct apic_chip_data {
};
struct irq_domain *x86_vector_domain;
+EXPORT_SYMBOL_GPL(x86_vector_domain);
static DEFINE_RAW_SPINLOCK(vector_lock);
static cpumask_var_t vector_cpumask;
static struct irq_chip lapic_controller;
@@ -66,6 +67,7 @@ struct irq_cfg *irqd_cfg(struct irq_data *irq_data)
return data ? &data->cfg : NULL;
}
+EXPORT_SYMBOL_GPL(irqd_cfg);
struct irq_cfg *irq_cfg(unsigned int irq)
{
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index cc8311c..aca8b75 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -23,6 +23,14 @@ static inline u32 x2apic_cluster(int cpu)
return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16;
}
+static void x2apic_send_IPI(int cpu, int vector)
+{
+ u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
+
+ x2apic_wrmsr_fence();
+ __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
+}
+
static void
__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
{
@@ -266,6 +274,7 @@ static struct apic apic_x2apic_cluster = {
.cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
+ .send_IPI = x2apic_send_IPI,
.send_IPI_mask = x2apic_send_IPI_mask,
.send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 662e915..a1242e2 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -36,6 +36,14 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys());
}
+static void x2apic_send_IPI(int cpu, int vector)
+{
+ u32 dest = per_cpu(x86_cpu_to_apicid, cpu);
+
+ x2apic_wrmsr_fence();
+ __x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL);
+}
+
static void
__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
{
@@ -122,6 +130,7 @@ static struct apic apic_x2apic_phys = {
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .send_IPI = x2apic_send_IPI,
.send_IPI_mask = x2apic_send_IPI_mask,
.send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 4a13946..d760c6b 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -406,6 +406,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
.cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
+ .send_IPI = uv_send_IPI_one,
.send_IPI_mask = uv_send_IPI_mask,
.send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
.send_IPI_allbutself = uv_send_IPI_allbutself,
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 439df97..84a7524 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -65,9 +65,6 @@ void common(void) {
OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
-#ifdef CONFIG_X86_32
- OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
-#endif
OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
#endif
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index d8f42f9..f2edafb 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -23,7 +23,6 @@ int main(void)
{
#ifdef CONFIG_PARAVIRT
OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
- OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
BLANK();
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index c5b0d56..a006f4c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -114,7 +114,6 @@ static struct work_struct mce_work;
static struct irq_work mce_irq_work;
static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
-static int mce_usable_address(struct mce *m);
/*
* CPU/chipset specific EDAC code can register a notifier call here to print
@@ -475,6 +474,28 @@ static void mce_report_event(struct pt_regs *regs)
irq_work_queue(&mce_irq_work);
}
+/*
+ * Check if the address reported by the CPU is in a format we can parse.
+ * It would be possible to add code for most other cases, but all would
+ * be somewhat complicated (e.g. segment offset would require an instruction
+ * parser). So only support physical addresses up to page granuality for now.
+ */
+static int mce_usable_address(struct mce *m)
+{
+ if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
+ return 0;
+
+ /* Checks after this one are Intel-specific: */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 1;
+
+ if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
+ return 0;
+ if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
+ return 0;
+ return 1;
+}
+
static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
@@ -484,7 +505,7 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
if (!mce)
return NOTIFY_DONE;
- if (mce->usable_addr && (mce->severity == MCE_AO_SEVERITY)) {
+ if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
pfn = mce->addr >> PAGE_SHIFT;
memory_failure(pfn, MCE_VECTOR, 0);
}
@@ -522,10 +543,10 @@ static bool memory_error(struct mce *m)
struct cpuinfo_x86 *c = &boot_cpu_data;
if (c->x86_vendor == X86_VENDOR_AMD) {
- /*
- * coming soon
- */
- return false;
+ /* ErrCodeExt[20:16] */
+ u8 xec = (m->status >> 16) & 0x1f;
+
+ return (xec == 0x0 || xec == 0x8);
} else if (c->x86_vendor == X86_VENDOR_INTEL) {
/*
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
@@ -567,7 +588,7 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
*/
bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
- bool error_logged = false;
+ bool error_seen = false;
struct mce m;
int severity;
int i;
@@ -601,6 +622,8 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
(m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
continue;
+ error_seen = true;
+
mce_read_aux(&m, i);
if (!(flags & MCP_TIMESTAMP))
@@ -608,27 +631,24 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
- /*
- * In the cases where we don't have a valid address after all,
- * do not add it into the ring buffer.
- */
- if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {
- if (m.status & MCI_STATUS_ADDRV) {
+ if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m))
+ if (m.status & MCI_STATUS_ADDRV)
m.severity = severity;
- m.usable_addr = mce_usable_address(&m);
-
- if (!mce_gen_pool_add(&m))
- mce_schedule_work();
- }
- }
/*
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
- if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) {
- error_logged = true;
+ if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
mce_log(&m);
+ else if (mce_usable_address(&m)) {
+ /*
+ * Although we skipped logging this, we still want
+ * to take action. Add to the pool so the registered
+ * notifiers will see it.
+ */
+ if (!mce_gen_pool_add(&m))
+ mce_schedule_work();
}
/*
@@ -644,7 +664,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
sync_core();
- return error_logged;
+ return error_seen;
}
EXPORT_SYMBOL_GPL(machine_check_poll);
@@ -931,23 +951,6 @@ reset:
return ret;
}
-/*
- * Check if the address reported by the CPU is in a format we can parse.
- * It would be possible to add code for most other cases, but all would
- * be somewhat complicated (e.g. segment offset would require an instruction
- * parser). So only support physical addresses up to page granuality for now.
- */
-static int mce_usable_address(struct mce *m)
-{
- if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
- return 0;
- if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
- return 0;
- if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
- return 0;
- return 1;
-}
-
static void mce_clear_state(unsigned long *toclear)
{
int i;
@@ -999,6 +1002,17 @@ void do_machine_check(struct pt_regs *regs, long error_code)
int flags = MF_ACTION_REQUIRED;
int lmce = 0;
+ /* If this CPU is offline, just bail out. */
+ if (cpu_is_offline(smp_processor_id())) {
+ u64 mcgstatus;
+
+ mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ if (mcgstatus & MCG_STATUS_RIPV) {
+ mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ return;
+ }
+ }
+
ist_enter(regs);
this_cpu_inc(mce_exception_count);
@@ -1089,7 +1103,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
/* assuming valid severity level != 0 */
m.severity = severity;
- m.usable_addr = mce_usable_address(&m);
mce_log(&m);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2bf79d7..1b443db 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -482,6 +482,9 @@ int x86_pmu_hw_config(struct perf_event *event)
/* Support for IP fixup */
if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
precise++;
+
+ if (x86_pmu.pebs_prec_dist)
+ precise++;
}
if (event->attr.precise_ip > precise)
@@ -1531,6 +1534,7 @@ static void __init filter_events(struct attribute **attrs)
{
struct device_attribute *d;
struct perf_pmu_events_attr *pmu_attr;
+ int offset = 0;
int i, j;
for (i = 0; attrs[i]; i++) {
@@ -1539,7 +1543,7 @@ static void __init filter_events(struct attribute **attrs)
/* str trumps id */
if (pmu_attr->event_str)
continue;
- if (x86_pmu.event_map(i))
+ if (x86_pmu.event_map(i + offset))
continue;
for (j = i; attrs[j]; j++)
@@ -1547,6 +1551,14 @@ static void __init filter_events(struct attribute **attrs)
/* Check the shifted attr. */
i--;
+
+ /*
+ * event_map() is index based, the attrs array is organized
+ * by increasing event index. If we shift the events, then
+ * we need to compensate for the event_map(), otherwise
+ * we are looking up the wrong event in the map
+ */
+ offset++;
}
}
@@ -2250,12 +2262,19 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
ss_base = get_segment_base(regs->ss);
fp = compat_ptr(ss_base + regs->bp);
+ pagefault_disable();
while (entry->nr < PERF_MAX_STACK_DEPTH) {
unsigned long bytes;
frame.next_frame = 0;
frame.return_address = 0;
- bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
+ if (!access_ok(VERIFY_READ, fp, 8))
+ break;
+
+ bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4);
+ if (bytes != 0)
+ break;
+ bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4);
if (bytes != 0)
break;
@@ -2265,6 +2284,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
perf_callchain_store(entry, cs_base + frame.return_address);
fp = compat_ptr(ss_base + frame.next_frame);
}
+ pagefault_enable();
return 1;
}
#else
@@ -2302,12 +2322,19 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
if (perf_callchain_user32(regs, entry))
return;
+ pagefault_disable();
while (entry->nr < PERF_MAX_STACK_DEPTH) {
unsigned long bytes;
frame.next_frame = NULL;
frame.return_address = 0;
- bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
+ if (!access_ok(VERIFY_READ, fp, 16))
+ break;
+
+ bytes = __copy_from_user_nmi(&frame.next_frame, fp, 8);
+ if (bytes != 0)
+ break;
+ bytes = __copy_from_user_nmi(&frame.return_address, fp+8, 8);
if (bytes != 0)
break;
@@ -2315,8 +2342,9 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
break;
perf_callchain_store(entry, frame.return_address);
- fp = frame.next_frame;
+ fp = (void __user *)frame.next_frame;
}
+ pagefault_enable();
}
/*
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index d0e35eb..7bb61e3 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -14,17 +14,7 @@
#include <linux/perf_event.h>
-#if 0
-#undef wrmsrl
-#define wrmsrl(msr, val) \
-do { \
- unsigned int _msr = (msr); \
- u64 _val = (val); \
- trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr), \
- (unsigned long long)(_val)); \
- native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32)); \
-} while (0)
-#endif
+/* To enable MSR tracing please use the generic trace points. */
/*
* | NHM/WSM | SNB |
@@ -318,6 +308,10 @@ struct cpu_hw_events {
#define INTEL_UEVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
+/* Constraint on specific umask bit only + event */
+#define INTEL_UBIT_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|(c))
+
/* Like UEVENT_CONSTRAINT, but match flags too */
#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
@@ -589,7 +583,8 @@ struct x86_pmu {
bts_active :1,
pebs :1,
pebs_active :1,
- pebs_broken :1;
+ pebs_broken :1,
+ pebs_prec_dist :1;
int pebs_record_size;
void (*drain_pebs)(struct pt_regs *regs);
struct event_constraint *pebs_constraints;
@@ -907,6 +902,8 @@ void intel_pmu_lbr_init_hsw(void);
void intel_pmu_lbr_init_skl(void);
+void intel_pmu_lbr_init_knl(void);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 3ea177c..5861053 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -18,7 +18,7 @@ static __initconst const u64 amd_hw_cache_event_ids
[ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */
},
[ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
+ [ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0,
},
[ C(OP_PREFETCH) ] = {
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index e2a4300..a667078 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -185,6 +185,14 @@ struct event_constraint intel_skl_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+static struct extra_reg intel_knl_extra_regs[] __read_mostly = {
+ INTEL_UEVENT_EXTRA_REG(0x01b7,
+ MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7,
+ MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1),
+ EVENT_EXTRA_END
+};
+
static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
@@ -255,7 +263,7 @@ struct event_constraint intel_bdw_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
- INTEL_UEVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
+ INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
EVENT_CONSTRAINT_END
};
@@ -1457,6 +1465,42 @@ static __initconst const u64 slm_hw_cache_event_ids
},
};
+#define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */
+#define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */
+#define KNL_MCDRAM_LOCAL BIT_ULL(21)
+#define KNL_MCDRAM_FAR BIT_ULL(22)
+#define KNL_DDR_LOCAL BIT_ULL(23)
+#define KNL_DDR_FAR BIT_ULL(24)
+#define KNL_DRAM_ANY (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \
+ KNL_DDR_LOCAL | KNL_DDR_FAR)
+#define KNL_L2_READ SLM_DMND_READ
+#define KNL_L2_WRITE SLM_DMND_WRITE
+#define KNL_L2_PREFETCH SLM_DMND_PREFETCH
+#define KNL_L2_ACCESS SLM_LLC_ACCESS
+#define KNL_L2_MISS (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \
+ KNL_DRAM_ANY | SNB_SNP_ANY | \
+ SNB_NON_DRAM)
+
+static __initconst const u64 knl_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+ [C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS,
+ [C(RESULT_MISS)] = 0,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS,
+ [C(RESULT_MISS)] = KNL_L2_WRITE | KNL_L2_MISS,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS,
+ [C(RESULT_MISS)] = KNL_L2_PREFETCH | KNL_L2_MISS,
+ },
+ },
+};
+
/*
* Use from PMIs where the LBRs are already disabled.
*/
@@ -2475,6 +2519,44 @@ static void intel_pebs_aliases_snb(struct perf_event *event)
}
}
+static void intel_pebs_aliases_precdist(struct perf_event *event)
+{
+ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ /*
+ * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+ * (0x003c) so that we can use it with PEBS.
+ *
+ * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+ * PEBS capable. However we can use INST_RETIRED.PREC_DIST
+ * (0x01c0), which is a PEBS capable event, to get the same
+ * count.
+ *
+ * The PREC_DIST event has special support to minimize sample
+ * shadowing effects. One drawback is that it can be
+ * only programmed on counter 1, but that seems like an
+ * acceptable trade off.
+ */
+ u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16);
+
+ alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+ event->hw.config = alt_config;
+ }
+}
+
+static void intel_pebs_aliases_ivb(struct perf_event *event)
+{
+ if (event->attr.precise_ip < 3)
+ return intel_pebs_aliases_snb(event);
+ return intel_pebs_aliases_precdist(event);
+}
+
+static void intel_pebs_aliases_skl(struct perf_event *event)
+{
+ if (event->attr.precise_ip < 3)
+ return intel_pebs_aliases_core2(event);
+ return intel_pebs_aliases_precdist(event);
+}
+
static unsigned long intel_pmu_free_running_flags(struct perf_event *event)
{
unsigned long flags = x86_pmu.free_running_flags;
@@ -3332,6 +3414,7 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_gen_event_constraints;
x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_core2;
pr_cont("Atom events, ");
break;
@@ -3431,7 +3514,8 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_ivb_event_constraints;
x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
- x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+ x86_pmu.pebs_prec_dist = true;
if (boot_cpu_data.x86_model == 62)
x86_pmu.extra_regs = intel_snbep_extra_regs;
else
@@ -3464,7 +3548,8 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_hsw_event_constraints;
x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
x86_pmu.extra_regs = intel_snbep_extra_regs;
- x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+ x86_pmu.pebs_prec_dist = true;
/* all extra regs are per-cpu when HT is on */
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
@@ -3499,7 +3584,8 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_bdw_event_constraints;
x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
x86_pmu.extra_regs = intel_snbep_extra_regs;
- x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+ x86_pmu.pebs_prec_dist = true;
/* all extra regs are per-cpu when HT is on */
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
@@ -3511,6 +3597,24 @@ __init int intel_pmu_init(void)
pr_cont("Broadwell events, ");
break;
+ case 87: /* Knights Landing Xeon Phi */
+ memcpy(hw_cache_event_ids,
+ slm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs,
+ knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ intel_pmu_lbr_init_knl();
+
+ x86_pmu.event_constraints = intel_slm_event_constraints;
+ x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_knl_extra_regs;
+
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ pr_cont("Knights Landing events, ");
+ break;
+
case 78: /* 14nm Skylake Mobile */
case 94: /* 14nm Skylake Desktop */
x86_pmu.late_ack = true;
@@ -3521,7 +3625,8 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_skl_event_constraints;
x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
x86_pmu.extra_regs = intel_skl_extra_regs;
- x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
+ x86_pmu.pebs_prec_dist = true;
/* all extra regs are per-cpu when HT is on */
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 5db1c77..10602f0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -620,6 +620,8 @@ struct event_constraint intel_atom_pebs_event_constraints[] = {
INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
/* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
EVENT_CONSTRAINT_END
};
@@ -686,6 +688,8 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = {
INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
@@ -700,6 +704,8 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
@@ -718,9 +724,10 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
struct event_constraint intel_skl_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
- /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
- INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
+ /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
@@ -1101,6 +1108,13 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit)
void *at;
u64 pebs_status;
+ /*
+ * fmt0 does not have a status bitfield (does not use
+ * perf_record_nhm format)
+ */
+ if (x86_pmu.intel_cap.pebs_format < 1)
+ return base;
+
if (base == NULL)
return NULL;
@@ -1186,7 +1200,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
if (!event->attr.precise_ip)
return;
- n = (top - at) / x86_pmu.pebs_record_size;
+ n = top - at;
if (n <= 0)
return;
@@ -1230,12 +1244,21 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
pebs_status = p->status & cpuc->pebs_enabled;
pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1;
+ /*
+ * On some CPUs the PEBS status can be zero when PEBS is
+ * racing with clearing of GLOBAL_STATUS.
+ *
+ * Normally we would drop that record, but in the
+ * case when there is only a single active PEBS event
+ * we can assume it's for that event.
+ */
+ if (!pebs_status && cpuc->pebs_enabled &&
+ !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1)))
+ pebs_status = cpuc->pebs_enabled;
+
bit = find_first_bit((unsigned long *)&pebs_status,
x86_pmu.max_pebs_events);
- if (WARN(bit >= x86_pmu.max_pebs_events,
- "PEBS record without PEBS event! status=%Lx pebs_enabled=%Lx active_mask=%Lx",
- (unsigned long long)p->status, (unsigned long long)cpuc->pebs_enabled,
- *(unsigned long long *)cpuc->active_mask))
+ if (bit >= x86_pmu.max_pebs_events)
continue;
/*
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 659f01e..653f88d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -42,6 +42,13 @@ static enum {
#define LBR_FAR_BIT 8 /* do not capture far branches */
#define LBR_CALL_STACK_BIT 9 /* enable call stack */
+/*
+ * Following bit only exists in Linux; we mask it out before writing it to
+ * the actual MSR. But it helps the constraint perf code to understand
+ * that this is a separate configuration.
+ */
+#define LBR_NO_INFO_BIT 63 /* don't read LBR_INFO. */
+
#define LBR_KERNEL (1 << LBR_KERNEL_BIT)
#define LBR_USER (1 << LBR_USER_BIT)
#define LBR_JCC (1 << LBR_JCC_BIT)
@@ -52,6 +59,7 @@ static enum {
#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
#define LBR_FAR (1 << LBR_FAR_BIT)
#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT)
+#define LBR_NO_INFO (1ULL << LBR_NO_INFO_BIT)
#define LBR_PLM (LBR_KERNEL | LBR_USER)
@@ -152,8 +160,8 @@ static void __intel_pmu_lbr_enable(bool pmi)
* did not change.
*/
if (cpuc->lbr_sel)
- lbr_select = cpuc->lbr_sel->config;
- if (!pmi)
+ lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask;
+ if (!pmi && cpuc->lbr_sel)
wrmsrl(MSR_LBR_SELECT, lbr_select);
rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
@@ -422,6 +430,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
*/
static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
{
+ bool need_info = false;
unsigned long mask = x86_pmu.lbr_nr - 1;
int lbr_format = x86_pmu.intel_cap.lbr_format;
u64 tos = intel_pmu_lbr_tos();
@@ -429,8 +438,11 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
int out = 0;
int num = x86_pmu.lbr_nr;
- if (cpuc->lbr_sel->config & LBR_CALL_STACK)
- num = tos;
+ if (cpuc->lbr_sel) {
+ need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO);
+ if (cpuc->lbr_sel->config & LBR_CALL_STACK)
+ num = tos;
+ }
for (i = 0; i < num; i++) {
unsigned long lbr_idx = (tos - i) & mask;
@@ -442,7 +454,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
- if (lbr_format == LBR_FORMAT_INFO) {
+ if (lbr_format == LBR_FORMAT_INFO && need_info) {
u64 info;
rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info);
@@ -590,6 +602,7 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
if (v != LBR_IGN)
mask |= v;
}
+
reg = &event->hw.branch_reg;
reg->idx = EXTRA_REG_LBR;
@@ -600,6 +613,11 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
*/
reg->config = mask ^ x86_pmu.lbr_sel_mask;
+ if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) &&
+ (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) &&
+ (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO))
+ reg->config |= LBR_NO_INFO;
+
return 0;
}
@@ -1028,3 +1046,17 @@ void __init intel_pmu_lbr_init_atom(void)
*/
pr_cont("8-deep LBR, ");
}
+
+/* Knights Landing */
+void intel_pmu_lbr_init_knl(void)
+{
+ x86_pmu.lbr_nr = 8;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = snb_lbr_sel_map;
+
+ pr_cont("8-deep LBR, ");
+}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
index 868e119..c0bbd10 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -27,6 +27,7 @@
#include <asm/perf_event.h>
#include <asm/insn.h>
#include <asm/io.h>
+#include <asm/intel_pt.h>
#include "perf_event.h"
#include "intel_pt.h"
@@ -1122,6 +1123,14 @@ static int pt_event_init(struct perf_event *event)
return 0;
}
+void cpu_emergency_stop_pt(void)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+
+ if (pt->handle.event)
+ pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
+}
+
static __init int pt_init(void)
{
int ret, cpu, prior_warn = 0;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index ed446bd..24a351a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -63,7 +63,7 @@
#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */
#define NR_RAPL_DOMAINS 0x4
-static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
+static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
"pp0-core",
"package",
"dram",
@@ -109,11 +109,11 @@ static struct kobj_attribute format_attr_##_var = \
#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
-#define RAPL_EVENT_ATTR_STR(_name, v, str) \
-static struct perf_pmu_events_attr event_attr_##v = { \
- .attr = __ATTR(_name, 0444, rapl_sysfs_show, NULL), \
- .id = 0, \
- .event_str = str, \
+#define RAPL_EVENT_ATTR_STR(_name, v, str) \
+static struct perf_pmu_events_attr event_attr_##v = { \
+ .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
+ .id = 0, \
+ .event_str = str, \
};
struct rapl_pmu {
@@ -405,19 +405,6 @@ static struct attribute_group rapl_pmu_attr_group = {
.attrs = rapl_pmu_attrs,
};
-static ssize_t rapl_sysfs_show(struct device *dev,
- struct device_attribute *attr,
- char *page)
-{
- struct perf_pmu_events_attr *pmu_attr = \
- container_of(attr, struct perf_pmu_events_attr, attr);
-
- if (pmu_attr->event_str)
- return sprintf(page, "%s", pmu_attr->event_str);
-
- return 0;
-}
-
RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 61215a6..f97f807 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -884,6 +884,15 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
* each box has a different function id.
*/
pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
+ /* Knights Landing uses a common PCI device ID for multiple instances of
+ * an uncore PMU device type. There is only one entry per device type in
+ * the knl_uncore_pci_ids table inspite of multiple devices present for
+ * some device types. Hence PCI device idx would be 0 for all devices.
+ * So increment pmu pointer to point to an unused array element.
+ */
+ if (boot_cpu_data.x86_model == 87)
+ while (pmu->func_id >= 0)
+ pmu++;
if (pmu->func_id < 0)
pmu->func_id = pdev->devfn;
else
@@ -966,6 +975,7 @@ static int __init uncore_pci_init(void)
case 63: /* Haswell-EP */
ret = hswep_uncore_pci_init();
break;
+ case 79: /* BDX-EP */
case 86: /* BDX-DE */
ret = bdx_uncore_pci_init();
break;
@@ -982,6 +992,9 @@ static int __init uncore_pci_init(void)
case 61: /* Broadwell */
ret = bdw_uncore_pci_init();
break;
+ case 87: /* Knights Landing */
+ ret = knl_uncore_pci_init();
+ break;
default:
return 0;
}
@@ -1287,9 +1300,13 @@ static int __init uncore_cpu_init(void)
case 63: /* Haswell-EP */
hswep_uncore_cpu_init();
break;
+ case 79: /* BDX-EP */
case 86: /* BDX-DE */
bdx_uncore_cpu_init();
break;
+ case 87: /* Knights Landing */
+ knl_uncore_cpu_init();
+ break;
default:
return 0;
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 2f0a4a9..07aa2d6 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -338,6 +338,7 @@ int hsw_uncore_pci_init(void);
int bdw_uncore_pci_init(void);
void snb_uncore_cpu_init(void);
void nhm_uncore_cpu_init(void);
+int snb_pci2phy_map_init(int devid);
/* perf_event_intel_uncore_snbep.c */
int snbep_uncore_pci_init(void);
@@ -348,6 +349,8 @@ int hswep_uncore_pci_init(void);
void hswep_uncore_cpu_init(void);
int bdx_uncore_pci_init(void);
void bdx_uncore_cpu_init(void);
+int knl_uncore_pci_init(void);
+void knl_uncore_cpu_init(void);
/* perf_event_intel_uncore_nhmex.c */
void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index 8452561..0b93482 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -417,7 +417,7 @@ static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
}
}
-static int snb_pci2phy_map_init(int devid)
+int snb_pci2phy_map_init(int devid)
{
struct pci_dev *dev = NULL;
struct pci2phy_map *map;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index f0f4fcb..33acb88 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -209,31 +209,98 @@
#define HSWEP_PCU_MSR_PMON_BOX_CTL 0x710
#define HSWEP_PCU_MSR_PMON_BOX_FILTER 0x715
+/* KNL Ubox */
+#define KNL_U_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_U_MSR_PMON_RAW_EVENT_MASK | \
+ SNBEP_CBO_PMON_CTL_TID_EN)
+/* KNL CHA */
+#define KNL_CHA_MSR_OFFSET 0xc
+#define KNL_CHA_MSR_PMON_CTL_QOR (1 << 16)
+#define KNL_CHA_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK | \
+ KNL_CHA_MSR_PMON_CTL_QOR)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_TID 0x1ff
+#define KNL_CHA_MSR_PMON_BOX_FILTER_STATE (7 << 18)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_OP (0xfffffe2aULL << 32)
+
+/* KNL EDC/MC UCLK */
+#define KNL_UCLK_MSR_PMON_CTR0_LOW 0x400
+#define KNL_UCLK_MSR_PMON_CTL0 0x420
+#define KNL_UCLK_MSR_PMON_BOX_CTL 0x430
+#define KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW 0x44c
+#define KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL 0x454
+#define KNL_PMON_FIXED_CTL_EN 0x1
+
+/* KNL EDC */
+#define KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW 0xa00
+#define KNL_EDC0_ECLK_MSR_PMON_CTL0 0xa20
+#define KNL_EDC0_ECLK_MSR_PMON_BOX_CTL 0xa30
+#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW 0xa3c
+#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL 0xa44
+
+/* KNL MC */
+#define KNL_MC0_CH0_MSR_PMON_CTR0_LOW 0xb00
+#define KNL_MC0_CH0_MSR_PMON_CTL0 0xb20
+#define KNL_MC0_CH0_MSR_PMON_BOX_CTL 0xb30
+#define KNL_MC0_CH0_MSR_PMON_FIXED_LOW 0xb3c
+#define KNL_MC0_CH0_MSR_PMON_FIXED_CTL 0xb44
+
+/* KNL IRP */
+#define KNL_IRP_PCI_PMON_BOX_CTL 0xf0
+#define KNL_IRP_PCI_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
+ KNL_CHA_MSR_PMON_CTL_QOR)
+/* KNL PCU */
+#define KNL_PCU_PMON_CTL_EV_SEL_MASK 0x0000007f
+#define KNL_PCU_PMON_CTL_USE_OCC_CTR (1 << 7)
+#define KNL_PCU_MSR_PMON_CTL_TRESH_MASK 0x3f000000
+#define KNL_PCU_MSR_PMON_RAW_EVENT_MASK \
+ (KNL_PCU_PMON_CTL_EV_SEL_MASK | \
+ KNL_PCU_PMON_CTL_USE_OCC_CTR | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_CBO_PMON_CTL_TID_EN | \
+ SNBEP_PMON_CTL_EV_SEL_EXT | \
+ SNBEP_PMON_CTL_INVERT | \
+ KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6");
DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
+DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7");
DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29");
DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
+DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31");
DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0");
DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid4, filter_tid, "config1:0-8");
DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5");
DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12");
DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20");
+DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37");
DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60");
DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62");
DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61");
DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63");
@@ -315,8 +382,9 @@ static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct pe
static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
{
struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
- pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT);
+ pci_write_config_dword(pdev, box_ctl, SNBEP_PMON_BOX_CTL_INT);
}
static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
@@ -1728,6 +1796,419 @@ int ivbep_uncore_pci_init(void)
}
/* end of IvyTown uncore support */
+/* KNL uncore support */
+static struct attribute *knl_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ NULL,
+};
+
+static struct attribute_group knl_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = knl_uncore_ubox_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = HSWEP_U_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_U_MSR_PMON_CTL0,
+ .event_mask = KNL_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .ops = &snbep_uncore_msr_ops,
+ .format_group = &knl_uncore_ubox_format_group,
+};
+
+static struct attribute *knl_uncore_cha_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_qor.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid4.attr,
+ &format_attr_filter_link3.attr,
+ &format_attr_filter_state4.attr,
+ &format_attr_filter_local.attr,
+ &format_attr_filter_all_op.attr,
+ &format_attr_filter_nnm.attr,
+ &format_attr_filter_opc3.attr,
+ &format_attr_filter_nc.attr,
+ &format_attr_filter_isoc.attr,
+ NULL,
+};
+
+static struct attribute_group knl_uncore_cha_format_group = {
+ .name = "format",
+ .attrs = knl_uncore_cha_formats_attr,
+};
+
+static struct event_constraint knl_uncore_cha_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x1f, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg knl_uncore_cha_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+ SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x3d, 0xff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x4),
+ EVENT_EXTRA_END
+};
+
+static u64 knl_cha_filter_mask(int fields)
+{
+ u64 mask = 0;
+
+ if (fields & 0x1)
+ mask |= KNL_CHA_MSR_PMON_BOX_FILTER_TID;
+ if (fields & 0x2)
+ mask |= KNL_CHA_MSR_PMON_BOX_FILTER_STATE;
+ if (fields & 0x4)
+ mask |= KNL_CHA_MSR_PMON_BOX_FILTER_OP;
+ return mask;
+}
+
+static struct event_constraint *
+knl_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return __snbep_cbox_get_constraint(box, event, knl_cha_filter_mask);
+}
+
+static int knl_cha_hw_config(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct extra_reg *er;
+ int idx = 0;
+
+ for (er = knl_uncore_cha_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ idx |= er->idx;
+ }
+
+ if (idx) {
+ reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
+ KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx;
+ reg1->config = event->attr.config1 & knl_cha_filter_mask(idx);
+ reg1->idx = idx;
+ }
+ return 0;
+}
+
+static void hswep_cbox_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event);
+
+static struct intel_uncore_ops knl_uncore_cha_ops = {
+ .init_box = snbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = hswep_cbox_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = knl_cha_hw_config,
+ .get_constraint = knl_cha_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type knl_uncore_cha = {
+ .name = "cha",
+ .num_counters = 4,
+ .num_boxes = 38,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_C0_MSR_PMON_CTR0,
+ .event_mask = KNL_CHA_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = KNL_CHA_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = knl_uncore_cha_constraints,
+ .ops = &knl_uncore_cha_ops,
+ .format_group = &knl_uncore_cha_format_group,
+};
+
+static struct attribute *knl_uncore_pcu_formats_attr[] = {
+ &format_attr_event2.attr,
+ &format_attr_use_occ_ctr.attr,
+ &format_attr_occ_sel.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh6.attr,
+ &format_attr_occ_invert.attr,
+ &format_attr_occ_edge_det.attr,
+ NULL,
+};
+
+static struct attribute_group knl_uncore_pcu_format_group = {
+ .name = "format",
+ .attrs = knl_uncore_pcu_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_PCU_MSR_PMON_CTL0,
+ .event_mask = KNL_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL,
+ .ops = &snbep_uncore_msr_ops,
+ .format_group = &knl_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *knl_msr_uncores[] = {
+ &knl_uncore_ubox,
+ &knl_uncore_cha,
+ &knl_uncore_pcu,
+ NULL,
+};
+
+void knl_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = knl_msr_uncores;
+}
+
+static void knl_uncore_imc_enable_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+
+ pci_write_config_dword(pdev, box_ctl, 0);
+}
+
+static void knl_uncore_imc_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ if ((event->attr.config & SNBEP_PMON_CTL_EV_SEL_MASK)
+ == UNCORE_FIXED_EVENT)
+ pci_write_config_dword(pdev, hwc->config_base,
+ hwc->config | KNL_PMON_FIXED_CTL_EN);
+ else
+ pci_write_config_dword(pdev, hwc->config_base,
+ hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops knl_uncore_imc_ops = {
+ .init_box = snbep_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = knl_uncore_imc_enable_box,
+ .read_counter = snbep_uncore_pci_read_counter,
+ .enable_event = knl_uncore_imc_enable_event,
+ .disable_event = snbep_uncore_pci_disable_event,
+};
+
+static struct intel_uncore_type knl_uncore_imc_uclk = {
+ .name = "imc_uclk",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW,
+ .event_ctl = KNL_UCLK_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
+ .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
+ .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL,
+ .ops = &knl_uncore_imc_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_imc_dclk = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 6,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = KNL_MC0_CH0_MSR_PMON_CTR0_LOW,
+ .event_ctl = KNL_MC0_CH0_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = KNL_MC0_CH0_MSR_PMON_FIXED_LOW,
+ .fixed_ctl = KNL_MC0_CH0_MSR_PMON_FIXED_CTL,
+ .box_ctl = KNL_MC0_CH0_MSR_PMON_BOX_CTL,
+ .ops = &knl_uncore_imc_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_edc_uclk = {
+ .name = "edc_uclk",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW,
+ .event_ctl = KNL_UCLK_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
+ .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
+ .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL,
+ .ops = &knl_uncore_imc_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_edc_eclk = {
+ .name = "edc_eclk",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW,
+ .event_ctl = KNL_EDC0_ECLK_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW,
+ .fixed_ctl = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL,
+ .box_ctl = KNL_EDC0_ECLK_MSR_PMON_BOX_CTL,
+ .ops = &knl_uncore_imc_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct event_constraint knl_uncore_m2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type knl_uncore_m2pcie = {
+ .name = "m2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .constraints = knl_uncore_m2pcie_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct attribute *knl_uncore_irp_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_qor.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static struct attribute_group knl_uncore_irp_format_group = {
+ .name = "format",
+ .attrs = knl_uncore_irp_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_irp = {
+ .name = "irp",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = KNL_IRP_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = KNL_IRP_PCI_PMON_BOX_CTL,
+ .ops = &snbep_uncore_pci_ops,
+ .format_group = &knl_uncore_irp_format_group,
+};
+
+enum {
+ KNL_PCI_UNCORE_MC_UCLK,
+ KNL_PCI_UNCORE_MC_DCLK,
+ KNL_PCI_UNCORE_EDC_UCLK,
+ KNL_PCI_UNCORE_EDC_ECLK,
+ KNL_PCI_UNCORE_M2PCIE,
+ KNL_PCI_UNCORE_IRP,
+};
+
+static struct intel_uncore_type *knl_pci_uncores[] = {
+ [KNL_PCI_UNCORE_MC_UCLK] = &knl_uncore_imc_uclk,
+ [KNL_PCI_UNCORE_MC_DCLK] = &knl_uncore_imc_dclk,
+ [KNL_PCI_UNCORE_EDC_UCLK] = &knl_uncore_edc_uclk,
+ [KNL_PCI_UNCORE_EDC_ECLK] = &knl_uncore_edc_eclk,
+ [KNL_PCI_UNCORE_M2PCIE] = &knl_uncore_m2pcie,
+ [KNL_PCI_UNCORE_IRP] = &knl_uncore_irp,
+ NULL,
+};
+
+/*
+ * KNL uses a common PCI device ID for multiple instances of an Uncore PMU
+ * device type. prior to KNL, each instance of a PMU device type had a unique
+ * device ID.
+ *
+ * PCI Device ID Uncore PMU Devices
+ * ----------------------------------
+ * 0x7841 MC0 UClk, MC1 UClk
+ * 0x7843 MC0 DClk CH 0, MC0 DClk CH 1, MC0 DClk CH 2,
+ * MC1 DClk CH 0, MC1 DClk CH 1, MC1 DClk CH 2
+ * 0x7833 EDC0 UClk, EDC1 UClk, EDC2 UClk, EDC3 UClk,
+ * EDC4 UClk, EDC5 UClk, EDC6 UClk, EDC7 UClk
+ * 0x7835 EDC0 EClk, EDC1 EClk, EDC2 EClk, EDC3 EClk,
+ * EDC4 EClk, EDC5 EClk, EDC6 EClk, EDC7 EClk
+ * 0x7817 M2PCIe
+ * 0x7814 IRP
+*/
+
+static const struct pci_device_id knl_uncore_pci_ids[] = {
+ { /* MC UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_UCLK, 0),
+ },
+ { /* MC DClk Channel */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_DCLK, 0),
+ },
+ { /* EDC UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_UCLK, 0),
+ },
+ { /* EDC EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_ECLK, 0),
+ },
+ { /* M2PCIe */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_M2PCIE, 0),
+ },
+ { /* IRP */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7814),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_IRP, 0),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver knl_uncore_pci_driver = {
+ .name = "knl_uncore",
+ .id_table = knl_uncore_pci_ids,
+};
+
+int knl_uncore_pci_init(void)
+{
+ int ret;
+
+ /* All KNL PCI based PMON units are on the same PCI bus except IRP */
+ ret = snb_pci2phy_map_init(0x7814); /* IRP */
+ if (ret)
+ return ret;
+ ret = snb_pci2phy_map_init(0x7817); /* M2PCIe */
+ if (ret)
+ return ret;
+ uncore_pci_uncores = knl_pci_uncores;
+ uncore_pci_driver = &knl_uncore_pci_driver;
+ return 0;
+}
+
+/* end of KNL uncore support */
+
/* Haswell-EP uncore support */
static struct attribute *hswep_uncore_ubox_formats_attr[] = {
&format_attr_event.attr,
@@ -2338,7 +2819,7 @@ int hswep_uncore_pci_init(void)
}
/* end of Haswell-EP uncore support */
-/* BDX-DE uncore support */
+/* BDX uncore support */
static struct intel_uncore_type bdx_uncore_ubox = {
.name = "ubox",
@@ -2360,13 +2841,14 @@ static struct event_constraint bdx_uncore_cbox_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x3e, 0x1),
EVENT_CONSTRAINT_END
};
static struct intel_uncore_type bdx_uncore_cbox = {
.name = "cbox",
.num_counters = 4,
- .num_boxes = 8,
+ .num_boxes = 24,
.perf_ctr_bits = 48,
.event_ctl = HSWEP_C0_MSR_PMON_CTL0,
.perf_ctr = HSWEP_C0_MSR_PMON_CTR0,
@@ -2379,9 +2861,24 @@ static struct intel_uncore_type bdx_uncore_cbox = {
.format_group = &hswep_uncore_cbox_format_group,
};
+static struct intel_uncore_type bdx_uncore_sbox = {
+ .name = "sbox",
+ .num_counters = 4,
+ .num_boxes = 4,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_S0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_S0_MSR_PMON_CTR0,
+ .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_SBOX_MSR_OFFSET,
+ .ops = &hswep_uncore_sbox_msr_ops,
+ .format_group = &hswep_uncore_sbox_format_group,
+};
+
static struct intel_uncore_type *bdx_msr_uncores[] = {
&bdx_uncore_ubox,
&bdx_uncore_cbox,
+ &bdx_uncore_sbox,
&hswep_uncore_pcu,
NULL,
};
@@ -2396,7 +2893,7 @@ void bdx_uncore_cpu_init(void)
static struct intel_uncore_type bdx_uncore_ha = {
.name = "ha",
.num_counters = 4,
- .num_boxes = 1,
+ .num_boxes = 2,
.perf_ctr_bits = 48,
SNBEP_UNCORE_PCI_COMMON_INIT(),
};
@@ -2404,7 +2901,7 @@ static struct intel_uncore_type bdx_uncore_ha = {
static struct intel_uncore_type bdx_uncore_imc = {
.name = "imc",
.num_counters = 5,
- .num_boxes = 2,
+ .num_boxes = 8,
.perf_ctr_bits = 48,
.fixed_ctr_bits = 48,
.fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
@@ -2424,6 +2921,19 @@ static struct intel_uncore_type bdx_uncore_irp = {
.format_group = &snbep_uncore_format_group,
};
+static struct intel_uncore_type bdx_uncore_qpi = {
+ .name = "qpi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &snbep_uncore_qpi_ops,
+ .format_group = &snbep_uncore_qpi_format_group,
+};
static struct event_constraint bdx_uncore_r2pcie_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
@@ -2432,6 +2942,8 @@ static struct event_constraint bdx_uncore_r2pcie_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
EVENT_CONSTRAINT_END
};
@@ -2445,18 +2957,65 @@ static struct intel_uncore_type bdx_uncore_r2pcie = {
SNBEP_UNCORE_PCI_COMMON_INIT(),
};
+static struct event_constraint bdx_uncore_r3qpi_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x01, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x07, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x08, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x09, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x0a, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x0e, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x15, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x1f, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_r3qpi = {
+ .name = "r3qpi",
+ .num_counters = 3,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .constraints = bdx_uncore_r3qpi_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
enum {
BDX_PCI_UNCORE_HA,
BDX_PCI_UNCORE_IMC,
BDX_PCI_UNCORE_IRP,
+ BDX_PCI_UNCORE_QPI,
BDX_PCI_UNCORE_R2PCIE,
+ BDX_PCI_UNCORE_R3QPI,
};
static struct intel_uncore_type *bdx_pci_uncores[] = {
[BDX_PCI_UNCORE_HA] = &bdx_uncore_ha,
[BDX_PCI_UNCORE_IMC] = &bdx_uncore_imc,
[BDX_PCI_UNCORE_IRP] = &bdx_uncore_irp,
+ [BDX_PCI_UNCORE_QPI] = &bdx_uncore_qpi,
[BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie,
+ [BDX_PCI_UNCORE_R3QPI] = &bdx_uncore_r3qpi,
NULL,
};
@@ -2465,6 +3024,10 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30),
.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0),
},
+ { /* Home Agent 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f38),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 1),
+ },
{ /* MC0 Channel 0 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0),
.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0),
@@ -2473,14 +3036,74 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1),
.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1),
},
+ { /* MC0 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb4),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC0 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb5),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 3),
+ },
+ { /* MC1 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd0),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 4),
+ },
+ { /* MC1 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd1),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 5),
+ },
+ { /* MC1 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd4),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 6),
+ },
+ { /* MC1 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd5),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 7),
+ },
{ /* IRP */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39),
.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0),
},
+ { /* QPI0 Port 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f32),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 0),
+ },
+ { /* QPI0 Port 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f33),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 1),
+ },
+ { /* QPI1 Port 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3a),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 2),
+ },
{ /* R2PCIe */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34),
.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0),
},
+ { /* R3QPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f36),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 0),
+ },
+ { /* R3QPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f37),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 1),
+ },
+ { /* R3QPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3e),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 2),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 0),
+ },
+ { /* QPI Port 1 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1),
+ },
+ { /* QPI Port 2 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2),
+ },
{ /* end: all zeroes */ }
};
@@ -2500,4 +3123,4 @@ int bdx_uncore_pci_init(void)
return 0;
}
-/* end of BDX-DE uncore support */
+/* end of BDX uncore support */
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 2c1910f..58f3431 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -35,6 +35,7 @@
#include <asm/cpu.h>
#include <asm/reboot.h>
#include <asm/virtext.h>
+#include <asm/intel_pt.h>
/* Alignment required for elf header segment */
#define ELF_CORE_HEADER_ALIGN 4096
@@ -125,6 +126,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
cpu_emergency_vmxoff();
cpu_emergency_svm_disable();
+ /*
+ * Disable Intel PT to stop its logging
+ */
+ cpu_emergency_stop_pt();
+
disable_local_APIC();
}
@@ -169,6 +175,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
cpu_emergency_vmxoff();
cpu_emergency_svm_disable();
+ /*
+ * Disable Intel PT to stop its logging
+ */
+ cpu_emergency_stop_pt();
+
#ifdef CONFIG_X86_IO_APIC
/* Prevent crash_kexec() from deadlocking on ioapic_lock. */
ioapic_zap_locks();
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 22abea0..0d4e092 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -143,9 +143,18 @@ static void __init fpu__init_system_generic(void)
unsigned int xstate_size;
EXPORT_SYMBOL_GPL(xstate_size);
-/* Enforce that 'MEMBER' is the last field of 'TYPE': */
+/* Get alignment of the TYPE. */
+#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)
+
+/*
+ * Enforce that 'MEMBER' is the last field of 'TYPE'.
+ *
+ * Align the computed size with alignment of the TYPE,
+ * because that's how C aligns structs.
+ */
#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
- BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER))
+ BUILD_BUG_ON(sizeof(TYPE) != ALIGN(offsetofend(TYPE, MEMBER), \
+ TYPE_ALIGN(TYPE)))
/*
* We append the 'struct fpu' to the task_struct:
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 2bd81e3..72cef58 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -45,6 +45,11 @@ early_param("no-kvmclock", parse_no_kvmclock);
static struct pvclock_vsyscall_time_info *hv_clock;
static struct pvclock_wall_clock wall_clock;
+struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
+{
+ return hv_clock;
+}
+
/*
* The wallclock is the time of day when we booted. Since then, some time may
* have elapsed since the hypervisor wrote the data. So we try to account for
@@ -305,7 +310,6 @@ int __init kvm_setup_vsyscall_timeinfo(void)
{
#ifdef CONFIG_X86_64
int cpu;
- int ret;
u8 flags;
struct pvclock_vcpu_time_info *vcpu_time;
unsigned int size;
@@ -325,11 +329,6 @@ int __init kvm_setup_vsyscall_timeinfo(void)
return 1;
}
- if ((ret = pvclock_init_vsyscall(hv_clock, size))) {
- put_cpu();
- return ret;
- }
-
put_cpu();
kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 697f90d..8a2cdd7 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -29,6 +29,7 @@
#include <asm/mach_traps.h>
#include <asm/nmi.h>
#include <asm/x86_init.h>
+#include <asm/reboot.h>
#define CREATE_TRACE_POINTS
#include <trace/events/nmi.h>
@@ -231,7 +232,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
#endif
if (panic_on_unrecovered_nmi)
- panic("NMI: Not continuing");
+ nmi_panic(regs, "NMI: Not continuing");
pr_emerg("Dazed and confused, but trying to continue\n");
@@ -255,8 +256,16 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
reason, smp_processor_id());
show_regs(regs);
- if (panic_on_io_nmi)
- panic("NMI IOCK error: Not continuing");
+ if (panic_on_io_nmi) {
+ nmi_panic(regs, "NMI IOCK error: Not continuing");
+
+ /*
+ * If we end up here, it means we have received an NMI while
+ * processing panic(). Simply return without delaying and
+ * re-enabling NMIs.
+ */
+ return;
+ }
/* Re-enable the IOCK line, wait for a few seconds */
reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
@@ -297,7 +306,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
pr_emerg("Do you have a strange power saving mode enabled?\n");
if (unknown_nmi_panic || panic_on_unrecovered_nmi)
- panic("NMI: Not continuing");
+ nmi_panic(regs, "NMI: Not continuing");
pr_emerg("Dazed and confused, but trying to continue\n");
}
@@ -348,8 +357,19 @@ static void default_do_nmi(struct pt_regs *regs)
return;
}
- /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
- raw_spin_lock(&nmi_reason_lock);
+ /*
+ * Non-CPU-specific NMI: NMI sources can be processed on any CPU.
+ *
+ * Another CPU may be processing panic routines while holding
+ * nmi_reason_lock. Check if the CPU issued the IPI for crash dumping,
+ * and if so, call its callback directly. If there is no CPU preparing
+ * crash dump, we simply loop here.
+ */
+ while (!raw_spin_trylock(&nmi_reason_lock)) {
+ run_crash_ipi_callback(regs);
+ cpu_relax();
+ }
+
reason = x86_platform.get_nmi_reason();
if (reason & NMI_REASON_MASK) {
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 3265ea0..f08ac28 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -150,10 +150,6 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
ret = paravirt_patch_ident_64(insnbuf, len);
else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
-#ifdef CONFIG_X86_32
- type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
-#endif
- type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64))
/* If operation requires a jmp, then jmp */
ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
@@ -208,8 +204,6 @@ static u64 native_steal_clock(int cpu)
/* These are in entry.S */
extern void native_iret(void);
-extern void native_irq_enable_sysexit(void);
-extern void native_usergs_sysret32(void);
extern void native_usergs_sysret64(void);
static struct resource reserve_ioports = {
@@ -367,13 +361,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.load_sp0 = native_load_sp0,
-#if defined(CONFIG_X86_32)
- .irq_enable_sysexit = native_irq_enable_sysexit,
-#endif
#ifdef CONFIG_X86_64
-#ifdef CONFIG_IA32_EMULATION
- .usergs_sysret32 = native_usergs_sysret32,
-#endif
.usergs_sysret64 = native_usergs_sysret64,
#endif
.iret = native_iret,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index c89f50a..158dc06 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -5,7 +5,6 @@ DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
DEF_NATIVE(pv_cpu_ops, iret, "iret");
-DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
@@ -46,7 +45,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_irq_ops, restore_fl);
PATCH_SITE(pv_irq_ops, save_fl);
PATCH_SITE(pv_cpu_ops, iret);
- PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
PATCH_SITE(pv_mmu_ops, read_cr2);
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 8aa0558..e70087a 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -13,9 +13,7 @@ DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
DEF_NATIVE(pv_cpu_ops, clts, "clts");
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
-DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "swapgs; sti; sysexit");
DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
-DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");
DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
DEF_NATIVE(, mov32, "mov %edi, %eax");
@@ -55,7 +53,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_irq_ops, save_fl);
PATCH_SITE(pv_irq_ops, irq_enable);
PATCH_SITE(pv_irq_ops, irq_disable);
- PATCH_SITE(pv_cpu_ops, usergs_sysret32);
PATCH_SITE(pv_cpu_ops, usergs_sysret64);
PATCH_SITE(pv_cpu_ops, swapgs);
PATCH_SITE(pv_mmu_ops, read_cr2);
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 2f355d2..99bfc02 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -140,27 +140,3 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
-
-#ifdef CONFIG_X86_64
-/*
- * Initialize the generic pvclock vsyscall state. This will allocate
- * a/some page(s) for the per-vcpu pvclock information, set up a
- * fixmap mapping for the page(s)
- */
-
-int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
- int size)
-{
- int idx;
-
- WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
-
- for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
- __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
- __pa(i) + (idx*PAGE_SIZE),
- PAGE_KERNEL_VVAR);
- }
-
- return 0;
-}
-#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 02693dd..d64889a 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -718,6 +718,7 @@ static int crashing_cpu;
static nmi_shootdown_cb shootdown_callback;
static atomic_t waiting_for_crash_ipi;
+static int crash_ipi_issued;
static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
{
@@ -780,6 +781,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
smp_send_nmi_allbutself();
+ /* Kick CPUs looping in NMI context. */
+ WRITE_ONCE(crash_ipi_issued, 1);
+
msecs = 1000; /* Wait at most a second for the other cpus to stop */
while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
mdelay(1);
@@ -788,9 +792,35 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
/* Leave the nmi callback set */
}
+
+/*
+ * Check if the crash dumping IPI got issued and if so, call its callback
+ * directly. This function is used when we have already been in NMI handler.
+ * It doesn't return.
+ */
+void run_crash_ipi_callback(struct pt_regs *regs)
+{
+ if (crash_ipi_issued)
+ crash_nmi_callback(0, regs);
+}
+
+/* Override the weak function in kernel/panic.c */
+void nmi_panic_self_stop(struct pt_regs *regs)
+{
+ while (1) {
+ /* If no CPU is preparing crash dump, we simply loop here. */
+ run_crash_ipi_callback(regs);
+ cpu_relax();
+ }
+}
+
#else /* !CONFIG_SMP */
void nmi_shootdown_cpus(nmi_shootdown_cb callback)
{
/* No other CPUs to shoot down */
}
+
+void run_crash_ipi_callback(struct pt_regs *regs)
+{
+}
#endif
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index cd96852..4af8d06 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -200,6 +200,9 @@ static __init int add_rtc_cmos(void)
}
#endif
+ if (paravirt_enabled() && !paravirt_has(RTC))
+ return -ENODEV;
+
platform_device_register(&rtc_device);
dev_info(&rtc_device.dev,
"registered platform RTC device (no PNP device found)\n");
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 12c8286..658777c 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -125,12 +125,12 @@ static void native_smp_send_reschedule(int cpu)
WARN_ON(1);
return;
}
- apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
+ apic->send_IPI(cpu, RESCHEDULE_VECTOR);
}
void native_send_call_func_single_ipi(int cpu)
{
- apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
+ apic->send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR);
}
void native_send_call_func_ipi(const struct cpumask *mask)
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 06332cb..3f5c48d 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -38,6 +38,14 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
return best && (best->ecx & bit(X86_FEATURE_XSAVE));
}
+static inline bool guest_cpuid_has_mtrr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ return best && (best->edx & bit(X86_FEATURE_MTRR));
+}
+
static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 08116ff..b0ea42b 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -420,6 +420,7 @@ void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_s
u8 saved_mode;
if (hpet_legacy_start) {
/* save existing mode for later reenablement */
+ WARN_ON(channel != 0);
saved_mode = kvm->arch.vpit->pit_state.channels[0].mode;
kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */
pit_load_count(kvm, channel, val);
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index 9e8bf13..3f8c732 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -120,14 +120,22 @@ static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state)
return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK;
}
-static u8 mtrr_disabled_type(void)
+static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu)
{
/*
* Intel SDM 11.11.2.2: all MTRRs are disabled when
* IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC
* memory type is applied to all of physical memory.
+ *
+ * However, virtual machines can be run with CPUID such that
+ * there are no MTRRs. In that case, the firmware will never
+ * enable MTRRs and it is obviously undesirable to run the
+ * guest entirely with UC memory and we use WB.
*/
- return MTRR_TYPE_UNCACHABLE;
+ if (guest_cpuid_has_mtrr(vcpu))
+ return MTRR_TYPE_UNCACHABLE;
+ else
+ return MTRR_TYPE_WRBACK;
}
/*
@@ -267,7 +275,7 @@ static int fixed_mtrr_addr_to_seg(u64 addr)
for (seg = 0; seg < seg_num; seg++) {
mtrr_seg = &fixed_seg_table[seg];
- if (mtrr_seg->start >= addr && addr < mtrr_seg->end)
+ if (mtrr_seg->start <= addr && addr < mtrr_seg->end)
return seg;
}
@@ -300,7 +308,6 @@ static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end)
*start = range->base & PAGE_MASK;
mask = range->mask & PAGE_MASK;
- mask |= ~0ULL << boot_cpu_data.x86_phys_bits;
/* This cannot overflow because writing to the reserved bits of
* variable MTRRs causes a #GP.
@@ -356,10 +363,14 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (var_mtrr_range_is_valid(cur))
list_del(&mtrr_state->var_ranges[index].node);
+ /* Extend the mask with all 1 bits to the left, since those
+ * bits must implicitly be 0. The bits are then cleared
+ * when reading them.
+ */
if (!is_mtrr_mask)
cur->base = data;
else
- cur->mask = data;
+ cur->mask = data | (-1LL << cpuid_maxphyaddr(vcpu));
/* add it to the list if it's enabled. */
if (var_mtrr_range_is_valid(cur)) {
@@ -426,6 +437,8 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
*pdata = vcpu->arch.mtrr_state.var_ranges[index].base;
else
*pdata = vcpu->arch.mtrr_state.var_ranges[index].mask;
+
+ *pdata &= (1ULL << cpuid_maxphyaddr(vcpu)) - 1;
}
return 0;
@@ -670,7 +683,7 @@ u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
}
if (iter.mtrr_disabled)
- return mtrr_disabled_type();
+ return mtrr_disabled_type(vcpu);
/* not contained in any MTRRs. */
if (type == -1)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 83a1c64..899c40f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3422,6 +3422,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
struct kvm_run *kvm_run = vcpu->run;
u32 exit_code = svm->vmcb->control.exit_code;
+ trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
+
if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
vcpu->arch.cr0 = svm->vmcb->save.cr0;
if (npt_enabled)
@@ -3892,8 +3894,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
- trace_kvm_exit(svm->vmcb->control.exit_code, vcpu, KVM_ISA_SVM);
-
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_before_handle_nmi(&svm->vcpu);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index af823a3..44976a5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2803,7 +2803,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vcpu->arch.ia32_xss;
break;
case MSR_TSC_AUX:
- if (!guest_cpuid_has_rdtscp(vcpu))
+ if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated)
return 1;
/* Otherwise falls through */
default:
@@ -2909,7 +2909,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
break;
case MSR_TSC_AUX:
- if (!guest_cpuid_has_rdtscp(vcpu))
+ if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated)
return 1;
/* Check reserved bit, higher 32 bits should be zero */
if ((data >> 32) != 0)
@@ -8042,6 +8042,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
+ trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
+
/*
* Flush logged GPAs PML buffer, this will make dirty_bitmap more
* updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
@@ -8668,7 +8670,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->launched = 1;
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
- trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
/*
* the KVM_REQ_EVENT optimization bit is only on for one entry, and if
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index eed3228..97592e1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3572,9 +3572,11 @@ static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
+ int i;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
- kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(kvm, i, ps->channels[i].count, 0);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return 0;
}
@@ -3593,6 +3595,7 @@ static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
int start = 0;
+ int i;
u32 prev_legacy, cur_legacy;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
@@ -3602,7 +3605,9 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
sizeof(kvm->arch.vpit->pit_state.channels));
kvm->arch.vpit->pit_state.flags = ps->flags;
- kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count,
+ start && i == 0);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return 0;
}
@@ -6515,6 +6520,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (req_immediate_exit)
smp_send_reschedule(vcpu->cpu);
+ trace_kvm_entry(vcpu->vcpu_id);
+ wait_lapic_expire(vcpu);
__kvm_guest_enter();
if (unlikely(vcpu->arch.switch_db_regs)) {
@@ -6527,8 +6534,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
}
- trace_kvm_entry(vcpu->vcpu_id);
- wait_lapic_expire(vcpu);
kvm_x86_ops->run(vcpu);
/*
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index a1900d4..4ba229a 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1414,6 +1414,7 @@ __init void lguest_init(void)
pv_info.kernel_rpl = 1;
/* Everyone except Xen runs with this set. */
pv_info.shared_kernel_pmd = 1;
+ pv_info.features = 0;
/*
* We set up all the lguest overrides for sensitive operations. These
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 4362373..004c861 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -1,6 +1,8 @@
#include <linux/module.h>
#include <linux/preempt.h>
#include <asm/msr.h>
+#define CREATE_TRACE_POINTS
+#include <asm/msr-trace.h>
struct msr *msrs_alloc(void)
{
@@ -108,3 +110,27 @@ int msr_clear_bit(u32 msr, u8 bit)
{
return __flip_bit(msr, bit, false);
}
+
+#ifdef CONFIG_TRACEPOINTS
+void do_trace_write_msr(unsigned msr, u64 val, int failed)
+{
+ trace_write_msr(msr, val, failed);
+}
+EXPORT_SYMBOL(do_trace_write_msr);
+EXPORT_TRACEPOINT_SYMBOL(write_msr);
+
+void do_trace_read_msr(unsigned msr, u64 val, int failed)
+{
+ trace_read_msr(msr, val, failed);
+}
+EXPORT_SYMBOL(do_trace_read_msr);
+EXPORT_TRACEPOINT_SYMBOL(read_msr);
+
+void do_trace_rdpmc(unsigned counter, u64 val, int failed)
+{
+ trace_rdpmc(counter, val, failed);
+}
+EXPORT_SYMBOL(do_trace_rdpmc);
+EXPORT_TRACEPOINT_SYMBOL(rdpmc);
+
+#endif
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index 327f21c..8dd8005 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -28,6 +28,7 @@
#include <linux/nmi.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/clocksource.h>
#include <asm/apic.h>
#include <asm/current.h>
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index e5f854c..14fcd01 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -470,7 +470,7 @@ long sys_sigreturn(void)
struct sigcontext __user *sc = &frame->sc;
int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long);
- if (copy_from_user(&set.sig[0], (void *)sc->oldmask, sizeof(set.sig[0])) ||
+ if (copy_from_user(&set.sig[0], &sc->oldmask, sizeof(set.sig[0])) ||
copy_from_user(&set.sig[1], frame->extramask, sig_size))
goto segfault;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 4334e511..2306392 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1192,7 +1192,7 @@ static const struct pv_info xen_info __initconst = {
#ifdef CONFIG_X86_64
.extra_user_64bit_cs = FLAT_USER_CS64,
#endif
-
+ .features = 0,
.name = "Xen",
};
@@ -1229,10 +1229,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.iret = xen_iret,
#ifdef CONFIG_X86_64
- .usergs_sysret32 = xen_sysret32,
.usergs_sysret64 = xen_sysret64,
-#else
- .irq_enable_sysexit = xen_sysexit,
#endif
.load_tr_desc = paravirt_nop,
@@ -1529,6 +1526,8 @@ asmlinkage __visible void __init xen_start_kernel(void)
/* Install Xen paravirt ops */
pv_info = xen_info;
+ if (xen_initial_domain())
+ pv_info.features |= PV_SUPPORTED_RTC;
pv_init_ops = xen_init_ops;
if (!xen_pvh_domain()) {
pv_cpu_ops = xen_cpu_ops;
@@ -1879,8 +1878,10 @@ EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
static void xen_set_cpu_features(struct cpuinfo_x86 *c)
{
- if (xen_pv_domain())
+ if (xen_pv_domain()) {
clear_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+ set_cpu_cap(c, X86_FEATURE_XENPV);
+ }
}
const struct hypervisor_x86 x86_hyper_xen = {
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 3705eab..df0c405 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -1,6 +1,7 @@
#include <linux/types.h>
#include <linux/tick.h>
+#include <xen/xen.h>
#include <xen/interface/xen.h>
#include <xen/grant_table.h>
#include <xen/events.h>
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index fd92a64..feb6d40 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -35,20 +35,6 @@ check_events:
ret
/*
- * We can't use sysexit directly, because we're not running in ring0.
- * But we can easily fake it up using iret. Assuming xen_sysexit is
- * jumped to with a standard stack frame, we can just strip it back to
- * a standard iret frame and use iret.
- */
-ENTRY(xen_sysexit)
- movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
- orl $X86_EFLAGS_IF, PT_EFLAGS(%esp)
- lea PT_EIP(%esp), %esp
-
- jmp xen_iret
-ENDPROC(xen_sysexit)
-
-/*
* This is run where a normal iret would be run, with the same stack setup:
* 8: eflags
* 4: cs
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index f22667a..cc8acc4 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -68,25 +68,6 @@ ENTRY(xen_sysret64)
ENDPATCH(xen_sysret64)
RELOC(xen_sysret64, 1b+1)
-ENTRY(xen_sysret32)
- /*
- * We're already on the usermode stack at this point, but
- * still with the kernel gs, so we can easily switch back
- */
- movq %rsp, PER_CPU_VAR(rsp_scratch)
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
-
- pushq $__USER32_DS
- pushq PER_CPU_VAR(rsp_scratch)
- pushq %r11
- pushq $__USER32_CS
- pushq %rcx
-
- pushq $0
-1: jmp hypercall_iret
-ENDPATCH(xen_sysret32)
-RELOC(xen_sysret32, 1b+1)
-
/*
* Xen handles syscall callbacks much like ordinary exceptions, which
* means we have:
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 1399423..4140b07 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -139,9 +139,6 @@ DECL_ASM(void, xen_restore_fl_direct, unsigned long);
/* These are not functions, and cannot be called normally */
__visible void xen_iret(void);
-#ifdef CONFIG_X86_32
-__visible void xen_sysexit(void);
-#endif
__visible void xen_sysret32(void);
__visible void xen_sysret64(void);
__visible void xen_adjust_exception_frame(void);