Diffstat (limited to 'arch/x86')
29 files changed, 1219 insertions, 266 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0952ecd..838e7c3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -125,6 +125,7 @@ config X86 select RTC_LIB select HAVE_DEBUG_STACKOVERFLOW select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 + select HAVE_CC_STACKPROTECTOR config INSTRUCTION_DECODER def_bool y @@ -1617,22 +1618,6 @@ config SECCOMP If unsure, say Y. Only embedded should say N here. -config CC_STACKPROTECTOR - bool "Enable -fstack-protector buffer overflow detection" - ---help--- - This option turns on the -fstack-protector GCC feature. This - feature puts, at the beginning of functions, a canary value on - the stack just before the return address, and validates - the value just before actually returning. Stack based buffer - overflows (that need to overwrite this return address) now also - overwrite the canary, which gets detected and the attack is then - neutralized via a kernel panic. - - This feature requires gcc version 4.2 or above, or a distribution - gcc with the feature backported. Older versions are automatically - detected and for those versions, this configuration option is - ignored. (and a warning is printed during bootup) - source kernel/Kconfig.hz config KEXEC diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 57d0215..13b22e0 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -89,13 +89,11 @@ else KBUILD_CFLAGS += -maccumulate-outgoing-args endif +# Make sure compiler does not have buggy stack-protector support. ifdef CONFIG_CC_STACKPROTECTOR cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh - ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y) - stackp-y := -fstack-protector - KBUILD_CFLAGS += $(stackp-y) - else - $(warning stack protector enabled but no compiler support) + ifneq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y) + $(warning stack-protector enabled but compiler support broken) endif endif diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index c6cd358..04a4890 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -92,12 +92,53 @@ #endif #define smp_read_barrier_depends() read_barrier_depends() #define set_mb(var, value) do { (void)xchg(&var, value); } while (0) -#else +#else /* !SMP */ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() #define smp_read_barrier_depends() do { } while (0) #define set_mb(var, value) do { var = value; barrier(); } while (0) +#endif /* SMP */ + +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) + +/* + * For either of these options x86 doesn't have a strong TSO memory + * model and we should fall back to full barriers. 
+ */ + +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ___p1; \ +}) + +#else /* regular x86 TSO memory ordering */ + +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ___p1; \ +}) + #endif /* diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index c49a613..cea1c76 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -293,12 +293,13 @@ static inline int restore_fpu_checking(struct task_struct *tsk) /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is pending. Clear the x87 state here by setting it to fixed values. "m" is a random variable that should be in L1 */ - alternative_input( - ASM_NOP8 ASM_NOP2, - "emms\n\t" /* clear stack tags */ - "fildl %P[addr]", /* set F?P to defined value */ - X86_FEATURE_FXSAVE_LEAK, - [addr] "m" (tsk->thread.fpu.has_fpu)); + if (unlikely(static_cpu_has(X86_FEATURE_FXSAVE_LEAK))) { + asm volatile( + "fnclex\n\t" + "emms\n\t" + "fildl %P[addr]" /* set F?P to defined value */ + : : [addr] "m" (tsk->thread.fpu.has_fpu)); + } return fpu_restore_checking(&tsk->thread.fpu); } diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index cba45d9..67d69b8 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -191,6 +191,9 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); #define trace_interrupt interrupt #endif +#define VECTOR_UNDEFINED -1 +#define VECTOR_RETRIGGERED -2 + typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); extern void setup_vector_irq(int cpu); diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 2f366d0..1da25a5 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_MWAIT_H #define _ASM_X86_MWAIT_H +#include <linux/sched.h> + #define MWAIT_SUBSTATE_MASK 0xf #define MWAIT_CSTATE_MASK 0xf #define MWAIT_SUBSTATE_SIZE 4 @@ -13,4 +15,45 @@ #define MWAIT_ECX_INTERRUPT_BREAK 0x1 +static inline void __monitor(const void *eax, unsigned long ecx, + unsigned long edx) +{ + /* "monitor %eax, %ecx, %edx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc8;" + :: "a" (eax), "c" (ecx), "d"(edx)); +} + +static inline void __mwait(unsigned long eax, unsigned long ecx) +{ + /* "mwait %eax, %ecx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +} + +/* + * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, + * which can obviate IPI to trigger checking of need_resched. + * We execute MONITOR against need_resched and enter optimized wait state + * through MWAIT. Whenever someone changes need_resched, we would be woken + * up from MWAIT (without an IPI). + * + * New with Core Duo processors, MWAIT can take some hints based on CPU + * capability. 
+ */ +static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) +{ + if (!current_set_polling_and_test()) { + if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) { + mb(); + clflush((void *)&current_thread_info()->flags); + mb(); + } + + __monitor((void *)&current_thread_info()->flags, 0, 0); + if (!need_resched()) + __mwait(eax, ecx); + } + current_clr_polling(); +} + #endif /* _ASM_X86_MWAIT_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7b034a4..24821f5 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -700,29 +700,6 @@ static inline void sync_core(void) #endif } -static inline void __monitor(const void *eax, unsigned long ecx, - unsigned long edx) -{ - /* "monitor %eax, %ecx, %edx;" */ - asm volatile(".byte 0x0f, 0x01, 0xc8;" - :: "a" (eax), "c" (ecx), "d"(edx)); -} - -static inline void __mwait(unsigned long eax, unsigned long ecx) -{ - /* "mwait %eax, %ecx;" */ - asm volatile(".byte 0x0f, 0x01, 0xc9;" - :: "a" (eax), "c" (ecx)); -} - -static inline void __sti_mwait(unsigned long eax, unsigned long ecx) -{ - trace_hardirqs_on(); - /* "mwait %eax, %ecx;" */ - asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" - :: "a" (eax), "c" (ecx)); -} - extern void select_idle_routine(const struct cpuinfo_x86 *c); extern void init_amd_e400_c1e_mask(void); diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 34baa0e..3de54ef 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -4,6 +4,7 @@ #include <linux/pm.h> #include <linux/percpu.h> #include <linux/interrupt.h> +#include <linux/math64.h> #define TICK_SIZE (tick_nsec / 1000) @@ -12,68 +13,26 @@ extern int recalibrate_cpu_khz(void); extern int no_timer_check; -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) +/* + * We use the full linear equation: f(x) = a + b*x, in order to allow + * a continuous function in the face of dynamic freq changes. * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC + * Continuity means that when our frequency changes our slope (b); we want to + * ensure that: f(t) == f'(t), which gives: a + b*t == a' + b'*t. * - * And since SC is a constant power of two, we can convert the div - * into a shift. + * Without an offset (a) the above would not be possible. * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - * - * In: - * - * ns = cycles * cyc2ns_scale / SC - * - * Although we may still have enough bits to store the value of ns, - * in some cases, we may not have enough bits to store cycles * cyc2ns_scale, - * leading to an incorrect result. - * - * To avoid this, we can decompose 'cycles' into quotient and remainder - * of division by SC. Then, - * - * ns = (quot * SC + rem) * cyc2ns_scale / SC - * = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC - * - * - sqazi@google.com + * See the comment near cycles_2_ns() for details on how we compute (b). 
*/ - -DECLARE_PER_CPU(unsigned long, cyc2ns); -DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); - -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline unsigned long long __cycles_2_ns(unsigned long long cyc) -{ - int cpu = smp_processor_id(); - unsigned long long ns = per_cpu(cyc2ns_offset, cpu); - ns += mult_frac(cyc, per_cpu(cyc2ns, cpu), - (1UL << CYC2NS_SCALE_FACTOR)); - return ns; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - unsigned long long ns; - unsigned long flags; - - local_irq_save(flags); - ns = __cycles_2_ns(cyc); - local_irq_restore(flags); - - return ns; -} +struct cyc2ns_data { + u32 cyc2ns_mul; + u32 cyc2ns_shift; + u64 cyc2ns_offset; + u32 __count; + /* u32 hole */ +}; /* 24 bytes -- do not grow */ + +extern struct cyc2ns_data *cyc2ns_read_begin(void); +extern void cyc2ns_read_end(struct cyc2ns_data *); #endif /* _ASM_X86_TIMER_H */ diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index d2b7f27..e69182f 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -150,29 +150,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, } EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); -/* - * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, - * which can obviate IPI to trigger checking of need_resched. - * We execute MONITOR against need_resched and enter optimized wait state - * through MWAIT. Whenever someone changes need_resched, we would be woken - * up from MWAIT (without an IPI). - * - * New with Core Duo processors, MWAIT can take some hints based on CPU - * capability. - */ -void mwait_idle_with_hints(unsigned long ax, unsigned long cx) -{ - if (!need_resched()) { - if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) - clflush((void *)&current_thread_info()->flags); - - __monitor((void *)&current_thread_info()->flags, 0, 0); - smp_mb(); - if (!need_resched()) - __mwait(ax, cx); - } -} - void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) { unsigned int cpu = smp_processor_id(); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e63a5bd..a43f068 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1142,9 +1142,10 @@ next: if (test_bit(vector, used_vectors)) goto next; - for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) - if (per_cpu(vector_irq, new_cpu)[vector] != -1) + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) { + if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNDEFINED) goto next; + } /* Found one! 
*/ current_vector = vector; current_offset = offset; @@ -1183,7 +1184,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) vector = cfg->vector; for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) - per_cpu(vector_irq, cpu)[vector] = -1; + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; cfg->vector = 0; cpumask_clear(cfg->domain); @@ -1191,11 +1192,10 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) if (likely(!cfg->move_in_progress)) return; for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; - vector++) { + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) continue; - per_cpu(vector_irq, cpu)[vector] = -1; + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; break; } } @@ -1228,12 +1228,12 @@ void __setup_vector_irq(int cpu) /* Mark the free vectors */ for (vector = 0; vector < NR_VECTORS; ++vector) { irq = per_cpu(vector_irq, cpu)[vector]; - if (irq < 0) + if (irq <= VECTOR_UNDEFINED) continue; cfg = irq_cfg(irq); if (!cpumask_test_cpu(cpu, cfg->domain)) - per_cpu(vector_irq, cpu)[vector] = -1; + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; } raw_spin_unlock(&vector_lock); } @@ -2202,13 +2202,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) me = smp_processor_id(); for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - unsigned int irq; + int irq; unsigned int irr; struct irq_desc *desc; struct irq_cfg *cfg; irq = __this_cpu_read(vector_irq[vector]); - if (irq == -1) + if (irq <= VECTOR_UNDEFINED) continue; desc = irq_to_desc(irq); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 47b56a7..6359506 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -36,7 +36,7 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o perf_event_intel_rapl.o endif diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bca023b..8bc79cd 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -487,7 +487,7 @@ static void early_init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); if (!check_tsc_unstable()) - sched_clock_stable = 1; + set_sched_clock_stable(); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index ea04b34..1a439c0 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -93,7 +93,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); if (!check_tsc_unstable()) - sched_clock_stable = 1; + set_sched_clock_stable(); } /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8e13293..b886451 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1883,21 +1883,27 @@ static struct pmu pmu = { void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { + struct cyc2ns_data *data; + userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; userpg->cap_user_rdpmc = 
x86_pmu.attr_rdpmc; userpg->pmc_width = x86_pmu.cntval_bits; - if (!sched_clock_stable) + if (!sched_clock_stable()) return; + data = cyc2ns_read_begin(); + userpg->cap_user_time = 1; - userpg->time_mult = this_cpu_read(cyc2ns); - userpg->time_shift = CYC2NS_SCALE_FACTOR; - userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; + userpg->time_mult = data->cyc2ns_mul; + userpg->time_shift = data->cyc2ns_shift; + userpg->time_offset = data->cyc2ns_offset - now; userpg->cap_user_time_zero = 1; - userpg->time_zero = this_cpu_read(cyc2ns_offset); + userpg->time_zero = data->cyc2ns_offset; + + cyc2ns_read_end(data); } /* diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index e09f0bf..4b8e4d3 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/pci.h> #include <linux/ptrace.h> +#include <linux/syscore_ops.h> #include <asm/apic.h> @@ -816,6 +817,18 @@ out: return ret; } +static void ibs_eilvt_setup(void) +{ + /* + * Force LVT offset assignment for family 10h: The offsets are + * not assigned by the BIOS for this family, so the OS is + * responsible for doing it. If the OS assignment fails, fall + * back to BIOS settings and try to setup this. + */ + if (boot_cpu_data.x86 == 0x10) + force_ibs_eilvt_setup(); +} + static inline int get_ibs_lvt_offset(void) { u64 val; @@ -851,6 +864,36 @@ static void clear_APIC_ibs(void *dummy) setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); } +#ifdef CONFIG_PM + +static int perf_ibs_suspend(void) +{ + clear_APIC_ibs(NULL); + return 0; +} + +static void perf_ibs_resume(void) +{ + ibs_eilvt_setup(); + setup_APIC_ibs(NULL); +} + +static struct syscore_ops perf_ibs_syscore_ops = { + .resume = perf_ibs_resume, + .suspend = perf_ibs_suspend, +}; + +static void perf_ibs_pm_init(void) +{ + register_syscore_ops(&perf_ibs_syscore_ops); +} + +#else + +static inline void perf_ibs_pm_init(void) { } + +#endif + static int perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -877,18 +920,12 @@ static __init int amd_ibs_init(void) if (!caps) return -ENODEV; /* ibs not supported by the cpu */ - /* - * Force LVT offset assignment for family 10h: The offsets are - * not assigned by the BIOS for this family, so the OS is - * responsible for doing it. If the OS assignment fails, fall - * back to BIOS settings and try to setup this. - */ - if (boot_cpu_data.x86 == 0x10) - force_ibs_eilvt_setup(); + ibs_eilvt_setup(); if (!ibs_eilvt_valid()) goto out; + perf_ibs_pm_init(); get_online_cpus(); ibs_caps = caps; /* make ibs_caps visible to other cpus: */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c new file mode 100644 index 0000000..5ad35ad --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -0,0 +1,679 @@ +/* + * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters + * Copyright (C) 2013 Google, Inc., Stephane Eranian + * + * Intel RAPL interface is specified in the IA-32 Manual Vol3b + * section 14.7.1 (September 2013) + * + * RAPL provides more controls than just reporting energy consumption + * however here we only expose the 3 energy consumption free running + * counters (pp0, pkg, dram). + * + * Each of those counters increments in a power unit defined by the + * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules + * but it can vary. 
+ * + * Counter to rapl events mappings: + * + * pp0 counter: consumption of all physical cores (power plane 0) + * event: rapl_energy_cores + * perf code: 0x1 + * + * pkg counter: consumption of the whole processor package + * event: rapl_energy_pkg + * perf code: 0x2 + * + * dram counter: consumption of the dram domain (servers only) + * event: rapl_energy_dram + * perf code: 0x3 + * + * dram counter: consumption of the builtin-gpu domain (client only) + * event: rapl_energy_gpu + * perf code: 0x4 + * + * We manage those counters as free running (read-only). They may be + * use simultaneously by other tools, such as turbostat. + * + * The events only support system-wide mode counting. There is no + * sampling support because it does not make sense and is not + * supported by the RAPL hardware. + * + * Because we want to avoid floating-point operations in the kernel, + * the events are all reported in fixed point arithmetic (32.32). + * Tools must adjust the counts to convert them to Watts using + * the duration of the measurement. Tools may use a function such as + * ldexp(raw_count, -32); + */ +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/perf_event.h> +#include <asm/cpu_device_id.h> +#include "perf_event.h" + +/* + * RAPL energy status counters + */ +#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */ +#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */ +#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */ +#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */ +#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */ +#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ +#define RAPL_IDX_PP1_NRG_STAT 3 /* DRAM */ +#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ + +/* Clients have PP0, PKG */ +#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\ + 1<<RAPL_IDX_PKG_NRG_STAT|\ + 1<<RAPL_IDX_PP1_NRG_STAT) + +/* Servers have PP0, PKG, RAM */ +#define RAPL_IDX_SRV (1<<RAPL_IDX_PP0_NRG_STAT|\ + 1<<RAPL_IDX_PKG_NRG_STAT|\ + 1<<RAPL_IDX_RAM_NRG_STAT) + +/* + * event code: LSB 8 bits, passed in attr->config + * any other bit is reserved + */ +#define RAPL_EVENT_MASK 0xFFULL + +#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __rapl_##_var##_show, NULL) + +#define RAPL_EVENT_DESC(_name, _config) \ +{ \ + .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \ + .config = _config, \ +} + +#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ + +struct rapl_pmu { + spinlock_t lock; + int hw_unit; /* 1/2^hw_unit Joule */ + int n_active; /* number of active events */ + struct list_head active_list; + struct pmu *pmu; /* pointer to rapl_pmu_class */ + ktime_t timer_interval; /* in ktime_t unit */ + struct hrtimer hrtimer; +}; + +static struct pmu rapl_pmu_class; +static cpumask_t rapl_cpu_mask; +static int rapl_cntr_mask; + +static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); +static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); + +static inline u64 rapl_read_counter(struct perf_event *event) +{ + u64 raw; + rdmsrl(event->hw.event_base, raw); + return raw; +} + +static inline u64 rapl_scale(u64 v) +{ + /* + * scale delta to smallest unit (1/2^32) + * users must then scale back: count * 1/(1e9*2^32) to get Joules + * or use ldexp(count, -32). 
+ * Watts = Joules/Time delta + */ + return v << (32 - __get_cpu_var(rapl_pmu)->hw_unit); +} + +static u64 rapl_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev_raw_count, new_raw_count; + s64 delta, sdelta; + int shift = RAPL_CNTR_WIDTH; + +again: + prev_raw_count = local64_read(&hwc->prev_count); + rdmsrl(event->hw.event_base, new_raw_count); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) { + cpu_relax(); + goto again; + } + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + sdelta = rapl_scale(delta); + + local64_add(sdelta, &event->count); + + return new_raw_count; +} + +static void rapl_start_hrtimer(struct rapl_pmu *pmu) +{ + __hrtimer_start_range_ns(&pmu->hrtimer, + pmu->timer_interval, 0, + HRTIMER_MODE_REL_PINNED, 0); +} + +static void rapl_stop_hrtimer(struct rapl_pmu *pmu) +{ + hrtimer_cancel(&pmu->hrtimer); +} + +static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct perf_event *event; + unsigned long flags; + + if (!pmu->n_active) + return HRTIMER_NORESTART; + + spin_lock_irqsave(&pmu->lock, flags); + + list_for_each_entry(event, &pmu->active_list, active_entry) { + rapl_event_update(event); + } + + spin_unlock_irqrestore(&pmu->lock, flags); + + hrtimer_forward_now(hrtimer, pmu->timer_interval); + + return HRTIMER_RESTART; +} + +static void rapl_hrtimer_init(struct rapl_pmu *pmu) +{ + struct hrtimer *hr = &pmu->hrtimer; + + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hr->function = rapl_hrtimer_handle; +} + +static void __rapl_pmu_event_start(struct rapl_pmu *pmu, + struct perf_event *event) +{ + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + + list_add_tail(&event->active_entry, &pmu->active_list); + + local64_set(&event->hw.prev_count, rapl_read_counter(event)); + + pmu->n_active++; + if (pmu->n_active == 1) + rapl_start_hrtimer(pmu); +} + +static void rapl_pmu_event_start(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + __rapl_pmu_event_start(pmu, event); + spin_unlock_irqrestore(&pmu->lock, flags); +} + +static void rapl_pmu_event_stop(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + + /* mark event as deactivated and stopped */ + if (!(hwc->state & PERF_HES_STOPPED)) { + WARN_ON_ONCE(pmu->n_active <= 0); + pmu->n_active--; + if (pmu->n_active == 0) + rapl_stop_hrtimer(pmu); + + list_del(&event->active_entry); + + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + /* check if update of sw counter is necessary */ + if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + rapl_event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } + + spin_unlock_irqrestore(&pmu->lock, flags); +} + +static int rapl_pmu_event_add(struct perf_event *event, int mode) +{ + struct 
rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) + __rapl_pmu_event_start(pmu, event); + + spin_unlock_irqrestore(&pmu->lock, flags); + + return 0; +} + +static void rapl_pmu_event_del(struct perf_event *event, int flags) +{ + rapl_pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int rapl_pmu_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config & RAPL_EVENT_MASK; + int bit, msr, ret = 0; + + /* only look at RAPL events */ + if (event->attr.type != rapl_pmu_class.type) + return -ENOENT; + + /* check only supported bits are set */ + if (event->attr.config & ~RAPL_EVENT_MASK) + return -EINVAL; + + /* + * check event is known (determines counter) + */ + switch (cfg) { + case INTEL_RAPL_PP0: + bit = RAPL_IDX_PP0_NRG_STAT; + msr = MSR_PP0_ENERGY_STATUS; + break; + case INTEL_RAPL_PKG: + bit = RAPL_IDX_PKG_NRG_STAT; + msr = MSR_PKG_ENERGY_STATUS; + break; + case INTEL_RAPL_RAM: + bit = RAPL_IDX_RAM_NRG_STAT; + msr = MSR_DRAM_ENERGY_STATUS; + break; + case INTEL_RAPL_PP1: + bit = RAPL_IDX_PP1_NRG_STAT; + msr = MSR_PP1_ENERGY_STATUS; + break; + default: + return -EINVAL; + } + /* check event supported */ + if (!(rapl_cntr_mask & (1 << bit))) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + /* must be done before validate_group */ + event->hw.event_base = msr; + event->hw.config = cfg; + event->hw.idx = bit; + + return ret; +} + +static void rapl_pmu_event_read(struct perf_event *event) +{ + rapl_event_update(event); +} + +static ssize_t rapl_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask); + + buf[n++] = '\n'; + buf[n] = '\0'; + return n; +} + +static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); + +static struct attribute *rapl_pmu_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group rapl_pmu_attr_group = { + .attrs = rapl_pmu_attrs, +}; + +EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); +EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); +EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); +EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); + +EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); +EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); +EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); +EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); + +/* + * we compute in 0.23 nJ increments regardless of MSR + */ +EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); + +static struct attribute *rapl_events_srv_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_ram_scale), + NULL, 
+}; + +static struct attribute *rapl_events_cln_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_gpu), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_gpu_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_gpu_scale), + NULL, +}; + +static struct attribute_group rapl_pmu_events_group = { + .name = "events", + .attrs = NULL, /* patched at runtime */ +}; + +DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); +static struct attribute *rapl_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group rapl_pmu_format_group = { + .name = "format", + .attrs = rapl_formats_attr, +}; + +const struct attribute_group *rapl_attr_groups[] = { + &rapl_pmu_attr_group, + &rapl_pmu_format_group, + &rapl_pmu_events_group, + NULL, +}; + +static struct pmu rapl_pmu_class = { + .attr_groups = rapl_attr_groups, + .task_ctx_nr = perf_invalid_context, /* system-wide only */ + .event_init = rapl_pmu_event_init, + .add = rapl_pmu_event_add, /* must have */ + .del = rapl_pmu_event_del, /* must have */ + .start = rapl_pmu_event_start, + .stop = rapl_pmu_event_stop, + .read = rapl_pmu_event_read, +}; + +static void rapl_cpu_exit(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + int i, phys_id = topology_physical_package_id(cpu); + int target = -1; + + /* find a new cpu on same package */ + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (phys_id == topology_physical_package_id(i)) { + target = i; + break; + } + } + /* + * clear cpu from cpumask + * if was set in cpumask and still some cpu on package, + * then move to new cpu + */ + if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0) + cpumask_set_cpu(target, &rapl_cpu_mask); + + WARN_ON(cpumask_empty(&rapl_cpu_mask)); + /* + * migrate events and context to new cpu + */ + if (target >= 0) + perf_pmu_migrate_context(pmu->pmu, cpu, target); + + /* cancel overflow polling timer for CPU */ + rapl_stop_hrtimer(pmu); +} + +static void rapl_cpu_init(int cpu) +{ + int i, phys_id = topology_physical_package_id(cpu); + + /* check if phys_is is already covered */ + for_each_cpu(i, &rapl_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) + return; + } + /* was not found, so add it */ + cpumask_set_cpu(cpu, &rapl_cpu_mask); +} + +static int rapl_cpu_prepare(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + int phys_id = topology_physical_package_id(cpu); + u64 ms; + + if (pmu) + return 0; + + if (phys_id < 0) + return -1; + + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); + if (!pmu) + return -1; + + spin_lock_init(&pmu->lock); + + INIT_LIST_HEAD(&pmu->active_list); + + /* + * grab power unit as: 1/2^unit Joules + * + * we cache in local PMU instance + */ + rdmsrl(MSR_RAPL_POWER_UNIT, pmu->hw_unit); + pmu->hw_unit = (pmu->hw_unit >> 8) & 0x1FULL; + pmu->pmu = &rapl_pmu_class; + + /* + * use reference of 200W for scaling the timeout + * to avoid missing counter overflows. 
+ * 200W = 200 Joules/sec + * divide interval by 2 to avoid lockstep (2 * 100) + * if hw unit is 32, then we use 2 ms 1/200/2 + */ + if (pmu->hw_unit < 32) + ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1)); + else + ms = 2; + + pmu->timer_interval = ms_to_ktime(ms); + + rapl_hrtimer_init(pmu); + + /* set RAPL pmu for this cpu for now */ + per_cpu(rapl_pmu, cpu) = pmu; + per_cpu(rapl_pmu_to_free, cpu) = NULL; + + return 0; +} + +static void rapl_cpu_kfree(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu); + + kfree(pmu); + + per_cpu(rapl_pmu_to_free, cpu) = NULL; +} + +static int rapl_cpu_dying(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + + if (!pmu) + return 0; + + per_cpu(rapl_pmu, cpu) = NULL; + + per_cpu(rapl_pmu_to_free, cpu) = pmu; + + return 0; +} + +static int rapl_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + rapl_cpu_prepare(cpu); + break; + case CPU_STARTING: + rapl_cpu_init(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + rapl_cpu_dying(cpu); + break; + case CPU_ONLINE: + case CPU_DEAD: + rapl_cpu_kfree(cpu); + break; + case CPU_DOWN_PREPARE: + rapl_cpu_exit(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static const struct x86_cpu_id rapl_cpu_match[] = { + [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, + [1] = {}, +}; + +static int __init rapl_pmu_init(void) +{ + struct rapl_pmu *pmu; + int cpu, ret; + + /* + * check for Intel processor family 6 + */ + if (!x86_match_cpu(rapl_cpu_match)) + return 0; + + /* check supported CPU */ + switch (boot_cpu_data.x86_model) { + case 42: /* Sandy Bridge */ + case 58: /* Ivy Bridge */ + case 60: /* Haswell */ + case 69: /* Haswell-Celeron */ + rapl_cntr_mask = RAPL_IDX_CLN; + rapl_pmu_events_group.attrs = rapl_events_cln_attr; + break; + case 45: /* Sandy Bridge-EP */ + case 62: /* IvyTown */ + rapl_cntr_mask = RAPL_IDX_SRV; + rapl_pmu_events_group.attrs = rapl_events_srv_attr; + break; + + default: + /* unsupported */ + return 0; + } + get_online_cpus(); + + for_each_online_cpu(cpu) { + rapl_cpu_prepare(cpu); + rapl_cpu_init(cpu); + } + + perf_cpu_notifier(rapl_cpu_notifier); + + ret = perf_pmu_register(&rapl_pmu_class, "power", -1); + if (WARN_ON(ret)) { + pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); + put_online_cpus(); + return -1; + } + + pmu = __get_cpu_var(rapl_pmu); + + pr_info("RAPL PMU detected, hw unit 2^-%d Joules," + " API unit is 2^-32 Joules," + " %d fixed counters" + " %llu ms ovfl timer\n", + pmu->hw_unit, + hweight32(rapl_cntr_mask), + ktime_to_ms(pmu->timer_interval)); + + put_online_cpus(); + + return 0; +} +device_initcall(rapl_pmu_init); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 51e2988..a2a4f46 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1082,7 +1082,7 @@ ENTRY(ftrace_caller) pushl $0 /* Pass NULL as regs pointer */ movl 4*4(%esp), %eax movl 0x4(%ebp), %edx - leal function_trace_op, %ecx + movl function_trace_op, %ecx subl $MCOUNT_INSN_SIZE, %eax .globl ftrace_call @@ -1140,7 +1140,7 @@ ENTRY(ftrace_regs_caller) movl 12*4(%esp), %eax /* Load ip (1st parameter) */ subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ - leal function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ + movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ 
pushl %esp /* Save pt_regs as 4th parameter */ GLOBAL(ftrace_regs_call) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e21b078..1e96c36 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -88,7 +88,7 @@ END(function_hook) MCOUNT_SAVE_FRAME \skip /* Load the ftrace_ops into the 3rd parameter */ - leaq function_trace_op, %rdx + movq function_trace_op(%rip), %rdx /* Load ip into the first parameter */ movq RIP(%rsp), %rdi diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 22d0687..884d875 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -193,9 +193,13 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) if (!handle_irq(irq, regs)) { ack_APIC_irq(); - if (printk_ratelimit()) - pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", - __func__, smp_processor_id(), vector, irq); + if (irq != VECTOR_RETRIGGERED) { + pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n", + __func__, smp_processor_id(), + vector, irq); + } else { + __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); + } } irq_exit(); @@ -344,7 +348,7 @@ void fixup_irqs(void) for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { unsigned int irr; - if (__this_cpu_read(vector_irq[vector]) < 0) + if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED) continue; irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); @@ -355,11 +359,14 @@ void fixup_irqs(void) data = irq_desc_get_irq_data(desc); chip = irq_data_get_irq_chip(data); raw_spin_lock(&desc->lock); - if (chip->irq_retrigger) + if (chip->irq_retrigger) { chip->irq_retrigger(data); + __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED); + } raw_spin_unlock(&desc->lock); } - __this_cpu_write(vector_irq[vector], -1); + if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED) + __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); } } #endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index a2a1fbc..7f50156 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -52,7 +52,7 @@ static struct irqaction irq2 = { }; DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... NR_VECTORS - 1] = -1, + [0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED, }; int vector_used_by_percpu_irq(unsigned int vector) @@ -60,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector) int cpu; for_each_online_cpu(cpu) { - if (per_cpu(vector_irq, cpu)[vector] != -1) + if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED) return 1; } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 85dc05a..f5252c4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1417,7 +1417,9 @@ static inline void mwait_play_dead(void) * The WBINVD is insufficient due to the spurious-wakeup * case where we return around the loop. 
*/ + mb(); clflush(mwait_ptr); + mb(); __monitor(mwait_ptr, 0, 0); mb(); __mwait(eax, 0); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 930e5d4..6377fb2 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -11,6 +11,7 @@ #include <linux/clocksource.h> #include <linux/percpu.h> #include <linux/timex.h> +#include <linux/static_key.h> #include <asm/hpet.h> #include <asm/timer.h> @@ -37,13 +38,244 @@ static int __read_mostly tsc_unstable; erroneous rdtsc usage on !cpu_has_tsc processors */ static int __read_mostly tsc_disabled = -1; +static struct static_key __use_tsc = STATIC_KEY_INIT; + int tsc_clocksource_reliable; + +/* + * Use a ring-buffer like data structure, where a writer advances the head by + * writing a new data entry and a reader advances the tail when it observes a + * new entry. + * + * Writers are made to wait on readers until there's space to write a new + * entry. + * + * This means that we can always use an {offset, mul} pair to compute a ns + * value that is 'roughly' in the right direction, even if we're writing a new + * {offset, mul} pair during the clock read. + * + * The down-side is that we can no longer guarantee strict monotonicity anymore + * (assuming the TSC was that to begin with), because while we compute the + * intersection point of the two clock slopes and make sure the time is + * continuous at the point of switching; we can no longer guarantee a reader is + * strictly before or after the switch point. + * + * It does mean a reader no longer needs to disable IRQs in order to avoid + * CPU-Freq updates messing with his times, and similarly an NMI reader will + * no longer run the risk of hitting half-written state. + */ + +struct cyc2ns { + struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */ + struct cyc2ns_data *head; /* 48 + 8 = 56 */ + struct cyc2ns_data *tail; /* 56 + 8 = 64 */ +}; /* exactly fits one cacheline */ + +static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); + +struct cyc2ns_data *cyc2ns_read_begin(void) +{ + struct cyc2ns_data *head; + + preempt_disable(); + + head = this_cpu_read(cyc2ns.head); + /* + * Ensure we observe the entry when we observe the pointer to it. + * matches the wmb from cyc2ns_write_end(). + */ + smp_read_barrier_depends(); + head->__count++; + barrier(); + + return head; +} + +void cyc2ns_read_end(struct cyc2ns_data *head) +{ + barrier(); + /* + * If we're the outer most nested read; update the tail pointer + * when we're done. This notifies possible pending writers + * that we've observed the head pointer and that the other + * entry is now free. + */ + if (!--head->__count) { + /* + * x86-TSO does not reorder writes with older reads; + * therefore once this write becomes visible to another + * cpu, we must be finished reading the cyc2ns_data. + * + * matches with cyc2ns_write_begin(). + */ + this_cpu_write(cyc2ns.tail, head); + } + preempt_enable(); +} + +/* + * Begin writing a new @data entry for @cpu. + * + * Assumes some sort of write side lock; currently 'provided' by the assumption + * that cpufreq will call its notifiers sequentially. + */ +static struct cyc2ns_data *cyc2ns_write_begin(int cpu) +{ + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + struct cyc2ns_data *data = c2n->data; + + if (data == c2n->head) + data++; + + /* XXX send an IPI to @cpu in order to guarantee a read? */ + + /* + * When we observe the tail write from cyc2ns_read_end(), + * the cpu must be done with that entry and its safe + * to start writing to it. 
+ */ + while (c2n->tail == data) + cpu_relax(); + + return data; +} + +static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) +{ + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + + /* + * Ensure the @data writes are visible before we publish the + * entry. Matches the data-depencency in cyc2ns_read_begin(). + */ + smp_wmb(); + + ACCESS_ONCE(c2n->head) = data; +} + +/* + * Accelerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_khz * 10^3)) + * ns = cycles * (10^6 / cpu_khz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^6 * SC / cpu_khz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * + * We can use khz divisor instead of mhz to keep a better precision, since + * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * (mathieu.desnoyers@polymtl.ca) + * + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ + +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static void cyc2ns_data_init(struct cyc2ns_data *data) +{ + data->cyc2ns_mul = 1U << CYC2NS_SCALE_FACTOR; + data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + data->cyc2ns_offset = 0; + data->__count = 0; +} + +static void cyc2ns_init(int cpu) +{ + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + + cyc2ns_data_init(&c2n->data[0]); + cyc2ns_data_init(&c2n->data[1]); + + c2n->head = c2n->data; + c2n->tail = c2n->data; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + struct cyc2ns_data *data, *tail; + unsigned long long ns; + + /* + * See cyc2ns_read_*() for details; replicated in order to avoid + * an extra few instructions that came with the abstraction. + * Notable, it allows us to only do the __count and tail update + * dance when its actually needed. + */ + + preempt_disable(); + data = this_cpu_read(cyc2ns.head); + tail = this_cpu_read(cyc2ns.tail); + + if (likely(data == tail)) { + ns = data->cyc2ns_offset; + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + } else { + data->__count++; + + barrier(); + + ns = data->cyc2ns_offset; + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + + barrier(); + + if (!--data->__count) + this_cpu_write(cyc2ns.tail, data); + } + preempt_enable(); + + return ns; +} + +/* XXX surely we already have this someplace in the kernel?! */ +#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d)) + +static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) +{ + unsigned long long tsc_now, ns_now; + struct cyc2ns_data *data; + unsigned long flags; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + if (!cpu_khz) + goto done; + + data = cyc2ns_write_begin(cpu); + + rdtscll(tsc_now); + ns_now = cycles_2_ns(tsc_now); + + /* + * Compute a new multiplier as per the above comment and ensure our + * time function is continuous; see the comment near struct + * cyc2ns_data. + */ + data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz); + data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + data->cyc2ns_offset = ns_now - + mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + + cyc2ns_write_end(cpu, data); + +done: + sched_clock_idle_wakeup_event(0); + local_irq_restore(flags); +} /* * Scheduler clock - returns current time in nanosec units. 
*/ u64 native_sched_clock(void) { - u64 this_offset; + u64 tsc_now; /* * Fall back to jiffies if there's no TSC available: @@ -53,16 +285,16 @@ u64 native_sched_clock(void) * very important for it to be as fast as the platform * can achieve it. ) */ - if (unlikely(tsc_disabled)) { + if (!static_key_false(&__use_tsc)) { /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); } /* read the Time Stamp Counter: */ - rdtscll(this_offset); + rdtscll(tsc_now); /* return the value in ns */ - return __cycles_2_ns(this_offset); + return cycles_2_ns(tsc_now); } /* We need to define a real function for sched_clock, to override the @@ -589,61 +821,11 @@ int recalibrate_cpu_khz(void) EXPORT_SYMBOL(recalibrate_cpu_khz); -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ - -DEFINE_PER_CPU(unsigned long, cyc2ns); -DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); - -static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) -{ - unsigned long long tsc_now, ns_now, *offset; - unsigned long flags, *scale; - - local_irq_save(flags); - sched_clock_idle_sleep_event(); - - scale = &per_cpu(cyc2ns, cpu); - offset = &per_cpu(cyc2ns_offset, cpu); - - rdtscll(tsc_now); - ns_now = __cycles_2_ns(tsc_now); - - if (cpu_khz) { - *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) + - cpu_khz / 2) / cpu_khz; - *offset = ns_now - mult_frac(tsc_now, *scale, - (1UL << CYC2NS_SCALE_FACTOR)); - } - - sched_clock_idle_wakeup_event(0); - local_irq_restore(flags); -} - static unsigned long long cyc2ns_suspend; void tsc_save_sched_clock_state(void) { - if (!sched_clock_stable) + if (!sched_clock_stable()) return; cyc2ns_suspend = sched_clock(); @@ -663,16 +845,26 @@ void tsc_restore_sched_clock_state(void) unsigned long flags; int cpu; - if (!sched_clock_stable) + if (!sched_clock_stable()) return; local_irq_save(flags); - __this_cpu_write(cyc2ns_offset, 0); + /* + * We're comming out of suspend, there's no concurrency yet; don't + * bother being nice about the RCU stuff, just write to both + * data fields. + */ + + this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0); + this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0); + offset = cyc2ns_suspend - sched_clock(); - for_each_possible_cpu(cpu) - per_cpu(cyc2ns_offset, cpu) = offset; + for_each_possible_cpu(cpu) { + per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset; + per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset; + } local_irq_restore(flags); } @@ -795,7 +987,7 @@ void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; - sched_clock_stable = 0; + clear_sched_clock_stable(); disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ @@ -995,14 +1187,18 @@ void __init tsc_init(void) * speed as the bootup CPU. 
(cpufreq notifiers will fix this * up if their speed diverges) */ - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { + cyc2ns_init(cpu); set_cyc2ns_scale(cpu_khz, cpu); + } if (tsc_disabled > 0) return; /* now allow native_sched_clock() to use rdtsc */ + tsc_disabled = 0; + static_key_slow_inc(&__use_tsc); if (!no_sched_irq_time) enable_sched_clock_irqtime(); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1673940..775702f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1355,7 +1355,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) vcpu->arch.apic_base = value; /* update jump label if enable bit changes */ - if ((vcpu->arch.apic_base ^ value) & MSR_IA32_APICBASE_ENABLE) { + if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) { if (value & MSR_IA32_APICBASE_ENABLE) static_key_slow_dec_deferred(&apic_hw_disabled); else diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9ff85bb..9d591c8 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -641,6 +641,20 @@ no_context(struct pt_regs *regs, unsigned long error_code, /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) { + /* + * Any interrupt that takes a fault gets the fixup. This makes + * the below recursive fault logic only apply to a faults from + * task context. + */ + if (in_interrupt()) + return; + + /* + * Per the above we're !in_interrupt(), aka. task context. + * + * In this case we need to make sure we're not recursively + * faulting through the emulate_vsyscall() logic. + */ if (current_thread_info()->sig_on_uaccess_error && signal) { tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code | PF_USER; @@ -649,6 +663,10 @@ no_context(struct pt_regs *regs, unsigned long error_code, /* XXX: hwpoison faults will set the wrong code. */ force_sig_info_fault(signal, si_code, address, tsk, 0); } + + /* + * Barring that, we can do the fixup and be happy. + */ return; } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 26328e8..4ed75dd 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -359,15 +359,21 @@ void bpf_jit_compile(struct sk_filter *fp) EMIT2(0x89, 0xd0); /* mov %edx,%eax */ break; case BPF_S_ALU_MOD_K: /* A %= K; */ + if (K == 1) { + CLEAR_A(); + break; + } EMIT2(0x31, 0xd2); /* xor %edx,%edx */ EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */ EMIT2(0xf7, 0xf1); /* div %ecx */ EMIT2(0x89, 0xd0); /* mov %edx,%eax */ break; - case BPF_S_ALU_DIV_K: /* A = reciprocal_divide(A, K); */ - EMIT3(0x48, 0x69, 0xc0); /* imul imm32,%rax,%rax */ - EMIT(K, 4); - EMIT4(0x48, 0xc1, 0xe8, 0x20); /* shr $0x20,%rax */ + case BPF_S_ALU_DIV_K: /* A /= K */ + if (K == 1) + break; + EMIT2(0x31, 0xd2); /* xor %edx,%edx */ + EMIT1(0xb9);EMIT(K, 4); /* mov imm32,%ecx */ + EMIT2(0xf7, 0xf1); /* div %ecx */ break; case BPF_S_ALU_AND_X: seen |= SEEN_XREG; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index efe4d72..dfe605a 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -433,15 +433,49 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) return; } -static inline unsigned long cycles_2_us(unsigned long long cyc) +/* + * Not to be confused with cycles_2_ns() from tsc.c; this gives a relative + * number, not an absolute. It converts a duration in cycles to a duration in + * ns. 
+ */ +static inline unsigned long long cycles_2_ns(unsigned long long cyc) { + struct cyc2ns_data *data = cyc2ns_read_begin(); unsigned long long ns; - unsigned long us; - int cpu = smp_processor_id(); - ns = (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR; - us = ns / 1000; - return us; + ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); + + cyc2ns_read_end(data); + return ns; +} + +/* + * The reverse of the above; converts a duration in ns to a duration in cycles. + */ +static inline unsigned long long ns_2_cycles(unsigned long long ns) +{ + struct cyc2ns_data *data = cyc2ns_read_begin(); + unsigned long long cyc; + + cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul; + + cyc2ns_read_end(data); + return cyc; +} + +static inline unsigned long cycles_2_us(unsigned long long cyc) +{ + return cycles_2_ns(cyc) / NSEC_PER_USEC; +} + +static inline cycles_t sec_2_cycles(unsigned long sec) +{ + return ns_2_cycles(sec * NSEC_PER_SEC); +} + +static inline unsigned long long usec_2_cycles(unsigned long usec) +{ + return ns_2_cycles(usec * NSEC_PER_USEC); } /* @@ -668,16 +702,6 @@ static int wait_completion(struct bau_desc *bau_desc, bcp, try); } -static inline cycles_t sec_2_cycles(unsigned long sec) -{ - unsigned long ns; - cycles_t cyc; - - ns = sec * 1000000000; - cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); - return cyc; -} - /* * Our retries are blocked by all destination sw ack resources being * in use, and a timeout is pending. In that case hardware immediately @@ -1327,16 +1351,6 @@ static void ptc_seq_stop(struct seq_file *file, void *data) { } -static inline unsigned long long usec_2_cycles(unsigned long microsec) -{ - unsigned long ns; - unsigned long long cyc; - - ns = microsec * 1000; - cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); - return cyc; -} - /* * Display the statistics thru /proc/sgi_uv/ptc_statistics * 'data' points to the cpu number diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index aabfb83..96bc506 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -357,3 +357,5 @@ 348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev 349 i386 kcmp sys_kcmp 350 i386 finit_module sys_finit_module +351 i386 sched_setattr sys_sched_setattr +352 i386 sched_getattr sys_sched_getattr diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 38ae65d..a12bddc 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -320,6 +320,8 @@ 311 64 process_vm_writev sys_process_vm_writev 312 common kcmp sys_kcmp 313 common finit_module sys_finit_module +314 common sched_setattr sys_sched_setattr +315 common sched_getattr sys_sched_getattr # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 2ada505..eb5d7a5 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -178,7 +178,7 @@ notrace static int __always_inline do_realtime(struct timespec *ts) ts->tv_nsec = 0; do { - seq = read_seqcount_begin_no_lockdep(&gtod->seq); + seq = raw_read_seqcount_begin(&gtod->seq); mode = gtod->clock.vclock_mode; ts->tv_sec = gtod->wall_time_sec; ns = gtod->wall_time_snsec; @@ -198,7 +198,7 @@ notrace static int do_monotonic(struct timespec *ts) ts->tv_nsec = 0; do { - seq = read_seqcount_begin_no_lockdep(&gtod->seq); + seq = raw_read_seqcount_begin(&gtod->seq); mode = 
gtod->clock.vclock_mode; ts->tv_sec = gtod->monotonic_time_sec; ns = gtod->monotonic_time_snsec; @@ -214,7 +214,7 @@ notrace static int do_realtime_coarse(struct timespec *ts) { unsigned long seq; do { - seq = read_seqcount_begin_no_lockdep(&gtod->seq); + seq = raw_read_seqcount_begin(&gtod->seq); ts->tv_sec = gtod->wall_time_coarse.tv_sec; ts->tv_nsec = gtod->wall_time_coarse.tv_nsec; } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); @@ -225,7 +225,7 @@ notrace static int do_monotonic_coarse(struct timespec *ts) { unsigned long seq; do { - seq = read_seqcount_begin_no_lockdep(&gtod->seq); + seq = raw_read_seqcount_begin(&gtod->seq); ts->tv_sec = gtod->monotonic_time_coarse.tv_sec; ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec; } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
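
Illustration, not part of the diff above: the barrier.h hunk introduces smp_store_release()/smp_load_acquire(), which on ordinary TSO x86 reduce to a plain store/load plus a compiler barrier(). Below is a minimal userspace C11 sketch of the same publish/consume pairing; the names payload, ready, producer and consumer are invented for the example, and it builds with gcc -std=c11 -pthread.

/*
 * C11 analogue of the smp_store_release()/smp_load_acquire() pairing:
 * the writer fills in the payload and then release-stores the flag; the
 * reader acquire-loads the flag and is then guaranteed to see the payload.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int payload;            /* plain data, published by the writer */
static atomic_int ready;       /* flag, written with release semantics */

static void *producer(void *arg)
{
	payload = 42;                                            /* A: write data */
	atomic_store_explicit(&ready, 1, memory_order_release); /* B: publish    */
	return NULL;
}

static void *consumer(void *arg)
{
	while (!atomic_load_explicit(&ready, memory_order_acquire))
		;                                  /* C: spin until B is visible */
	printf("payload = %d\n", payload);         /* D: guaranteed to print 42 */
	return NULL;
}

int main(void)
{
	pthread_t p, c;

	pthread_create(&p, NULL, producer, NULL);
	pthread_create(&c, NULL, consumer, NULL);
	pthread_join(p, NULL);
	pthread_join(c, NULL);
	return 0;
}

The release store keeps the payload write ordered before the flag, and the acquire load keeps the flag read ordered before the payload read, which is the guarantee the new kernel macros express.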
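
Illustration, not part of the diff above: the cyc2ns rework in timer.h/tsc.c converts cycles to nanoseconds as ns = offset + (cycles * mul) >> shift, with mul = DIV_ROUND(10^6 << 10, cpu_khz), and recomputes the offset at every frequency change so the clock stays continuous (f(t) == f'(t)). Below is a standalone sketch of that fixed-point arithmetic; the 1.2 GHz and 2.4 GHz figures and the helper mul_shr() are made up, and the 128-bit multiply is a GCC/Clang extension standing in for the kernel's mul_u64_u32_shr().

/*
 * Mirrors set_cyc2ns_scale()/cycles_2_ns() from the patch:
 *   mul   = ((NSEC_PER_MSEC << 10) + cpu_khz/2) / cpu_khz
 *   ns(t) = offset + (t * mul) >> 10
 * and, on a frequency change, offset' = ns_now - (tsc_now * mul') >> 10.
 */
#include <stdint.h>
#include <stdio.h>

#define SHIFT 10	/* CYC2NS_SCALE_FACTOR in the patch */

static inline uint64_t mul_shr(uint64_t cyc, uint32_t mul, int shift)
{
	return (uint64_t)(((unsigned __int128)cyc * mul) >> shift);
}

int main(void)
{
	uint32_t mul1 = ((1000000ULL << SHIFT) + 1200000 / 2) / 1200000; /* 1.2 GHz */
	uint64_t tsc  = 24000000000ULL;     /* ~20 s worth of cycles at 1.2 GHz */
	uint64_t ns_now = mul_shr(tsc, mul1, SHIFT);       /* offset starts at 0 */

	/* cpufreq raises the clock to 2.4 GHz: new slope, new offset */
	uint32_t mul2    = ((1000000ULL << SHIFT) + 2400000 / 2) / 2400000;
	uint64_t offset2 = ns_now - mul_shr(tsc, mul2, SHIFT);

	printf("before switch: %llu ns\n", (unsigned long long)ns_now);
	printf("after switch:  %llu ns\n",
	       (unsigned long long)(offset2 + mul_shr(tsc, mul2, SHIFT)));
	return 0;
}

Both lines print the same value, which is the continuity property the new set_cyc2ns_scale() preserves by deriving cyc2ns_offset from the current TSC reading whenever the multiplier changes.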
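
Illustration, not part of the diff above: the new RAPL PMU normalizes every energy count to 2^-32 Joule increments (rapl_scale() shifts the raw MSR delta left by 32 - hw_unit), and its header comment tells tools to convert with ldexp(raw_count, -32). A small sketch of that consumer-side conversion follows; the sample count and one-second window are invented, and the program links with -lm.

/*
 * Consumer-side scaling for the "power" PMU events added by the patch:
 * counts arrive in units of 2^-32 Joules, so Joules = ldexp(count, -32)
 * and Watts = Joules / measurement interval.
 */
#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t raw_count  = 85899345920ULL;	/* made-up delta: 20 * 2^32 */
	double   interval_s = 1.0;		/* made-up measurement window */

	double joules = ldexp((double)raw_count, -32);	/* count * 2^-32 */
	double watts  = joules / interval_s;

	printf("%.3f J over %.1f s = %.3f W\n", joules, interval_s, watts);
	return 0;
}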