diff options
author | James Morris <james.l.morris@oracle.com> | 2012-08-17 10:42:30 (GMT) |
---|---|---|
committer | James Morris <james.l.morris@oracle.com> | 2012-08-17 10:42:30 (GMT) |
commit | 51b743fe87d7fb3dba7a2ff4a1fe23bb65dc2245 (patch) | |
tree | f8b8f601713a3ecb264eb9f145636343d9350520 /arch/x86/kernel | |
parent | 9f99798ff49e73dded73a8c674044ea6fb6af651 (diff) | |
parent | d9875690d9b89a866022ff49e3fcea892345ad92 (diff) | |
download | linux-fsl-qoriq-51b743fe87d7fb3dba7a2ff4a1fe23bb65dc2245.tar.xz |
Merge tag 'v3.6-rc2' into next
Linux 3.6-rc2
Resync with Linus.
Diffstat (limited to 'arch/x86/kernel')
82 files changed, 5367 insertions, 1455 deletions
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8afb693..b2297e5 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -422,12 +422,14 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header, return 0; } - if (intsrc->source_irq == 0 && intsrc->global_irq == 2) { + if (intsrc->source_irq == 0) { if (acpi_skip_timer_override) { - printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); + printk(PREFIX "BIOS IRQ0 override ignored.\n"); return 0; } - if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { + + if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity + && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK; printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n"); } @@ -1334,17 +1336,12 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d) } /* - * Force ignoring BIOS IRQ0 pin2 override + * Force ignoring BIOS IRQ0 override */ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) { - /* - * The ati_ixp4x0_rev() early PCI quirk should have set - * the acpi_skip_timer_override flag already: - */ if (!acpi_skip_timer_override) { - WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n"); - pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", + pr_notice("%s detected: Ignoring BIOS IRQ0 override\n", d->ident); acpi_skip_timer_override = 1; } @@ -1438,7 +1435,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = { * is enabled. This input is incorrectly designated the * ISA IRQ 0 via an interrupt source override even though * it is wired to the output of the master 8259A and INTIN0 - * is not connected at all. Force ignoring BIOS IRQ0 pin2 + * is not connected at all. Force ignoring BIOS IRQ0 * override in that cases. */ { @@ -1473,6 +1470,14 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = { DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"), }, }, + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "FUJITSU SIEMENS", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), + DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"), + }, + }, {} }; diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 95bf99d..1b8e5a0 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -25,10 +25,6 @@ unsigned long acpi_realmode_flags; static char temp_stack[4096]; #endif -asmlinkage void acpi_enter_s3(void) -{ - acpi_enter_sleep_state(3, wake_sleep_flags); -} /** * acpi_suspend_lowlevel - save kernel state * diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index 5653a57..67f59f8 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -2,7 +2,6 @@ * Variables and functions used by the code in sleep.c */ -#include <linux/linkage.h> #include <asm/realmode.h> extern unsigned long saved_video_mode; @@ -11,7 +10,6 @@ extern long saved_magic; extern int wakeup_pmode_return; extern u8 wake_sleep_flags; -extern asmlinkage void acpi_enter_s3(void); extern unsigned long acpi_copy_wakeup_routine(unsigned long); extern void wakeup_long64(void); diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index 7261083..13ab720 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -74,7 +74,9 @@ restore_registers: ENTRY(do_suspend_lowlevel) call save_processor_state call save_registers - call acpi_enter_s3 + pushl $3 + call acpi_enter_sleep_state + addl $4, %esp # In case of S3 failure, we'll emerge here. Jump # to ret_point to recover diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 014d1d2..8ea5164 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -71,7 +71,9 @@ ENTRY(do_suspend_lowlevel) movq %rsi, saved_rsi addq $8, %rsp - call acpi_enter_s3 + movl $3, %edi + xorl %eax, %eax + call acpi_enter_sleep_state /* in case something went wrong, restore the machine status and go on */ jmp resume_point diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 1f84794..afb7ff7 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) "SMP alternatives: " fmt + #include <linux/module.h> #include <linux/sched.h> #include <linux/mutex.h> @@ -63,8 +65,11 @@ static int __init setup_noreplace_paravirt(char *str) __setup("noreplace-paravirt", setup_noreplace_paravirt); #endif -#define DPRINTK(fmt, args...) if (debug_alternative) \ - printk(KERN_DEBUG fmt, args) +#define DPRINTK(fmt, ...) \ +do { \ + if (debug_alternative) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ +} while (0) /* * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes @@ -219,7 +224,7 @@ void __init arch_init_ideal_nops(void) ideal_nops = intel_nops; #endif } - + break; default: #ifdef CONFIG_X86_64 ideal_nops = k8_nops; @@ -428,7 +433,7 @@ void alternatives_smp_switch(int smp) * If this still occurs then you should see a hang * or crash shortly after this line: */ - printk("lockdep: fixing up alternatives.\n"); + pr_info("lockdep: fixing up alternatives\n"); #endif if (noreplace_smp || smp_alt_once || skip_smp_alternatives) @@ -444,14 +449,14 @@ void alternatives_smp_switch(int smp) if (smp == smp_mode) { /* nothing */ } else if (smp) { - printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); + pr_info("switching to SMP code\n"); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_lock(mod->locks, mod->locks_end, mod->text, mod->text_end); } else { - printk(KERN_INFO "SMP alternatives: switching to UP code\n"); + pr_info("switching to UP code\n"); set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); list_for_each_entry(mod, &smp_alt_modules, next) @@ -546,7 +551,7 @@ void __init alternative_instructions(void) #ifdef CONFIG_SMP if (smp_alt_once) { if (1 == num_possible_cpus()) { - printk(KERN_INFO "SMP alternatives: switching to UP code\n"); + pr_info("switching to UP code\n"); set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); @@ -664,7 +669,7 @@ static int __kprobes stop_machine_text_poke(void *data) struct text_poke_param *p; int i; - if (atomic_dec_and_test(&stop_machine_first)) { + if (atomic_xchg(&stop_machine_first, 0)) { for (i = 0; i < tpp->nparams; i++) { p = &tpp->params[i]; text_poke(p->addr, p->opcode, p->len); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index be16854..aadf335 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -2,6 +2,9 @@ * Shared support code for AMD K8 northbridges and derivates. * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/slab.h> #include <linux/init.h> @@ -16,6 +19,7 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) }, {} }; EXPORT_SYMBOL(amd_nb_misc_ids); @@ -258,7 +262,7 @@ void amd_flush_garts(void) } spin_unlock_irqrestore(&gart_lock, flags); if (!flushed) - printk("nothing to flush?\n"); + pr_notice("nothing to flush?\n"); } EXPORT_SYMBOL_GPL(amd_flush_garts); @@ -269,11 +273,10 @@ static __init int init_amd_nbs(void) err = amd_cache_northbridges(); if (err < 0) - printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n"); + pr_notice("Cannot enumerate AMD northbridges\n"); if (amd_cache_gart() < 0) - printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, " - "GART support disabled.\n"); + pr_notice("Cannot initialize GART flush words, GART support disabled\n"); return err; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 39a222e..24deb30 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -75,8 +75,8 @@ physid_mask_t phys_cpu_present_map; /* * Map cpu index to physical APIC ID */ -DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); -DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); @@ -88,7 +88,7 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); * used for the mapping. This is where the behaviors of x86_64 and 32 * actually diverge. Let's keep it ugly for now. */ -DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID); /* * Knob to control our willingness to enable the local APIC. @@ -2123,6 +2123,42 @@ void default_init_apic_ldr(void) apic_write(APIC_LDR, val); } +int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask, + unsigned int *apicid) +{ + unsigned int cpu; + + for_each_cpu_and(cpu, cpumask, andmask) { + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + } + + if (likely(cpu < nr_cpu_ids)) { + *apicid = per_cpu(x86_cpu_to_apicid, cpu); + return 0; + } + + return -EINVAL; +} + +/* + * Override the generic EOI implementation with an optimized version. + * Only called during early boot when only one CPU is active and with + * interrupts disabled, so we know this does not race with actual APIC driver + * use. + */ +void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) +{ + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + /* Should happen once for each apic */ + WARN_ON((*drv)->eoi_write == eoi_write); + (*drv)->eoi_write = eoi_write; + } +} + /* * Power management */ diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 0e881c4..00c77cf 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -36,25 +36,6 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 1; } -static const struct cpumask *flat_target_cpus(void) -{ - return cpu_online_mask; -} - -static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - /* Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. - */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - /* * Set up the logical destination ID. * @@ -92,7 +73,7 @@ static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector) } static void - flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) +flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { unsigned long mask = cpumask_bits(cpumask)[0]; int cpu = smp_processor_id(); @@ -186,7 +167,7 @@ static struct apic apic_flat = { .irq_delivery_mode = dest_LowestPrio, .irq_dest_mode = 1, /* logical */ - .target_cpus = flat_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, @@ -210,8 +191,7 @@ static struct apic apic_flat = { .set_apic_id = set_apic_id, .apic_id_mask = 0xFFu << 24, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, .send_IPI_mask = flat_send_IPI_mask, .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, @@ -262,17 +242,6 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -static const struct cpumask *physflat_target_cpus(void) -{ - return cpu_online_mask; -} - -static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector) { default_send_IPI_mask_sequence_phys(cpumask, vector); @@ -294,38 +263,6 @@ static void physflat_send_IPI_all(int vector) physflat_send_IPI_mask(cpu_online_mask, vector); } -static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - cpu = cpumask_first(cpumask); - if ((unsigned)cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); - else - return BAD_APICID; -} - -static unsigned int -physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; - } - return per_cpu(x86_cpu_to_apicid, cpu); -} - static int physflat_probe(void) { if (apic == &apic_physflat || num_possible_cpus() > 8) @@ -345,13 +282,13 @@ static struct apic apic_physflat = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = physflat_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, .check_apicid_present = NULL, - .vector_allocation_domain = physflat_vector_allocation_domain, + .vector_allocation_domain = default_vector_allocation_domain, /* not needed, but shouldn't hurt: */ .init_apic_ldr = flat_init_apic_ldr, @@ -370,8 +307,7 @@ static struct apic apic_physflat = { .set_apic_id = set_apic_id, .apic_id_mask = 0xFFu << 24, - .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = physflat_send_IPI_mask, .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index a6e4c6e..e145f28 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -100,12 +100,12 @@ static unsigned long noop_check_apicid_present(int bit) return physid_isset(bit, phys_cpu_present_map); } -static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) +static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) { if (cpu != 0) pr_warning("APIC: Vector allocated for non-BSP cpu\n"); - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); + cpumask_copy(retmask, cpumask_of(cpu)); } static u32 noop_apic_read(u32 reg) @@ -159,8 +159,7 @@ struct apic apic_noop = { .set_apic_id = NULL, .apic_id_mask = 0x0F << 24, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, .send_IPI_mask = noop_send_IPI_mask, .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 6ec6d5d..bc552cf 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -72,17 +72,6 @@ static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) return initial_apic_id >> index_msb; } -static const struct cpumask *numachip_target_cpus(void) -{ - return cpu_online_mask; -} - -static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) { union numachip_csr_g3_ext_irq_gen int_gen; @@ -157,38 +146,6 @@ static void numachip_send_IPI_self(int vector) __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); } -static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - cpu = cpumask_first(cpumask); - if (likely((unsigned)cpu < nr_cpu_ids)) - return per_cpu(x86_cpu_to_apicid, cpu); - - return BAD_APICID; -} - -static unsigned int -numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; - } - return per_cpu(x86_cpu_to_apicid, cpu); -} - static int __init numachip_probe(void) { return apic == &apic_numachip; @@ -253,13 +210,13 @@ static struct apic apic_numachip __refconst = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = numachip_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, .check_apicid_present = NULL, - .vector_allocation_domain = numachip_vector_allocation_domain, + .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = flat_init_apic_ldr, .ioapic_phys_id_map = NULL, @@ -277,8 +234,7 @@ static struct apic apic_numachip __refconst = { .set_apic_id = set_apic_id, .apic_id_mask = 0xffU << 24, - .cpu_mask_to_apicid = numachip_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = numachip_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = numachip_send_IPI_mask, .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 31fbdbf..d50e364 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -26,15 +26,6 @@ static int bigsmp_apic_id_registered(void) return 1; } -static const struct cpumask *bigsmp_target_cpus(void) -{ -#ifdef CONFIG_SMP - return cpu_online_mask; -#else - return cpumask_of(0); -#endif -} - static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid) { return 0; @@ -105,32 +96,6 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid) return 1; } -/* As we are using single CPU as destination, pick only one CPU here */ -static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - int cpu = cpumask_first(cpumask); - - if (cpu < nr_cpu_ids) - return cpu_physical_id(cpu); - return BAD_APICID; -} - -static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - return cpu_physical_id(cpu); - } - return BAD_APICID; -} - static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) { return cpuid_apic >> index_msb; @@ -177,12 +142,6 @@ static const struct dmi_system_id bigsmp_dmi_table[] = { { } /* NULL entry stops DMI scanning */ }; -static void bigsmp_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - static int probe_bigsmp(void) { if (def_to_bigsmp) @@ -205,13 +164,13 @@ static struct apic apic_bigsmp = { /* phys delivery to target CPU: */ .irq_dest_mode = 0, - .target_cpus = bigsmp_target_cpus, + .target_cpus = default_target_cpus, .disable_esr = 1, .dest_logical = 0, .check_apicid_used = bigsmp_check_apicid_used, .check_apicid_present = bigsmp_check_apicid_present, - .vector_allocation_domain = bigsmp_vector_allocation_domain, + .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = bigsmp_init_apic_ldr, .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, @@ -229,8 +188,7 @@ static struct apic apic_bigsmp = { .set_apic_id = NULL, .apic_id_mask = 0xFF << 24, - .cpu_mask_to_apicid = bigsmp_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = bigsmp_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = bigsmp_send_IPI_mask, .send_IPI_mask_allbutself = NULL, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index db4ab1b..0874799 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -394,21 +394,6 @@ static void es7000_enable_apic_mode(void) WARN(1, "Command failed, status = %x\n", mip_status); } -static void es7000_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - /* Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. - */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - - static void es7000_wait_for_init_deassert(atomic_t *deassert) { while (!atomic_read(deassert)) @@ -540,45 +525,49 @@ static int es7000_check_phys_apicid_present(int cpu_physical_apicid) return 1; } -static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask) +static inline int +es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) { unsigned int round = 0; - int cpu, uninitialized_var(apicid); + unsigned int cpu, uninitialized_var(apicid); /* * The cpus in the mask must all be on the apic cluster. */ - for_each_cpu(cpu, cpumask) { + for_each_cpu_and(cpu, cpumask, cpu_online_mask) { int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { WARN(1, "Not a valid mask!"); - return BAD_APICID; + return -EINVAL; } - apicid = new_apicid; + apicid |= new_apicid; round++; } - return apicid; + if (!round) + return -EINVAL; + *dest_id = apicid; + return 0; } -static unsigned int +static int es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, - const struct cpumask *andmask) + const struct cpumask *andmask, + unsigned int *apicid) { - int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); cpumask_var_t cpumask; + *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) - return apicid; + return 0; cpumask_and(cpumask, inmask, andmask); - cpumask_and(cpumask, cpumask, cpu_online_mask); - apicid = es7000_cpu_mask_to_apicid(cpumask); + es7000_cpu_mask_to_apicid(cpumask, apicid); free_cpumask_var(cpumask); - return apicid; + return 0; } static int es7000_phys_pkg_id(int cpuid_apic, int index_msb) @@ -638,7 +627,7 @@ static struct apic __refdata apic_es7000_cluster = { .check_apicid_used = es7000_check_apicid_used, .check_apicid_present = es7000_check_apicid_present, - .vector_allocation_domain = es7000_vector_allocation_domain, + .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = es7000_init_apic_ldr_cluster, .ioapic_phys_id_map = es7000_ioapic_phys_id_map, @@ -656,7 +645,6 @@ static struct apic __refdata apic_es7000_cluster = { .set_apic_id = NULL, .apic_id_mask = 0xFF << 24, - .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, .send_IPI_mask = es7000_send_IPI_mask, @@ -705,7 +693,7 @@ static struct apic __refdata apic_es7000 = { .check_apicid_used = es7000_check_apicid_used, .check_apicid_present = es7000_check_apicid_present, - .vector_allocation_domain = es7000_vector_allocation_domain, + .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = es7000_init_apic_ldr, .ioapic_phys_id_map = es7000_ioapic_phys_id_map, @@ -723,7 +711,6 @@ static struct apic __refdata apic_es7000 = { .set_apic_id = NULL, .apic_id_mask = 0xFF << 24, - .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, .send_IPI_mask = es7000_send_IPI_mask, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5f0ff59..a6c64aa 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -448,8 +448,8 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi entry = alloc_irq_pin_list(node); if (!entry) { - printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", - node, apic, pin); + pr_err("can not alloc irq_pin_list (%d,%d,%d)\n", + node, apic, pin); return -ENOMEM; } entry->apic = apic; @@ -661,7 +661,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) ioapic_mask_entry(apic, pin); entry = ioapic_read_entry(apic, pin); if (entry.irr) - printk(KERN_ERR "Unable to reset IRR for apic: %d, pin :%d\n", + pr_err("Unable to reset IRR for apic: %d, pin :%d\n", mpc_ioapic_id(apic), pin); } @@ -895,7 +895,7 @@ static int irq_polarity(int idx) } case 2: /* reserved */ { - printk(KERN_WARNING "broken BIOS!!\n"); + pr_warn("broken BIOS!!\n"); polarity = 1; break; } @@ -906,7 +906,7 @@ static int irq_polarity(int idx) } default: /* invalid */ { - printk(KERN_WARNING "broken BIOS!!\n"); + pr_warn("broken BIOS!!\n"); polarity = 1; break; } @@ -948,7 +948,7 @@ static int irq_trigger(int idx) } default: { - printk(KERN_WARNING "broken BIOS!!\n"); + pr_warn("broken BIOS!!\n"); trigger = 1; break; } @@ -962,7 +962,7 @@ static int irq_trigger(int idx) } case 2: /* reserved */ { - printk(KERN_WARNING "broken BIOS!!\n"); + pr_warn("broken BIOS!!\n"); trigger = 1; break; } @@ -973,7 +973,7 @@ static int irq_trigger(int idx) } default: /* invalid */ { - printk(KERN_WARNING "broken BIOS!!\n"); + pr_warn("broken BIOS!!\n"); trigger = 0; break; } @@ -991,7 +991,7 @@ static int pin_2_irq(int idx, int apic, int pin) * Debugging check, we are in big trouble if this message pops up! */ if (mp_irqs[idx].dstirq != pin) - printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); + pr_err("broken BIOS or MPTABLE parser, ayiee!!\n"); if (test_bit(bus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; @@ -1112,8 +1112,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) * 0x80, because int 0x80 is hm, kind of importantish. ;) */ static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; - static int current_offset = VECTOR_OFFSET_START % 8; - unsigned int old_vector; + static int current_offset = VECTOR_OFFSET_START % 16; int cpu, err; cpumask_var_t tmp_mask; @@ -1123,35 +1122,45 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) return -ENOMEM; - old_vector = cfg->vector; - if (old_vector) { - cpumask_and(tmp_mask, mask, cpu_online_mask); - cpumask_and(tmp_mask, cfg->domain, tmp_mask); - if (!cpumask_empty(tmp_mask)) { - free_cpumask_var(tmp_mask); - return 0; - } - } - /* Only try and allocate irqs on cpus that are present */ err = -ENOSPC; - for_each_cpu_and(cpu, mask, cpu_online_mask) { - int new_cpu; - int vector, offset; + cpumask_clear(cfg->old_domain); + cpu = cpumask_first_and(mask, cpu_online_mask); + while (cpu < nr_cpu_ids) { + int new_cpu, vector, offset; - apic->vector_allocation_domain(cpu, tmp_mask); + apic->vector_allocation_domain(cpu, tmp_mask, mask); + + if (cpumask_subset(tmp_mask, cfg->domain)) { + err = 0; + if (cpumask_equal(tmp_mask, cfg->domain)) + break; + /* + * New cpumask using the vector is a proper subset of + * the current in use mask. So cleanup the vector + * allocation for the members that are not used anymore. + */ + cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask); + cfg->move_in_progress = 1; + cpumask_and(cfg->domain, cfg->domain, tmp_mask); + break; + } vector = current_vector; offset = current_offset; next: - vector += 8; + vector += 16; if (vector >= first_system_vector) { - /* If out of vectors on large boxen, must share them. */ - offset = (offset + 1) % 8; + offset = (offset + 1) % 16; vector = FIRST_EXTERNAL_VECTOR + offset; } - if (unlikely(current_vector == vector)) + + if (unlikely(current_vector == vector)) { + cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask); + cpumask_andnot(tmp_mask, mask, cfg->old_domain); + cpu = cpumask_first_and(tmp_mask, cpu_online_mask); continue; + } if (test_bit(vector, used_vectors)) goto next; @@ -1162,7 +1171,7 @@ next: /* Found one! */ current_vector = vector; current_offset = offset; - if (old_vector) { + if (cfg->vector) { cfg->move_in_progress = 1; cpumask_copy(cfg->old_domain, cfg->domain); } @@ -1195,7 +1204,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) BUG_ON(!cfg->vector); vector = cfg->vector; - for_each_cpu(cpu, cfg->domain) + for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; @@ -1203,7 +1212,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) if (likely(!cfg->move_in_progress)) return; - for_each_cpu(cpu, cfg->old_domain) { + for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) @@ -1346,18 +1355,18 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, if (!IO_APIC_IRQ(irq)) return; - /* - * For legacy irqs, cfg->domain starts with cpu 0 for legacy - * controllers like 8259. Now that IO-APIC can handle this irq, update - * the cfg->domain. - */ - if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain)) - apic->vector_allocation_domain(0, cfg->domain); if (assign_irq_vector(irq, cfg, apic->target_cpus())) return; - dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); + if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(), + &dest)) { + pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n", + mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); + __clear_irq_vector(irq, cfg); + + return; + } apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " @@ -1366,7 +1375,7 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, cfg->vector, irq, attr->trigger, attr->polarity, dest); if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) { - pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", + pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); __clear_irq_vector(irq, cfg); @@ -1469,9 +1478,10 @@ void setup_IO_APIC_irq_extra(u32 gsi) * Set up the timer pin, possibly with the 8259A-master behind. */ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, - unsigned int pin, int vector) + unsigned int pin, int vector) { struct IO_APIC_route_entry entry; + unsigned int dest; if (irq_remapping_enabled) return; @@ -1482,9 +1492,13 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, * We use logical delivery to get the timer IRQ * to the first CPU. */ + if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(), + apic->target_cpus(), &dest))) + dest = BAD_APICID; + entry.dest_mode = apic->irq_dest_mode; entry.mask = 0; /* don't mask IRQ for edge */ - entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus()); + entry.dest = dest; entry.delivery_mode = apic->irq_delivery_mode; entry.polarity = 0; entry.trigger = 0; @@ -1521,7 +1535,6 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) reg_03.raw = io_apic_read(ioapic_idx, 3); raw_spin_unlock_irqrestore(&ioapic_lock, flags); - printk("\n"); printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx)); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); @@ -1578,7 +1591,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) i, ir_entry->index ); - printk("%1d %1d %1d %1d %1d " + pr_cont("%1d %1d %1d %1d %1d " "%1d %1d %X %02X\n", ir_entry->format, ir_entry->mask, @@ -1598,7 +1611,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) i, entry.dest ); - printk("%1d %1d %1d %1d %1d " + pr_cont("%1d %1d %1d %1d %1d " "%1d %1d %02X\n", entry.mask, entry.trigger, @@ -1651,8 +1664,8 @@ __apicdebuginit(void) print_IO_APICs(void) continue; printk(KERN_DEBUG "IRQ%d ", irq); for_each_irq_pin(entry, cfg->irq_2_pin) - printk("-> %d:%d", entry->apic, entry->pin); - printk("\n"); + pr_cont("-> %d:%d", entry->apic, entry->pin); + pr_cont("\n"); } printk(KERN_INFO ".................................... done.\n"); @@ -1665,9 +1678,9 @@ __apicdebuginit(void) print_APIC_field(int base) printk(KERN_DEBUG); for (i = 0; i < 8; i++) - printk(KERN_CONT "%08x", apic_read(base + i*0x10)); + pr_cont("%08x", apic_read(base + i*0x10)); - printk(KERN_CONT "\n"); + pr_cont("\n"); } __apicdebuginit(void) print_local_APIC(void *dummy) @@ -1769,7 +1782,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v); } } - printk("\n"); + pr_cont("\n"); } __apicdebuginit(void) print_local_APICs(int maxcpu) @@ -2065,7 +2078,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) reg_00.raw = io_apic_read(ioapic_idx, 0); raw_spin_unlock_irqrestore(&ioapic_lock, flags); if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) - printk("could not set ID!\n"); + pr_cont("could not set ID!\n"); else apic_printk(APIC_VERBOSE, " ok.\n"); } @@ -2210,71 +2223,6 @@ void send_cleanup_vector(struct irq_cfg *cfg) cfg->move_in_progress = 0; } -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) -{ - int apic, pin; - struct irq_pin_list *entry; - u8 vector = cfg->vector; - - for_each_irq_pin(entry, cfg->irq_2_pin) { - unsigned int reg; - - apic = entry->apic; - pin = entry->pin; - /* - * With interrupt-remapping, destination information comes - * from interrupt-remapping table entry. - */ - if (!irq_remapped(cfg)) - io_apic_write(apic, 0x11 + pin*2, dest); - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, 0x10 + pin*2, reg); - } -} - -/* - * Either sets data->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and - * leaves data->affinity untouched. - */ -int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, - unsigned int *dest_id) -{ - struct irq_cfg *cfg = data->chip_data; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return -1; - - if (assign_irq_vector(data->irq, data->chip_data, mask)) - return -1; - - cpumask_copy(data->affinity, mask); - - *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain); - return 0; -} - -static int -ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - unsigned int dest, irq = data->irq; - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - ret = __ioapic_set_affinity(data, mask, &dest); - if (!ret) { - /* Only the high 8 bits are valid. */ - dest = SET_APIC_LOGICAL_ID(dest); - __target_IO_APIC_irq(irq, dest, data->chip_data); - } - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return ret; -} - asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; @@ -2362,6 +2310,87 @@ void irq_force_complete_move(int irq) static inline void irq_complete_move(struct irq_cfg *cfg) { } #endif +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + u8 vector = cfg->vector; + + for_each_irq_pin(entry, cfg->irq_2_pin) { + unsigned int reg; + + apic = entry->apic; + pin = entry->pin; + /* + * With interrupt-remapping, destination information comes + * from interrupt-remapping table entry. + */ + if (!irq_remapped(cfg)) + io_apic_write(apic, 0x11 + pin*2, dest); + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~IO_APIC_REDIR_VECTOR_MASK; + reg |= vector; + io_apic_modify(apic, 0x10 + pin*2, reg); + } +} + +/* + * Either sets data->affinity to a valid value, and returns + * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and + * leaves data->affinity untouched. + */ +int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + unsigned int *dest_id) +{ + struct irq_cfg *cfg = data->chip_data; + unsigned int irq = data->irq; + int err; + + if (!config_enabled(CONFIG_SMP)) + return -1; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return -EINVAL; + + err = assign_irq_vector(irq, cfg, mask); + if (err) + return err; + + err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id); + if (err) { + if (assign_irq_vector(irq, cfg, data->affinity)) + pr_err("Failed to recover vector for irq %d\n", irq); + return err; + } + + cpumask_copy(data->affinity, mask); + + return 0; +} + +static int +ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + unsigned int dest, irq = data->irq; + unsigned long flags; + int ret; + + if (!config_enabled(CONFIG_SMP)) + return -1; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + ret = __ioapic_set_affinity(data, mask, &dest); + if (!ret) { + /* Only the high 8 bits are valid. */ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, data->chip_data); + ret = IRQ_SET_MASK_OK_NOCOPY; + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return ret; +} + static void ack_apic_edge(struct irq_data *data) { irq_complete_move(data->chip_data); @@ -2541,9 +2570,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip) chip->irq_ack = ir_ack_apic_edge; chip->irq_eoi = ir_ack_apic_level; -#ifdef CONFIG_SMP chip->irq_set_affinity = set_remapped_irq_affinity; -#endif } #endif /* CONFIG_IRQ_REMAP */ @@ -2554,9 +2581,7 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_unmask = unmask_ioapic_irq, .irq_ack = ack_apic_edge, .irq_eoi = ack_apic_level, -#ifdef CONFIG_SMP .irq_set_affinity = ioapic_set_affinity, -#endif .irq_retrigger = ioapic_retrigger_irq, }; @@ -3038,7 +3063,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, if (err) return err; - dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); + err = apic->cpu_mask_to_apicid_and(cfg->domain, + apic->target_cpus(), &dest); + if (err) + return err; if (irq_remapped(cfg)) { compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id); @@ -3072,7 +3100,6 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, return err; } -#ifdef CONFIG_SMP static int msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -3092,9 +3119,8 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) __write_msi_msg(data->msi_desc, &msg); - return 0; + return IRQ_SET_MASK_OK_NOCOPY; } -#endif /* CONFIG_SMP */ /* * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, @@ -3105,9 +3131,7 @@ static struct irq_chip msi_chip = { .irq_unmask = unmask_msi_irq, .irq_mask = mask_msi_irq, .irq_ack = ack_apic_edge, -#ifdef CONFIG_SMP .irq_set_affinity = msi_set_affinity, -#endif .irq_retrigger = ioapic_retrigger_irq, }; @@ -3192,7 +3216,6 @@ void native_teardown_msi_irq(unsigned int irq) } #ifdef CONFIG_DMAR_TABLE -#ifdef CONFIG_SMP static int dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) @@ -3214,19 +3237,15 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, dmar_msi_write(irq, &msg); - return 0; + return IRQ_SET_MASK_OK_NOCOPY; } -#endif /* CONFIG_SMP */ - static struct irq_chip dmar_msi_type = { .name = "DMAR_MSI", .irq_unmask = dmar_msi_unmask, .irq_mask = dmar_msi_mask, .irq_ack = ack_apic_edge, -#ifdef CONFIG_SMP .irq_set_affinity = dmar_msi_set_affinity, -#endif .irq_retrigger = ioapic_retrigger_irq, }; @@ -3247,7 +3266,6 @@ int arch_setup_dmar_msi(unsigned int irq) #ifdef CONFIG_HPET_TIMER -#ifdef CONFIG_SMP static int hpet_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -3267,19 +3285,15 @@ static int hpet_msi_set_affinity(struct irq_data *data, hpet_msi_write(data->handler_data, &msg); - return 0; + return IRQ_SET_MASK_OK_NOCOPY; } -#endif /* CONFIG_SMP */ - static struct irq_chip hpet_msi_type = { .name = "HPET_MSI", .irq_unmask = hpet_msi_unmask, .irq_mask = hpet_msi_mask, .irq_ack = ack_apic_edge, -#ifdef CONFIG_SMP .irq_set_affinity = hpet_msi_set_affinity, -#endif .irq_retrigger = ioapic_retrigger_irq, }; @@ -3314,8 +3328,6 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) */ #ifdef CONFIG_HT_IRQ -#ifdef CONFIG_SMP - static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) { struct ht_irq_msg msg; @@ -3340,25 +3352,23 @@ ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) return -1; target_ht_irq(data->irq, dest, cfg->vector); - return 0; + return IRQ_SET_MASK_OK_NOCOPY; } -#endif - static struct irq_chip ht_irq_chip = { .name = "PCI-HT", .irq_mask = mask_ht_irq, .irq_unmask = unmask_ht_irq, .irq_ack = ack_apic_edge, -#ifdef CONFIG_SMP .irq_set_affinity = ht_set_affinity, -#endif .irq_retrigger = ioapic_retrigger_irq, }; int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) { struct irq_cfg *cfg; + struct ht_irq_msg msg; + unsigned dest; int err; if (disable_apic) @@ -3366,36 +3376,37 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) cfg = irq_cfg(irq); err = assign_irq_vector(irq, cfg, apic->target_cpus()); - if (!err) { - struct ht_irq_msg msg; - unsigned dest; + if (err) + return err; + + err = apic->cpu_mask_to_apicid_and(cfg->domain, + apic->target_cpus(), &dest); + if (err) + return err; - dest = apic->cpu_mask_to_apicid_and(cfg->domain, - apic->target_cpus()); + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); - msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + msg.address_lo = + HT_IRQ_LOW_BASE | + HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_VECTOR(cfg->vector) | + ((apic->irq_dest_mode == 0) ? + HT_IRQ_LOW_DM_PHYSICAL : + HT_IRQ_LOW_DM_LOGICAL) | + HT_IRQ_LOW_RQEOI_EDGE | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + HT_IRQ_LOW_MT_FIXED : + HT_IRQ_LOW_MT_ARBITRATED) | + HT_IRQ_LOW_IRQ_MASKED; - msg.address_lo = - HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(dest) | - HT_IRQ_LOW_VECTOR(cfg->vector) | - ((apic->irq_dest_mode == 0) ? - HT_IRQ_LOW_DM_PHYSICAL : - HT_IRQ_LOW_DM_LOGICAL) | - HT_IRQ_LOW_RQEOI_EDGE | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - HT_IRQ_LOW_MT_FIXED : - HT_IRQ_LOW_MT_ARBITRATED) | - HT_IRQ_LOW_IRQ_MASKED; + write_ht_irq_msg(irq, &msg); - write_ht_irq_msg(irq, &msg); + irq_set_chip_and_handler_name(irq, &ht_irq_chip, + handle_edge_irq, "edge"); - irq_set_chip_and_handler_name(irq, &ht_irq_chip, - handle_edge_irq, "edge"); + dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); - dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); - } - return err; + return 0; } #endif /* CONFIG_HT_IRQ */ @@ -3563,7 +3574,8 @@ static int __init io_apic_get_unique_id(int ioapic, int apic_id) /* Sanity check */ if (reg_00.bits.ID != apic_id) { - printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); + pr_err("IOAPIC[%d]: Unable to change apic_id!\n", + ioapic); return -1; } } diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index f00a68c..d661ee9 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -406,16 +406,13 @@ static inline int numaq_check_phys_apicid_present(int phys_apicid) * We use physical apicids here, not logical, so just return the default * physical broadcast to stop people from breaking us */ -static unsigned int numaq_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - return 0x0F; -} - -static inline unsigned int +static int numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) + const struct cpumask *andmask, + unsigned int *apicid) { - return 0x0F; + *apicid = 0x0F; + return 0; } /* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */ @@ -441,20 +438,6 @@ static int probe_numaq(void) return found_numaq; } -static void numaq_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - /* Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. - */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - static void numaq_setup_portio_remap(void) { int num_quads = num_online_nodes(); @@ -491,7 +474,7 @@ static struct apic __refdata apic_numaq = { .check_apicid_used = numaq_check_apicid_used, .check_apicid_present = numaq_check_apicid_present, - .vector_allocation_domain = numaq_vector_allocation_domain, + .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = numaq_init_apic_ldr, .ioapic_phys_id_map = numaq_ioapic_phys_id_map, @@ -509,7 +492,6 @@ static struct apic __refdata apic_numaq = { .set_apic_id = NULL, .apic_id_mask = 0x0F << 24, - .cpu_mask_to_apicid = numaq_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and, .send_IPI_mask = numaq_send_IPI_mask, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 1b291da..eb35ef9 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -66,21 +66,6 @@ static void setup_apic_flat_routing(void) #endif } -static void default_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - /* - * Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. - */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - /* should be called last. */ static int probe_default(void) { @@ -105,7 +90,7 @@ static struct apic apic_default = { .check_apicid_used = default_check_apicid_used, .check_apicid_present = default_check_apicid_present, - .vector_allocation_domain = default_vector_allocation_domain, + .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = default_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, @@ -123,8 +108,7 @@ static struct apic apic_default = { .set_apic_id = NULL, .apic_id_mask = 0x0F << 24, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, .send_IPI_mask = default_send_IPI_mask_logical, .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical, @@ -208,6 +192,9 @@ void __init default_setup_apic_routing(void) if (apic->setup_apic_routing) apic->setup_apic_routing(); + + if (x86_platform.apic_post_init) + x86_platform.apic_post_init(); } void __init generic_apic_probe(void) diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 3fe9866..1793dba 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -23,11 +23,6 @@ #include <asm/ipi.h> #include <asm/setup.h> -static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) -{ - return hard_smp_processor_id() >> index_msb; -} - /* * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. */ @@ -48,10 +43,8 @@ void __init default_setup_apic_routing(void) } } - if (is_vsmp_box()) { - /* need to update phys_pkg_id */ - apic->phys_pkg_id = apicid_phys_pkg_id; - } + if (x86_platform.apic_post_init) + x86_platform.apic_post_init(); } /* Same for both flat and physical. */ diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 659897c..77c95c0 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -26,6 +26,8 @@ * */ +#define pr_fmt(fmt) "summit: %s: " fmt, __func__ + #include <linux/mm.h> #include <linux/init.h> #include <asm/io.h> @@ -235,8 +237,8 @@ static int summit_apic_id_registered(void) static void summit_setup_apic_routing(void) { - printk("Enabling APIC mode: Summit. Using %d I/O APICs\n", - nr_ioapics); + pr_info("Enabling APIC mode: Summit. Using %d I/O APICs\n", + nr_ioapics); } static int summit_cpu_present_to_apicid(int mps_cpu) @@ -263,43 +265,48 @@ static int summit_check_phys_apicid_present(int physical_apicid) return 1; } -static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) +static inline int +summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) { unsigned int round = 0; - int cpu, apicid = 0; + unsigned int cpu, apicid = 0; /* * The cpus in the mask must all be on the apic cluster. */ - for_each_cpu(cpu, cpumask) { + for_each_cpu_and(cpu, cpumask, cpu_online_mask) { int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { - printk("%s: Not a valid mask!\n", __func__); - return BAD_APICID; + pr_err("Not a valid mask!\n"); + return -EINVAL; } apicid |= new_apicid; round++; } - return apicid; + if (!round) + return -EINVAL; + *dest_id = apicid; + return 0; } -static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, - const struct cpumask *andmask) +static int +summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, + const struct cpumask *andmask, + unsigned int *apicid) { - int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); cpumask_var_t cpumask; + *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) - return apicid; + return 0; cpumask_and(cpumask, inmask, andmask); - cpumask_and(cpumask, cpumask, cpu_online_mask); - apicid = summit_cpu_mask_to_apicid(cpumask); + summit_cpu_mask_to_apicid(cpumask, apicid); free_cpumask_var(cpumask); - return apicid; + return 0; } /* @@ -320,20 +327,6 @@ static int probe_summit(void) return 0; } -static void summit_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - /* Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. - */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - #ifdef CONFIG_X86_SUMMIT_NUMA static struct rio_table_hdr *rio_table_hdr; static struct scal_detail *scal_devs[MAX_NUMNODES]; @@ -355,7 +348,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) } } if (i == rio_table_hdr->num_rio_dev) { - printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__); + pr_err("Couldn't find owner Cyclone for Winnipeg!\n"); return last_bus; } @@ -366,7 +359,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) } } if (i == rio_table_hdr->num_scal_dev) { - printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__); + pr_err("Couldn't find owner Twister for Cyclone!\n"); return last_bus; } @@ -396,7 +389,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) num_buses = 9; break; default: - printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__); + pr_info("Unsupported Winnipeg type!\n"); return last_bus; } @@ -411,13 +404,15 @@ static int build_detail_arrays(void) int i, scal_detail_size, rio_detail_size; if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) { - printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev); + pr_warn("MAX_NUMNODES too low! Defined as %d, but system has %d nodes\n", + MAX_NUMNODES, rio_table_hdr->num_scal_dev); return 0; } switch (rio_table_hdr->version) { default: - printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version); + pr_warn("Invalid Rio Grande Table Version: %d\n", + rio_table_hdr->version); return 0; case 2: scal_detail_size = 11; @@ -462,7 +457,7 @@ void setup_summit(void) offset = *((unsigned short *)(ptr + offset)); } if (!rio_table_hdr) { - printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__); + pr_err("Unable to locate Rio Grande Table in EBDA - bailing!\n"); return; } @@ -509,7 +504,7 @@ static struct apic apic_summit = { .check_apicid_used = summit_check_apicid_used, .check_apicid_present = summit_check_apicid_present, - .vector_allocation_domain = summit_vector_allocation_domain, + .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = summit_init_apic_ldr, .ioapic_phys_id_map = summit_ioapic_phys_id_map, @@ -527,7 +522,6 @@ static struct apic apic_summit = { .set_apic_id = NULL, .apic_id_mask = 0xFF << 24, - .cpu_mask_to_apicid = summit_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and, .send_IPI_mask = summit_send_IPI_mask, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index ff35cff..c88baa4 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -81,7 +81,7 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) } static void - x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) { __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); } @@ -96,36 +96,37 @@ static void x2apic_send_IPI_all(int vector) __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); } -static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) +static int +x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask, + unsigned int *apicid) { - /* - * We're using fixed IRQ delivery, can only return one logical APIC ID. - * May as well be the first. - */ - int cpu = cpumask_first(cpumask); + u32 dest = 0; + u16 cluster; + int i; - if ((unsigned)cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_logical_apicid, cpu); - else - return BAD_APICID; -} + for_each_cpu_and(i, cpumask, andmask) { + if (!cpumask_test_cpu(i, cpu_online_mask)) + continue; + dest = per_cpu(x86_cpu_to_logical_apicid, i); + cluster = x2apic_cluster(i); + break; + } -static unsigned int -x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) -{ - int cpu; + if (!dest) + return -EINVAL; - /* - * We're using fixed IRQ delivery, can only return one logical APIC ID. - * May as well be the first. - */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; + for_each_cpu_and(i, cpumask, andmask) { + if (!cpumask_test_cpu(i, cpu_online_mask)) + continue; + if (cluster != x2apic_cluster(i)) + continue; + dest |= per_cpu(x86_cpu_to_logical_apicid, i); } - return per_cpu(x86_cpu_to_logical_apicid, cpu); + *apicid = dest; + + return 0; } static void init_x2apic_ldr(void) @@ -208,6 +209,32 @@ static int x2apic_cluster_probe(void) return 0; } +static const struct cpumask *x2apic_cluster_target_cpus(void) +{ + return cpu_all_mask; +} + +/* + * Each x2apic cluster is an allocation domain. + */ +static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) +{ + /* + * To minimize vector pressure, default case of boot, device bringup + * etc will use a single cpu for the interrupt destination. + * + * On explicit migration requests coming from irqbalance etc, + * interrupts will be routed to the x2apic cluster (cluster-id + * derived from the first cpu in the mask) members specified + * in the mask. + */ + if (mask == x2apic_cluster_target_cpus()) + cpumask_copy(retmask, cpumask_of(cpu)); + else + cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu)); +} + static struct apic apic_x2apic_cluster = { .name = "cluster x2apic", @@ -219,13 +246,13 @@ static struct apic apic_x2apic_cluster = { .irq_delivery_mode = dest_LowestPrio, .irq_dest_mode = 1, /* logical */ - .target_cpus = x2apic_target_cpus, + .target_cpus = x2apic_cluster_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, .check_apicid_present = NULL, - .vector_allocation_domain = x2apic_vector_allocation_domain, + .vector_allocation_domain = cluster_vector_allocation_domain, .init_apic_ldr = init_x2apic_ldr, .ioapic_phys_id_map = NULL, @@ -243,7 +270,6 @@ static struct apic apic_x2apic_cluster = { .set_apic_id = x2apic_set_apic_id, .apic_id_mask = 0xFFFFFFFFu, - .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, .send_IPI_mask = x2apic_send_IPI_mask, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index c17e982..e03a1e1 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -76,38 +76,6 @@ static void x2apic_send_IPI_all(int vector) __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); } -static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - int cpu = cpumask_first(cpumask); - - if ((unsigned)cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); - else - return BAD_APICID; -} - -static unsigned int -x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; - } - - return per_cpu(x86_cpu_to_apicid, cpu); -} - static void init_x2apic_ldr(void) { } @@ -131,13 +99,13 @@ static struct apic apic_x2apic_phys = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = x2apic_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, .check_apicid_present = NULL, - .vector_allocation_domain = x2apic_vector_allocation_domain, + .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = init_x2apic_ldr, .ioapic_phys_id_map = NULL, @@ -155,8 +123,7 @@ static struct apic apic_x2apic_phys = { .set_apic_id = x2apic_set_apic_id, .apic_id_mask = 0xFFFFFFFFu, - .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = x2apic_send_IPI_mask, .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index c6d03f7..8cfade9 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -185,17 +185,6 @@ EXPORT_SYMBOL_GPL(uv_possible_blades); unsigned long sn_rtc_cycles_per_second; EXPORT_SYMBOL(sn_rtc_cycles_per_second); -static const struct cpumask *uv_target_cpus(void) -{ - return cpu_online_mask; -} - -static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) { #ifdef CONFIG_SMP @@ -280,25 +269,12 @@ static void uv_init_apic_ldr(void) { } -static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - int cpu = cpumask_first(cpumask); - - if ((unsigned)cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; - else - return BAD_APICID; -} - -static unsigned int +static int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) + const struct cpumask *andmask, + unsigned int *apicid) { - int cpu; + int unsigned cpu; /* * We're using fixed IRQ delivery, can only return one phys APIC ID. @@ -308,7 +284,13 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; + + if (likely(cpu < nr_cpu_ids)) { + *apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; + return 0; + } + + return -EINVAL; } static unsigned int x2apic_get_apic_id(unsigned long x) @@ -362,13 +344,13 @@ static struct apic __refdata apic_x2apic_uv_x = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = uv_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, .check_apicid_present = NULL, - .vector_allocation_domain = uv_vector_allocation_domain, + .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = uv_init_apic_ldr, .ioapic_phys_id_map = NULL, @@ -386,7 +368,6 @@ static struct apic __refdata apic_x2apic_uv_x = { .set_apic_id = set_apic_id, .apic_id_mask = 0xFFFFFFFFu, - .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, .send_IPI_mask = uv_send_IPI_mask, diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 07b0c0d..d65464e 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -201,6 +201,8 @@ * http://www.microsoft.com/whdc/archive/amp_12.mspx] */ +#define pr_fmt(fmt) "apm: " fmt + #include <linux/module.h> #include <linux/poll.h> @@ -485,11 +487,11 @@ static void apm_error(char *str, int err) if (error_table[i].key == err) break; if (i < ERROR_COUNT) - printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); + pr_notice("%s: %s\n", str, error_table[i].msg); else if (err < 0) - printk(KERN_NOTICE "apm: %s: linux error code %i\n", str, err); + pr_notice("%s: linux error code %i\n", str, err); else - printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", + pr_notice("%s: unknown error code %#2.2x\n", str, err); } @@ -1184,7 +1186,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender) static int notified; if (notified++ == 0) - printk(KERN_ERR "apm: an event queue overflowed\n"); + pr_err("an event queue overflowed\n"); if (++as->event_tail >= APM_MAX_EVENTS) as->event_tail = 0; } @@ -1447,7 +1449,7 @@ static void apm_mainloop(void) static int check_apm_user(struct apm_user *as, const char *func) { if (as == NULL || as->magic != APM_BIOS_MAGIC) { - printk(KERN_ERR "apm: %s passed bad filp\n", func); + pr_err("%s passed bad filp\n", func); return 1; } return 0; @@ -1586,7 +1588,7 @@ static int do_release(struct inode *inode, struct file *filp) as1 = as1->next) ; if (as1 == NULL) - printk(KERN_ERR "apm: filp not in user list\n"); + pr_err("filp not in user list\n"); else as1->next = as->next; } @@ -1600,11 +1602,9 @@ static int do_open(struct inode *inode, struct file *filp) struct apm_user *as; as = kmalloc(sizeof(*as), GFP_KERNEL); - if (as == NULL) { - printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", - sizeof(*as)); + if (as == NULL) return -ENOMEM; - } + as->magic = APM_BIOS_MAGIC; as->event_tail = as->event_head = 0; as->suspends_pending = as->standbys_pending = 0; @@ -2313,16 +2313,16 @@ static int __init apm_init(void) } if (apm_info.disabled) { - printk(KERN_NOTICE "apm: disabled on user request.\n"); + pr_notice("disabled on user request.\n"); return -ENODEV; } if ((num_online_cpus() > 1) && !power_off && !smp) { - printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n"); + pr_notice("disabled - APM is not SMP safe.\n"); apm_info.disabled = 1; return -ENODEV; } if (!acpi_disabled) { - printk(KERN_NOTICE "apm: overridden by ACPI.\n"); + pr_notice("overridden by ACPI.\n"); apm_info.disabled = 1; return -ENODEV; } @@ -2356,8 +2356,7 @@ static int __init apm_init(void) kapmd_task = kthread_create(apm, NULL, "kapmd"); if (IS_ERR(kapmd_task)) { - printk(KERN_ERR "apm: disabled - Unable to start kernel " - "thread.\n"); + pr_err("disabled - Unable to start kernel thread\n"); err = PTR_ERR(kapmd_task); kapmd_task = NULL; remove_proc_entry("apm", NULL); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 6ab6aa2..d30a6a9 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -14,7 +14,7 @@ CFLAGS_common.o := $(nostackp) obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += proc.o capflags.o powerflags.o common.o -obj-y += vmware.o hypervisor.o sched.o mshyperv.o +obj-y += vmware.o hypervisor.o mshyperv.o obj-y += rdrand.o obj-y += match.o @@ -32,7 +32,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o endif obj-$(CONFIG_X86_MCE) += mcheck/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 146bb62..9d92e19 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -19,6 +19,39 @@ #include "cpu.h" +static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) +{ + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + u32 gprs[8] = { 0 }; + int err; + + WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__); + + gprs[1] = msr; + gprs[7] = 0x9c5a203a; + + err = rdmsr_safe_regs(gprs); + + *p = gprs[0] | ((u64)gprs[2] << 32); + + return err; +} + +static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) +{ + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + u32 gprs[8] = { 0 }; + + WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__); + + gprs[0] = (u32)val; + gprs[1] = msr; + gprs[2] = val >> 32; + gprs[7] = 0x9c5a203a; + + return wrmsr_safe_regs(gprs); +} + #ifdef CONFIG_X86_32 /* * B step AMD K6 before B 9730xxxx have hardware bugs that can cause @@ -586,9 +619,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) !cpu_has(c, X86_FEATURE_TOPOEXT)) { u64 val; - if (!rdmsrl_amd_safe(0xc0011005, &val)) { + if (!rdmsrl_safe(0xc0011005, &val)) { val |= 1ULL << 54; - wrmsrl_amd_safe(0xc0011005, val); + wrmsrl_safe(0xc0011005, val); rdmsrl(0xc0011005, val); if (val & (1ULL << 54)) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); @@ -679,7 +712,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask); if (err == 0) { mask |= (1 << 10); - checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask); + wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask); } } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 46674fb..c97bb7b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -55,8 +55,8 @@ static void __init check_fpu(void) if (!boot_cpu_data.hard_math) { #ifndef CONFIG_MATH_EMULATION - printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); - printk(KERN_EMERG "Giving up.\n"); + pr_emerg("No coprocessor found and no math emulation present\n"); + pr_emerg("Giving up\n"); for (;;) ; #endif return; @@ -86,7 +86,7 @@ static void __init check_fpu(void) boot_cpu_data.fdiv_bug = fdiv_bug; if (boot_cpu_data.fdiv_bug) - printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); + pr_warn("Hmm, FPU with FDIV bug\n"); } static void __init check_hlt(void) @@ -94,16 +94,16 @@ static void __init check_hlt(void) if (boot_cpu_data.x86 >= 5 || paravirt_enabled()) return; - printk(KERN_INFO "Checking 'hlt' instruction... "); + pr_info("Checking 'hlt' instruction... "); if (!boot_cpu_data.hlt_works_ok) { - printk("disabled\n"); + pr_cont("disabled\n"); return; } halt(); halt(); halt(); halt(); - printk(KERN_CONT "OK.\n"); + pr_cont("OK\n"); } /* @@ -116,7 +116,7 @@ static void __init check_popad(void) #ifndef CONFIG_X86_POPAD_OK int res, inp = (int) &res; - printk(KERN_INFO "Checking for popad bug... "); + pr_info("Checking for popad bug... "); __asm__ __volatile__( "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " : "=&a" (res) @@ -127,9 +127,9 @@ static void __init check_popad(void) * CPU hard. Too bad. */ if (res != 12345678) - printk(KERN_CONT "Buggy.\n"); + pr_cont("Buggy\n"); else - printk(KERN_CONT "OK.\n"); + pr_cont("OK\n"); #endif } @@ -161,7 +161,7 @@ void __init check_bugs(void) { identify_boot_cpu(); #ifndef CONFIG_SMP - printk(KERN_INFO "CPU: "); + pr_info("CPU: "); print_cpu_info(&boot_cpu_data); #endif check_config(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6b9333b..46d8786 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -452,6 +452,35 @@ void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c) c->x86_cache_size = l2size; } +u16 __read_mostly tlb_lli_4k[NR_INFO]; +u16 __read_mostly tlb_lli_2m[NR_INFO]; +u16 __read_mostly tlb_lli_4m[NR_INFO]; +u16 __read_mostly tlb_lld_4k[NR_INFO]; +u16 __read_mostly tlb_lld_2m[NR_INFO]; +u16 __read_mostly tlb_lld_4m[NR_INFO]; + +/* + * tlb_flushall_shift shows the balance point in replacing cr3 write + * with multiple 'invlpg'. It will do this replacement when + * flush_tlb_lines <= active_lines/2^tlb_flushall_shift. + * If tlb_flushall_shift is -1, means the replacement will be disabled. + */ +s8 __read_mostly tlb_flushall_shift = -1; + +void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c) +{ + if (this_cpu->c_detect_tlb) + this_cpu->c_detect_tlb(c); + + printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ + "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ + "tlb_flushall_shift is 0x%x\n", + tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], + tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], + tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], + tlb_flushall_shift); +} + void __cpuinit detect_ht(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_HT @@ -911,6 +940,8 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif + if (boot_cpu_data.cpuid_level >= 2) + cpu_detect_tlb(&boot_cpu_data); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) @@ -947,7 +978,7 @@ static void __cpuinit __print_cpu_msr(void) index_max = msr_range_array[i].max; for (index = index_min; index < index_max; index++) { - if (rdmsrl_amd_safe(index, &val)) + if (rdmsrl_safe(index, &val)) continue; printk(KERN_INFO " MSR%08x: %016llx\n", index, val); } diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 8bacc78..4041c24 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -20,10 +20,19 @@ struct cpu_dev { void (*c_bsp_init)(struct cpuinfo_x86 *); void (*c_init)(struct cpuinfo_x86 *); void (*c_identify)(struct cpuinfo_x86 *); + void (*c_detect_tlb)(struct cpuinfo_x86 *); unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int); int c_x86_vendor; }; +struct _tlb_table { + unsigned char descriptor; + char tlb_type; + unsigned int entries; + /* unsigned int ways; */ + char info[128]; +}; + #define cpu_dev_register(cpu_devX) \ static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \ __attribute__((__section__(".x86_cpu_dev.init"))) = \ diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 755f64f..a8f8fa9 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -37,6 +37,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = #endif &x86_hyper_vmware, &x86_hyper_ms_hyperv, +#ifdef CONFIG_KVM_GUEST + &x86_hyper_kvm, +#endif }; const struct hypervisor_x86 *x86_hyper; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 3e6ff6c..0a4ce29 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -491,6 +491,181 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i } #endif +#define TLB_INST_4K 0x01 +#define TLB_INST_4M 0x02 +#define TLB_INST_2M_4M 0x03 + +#define TLB_INST_ALL 0x05 +#define TLB_INST_1G 0x06 + +#define TLB_DATA_4K 0x11 +#define TLB_DATA_4M 0x12 +#define TLB_DATA_2M_4M 0x13 +#define TLB_DATA_4K_4M 0x14 + +#define TLB_DATA_1G 0x16 + +#define TLB_DATA0_4K 0x21 +#define TLB_DATA0_4M 0x22 +#define TLB_DATA0_2M_4M 0x23 + +#define STLB_4K 0x41 + +static const struct _tlb_table intel_tlb_table[] __cpuinitconst = { + { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" }, + { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" }, + { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" }, + { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" }, + { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" }, + { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" }, + { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" }, + { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, + { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, + { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, + { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, + { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" }, + { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" }, + { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" }, + { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" }, + { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" }, + { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" }, + { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" }, + { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" }, + { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" }, + { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" }, + { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" }, + { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" }, + { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, + { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, + { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" }, + { 0x00, 0, 0 } +}; + +static void __cpuinit intel_tlb_lookup(const unsigned char desc) +{ + unsigned char k; + if (desc == 0) + return; + + /* look up this descriptor in the table */ + for (k = 0; intel_tlb_table[k].descriptor != desc && \ + intel_tlb_table[k].descriptor != 0; k++) + ; + + if (intel_tlb_table[k].tlb_type == 0) + return; + + switch (intel_tlb_table[k].tlb_type) { + case STLB_4K: + if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_INST_ALL: + if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_INST_4K: + if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_INST_4M: + if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_INST_2M_4M: + if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_4K: + case TLB_DATA0_4K: + if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_4M: + case TLB_DATA0_4M: + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_2M_4M: + case TLB_DATA0_2M_4M: + if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_4K_4M: + if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + } +} + +static void __cpuinit intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) +{ + if (!cpu_has_invlpg) { + tlb_flushall_shift = -1; + return; + } + switch ((c->x86 << 8) + c->x86_model) { + case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ + case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ + case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ + case 0x61d: /* six-core 45 nm xeon "Dunnington" */ + tlb_flushall_shift = -1; + break; + case 0x61a: /* 45 nm nehalem, "Bloomfield" */ + case 0x61e: /* 45 nm nehalem, "Lynnfield" */ + case 0x625: /* 32 nm nehalem, "Clarkdale" */ + case 0x62c: /* 32 nm nehalem, "Gulftown" */ + case 0x62e: /* 45 nm nehalem-ex, "Beckton" */ + case 0x62f: /* 32 nm Xeon E7 */ + tlb_flushall_shift = 6; + break; + case 0x62a: /* SandyBridge */ + case 0x62d: /* SandyBridge, "Romely-EP" */ + tlb_flushall_shift = 5; + break; + case 0x63a: /* Ivybridge */ + tlb_flushall_shift = 1; + break; + default: + tlb_flushall_shift = 6; + } +} + +static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c) +{ + int i, j, n; + unsigned int regs[4]; + unsigned char *desc = (unsigned char *)regs; + /* Number of times to iterate */ + n = cpuid_eax(2) & 0xFF; + + for (i = 0 ; i < n ; i++) { + cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); + + /* If bit 31 is set, this is an unknown format */ + for (j = 0 ; j < 3 ; j++) + if (regs[j] & (1 << 31)) + regs[j] = 0; + + /* Byte 0 is level count, not a descriptor */ + for (j = 1 ; j < 16 ; j++) + intel_tlb_lookup(desc[j]); + } + intel_tlb_flushall_shift_set(c); +} + static const struct cpu_dev __cpuinitconst intel_cpu_dev = { .c_vendor = "Intel", .c_ident = { "GenuineIntel" }, @@ -546,6 +721,7 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = { }, .c_size_cache = intel_size_cache, #endif + .c_detect_tlb = intel_detect_tlb, .c_early_init = early_init_intel, .c_init = init_intel, .c_x86_vendor = X86_VENDOR_INTEL, diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 413c2ce..1301762 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -55,13 +55,6 @@ static struct severity { #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) #define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) -#define MCACOD 0xffff -/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */ -#define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */ -#define MCACOD_SCRUBMSK 0xfff0 -#define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */ -#define MCACOD_DATA 0x0134 /* Data Load */ -#define MCACOD_INSTR 0x0150 /* Instruction Fetch */ MCESEV( NO, "Invalid", diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index da27c5d..292d025 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -7,6 +7,9 @@ * Copyright 2008 Intel Corporation * Author: Andi Kleen */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/thread_info.h> #include <linux/capability.h> #include <linux/miscdevice.h> @@ -57,8 +60,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); int mce_disabled __read_mostly; -#define MISC_MCELOG_MINOR 227 - #define SPINUNIT 100 /* 100ns */ atomic_t mce_entry; @@ -102,6 +103,8 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { static DEFINE_PER_CPU(struct work_struct, mce_work); +static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); + /* * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. @@ -210,7 +213,7 @@ static void drain_mcelog_buffer(void) cpu_relax(); if (!m->finished && retries >= 4) { - pr_err("MCE: skipping error being logged currently!\n"); + pr_err("skipping error being logged currently!\n"); break; } } @@ -649,14 +652,18 @@ EXPORT_SYMBOL_GPL(machine_check_poll); * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ -static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp) +static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, + struct pt_regs *regs) { int i, ret = 0; for (i = 0; i < banks; i++) { m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); - if (m->status & MCI_STATUS_VAL) + if (m->status & MCI_STATUS_VAL) { __set_bit(i, validp); + if (quirk_no_way_out) + quirk_no_way_out(i, m, regs); + } if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) ret = 1; } @@ -1039,7 +1046,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) *final = m; memset(valid_banks, 0, sizeof(valid_banks)); - no_way_out = mce_no_way_out(&m, &msg, valid_banks); + no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); barrier(); @@ -1167,8 +1174,9 @@ int memory_failure(unsigned long pfn, int vector, int flags) { /* mce_severity() should not hand us an ACTION_REQUIRED error */ BUG_ON(flags & MF_ACTION_REQUIRED); - printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n" - "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn); + pr_err("Uncorrected memory error in page 0x%lx ignored\n" + "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", + pfn); return 0; } @@ -1186,6 +1194,7 @@ void mce_notify_process(void) { unsigned long pfn; struct mce_info *mi = mce_find_info(); + int flags = MF_ACTION_REQUIRED; if (!mi) mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); @@ -1200,8 +1209,9 @@ void mce_notify_process(void) * doomed. We still need to mark the page as poisoned and alert any * other users of the page. */ - if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 || - mi->restartable == 0) { + if (!mi->restartable) + flags |= MF_MUST_KILL; + if (memory_failure(pfn, MCE_VECTOR, flags) < 0) { pr_err("Memory error not recovered"); force_sig(SIGBUS, current); } @@ -1358,11 +1368,10 @@ static int __cpuinit __mcheck_cpu_cap_init(void) b = cap & MCG_BANKCNT_MASK; if (!banks) - printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); + pr_info("CPU supports %d MCE banks\n", b); if (b > MAX_NR_BANKS) { - printk(KERN_WARNING - "MCE: Using only %u machine check banks out of %u\n", + pr_warn("Using only %u machine check banks out of %u\n", MAX_NR_BANKS, b); b = MAX_NR_BANKS; } @@ -1415,11 +1424,39 @@ static void __mcheck_cpu_init_generic(void) } } +/* + * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and + * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM + * Vol 3B Table 15-20). But this confuses both the code that determines + * whether the machine check occurred in kernel or user mode, and also + * the severity assessment code. Pretend that EIPV was set, and take the + * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. + */ +static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) +{ + if (bank != 0) + return; + if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0) + return; + if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC| + MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV| + MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR| + MCACOD)) != + (MCI_STATUS_UC|MCI_STATUS_EN| + MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S| + MCI_STATUS_AR|MCACOD_INSTR)) + return; + + m->mcgstatus |= MCG_STATUS_EIPV; + m->ip = regs->ip; + m->cs = regs->cs; +} + /* Add per CPU specific workarounds here */ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { if (c->x86_vendor == X86_VENDOR_UNKNOWN) { - pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); + pr_info("unknown CPU type - not enabling MCE support\n"); return -EOPNOTSUPP; } @@ -1512,6 +1549,9 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) */ if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) mce_bootlog = 0; + + if (c->x86 == 6 && c->x86_model == 45) + quirk_no_way_out = quirk_sandybridge_ifu; } if (monarch_timeout < 0) monarch_timeout = 0; @@ -1574,7 +1614,7 @@ static void __mcheck_cpu_init_timer(void) /* Handle unconfigured int18 (should never happen) */ static void unexpected_machine_check(struct pt_regs *regs, long error_code) { - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", smp_processor_id()); } @@ -1893,8 +1933,7 @@ static int __init mcheck_enable(char *str) get_option(&str, &monarch_timeout); } } else { - printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", - str); + pr_info("mce argument %s ignored. Please use /sys\n", str); return 0; } return 1; @@ -2342,7 +2381,7 @@ static __init int mcheck_init_device(void) return err; } -device_initcall(mcheck_init_device); +device_initcall_sync(mcheck_init_device); /* * Old style boot options parsing. Only for compatibility. diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f4873a6..c4e916d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1,15 +1,17 @@ /* - * (c) 2005, 2006 Advanced Micro Devices, Inc. + * (c) 2005-2012 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. See "COPYING" or * http://www.gnu.org/licenses/gpl.html * * Written by Jacob Shin - AMD, Inc. * - * Support : jacob.shin@amd.com + * Support: borislav.petkov@amd.com * * April 2006 * - added support for AMD Family 0x10 processors + * May 2012 + * - major scrubbing * * All MC4_MISCi registers are shared between multi-cores */ @@ -25,6 +27,7 @@ #include <linux/cpu.h> #include <linux/smp.h> +#include <asm/amd_nb.h> #include <asm/apic.h> #include <asm/idle.h> #include <asm/mce.h> @@ -45,23 +48,15 @@ #define MASK_BLKPTR_LO 0xFF000000 #define MCG_XBLK_ADDR 0xC0000400 -struct threshold_block { - unsigned int block; - unsigned int bank; - unsigned int cpu; - u32 address; - u16 interrupt_enable; - bool interrupt_capable; - u16 threshold_limit; - struct kobject kobj; - struct list_head miscj; +static const char * const th_names[] = { + "load_store", + "insn_fetch", + "combined_unit", + "", + "northbridge", + "execution_unit", }; -struct threshold_bank { - struct kobject *kobj; - struct threshold_block *blocks; - cpumask_var_t cpus; -}; static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); static unsigned char shared_bank[NR_BANKS] = { @@ -84,6 +79,26 @@ struct thresh_restart { u16 old_limit; }; +static const char * const bank4_names(struct threshold_block *b) +{ + switch (b->address) { + /* MSR4_MISC0 */ + case 0x00000413: + return "dram"; + + case 0xc0000408: + return "ht_links"; + + case 0xc0000409: + return "l3_cache"; + + default: + WARN(1, "Funny MSR: 0x%08x\n", b->address); + return ""; + } +}; + + static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) { /* @@ -224,8 +239,6 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (!block) per_cpu(bank_map, cpu) |= (1 << bank); - if (shared_bank[bank] && c->cpu_core_id) - break; memset(&b, 0, sizeof(b)); b.cpu = cpu; @@ -326,7 +339,7 @@ struct threshold_attr { #define SHOW_FIELDS(name) \ static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ { \ - return sprintf(buf, "%lx\n", (unsigned long) b->name); \ + return sprintf(buf, "%lu\n", (unsigned long) b->name); \ } SHOW_FIELDS(interrupt_enable) SHOW_FIELDS(threshold_limit) @@ -377,38 +390,21 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) return size; } -struct threshold_block_cross_cpu { - struct threshold_block *tb; - long retval; -}; - -static void local_error_count_handler(void *_tbcc) -{ - struct threshold_block_cross_cpu *tbcc = _tbcc; - struct threshold_block *b = tbcc->tb; - u32 low, high; - - rdmsr(b->address, low, high); - tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); -} - static ssize_t show_error_count(struct threshold_block *b, char *buf) { - struct threshold_block_cross_cpu tbcc = { .tb = b, }; + u32 lo, hi; - smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1); - return sprintf(buf, "%lx\n", tbcc.retval); -} - -static ssize_t store_error_count(struct threshold_block *b, - const char *buf, size_t count) -{ - struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; + rdmsr_on_cpu(b->cpu, b->address, &lo, &hi); - smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); - return 1; + return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) - + (THRESHOLD_MAX - b->threshold_limit))); } +static struct threshold_attr error_count = { + .attr = {.name = __stringify(error_count), .mode = 0444 }, + .show = show_error_count, +}; + #define RW_ATTR(val) \ static struct threshold_attr val = { \ .attr = {.name = __stringify(val), .mode = 0644 }, \ @@ -418,7 +414,6 @@ static struct threshold_attr val = { \ RW_ATTR(interrupt_enable); RW_ATTR(threshold_limit); -RW_ATTR(error_count); static struct attribute *default_attrs[] = { &threshold_limit.attr, @@ -517,7 +512,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, err = kobject_init_and_add(&b->kobj, &threshold_ktype, per_cpu(threshold_banks, cpu)[bank]->kobj, - "misc%i", block); + (bank == 4 ? bank4_names(b) : th_names[bank])); if (err) goto out_free; recurse: @@ -548,98 +543,91 @@ out_free: return err; } -static __cpuinit long -local_allocate_threshold_blocks(int cpu, unsigned int bank) +static __cpuinit int __threshold_add_blocks(struct threshold_bank *b) { - return allocate_threshold_blocks(cpu, bank, 0, - MSR_IA32_MC0_MISC + bank * 4); + struct list_head *head = &b->blocks->miscj; + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + int err = 0; + + err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name); + if (err) + return err; + + list_for_each_entry_safe(pos, tmp, head, miscj) { + + err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name); + if (err) { + list_for_each_entry_safe_reverse(pos, tmp, head, miscj) + kobject_del(&pos->kobj); + + return err; + } + } + return err; } -/* symlinks sibling shared banks to first core. first core owns dir/files. */ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { - int i, err = 0; - struct threshold_bank *b = NULL; struct device *dev = per_cpu(mce_device, cpu); - char name[32]; + struct amd_northbridge *nb = NULL; + struct threshold_bank *b = NULL; + const char *name = th_names[bank]; + int err = 0; - sprintf(name, "threshold_bank%i", bank); + if (shared_bank[bank]) { -#ifdef CONFIG_SMP - if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ - i = cpumask_first(cpu_llc_shared_mask(cpu)); + nb = node_to_amd_nb(amd_get_nb_id(cpu)); + WARN_ON(!nb); - /* first core not up yet */ - if (cpu_data(i).cpu_core_id) - goto out; + /* threshold descriptor already initialized on this node? */ + if (nb->bank4) { + /* yes, use it */ + b = nb->bank4; + err = kobject_add(b->kobj, &dev->kobj, name); + if (err) + goto out; - /* already linked */ - if (per_cpu(threshold_banks, cpu)[bank]) - goto out; + per_cpu(threshold_banks, cpu)[bank] = b; + atomic_inc(&b->cpus); - b = per_cpu(threshold_banks, i)[bank]; + err = __threshold_add_blocks(b); - if (!b) goto out; - - err = sysfs_create_link(&dev->kobj, b->kobj, name); - if (err) - goto out; - - cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu)); - per_cpu(threshold_banks, cpu)[bank] = b; - - goto out; + } } -#endif b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { err = -ENOMEM; goto out; } - if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) { - kfree(b); - err = -ENOMEM; - goto out; - } b->kobj = kobject_create_and_add(name, &dev->kobj); - if (!b->kobj) + if (!b->kobj) { + err = -EINVAL; goto out_free; - -#ifndef CONFIG_SMP - cpumask_setall(b->cpus); -#else - cpumask_set_cpu(cpu, b->cpus); -#endif + } per_cpu(threshold_banks, cpu)[bank] = b; - err = local_allocate_threshold_blocks(cpu, bank); - if (err) - goto out_free; - - for_each_cpu(i, b->cpus) { - if (i == cpu) - continue; - - dev = per_cpu(mce_device, i); - if (dev) - err = sysfs_create_link(&dev->kobj,b->kobj, name); - if (err) - goto out; + if (shared_bank[bank]) { + atomic_set(&b->cpus, 1); - per_cpu(threshold_banks, i)[bank] = b; + /* nb is already initialized, see above */ + WARN_ON(nb->bank4); + nb->bank4 = b; } - goto out; + err = allocate_threshold_blocks(cpu, bank, 0, + MSR_IA32_MC0_MISC + bank * 4); + if (!err) + goto out; -out_free: - per_cpu(threshold_banks, cpu)[bank] = NULL; - free_cpumask_var(b->cpus); + out_free: kfree(b); -out: + + out: return err; } @@ -660,12 +648,6 @@ static __cpuinit int threshold_create_device(unsigned int cpu) return err; } -/* - * let's be hotplug friendly. - * in case of multiple core processors, the first core always takes ownership - * of shared sysfs dir/files, and rest of the cores will be symlinked to it. - */ - static void deallocate_threshold_block(unsigned int cpu, unsigned int bank) { @@ -686,41 +668,42 @@ static void deallocate_threshold_block(unsigned int cpu, per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; } +static void __threshold_remove_blocks(struct threshold_bank *b) +{ + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + + kobject_del(b->kobj); + + list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj) + kobject_del(&pos->kobj); +} + static void threshold_remove_bank(unsigned int cpu, int bank) { + struct amd_northbridge *nb; struct threshold_bank *b; - struct device *dev; - char name[32]; - int i = 0; b = per_cpu(threshold_banks, cpu)[bank]; if (!b) return; + if (!b->blocks) goto free_out; - sprintf(name, "threshold_bank%i", bank); - -#ifdef CONFIG_SMP - /* sibling symlink */ - if (shared_bank[bank] && b->blocks->cpu != cpu) { - dev = per_cpu(mce_device, cpu); - sysfs_remove_link(&dev->kobj, name); - per_cpu(threshold_banks, cpu)[bank] = NULL; - - return; - } -#endif - - /* remove all sibling symlinks before unregistering */ - for_each_cpu(i, b->cpus) { - if (i == cpu) - continue; - - dev = per_cpu(mce_device, i); - if (dev) - sysfs_remove_link(&dev->kobj, name); - per_cpu(threshold_banks, i)[bank] = NULL; + if (shared_bank[bank]) { + if (!atomic_dec_and_test(&b->cpus)) { + __threshold_remove_blocks(b); + per_cpu(threshold_banks, cpu)[bank] = NULL; + return; + } else { + /* + * the last CPU on this node using the shared bank is + * going away, remove that bank now. + */ + nb = node_to_amd_nb(amd_get_nb_id(cpu)); + nb->bank4 = NULL; + } } deallocate_threshold_block(cpu, bank); @@ -728,7 +711,6 @@ static void threshold_remove_bank(unsigned int cpu, int bank) free_out: kobject_del(b->kobj); kobject_put(b->kobj); - free_cpumask_var(b->cpus); kfree(b); per_cpu(threshold_banks, cpu)[bank] = NULL; } @@ -777,4 +759,24 @@ static __init int threshold_init_device(void) return 0; } -device_initcall(threshold_init_device); +/* + * there are 3 funcs which need to be _initcalled in a logic sequence: + * 1. xen_late_init_mcelog + * 2. mcheck_init_device + * 3. threshold_init_device + * + * xen_late_init_mcelog must register xen_mce_chrdev_device before + * native mce_chrdev_device registration if running under xen platform; + * + * mcheck_init_device should be inited before threshold_init_device to + * initialize mce_device, otherwise a NULL ptr dereference will cause panic. + * + * so we use following _initcalls + * 1. device_initcall(xen_late_init_mcelog); + * 2. device_initcall_sync(mcheck_init_device); + * 3. late_initcall(threshold_init_device); + * + * when running under xen, the initcall order is 1,2,3; + * on baremetal, we skip 1 and we do only 2 and 3. + */ +late_initcall(threshold_init_device); diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl index dfea390..c7b3fe2 100644 --- a/arch/x86/kernel/cpu/mkcapflags.pl +++ b/arch/x86/kernel/cpu/mkcapflags.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/perl -w # # Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h # @@ -11,22 +11,35 @@ open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n"; print OUT "#include <asm/cpufeature.h>\n\n"; print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n"; +%features = (); +$err = 0; + while (defined($line = <IN>)) { if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) { $macro = $1; - $feature = $2; + $feature = "\L$2"; $tail = $3; if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) { - $feature = $1; + $feature = "\L$1"; } - if ($feature ne '') { - printf OUT "\t%-32s = \"%s\",\n", - "[$macro]", "\L$feature"; + next if ($feature eq ''); + + if ($features{$feature}++) { + print STDERR "$in: duplicate feature name: $feature\n"; + $err++; } + printf OUT "\t%-32s = \"%s\",\n", "[$macro]", $feature; } } print OUT "};\n"; close(IN); close(OUT); + +if ($err) { + unlink($out); + exit(1); +} + +exit(0); diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index bdda2e6..35ffda5 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -258,11 +258,11 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, /* Compute the maximum size with which we can make a range: */ if (range_startk) - max_align = ffs(range_startk) - 1; + max_align = __ffs(range_startk); else - max_align = 32; + max_align = BITS_PER_LONG - 1; - align = fls(range_sizek) - 1; + align = __fls(range_sizek); if (align > max_align) align = max_align; diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 75772ae..e9fe907 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -361,11 +361,7 @@ static void __init print_mtrr_state(void) } pr_debug("MTRR variable ranges %sabled:\n", mtrr_state.enabled & 2 ? "en" : "dis"); - if (size_or_mask & 0xffffffffUL) - high_width = ffs(size_or_mask & 0xffffffffUL) - 1; - else - high_width = ffs(size_or_mask>>32) + 32 - 1; - high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4; + high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c4706cf..915b876 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -32,20 +32,11 @@ #include <asm/smp.h> #include <asm/alternative.h> #include <asm/timer.h> +#include <asm/desc.h> +#include <asm/ldt.h> #include "perf_event.h" -#if 0 -#undef wrmsrl -#define wrmsrl(msr, val) \ -do { \ - trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\ - (unsigned long)(val)); \ - native_write_msr((msr), (u32)((u64)(val)), \ - (u32)((u64)(val) >> 32)); \ -} while (0) -#endif - struct x86_pmu x86_pmu __read_mostly; DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { @@ -74,7 +65,7 @@ u64 x86_perf_event_update(struct perf_event *event) int idx = hwc->idx; s64 delta; - if (idx == X86_PMC_IDX_FIXED_BTS) + if (idx == INTEL_PMC_IDX_FIXED_BTS) return 0; /* @@ -86,7 +77,7 @@ u64 x86_perf_event_update(struct perf_event *event) */ again: prev_raw_count = local64_read(&hwc->prev_count); - rdmsrl(hwc->event_base, new_raw_count); + rdpmcl(hwc->event_base_rdpmc, new_raw_count); if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) @@ -189,7 +180,7 @@ static void release_pmc_hardware(void) {} static bool check_hw_exists(void) { - u64 val, val_new = 0; + u64 val, val_new = ~0; int i, reg, ret = 0; /* @@ -222,8 +213,9 @@ static bool check_hw_exists(void) * that don't trap on the MSR access and always return 0s. */ val = 0xabcdUL; - ret = checking_wrmsrl(x86_pmu_event_addr(0), val); - ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new); + reg = x86_pmu_event_addr(0); + ret = wrmsrl_safe(reg, val); + ret |= rdmsrl_safe(reg, &val_new); if (ret || val != val_new) goto msr_fail; @@ -240,6 +232,7 @@ bios_fail: msr_fail: printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); + printk(KERN_ERR "Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new); return false; } @@ -388,7 +381,7 @@ int x86_pmu_hw_config(struct perf_event *event) int precise = 0; /* Support for constant skid */ - if (x86_pmu.pebs_active) { + if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { precise++; /* Support for IP fixup */ @@ -637,8 +630,8 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) c = sched->constraints[sched->state.event]; /* Prefer fixed purpose counters */ - if (x86_pmu.num_counters_fixed) { - idx = X86_PMC_IDX_FIXED; + if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { + idx = INTEL_PMC_IDX_FIXED; for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; @@ -646,7 +639,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) } /* Grab the first unused counter starting with idx */ idx = sched->state.counter; - for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) { + for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; } @@ -704,8 +697,8 @@ static bool perf_sched_next_event(struct perf_sched *sched) /* * Assign a counter for each event. */ -static int perf_assign_events(struct event_constraint **constraints, int n, - int wmin, int wmax, int *assign) +int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int *assign) { struct perf_sched sched; @@ -824,15 +817,17 @@ static inline void x86_assign_hw_event(struct perf_event *event, hwc->last_cpu = smp_processor_id(); hwc->last_tag = ++cpuc->tags[i]; - if (hwc->idx == X86_PMC_IDX_FIXED_BTS) { + if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) { hwc->config_base = 0; hwc->event_base = 0; - } else if (hwc->idx >= X86_PMC_IDX_FIXED) { + } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) { hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED); + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED); + hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30; } else { hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); + hwc->event_base_rdpmc = hwc->idx; } } @@ -930,7 +925,7 @@ int x86_perf_event_set_period(struct perf_event *event) s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; - if (idx == X86_PMC_IDX_FIXED_BTS) + if (idx == INTEL_PMC_IDX_FIXED_BTS) return 0; /* @@ -1316,7 +1311,6 @@ static struct attribute_group x86_pmu_format_group = { static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; - struct event_constraint *c; int err; pr_info("Performance Events: "); @@ -1347,21 +1341,8 @@ static int __init init_hw_perf_events(void) for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) quirk->func(); - if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", - x86_pmu.num_counters, X86_PMC_MAX_GENERIC); - x86_pmu.num_counters = X86_PMC_MAX_GENERIC; - } - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; - - if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", - x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; - } - - x86_pmu.intel_ctrl |= - ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; + if (!x86_pmu.intel_ctrl) + x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; perf_events_lapic_init(); register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); @@ -1370,22 +1351,6 @@ static int __init init_hw_perf_events(void) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, 0, x86_pmu.num_counters, 0); - if (x86_pmu.event_constraints) { - /* - * event on fixed counter2 (REF_CYCLES) only works on this - * counter, so do not extend mask to generic counters - */ - for_each_event_constraint(c, x86_pmu.event_constraints) { - if (c->cmask != X86_RAW_EVENT_MASK - || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) { - continue; - } - - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; - c->weight += x86_pmu.num_counters; - } - } - x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ x86_pmu_format_group.attrs = x86_pmu.format_attrs; @@ -1620,8 +1585,8 @@ static int x86_pmu_event_idx(struct perf_event *event) if (!x86_pmu.attr_rdpmc) return 0; - if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) { - idx -= X86_PMC_IDX_FIXED; + if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { + idx -= INTEL_PMC_IDX_FIXED; idx |= 1 << 30; } @@ -1649,7 +1614,12 @@ static ssize_t set_attr_rdpmc(struct device *cdev, struct device_attribute *attr, const char *buf, size_t count) { - unsigned long val = simple_strtoul(buf, NULL, 0); + unsigned long val; + ssize_t ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; if (!!val != !!x86_pmu.attr_rdpmc) { x86_pmu.attr_rdpmc = !!val; @@ -1682,13 +1652,20 @@ static void x86_pmu_flush_branch_stack(void) x86_pmu.flush_branch_stack(); } +void perf_check_microcode(void) +{ + if (x86_pmu.check_microcode) + x86_pmu.check_microcode(); +} +EXPORT_SYMBOL_GPL(perf_check_microcode); + static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, - .attr_groups = x86_pmu_attr_groups, + .attr_groups = x86_pmu_attr_groups, - .event_init = x86_pmu_event_init, + .event_init = x86_pmu_event_init, .add = x86_pmu_add, .del = x86_pmu_del, @@ -1696,11 +1673,11 @@ static struct pmu pmu = { .stop = x86_pmu_stop, .read = x86_pmu_read, - .start_txn = x86_pmu_start_txn, - .cancel_txn = x86_pmu_cancel_txn, - .commit_txn = x86_pmu_commit_txn, + .start_txn = x86_pmu_start_txn, + .cancel_txn = x86_pmu_cancel_txn, + .commit_txn = x86_pmu_commit_txn, - .event_idx = x86_pmu_event_idx, + .event_idx = x86_pmu_event_idx, .flush_branch_stack = x86_pmu_flush_branch_stack, }; @@ -1763,6 +1740,29 @@ valid_user_frame(const void __user *fp, unsigned long size) return (__range_not_ok(fp, size, TASK_SIZE) == 0); } +static unsigned long get_segment_base(unsigned int segment) +{ + struct desc_struct *desc; + int idx = segment >> 3; + + if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { + if (idx > LDT_ENTRIES) + return 0; + + if (idx > current->active_mm->context.size) + return 0; + + desc = current->active_mm->context.ldt; + } else { + if (idx > GDT_ENTRIES) + return 0; + + desc = __this_cpu_ptr(&gdt_page.gdt[0]); + } + + return get_desc_base(desc + idx); +} + #ifdef CONFIG_COMPAT #include <asm/compat.h> @@ -1771,13 +1771,17 @@ static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) { /* 32-bit process in 64-bit kernel. */ + unsigned long ss_base, cs_base; struct stack_frame_ia32 frame; const void __user *fp; if (!test_thread_flag(TIF_IA32)) return 0; - fp = compat_ptr(regs->bp); + cs_base = get_segment_base(regs->cs); + ss_base = get_segment_base(regs->ss); + + fp = compat_ptr(ss_base + regs->bp); while (entry->nr < PERF_MAX_STACK_DEPTH) { unsigned long bytes; frame.next_frame = 0; @@ -1790,8 +1794,8 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) if (!valid_user_frame(fp, sizeof(frame))) break; - perf_callchain_store(entry, frame.return_address); - fp = compat_ptr(frame.next_frame); + perf_callchain_store(entry, cs_base + frame.return_address); + fp = compat_ptr(ss_base + frame.next_frame); } return 1; } @@ -1814,6 +1818,12 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) return; } + /* + * We don't know what to do with VM86 stacks.. ignore them for now. + */ + if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) + return; + fp = (void __user *)regs->bp; perf_callchain_store(entry, regs->ip); @@ -1841,16 +1851,50 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) } } -unsigned long perf_instruction_pointer(struct pt_regs *regs) +/* + * Deal with code segment offsets for the various execution modes: + * + * VM86 - the good olde 16 bit days, where the linear address is + * 20 bits and we use regs->ip + 0x10 * regs->cs. + * + * IA32 - Where we need to look at GDT/LDT segment descriptor tables + * to figure out what the 32bit base address is. + * + * X32 - has TIF_X32 set, but is running in x86_64 + * + * X86_64 - CS,DS,SS,ES are all zero based. + */ +static unsigned long code_segment_base(struct pt_regs *regs) { - unsigned long ip; + /* + * If we are in VM86 mode, add the segment offset to convert to a + * linear address. + */ + if (regs->flags & X86_VM_MASK) + return 0x10 * regs->cs; + + /* + * For IA32 we look at the GDT/LDT segment base to convert the + * effective IP to a linear address. + */ +#ifdef CONFIG_X86_32 + if (user_mode(regs) && regs->cs != __USER_CS) + return get_segment_base(regs->cs); +#else + if (test_thread_flag(TIF_IA32)) { + if (user_mode(regs) && regs->cs != __USER32_CS) + return get_segment_base(regs->cs); + } +#endif + return 0; +} +unsigned long perf_instruction_pointer(struct pt_regs *regs) +{ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) - ip = perf_guest_cbs->get_guest_ip(); - else - ip = instruction_pointer(regs); + return perf_guest_cbs->get_guest_ip(); - return ip; + return regs->ip + code_segment_base(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 7241e2f..6605a81 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -14,6 +14,18 @@ #include <linux/perf_event.h> +#if 0 +#undef wrmsrl +#define wrmsrl(msr, val) \ +do { \ + unsigned int _msr = (msr); \ + u64 _val = (val); \ + trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr), \ + (unsigned long long)(_val)); \ + native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32)); \ +} while (0) +#endif + /* * | NHM/WSM | SNB | * register ------------------------------- @@ -57,7 +69,7 @@ struct amd_nb { }; /* The maximal number of PEBS events: */ -#define MAX_PEBS_EVENTS 4 +#define MAX_PEBS_EVENTS 8 /* * A debug store configuration. @@ -349,6 +361,8 @@ struct x86_pmu { void (*cpu_starting)(int cpu); void (*cpu_dying)(int cpu); void (*cpu_dead)(int cpu); + + void (*check_microcode)(void); void (*flush_branch_stack)(void); /* @@ -360,12 +374,16 @@ struct x86_pmu { /* * Intel DebugStore bits */ - int bts, pebs; - int bts_active, pebs_active; + unsigned int bts :1, + bts_active :1, + pebs :1, + pebs_active :1, + pebs_broken :1; int pebs_record_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); + int max_pebs_events; /* * Intel LBR @@ -468,6 +486,8 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, void x86_pmu_enable_all(int added); +int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int *assign); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); void x86_pmu_stop(struct perf_event *event, int flags); @@ -496,6 +516,26 @@ static inline bool kernel_ip(unsigned long ip) #endif } +/* + * Not all PMUs provide the right context information to place the reported IP + * into full context. Specifically segment registers are typically not + * supplied. + * + * Assuming the address is a linear address (it is for IBS), we fake the CS and + * vm86 mode using the known zero-based code segment and 'fix up' the registers + * to reflect this. + * + * Intel PEBS/LBR appear to typically provide the effective address, nothing + * much we can do about that but pray and treat it like a linear address. + */ +static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) +{ + regs->cs = kernel_ip(ip) ? __KERNEL_CS : __USER_CS; + if (regs->flags & X86_VM_MASK) + regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK); + regs->ip = ip; +} + #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 11a4eb9..4528ae7 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -366,7 +366,7 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; - if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15) + if (boot_cpu_data.x86_max_cores < 2) return; nb_id = amd_get_nb_id(cpu); @@ -422,35 +422,6 @@ static struct attribute *amd_format_attr[] = { NULL, }; -static __initconst const struct x86_pmu amd_pmu = { - .name = "AMD", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, - .disable = x86_pmu_disable_event, - .hw_config = amd_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_K7_EVNTSEL0, - .perfctr = MSR_K7_PERFCTR0, - .event_map = amd_pmu_event_map, - .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = AMD64_NUM_COUNTERS, - .cntval_bits = 48, - .cntval_mask = (1ULL << 48) - 1, - .apic = 1, - /* use highest bit to detect overflow */ - .max_period = (1ULL << 47) - 1, - .get_event_constraints = amd_get_event_constraints, - .put_event_constraints = amd_put_event_constraints, - - .format_attrs = amd_format_attr, - - .cpu_prepare = amd_pmu_cpu_prepare, - .cpu_starting = amd_pmu_cpu_starting, - .cpu_dead = amd_pmu_cpu_dead, -}; - /* AMD Family 15h */ #define AMD_EVENT_TYPE_MASK 0x000000F0ULL @@ -597,8 +568,8 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev } } -static __initconst const struct x86_pmu amd_pmu_f15h = { - .name = "AMD Family 15h", +static __initconst const struct x86_pmu amd_pmu = { + .name = "AMD", .handle_irq = x86_pmu_handle_irq, .disable_all = x86_pmu_disable_all, .enable_all = x86_pmu_enable_all, @@ -606,50 +577,68 @@ static __initconst const struct x86_pmu amd_pmu_f15h = { .disable = x86_pmu_disable_event, .hw_config = amd_pmu_hw_config, .schedule_events = x86_schedule_events, - .eventsel = MSR_F15H_PERF_CTL, - .perfctr = MSR_F15H_PERF_CTR, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = AMD64_NUM_COUNTERS_F15H, + .num_counters = AMD64_NUM_COUNTERS, .cntval_bits = 48, .cntval_mask = (1ULL << 48) - 1, .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, - .get_event_constraints = amd_get_event_constraints_f15h, - /* nortbridge counters not yet implemented: */ -#if 0 + .get_event_constraints = amd_get_event_constraints, .put_event_constraints = amd_put_event_constraints, + .format_attrs = amd_format_attr, + .cpu_prepare = amd_pmu_cpu_prepare, - .cpu_dead = amd_pmu_cpu_dead, -#endif .cpu_starting = amd_pmu_cpu_starting, - .format_attrs = amd_format_attr, + .cpu_dead = amd_pmu_cpu_dead, }; +static int setup_event_constraints(void) +{ + if (boot_cpu_data.x86 >= 0x15) + x86_pmu.get_event_constraints = amd_get_event_constraints_f15h; + return 0; +} + +static int setup_perfctr_core(void) +{ + if (!cpu_has_perfctr_core) { + WARN(x86_pmu.get_event_constraints == amd_get_event_constraints_f15h, + KERN_ERR "Odd, counter constraints enabled but no core perfctrs detected!"); + return -ENODEV; + } + + WARN(x86_pmu.get_event_constraints == amd_get_event_constraints, + KERN_ERR "hw perf events core counters need constraints handler!"); + + /* + * If core performance counter extensions exists, we must use + * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also + * x86_pmu_addr_offset(). + */ + x86_pmu.eventsel = MSR_F15H_PERF_CTL; + x86_pmu.perfctr = MSR_F15H_PERF_CTR; + x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; + + printk(KERN_INFO "perf: AMD core performance counters detected\n"); + + return 0; +} + __init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ if (boot_cpu_data.x86 < 6) return -ENODEV; - /* - * If core performance counter extensions exists, it must be - * family 15h, otherwise fail. See x86_pmu_addr_offset(). - */ - switch (boot_cpu_data.x86) { - case 0x15: - if (!cpu_has_perfctr_core) - return -ENODEV; - x86_pmu = amd_pmu_f15h; - break; - default: - if (cpu_has_perfctr_core) - return -ENODEV; - x86_pmu = amd_pmu; - break; - } + x86_pmu = amd_pmu; + + setup_event_constraints(); + setup_perfctr_core(); /* Events are common for all AMDs */ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index da9bcdc..7bfb5be 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -13,6 +13,8 @@ #include <asm/apic.h> +#include "perf_event.h" + static u32 ibs_caps; #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) @@ -536,7 +538,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) { regs.flags &= ~PERF_EFLAGS_EXACT; } else { - instruction_pointer_set(®s, ibs_data.regs[1]); + set_linear_ip(®s, ibs_data.regs[1]); regs.flags |= PERF_EFLAGS_EXACT; } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 187c294..3823669 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -5,6 +5,8 @@ * among events on a single PMU. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/stddef.h> #include <linux/types.h> #include <linux/init.h> @@ -21,14 +23,14 @@ */ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, - [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, + [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ }; static struct event_constraint intel_core_event_constraints[] __read_mostly = @@ -136,6 +138,84 @@ static u64 intel_pmu_event_map(int hw_event) return intel_perfmon_event_map[hw_event]; } +#define SNB_DMND_DATA_RD (1ULL << 0) +#define SNB_DMND_RFO (1ULL << 1) +#define SNB_DMND_IFETCH (1ULL << 2) +#define SNB_DMND_WB (1ULL << 3) +#define SNB_PF_DATA_RD (1ULL << 4) +#define SNB_PF_RFO (1ULL << 5) +#define SNB_PF_IFETCH (1ULL << 6) +#define SNB_LLC_DATA_RD (1ULL << 7) +#define SNB_LLC_RFO (1ULL << 8) +#define SNB_LLC_IFETCH (1ULL << 9) +#define SNB_BUS_LOCKS (1ULL << 10) +#define SNB_STRM_ST (1ULL << 11) +#define SNB_OTHER (1ULL << 15) +#define SNB_RESP_ANY (1ULL << 16) +#define SNB_NO_SUPP (1ULL << 17) +#define SNB_LLC_HITM (1ULL << 18) +#define SNB_LLC_HITE (1ULL << 19) +#define SNB_LLC_HITS (1ULL << 20) +#define SNB_LLC_HITF (1ULL << 21) +#define SNB_LOCAL (1ULL << 22) +#define SNB_REMOTE (0xffULL << 23) +#define SNB_SNP_NONE (1ULL << 31) +#define SNB_SNP_NOT_NEEDED (1ULL << 32) +#define SNB_SNP_MISS (1ULL << 33) +#define SNB_NO_FWD (1ULL << 34) +#define SNB_SNP_FWD (1ULL << 35) +#define SNB_HITM (1ULL << 36) +#define SNB_NON_DRAM (1ULL << 37) + +#define SNB_DMND_READ (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD) +#define SNB_DMND_WRITE (SNB_DMND_RFO|SNB_LLC_RFO) +#define SNB_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) + +#define SNB_SNP_ANY (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \ + SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \ + SNB_HITM) + +#define SNB_DRAM_ANY (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY) +#define SNB_DRAM_REMOTE (SNB_REMOTE|SNB_SNP_ANY) + +#define SNB_L3_ACCESS SNB_RESP_ANY +#define SNB_L3_MISS (SNB_DRAM_ANY|SNB_NON_DRAM) + +static __initconst const u64 snb_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS, + [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_L3_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS, + [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_L3_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS, + [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_L3_MISS, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY, + [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_DRAM_REMOTE, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY, + [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY, + [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE, + }, + }, +}; + static __initconst const u64 snb_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -233,16 +313,16 @@ static __initconst const u64 snb_hw_cache_event_ids }, [ C(NODE) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, }, }, @@ -747,7 +827,7 @@ static void intel_pmu_disable_all(void) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) intel_pmu_disable_bts(); intel_pmu_pebs_disable_all(); @@ -763,9 +843,9 @@ static void intel_pmu_enable_all(int added) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { struct perf_event *event = - cpuc->events[X86_PMC_IDX_FIXED_BTS]; + cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; if (WARN_ON_ONCE(!event)) return; @@ -871,7 +951,7 @@ static inline void intel_pmu_ack_status(u64 ack) static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) { - int idx = hwc->idx - X86_PMC_IDX_FIXED; + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; u64 ctrl_val, mask; mask = 0xfULL << (idx * 4); @@ -886,7 +966,7 @@ static void intel_pmu_disable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { + if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { intel_pmu_disable_bts(); intel_pmu_drain_bts_buffer(); return; @@ -915,7 +995,7 @@ static void intel_pmu_disable_event(struct perf_event *event) static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) { - int idx = hwc->idx - X86_PMC_IDX_FIXED; + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; /* @@ -949,7 +1029,7 @@ static void intel_pmu_enable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { + if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { if (!__this_cpu_read(cpu_hw_events.enabled)) return; @@ -1000,14 +1080,14 @@ static void intel_pmu_reset(void) local_irq_save(flags); - printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + pr_info("clearing PMU state on CPU#%d\n", smp_processor_id()); for (idx = 0; idx < x86_pmu.num_counters; idx++) { - checking_wrmsrl(x86_pmu_config_addr(idx), 0ull); - checking_wrmsrl(x86_pmu_event_addr(idx), 0ull); + wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); + wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); } for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) - checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); if (ds) ds->bts_index = ds->bts_buffer_base; @@ -1707,16 +1787,61 @@ static __init void intel_clovertown_quirk(void) * But taken together it might just make sense to not enable PEBS on * these chips. */ - printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); + pr_warn("PEBS disabled due to CPU errata\n"); x86_pmu.pebs = 0; x86_pmu.pebs_constraints = NULL; } +static int intel_snb_pebs_broken(int cpu) +{ + u32 rev = UINT_MAX; /* default to broken for unknown models */ + + switch (cpu_data(cpu).x86_model) { + case 42: /* SNB */ + rev = 0x28; + break; + + case 45: /* SNB-EP */ + switch (cpu_data(cpu).x86_mask) { + case 6: rev = 0x618; break; + case 7: rev = 0x70c; break; + } + } + + return (cpu_data(cpu).microcode < rev); +} + +static void intel_snb_check_microcode(void) +{ + int pebs_broken = 0; + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) { + if ((pebs_broken = intel_snb_pebs_broken(cpu))) + break; + } + put_online_cpus(); + + if (pebs_broken == x86_pmu.pebs_broken) + return; + + /* + * Serialized by the microcode lock.. + */ + if (x86_pmu.pebs_broken) { + pr_info("PEBS enabled due to microcode update\n"); + x86_pmu.pebs_broken = 0; + } else { + pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n"); + x86_pmu.pebs_broken = 1; + } +} + static __init void intel_sandybridge_quirk(void) { - printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); - x86_pmu.pebs = 0; - x86_pmu.pebs_constraints = NULL; + x86_pmu.check_microcode = intel_snb_check_microcode; + intel_snb_check_microcode(); } static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { @@ -1736,8 +1861,8 @@ static __init void intel_arch_events_quirk(void) /* disable event that reported as not presend by cpuid */ for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; - printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n", - intel_arch_events_map[bit].name); + pr_warn("CPUID marked event: \'%s\' unavailable\n", + intel_arch_events_map[bit].name); } } @@ -1756,7 +1881,7 @@ static __init void intel_nehalem_quirk(void) intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; ebx.split.no_branch_misses_retired = 0; x86_pmu.events_maskl = ebx.full; - printk(KERN_INFO "CPU erratum AAJ80 worked around\n"); + pr_info("CPU erratum AAJ80 worked around\n"); } } @@ -1765,6 +1890,7 @@ __init int intel_pmu_init(void) union cpuid10_edx edx; union cpuid10_eax eax; union cpuid10_ebx ebx; + struct event_constraint *c; unsigned int unused; int version; @@ -1800,6 +1926,8 @@ __init int intel_pmu_init(void) x86_pmu.events_maskl = ebx.full; x86_pmu.events_mask_len = eax.split.mask_length; + x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); + /* * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events: @@ -1914,6 +2042,8 @@ __init int intel_pmu_init(void) case 58: /* IvyBridge */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); intel_pmu_lbr_init_snb(); @@ -1951,5 +2081,37 @@ __init int intel_pmu_init(void) } } + if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", + x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); + x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; + } + x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + + if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { + WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", + x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; + } + + x86_pmu.intel_ctrl |= + ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + + if (x86_pmu.event_constraints) { + /* + * event on fixed counter2 (REF_CYCLES) only works on this + * counter, so do not extend mask to generic counters + */ + for_each_event_constraint(c, x86_pmu.event_constraints) { + if (c->cmask != X86_RAW_EVENT_MASK + || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) { + continue; + } + + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; + c->weight += x86_pmu.num_counters; + } + } + return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 35e2192..e38d97b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -248,7 +248,7 @@ void reserve_ds_buffers(void) */ struct event_constraint bts_constraint = - EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); + EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0); void intel_pmu_enable_bts(u64 config) { @@ -295,7 +295,7 @@ int intel_pmu_drain_bts_buffer(void) u64 to; u64 flags; }; - struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; + struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; struct bts_record *at, *top; struct perf_output_handle handle; struct perf_event_header header; @@ -499,7 +499,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) * We sampled a branch insn, rewind using the LBR stack */ if (ip == to) { - regs->ip = from; + set_linear_ip(regs, from); return 1; } @@ -529,7 +529,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) } while (to < ip); if (to == ip) { - regs->ip = old_to; + set_linear_ip(regs, old_to); return 1; } @@ -569,7 +569,8 @@ static void __intel_pmu_pebs_event(struct perf_event *event, * A possible PERF_SAMPLE_REGS will have to transfer all regs. */ regs = *iregs; - regs.ip = pebs->ip; + regs.flags = pebs->flags; + set_linear_ip(®s, pebs->ip); regs.bp = pebs->bp; regs.sp = pebs->sp; @@ -620,7 +621,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) * Should not happen, we program the threshold at 1 and do not * set a reset value. */ - WARN_ON_ONCE(n > 1); + WARN_ONCE(n > 1, "bad leftover pebs %d\n", n); at += n - 1; __intel_pmu_pebs_event(event, iregs, at); @@ -651,10 +652,10 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) * Should not happen, we program the threshold at 1 and do not * set a reset value. */ - WARN_ON_ONCE(n > MAX_PEBS_EVENTS); + WARN_ONCE(n > x86_pmu.max_pebs_events, "Unexpected number of pebs records %d\n", n); for ( ; at < top; at++) { - for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) { + for_each_set_bit(bit, (unsigned long *)&at->status, x86_pmu.max_pebs_events) { event = cpuc->events[bit]; if (!test_bit(bit, cpuc->active_mask)) continue; @@ -670,7 +671,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) break; } - if (!event || bit >= MAX_PEBS_EVENTS) + if (!event || bit >= x86_pmu.max_pebs_events) continue; __intel_pmu_pebs_event(event, iregs, at); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c new file mode 100644 index 0000000..7563fda --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -0,0 +1,2900 @@ +#include "perf_event_intel_uncore.h" + +static struct intel_uncore_type *empty_uncore[] = { NULL, }; +static struct intel_uncore_type **msr_uncores = empty_uncore; +static struct intel_uncore_type **pci_uncores = empty_uncore; +/* pci bus to socket mapping */ +static int pcibus_to_physid[256] = { [0 ... 255] = -1, }; + +static DEFINE_RAW_SPINLOCK(uncore_box_lock); + +/* mask of cpus that collect uncore events */ +static cpumask_t uncore_cpu_mask; + +/* constraint for the fixed counter */ +static struct event_constraint constraint_fixed = + EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL); +static struct event_constraint constraint_empty = + EVENT_CONSTRAINT(0, 0, 0); + +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); +DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); +DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31"); + +static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + u64 count; + + rdmsrl(event->hw.event_base, count); + + return count; +} + +/* + * generic get constraint function for shared match/mask registers. + */ +static struct event_constraint * +uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + unsigned long flags; + bool ok = false; + + /* + * reg->alloc can be set due to existing state, so for fake box we + * need to ignore this, otherwise we might fail to allocate proper + * fake state for this extra reg constraint. + */ + if (reg1->idx == EXTRA_REG_NONE || + (!uncore_box_is_fake(box) && reg1->alloc)) + return NULL; + + er = &box->shared_regs[reg1->idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (!atomic_read(&er->ref) || + (er->config1 == reg1->config && er->config2 == reg2->config)) { + atomic_inc(&er->ref); + er->config1 = reg1->config; + er->config2 = reg2->config; + ok = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (ok) { + if (!uncore_box_is_fake(box)) + reg1->alloc = 1; + return NULL; + } + + return &constraint_empty; +} + +static void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + + /* + * Only put constraint if extra reg was actually allocated. Also + * takes care of event which do not use an extra shared reg. + * + * Also, if this is a fake box we shouldn't touch any event state + * (reg->alloc) and we don't care about leaving inconsistent box + * state either since it will be thrown out. + */ + if (uncore_box_is_fake(box) || !reg1->alloc) + return; + + er = &box->shared_regs[reg1->idx]; + atomic_dec(&er->ref); + reg1->alloc = 0; +} + +/* Sandy Bridge-EP uncore support */ +static struct intel_uncore_type snbep_uncore_cbox; +static struct intel_uncore_type snbep_uncore_pcu; + +static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config; + + pci_read_config_dword(pdev, box_ctl, &config); + config |= SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); +} + +static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config; + + pci_read_config_dword(pdev, box_ctl, &config); + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); +} + +static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config); +} + +static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count; + + pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); + pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); + + return count; +} + +static void snbep_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + + pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT); +} + +static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); + config |= SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + } +} + +static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + } +} + +static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) + wrmsrl(reg1->reg, reg1->config); + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); +} + +static void snbep_uncore_msr_init_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + + if (msr) + wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT); +} + +static int snbep_uncore_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (box->pmu->type == &snbep_uncore_cbox) { + reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + + SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & + SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK; + } else { + if (box->pmu->type == &snbep_uncore_pcu) { + reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER; + reg1->config = event->attr.config1 & SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK; + } else { + return 0; + } + } + reg1->idx = 0; + + return 0; +} + +static struct attribute *snbep_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute *snbep_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + NULL, +}; + +static struct attribute *snbep_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid.attr, + &format_attr_filter_nid.attr, + &format_attr_filter_state.attr, + &format_attr_filter_opc.attr, + NULL, +}; + +static struct attribute *snbep_uncore_pcu_formats_attr[] = { + &format_attr_event.attr, + &format_attr_occ_sel.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge.attr, + &format_attr_filter_band0.attr, + &format_attr_filter_band1.attr, + &format_attr_filter_band2.attr, + &format_attr_filter_band3.attr, + NULL, +}; + +static struct attribute *snbep_uncore_qpi_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct uncore_event_desc snbep_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), + { /* end: all zeroes */ }, +}; + +static struct uncore_event_desc snbep_uncore_qpi_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"), + INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"), + INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x02,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x03,umask=0x04"), + { /* end: all zeroes */ }, +}; + +static struct attribute_group snbep_uncore_format_group = { + .name = "format", + .attrs = snbep_uncore_formats_attr, +}; + +static struct attribute_group snbep_uncore_ubox_format_group = { + .name = "format", + .attrs = snbep_uncore_ubox_formats_attr, +}; + +static struct attribute_group snbep_uncore_cbox_format_group = { + .name = "format", + .attrs = snbep_uncore_cbox_formats_attr, +}; + +static struct attribute_group snbep_uncore_pcu_format_group = { + .name = "format", + .attrs = snbep_uncore_pcu_formats_attr, +}; + +static struct attribute_group snbep_uncore_qpi_format_group = { + .name = "format", + .attrs = snbep_uncore_qpi_formats_attr, +}; + +static struct intel_uncore_ops snbep_uncore_msr_ops = { + .init_box = snbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = snbep_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, + .hw_config = snbep_uncore_hw_config, +}; + +static struct intel_uncore_ops snbep_uncore_pci_ops = { + .init_box = snbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_uncore_pci_enable_event, + .read_counter = snbep_uncore_pci_read_counter, +}; + +static struct event_constraint snbep_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x01, 0x1), + UNCORE_EVENT_CONSTRAINT(0x02, 0x3), + UNCORE_EVENT_CONSTRAINT(0x04, 0x3), + UNCORE_EVENT_CONSTRAINT(0x05, 0x3), + UNCORE_EVENT_CONSTRAINT(0x07, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x3), + UNCORE_EVENT_CONSTRAINT(0x1b, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1c, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1d, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1e, 0xc), + EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x35, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x1), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r3qpi_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x20, 0x3), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x22, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x30, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x3), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snbep_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNBEP_U_MSR_PMON_CTR0, + .event_ctl = SNBEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_ubox_format_group, +}; + +static struct intel_uncore_type snbep_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 44, + .event_ctl = SNBEP_C0_MSR_PMON_CTL0, + .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = SNBEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = snbep_uncore_cbox_constraints, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_cbox_format_group, +}; + +static struct intel_uncore_type snbep_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, + .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, + .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *snbep_msr_uncores[] = { + &snbep_uncore_ubox, + &snbep_uncore_cbox, + &snbep_uncore_pcu, + NULL, +}; + +#define SNBEP_UNCORE_PCI_COMMON_INIT() \ + .perf_ctr = SNBEP_PCI_PMON_CTR0, \ + .event_ctl = SNBEP_PCI_PMON_CTL0, \ + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \ + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ + .ops = &snbep_uncore_pci_ops, \ + .format_group = &snbep_uncore_format_group + +static struct intel_uncore_type snbep_uncore_ha = { + .name = "ha", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_imc = { + .name = "imc", + .num_counters = 4, + .num_boxes = 4, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = snbep_uncore_imc_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_qpi = { + .name = "qpi", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &snbep_uncore_pci_ops, + .event_descs = snbep_uncore_qpi_events, + .format_group = &snbep_uncore_qpi_format_group, +}; + + +static struct intel_uncore_type snbep_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 3, + .num_boxes = 2, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r3qpi_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type *snbep_pci_uncores[] = { + &snbep_uncore_ha, + &snbep_uncore_imc, + &snbep_uncore_qpi, + &snbep_uncore_r2pcie, + &snbep_uncore_r3qpi, + NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = { + { /* Home Agent */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), + .driver_data = (unsigned long)&snbep_uncore_ha, + }, + { /* MC Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* MC Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* MC Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* MC Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* QPI Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0), + .driver_data = (unsigned long)&snbep_uncore_qpi, + }, + { /* QPI Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1), + .driver_data = (unsigned long)&snbep_uncore_qpi, + }, + { /* P2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE), + .driver_data = (unsigned long)&snbep_uncore_r2pcie, + }, + { /* R3QPI Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0), + .driver_data = (unsigned long)&snbep_uncore_r3qpi, + }, + { /* R3QPI Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1), + .driver_data = (unsigned long)&snbep_uncore_r3qpi, + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver snbep_uncore_pci_driver = { + .name = "snbep_uncore", + .id_table = snbep_uncore_pci_ids, +}; + +/* + * build pci bus to socket mapping + */ +static void snbep_pci2phy_map_init(void) +{ + struct pci_dev *ubox_dev = NULL; + int i, bus, nodeid; + u32 config; + + while (1) { + /* find the UBOX device */ + ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX, + ubox_dev); + if (!ubox_dev) + break; + bus = ubox_dev->bus->number; + /* get the Node ID of the local register */ + pci_read_config_dword(ubox_dev, 0x40, &config); + nodeid = config; + /* get the Node ID mapping */ + pci_read_config_dword(ubox_dev, 0x54, &config); + /* + * every three bits in the Node ID mapping register maps + * to a particular node. + */ + for (i = 0; i < 8; i++) { + if (nodeid == ((config >> (3 * i)) & 0x7)) { + pcibus_to_physid[bus] = i; + break; + } + } + }; + return; +} +/* end of Sandy Bridge-EP uncore support */ + +/* Sandy Bridge uncore support */ +static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, SNB_UNC_CTL_EN); +} + +static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + wrmsrl(event->hw.config_base, 0); +} + +static void snb_uncore_msr_init_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) { + wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); + } +} + +static struct attribute *snb_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask5.attr, + NULL, +}; + +static struct attribute_group snb_uncore_format_group = { + .name = "format", + .attrs = snb_uncore_formats_attr, +}; + +static struct intel_uncore_ops snb_uncore_msr_ops = { + .init_box = snb_uncore_msr_init_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = snb_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct event_constraint snb_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x80, 0x1), + UNCORE_EVENT_CONSTRAINT(0x83, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snb_uncore_cbox = { + .name = "cbox", + .num_counters = 2, + .num_boxes = 4, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNB_UNC_CBO_0_PER_CTR0, + .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, + .fixed_ctr = SNB_UNC_FIXED_CTR, + .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL, + .single_fixed = 1, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_CBO_MSR_OFFSET, + .constraints = snb_uncore_cbox_constraints, + .ops = &snb_uncore_msr_ops, + .format_group = &snb_uncore_format_group, +}; + +static struct intel_uncore_type *snb_msr_uncores[] = { + &snb_uncore_cbox, + NULL, +}; +/* end of Sandy Bridge uncore support */ + +/* Nehalem uncore support */ +static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0); +} + +static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC); +} + +static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN); +} + +static struct attribute *nhm_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask8.attr, + NULL, +}; + +static struct attribute_group nhm_uncore_format_group = { + .name = "format", + .attrs = nhm_uncore_formats_attr, +}; + +static struct uncore_event_desc nhm_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhm_uncore_msr_ops = { + .disable_box = nhm_uncore_msr_disable_box, + .enable_box = nhm_uncore_msr_enable_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = nhm_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct intel_uncore_type nhm_uncore = { + .name = "", + .num_counters = 8, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .event_ctl = NHM_UNC_PERFEVTSEL0, + .perf_ctr = NHM_UNC_UNCORE_PMC0, + .fixed_ctr = NHM_UNC_FIXED_CTR, + .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL, + .event_mask = NHM_UNC_RAW_EVENT_MASK, + .event_descs = nhm_uncore_events, + .ops = &nhm_uncore_msr_ops, + .format_group = &nhm_uncore_format_group, +}; + +static struct intel_uncore_type *nhm_msr_uncores[] = { + &nhm_uncore, + NULL, +}; +/* end of Nehalem uncore support */ + +/* Nehalem-EX uncore support */ +#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \ + ((1ULL << (n)) - 1))) + +DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5"); +DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7"); +DEFINE_UNCORE_FORMAT_ATTR(mm_cfg, mm_cfg, "config:63"); +DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63"); + +static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box) +{ + wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL); +} + +static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + u64 config; + + if (msr) { + rdmsrl(msr, config); + config &= ~((1ULL << uncore_num_counters(box)) - 1); + /* WBox has a fixed counter */ + if (uncore_msr_fixed_ctl(box)) + config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN; + wrmsrl(msr, config); + } +} + +static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + u64 config; + + if (msr) { + rdmsrl(msr, config); + config |= (1ULL << uncore_num_counters(box)) - 1; + /* WBox has a fixed counter */ + if (uncore_msr_fixed_ctl(box)) + config |= NHMEX_W_PMON_GLOBAL_FIXED_EN; + wrmsrl(msr, config); + } +} + +static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + wrmsrl(event->hw.config_base, 0); +} + +static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx >= UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0); + else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0) + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); + else + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); +} + +#define NHMEX_UNCORE_OPS_COMMON_INIT() \ + .init_box = nhmex_uncore_msr_init_box, \ + .disable_box = nhmex_uncore_msr_disable_box, \ + .enable_box = nhmex_uncore_msr_enable_box, \ + .disable_event = nhmex_uncore_msr_disable_event, \ + .read_counter = uncore_msr_read_counter + +static struct intel_uncore_ops nhmex_uncore_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_uncore_msr_enable_event, +}; + +static struct attribute *nhmex_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_edge.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_ubox_format_group = { + .name = "format", + .attrs = nhmex_uncore_ubox_formats_attr, +}; + +static struct intel_uncore_type nhmex_uncore_ubox = { + .name = "ubox", + .num_counters = 1, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_U_MSR_PMON_EV_SEL, + .perf_ctr = NHMEX_U_MSR_PMON_CTR, + .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_ubox_format_group +}; + +static struct attribute *nhmex_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_cbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_cbox_formats_attr, +}; + +static struct intel_uncore_type nhmex_uncore_cbox = { + .name = "cbox", + .num_counters = 6, + .num_boxes = 8, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0, + .perf_ctr = NHMEX_C0_MSR_PMON_CTR0, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL, + .msr_offset = NHMEX_C_MSR_OFFSET, + .pair_ctr_ctl = 1, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_cbox_format_group +}; + +static struct uncore_event_desc nhmex_uncore_wbox_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type nhmex_uncore_wbox = { + .name = "wbox", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_W_MSR_PMON_CNT0, + .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0, + .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR, + .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_W_MSR_GLOBAL_CTL, + .pair_ctr_ctl = 1, + .event_descs = nhmex_uncore_wbox_events, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_cbox_format_group +}; + +static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int ctr, ev_sel; + + ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >> + NHMEX_B_PMON_CTR_SHIFT; + ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >> + NHMEX_B_PMON_CTL_EV_SEL_SHIFT; + + /* events that do not use the match/mask registers */ + if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) || + (ctr == 2 && ev_sel != 0x4) || ctr == 3) + return 0; + + if (box->pmu->pmu_idx == 0) + reg1->reg = NHMEX_B0_MSR_MATCH; + else + reg1->reg = NHMEX_B1_MSR_MATCH; + reg1->idx = 0; + reg1->config = event->attr.config1; + reg2->config = event->attr.config2; + return 0; +} + +static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + wrmsrl(reg1->reg, reg1->config); + wrmsrl(reg1->reg + 1, reg2->config); + } + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | + (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK)); +} + +/* + * The Bbox has 4 counters, but each counter monitors different events. + * Use bits 6-7 in the event config to select counter. + */ +static struct event_constraint nhmex_uncore_bbox_constraints[] = { + EVENT_CONSTRAINT(0 , 1, 0xc0), + EVENT_CONSTRAINT(0x40, 2, 0xc0), + EVENT_CONSTRAINT(0x80, 4, 0xc0), + EVENT_CONSTRAINT(0xc0, 8, 0xc0), + EVENT_CONSTRAINT_END, +}; + +static struct attribute *nhmex_uncore_bbox_formats_attr[] = { + &format_attr_event5.attr, + &format_attr_counter.attr, + &format_attr_match.attr, + &format_attr_mask.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_bbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_bbox_formats_attr, +}; + +static struct intel_uncore_ops nhmex_uncore_bbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_bbox_msr_enable_event, + .hw_config = nhmex_bbox_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_bbox = { + .name = "bbox", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_B0_MSR_PMON_CTL0, + .perf_ctr = NHMEX_B0_MSR_PMON_CTR0, + .event_mask = NHMEX_B_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL, + .msr_offset = NHMEX_B_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 1, + .constraints = nhmex_uncore_bbox_constraints, + .ops = &nhmex_uncore_bbox_ops, + .format_group = &nhmex_uncore_bbox_format_group +}; + +static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + + if (event->attr.config & NHMEX_S_PMON_MM_CFG_EN) { + reg1->config = event->attr.config1; + reg2->config = event->attr.config2; + } else { + reg1->config = ~0ULL; + reg2->config = ~0ULL; + } + + if (box->pmu->pmu_idx == 0) + reg1->reg = NHMEX_S0_MSR_MM_CFG; + else + reg1->reg = NHMEX_S1_MSR_MM_CFG; + + reg1->idx = 0; + + return 0; +} + +static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + wrmsrl(reg1->reg, 0); + if (reg1->config != ~0ULL || reg2->config != ~0ULL) { + wrmsrl(reg1->reg + 1, reg1->config); + wrmsrl(reg1->reg + 2, reg2->config); + wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN); + } + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); +} + +static struct attribute *nhmex_uncore_sbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_mm_cfg.attr, + &format_attr_match.attr, + &format_attr_mask.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_sbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_sbox_formats_attr, +}; + +static struct intel_uncore_ops nhmex_uncore_sbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_sbox_msr_enable_event, + .hw_config = nhmex_sbox_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_sbox = { + .name = "sbox", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_S0_MSR_PMON_CTL0, + .perf_ctr = NHMEX_S0_MSR_PMON_CTR0, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL, + .msr_offset = NHMEX_S_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 1, + .ops = &nhmex_uncore_sbox_ops, + .format_group = &nhmex_uncore_sbox_format_group +}; + +enum { + EXTRA_REG_NHMEX_M_FILTER, + EXTRA_REG_NHMEX_M_DSP, + EXTRA_REG_NHMEX_M_ISS, + EXTRA_REG_NHMEX_M_MAP, + EXTRA_REG_NHMEX_M_MSC_THR, + EXTRA_REG_NHMEX_M_PGT, + EXTRA_REG_NHMEX_M_PLD, + EXTRA_REG_NHMEX_M_ZDP_CTL_FVC, +}; + +static struct extra_reg nhmex_uncore_mbox_extra_regs[] = { + MBOX_INC_SEL_EXTAR_REG(0x0, DSP), + MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR), + MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR), + MBOX_INC_SEL_EXTAR_REG(0x9, ISS), + /* event 0xa uses two extra registers */ + MBOX_INC_SEL_EXTAR_REG(0xa, ISS), + MBOX_INC_SEL_EXTAR_REG(0xa, PLD), + MBOX_INC_SEL_EXTAR_REG(0xb, PLD), + /* events 0xd ~ 0x10 use the same extra register */ + MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0x16, PGT), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP), + EVENT_EXTRA_END +}; + +static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + bool ret = false; + u64 mask; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + er = &box->shared_regs[idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (!atomic_read(&er->ref) || er->config == config) { + atomic_inc(&er->ref); + er->config = config; + ret = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + return ret; + } + /* + * The ZDP_CTL_FVC MSR has 4 fields which are used to control + * events 0xd ~ 0x10. Besides these 4 fields, there are additional + * fields which are shared. + */ + idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + if (WARN_ON_ONCE(idx >= 4)) + return false; + + /* mask of the shared fields */ + mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK; + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + + raw_spin_lock_irqsave(&er->lock, flags); + /* add mask of the non-shared field if it's in use */ + if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) + mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + + if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) { + atomic_add(1 << (idx * 8), &er->ref); + mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK | + NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + er->config &= ~mask; + er->config |= (config & mask); + ret = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + return ret; +} + +static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx) +{ + struct intel_uncore_extra_reg *er; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + er = &box->shared_regs[idx]; + atomic_dec(&er->ref); + return; + } + + idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + atomic_sub(1 << (idx * 8), &er->ref); +} + +u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + int idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8); + u64 config = reg1->config; + + /* get the non-shared control bits and shift them */ + idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + if (new_idx > orig_idx) { + idx = new_idx - orig_idx; + config <<= 3 * idx; + } else { + idx = orig_idx - new_idx; + config >>= 3 * idx; + } + + /* add the shared control bits back */ + config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; + if (modify) { + /* adjust the main event selector */ + if (new_idx > orig_idx) + hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; + else + hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; + reg1->config = config; + reg1->idx = ~0xff | new_idx; + } + return config; +} + +static struct event_constraint * +nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + int i, idx[2], alloc = 0; + u64 config1 = reg1->config; + + idx[0] = __BITS_VALUE(reg1->idx, 0, 8); + idx[1] = __BITS_VALUE(reg1->idx, 1, 8); +again: + for (i = 0; i < 2; i++) { + if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i))) + idx[i] = 0xff; + + if (idx[i] == 0xff) + continue; + + if (!nhmex_mbox_get_shared_reg(box, idx[i], + __BITS_VALUE(config1, i, 32))) + goto fail; + alloc |= (0x1 << i); + } + + /* for the match/mask registers */ + if ((uncore_box_is_fake(box) || !reg2->alloc) && + !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config)) + goto fail; + + /* + * If it's a fake box -- as per validate_{group,event}() we + * shouldn't touch event state and we can avoid doing so + * since both will only call get_event_constraints() once + * on each event, this avoids the need for reg->alloc. + */ + if (!uncore_box_is_fake(box)) { + if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) + nhmex_mbox_alter_er(event, idx[0], true); + reg1->alloc |= alloc; + reg2->alloc = 1; + } + return NULL; +fail: + if (idx[0] != 0xff && !(alloc & 0x1) && + idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + /* + * events 0xd ~ 0x10 are functional identical, but are + * controlled by different fields in the ZDP_CTL_FVC + * register. If we failed to take one field, try the + * rest 3 choices. + */ + BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff); + idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + idx[0] = (idx[0] + 1) % 4; + idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) { + config1 = nhmex_mbox_alter_er(event, idx[0], false); + goto again; + } + } + + if (alloc & 0x1) + nhmex_mbox_put_shared_reg(box, idx[0]); + if (alloc & 0x2) + nhmex_mbox_put_shared_reg(box, idx[1]); + return &constraint_empty; +} + +static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + + if (uncore_box_is_fake(box)) + return; + + if (reg1->alloc & 0x1) + nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8)); + if (reg1->alloc & 0x2) + nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8)); + reg1->alloc = 0; + + if (reg2->alloc) { + nhmex_mbox_put_shared_reg(box, reg2->idx); + reg2->alloc = 0; + } +} + +static int nhmex_mbox_extra_reg_idx(struct extra_reg *er) +{ + if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) + return er->idx; + return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd; +} + +static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_type *type = box->pmu->type; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + struct extra_reg *er; + unsigned msr; + int reg_idx = 0; + + if (WARN_ON_ONCE(reg1->idx != -1)) + return -EINVAL; + /* + * The mbox events may require 2 extra MSRs at the most. But only + * the lower 32 bits in these MSRs are significant, so we can use + * config1 to pass two MSRs' config. + */ + for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + if (event->attr.config1 & ~er->valid_mask) + return -EINVAL; + if (er->idx == __BITS_VALUE(reg1->idx, 0, 8) || + er->idx == __BITS_VALUE(reg1->idx, 1, 8)) + continue; + if (WARN_ON_ONCE(reg_idx >= 2)) + return -EINVAL; + + msr = er->msr + type->msr_offset * box->pmu->pmu_idx; + if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff)) + return -EINVAL; + + /* always use the 32~63 bits to pass the PLD config */ + if (er->idx == EXTRA_REG_NHMEX_M_PLD) + reg_idx = 1; + + reg1->idx &= ~(0xff << (reg_idx * 8)); + reg1->reg &= ~(0xffff << (reg_idx * 16)); + reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8); + reg1->reg |= msr << (reg_idx * 16); + reg1->config = event->attr.config1; + reg_idx++; + } + /* use config2 to pass the filter config */ + reg2->idx = EXTRA_REG_NHMEX_M_FILTER; + if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN) + reg2->config = event->attr.config2; + else + reg2->config = ~0ULL; + if (box->pmu->pmu_idx == 0) + reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG; + else + reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG; + + return 0; +} + +static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + u64 config; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) + return box->shared_regs[idx].config; + + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + raw_spin_lock_irqsave(&er->lock, flags); + config = er->config; + raw_spin_unlock_irqrestore(&er->lock, flags); + return config; +} + +static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int idx; + + idx = __BITS_VALUE(reg1->idx, 0, 8); + if (idx != 0xff) + wrmsrl(__BITS_VALUE(reg1->reg, 0, 16), + nhmex_mbox_shared_reg_config(box, idx)); + idx = __BITS_VALUE(reg1->idx, 1, 8); + if (idx != 0xff) + wrmsrl(__BITS_VALUE(reg1->reg, 1, 16), + nhmex_mbox_shared_reg_config(box, idx)); + + wrmsrl(reg2->reg, 0); + if (reg2->config != ~0ULL) { + wrmsrl(reg2->reg + 1, + reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK); + wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK & + (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT)); + wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN); + } + + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); +} + +DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3"); +DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5"); +DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6"); +DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7"); +DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13"); +DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21"); +DEFINE_UNCORE_FORMAT_ATTR(filter_cfg, filter_cfg, "config2:63"); +DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33"); +DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61"); +DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63"); + +static struct attribute *nhmex_uncore_mbox_formats_attr[] = { + &format_attr_count_mode.attr, + &format_attr_storage_mode.attr, + &format_attr_wrap_mode.attr, + &format_attr_flag_mode.attr, + &format_attr_inc_sel.attr, + &format_attr_set_flag_sel.attr, + &format_attr_filter_cfg.attr, + &format_attr_filter_match.attr, + &format_attr_filter_mask.attr, + &format_attr_dsp.attr, + &format_attr_thr.attr, + &format_attr_fvc.attr, + &format_attr_pgt.attr, + &format_attr_map.attr, + &format_attr_iss.attr, + &format_attr_pld.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_mbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_mbox_formats_attr, +}; + +static struct uncore_event_desc nhmex_uncore_mbox_events[] = { + INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"), + INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhmex_uncore_mbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_mbox_msr_enable_event, + .hw_config = nhmex_mbox_hw_config, + .get_constraint = nhmex_mbox_get_constraint, + .put_constraint = nhmex_mbox_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_mbox = { + .name = "mbox", + .num_counters = 6, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_M0_MSR_PMU_CTL0, + .perf_ctr = NHMEX_M0_MSR_PMU_CNT0, + .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL, + .msr_offset = NHMEX_M_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 8, + .event_descs = nhmex_uncore_mbox_events, + .ops = &nhmex_uncore_mbox_ops, + .format_group = &nhmex_uncore_mbox_format_group, +}; + +void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + int port; + + /* adjust the main event selector */ + if (reg1->idx % 2) { + reg1->idx--; + hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + } else { + reg1->idx++; + hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + } + + /* adjust address or config of extra register */ + port = reg1->idx / 6 + box->pmu->pmu_idx * 4; + switch (reg1->idx % 6) { + case 0: + reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG0(port); + break; + case 1: + reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG1(port); + break; + case 2: + /* the 8~15 bits to the 0~7 bits */ + reg1->config >>= 8; + break; + case 3: + /* the 0~7 bits to the 8~15 bits */ + reg1->config <<= 8; + break; + case 4: + reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port); + break; + case 5: + reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port); + break; + }; +} + +/* + * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7. + * An event set consists of 6 events, the 3rd and 4th events in + * an event set use the same extra register. So an event set uses + * 5 extra registers. + */ +static struct event_constraint * +nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + struct intel_uncore_extra_reg *er; + unsigned long flags; + int idx, er_idx; + u64 config1; + bool ok = false; + + if (!uncore_box_is_fake(box) && reg1->alloc) + return NULL; + + idx = reg1->idx % 6; + config1 = reg1->config; +again: + er_idx = idx; + /* the 3rd and 4th events use the same extra register */ + if (er_idx > 2) + er_idx--; + er_idx += (reg1->idx / 6) * 5; + + er = &box->shared_regs[er_idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (idx < 2) { + if (!atomic_read(&er->ref) || er->config == reg1->config) { + atomic_inc(&er->ref); + er->config = reg1->config; + ok = true; + } + } else if (idx == 2 || idx == 3) { + /* + * these two events use different fields in a extra register, + * the 0~7 bits and the 8~15 bits respectively. + */ + u64 mask = 0xff << ((idx - 2) * 8); + if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) || + !((er->config ^ config1) & mask)) { + atomic_add(1 << ((idx - 2) * 8), &er->ref); + er->config &= ~mask; + er->config |= config1 & mask; + ok = true; + } + } else { + if (!atomic_read(&er->ref) || + (er->config == (hwc->config >> 32) && + er->config1 == reg1->config && + er->config2 == reg2->config)) { + atomic_inc(&er->ref); + er->config = (hwc->config >> 32); + er->config1 = reg1->config; + er->config2 = reg2->config; + ok = true; + } + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (!ok) { + /* + * The Rbox events are always in pairs. The paired + * events are functional identical, but use different + * extra registers. If we failed to take an extra + * register, try the alternative. + */ + if (idx % 2) + idx--; + else + idx++; + if (idx != reg1->idx % 6) { + if (idx == 2) + config1 >>= 8; + else if (idx == 3) + config1 <<= 8; + goto again; + } + } else { + if (!uncore_box_is_fake(box)) { + if (idx != reg1->idx % 6) + nhmex_rbox_alter_er(box, event); + reg1->alloc = 1; + } + return NULL; + } + return &constraint_empty; +} + +static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + int idx, er_idx; + + if (uncore_box_is_fake(box) || !reg1->alloc) + return; + + idx = reg1->idx % 6; + er_idx = idx; + if (er_idx > 2) + er_idx--; + er_idx += (reg1->idx / 6) * 5; + + er = &box->shared_regs[er_idx]; + if (idx == 2 || idx == 3) + atomic_sub(1 << ((idx - 2) * 8), &er->ref); + else + atomic_dec(&er->ref); + + reg1->alloc = 0; +} + +static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + int port, idx; + + idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >> + NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + if (idx >= 0x18) + return -EINVAL; + + reg1->idx = idx; + reg1->config = event->attr.config1; + + port = idx / 6 + box->pmu->pmu_idx * 4; + idx %= 6; + switch (idx) { + case 0: + reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG0(port); + break; + case 1: + reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG1(port); + break; + case 2: + case 3: + reg1->reg = NHMEX_R_MSR_PORTN_QLX_CFG(port); + break; + case 4: + case 5: + if (idx == 4) + reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port); + else + reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port); + reg2->config = event->attr.config2; + hwc->config |= event->attr.config & (~0ULL << 32); + break; + }; + return 0; +} + +static u64 nhmex_rbox_shared_reg_config(struct intel_uncore_box *box, int idx) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + u64 config; + + er = &box->shared_regs[idx]; + + raw_spin_lock_irqsave(&er->lock, flags); + config = er->config; + raw_spin_unlock_irqrestore(&er->lock, flags); + + return config; +} + +static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int idx, er_idx; + + idx = reg1->idx % 6; + er_idx = idx; + if (er_idx > 2) + er_idx--; + er_idx += (reg1->idx / 6) * 5; + + switch (idx) { + case 0: + case 1: + wrmsrl(reg1->reg, reg1->config); + break; + case 2: + case 3: + wrmsrl(reg1->reg, nhmex_rbox_shared_reg_config(box, er_idx)); + break; + case 4: + case 5: + wrmsrl(reg1->reg, reg1->config); + wrmsrl(reg1->reg + 1, hwc->config >> 32); + wrmsrl(reg1->reg + 2, reg2->config); + break; + }; + + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | + (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK)); +} + +DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config:32-63"); +DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config1:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15"); +DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31"); + +static struct attribute *nhmex_uncore_rbox_formats_attr[] = { + &format_attr_event5.attr, + &format_attr_xbr_mm_cfg.attr, + &format_attr_xbr_match.attr, + &format_attr_xbr_mask.attr, + &format_attr_qlx_cfg.attr, + &format_attr_iperf_cfg.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_rbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_rbox_formats_attr, +}; + +static struct uncore_event_desc nhmex_uncore_rbox_events[] = { + INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"), + INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"), + INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"), + INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"), + INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"), + INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhmex_uncore_rbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_rbox_msr_enable_event, + .hw_config = nhmex_rbox_hw_config, + .get_constraint = nhmex_rbox_get_constraint, + .put_constraint = nhmex_rbox_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_rbox = { + .name = "rbox", + .num_counters = 8, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_R_MSR_PMON_CTL0, + .perf_ctr = NHMEX_R_MSR_PMON_CNT0, + .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_R_MSR_GLOBAL_CTL, + .msr_offset = NHMEX_R_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 20, + .event_descs = nhmex_uncore_rbox_events, + .ops = &nhmex_uncore_rbox_ops, + .format_group = &nhmex_uncore_rbox_format_group +}; + +static struct intel_uncore_type *nhmex_msr_uncores[] = { + &nhmex_uncore_ubox, + &nhmex_uncore_cbox, + &nhmex_uncore_bbox, + &nhmex_uncore_sbox, + &nhmex_uncore_mbox, + &nhmex_uncore_rbox, + &nhmex_uncore_wbox, + NULL, +}; +/* end of Nehalem-EX uncore support */ + +static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = idx; + hwc->last_tag = ++box->tags[idx]; + + if (hwc->idx == UNCORE_PMC_IDX_FIXED) { + hwc->event_base = uncore_fixed_ctr(box); + hwc->config_base = uncore_fixed_ctl(box); + return; + } + + hwc->config_base = uncore_event_ctl(box, hwc->idx); + hwc->event_base = uncore_perf_ctr(box, hwc->idx); +} + +static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event) +{ + u64 prev_count, new_count, delta; + int shift; + + if (event->hw.idx >= UNCORE_PMC_IDX_FIXED) + shift = 64 - uncore_fixed_ctr_bits(box); + else + shift = 64 - uncore_perf_ctr_bits(box); + + /* the hrtimer might modify the previous event value */ +again: + prev_count = local64_read(&event->hw.prev_count); + new_count = uncore_read_counter(box, event); + if (local64_xchg(&event->hw.prev_count, new_count) != prev_count) + goto again; + + delta = (new_count << shift) - (prev_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); +} + +/* + * The overflow interrupt is unavailable for SandyBridge-EP, is broken + * for SandyBridge. So we use hrtimer to periodically poll the counter + * to avoid overflow. + */ +static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) +{ + struct intel_uncore_box *box; + unsigned long flags; + int bit; + + box = container_of(hrtimer, struct intel_uncore_box, hrtimer); + if (!box->n_active || box->cpu != smp_processor_id()) + return HRTIMER_NORESTART; + /* + * disable local interrupt to prevent uncore_pmu_event_start/stop + * to interrupt the update process + */ + local_irq_save(flags); + + for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX) + uncore_perf_event_update(box, box->events[bit]); + + local_irq_restore(flags); + + hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL)); + return HRTIMER_RESTART; +} + +static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) +{ + __hrtimer_start_range_ns(&box->hrtimer, + ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL), 0, + HRTIMER_MODE_REL_PINNED, 0); +} + +static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_cancel(&box->hrtimer); +} + +static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + box->hrtimer.function = uncore_pmu_hrtimer; +} + +struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cpu) +{ + struct intel_uncore_box *box; + int i, size; + + size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg); + + box = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu)); + if (!box) + return NULL; + + for (i = 0; i < type->num_shared_regs; i++) + raw_spin_lock_init(&box->shared_regs[i].lock); + + uncore_pmu_init_hrtimer(box); + atomic_set(&box->refcnt, 1); + box->cpu = -1; + box->phys_id = -1; + + return box; +} + +static struct intel_uncore_box * +uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) +{ + static struct intel_uncore_box *box; + + box = *per_cpu_ptr(pmu->box, cpu); + if (box) + return box; + + raw_spin_lock(&uncore_box_lock); + list_for_each_entry(box, &pmu->box_list, list) { + if (box->phys_id == topology_physical_package_id(cpu)) { + atomic_inc(&box->refcnt); + *per_cpu_ptr(pmu->box, cpu) = box; + break; + } + } + raw_spin_unlock(&uncore_box_lock); + + return *per_cpu_ptr(pmu->box, cpu); +} + +static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) +{ + return container_of(event->pmu, struct intel_uncore_pmu, pmu); +} + +static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) +{ + /* + * perf core schedules event on the basis of cpu, uncore events are + * collected by one of the cpus inside a physical package. + */ + return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); +} + +static int +uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp) +{ + struct perf_event *event; + int n, max_count; + + max_count = box->pmu->type->num_counters; + if (box->pmu->type->fixed_ctl) + max_count++; + + if (box->n_events >= max_count) + return -EINVAL; + + n = box->n_events; + box->event_list[n] = leader; + n++; + if (!dogrp) + return n; + + list_for_each_entry(event, &leader->sibling_list, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF) + continue; + + if (n >= max_count) + return -EINVAL; + + box->event_list[n] = event; + n++; + } + return n; +} + +static struct event_constraint * +uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_type *type = box->pmu->type; + struct event_constraint *c; + + if (type->ops->get_constraint) { + c = type->ops->get_constraint(box, event); + if (c) + return c; + } + + if (event->hw.config == ~0ULL) + return &constraint_fixed; + + if (type->constraints) { + for_each_event_constraint(c, type->constraints) { + if ((event->hw.config & c->cmask) == c->code) + return c; + } + } + + return &type->unconstrainted; +} + +static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + if (box->pmu->type->ops->put_constraint) + box->pmu->type->ops->put_constraint(box, event); +} + +static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n) +{ + unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; + struct event_constraint *c, *constraints[UNCORE_PMC_IDX_MAX]; + int i, wmin, wmax, ret = 0; + struct hw_perf_event *hwc; + + bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); + + for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { + c = uncore_get_event_constraint(box, box->event_list[i]); + constraints[i] = c; + wmin = min(wmin, c->weight); + wmax = max(wmax, c->weight); + } + + /* fastpath, try to reuse previous register */ + for (i = 0; i < n; i++) { + hwc = &box->event_list[i]->hw; + c = constraints[i]; + + /* never assigned */ + if (hwc->idx == -1) + break; + + /* constraint still honored */ + if (!test_bit(hwc->idx, c->idxmsk)) + break; + + /* not already used */ + if (test_bit(hwc->idx, used_mask)) + break; + + __set_bit(hwc->idx, used_mask); + if (assign) + assign[i] = hwc->idx; + } + /* slow path */ + if (i != n) + ret = perf_assign_events(constraints, n, wmin, wmax, assign); + + if (!assign || ret) { + for (i = 0; i < n; i++) + uncore_put_event_constraint(box, box->event_list[i]); + } + return ret ? -EINVAL : 0; +} + +static void uncore_pmu_event_start(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int idx = event->hw.idx; + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX)) + return; + + event->hw.state = 0; + box->events[idx] = event; + box->n_active++; + __set_bit(idx, box->active_mask); + + local64_set(&event->hw.prev_count, uncore_read_counter(box, event)); + uncore_enable_event(box, event); + + if (box->n_active == 1) { + uncore_enable_box(box); + uncore_pmu_start_hrtimer(box); + } +} + +static void uncore_pmu_event_stop(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (__test_and_clear_bit(hwc->idx, box->active_mask)) { + uncore_disable_event(box, event); + box->n_active--; + box->events[hwc->idx] = NULL; + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + if (box->n_active == 0) { + uncore_disable_box(box); + uncore_pmu_cancel_hrtimer(box); + } + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + uncore_perf_event_update(box, event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int uncore_pmu_event_add(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + int assign[UNCORE_PMC_IDX_MAX]; + int i, n, ret; + + if (!box) + return -ENODEV; + + ret = n = uncore_collect_events(box, event, false); + if (ret < 0) + return ret; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + ret = uncore_assign_events(box, assign, n); + if (ret) + return ret; + + /* save events moving to new counters */ + for (i = 0; i < box->n_events; i++) { + event = box->event_list[i]; + hwc = &event->hw; + + if (hwc->idx == assign[i] && + hwc->last_tag == box->tags[assign[i]]) + continue; + /* + * Ensure we don't accidentally enable a stopped + * counter simply because we rescheduled. + */ + if (hwc->state & PERF_HES_STOPPED) + hwc->state |= PERF_HES_ARCH; + + uncore_pmu_event_stop(event, PERF_EF_UPDATE); + } + + /* reprogram moved events into new counters */ + for (i = 0; i < n; i++) { + event = box->event_list[i]; + hwc = &event->hw; + + if (hwc->idx != assign[i] || + hwc->last_tag != box->tags[assign[i]]) + uncore_assign_hw_event(box, event, assign[i]); + else if (i < box->n_events) + continue; + + if (hwc->state & PERF_HES_ARCH) + continue; + + uncore_pmu_event_start(event, 0); + } + box->n_events = n; + + return 0; +} + +static void uncore_pmu_event_del(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int i; + + uncore_pmu_event_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < box->n_events; i++) { + if (event == box->event_list[i]) { + uncore_put_event_constraint(box, event); + + while (++i < box->n_events) + box->event_list[i - 1] = box->event_list[i]; + + --box->n_events; + break; + } + } + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; +} + +static void uncore_pmu_event_read(struct perf_event *event) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + uncore_perf_event_update(box, event); +} + +/* + * validation ensures the group can be loaded onto the + * PMU if it was the only group available. + */ +static int uncore_validate_group(struct intel_uncore_pmu *pmu, + struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + struct intel_uncore_box *fake_box; + int ret = -EINVAL, n; + + fake_box = uncore_alloc_box(pmu->type, smp_processor_id()); + if (!fake_box) + return -ENOMEM; + + fake_box->pmu = pmu; + /* + * the event is not yet connected with its + * siblings therefore we must first collect + * existing siblings, then add the new event + * before we can simulate the scheduling + */ + n = uncore_collect_events(fake_box, leader, true); + if (n < 0) + goto out; + + fake_box->n_events = n; + n = uncore_collect_events(fake_box, event, false); + if (n < 0) + goto out; + + fake_box->n_events = n; + + ret = uncore_assign_events(fake_box, NULL, n); +out: + kfree(fake_box); + return ret; +} + +int uncore_pmu_event_init(struct perf_event *event) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + struct hw_perf_event *hwc = &event->hw; + int ret; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + pmu = uncore_event_to_pmu(event); + /* no device found for this pmu */ + if (pmu->func_id < 0) + return -ENOENT; + + /* + * Uncore PMU does measure at all privilege level all the time. + * So it doesn't make sense to specify any exclude bits. + */ + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_hv || event->attr.exclude_idle) + return -EINVAL; + + /* Sampling not supported yet */ + if (hwc->sample_period) + return -EINVAL; + + /* + * Place all uncore events for a particular physical package + * onto a single cpu + */ + if (event->cpu < 0) + return -EINVAL; + box = uncore_pmu_to_box(pmu, event->cpu); + if (!box || box->cpu < 0) + return -EINVAL; + event->cpu = box->cpu; + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; + event->hw.extra_reg.idx = EXTRA_REG_NONE; + + if (event->attr.config == UNCORE_FIXED_EVENT) { + /* no fixed counter */ + if (!pmu->type->fixed_ctl) + return -EINVAL; + /* + * if there is only one fixed counter, only the first pmu + * can access the fixed counter + */ + if (pmu->type->single_fixed && pmu->pmu_idx > 0) + return -EINVAL; + hwc->config = ~0ULL; + } else { + hwc->config = event->attr.config & pmu->type->event_mask; + if (pmu->type->ops->hw_config) { + ret = pmu->type->ops->hw_config(box, event); + if (ret) + return ret; + } + } + + if (event->group_leader != event) + ret = uncore_validate_group(pmu, event); + else + ret = 0; + + return ret; +} + +static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu) +{ + int ret; + + pmu->pmu = (struct pmu) { + .attr_groups = pmu->type->attr_groups, + .task_ctx_nr = perf_invalid_context, + .event_init = uncore_pmu_event_init, + .add = uncore_pmu_event_add, + .del = uncore_pmu_event_del, + .start = uncore_pmu_event_start, + .stop = uncore_pmu_event_stop, + .read = uncore_pmu_event_read, + }; + + if (pmu->type->num_boxes == 1) { + if (strlen(pmu->type->name) > 0) + sprintf(pmu->name, "uncore_%s", pmu->type->name); + else + sprintf(pmu->name, "uncore"); + } else { + sprintf(pmu->name, "uncore_%s_%d", pmu->type->name, + pmu->pmu_idx); + } + + ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); + return ret; +} + +static void __init uncore_type_exit(struct intel_uncore_type *type) +{ + int i; + + for (i = 0; i < type->num_boxes; i++) + free_percpu(type->pmus[i].box); + kfree(type->pmus); + type->pmus = NULL; + kfree(type->attr_groups[1]); + type->attr_groups[1] = NULL; +} + +static void uncore_types_exit(struct intel_uncore_type **types) +{ + int i; + for (i = 0; types[i]; i++) + uncore_type_exit(types[i]); +} + +static int __init uncore_type_init(struct intel_uncore_type *type) +{ + struct intel_uncore_pmu *pmus; + struct attribute_group *events_group; + struct attribute **attrs; + int i, j; + + pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL); + if (!pmus) + return -ENOMEM; + + type->unconstrainted = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, + 0, type->num_counters, 0); + + for (i = 0; i < type->num_boxes; i++) { + pmus[i].func_id = -1; + pmus[i].pmu_idx = i; + pmus[i].type = type; + INIT_LIST_HEAD(&pmus[i].box_list); + pmus[i].box = alloc_percpu(struct intel_uncore_box *); + if (!pmus[i].box) + goto fail; + } + + if (type->event_descs) { + i = 0; + while (type->event_descs[i].attr.attr.name) + i++; + + events_group = kzalloc(sizeof(struct attribute *) * (i + 1) + + sizeof(*events_group), GFP_KERNEL); + if (!events_group) + goto fail; + + attrs = (struct attribute **)(events_group + 1); + events_group->name = "events"; + events_group->attrs = attrs; + + for (j = 0; j < i; j++) + attrs[j] = &type->event_descs[j].attr.attr; + + type->attr_groups[1] = events_group; + } + + type->pmus = pmus; + return 0; +fail: + uncore_type_exit(type); + return -ENOMEM; +} + +static int __init uncore_types_init(struct intel_uncore_type **types) +{ + int i, ret; + + for (i = 0; types[i]; i++) { + ret = uncore_type_init(types[i]); + if (ret) + goto fail; + } + return 0; +fail: + while (--i >= 0) + uncore_type_exit(types[i]); + return ret; +} + +static struct pci_driver *uncore_pci_driver; +static bool pcidrv_registered; + +/* + * add a pci uncore device + */ +static int __devinit uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, phys_id; + + phys_id = pcibus_to_physid[pdev->bus->number]; + if (phys_id < 0) + return -ENODEV; + + box = uncore_alloc_box(type, 0); + if (!box) + return -ENOMEM; + + /* + * for performance monitoring unit with multiple boxes, + * each box has a different function id. + */ + for (i = 0; i < type->num_boxes; i++) { + pmu = &type->pmus[i]; + if (pmu->func_id == pdev->devfn) + break; + if (pmu->func_id < 0) { + pmu->func_id = pdev->devfn; + break; + } + pmu = NULL; + } + + if (!pmu) { + kfree(box); + return -EINVAL; + } + + box->phys_id = phys_id; + box->pci_dev = pdev; + box->pmu = pmu; + uncore_box_init(box); + pci_set_drvdata(pdev, box); + + raw_spin_lock(&uncore_box_lock); + list_add_tail(&box->list, &pmu->box_list); + raw_spin_unlock(&uncore_box_lock); + + return 0; +} + +static void uncore_pci_remove(struct pci_dev *pdev) +{ + struct intel_uncore_box *box = pci_get_drvdata(pdev); + struct intel_uncore_pmu *pmu = box->pmu; + int cpu, phys_id = pcibus_to_physid[pdev->bus->number]; + + if (WARN_ON_ONCE(phys_id != box->phys_id)) + return; + + raw_spin_lock(&uncore_box_lock); + list_del(&box->list); + raw_spin_unlock(&uncore_box_lock); + + for_each_possible_cpu(cpu) { + if (*per_cpu_ptr(pmu->box, cpu) == box) { + *per_cpu_ptr(pmu->box, cpu) = NULL; + atomic_dec(&box->refcnt); + } + } + + WARN_ON_ONCE(atomic_read(&box->refcnt) != 1); + kfree(box); +} + +static int __devinit uncore_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct intel_uncore_type *type; + + type = (struct intel_uncore_type *)id->driver_data; + + return uncore_pci_add(type, pdev); +} + +static int __init uncore_pci_init(void) +{ + int ret; + + switch (boot_cpu_data.x86_model) { + case 45: /* Sandy Bridge-EP */ + pci_uncores = snbep_pci_uncores; + uncore_pci_driver = &snbep_uncore_pci_driver; + snbep_pci2phy_map_init(); + break; + default: + return 0; + } + + ret = uncore_types_init(pci_uncores); + if (ret) + return ret; + + uncore_pci_driver->probe = uncore_pci_probe; + uncore_pci_driver->remove = uncore_pci_remove; + + ret = pci_register_driver(uncore_pci_driver); + if (ret == 0) + pcidrv_registered = true; + else + uncore_types_exit(pci_uncores); + + return ret; +} + +static void __init uncore_pci_exit(void) +{ + if (pcidrv_registered) { + pcidrv_registered = false; + pci_unregister_driver(uncore_pci_driver); + uncore_types_exit(pci_uncores); + } +} + +static void __cpuinit uncore_cpu_dying(int cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + box = *per_cpu_ptr(pmu->box, cpu); + *per_cpu_ptr(pmu->box, cpu) = NULL; + if (box && atomic_dec_and_test(&box->refcnt)) + kfree(box); + } + } +} + +static int __cpuinit uncore_cpu_starting(int cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box, *exist; + int i, j, k, phys_id; + + phys_id = topology_physical_package_id(cpu); + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + box = *per_cpu_ptr(pmu->box, cpu); + /* called by uncore_cpu_init? */ + if (box && box->phys_id >= 0) { + uncore_box_init(box); + continue; + } + + for_each_online_cpu(k) { + exist = *per_cpu_ptr(pmu->box, k); + if (exist && exist->phys_id == phys_id) { + atomic_inc(&exist->refcnt); + *per_cpu_ptr(pmu->box, cpu) = exist; + kfree(box); + box = NULL; + break; + } + } + + if (box) { + box->phys_id = phys_id; + uncore_box_init(box); + } + } + } + return 0; +} + +static int __cpuinit uncore_cpu_prepare(int cpu, int phys_id) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + if (pmu->func_id < 0) + pmu->func_id = j; + + box = uncore_alloc_box(type, cpu); + if (!box) + return -ENOMEM; + + box->pmu = pmu; + box->phys_id = phys_id; + *per_cpu_ptr(pmu->box, cpu) = box; + } + } + return 0; +} + +static void __cpuinit +uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; uncores[i]; i++) { + type = uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + if (old_cpu < 0) + box = uncore_pmu_to_box(pmu, new_cpu); + else + box = uncore_pmu_to_box(pmu, old_cpu); + if (!box) + continue; + + if (old_cpu < 0) { + WARN_ON_ONCE(box->cpu != -1); + box->cpu = new_cpu; + continue; + } + + WARN_ON_ONCE(box->cpu != old_cpu); + if (new_cpu >= 0) { + uncore_pmu_cancel_hrtimer(box); + perf_pmu_migrate_context(&pmu->pmu, + old_cpu, new_cpu); + box->cpu = new_cpu; + } else { + box->cpu = -1; + } + } + } +} + +static void __cpuinit uncore_event_exit_cpu(int cpu) +{ + int i, phys_id, target; + + /* if exiting cpu is used for collecting uncore events */ + if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) + return; + + /* find a new cpu to collect uncore events */ + phys_id = topology_physical_package_id(cpu); + target = -1; + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (phys_id == topology_physical_package_id(i)) { + target = i; + break; + } + } + + /* migrate uncore events to the new cpu */ + if (target >= 0) + cpumask_set_cpu(target, &uncore_cpu_mask); + + uncore_change_context(msr_uncores, cpu, target); + uncore_change_context(pci_uncores, cpu, target); +} + +static void __cpuinit uncore_event_init_cpu(int cpu) +{ + int i, phys_id; + + phys_id = topology_physical_package_id(cpu); + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) + return; + } + + cpumask_set_cpu(cpu, &uncore_cpu_mask); + + uncore_change_context(msr_uncores, -1, cpu); + uncore_change_context(pci_uncores, -1, cpu); +} + +static int + __cpuinit uncore_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + /* allocate/free data structure for uncore box */ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + uncore_cpu_prepare(cpu, -1); + break; + case CPU_STARTING: + uncore_cpu_starting(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + uncore_cpu_dying(cpu); + break; + default: + break; + } + + /* select the cpu that collects uncore events */ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_FAILED: + case CPU_STARTING: + uncore_event_init_cpu(cpu); + break; + case CPU_DOWN_PREPARE: + uncore_event_exit_cpu(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block uncore_cpu_nb __cpuinitdata = { + .notifier_call = uncore_cpu_notifier, + /* + * to migrate uncore events, our notifier should be executed + * before perf core's notifier. + */ + .priority = CPU_PRI_PERF + 1, +}; + +static void __init uncore_cpu_setup(void *dummy) +{ + uncore_cpu_starting(smp_processor_id()); +} + +static int __init uncore_cpu_init(void) +{ + int ret, cpu, max_cores; + + max_cores = boot_cpu_data.x86_max_cores; + switch (boot_cpu_data.x86_model) { + case 26: /* Nehalem */ + case 30: + case 37: /* Westmere */ + case 44: + msr_uncores = nhm_msr_uncores; + break; + case 42: /* Sandy Bridge */ + if (snb_uncore_cbox.num_boxes > max_cores) + snb_uncore_cbox.num_boxes = max_cores; + msr_uncores = snb_msr_uncores; + break; + case 45: /* Sandy Birdge-EP */ + if (snbep_uncore_cbox.num_boxes > max_cores) + snbep_uncore_cbox.num_boxes = max_cores; + msr_uncores = snbep_msr_uncores; + break; + case 46: + msr_uncores = nhmex_msr_uncores; + break; + default: + return 0; + } + + ret = uncore_types_init(msr_uncores); + if (ret) + return ret; + + get_online_cpus(); + + for_each_online_cpu(cpu) { + int i, phys_id = topology_physical_package_id(cpu); + + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) { + phys_id = -1; + break; + } + } + if (phys_id < 0) + continue; + + uncore_cpu_prepare(cpu, phys_id); + uncore_event_init_cpu(cpu); + } + on_each_cpu(uncore_cpu_setup, NULL, 1); + + register_cpu_notifier(&uncore_cpu_nb); + + put_online_cpus(); + + return 0; +} + +static int __init uncore_pmus_register(void) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_type *type; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + uncore_pmu_register(pmu); + } + } + + for (i = 0; pci_uncores[i]; i++) { + type = pci_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + uncore_pmu_register(pmu); + } + } + + return 0; +} + +static int __init intel_uncore_init(void) +{ + int ret; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return -ENODEV; + + ret = uncore_pci_init(); + if (ret) + goto fail; + ret = uncore_cpu_init(); + if (ret) { + uncore_pci_exit(); + goto fail; + } + + uncore_pmus_register(); + return 0; +fail: + return ret; +} +device_initcall(intel_uncore_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h new file mode 100644 index 0000000..c9e5dc5 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -0,0 +1,621 @@ +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <linux/perf_event.h> +#include "perf_event.h" + +#define UNCORE_PMU_NAME_LEN 32 +#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC) + +#define UNCORE_FIXED_EVENT 0xff +#define UNCORE_PMC_IDX_MAX_GENERIC 8 +#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC +#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1) + +#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) + +/* SNB event control */ +#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff +#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 +#define SNB_UNC_CTL_EDGE_DET (1 << 18) +#define SNB_UNC_CTL_EN (1 << 22) +#define SNB_UNC_CTL_INVERT (1 << 23) +#define SNB_UNC_CTL_CMASK_MASK 0x1f000000 +#define NHM_UNC_CTL_CMASK_MASK 0xff000000 +#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0) + +#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + SNB_UNC_CTL_CMASK_MASK) + +#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + NHM_UNC_CTL_CMASK_MASK) + +/* SNB global control register */ +#define SNB_UNC_PERF_GLOBAL_CTL 0x391 +#define SNB_UNC_FIXED_CTR_CTRL 0x394 +#define SNB_UNC_FIXED_CTR 0x395 + +/* SNB uncore global control */ +#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1) +#define SNB_UNC_GLOBAL_CTL_EN (1 << 29) + +/* SNB Cbo register */ +#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700 +#define SNB_UNC_CBO_0_PER_CTR0 0x706 +#define SNB_UNC_CBO_MSR_OFFSET 0x10 + +/* NHM global control register */ +#define NHM_UNC_PERF_GLOBAL_CTL 0x391 +#define NHM_UNC_FIXED_CTR 0x394 +#define NHM_UNC_FIXED_CTR_CTRL 0x395 + +/* NHM uncore global control */ +#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1) +#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32) + +/* NHM uncore register */ +#define NHM_UNC_PERFEVTSEL0 0x3c0 +#define NHM_UNC_UNCORE_PMC0 0x3b0 + +/* SNB-EP Box level control */ +#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0) +#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1) +#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8) +#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16) +#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ + SNBEP_PMON_BOX_CTL_RST_CTRS | \ + SNBEP_PMON_BOX_CTL_FRZ_EN) +/* SNB-EP event control */ +#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff +#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00 +#define SNBEP_PMON_CTL_RST (1 << 17) +#define SNBEP_PMON_CTL_EDGE_DET (1 << 18) +#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21) /* only for QPI */ +#define SNBEP_PMON_CTL_EN (1 << 22) +#define SNBEP_PMON_CTL_INVERT (1 << 23) +#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000 +#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_PMON_CTL_TRESH_MASK) + +/* SNB-EP Ubox event control */ +#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000 +#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_U_MSR_PMON_CTL_TRESH_MASK) + +#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19) +#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) + +/* SNB-EP PCU event control */ +#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000 +#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000 +#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30) +#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31) +#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) + +#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_PMON_CTL_EV_SEL_EXT) + +/* SNB-EP pci control register */ +#define SNBEP_PCI_PMON_BOX_CTL 0xf4 +#define SNBEP_PCI_PMON_CTL0 0xd8 +/* SNB-EP pci counter register */ +#define SNBEP_PCI_PMON_CTR0 0xa0 + +/* SNB-EP home agent register */ +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40 +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44 +#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48 +/* SNB-EP memory controller register */ +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0 +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0 +/* SNB-EP QPI register */ +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228 +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238 +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c + +/* SNB-EP Ubox register */ +#define SNBEP_U_MSR_PMON_CTR0 0xc16 +#define SNBEP_U_MSR_PMON_CTL0 0xc10 + +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08 +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09 + +/* SNB-EP Cbo register */ +#define SNBEP_C0_MSR_PMON_CTR0 0xd16 +#define SNBEP_C0_MSR_PMON_CTL0 0xd10 +#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04 +#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14 +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK 0xfffffc1f +#define SNBEP_CBO_MSR_OFFSET 0x20 + +/* SNB-EP PCU register */ +#define SNBEP_PCU_MSR_PMON_CTR0 0xc36 +#define SNBEP_PCU_MSR_PMON_CTL0 0xc30 +#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff +#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc +#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd + +/* NHM-EX event control */ +#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff +#define NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00 +#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0) +#define NHMEX_PMON_CTL_EDGE_DET (1 << 18) +#define NHMEX_PMON_CTL_PMI_EN (1 << 20) +#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22) +#define NHMEX_PMON_CTL_INVERT (1 << 23) +#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000 +#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \ + NHMEX_PMON_CTL_UMASK_MASK | \ + NHMEX_PMON_CTL_EDGE_DET | \ + NHMEX_PMON_CTL_INVERT | \ + NHMEX_PMON_CTL_TRESH_MASK) + +/* NHM-EX Ubox */ +#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00 +#define NHMEX_U_MSR_PMON_CTR 0xc11 +#define NHMEX_U_MSR_PMON_EV_SEL 0xc10 + +#define NHMEX_U_PMON_GLOBAL_EN (1 << 0) +#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e +#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28) +#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29) +#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1 << 31) + +#define NHMEX_U_PMON_RAW_EVENT_MASK \ + (NHMEX_PMON_CTL_EV_SEL_MASK | \ + NHMEX_PMON_CTL_EDGE_DET) + +/* NHM-EX Cbox */ +#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00 +#define NHMEX_C0_MSR_PMON_CTR0 0xd11 +#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10 +#define NHMEX_C_MSR_OFFSET 0x20 + +/* NHM-EX Bbox */ +#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20 +#define NHMEX_B0_MSR_PMON_CTR0 0xc31 +#define NHMEX_B0_MSR_PMON_CTL0 0xc30 +#define NHMEX_B_MSR_OFFSET 0x40 +#define NHMEX_B0_MSR_MATCH 0xe45 +#define NHMEX_B0_MSR_MASK 0xe46 +#define NHMEX_B1_MSR_MATCH 0xe4d +#define NHMEX_B1_MSR_MASK 0xe4e + +#define NHMEX_B_PMON_CTL_EN (1 << 0) +#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1 +#define NHMEX_B_PMON_CTL_EV_SEL_MASK \ + (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT) +#define NHMEX_B_PMON_CTR_SHIFT 6 +#define NHMEX_B_PMON_CTR_MASK \ + (0x3 << NHMEX_B_PMON_CTR_SHIFT) +#define NHMEX_B_PMON_RAW_EVENT_MASK \ + (NHMEX_B_PMON_CTL_EV_SEL_MASK | \ + NHMEX_B_PMON_CTR_MASK) + +/* NHM-EX Sbox */ +#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40 +#define NHMEX_S0_MSR_PMON_CTR0 0xc51 +#define NHMEX_S0_MSR_PMON_CTL0 0xc50 +#define NHMEX_S_MSR_OFFSET 0x80 +#define NHMEX_S0_MSR_MM_CFG 0xe48 +#define NHMEX_S0_MSR_MATCH 0xe49 +#define NHMEX_S0_MSR_MASK 0xe4a +#define NHMEX_S1_MSR_MM_CFG 0xe58 +#define NHMEX_S1_MSR_MATCH 0xe59 +#define NHMEX_S1_MSR_MASK 0xe5a + +#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63) + +/* NHM-EX Mbox */ +#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0 +#define NHMEX_M0_MSR_PMU_DSP 0xca5 +#define NHMEX_M0_MSR_PMU_ISS 0xca6 +#define NHMEX_M0_MSR_PMU_MAP 0xca7 +#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8 +#define NHMEX_M0_MSR_PMU_PGT 0xca9 +#define NHMEX_M0_MSR_PMU_PLD 0xcaa +#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab +#define NHMEX_M0_MSR_PMU_CTL0 0xcb0 +#define NHMEX_M0_MSR_PMU_CNT0 0xcb1 +#define NHMEX_M_MSR_OFFSET 0x40 +#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54 +#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c + +#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63) +#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL +#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL +#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34 + +#define NHMEX_M_PMON_CTL_EN (1 << 0) +#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1) +#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2 +#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK \ + (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT) +#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4 +#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \ + (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT) +#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6) +#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7) +#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9 +#define NHMEX_M_PMON_CTL_INC_SEL_MASK \ + (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) +#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19 +#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \ + (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) +#define NHMEX_M_PMON_RAW_EVENT_MASK \ + (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \ + NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \ + NHMEX_M_PMON_CTL_WRAP_MODE | \ + NHMEX_M_PMON_CTL_FLAG_MODE | \ + NHMEX_M_PMON_CTL_INC_SEL_MASK | \ + NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK) + + +#define NHMEX_M_PMON_ZDP_CTL_FVC_FVID_MASK 0x1f +#define NHMEX_M_PMON_ZDP_CTL_FVC_BCMD_MASK (0x7 << 5) +#define NHMEX_M_PMON_ZDP_CTL_FVC_RSP_MASK (0x7 << 8) +#define NHMEX_M_PMON_ZDP_CTL_FVC_PBOX_INIT_ERR (1 << 23) +#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK \ + (NHMEX_M_PMON_ZDP_CTL_FVC_FVID_MASK | \ + NHMEX_M_PMON_ZDP_CTL_FVC_BCMD_MASK | \ + NHMEX_M_PMON_ZDP_CTL_FVC_RSP_MASK | \ + NHMEX_M_PMON_ZDP_CTL_FVC_PBOX_INIT_ERR) +#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7 << (11 + 3 * (n))) + +/* + * use the 9~13 bits to select event If the 7th bit is not set, + * otherwise use the 19~21 bits to select event. + */ +#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) +#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_INC_SEL_EXTAR_REG(c, r) \ + EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \ + MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r) +#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \ + EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \ + MBOX_SET_FLAG_SEL_MASK, \ + (u64)-1, NHMEX_M_##r) + +/* NHM-EX Rbox */ +#define NHMEX_R_MSR_GLOBAL_CTL 0xe00 +#define NHMEX_R_MSR_PMON_CTL0 0xe10 +#define NHMEX_R_MSR_PMON_CNT0 0xe11 +#define NHMEX_R_MSR_OFFSET 0x20 + +#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \ + ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4)) +#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n)) +#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n)) +#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \ + (((n) < 4 ? 0 : 0x10) + (n) * 4) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \ + (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \ + (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2) + +#define NHMEX_R_PMON_CTL_EN (1 << 0) +#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1 +#define NHMEX_R_PMON_CTL_EV_SEL_MASK \ + (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT) +#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6) +#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK + +/* NHM-EX Wbox */ +#define NHMEX_W_MSR_GLOBAL_CTL 0xc80 +#define NHMEX_W_MSR_PMON_CNT0 0xc90 +#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91 +#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394 +#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395 + +#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31) + +struct intel_uncore_ops; +struct intel_uncore_pmu; +struct intel_uncore_box; +struct uncore_event_desc; + +struct intel_uncore_type { + const char *name; + int num_counters; + int num_boxes; + int perf_ctr_bits; + int fixed_ctr_bits; + unsigned perf_ctr; + unsigned event_ctl; + unsigned event_mask; + unsigned fixed_ctr; + unsigned fixed_ctl; + unsigned box_ctl; + unsigned msr_offset; + unsigned num_shared_regs:8; + unsigned single_fixed:1; + unsigned pair_ctr_ctl:1; + struct event_constraint unconstrainted; + struct event_constraint *constraints; + struct intel_uncore_pmu *pmus; + struct intel_uncore_ops *ops; + struct uncore_event_desc *event_descs; + const struct attribute_group *attr_groups[3]; +}; + +#define format_group attr_groups[0] + +struct intel_uncore_ops { + void (*init_box)(struct intel_uncore_box *); + void (*disable_box)(struct intel_uncore_box *); + void (*enable_box)(struct intel_uncore_box *); + void (*disable_event)(struct intel_uncore_box *, struct perf_event *); + void (*enable_event)(struct intel_uncore_box *, struct perf_event *); + u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *); + int (*hw_config)(struct intel_uncore_box *, struct perf_event *); + struct event_constraint *(*get_constraint)(struct intel_uncore_box *, + struct perf_event *); + void (*put_constraint)(struct intel_uncore_box *, struct perf_event *); +}; + +struct intel_uncore_pmu { + struct pmu pmu; + char name[UNCORE_PMU_NAME_LEN]; + int pmu_idx; + int func_id; + struct intel_uncore_type *type; + struct intel_uncore_box ** __percpu box; + struct list_head box_list; +}; + +struct intel_uncore_extra_reg { + raw_spinlock_t lock; + u64 config, config1, config2; + atomic_t ref; +}; + +struct intel_uncore_box { + int phys_id; + int n_active; /* number of active events */ + int n_events; + int cpu; /* cpu to collect events */ + unsigned long flags; + atomic_t refcnt; + struct perf_event *events[UNCORE_PMC_IDX_MAX]; + struct perf_event *event_list[UNCORE_PMC_IDX_MAX]; + unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; + u64 tags[UNCORE_PMC_IDX_MAX]; + struct pci_dev *pci_dev; + struct intel_uncore_pmu *pmu; + struct hrtimer hrtimer; + struct list_head list; + struct intel_uncore_extra_reg shared_regs[0]; +}; + +#define UNCORE_BOX_FLAG_INITIATED 0 + +struct uncore_event_desc { + struct kobj_attribute attr; + const char *config; +}; + +#define INTEL_UNCORE_EVENT_DESC(_name, _config) \ +{ \ + .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ + .config = _config, \ +} + +#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __uncore_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __uncore_##_var##_show, NULL) + + +static ssize_t uncore_event_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct uncore_event_desc *event = + container_of(attr, struct uncore_event_desc, attr); + return sprintf(buf, "%s", event->config); +} + +static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box) +{ + return box->pmu->type->box_ctl; +} + +static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctl; +} + +static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr; +} + +static inline +unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx) +{ + return idx * 4 + box->pmu->type->event_ctl; +} + +static inline +unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx) +{ + return idx * 8 + box->pmu->type->perf_ctr; +} + +static inline +unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) +{ + if (!box->pmu->type->box_ctl) + return 0; + return box->pmu->type->box_ctl + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box) +{ + if (!box->pmu->type->fixed_ctl) + return 0; + return box->pmu->type->fixed_ctl + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx) +{ + return box->pmu->type->event_ctl + + (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) +{ + return box->pmu->type->perf_ctr + + (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_fixed_ctl(struct intel_uncore_box *box) +{ + if (box->pci_dev) + return uncore_pci_fixed_ctl(box); + else + return uncore_msr_fixed_ctl(box); +} + +static inline +unsigned uncore_fixed_ctr(struct intel_uncore_box *box) +{ + if (box->pci_dev) + return uncore_pci_fixed_ctr(box); + else + return uncore_msr_fixed_ctr(box); +} + +static inline +unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx) +{ + if (box->pci_dev) + return uncore_pci_event_ctl(box, idx); + else + return uncore_msr_event_ctl(box, idx); +} + +static inline +unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx) +{ + if (box->pci_dev) + return uncore_pci_perf_ctr(box, idx); + else + return uncore_msr_perf_ctr(box, idx); +} + +static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box) +{ + return box->pmu->type->perf_ctr_bits; +} + +static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr_bits; +} + +static inline int uncore_num_counters(struct intel_uncore_box *box) +{ + return box->pmu->type->num_counters; +} + +static inline void uncore_disable_box(struct intel_uncore_box *box) +{ + if (box->pmu->type->ops->disable_box) + box->pmu->type->ops->disable_box(box); +} + +static inline void uncore_enable_box(struct intel_uncore_box *box) +{ + if (box->pmu->type->ops->enable_box) + box->pmu->type->ops->enable_box(box); +} + +static inline void uncore_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + box->pmu->type->ops->disable_event(box, event); +} + +static inline void uncore_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + box->pmu->type->ops->enable_event(box, event); +} + +static inline u64 uncore_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + return box->pmu->type->ops->read_counter(box, event); +} + +static inline void uncore_box_init(struct intel_uncore_box *box) +{ + if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { + if (box->pmu->type->ops->init_box) + box->pmu->type->ops->init_box(box); + } +} + +static inline bool uncore_box_is_fake(struct intel_uncore_box *box) +{ + return (box->phys_id < 0); +} diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 47124a7..92c7e39 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -895,8 +895,8 @@ static void p4_pmu_disable_pebs(void) * So at moment let leave metrics turned on forever -- it's * ok for now but need to be revisited! * - * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0); - * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0); + * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)0); + * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)0); */ } @@ -909,7 +909,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event) * state we need to clear P4_CCCR_OVF, otherwise interrupt get * asserted again and again */ - (void)checking_wrmsrl(hwc->config_base, + (void)wrmsrl_safe(hwc->config_base, (u64)(p4_config_unpack_cccr(hwc->config)) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); } @@ -943,8 +943,8 @@ static void p4_pmu_enable_pebs(u64 config) bind = &p4_pebs_bind_map[idx]; - (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); - (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); + (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); + (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); } static void p4_pmu_enable_event(struct perf_event *event) @@ -978,8 +978,8 @@ static void p4_pmu_enable_event(struct perf_event *event) */ p4_pmu_enable_pebs(hwc->config); - (void)checking_wrmsrl(escr_addr, escr_conf); - (void)checking_wrmsrl(hwc->config_base, + (void)wrmsrl_safe(escr_addr, escr_conf); + (void)wrmsrl_safe(hwc->config_base, (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); } @@ -1325,7 +1325,7 @@ __init int p4_pmu_init(void) unsigned int low, high; /* If we get stripped -- indexing fails */ - BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC); + BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC); rdmsr(MSR_IA32_MISC_ENABLE, low, high); if (!(low & (1 << 7))) { diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 32bcfc7..e4dd0f7 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -71,7 +71,7 @@ p6_pmu_disable_event(struct perf_event *event) if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base, val); + (void)wrmsrl_safe(hwc->config_base, val); } static void p6_pmu_enable_event(struct perf_event *event) @@ -84,7 +84,7 @@ static void p6_pmu_enable_event(struct perf_event *event) if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base, val); + (void)wrmsrl_safe(hwc->config_base, val); } PMU_FORMAT_ATTR(event, "config:0-7" ); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index addf9e8..ee8e9ab 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -31,7 +31,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) const struct cpuid_bit *cb; static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { - { X86_FEATURE_DTS, CR_EAX, 0, 0x00000006, 0 }, + { X86_FEATURE_DTHERM, CR_EAX, 0, 0x00000006, 0 }, { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c deleted file mode 100644 index a640ae5..0000000 --- a/arch/x86/kernel/cpu/sched.c +++ /dev/null @@ -1,55 +0,0 @@ -#include <linux/sched.h> -#include <linux/math64.h> -#include <linux/percpu.h> -#include <linux/irqflags.h> - -#include <asm/cpufeature.h> -#include <asm/processor.h> - -#ifdef CONFIG_SMP - -static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched); - -static unsigned long scale_aperfmperf(void) -{ - struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched); - unsigned long ratio, flags; - - local_irq_save(flags); - get_aperfmperf(&val); - local_irq_restore(flags); - - ratio = calc_aperfmperf_ratio(old, &val); - *old = val; - - return ratio; -} - -unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu) -{ - /* - * do aperf/mperf on the cpu level because it includes things - * like turbo mode, which are relevant to full cores. - */ - if (boot_cpu_has(X86_FEATURE_APERFMPERF)) - return scale_aperfmperf(); - - /* - * maybe have something cpufreq here - */ - - return default_scale_freq_power(sd, cpu); -} - -unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu) -{ - /* - * aperf/mperf already includes the smt gain - */ - if (boot_cpu_has(X86_FEATURE_APERFMPERF)) - return SCHED_LOAD_SCALE; - - return default_scale_smt_power(sd, cpu); -} - -#endif diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 571246d..ae42418b 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -27,8 +27,8 @@ static int die_counter; void printk_address(unsigned long address, int reliable) { - printk(" [<%p>] %s%pB\n", (void *) address, - reliable ? "" : "? ", (void *) address); + pr_cont(" [<%p>] %s%pB\n", + (void *)address, reliable ? "" : "? ", (void *)address); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -271,6 +271,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) return 1; + print_modules(); show_regs(regs); #ifdef CONFIG_X86_32 if (user_mode_vm(regs)) { diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index e0b1d78..1038a41 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -73,11 +73,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (kstack_end(stack)) break; if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - printk(KERN_CONT "\n"); - printk(KERN_CONT " %08lx", *stack++); + pr_cont("\n"); + pr_cont(" %08lx", *stack++); touch_nmi_watchdog(); } - printk(KERN_CONT "\n"); + pr_cont("\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } @@ -86,12 +86,11 @@ void show_regs(struct pt_regs *regs) { int i; - print_modules(); __show_regs(regs, !user_mode_vm(regs)); - printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", - TASK_COMM_LEN, current->comm, task_pid_nr(current), - current_thread_info(), current, task_thread_info(current)); + pr_emerg("Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", + TASK_COMM_LEN, current->comm, task_pid_nr(current), + current_thread_info(), current, task_thread_info(current)); /* * When in-kernel, we also print out the stack and code at the * time of the fault.. @@ -102,10 +101,10 @@ void show_regs(struct pt_regs *regs) unsigned char c; u8 *ip; - printk(KERN_EMERG "Stack:\n"); + pr_emerg("Stack:\n"); show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG); - printk(KERN_EMERG "Code: "); + pr_emerg("Code:"); ip = (u8 *)regs->ip - code_prologue; if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { @@ -116,16 +115,16 @@ void show_regs(struct pt_regs *regs) for (i = 0; i < code_len; i++, ip++) { if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - printk(KERN_CONT " Bad EIP value."); + pr_cont(" Bad EIP value."); break; } if (ip == (u8 *)regs->ip) - printk(KERN_CONT "<%02x> ", c); + pr_cont(" <%02x>", c); else - printk(KERN_CONT "%02x ", c); + pr_cont(" %02x", c); } } - printk(KERN_CONT "\n"); + pr_cont("\n"); } int is_valid_bugaddr(unsigned long ip) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 791b761..b653675 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -228,20 +228,20 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (stack >= irq_stack && stack <= irq_stack_end) { if (stack == irq_stack_end) { stack = (unsigned long *) (irq_stack_end[-1]); - printk(KERN_CONT " <EOI> "); + pr_cont(" <EOI> "); } } else { if (((long) stack & (THREAD_SIZE-1)) == 0) break; } if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - printk(KERN_CONT "\n"); - printk(KERN_CONT " %016lx", *stack++); + pr_cont("\n"); + pr_cont(" %016lx", *stack++); touch_nmi_watchdog(); } preempt_enable(); - printk(KERN_CONT "\n"); + pr_cont("\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } @@ -254,10 +254,9 @@ void show_regs(struct pt_regs *regs) sp = regs->sp; printk("CPU %d ", cpu); - print_modules(); __show_regs(regs, 1); - printk("Process %s (pid: %d, threadinfo %p, task %p)\n", - cur->comm, cur->pid, task_thread_info(cur), cur); + printk(KERN_DEFAULT "Process %s (pid: %d, threadinfo %p, task %p)\n", + cur->comm, cur->pid, task_thread_info(cur), cur); /* * When in-kernel, we also print out the stack and code at the @@ -284,16 +283,16 @@ void show_regs(struct pt_regs *regs) for (i = 0; i < code_len; i++, ip++) { if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - printk(KERN_CONT " Bad RIP value."); + pr_cont(" Bad RIP value."); break; } if (ip == (u8 *)regs->ip) - printk(KERN_CONT "<%02x> ", c); + pr_cont("<%02x> ", c); else - printk(KERN_CONT "%02x ", c); + pr_cont("%02x ", c); } } - printk(KERN_CONT "\n"); + pr_cont("\n"); } int is_valid_bugaddr(unsigned long ip) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 4185797..ed858e9 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -944,7 +944,7 @@ void __init e820_reserve_resources(void) for (i = 0; i < e820_saved.nr_map; i++) { struct e820entry *entry = &e820_saved.map[i]; firmware_map_add_early(entry->addr, - entry->addr + entry->size - 1, + entry->addr + entry->size, e820_type_to_string(entry->type)); } } diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 7d65133..69babd8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1048,24 +1048,6 @@ apicinterrupt LOCAL_TIMER_VECTOR \ apicinterrupt X86_PLATFORM_IPI_VECTOR \ x86_platform_ipi smp_x86_platform_ipi -#ifdef CONFIG_SMP - ALIGN - INTR_FRAME -.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ - 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 -.if NUM_INVALIDATE_TLB_VECTORS > \idx -ENTRY(invalidate_interrupt\idx) - pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx) - jmp .Lcommon_invalidate_interrupt0 - CFI_ADJUST_CFA_OFFSET -8 -END(invalidate_interrupt\idx) -.endif -.endr - CFI_ENDPROC -apicinterrupt INVALIDATE_TLB_VECTOR_START, \ - invalidate_interrupt0, smp_invalidate_interrupt -#endif - apicinterrupt THRESHOLD_APIC_VECTOR \ threshold_interrupt smp_threshold_interrupt apicinterrupt THERMAL_APIC_VECTOR \ @@ -1758,10 +1740,30 @@ end_repeat_nmi: */ call save_paranoid DEFAULT_FRAME 0 + + /* + * Save off the CR2 register. If we take a page fault in the NMI then + * it could corrupt the CR2 value. If the NMI preempts a page fault + * handler before it was able to read the CR2 register, and then the + * NMI itself takes a page fault, the page fault that was preempted + * will read the information from the NMI page fault and not the + * origin fault. Save it off and restore it if it changes. + * Use the r12 callee-saved register. + */ + movq %cr2, %r12 + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp,%rdi movq $-1,%rsi call do_nmi + + /* Did the NMI take a page fault? Restore cr2 if it did */ + movq %cr2, %rcx + cmpq %rcx, %r12 + je 1f + movq %r12, %cr2 +1: + testl %ebx,%ebx /* swapgs needed? */ jnz nmi_restore nmi_swapgs: diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3dafc60..7ad683d 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -294,9 +294,9 @@ void fixup_irqs(void) raw_spin_unlock(&desc->lock); if (break_affinity && set_affinity) - printk("Broke affinity for irq %i\n", irq); + pr_notice("Broke affinity for irq %i\n", irq); else if (!set_affinity) - printk("Cannot set affinity for irq %i\n", irq); + pr_notice("Cannot set affinity for irq %i\n", irq); } /* @@ -328,6 +328,7 @@ void fixup_irqs(void) chip->irq_retrigger(data); raw_spin_unlock(&desc->lock); } + __this_cpu_write(vector_irq[vector], -1); } } #endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 252981a..6e03b0d 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -171,79 +171,6 @@ static void __init smp_intr_init(void) */ alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - /* IPIs for invalidation */ -#define ALLOC_INVTLB_VEC(NR) \ - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \ - invalidate_interrupt##NR) - - switch (NUM_INVALIDATE_TLB_VECTORS) { - default: - ALLOC_INVTLB_VEC(31); - case 31: - ALLOC_INVTLB_VEC(30); - case 30: - ALLOC_INVTLB_VEC(29); - case 29: - ALLOC_INVTLB_VEC(28); - case 28: - ALLOC_INVTLB_VEC(27); - case 27: - ALLOC_INVTLB_VEC(26); - case 26: - ALLOC_INVTLB_VEC(25); - case 25: - ALLOC_INVTLB_VEC(24); - case 24: - ALLOC_INVTLB_VEC(23); - case 23: - ALLOC_INVTLB_VEC(22); - case 22: - ALLOC_INVTLB_VEC(21); - case 21: - ALLOC_INVTLB_VEC(20); - case 20: - ALLOC_INVTLB_VEC(19); - case 19: - ALLOC_INVTLB_VEC(18); - case 18: - ALLOC_INVTLB_VEC(17); - case 17: - ALLOC_INVTLB_VEC(16); - case 16: - ALLOC_INVTLB_VEC(15); - case 15: - ALLOC_INVTLB_VEC(14); - case 14: - ALLOC_INVTLB_VEC(13); - case 13: - ALLOC_INVTLB_VEC(12); - case 12: - ALLOC_INVTLB_VEC(11); - case 11: - ALLOC_INVTLB_VEC(10); - case 10: - ALLOC_INVTLB_VEC(9); - case 9: - ALLOC_INVTLB_VEC(8); - case 8: - ALLOC_INVTLB_VEC(7); - case 7: - ALLOC_INVTLB_VEC(6); - case 6: - ALLOC_INVTLB_VEC(5); - case 5: - ALLOC_INVTLB_VEC(4); - case 4: - ALLOC_INVTLB_VEC(3); - case 3: - ALLOC_INVTLB_VEC(2); - case 2: - ALLOC_INVTLB_VEC(1); - case 1: - ALLOC_INVTLB_VEC(0); - break; - } - /* IPI for generic function call */ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index 1d5d31e..dc1404b 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -107,7 +107,7 @@ static int __init create_setup_data_nodes(struct dentry *parent) { struct setup_data_node *node; struct setup_data *data; - int error = -ENOMEM; + int error; struct dentry *d; struct page *pg; u64 pa_data; @@ -121,8 +121,10 @@ static int __init create_setup_data_nodes(struct dentry *parent) while (pa_data) { node = kmalloc(sizeof(*node), GFP_KERNEL); - if (!node) + if (!node) { + error = -ENOMEM; goto err_dir; + } pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT); if (PageHighMem(pg)) { diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 8bfb614..3f61904 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -444,12 +444,12 @@ void kgdb_roundup_cpus(unsigned long flags) /** * kgdb_arch_handle_exception - Handle architecture specific GDB packets. - * @vector: The error vector of the exception that happened. + * @e_vector: The error vector of the exception that happened. * @signo: The signal number of the exception that happened. * @err_code: The error code of the exception that happened. - * @remcom_in_buffer: The buffer of the packet we have read. - * @remcom_out_buffer: The buffer of %BUFMAX bytes to write a packet into. - * @regs: The &struct pt_regs of the current process. + * @remcomInBuffer: The buffer of the packet we have read. + * @remcomOutBuffer: The buffer of %BUFMAX bytes to write a packet into. + * @linux_regs: The &struct pt_regs of the current process. * * This function MUST handle the 'c' and 's' command packets, * as well packets to set / remove a hardware breakpoint, if used. diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e554e5a..c1d61ee 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -39,6 +39,9 @@ #include <asm/desc.h> #include <asm/tlbflush.h> #include <asm/idle.h> +#include <asm/apic.h> +#include <asm/apicdef.h> +#include <asm/hypervisor.h> static int kvmapf = 1; @@ -283,6 +286,22 @@ static void kvm_register_steal_time(void) cpu, __pa(st)); } +static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; + +static void kvm_guest_apic_eoi_write(u32 reg, u32 val) +{ + /** + * This relies on __test_and_clear_bit to modify the memory + * in a way that is atomic with respect to the local CPU. + * The hypervisor only accesses this memory from the local CPU so + * there's no need for lock or memory barriers. + * An optimization barrier is implied in apic write. + */ + if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi))) + return; + apic_write(APIC_EOI, APIC_EOI_ACK); +} + void __cpuinit kvm_guest_cpu_init(void) { if (!kvm_para_available()) @@ -300,11 +319,20 @@ void __cpuinit kvm_guest_cpu_init(void) smp_processor_id()); } + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { + unsigned long pa; + /* Size alignment is implied but just to make it explicit. */ + BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); + __get_cpu_var(kvm_apic_eoi) = 0; + pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED; + wrmsrl(MSR_KVM_PV_EOI_EN, pa); + } + if (has_steal_clock) kvm_register_steal_time(); } -static void kvm_pv_disable_apf(void *unused) +static void kvm_pv_disable_apf(void) { if (!__get_cpu_var(apf_reason).enabled) return; @@ -316,11 +344,23 @@ static void kvm_pv_disable_apf(void *unused) smp_processor_id()); } +static void kvm_pv_guest_cpu_reboot(void *unused) +{ + /* + * We disable PV EOI before we load a new kernel by kexec, + * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. + * New kernel can re-enable when it boots. + */ + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); + kvm_pv_disable_apf(); +} + static int kvm_pv_reboot_notify(struct notifier_block *nb, unsigned long code, void *unused) { if (code == SYS_RESTART) - on_each_cpu(kvm_pv_disable_apf, NULL, 1); + on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); return NOTIFY_DONE; } @@ -371,7 +411,9 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy) static void kvm_guest_cpu_offline(void *dummy) { kvm_disable_steal_time(); - kvm_pv_disable_apf(NULL); + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); + kvm_pv_disable_apf(); apf_task_wake_all(); } @@ -424,6 +466,9 @@ void __init kvm_guest_init(void) pv_time_ops.steal_clock = kvm_steal_clock; } + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + apic_set_eoi_write(kvm_guest_apic_eoi_write); + #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; register_cpu_notifier(&kvm_cpu_notifier); @@ -432,6 +477,19 @@ void __init kvm_guest_init(void) #endif } +static bool __init kvm_detect(void) +{ + if (!kvm_para_available()) + return false; + return true; +} + +const struct hypervisor_x86 x86_hyper_kvm __refconst = { + .name = "KVM", + .detect = kvm_detect, +}; +EXPORT_SYMBOL_GPL(x86_hyper_kvm); + static __init int activate_jump_labels(void) { if (has_steal_clock) { diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 086eb58..f1b42b3 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -120,11 +120,6 @@ bool kvm_check_and_clear_guest_paused(void) bool ret = false; struct pvclock_vcpu_time_info *src; - /* - * per_cpu() is safe here because this function is only called from - * timer functions where preemption is already disabled. - */ - WARN_ON(!in_atomic()); src = &__get_cpu_var(hv_clock); if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED); diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index fbdfc69..4873e62 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -87,6 +87,7 @@ #include <asm/microcode.h> #include <asm/processor.h> #include <asm/cpu_device_id.h> +#include <asm/perf_event.h> MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); @@ -277,7 +278,6 @@ static int reload_for_cpu(int cpu) struct ucode_cpu_info *uci = ucode_cpu_info + cpu; int err = 0; - mutex_lock(µcode_mutex); if (uci->valid) { enum ucode_state ustate; @@ -288,7 +288,6 @@ static int reload_for_cpu(int cpu) if (ustate == UCODE_ERROR) err = -EINVAL; } - mutex_unlock(µcode_mutex); return err; } @@ -298,19 +297,31 @@ static ssize_t reload_store(struct device *dev, const char *buf, size_t size) { unsigned long val; - int cpu = dev->id; - ssize_t ret = 0; + int cpu; + ssize_t ret = 0, tmp_ret; ret = kstrtoul(buf, 0, &val); if (ret) return ret; - if (val == 1) { - get_online_cpus(); - if (cpu_online(cpu)) - ret = reload_for_cpu(cpu); - put_online_cpus(); + if (val != 1) + return size; + + get_online_cpus(); + mutex_lock(µcode_mutex); + for_each_online_cpu(cpu) { + tmp_ret = reload_for_cpu(cpu); + if (tmp_ret != 0) + pr_warn("Error reloading microcode on CPU %d\n", cpu); + + /* save retval of the first encountered reload error */ + if (!ret) + ret = tmp_ret; } + if (!ret) + perf_check_microcode(); + mutex_unlock(µcode_mutex); + put_online_cpus(); if (!ret) ret = size; @@ -339,7 +350,6 @@ static DEVICE_ATTR(version, 0400, version_show, NULL); static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL); static struct attribute *mc_default_attrs[] = { - &dev_attr_reload.attr, &dev_attr_version.attr, &dev_attr_processor_flags.attr, NULL @@ -504,7 +514,7 @@ static struct notifier_block __refdata mc_cpu_notifier = { #ifdef MODULE /* Autoload on Intel and AMD systems */ -static const struct x86_cpu_id microcode_id[] = { +static const struct x86_cpu_id __initconst microcode_id[] = { #ifdef CONFIG_MICROCODE_INTEL { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, #endif @@ -516,6 +526,16 @@ static const struct x86_cpu_id microcode_id[] = { MODULE_DEVICE_TABLE(x86cpu, microcode_id); #endif +static struct attribute *cpu_root_microcode_attrs[] = { + &dev_attr_reload.attr, + NULL +}; + +static struct attribute_group cpu_root_microcode_group = { + .name = "microcode", + .attrs = cpu_root_microcode_attrs, +}; + static int __init microcode_init(void) { struct cpuinfo_x86 *c = &cpu_data(0); @@ -540,16 +560,25 @@ static int __init microcode_init(void) mutex_lock(µcode_mutex); error = subsys_interface_register(&mc_cpu_interface); - + if (!error) + perf_check_microcode(); mutex_unlock(µcode_mutex); put_online_cpus(); if (error) goto out_pdev; + error = sysfs_create_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + + if (error) { + pr_err("Error creating microcode group!\n"); + goto out_driver; + } + error = microcode_dev_init(); if (error) - goto out_driver; + goto out_ucode_group; register_syscore_ops(&mc_syscore_ops); register_hotcpu_notifier(&mc_cpu_notifier); @@ -559,7 +588,11 @@ static int __init microcode_init(void) return 0; -out_driver: + out_ucode_group: + sysfs_remove_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + + out_driver: get_online_cpus(); mutex_lock(µcode_mutex); @@ -568,7 +601,7 @@ out_driver: mutex_unlock(µcode_mutex); put_online_cpus(); -out_pdev: + out_pdev: platform_device_unregister(microcode_pdev); return error; @@ -584,6 +617,9 @@ static void __exit microcode_exit(void) unregister_hotcpu_notifier(&mc_cpu_notifier); unregister_syscore_ops(&mc_syscore_ops); + sysfs_remove_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + get_online_cpus(); mutex_lock(µcode_mutex); diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index f21fd94..216a4d7 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -15,6 +15,9 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/moduleloader.h> #include <linux/elf.h> #include <linux/vmalloc.h> @@ -30,9 +33,14 @@ #include <asm/pgtable.h> #if 0 -#define DEBUGP printk +#define DEBUGP(fmt, ...) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__) #else -#define DEBUGP(fmt...) +#define DEBUGP(fmt, ...) \ +do { \ + if (0) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ +} while (0) #endif void *module_alloc(unsigned long size) @@ -56,8 +64,8 @@ int apply_relocate(Elf32_Shdr *sechdrs, Elf32_Sym *sym; uint32_t *location; - DEBUGP("Applying relocate section %u to %u\n", relsec, - sechdrs[relsec].sh_info); + DEBUGP("Applying relocate section %u to %u\n", + relsec, sechdrs[relsec].sh_info); for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { /* This is where to make the change */ location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr @@ -73,11 +81,11 @@ int apply_relocate(Elf32_Shdr *sechdrs, *location += sym->st_value; break; case R_386_PC32: - /* Add the value, subtract its postition */ + /* Add the value, subtract its position */ *location += sym->st_value - (uint32_t)location; break; default: - printk(KERN_ERR "module %s: Unknown relocation: %u\n", + pr_err("%s: Unknown relocation: %u\n", me->name, ELF32_R_TYPE(rel[i].r_info)); return -ENOEXEC; } @@ -97,8 +105,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, void *loc; u64 val; - DEBUGP("Applying relocate section %u to %u\n", relsec, - sechdrs[relsec].sh_info); + DEBUGP("Applying relocate section %u to %u\n", + relsec, sechdrs[relsec].sh_info); for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { /* This is where to make the change */ loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr @@ -110,8 +118,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, + ELF64_R_SYM(rel[i].r_info); DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", - (int)ELF64_R_TYPE(rel[i].r_info), - sym->st_value, rel[i].r_addend, (u64)loc); + (int)ELF64_R_TYPE(rel[i].r_info), + sym->st_value, rel[i].r_addend, (u64)loc); val = sym->st_value + rel[i].r_addend; @@ -140,7 +148,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, #endif break; default: - printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n", + pr_err("%s: Unknown rela relocation: %llu\n", me->name, ELF64_R_TYPE(rel[i].r_info)); return -ENOEXEC; } @@ -148,9 +156,9 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, return 0; overflow: - printk(KERN_ERR "overflow in relocation type %d val %Lx\n", + pr_err("overflow in relocation type %d val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), val); - printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", + pr_err("`%s' likely not compiled with -mcmodel=kernel\n", me->name); return -ENOEXEC; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index a0b2f84..f84f5c5 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -365,8 +365,9 @@ static __kprobes void default_do_nmi(struct pt_regs *regs) #ifdef CONFIG_X86_32 /* * For i386, NMIs use the same stack as the kernel, and we can - * add a workaround to the iret problem in C. Simply have 3 states - * the NMI can be in. + * add a workaround to the iret problem in C (preventing nested + * NMIs if an NMI takes a trap). Simply have 3 states the NMI + * can be in: * * 1) not running * 2) executing @@ -383,32 +384,50 @@ static __kprobes void default_do_nmi(struct pt_regs *regs) * If an NMI hits a breakpoint that executes an iret, another * NMI can preempt it. We do not want to allow this new NMI * to run, but we want to execute it when the first one finishes. - * We set the state to "latched", and the first NMI will perform - * an cmpxchg on the state, and if it doesn't successfully - * reset the state to "not running" it will restart the next - * NMI. + * We set the state to "latched", and the exit of the first NMI will + * perform a dec_return, if the result is zero (NOT_RUNNING), then + * it will simply exit the NMI handler. If not, the dec_return + * would have set the state to NMI_EXECUTING (what we want it to + * be when we are running). In this case, we simply jump back + * to rerun the NMI handler again, and restart the 'latched' NMI. + * + * No trap (breakpoint or page fault) should be hit before nmi_restart, + * thus there is no race between the first check of state for NOT_RUNNING + * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs + * at this point. + * + * In case the NMI takes a page fault, we need to save off the CR2 + * because the NMI could have preempted another page fault and corrupt + * the CR2 that is about to be read. As nested NMIs must be restarted + * and they can not take breakpoints or page faults, the update of the + * CR2 must be done before converting the nmi state back to NOT_RUNNING. + * Otherwise, there would be a race of another nested NMI coming in + * after setting state to NOT_RUNNING but before updating the nmi_cr2. */ enum nmi_states { - NMI_NOT_RUNNING, + NMI_NOT_RUNNING = 0, NMI_EXECUTING, NMI_LATCHED, }; static DEFINE_PER_CPU(enum nmi_states, nmi_state); +static DEFINE_PER_CPU(unsigned long, nmi_cr2); #define nmi_nesting_preprocess(regs) \ do { \ - if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \ - __get_cpu_var(nmi_state) = NMI_LATCHED; \ + if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \ + this_cpu_write(nmi_state, NMI_LATCHED); \ return; \ } \ - nmi_restart: \ - __get_cpu_var(nmi_state) = NMI_EXECUTING; \ - } while (0) + this_cpu_write(nmi_state, NMI_EXECUTING); \ + this_cpu_write(nmi_cr2, read_cr2()); \ + } while (0); \ + nmi_restart: #define nmi_nesting_postprocess() \ do { \ - if (cmpxchg(&__get_cpu_var(nmi_state), \ - NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \ + if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \ + write_cr2(this_cpu_read(nmi_cr2)); \ + if (this_cpu_dec_return(nmi_state)) \ goto nmi_restart; \ } while (0) #else /* x86_64 */ diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 149b8d9..6d9582e 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -42,7 +42,8 @@ static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs) static void __init init_nmi_testsuite(void) { /* trap all the unknown NMIs we may generate */ - register_nmi_handler_initonly(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); + register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk", + __initdata); } static void __init cleanup_nmi_testsuite(void) @@ -64,8 +65,8 @@ static void __init test_nmi_ipi(struct cpumask *mask) { unsigned long timeout; - if (register_nmi_handler_initonly(NMI_LOCAL, test_nmi_ipi_callback, - NMI_FLAG_FIRST, "nmi_selftest")) { + if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback, + NMI_FLAG_FIRST, "nmi_selftest", __initdata)) { nmi_fail = FAILURE; return; } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 9ce8859..17fff18 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -352,9 +352,7 @@ struct pv_cpu_ops pv_cpu_ops = { #endif .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, - .rdmsr_regs = native_rdmsr_safe_regs, .write_msr = native_write_msr_safe, - .wrmsr_regs = native_wrmsr_safe_regs, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, .read_tscp = native_read_tscp, diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index b72838b..299d493 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -22,6 +22,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) "Calgary: " fmt + #include <linux/kernel.h> #include <linux/init.h> #include <linux/types.h> @@ -245,7 +247,7 @@ static unsigned long iommu_range_alloc(struct device *dev, offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0, npages, 0, boundary_size, 0); if (offset == ~0UL) { - printk(KERN_WARNING "Calgary: IOMMU full.\n"); + pr_warn("IOMMU full\n"); spin_unlock_irqrestore(&tbl->it_lock, flags); if (panic_on_overflow) panic("Calgary: fix the allocator.\n"); @@ -271,8 +273,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, entry = iommu_range_alloc(dev, tbl, npages); if (unlikely(entry == DMA_ERROR_CODE)) { - printk(KERN_WARNING "Calgary: failed to allocate %u pages in " - "iommu %p\n", npages, tbl); + pr_warn("failed to allocate %u pages in iommu %p\n", + npages, tbl); return DMA_ERROR_CODE; } @@ -561,8 +563,7 @@ static void calgary_tce_cache_blast(struct iommu_table *tbl) i++; } while ((val & 0xff) != 0xff && i < 100); if (i == 100) - printk(KERN_WARNING "Calgary: PCI bus not quiesced, " - "continuing anyway\n"); + pr_warn("PCI bus not quiesced, continuing anyway\n"); /* invalidate TCE cache */ target = calgary_reg(bbar, tar_offset(tbl->it_busno)); @@ -604,8 +605,7 @@ begin: i++; } while ((val64 & 0xff) != 0xff && i < 100); if (i == 100) - printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, " - "continuing anyway\n"); + pr_warn("CalIOC2: PCI bus not quiesced, continuing anyway\n"); /* 3. poll Page Migration DEBUG for SoftStopFault */ target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); @@ -617,8 +617,7 @@ begin: if (++count < 100) goto begin; else { - printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, " - "aborting TCE cache flush sequence!\n"); + pr_warn("CalIOC2: too many SoftStopFaults, aborting TCE cache flush sequence!\n"); return; /* pray for the best */ } } @@ -840,8 +839,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl) plssr = be32_to_cpu(readl(target)); /* If no error, the agent ID in the CSR is not valid */ - printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, " - "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr); + pr_emerg("DMA error on Calgary PHB 0x%x, 0x%08x@CSR 0x%08x@PLSSR\n", + tbl->it_busno, csr, plssr); } static void calioc2_dump_error_regs(struct iommu_table *tbl) @@ -867,22 +866,21 @@ static void calioc2_dump_error_regs(struct iommu_table *tbl) target = calgary_reg(bbar, phboff | 0x800); mck = be32_to_cpu(readl(target)); - printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n", - tbl->it_busno); + pr_emerg("DMA error on CalIOC2 PHB 0x%x\n", tbl->it_busno); - printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", - csr, plssr, csmr, mck); + pr_emerg("0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", + csr, plssr, csmr, mck); /* dump rest of error regs */ - printk(KERN_EMERG "Calgary: "); + pr_emerg(""); for (i = 0; i < ARRAY_SIZE(errregs); i++) { /* err regs are at 0x810 - 0x870 */ erroff = (0x810 + (i * 0x10)); target = calgary_reg(bbar, phboff | erroff); errregs[i] = be32_to_cpu(readl(target)); - printk("0x%08x@0x%lx ", errregs[i], erroff); + pr_cont("0x%08x@0x%lx ", errregs[i], erroff); } - printk("\n"); + pr_cont("\n"); /* root complex status */ target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 62c9457..de2b7ad 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -45,15 +45,6 @@ int iommu_detected __read_mostly = 0; */ int iommu_pass_through __read_mostly; -/* - * Group multi-function PCI devices into a single device-group for the - * iommu_device_group interface. This tells the iommu driver to pretend - * it cannot distinguish between functions of a device, exposing only one - * group for the device. Useful for disallowing use of individual PCI - * functions from userspace drivers. - */ -int iommu_group_mf __read_mostly; - extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; /* Dummy device used for NULL arguments (normally ISA). */ @@ -100,7 +91,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, struct dma_attrs *attrs) { unsigned long dma_mask; - struct page *page = NULL; + struct page *page; unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; dma_addr_t addr; @@ -108,6 +99,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, flag |= __GFP_ZERO; again: + page = NULL; if (!(flag & GFP_ATOMIC)) page = dma_alloc_from_contiguous(dev, count, get_order(size)); if (!page) @@ -193,8 +185,6 @@ static __init int iommu_setup(char *p) #endif if (!strncmp(p, "pt", 2)) iommu_pass_through = 1; - if (!strncmp(p, "group_mf", 8)) - iommu_group_mf = 1; gart_parse_options(p); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 735279e..ef6a845 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/errno.h> #include <linux/kernel.h> #include <linux/mm.h> @@ -145,16 +147,14 @@ void show_regs_common(void) /* Board Name is optional */ board = dmi_get_system_info(DMI_BOARD_NAME); - printk(KERN_CONT "\n"); - printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - printk(KERN_CONT " %s %s", vendor, product); - if (board) - printk(KERN_CONT "/%s", board); - printk(KERN_CONT "\n"); + printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s %s%s%s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version, + vendor, product, + board ? "/" : "", + board ? board : ""); } void flush_thread(void) @@ -645,7 +645,7 @@ static void amd_e400_idle(void) amd_e400_c1e_detected = true; if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); - printk(KERN_INFO "System has AMD C1E enabled\n"); + pr_info("System has AMD C1E enabled\n"); } } @@ -659,8 +659,7 @@ static void amd_e400_idle(void) */ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &cpu); - printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", - cpu); + pr_info("Switch to broadcast mode on CPU%d\n", cpu); } clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); @@ -681,8 +680,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP if (pm_idle == poll_idle && smp_num_siblings > 1) { - printk_once(KERN_WARNING "WARNING: polling idle and HT enabled," - " performance may degrade.\n"); + pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); } #endif if (pm_idle) @@ -692,11 +690,11 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) /* * One CPU supports mwait => All CPUs supports mwait */ - printk(KERN_INFO "using mwait in idle threads.\n"); + pr_info("using mwait in idle threads\n"); pm_idle = mwait_idle; } else if (cpu_has_amd_erratum(amd_erratum_400)) { /* E400: APIC timer interrupt does not wake up CPU from C1e */ - printk(KERN_INFO "using AMD E400 aware idle routine\n"); + pr_info("using AMD E400 aware idle routine\n"); pm_idle = amd_e400_idle; } else pm_idle = default_idle; @@ -715,7 +713,7 @@ static int __init idle_setup(char *str) return -EINVAL; if (!strcmp(str, "poll")) { - printk("using polling idle threads.\n"); + pr_info("using polling idle threads\n"); pm_idle = poll_idle; boot_option_idle_override = IDLE_POLL; } else if (!strcmp(str, "mwait")) { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 61cdf7f..0a980c9 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -117,10 +117,10 @@ void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { if (dead_task->mm->context.size) { - printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", - dead_task->comm, - dead_task->mm->context.ldt, - dead_task->mm->context.size); + pr_warn("WARNING: dead process %8s still has LDT? <%p/%d>\n", + dead_task->comm, + dead_task->mm->context.ldt, + dead_task->mm->context.size); BUG(); } } @@ -466,7 +466,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) task->thread.gs = addr; if (doit) { load_gs_index(0); - ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); + ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr); } } put_cpu(); @@ -494,7 +494,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) /* set the selector to 0 to not confuse __switch_to */ loadsegment(fs, 0); - ret = checking_wrmsrl(MSR_FS_BASE, addr); + ret = wrmsrl_safe(MSR_FS_BASE, addr); } } put_cpu(); diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 03920a1..1b27de5 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -512,7 +512,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, #if defined(CONFIG_PCI) && defined(CONFIG_NUMA) /* Set correct numa_node information for AMD NB functions */ -static void __init quirk_amd_nb_node(struct pci_dev *dev) +static void __devinit quirk_amd_nb_node(struct pci_dev *dev) { struct pci_dev *nb_ht; unsigned int devfn; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 25b48ed..52190a9 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/reboot.h> #include <linux/init.h> @@ -20,14 +22,12 @@ #include <asm/virtext.h> #include <asm/cpu.h> #include <asm/nmi.h> +#include <asm/smp.h> -#ifdef CONFIG_X86_32 -# include <linux/ctype.h> -# include <linux/mc146818rtc.h> -# include <asm/realmode.h> -#else -# include <asm/x86_init.h> -#endif +#include <linux/ctype.h> +#include <linux/mc146818rtc.h> +#include <asm/realmode.h> +#include <asm/x86_init.h> /* * Power off function, if any @@ -49,7 +49,7 @@ int reboot_force; */ static int reboot_default = 1; -#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) +#ifdef CONFIG_SMP static int reboot_cpu = -1; #endif @@ -67,8 +67,8 @@ bool port_cf9_safe = false; * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] * warm Don't set the cold reboot flag * cold Set the cold reboot flag - * bios Reboot by jumping through the BIOS (only for X86_32) - * smp Reboot by executing reset on BSP or other CPU (only for X86_32) + * bios Reboot by jumping through the BIOS + * smp Reboot by executing reset on BSP or other CPU * triple Force a triple fault (init) * kbd Use the keyboard controller. cold reset (default) * acpi Use the RESET_REG in the FADT @@ -95,7 +95,6 @@ static int __init reboot_setup(char *str) reboot_mode = 0; break; -#ifdef CONFIG_X86_32 #ifdef CONFIG_SMP case 's': if (isdigit(*(str+1))) { @@ -112,7 +111,6 @@ static int __init reboot_setup(char *str) #endif /* CONFIG_SMP */ case 'b': -#endif case 'a': case 'k': case 't': @@ -138,7 +136,6 @@ static int __init reboot_setup(char *str) __setup("reboot=", reboot_setup); -#ifdef CONFIG_X86_32 /* * Reboot options and system auto-detection code provided by * Dell Inc. so their systems "just work". :-) @@ -152,16 +149,14 @@ static int __init set_bios_reboot(const struct dmi_system_id *d) { if (reboot_type != BOOT_BIOS) { reboot_type = BOOT_BIOS; - printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident); + pr_info("%s series board detected. Selecting %s-method for reboots.\n", + "BIOS", d->ident); } return 0; } -void machine_real_restart(unsigned int type) +void __noreturn machine_real_restart(unsigned int type) { - void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int)) - real_mode_header->machine_real_restart_asm; - local_irq_disable(); /* @@ -181,25 +176,28 @@ void machine_real_restart(unsigned int type) /* * Switch back to the initial page table. */ +#ifdef CONFIG_X86_32 load_cr3(initial_page_table); - - /* - * Write 0x1234 to absolute memory location 0x472. The BIOS reads - * this on booting to tell it to "Bypass memory test (also warm - * boot)". This seems like a fairly standard thing that gets set by - * REBOOT.COM programs, and the previous reset routine did this - * too. */ - *((unsigned short *)0x472) = reboot_mode; +#else + write_cr3(real_mode_header->trampoline_pgd); +#endif /* Jump to the identity-mapped low memory code */ - restart_lowmem(type); +#ifdef CONFIG_X86_32 + asm volatile("jmpl *%0" : : + "rm" (real_mode_header->machine_real_restart_asm), + "a" (type)); +#else + asm volatile("ljmpl *%0" : : + "m" (real_mode_header->machine_real_restart_asm), + "D" (type)); +#endif + unreachable(); } #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(machine_real_restart); #endif -#endif /* CONFIG_X86_32 */ - /* * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot */ @@ -207,8 +205,8 @@ static int __init set_pci_reboot(const struct dmi_system_id *d) { if (reboot_type != BOOT_CF9) { reboot_type = BOOT_CF9; - printk(KERN_INFO "%s series board detected. " - "Selecting PCI-method for reboots.\n", d->ident); + pr_info("%s series board detected. Selecting %s-method for reboots.\n", + "PCI", d->ident); } return 0; } @@ -217,17 +215,16 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d) { if (reboot_type != BOOT_KBD) { reboot_type = BOOT_KBD; - printk(KERN_INFO "%s series board detected. Selecting KBD-method for reboot.\n", d->ident); + pr_info("%s series board detected. Selecting %s-method for reboot.\n", + "KBD", d->ident); } return 0; } /* - * This is a single dmi_table handling all reboot quirks. Note that - * REBOOT_BIOS is only available for 32bit + * This is a single dmi_table handling all reboot quirks. */ static struct dmi_system_id __initdata reboot_dmi_table[] = { -#ifdef CONFIG_X86_32 { /* Handle problems with rebooting on Dell E520's */ .callback = set_bios_reboot, .ident = "Dell E520", @@ -377,7 +374,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "P4S800"), }, }, -#endif /* CONFIG_X86_32 */ { /* Handle reboot issue on Acer Aspire one */ .callback = set_kbd_reboot, @@ -451,6 +447,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), }, }, + { /* Handle problems with rebooting on the Precision M6600. */ + .callback = set_pci_reboot, + .ident = "Dell OptiPlex 990", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"), + }, + }, { } }; @@ -576,13 +580,11 @@ static void native_machine_emergency_restart(void) reboot_type = BOOT_KBD; break; -#ifdef CONFIG_X86_32 case BOOT_BIOS: machine_real_restart(MRR_BIOS); reboot_type = BOOT_KBD; break; -#endif case BOOT_ACPI: acpi_reboot(); @@ -624,12 +626,10 @@ void native_machine_shutdown(void) /* The boot cpu is always logical cpu 0 */ int reboot_cpu_id = 0; -#ifdef CONFIG_X86_32 /* See if there has been given a command line override */ if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) && cpu_online(reboot_cpu)) reboot_cpu_id = reboot_cpu; -#endif /* Make certain the cpu I'm about to reboot on is online */ if (!cpu_online(reboot_cpu_id)) @@ -670,7 +670,7 @@ static void __machine_emergency_restart(int emergency) static void native_machine_restart(char *__unused) { - printk("machine restart\n"); + pr_notice("machine restart\n"); if (!reboot_force) machine_shutdown(); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 16be6dc..f4b9b80 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1031,8 +1031,6 @@ void __init setup_arch(char **cmdline_p) x86_init.timers.wallclock_init(); - x86_platform.wallclock_init(); - mcheck_init(); arch_init_ideal_nops(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 5a98aa2..5cdff03 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -21,7 +21,7 @@ #include <asm/cpu.h> #include <asm/stackprotector.h> -DEFINE_PER_CPU(int, cpu_number); +DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); EXPORT_PER_CPU_SYMBOL(cpu_number); #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 21af737..b280908 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -6,6 +6,9 @@ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/sched.h> #include <linux/mm.h> #include <linux/smp.h> @@ -814,7 +817,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) me->comm, me->pid, where, frame, regs->ip, regs->sp, regs->orig_ax); print_vma_addr(" in ", regs->ip); - printk(KERN_CONT "\n"); + pr_cont("\n"); } force_sig(SIGSEGV, me); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3fab55b..7c5a8c3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1,4 +1,4 @@ -/* + /* * x86 SMP booting functions * * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> @@ -39,6 +39,8 @@ * Glauber Costa : i386 and x86_64 integration */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/init.h> #include <linux/smp.h> #include <linux/module.h> @@ -104,17 +106,17 @@ int smp_num_siblings = 1; EXPORT_SYMBOL(smp_num_siblings); /* Last level cache ID of each logical CPU */ -DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID; +DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID; /* representing HT siblings of each logical CPU */ -DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); /* representing HT and core siblings of each logical CPU */ -DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); -DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map); +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); /* Per CPU bogomips and other parameters */ DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); @@ -184,7 +186,7 @@ static void __cpuinit smp_callin(void) * boards) */ - pr_debug("CALLIN, before setup_local_APIC().\n"); + pr_debug("CALLIN, before setup_local_APIC()\n"); if (apic->smp_callin_clear_local_apic) apic->smp_callin_clear_local_apic(); setup_local_APIC(); @@ -255,22 +257,13 @@ notrace static void __cpuinit start_secondary(void *unused) check_tsc_sync_target(); /* - * We need to hold call_lock, so there is no inconsistency - * between the time smp_call_function() determines number of - * IPI recipients, and the time when the determination is made - * for which cpus receive the IPI. Holding this - * lock helps us to not include this cpu in a currently in progress - * smp_call_function(). - * * We need to hold vector_lock so there the set of online cpus * does not change while we are assigning vectors to cpus. Holding * this lock ensures we don't half assign or remove an irq from a cpu. */ - ipi_call_lock(); lock_vector_lock(); set_cpu_online(smp_processor_id(), true); unlock_vector_lock(); - ipi_call_unlock(); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; x86_platform.nmi_init(); @@ -349,9 +342,12 @@ static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - if (c->phys_proc_id == o->phys_proc_id) - return topology_sane(c, o, "mc"); + if (c->phys_proc_id == o->phys_proc_id) { + if (cpu_has(c, X86_FEATURE_AMD_DCM)) + return true; + return topology_sane(c, o, "mc"); + } return false; } @@ -429,17 +425,16 @@ static void impress_friends(void) /* * Allow the user to impress friends. */ - pr_debug("Before bogomips.\n"); + pr_debug("Before bogomips\n"); for_each_possible_cpu(cpu) if (cpumask_test_cpu(cpu, cpu_callout_mask)) bogosum += cpu_data(cpu).loops_per_jiffy; - printk(KERN_INFO - "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", + pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n", num_online_cpus(), bogosum/(500000/HZ), (bogosum/(5000/HZ))%100); - pr_debug("Before bogocount - setting activated=1.\n"); + pr_debug("Before bogocount - setting activated=1\n"); } void __inquire_remote_apic(int apicid) @@ -449,18 +444,17 @@ void __inquire_remote_apic(int apicid) int timeout; u32 status; - printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid); + pr_info("Inquiring remote APIC 0x%x...\n", apicid); for (i = 0; i < ARRAY_SIZE(regs); i++) { - printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]); + pr_info("... APIC 0x%x %s: ", apicid, names[i]); /* * Wait for idle. */ status = safe_apic_wait_icr_idle(); if (status) - printk(KERN_CONT - "a previous APIC delivery may have failed\n"); + pr_cont("a previous APIC delivery may have failed\n"); apic_icr_write(APIC_DM_REMRD | regs[i], apicid); @@ -473,10 +467,10 @@ void __inquire_remote_apic(int apicid) switch (status) { case APIC_ICR_RR_VALID: status = apic_read(APIC_RRR); - printk(KERN_CONT "%08x\n", status); + pr_cont("%08x\n", status); break; default: - printk(KERN_CONT "failed\n"); + pr_cont("failed\n"); } } } @@ -510,12 +504,12 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) apic_write(APIC_ESR, 0); accept_status = (apic_read(APIC_ESR) & 0xEF); } - pr_debug("NMI sent.\n"); + pr_debug("NMI sent\n"); if (send_status) - printk(KERN_ERR "APIC never delivered???\n"); + pr_err("APIC never delivered???\n"); if (accept_status) - printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); + pr_err("APIC delivery error (%lx)\n", accept_status); return (send_status | accept_status); } @@ -537,7 +531,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) apic_read(APIC_ESR); } - pr_debug("Asserting INIT.\n"); + pr_debug("Asserting INIT\n"); /* * Turn INIT on target chip @@ -553,7 +547,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) mdelay(10); - pr_debug("Deasserting INIT.\n"); + pr_debug("Deasserting INIT\n"); /* Target chip */ /* Send IPI */ @@ -586,14 +580,14 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) /* * Run STARTUP IPI loop. */ - pr_debug("#startup loops: %d.\n", num_starts); + pr_debug("#startup loops: %d\n", num_starts); for (j = 1; j <= num_starts; j++) { - pr_debug("Sending STARTUP #%d.\n", j); + pr_debug("Sending STARTUP #%d\n", j); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - pr_debug("After apic_write.\n"); + pr_debug("After apic_write\n"); /* * STARTUP IPI @@ -610,7 +604,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) */ udelay(300); - pr_debug("Startup point 1.\n"); + pr_debug("Startup point 1\n"); pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -625,12 +619,12 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) if (send_status || accept_status) break; } - pr_debug("After Startup.\n"); + pr_debug("After Startup\n"); if (send_status) - printk(KERN_ERR "APIC never delivered???\n"); + pr_err("APIC never delivered???\n"); if (accept_status) - printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); + pr_err("APIC delivery error (%lx)\n", accept_status); return (send_status | accept_status); } @@ -644,11 +638,11 @@ static void __cpuinit announce_cpu(int cpu, int apicid) if (system_state == SYSTEM_BOOTING) { if (node != current_node) { if (current_node > (-1)) - pr_cont(" Ok.\n"); + pr_cont(" OK\n"); current_node = node; pr_info("Booting Node %3d, Processors ", node); } - pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : ""); + pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " OK\n" : ""); return; } else pr_info("Booting Node %d Processor %d APIC 0x%x\n", @@ -728,9 +722,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) /* * allow APs to start initializing. */ - pr_debug("Before Callout %d.\n", cpu); + pr_debug("Before Callout %d\n", cpu); cpumask_set_cpu(cpu, cpu_callout_mask); - pr_debug("After Callout %d.\n", cpu); + pr_debug("After Callout %d\n", cpu); /* * Wait 5s total for a response @@ -758,7 +752,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) pr_err("CPU%d: Stuck ??\n", cpu); else /* trampoline code not run */ - pr_err("CPU%d: Not responding.\n", cpu); + pr_err("CPU%d: Not responding\n", cpu); if (apic->inquire_remote_apic) apic->inquire_remote_apic(apicid); } @@ -803,7 +797,7 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || !physid_isset(apicid, phys_cpu_present_map) || !apic->apic_id_valid(apicid)) { - printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); + pr_err("%s: bad cpu %d\n", __func__, cpu); return -EINVAL; } @@ -884,9 +878,8 @@ static int __init smp_sanity_check(unsigned max_cpus) unsigned int cpu; unsigned nr; - printk(KERN_WARNING - "More than 8 CPUs detected - skipping them.\n" - "Use CONFIG_X86_BIGSMP.\n"); + pr_warn("More than 8 CPUs detected - skipping them\n" + "Use CONFIG_X86_BIGSMP\n"); nr = 0; for_each_present_cpu(cpu) { @@ -907,8 +900,7 @@ static int __init smp_sanity_check(unsigned max_cpus) #endif if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { - printk(KERN_WARNING - "weird, boot CPU (#%d) not listed by the BIOS.\n", + pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n", hard_smp_processor_id()); physid_set(hard_smp_processor_id(), phys_cpu_present_map); @@ -920,11 +912,10 @@ static int __init smp_sanity_check(unsigned max_cpus) */ if (!smp_found_config && !acpi_lapic) { preempt_enable(); - printk(KERN_NOTICE "SMP motherboard not detected.\n"); + pr_notice("SMP motherboard not detected\n"); disable_smp(); if (APIC_init_uniprocessor()) - printk(KERN_NOTICE "Local APIC not detected." - " Using dummy APIC emulation.\n"); + pr_notice("Local APIC not detected. Using dummy APIC emulation.\n"); return -1; } @@ -933,9 +924,8 @@ static int __init smp_sanity_check(unsigned max_cpus) * CPU too, but we do it for the sake of robustness anyway. */ if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) { - printk(KERN_NOTICE - "weird, boot CPU (#%d) not listed by the BIOS.\n", - boot_cpu_physical_apicid); + pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n", + boot_cpu_physical_apicid); physid_set(hard_smp_processor_id(), phys_cpu_present_map); } preempt_enable(); @@ -948,8 +938,7 @@ static int __init smp_sanity_check(unsigned max_cpus) if (!disable_apic) { pr_err("BIOS bug, local APIC #%d not detected!...\n", boot_cpu_physical_apicid); - pr_err("... forcing use of dummy APIC emulation." - "(tell your hw vendor)\n"); + pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n"); } smpboot_clear_io_apic(); disable_ioapic_support(); @@ -962,7 +951,7 @@ static int __init smp_sanity_check(unsigned max_cpus) * If SMP should be disabled, then really disable it! */ if (!max_cpus) { - printk(KERN_INFO "SMP mode deactivated.\n"); + pr_info("SMP mode deactivated\n"); smpboot_clear_io_apic(); connect_bsp_APIC(); @@ -1014,7 +1003,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) if (smp_sanity_check(max_cpus) < 0) { - printk(KERN_INFO "SMP disabled\n"); + pr_info("SMP disabled\n"); disable_smp(); goto out; } @@ -1052,7 +1041,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) * Set up local APIC timer on boot CPU. */ - printk(KERN_INFO "CPU%d: ", 0); + pr_info("CPU%d: ", 0); print_cpu_info(&cpu_data(0)); x86_init.timers.setup_percpu_clockev(); @@ -1102,7 +1091,7 @@ void __init native_smp_prepare_boot_cpu(void) void __init native_smp_cpus_done(unsigned int max_cpus) { - pr_debug("Boot done.\n"); + pr_debug("Boot done\n"); nmi_selftest(); impress_friends(); @@ -1163,8 +1152,7 @@ __init void prefill_possible_map(void) /* nr_cpu_ids could be reduced via nr_cpus= */ if (possible > nr_cpu_ids) { - printk(KERN_WARNING - "%d Processors exceeds NR_CPUS limit of %d\n", + pr_warn("%d Processors exceeds NR_CPUS limit of %d\n", possible, nr_cpu_ids); possible = nr_cpu_ids; } @@ -1173,13 +1161,12 @@ __init void prefill_possible_map(void) if (!setup_max_cpus) #endif if (possible > i) { - printk(KERN_WARNING - "%d Processors exceeds max_cpus limit of %u\n", + pr_warn("%d Processors exceeds max_cpus limit of %u\n", possible, setup_max_cpus); possible = i; } - printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", + pr_info("Allowing %d CPUs, %d hotplug CPUs\n", possible, max_t(int, possible - num_processors, 0)); for (i = 0; i < possible; i++) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 05b31d9..b481341 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -9,6 +9,9 @@ /* * Handle hardware traps and faults. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/interrupt.h> #include <linux/kallsyms.h> #include <linux/spinlock.h> @@ -143,12 +146,11 @@ trap_signal: #ifdef CONFIG_X86_64 if (show_unhandled_signals && unhandled_signal(tsk, signr) && printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] trap %s ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, str, - regs->ip, regs->sp, error_code); + pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", + tsk->comm, tsk->pid, str, + regs->ip, regs->sp, error_code); print_vma_addr(" in ", regs->ip); - printk("\n"); + pr_cont("\n"); } #endif @@ -269,12 +271,11 @@ do_general_protection(struct pt_regs *regs, long error_code) if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] general protection ip:%lx sp:%lx error:%lx", + pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx", tsk->comm, task_pid_nr(tsk), regs->ip, regs->sp, error_code); print_vma_addr(" in ", regs->ip); - printk("\n"); + pr_cont("\n"); } force_sig(SIGSEGV, tsk); @@ -570,7 +571,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) conditional_sti(regs); #if 0 /* No need to warn about this any longer. */ - printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); + pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); #endif } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index fc0a147..cfa5d4f 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/sched.h> #include <linux/init.h> @@ -84,8 +86,7 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable); #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { - printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC completely.\n"); + pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n"); tsc_disabled = 1; return 1; } @@ -373,7 +374,7 @@ static unsigned long quick_pit_calibrate(void) goto success; } } - printk("Fast TSC calibration failed\n"); + pr_err("Fast TSC calibration failed\n"); return 0; success: @@ -392,7 +393,7 @@ success: */ delta *= PIT_TICK_RATE; do_div(delta, i*256*1000); - printk("Fast TSC calibration using PIT\n"); + pr_info("Fast TSC calibration using PIT\n"); return delta; } @@ -487,9 +488,8 @@ unsigned long native_calibrate_tsc(void) * use the reference value, as it is more precise. */ if (delta >= 90 && delta <= 110) { - printk(KERN_INFO - "TSC: PIT calibration matches %s. %d loops\n", - hpet ? "HPET" : "PMTIMER", i + 1); + pr_info("PIT calibration matches %s. %d loops\n", + hpet ? "HPET" : "PMTIMER", i + 1); return tsc_ref_min; } @@ -511,38 +511,36 @@ unsigned long native_calibrate_tsc(void) */ if (tsc_pit_min == ULONG_MAX) { /* PIT gave no useful value */ - printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n"); + pr_warn("Unable to calibrate against PIT\n"); /* We don't have an alternative source, disable TSC */ if (!hpet && !ref1 && !ref2) { - printk("TSC: No reference (HPET/PMTIMER) available\n"); + pr_notice("No reference (HPET/PMTIMER) available\n"); return 0; } /* The alternative source failed as well, disable TSC */ if (tsc_ref_min == ULONG_MAX) { - printk(KERN_WARNING "TSC: HPET/PMTIMER calibration " - "failed.\n"); + pr_warn("HPET/PMTIMER calibration failed\n"); return 0; } /* Use the alternative source */ - printk(KERN_INFO "TSC: using %s reference calibration\n", - hpet ? "HPET" : "PMTIMER"); + pr_info("using %s reference calibration\n", + hpet ? "HPET" : "PMTIMER"); return tsc_ref_min; } /* We don't have an alternative source, use the PIT calibration value */ if (!hpet && !ref1 && !ref2) { - printk(KERN_INFO "TSC: Using PIT calibration value\n"); + pr_info("Using PIT calibration value\n"); return tsc_pit_min; } /* The alternative source failed, use the PIT calibration value */ if (tsc_ref_min == ULONG_MAX) { - printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. " - "Using PIT calibration\n"); + pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n"); return tsc_pit_min; } @@ -551,9 +549,9 @@ unsigned long native_calibrate_tsc(void) * the PIT value as we know that there are PMTIMERs around * running at double speed. At least we let the user know: */ - printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n", - hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); - printk(KERN_INFO "TSC: Using PIT calibration value\n"); + pr_warn("PIT calibration deviates from %s: %lu %lu\n", + hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); + pr_info("Using PIT calibration value\n"); return tsc_pit_min; } @@ -785,7 +783,7 @@ void mark_tsc_unstable(char *reason) tsc_unstable = 1; sched_clock_stable = 0; disable_sched_clock_irqtime(); - printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); + pr_info("Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) clocksource_mark_unstable(&clocksource_tsc); @@ -912,9 +910,9 @@ static void tsc_refine_calibration_work(struct work_struct *work) goto out; tsc_khz = freq; - printk(KERN_INFO "Refined TSC clocksource calibration: " - "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000, - (unsigned long)tsc_khz % 1000); + pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n", + (unsigned long)tsc_khz / 1000, + (unsigned long)tsc_khz % 1000); out: clocksource_register_khz(&clocksource_tsc, tsc_khz); @@ -970,9 +968,9 @@ void __init tsc_init(void) return; } - printk("Detected %lu.%03lu MHz processor.\n", - (unsigned long)cpu_khz / 1000, - (unsigned long)cpu_khz % 1000); + pr_info("Detected %lu.%03lu MHz processor\n", + (unsigned long)cpu_khz / 1000, + (unsigned long)cpu_khz % 1000); /* * Secondary CPUs do not run through tsc_init(), so set up diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index dc4e910..36fd420 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -409,9 +409,10 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. * @mm: the probed address space. * @arch_uprobe: the probepoint information. + * @addr: virtual address at which to install the probepoint * Return 0 on success or a -ve number on error. */ -int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm) +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { int ret; struct insn insn; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 255f58a..54abcc0 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -28,6 +28,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/capability.h> #include <linux/errno.h> #include <linux/interrupt.h> @@ -137,14 +139,14 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) local_irq_enable(); if (!current->thread.vm86_info) { - printk("no vm86_info: BAD\n"); + pr_alert("no vm86_info: BAD\n"); do_exit(SIGSEGV); } set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask); tmp = copy_vm86_regs_to_user(¤t->thread.vm86_info->regs, regs); tmp += put_user(current->thread.screen_bitmap, ¤t->thread.vm86_info->screen_bitmap); if (tmp) { - printk("vm86: could not access userspace vm86_info\n"); + pr_alert("could not access userspace vm86_info\n"); do_exit(SIGSEGV); } diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 8eeb55a..992f890 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -16,6 +16,7 @@ #include <linux/pci_ids.h> #include <linux/pci_regs.h> #include <linux/smp.h> +#include <linux/irq.h> #include <asm/apic.h> #include <asm/pci-direct.h> @@ -95,6 +96,18 @@ static void __init set_vsmp_pv_ops(void) ctl = readl(address + 4); printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n", cap, ctl); + + /* If possible, let the vSMP foundation route the interrupt optimally */ +#ifdef CONFIG_SMP + if (cap & ctl & BIT(8)) { + ctl &= ~BIT(8); +#ifdef CONFIG_PROC_FS + /* Don't let users change irq affinity via procfs */ + no_irq_affinity = 1; +#endif + } +#endif + if (cap & ctl & (1 << 4)) { /* Setup irq ops and turn on vSMP IRQ fastpath handling */ pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable); @@ -102,12 +115,11 @@ static void __init set_vsmp_pv_ops(void) pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl); pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl); pv_init_ops.patch = vsmp_patch; - ctl &= ~(1 << 4); - writel(ctl, address + 4); - ctl = readl(address + 4); - printk(KERN_INFO "vSMP CTL: control set to:0x%08x\n", ctl); } + writel(ctl, address + 4); + ctl = readl(address + 4); + pr_info("vSMP CTL: control set to:0x%08x\n", ctl); early_iounmap(address, 8); } @@ -187,12 +199,36 @@ static void __init vsmp_cap_cpus(void) #endif } +static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) +{ + return hard_smp_processor_id() >> index_msb; +} + +/* + * In vSMP, all cpus should be capable of handling interrupts, regardless of + * the APIC used. + */ +static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) +{ + cpumask_setall(retmask); +} + +static void vsmp_apic_post_init(void) +{ + /* need to update phys_pkg_id */ + apic->phys_pkg_id = apicid_phys_pkg_id; + apic->vector_allocation_domain = fill_vector_allocation_domain; +} + void __init vsmp_init(void) { detect_vsmp_box(); if (!is_vsmp_box()) return; + x86_platform.apic_post_init = vsmp_apic_post_init; + vsmp_cap_cpus(); set_vsmp_pv_ops(); diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 7515cf0..8d141b3 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -18,6 +18,8 @@ * use the vDSO. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/time.h> #include <linux/init.h> #include <linux/kernel.h> @@ -111,18 +113,13 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, const char *message) { - static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - struct task_struct *tsk; - - if (!show_unhandled_signals || !__ratelimit(&rs)) + if (!show_unhandled_signals) return; - tsk = current; - - printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", - level, tsk->comm, task_pid_nr(tsk), - message, regs->ip, regs->cs, - regs->sp, regs->ax, regs->si, regs->di); + pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + level, current->comm, task_pid_nr(current), + message, regs->ip, regs->cs, + regs->sp, regs->ax, regs->si, regs->di); } static int addr_to_vsyscall_nr(unsigned long addr) @@ -139,6 +136,19 @@ static int addr_to_vsyscall_nr(unsigned long addr) return nr; } +#ifdef CONFIG_SECCOMP +static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr) +{ + if (!seccomp_mode(&tsk->seccomp)) + return 0; + task_pt_regs(tsk)->orig_ax = syscall_nr; + task_pt_regs(tsk)->ax = syscall_nr; + return __secure_computing(syscall_nr); +} +#else +#define vsyscall_seccomp(_tsk, _nr) 0 +#endif + static bool write_ok_or_segv(unsigned long ptr, size_t size) { /* @@ -174,6 +184,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) int vsyscall_nr; int prev_sig_on_uaccess_error; long ret; + int skip; /* * No point in checking CS -- the only way to get here is a user mode @@ -205,9 +216,6 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) } tsk = current; - if (seccomp_mode(&tsk->seccomp)) - do_exit(SIGKILL); - /* * With a real vsyscall, page faults cause SIGSEGV. We want to * preserve that behavior to make writing exploits harder. @@ -222,8 +230,13 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) * address 0". */ ret = -EFAULT; + skip = 0; switch (vsyscall_nr) { case 0: + skip = vsyscall_seccomp(tsk, __NR_gettimeofday); + if (skip) + break; + if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || !write_ok_or_segv(regs->si, sizeof(struct timezone))) break; @@ -234,6 +247,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) break; case 1: + skip = vsyscall_seccomp(tsk, __NR_time); + if (skip) + break; + if (!write_ok_or_segv(regs->di, sizeof(time_t))) break; @@ -241,6 +258,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) break; case 2: + skip = vsyscall_seccomp(tsk, __NR_getcpu); + if (skip) + break; + if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || !write_ok_or_segv(regs->si, sizeof(unsigned))) break; @@ -253,6 +274,12 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; + if (skip) { + if ((long)regs->ax <= 0L) /* seccomp errno emulation */ + goto do_ret; + goto done; /* seccomp trace/trap */ + } + if (ret == -EFAULT) { /* Bad news -- userspace fed a bad pointer to a vsyscall. */ warn_bad_vsyscall(KERN_INFO, regs, @@ -271,10 +298,11 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) regs->ax = ret; +do_ret: /* Emulate a ret instruction. */ regs->ip = caller; regs->sp += 8; - +done: return true; sigsegv: diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 9796c2f..6020f6f 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -28,6 +28,7 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(copy_user_generic_string); EXPORT_SYMBOL(copy_user_generic_unrolled); +EXPORT_SYMBOL(copy_user_enhanced_fast_string); EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_to_user); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 35c5e54..9f3167e 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -29,7 +29,6 @@ void __init x86_init_uint_noop(unsigned int unused) { } void __init x86_init_pgd_noop(pgd_t *unused) { } int __init iommu_init_noop(void) { return 0; } void iommu_shutdown_noop(void) { } -void wallclock_init_noop(void) { } /* * The platform setup functions are preset with the default functions @@ -101,7 +100,6 @@ static int default_i8042_detect(void) { return 1; }; struct x86_platform_ops x86_platform = { .calibrate_tsc = native_calibrate_tsc, - .wallclock_init = wallclock_init_noop, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, .iommu_shutdown = iommu_shutdown_noop, diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index bd18149..3d3e207 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -3,6 +3,9 @@ * * Author: Suresh Siddha <suresh.b.siddha@intel.com> */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/bootmem.h> #include <linux/compat.h> #include <asm/i387.h> @@ -162,7 +165,7 @@ int save_i387_xstate(void __user *buf) BUG_ON(sig_xstate_size < xstate_size); if ((unsigned long)buf % 64) - printk("save_i387_xstate: bad fpstate %p\n", buf); + pr_err("%s: bad fpstate %p\n", __func__, buf); if (!used_math()) return 0; @@ -422,7 +425,7 @@ static void __init xstate_enable_boot_cpu(void) pcntxt_mask = eax + ((u64)edx << 32); if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { - printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n", + pr_err("FP/SSE not shown under xsave features 0x%llx\n", pcntxt_mask); BUG(); } @@ -445,9 +448,8 @@ static void __init xstate_enable_boot_cpu(void) setup_xstate_init(); - printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, " - "cntxt size 0x%x\n", - pcntxt_mask, xstate_size); + pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", + pcntxt_mask, xstate_size); } /* |