diff options
Diffstat (limited to 'arch/x86')
36 files changed, 437 insertions, 208 deletions
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index cae3feb..db75d07 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -91,7 +91,7 @@ static int detect_memory_e801(void) if (oreg.ax > 15*1024) { return -1; /* Bogus! */ } else if (oreg.ax == 15*1024) { - boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax; + boot_params.alt_mem_k = (oreg.bx << 6) + oreg.ax; } else { /* * This ignores memory above 16MB if we have a memory diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index d87988b..34595d5 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -78,6 +78,7 @@ #define APIC_DEST_LOGICAL 0x00800 #define APIC_DEST_PHYSICAL 0x00000 #define APIC_DM_FIXED 0x00000 +#define APIC_DM_FIXED_MASK 0x00700 #define APIC_DM_LOWEST 0x00100 #define APIC_DM_SMI 0x00200 #define APIC_DM_REMRD 0x00300 diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 43085bf..156cd5d 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -66,7 +66,7 @@ static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order) * Don't enable translation but enable GART IO and CPU accesses. * Also, set DISTLBWALKPRB since GART tables memory is UC. */ - ctl = DISTLBWALKPRB | order << 1; + ctl = order << 1; pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); } @@ -75,17 +75,17 @@ static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) { u32 tmp, ctl; - /* address of the mappings table */ - addr >>= 12; - tmp = (u32) addr<<4; - tmp &= ~0xf; - pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp); - - /* Enable GART translation for this hammer. */ - pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); - ctl |= GARTEN; - ctl &= ~(DISGARTCPU | DISGARTIO); - pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); + /* address of the mappings table */ + addr >>= 12; + tmp = (u32) addr<<4; + tmp &= ~0xf; + pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp); + + /* Enable GART translation for this hammer. */ + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); + ctl |= GARTEN | DISTLBWALKPRB; + ctl &= ~(DISGARTCPU | DISGARTIO); + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); } static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size) diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index c4bd267..a97a240 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -150,7 +150,7 @@ void setup_IO_APIC_irq_extra(u32 gsi); extern void ioapic_and_gsi_init(void); extern void ioapic_insert_resources(void); -int io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr); +int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr); extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 3d4dab4..a50fc9f 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -51,7 +51,7 @@ static inline void numa_remove_cpu(int cpu) { } #endif /* CONFIG_NUMA */ #ifdef CONFIG_DEBUG_PER_CPU_MAPS -struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable); +void debug_cpumask_set_cpu(int cpu, int node, bool enable); #endif #endif /* _ASM_X86_NUMA_H */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 7db7723..d56187c 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -299,6 +299,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, /* Install a pte for a particular vaddr in kernel space. */ void set_pte_vaddr(unsigned long vaddr, pte_t pte); +extern void native_pagetable_reserve(u64 start, u64 end); #ifdef CONFIG_X86_32 extern void native_pagetable_setup_start(pgd_t *base); extern void native_pagetable_setup_done(pgd_t *base); diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 3e094af..130f1ee 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -94,6 +94,8 @@ /* after this # consecutive successes, bump up the throttle if it was lowered */ #define COMPLETE_THRESHOLD 5 +#define UV_LB_SUBNODEID 0x10 + /* * number of entries in the destination side payload queue */ @@ -124,7 +126,7 @@ * The distribution specification (32 bytes) is interpreted as a 256-bit * distribution vector. Adjacent bits correspond to consecutive even numbered * nodeIDs. The result of adding the index of a given bit to the 15-bit - * 'base_dest_nodeid' field of the header corresponds to the + * 'base_dest_nasid' field of the header corresponds to the * destination nodeID associated with that specified bit. */ struct bau_target_uvhubmask { @@ -176,7 +178,7 @@ struct bau_msg_payload { struct bau_msg_header { unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ /* bits 5:0 */ - unsigned int base_dest_nodeid:15; /* nasid of the */ + unsigned int base_dest_nasid:15; /* nasid of the */ /* bits 20:6 */ /* first bit in uvhub map */ unsigned int command:8; /* message type */ /* bits 28:21 */ @@ -378,6 +380,10 @@ struct ptc_stats { unsigned long d_rcanceled; /* number of messages canceled by resets */ }; +struct hub_and_pnode { + short uvhub; + short pnode; +}; /* * one per-cpu; to locate the software tables */ @@ -399,10 +405,12 @@ struct bau_control { int baudisabled; int set_bau_off; short cpu; + short osnode; short uvhub_cpu; short uvhub; short cpus_in_socket; short cpus_in_uvhub; + short partition_base_pnode; unsigned short message_number; unsigned short uvhub_quiesce; short socket_acknowledge_count[DEST_Q_SIZE]; @@ -422,15 +430,16 @@ struct bau_control { int congested_period; cycles_t period_time; long period_requests; + struct hub_and_pnode *target_hub_and_pnode; }; static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp) { return constant_test_bit(uvhub, &dstp->bits[0]); } -static inline void bau_uvhub_set(int uvhub, struct bau_target_uvhubmask *dstp) +static inline void bau_uvhub_set(int pnode, struct bau_target_uvhubmask *dstp) { - __set_bit(uvhub, &dstp->bits[0]); + __set_bit(pnode, &dstp->bits[0]); } static inline void bau_uvhubs_clear(struct bau_target_uvhubmask *dstp, int nbits) diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index a501741..4298002 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -398,6 +398,8 @@ struct uv_blade_info { unsigned short nr_online_cpus; unsigned short pnode; short memory_nid; + spinlock_t nmi_lock; + unsigned long nmi_count; }; extern struct uv_blade_info *uv_blade_info; extern short *uv_node_to_blade; diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index 20cafea..f5bb64a 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -5,7 +5,7 @@ * * SGI UV MMR definitions * - * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2011 Silicon Graphics, Inc. All rights reserved. */ #ifndef _ASM_X86_UV_UV_MMRS_H @@ -1099,5 +1099,19 @@ union uvh_rtc1_int_config_u { } s; }; +/* ========================================================================= */ +/* UVH_SCRATCH5 */ +/* ========================================================================= */ +#define UVH_SCRATCH5 0x2d0200UL +#define UVH_SCRATCH5_32 0x00778 + +#define UVH_SCRATCH5_SCRATCH5_SHFT 0 +#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL +union uvh_scratch5_u { + unsigned long v; + struct uvh_scratch5_s { + unsigned long scratch5 : 64; /* RW, W1CS */ + } s; +}; #endif /* __ASM_UV_MMRS_X86_H__ */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 643ebf2..d3d8590 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -68,6 +68,17 @@ struct x86_init_oem { }; /** + * struct x86_init_mapping - platform specific initial kernel pagetable setup + * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage + * + * For more details on the purpose of this hook, look in + * init_memory_mapping and the commit that added it. + */ +struct x86_init_mapping { + void (*pagetable_reserve)(u64 start, u64 end); +}; + +/** * struct x86_init_paging - platform specific paging functions * @pagetable_setup_start: platform specific pre paging_init() call * @pagetable_setup_done: platform specific post paging_init() call @@ -123,6 +134,7 @@ struct x86_init_ops { struct x86_init_mpparse mpparse; struct x86_init_irqs irqs; struct x86_init_oem oem; + struct x86_init_mapping mapping; struct x86_init_paging paging; struct x86_init_timers timers; struct x86_init_iommu iommu; diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 86d1ad4..73fb469 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -499,7 +499,7 @@ out: * Don't enable translation yet but enable GART IO and CPU * accesses and set DISTLBWALKPRB since GART table memory is UC. */ - u32 ctl = DISTLBWALKPRB | aper_order << 1; + u32 ctl = aper_order << 1; bus = amd_nb_bus_dev_ranges[i].bus; dev_base = amd_nb_bus_dev_ranges[i].dev_base; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 68df09b..45fd33d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -128,8 +128,8 @@ static int __init parse_noapic(char *str) } early_param("noapic", parse_noapic); -static int io_apic_setup_irq_pin_once(unsigned int irq, int node, - struct io_apic_irq_attr *attr); +static int io_apic_setup_irq_pin(unsigned int irq, int node, + struct io_apic_irq_attr *attr); /* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ void mp_save_irq(struct mpc_intsrc *m) @@ -3570,7 +3570,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) } #endif /* CONFIG_HT_IRQ */ -int +static int io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) { struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node); @@ -3585,8 +3585,8 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) return ret; } -static int io_apic_setup_irq_pin_once(unsigned int irq, int node, - struct io_apic_irq_attr *attr) +int io_apic_setup_irq_pin_once(unsigned int irq, int node, + struct io_apic_irq_attr *attr) { unsigned int id = attr->ioapic, pin = attr->ioapic_pin; int ret; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 33b10a0..7acd2d2 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -37,6 +37,13 @@ #include <asm/smp.h> #include <asm/x86_init.h> #include <asm/emergency-restart.h> +#include <asm/nmi.h> + +/* BMC sets a bit this MMR non-zero before sending an NMI */ +#define UVH_NMI_MMR UVH_SCRATCH5 +#define UVH_NMI_MMR_CLEAR (UVH_NMI_MMR + 8) +#define UV_NMI_PENDING_MASK (1UL << 63) +DEFINE_PER_CPU(unsigned long, cpu_last_nmi_count); DEFINE_PER_CPU(int, x2apic_extra_bits); @@ -642,18 +649,46 @@ void __cpuinit uv_cpu_init(void) */ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) { + unsigned long real_uv_nmi; + int bid; + if (reason != DIE_NMIUNKNOWN) return NOTIFY_OK; if (in_crash_kexec) /* do nothing if entering the crash kernel */ return NOTIFY_OK; + /* - * Use a lock so only one cpu prints at a time - * to prevent intermixed output. + * Each blade has an MMR that indicates when an NMI has been sent + * to cpus on the blade. If an NMI is detected, atomically + * clear the MMR and update a per-blade NMI count used to + * cause each cpu on the blade to notice a new NMI. + */ + bid = uv_numa_blade_id(); + real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK); + + if (unlikely(real_uv_nmi)) { + spin_lock(&uv_blade_info[bid].nmi_lock); + real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK); + if (real_uv_nmi) { + uv_blade_info[bid].nmi_count++; + uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK); + } + spin_unlock(&uv_blade_info[bid].nmi_lock); + } + + if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count)) + return NOTIFY_DONE; + + __get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count; + + /* + * Use a lock so only one cpu prints at a time. + * This prevents intermixed output. */ spin_lock(&uv_nmi_lock); - pr_info("NMI stack dump cpu %u:\n", smp_processor_id()); + pr_info("UV NMI stack dump cpu %u:\n", smp_processor_id()); dump_stack(); spin_unlock(&uv_nmi_lock); @@ -661,7 +696,8 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) } static struct notifier_block uv_dump_stack_nmi_nb = { - .notifier_call = uv_handle_nmi + .notifier_call = uv_handle_nmi, + .priority = NMI_LOCAL_LOW_PRIOR - 1, }; void uv_register_nmi_notifier(void) @@ -720,8 +756,9 @@ void __init uv_system_init(void) printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); - uv_blade_info = kmalloc(bytes, GFP_KERNEL); + uv_blade_info = kzalloc(bytes, GFP_KERNEL); BUG_ON(!uv_blade_info); + for (blade = 0; blade < uv_num_possible_blades(); blade++) uv_blade_info[blade].memory_nid = -1; @@ -747,6 +784,7 @@ void __init uv_system_init(void) uv_blade_info[blade].pnode = pnode; uv_blade_info[blade].nr_possible_cpus = 0; uv_blade_info[blade].nr_online_cpus = 0; + spin_lock_init(&uv_blade_info[blade].nmi_lock); max_pnode = max(pnode, max_pnode); blade++; } diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 0b4be43..adee12e 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -228,6 +228,7 @@ #include <linux/kthread.h> #include <linux/jiffies.h> #include <linux/acpi.h> +#include <linux/syscore_ops.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -1238,6 +1239,7 @@ static int suspend(int vetoable) local_irq_disable(); sysdev_suspend(PMSG_SUSPEND); + syscore_suspend(); local_irq_enable(); @@ -1255,6 +1257,7 @@ static int suspend(int vetoable) apm_error("suspend", err); err = (err == APM_SUCCESS) ? 0 : -EIO; + syscore_resume(); sysdev_resume(); local_irq_enable(); @@ -1280,6 +1283,7 @@ static void standby(void) local_irq_disable(); sysdev_suspend(PMSG_SUSPEND); + syscore_suspend(); local_irq_enable(); err = set_system_power_state(APM_STATE_STANDBY); @@ -1287,6 +1291,7 @@ static void standby(void) apm_error("standby", err); local_irq_disable(); + syscore_resume(); sysdev_resume(); local_irq_enable(); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 3532d3b..6f9d1f6 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -613,7 +613,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) #endif /* As a rule processors have APIC timer running in deep C states */ - if (c->x86 >= 0xf && !cpu_has_amd_erratum(amd_erratum_400)) + if (c->x86 > 0xf && !cpu_has_amd_erratum(amd_erratum_400)) set_cpu_cap(c, X86_FEATURE_ARAT); /* diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 167f97b..bb0adad 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -509,6 +509,7 @@ recurse: out_free: if (b) { kobject_put(&b->kobj); + list_del(&b->miscj); kfree(b); } return err; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 6f8c5e9..0f03446 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -446,18 +446,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c) */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); + h = lvtthmr_init; /* * The initial value of thermal LVT entries on all APs always reads * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI * sequence to them and LVT registers are reset to 0s except for * the mask bits which are set to 1s when APs receive INIT IPI. - * Always restore the value that BIOS has programmed on AP based on - * BSP's info we saved since BIOS is always setting the same value - * for all threads/cores + * If BIOS takes over the thermal interrupt and sets its interrupt + * delivery mode to SMI (not fixed), it restores the value that the + * BIOS has programmed on AP based on BSP's info we saved since BIOS + * is always setting the same value for all threads/cores. */ - apic_write(APIC_LVTTHMR, lvtthmr_init); + if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED) + apic_write(APIC_LVTTHMR, lvtthmr_init); - h = lvtthmr_init; if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index eed3673a..e638689 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -586,8 +586,12 @@ static int x86_setup_perfctr(struct perf_event *event) return -EOPNOTSUPP; } + /* + * Do not allow config1 (extended registers) to propagate, + * there's no sane user-space generalization yet: + */ if (attr->type == PERF_TYPE_RAW) - return x86_pmu_extra_regs(event->attr.config, event); + return 0; if (attr->type == PERF_TYPE_HW_CACHE) return set_ext_hw_attr(hwc, event); @@ -609,8 +613,8 @@ static int x86_setup_perfctr(struct perf_event *event) /* * Branch tracing: */ - if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && - (hwc->sample_period == 1)) { + if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && + !attr->freq && hwc->sample_period == 1) { /* BTS is not supported by this architecture. */ if (!x86_pmu.bts_active) return -EOPNOTSUPP; @@ -1284,6 +1288,16 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_events); + /* + * Some chipsets need to unmask the LVTPC in a particular spot + * inside the nmi handler. As a result, the unmasking was pushed + * into all the nmi handlers. + * + * This generic handler doesn't seem to have any issues where the + * unmasking occurs so it was left at the top. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + for (idx = 0; idx < x86_pmu.num_counters; idx++) { if (!test_bit(idx, cpuc->active_mask)) { /* @@ -1370,8 +1384,6 @@ perf_event_nmi_handler(struct notifier_block *self, return NOTIFY_DONE; } - apic_write(APIC_LVTPC, APIC_DM_NMI); - handled = x86_pmu.handle_irq(args->regs); if (!handled) return NOTIFY_DONE; diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 461f62b..cf4e369 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -8,7 +8,7 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(L1D) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ + [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ @@ -427,7 +427,9 @@ static __initconst const struct x86_pmu amd_pmu = { * * Exceptions: * + * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*) * 0x003 FP PERF_CTL[3] + * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*) * 0x00B FP PERF_CTL[3] * 0x00D FP PERF_CTL[3] * 0x023 DE PERF_CTL[2:0] @@ -448,6 +450,8 @@ static __initconst const struct x86_pmu amd_pmu = { * 0x0DF LS PERF_CTL[5:0] * 0x1D6 EX PERF_CTL[5:0] * 0x1D8 EX PERF_CTL[5:0] + * + * (*) depending on the umask all FPU counters may be used */ static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); @@ -460,18 +464,28 @@ static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); static struct event_constraint * amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event) { - unsigned int event_code = amd_get_event_code(&event->hw); + struct hw_perf_event *hwc = &event->hw; + unsigned int event_code = amd_get_event_code(hwc); switch (event_code & AMD_EVENT_TYPE_MASK) { case AMD_EVENT_FP: switch (event_code) { + case 0x000: + if (!(hwc->config & 0x0000F000ULL)) + break; + if (!(hwc->config & 0x00000F00ULL)) + break; + return &amd_f15_PMC3; + case 0x004: + if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) + break; + return &amd_f15_PMC3; case 0x003: case 0x00B: case 0x00D: return &amd_f15_PMC3; - default: - return &amd_f15_PMC53; } + return &amd_f15_PMC53; case AMD_EVENT_LS: case AMD_EVENT_DC: case AMD_EVENT_EX_LS: diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 8fc2b2c..447a28d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -25,7 +25,7 @@ struct intel_percore { /* * Intel PerfMon, used on Core and later. */ -static const u64 intel_perfmon_event_map[] = +static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = { [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, @@ -184,26 +184,23 @@ static __initconst const u64 snb_hw_cache_event_ids }, }, [ C(LL ) ] = { - /* - * TBD: Need Off-core Response Performance Monitoring support - */ [ C(OP_READ) ] = { - /* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */ + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01bb, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, [ C(OP_WRITE) ] = { - /* OFFCORE_RESPONSE_0.ANY_RFO.LOCAL_CACHE */ + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE_1.ANY_RFO.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01bb, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, [ C(OP_PREFETCH) ] = { - /* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */ + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01bb, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, }, [ C(DTLB) ] = { @@ -285,26 +282,26 @@ static __initconst const u64 westmere_hw_cache_event_ids }, [ C(LL ) ] = { [ C(OP_READ) ] = { - /* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */ + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01bb, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, /* * Use RFO, not WRITEBACK, because a write miss would typically occur * on RFO. */ [ C(OP_WRITE) ] = { - /* OFFCORE_RESPONSE_1.ANY_RFO.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01bb, - /* OFFCORE_RESPONSE_0.ANY_RFO.ANY_LLC_MISS */ + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ [ C(RESULT_MISS) ] = 0x01b7, }, [ C(OP_PREFETCH) ] = { - /* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */ + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01bb, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, }, [ C(DTLB) ] = { @@ -352,16 +349,36 @@ static __initconst const u64 westmere_hw_cache_event_ids }; /* - * OFFCORE_RESPONSE MSR bits (subset), See IA32 SDM Vol 3 30.6.1.3 + * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits; + * See IA32 SDM Vol 3B 30.6.1.3 */ -#define DMND_DATA_RD (1 << 0) -#define DMND_RFO (1 << 1) -#define DMND_WB (1 << 3) -#define PF_DATA_RD (1 << 4) -#define PF_DATA_RFO (1 << 5) -#define RESP_UNCORE_HIT (1 << 8) -#define RESP_MISS (0xf600) /* non uncore hit */ +#define NHM_DMND_DATA_RD (1 << 0) +#define NHM_DMND_RFO (1 << 1) +#define NHM_DMND_IFETCH (1 << 2) +#define NHM_DMND_WB (1 << 3) +#define NHM_PF_DATA_RD (1 << 4) +#define NHM_PF_DATA_RFO (1 << 5) +#define NHM_PF_IFETCH (1 << 6) +#define NHM_OFFCORE_OTHER (1 << 7) +#define NHM_UNCORE_HIT (1 << 8) +#define NHM_OTHER_CORE_HIT_SNP (1 << 9) +#define NHM_OTHER_CORE_HITM (1 << 10) + /* reserved */ +#define NHM_REMOTE_CACHE_FWD (1 << 12) +#define NHM_REMOTE_DRAM (1 << 13) +#define NHM_LOCAL_DRAM (1 << 14) +#define NHM_NON_DRAM (1 << 15) + +#define NHM_ALL_DRAM (NHM_REMOTE_DRAM|NHM_LOCAL_DRAM) + +#define NHM_DMND_READ (NHM_DMND_DATA_RD) +#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB) +#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO) + +#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM) +#define NHM_L3_MISS (NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD) +#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS) static __initconst const u64 nehalem_hw_cache_extra_regs [PERF_COUNT_HW_CACHE_MAX] @@ -370,16 +387,16 @@ static __initconst const u64 nehalem_hw_cache_extra_regs { [ C(LL ) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = DMND_DATA_RD|RESP_UNCORE_HIT, - [ C(RESULT_MISS) ] = DMND_DATA_RD|RESP_MISS, + [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS, }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = DMND_RFO|DMND_WB|RESP_UNCORE_HIT, - [ C(RESULT_MISS) ] = DMND_RFO|DMND_WB|RESP_MISS, + [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS, }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_UNCORE_HIT, - [ C(RESULT_MISS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_MISS, + [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS, }, } }; @@ -391,12 +408,12 @@ static __initconst const u64 nehalem_hw_cache_event_ids { [ C(L1D) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ }, [ C(OP_PREFETCH) ] = { [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ @@ -933,6 +950,16 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_events); + /* + * Some chipsets need to unmask the LVTPC in a particular spot + * inside the nmi handler. As a result, the unmasking was pushed + * into all the nmi handlers. + * + * This handler doesn't seem to have any issues with the unmasking + * so it was left at the top. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + intel_pmu_disable_all(); handled = intel_pmu_drain_bts_buffer(); status = intel_pmu_get_status(); @@ -998,6 +1025,9 @@ intel_bts_constraints(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; unsigned int hw_event, bts_event; + if (event->attr.freq) + return NULL; + hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); @@ -1305,7 +1335,7 @@ static void intel_clovertown_quirks(void) * AJ106 could possibly be worked around by not allowing LBR * usage from PEBS, including the fixup. * AJ68 could possibly be worked around by always programming - * a pebs_event_reset[0] value and coping with the lost events. + * a pebs_event_reset[0] value and coping with the lost events. * * But taken together it might just make sense to not enable PEBS on * these chips. @@ -1409,6 +1439,18 @@ static __init int intel_pmu_init(void) x86_pmu.percore_constraints = intel_nehalem_percore_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; + + if (ebx & 0x40) { + /* + * Erratum AAJ80 detected, we work it around by using + * the BR_MISP_EXEC.ANY event. This will over-count + * branch-misses, but it's still much better than the + * architectural event which is often completely bogus: + */ + intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; + + pr_cont("erratum AAJ80 worked around, "); + } pr_cont("Nehalem events, "); break; @@ -1425,6 +1467,7 @@ static __init int intel_pmu_init(void) case 37: /* 32 nm nehalem, "Clarkdale" */ case 44: /* 32 nm nehalem, "Gulftown" */ + case 47: /* 32 nm Xeon E7 */ memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index c2520e1..e93fcd5 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -947,14 +947,23 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; if (perf_event_overflow(event, 1, &data, regs)) - p4_pmu_disable_event(event); + x86_pmu_stop(event, 0); } - if (handled) { - /* p4 quirk: unmask it again */ - apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); + if (handled) inc_irq_stat(apic_perf_irqs); - } + + /* + * When dealing with the unmasking of the LVTPC on P4 perf hw, it has + * been observed that the OVF bit flag has to be cleared first _before_ + * the LVTPC can be unmasked. + * + * The reason is the NMI line will continue to be asserted while the OVF + * bit is set. This causes a second NMI to generate if the LVTPC is + * unmasked before the OVF bit is cleared, leading to unknown NMI + * messages. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); return handled; } diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 706a9fb..e90f084 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -391,7 +391,7 @@ static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize, set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity); - return io_apic_setup_irq_pin(*out_hwirq, cpu_to_node(0), &attr); + return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr); } static void __init ioapic_add_ofnode(struct device_node *np) diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index c969fd9..f1a6244d 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -1183,12 +1183,13 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + unsigned long flags; /* This is possible if op is under delayed unoptimizing */ if (kprobe_disabled(&op->kp)) return; - preempt_disable(); + local_irq_save(flags); if (kprobe_running()) { kprobes_inc_nmissed_count(&op->kp); } else { @@ -1207,7 +1208,7 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op, opt_pre_handler(&op->kp, regs); __this_cpu_write(current_kprobe, NULL); } - preempt_enable_no_resched(); + local_irq_restore(flags); } static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 82ada01..b117efd 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -81,6 +81,9 @@ static u32 gart_unmapped_entry; #define AGPEXTERN #endif +/* GART can only remap to physical addresses < 1TB */ +#define GART_MAX_PHYS_ADDR (1ULL << 40) + /* backdoor interface to AGP driver */ AGPEXTERN int agp_memory_reserved; AGPEXTERN __u32 *agp_gatt_table; @@ -212,9 +215,13 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, size_t size, int dir, unsigned long align_mask) { unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); - unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); + unsigned long iommu_page; int i; + if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR)) + return bad_dma_addr; + + iommu_page = alloc_iommu(dev, npages, align_mask); if (iommu_page == -1) { if (!nonforced_iommu(dev, phys_mem, size)) return phys_mem; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 45892dc..f65e5b5 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -608,6 +608,9 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) unsigned len, type; struct perf_event *bp; + if (ptrace_get_breakpoints(tsk) < 0) + return -ESRCH; + data &= ~DR_CONTROL_RESERVED; old_dr7 = ptrace_get_dr7(thread->ptrace_bps); restore: @@ -655,6 +658,9 @@ restore: } goto restore; } + + ptrace_put_breakpoints(tsk); + return ((orig_ret < 0) ? orig_ret : rc); } @@ -668,10 +674,17 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) if (n < HBP_NUM) { struct perf_event *bp; + + if (ptrace_get_breakpoints(tsk) < 0) + return -ESRCH; + bp = thread->ptrace_bps[n]; if (!bp) - return 0; - val = bp->hw.info.address; + val = 0; + else + val = bp->hw.info.address; + + ptrace_put_breakpoints(tsk); } else if (n == 6) { val = thread->debugreg6; } else if (n == 7) { @@ -686,6 +699,10 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, struct perf_event *bp; struct thread_struct *t = &tsk->thread; struct perf_event_attr attr; + int err = 0; + + if (ptrace_get_breakpoints(tsk) < 0) + return -ESRCH; if (!t->ptrace_bps[nr]) { ptrace_breakpoint_init(&attr); @@ -709,24 +726,23 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, * writing for the user. And anyway this is the previous * behaviour. */ - if (IS_ERR(bp)) - return PTR_ERR(bp); + if (IS_ERR(bp)) { + err = PTR_ERR(bp); + goto put; + } t->ptrace_bps[nr] = bp; } else { - int err; - bp = t->ptrace_bps[nr]; attr = bp->attr; attr.bp_addr = addr; err = modify_user_hw_breakpoint(bp, &attr); - if (err) - return err; } - - return 0; +put: + ptrace_put_breakpoints(tsk); + return err; } /* diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S index 29092b3..1d5c46d 100644 --- a/arch/x86/kernel/reboot_32.S +++ b/arch/x86/kernel/reboot_32.S @@ -21,26 +21,26 @@ r_base = . /* Get our own relocated address */ call 1f 1: popl %ebx - subl $1b, %ebx + subl $(1b - r_base), %ebx /* Compute the equivalent real-mode segment */ movl %ebx, %ecx shrl $4, %ecx /* Patch post-real-mode segment jump */ - movw dispatch_table(%ebx,%eax,2),%ax - movw %ax, 101f(%ebx) - movw %cx, 102f(%ebx) + movw (dispatch_table - r_base)(%ebx,%eax,2),%ax + movw %ax, (101f - r_base)(%ebx) + movw %cx, (102f - r_base)(%ebx) /* Set up the IDT for real mode. */ - lidtl machine_real_restart_idt(%ebx) + lidtl (machine_real_restart_idt - r_base)(%ebx) /* * Set up a GDT from which we can load segment descriptors for real * mode. The GDT is not used in real mode; it is just needed here to * prepare the descriptors. */ - lgdtl machine_real_restart_gdt(%ebx) + lgdtl (machine_real_restart_gdt - r_base)(%ebx) /* * Load the data segment registers with 16-bit compatible values diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8ed8908..c2871d3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -312,26 +312,6 @@ void __cpuinit smp_store_cpu_info(int id) identify_secondary_cpu(c); } -static void __cpuinit check_cpu_siblings_on_same_node(int cpu1, int cpu2) -{ - int node1 = early_cpu_to_node(cpu1); - int node2 = early_cpu_to_node(cpu2); - - /* - * Our CPU scheduler assumes all logical cpus in the same physical cpu - * share the same node. But, buggy ACPI or NUMA emulation might assign - * them to different node. Fix it. - */ - if (node1 != node2) { - pr_warning("CPU %d in node %d and CPU %d in node %d are in the same physical CPU. forcing same node %d\n", - cpu1, node1, cpu2, node2, node2); - - numa_remove_cpu(cpu1); - numa_set_node(cpu1, node2); - numa_add_cpu(cpu1); - } -} - static void __cpuinit link_thread_siblings(int cpu1, int cpu2) { cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); @@ -340,7 +320,6 @@ static void __cpuinit link_thread_siblings(int cpu1, int cpu2) cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2)); cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1)); - check_cpu_siblings_on_same_node(cpu1, cpu2); } @@ -382,12 +361,10 @@ void __cpuinit set_cpu_sibling_map(int cpu) per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { cpumask_set_cpu(i, cpu_llc_shared_mask(cpu)); cpumask_set_cpu(cpu, cpu_llc_shared_mask(i)); - check_cpu_siblings_on_same_node(cpu, i); } if (c->phys_proc_id == cpu_data(i).phys_proc_id) { cpumask_set_cpu(i, cpu_core_mask(cpu)); cpumask_set_cpu(cpu, cpu_core_mask(i)); - check_cpu_siblings_on_same_node(cpu, i); /* * Does this new cpu bringup a new core? */ diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index c11514e9..75ef4b1 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -61,6 +61,10 @@ struct x86_init_ops x86_init __initdata = { .banner = default_banner, }, + .mapping = { + .pagetable_reserve = native_pagetable_reserve, + }, + .paging = { .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 286d289..37b8b0f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -81,6 +81,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse, end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT); } +void __init native_pagetable_reserve(u64 start, u64 end) +{ + memblock_x86_reserve_range(start, end, "PGTABLE"); +} + struct map_range { unsigned long start; unsigned long end; @@ -272,9 +277,24 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __flush_tlb_all(); + /* + * Reserve the kernel pagetable pages we used (pgt_buf_start - + * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) + * so that they can be reused for other purposes. + * + * On native it just means calling memblock_x86_reserve_range, on Xen it + * also means marking RW the pagetable pages that we allocated before + * but that haven't been used. + * + * In fact on xen we mark RO the whole range pgt_buf_start - + * pgt_buf_top, because we have to make sure that when + * init_memory_mapping reaches the pagetable pages area, it maps + * RO all the pagetable pages, including the ones that are beyond + * pgt_buf_end at that time. + */ if (!after_bootmem && pgt_buf_end > pgt_buf_start) - memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT, - pgt_buf_end << PAGE_SHIFT, "PGTABLE"); + x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), + PFN_PHYS(pgt_buf_end)); if (!after_bootmem) early_memtest(start, end); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 9559d36..745258d 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -213,53 +213,48 @@ int early_cpu_to_node(int cpu) return per_cpu(x86_cpu_to_node_map, cpu); } -struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) +void debug_cpumask_set_cpu(int cpu, int node, bool enable) { - int node = early_cpu_to_node(cpu); struct cpumask *mask; char buf[64]; if (node == NUMA_NO_NODE) { /* early_cpu_to_node() already emits a warning and trace */ - return NULL; + return; } mask = node_to_cpumask_map[node]; if (!mask) { pr_err("node_to_cpumask_map[%i] NULL\n", node); dump_stack(); - return NULL; + return; } + if (enable) + cpumask_set_cpu(cpu, mask); + else + cpumask_clear_cpu(cpu, mask); + cpulist_scnprintf(buf, sizeof(buf), mask); printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); - return mask; + return; } # ifndef CONFIG_NUMA_EMU -static void __cpuinit numa_set_cpumask(int cpu, int enable) +static void __cpuinit numa_set_cpumask(int cpu, bool enable) { - struct cpumask *mask; - - mask = debug_cpumask_set_cpu(cpu, enable); - if (!mask) - return; - - if (enable) - cpumask_set_cpu(cpu, mask); - else - cpumask_clear_cpu(cpu, mask); + debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable); } void __cpuinit numa_add_cpu(int cpu) { - numa_set_cpumask(cpu, 1); + numa_set_cpumask(cpu, true); } void __cpuinit numa_remove_cpu(int cpu) { - numa_set_cpumask(cpu, 0); + numa_set_cpumask(cpu, false); } # endif /* !CONFIG_NUMA_EMU */ diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index e8c00cc..85b52fc 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -306,7 +306,7 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) bi->end = min(bi->end, high); /* and there's no empty block */ - if (bi->start == bi->end) { + if (bi->start >= bi->end) { numa_remove_memblk_from(i--, mi); continue; } diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index ad091e4..de84cc1 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -454,10 +454,9 @@ void __cpuinit numa_remove_cpu(int cpu) cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); } #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ -static void __cpuinit numa_set_cpumask(int cpu, int enable) +static void __cpuinit numa_set_cpumask(int cpu, bool enable) { - struct cpumask *mask; - int nid, physnid, i; + int nid, physnid; nid = early_cpu_to_node(cpu); if (nid == NUMA_NO_NODE) { @@ -467,28 +466,21 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable) physnid = emu_nid_to_phys[nid]; - for_each_online_node(i) { + for_each_online_node(nid) { if (emu_nid_to_phys[nid] != physnid) continue; - mask = debug_cpumask_set_cpu(cpu, enable); - if (!mask) - return; - - if (enable) - cpumask_set_cpu(cpu, mask); - else - cpumask_clear_cpu(cpu, mask); + debug_cpumask_set_cpu(cpu, nid, enable); } } void __cpuinit numa_add_cpu(int cpu) { - numa_set_cpumask(cpu, 1); + numa_set_cpumask(cpu, true); } void __cpuinit numa_remove_cpu(int cpu) { - numa_set_cpumask(cpu, 0); + numa_set_cpumask(cpu, false); } #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts index 2d6d226..e70be38 100644 --- a/arch/x86/platform/ce4100/falconfalls.dts +++ b/arch/x86/platform/ce4100/falconfalls.dts @@ -347,7 +347,7 @@ "pciclass0c03"; reg = <0x16800 0x0 0x0 0x0 0x0>; - interrupts = <22 3>; + interrupts = <22 1>; }; usb@d,1 { @@ -357,7 +357,7 @@ "pciclass0c03"; reg = <0x16900 0x0 0x0 0x0 0x0>; - interrupts = <22 3>; + interrupts = <22 1>; }; sata@e,0 { @@ -367,7 +367,7 @@ "pciclass0106"; reg = <0x17000 0x0 0x0 0x0 0x0>; - interrupts = <23 3>; + interrupts = <23 1>; }; flash@f,0 { diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 7cb6424..c58e0ea 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -699,16 +699,17 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long va, unsigned int cpu) { - int tcpu; - int uvhub; int locals = 0; int remotes = 0; int hubs = 0; + int tcpu; + int tpnode; struct bau_desc *bau_desc; struct cpumask *flush_mask; struct ptc_stats *stat; struct bau_control *bcp; struct bau_control *tbcp; + struct hub_and_pnode *hpp; /* kernel was booted 'nobau' */ if (nobau) @@ -750,11 +751,18 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); - /* cpu statistics */ for_each_cpu(tcpu, flush_mask) { - uvhub = uv_cpu_to_blade_id(tcpu); - bau_uvhub_set(uvhub, &bau_desc->distribution); - if (uvhub == bcp->uvhub) + /* + * The distribution vector is a bit map of pnodes, relative + * to the partition base pnode (and the partition base nasid + * in the header). + * Translate cpu to pnode and hub using an array stored + * in local memory. + */ + hpp = &bcp->socket_master->target_hub_and_pnode[tcpu]; + tpnode = hpp->pnode - bcp->partition_base_pnode; + bau_uvhub_set(tpnode, &bau_desc->distribution); + if (hpp->uvhub == bcp->uvhub) locals++; else remotes++; @@ -855,7 +863,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs) * an interrupt, but causes an error message to be returned to * the sender. */ -static void uv_enable_timeouts(void) +static void __init uv_enable_timeouts(void) { int uvhub; int nuvhubs; @@ -1326,10 +1334,10 @@ static int __init uv_ptc_init(void) } /* - * initialize the sending side's sending buffers + * Initialize the sending side's sending buffers. */ static void -uv_activation_descriptor_init(int node, int pnode) +uv_activation_descriptor_init(int node, int pnode, int base_pnode) { int i; int cpu; @@ -1352,11 +1360,11 @@ uv_activation_descriptor_init(int node, int pnode) n = pa >> uv_nshift; m = pa & uv_mmask; + /* the 14-bit pnode */ uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, (n << UV_DESC_BASE_PNODE_SHIFT | m)); - /* - * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each + * Initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each * cpu even though we only use the first one; one descriptor can * describe a broadcast to 256 uv hubs. */ @@ -1365,12 +1373,13 @@ uv_activation_descriptor_init(int node, int pnode) memset(bd2, 0, sizeof(struct bau_desc)); bd2->header.sw_ack_flag = 1; /* - * base_dest_nodeid is the nasid of the first uvhub - * in the partition. The bit map will indicate uvhub numbers, - * which are 0-N in a partition. Pnodes are unique system-wide. + * The base_dest_nasid set in the message header is the nasid + * of the first uvhub in the partition. The bit map will + * indicate destination pnode numbers relative to that base. + * They may not be consecutive if nasid striding is being used. */ - bd2->header.base_dest_nodeid = UV_PNODE_TO_NASID(uv_partition_base_pnode); - bd2->header.dest_subnodeid = 0x10; /* the LB */ + bd2->header.base_dest_nasid = UV_PNODE_TO_NASID(base_pnode); + bd2->header.dest_subnodeid = UV_LB_SUBNODEID; bd2->header.command = UV_NET_ENDPOINT_INTD; bd2->header.int_both = 1; /* @@ -1442,7 +1451,7 @@ uv_payload_queue_init(int node, int pnode) /* * Initialization of each UV hub's structures */ -static void __init uv_init_uvhub(int uvhub, int vector) +static void __init uv_init_uvhub(int uvhub, int vector, int base_pnode) { int node; int pnode; @@ -1450,11 +1459,11 @@ static void __init uv_init_uvhub(int uvhub, int vector) node = uvhub_to_first_node(uvhub); pnode = uv_blade_to_pnode(uvhub); - uv_activation_descriptor_init(node, pnode); + uv_activation_descriptor_init(node, pnode, base_pnode); uv_payload_queue_init(node, pnode); /* - * the below initialization can't be in firmware because the - * messaging IRQ will be determined by the OS + * The below initialization can't be in firmware because the + * messaging IRQ will be determined by the OS. */ apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits; uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, @@ -1491,10 +1500,11 @@ calculate_destination_timeout(void) /* * initialize the bau_control structure for each cpu */ -static int __init uv_init_per_cpu(int nuvhubs) +static int __init uv_init_per_cpu(int nuvhubs, int base_part_pnode) { int i; int cpu; + int tcpu; int pnode; int uvhub; int have_hmaster; @@ -1528,6 +1538,15 @@ static int __init uv_init_per_cpu(int nuvhubs) bcp = &per_cpu(bau_control, cpu); memset(bcp, 0, sizeof(struct bau_control)); pnode = uv_cpu_hub_info(cpu)->pnode; + if ((pnode - base_part_pnode) >= UV_DISTRIBUTION_SIZE) { + printk(KERN_EMERG + "cpu %d pnode %d-%d beyond %d; BAU disabled\n", + cpu, pnode, base_part_pnode, + UV_DISTRIBUTION_SIZE); + return 1; + } + bcp->osnode = cpu_to_node(cpu); + bcp->partition_base_pnode = uv_partition_base_pnode; uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8)); bdp = &uvhub_descs[uvhub]; @@ -1536,7 +1555,7 @@ static int __init uv_init_per_cpu(int nuvhubs) bdp->pnode = pnode; /* kludge: 'assuming' one node per socket, and assuming that disabling a socket just leaves a gap in node numbers */ - socket = (cpu_to_node(cpu) & 1); + socket = bcp->osnode & 1; bdp->socket_mask |= (1 << socket); sdp = &bdp->socket[socket]; sdp->cpu_number[sdp->num_cpus] = cpu; @@ -1585,6 +1604,20 @@ static int __init uv_init_per_cpu(int nuvhubs) nextsocket: socket++; socket_mask = (socket_mask >> 1); + /* each socket gets a local array of pnodes/hubs */ + bcp = smaster; + bcp->target_hub_and_pnode = kmalloc_node( + sizeof(struct hub_and_pnode) * + num_possible_cpus(), GFP_KERNEL, bcp->osnode); + memset(bcp->target_hub_and_pnode, 0, + sizeof(struct hub_and_pnode) * + num_possible_cpus()); + for_each_present_cpu(tcpu) { + bcp->target_hub_and_pnode[tcpu].pnode = + uv_cpu_hub_info(tcpu)->pnode; + bcp->target_hub_and_pnode[tcpu].uvhub = + uv_cpu_hub_info(tcpu)->numa_blade_id; + } } } kfree(uvhub_descs); @@ -1637,21 +1670,22 @@ static int __init uv_bau_init(void) spin_lock_init(&disable_lock); congested_cycles = microsec_2_cycles(congested_response_us); - if (uv_init_per_cpu(nuvhubs)) { - nobau = 1; - return 0; - } - uv_partition_base_pnode = 0x7fffffff; - for (uvhub = 0; uvhub < nuvhubs; uvhub++) + for (uvhub = 0; uvhub < nuvhubs; uvhub++) { if (uv_blade_nr_possible_cpus(uvhub) && (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode)) uv_partition_base_pnode = uv_blade_to_pnode(uvhub); + } + + if (uv_init_per_cpu(nuvhubs, uv_partition_base_pnode)) { + nobau = 1; + return 0; + } vector = UV_BAU_MESSAGE; for_each_possible_blade(uvhub) if (uv_blade_nr_possible_cpus(uvhub)) - uv_init_uvhub(uvhub, vector); + uv_init_uvhub(uvhub, vector, uv_partition_base_pnode); uv_enable_timeouts(); alloc_intr_gate(vector, uv_bau_message_intr1); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index a991b57..0684f3c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1275,6 +1275,20 @@ static __init void xen_pagetable_setup_start(pgd_t *base) { } +static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) +{ + /* reserve the range used */ + native_pagetable_reserve(start, end); + + /* set as RW the rest */ + printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end, + PFN_PHYS(pgt_buf_top)); + while (end < PFN_PHYS(pgt_buf_top)) { + make_lowmem_page_readwrite(__va(end)); + end += PAGE_SIZE; + } +} + static void xen_post_allocator_init(void); static __init void xen_pagetable_setup_done(pgd_t *base) @@ -1473,16 +1487,20 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) #endif } +#ifdef CONFIG_X86_32 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) { - unsigned long pfn = pte_pfn(pte); - -#ifdef CONFIG_X86_32 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ if (pte_val_ma(*ptep) & _PAGE_PRESENT) pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & pte_val_ma(pte)); -#endif + + return pte; +} +#else /* CONFIG_X86_64 */ +static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); /* * If the new pfn is within the range of the newly allocated @@ -1491,12 +1509,13 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) * it is RO. */ if (((!is_early_ioremap_ptep(ptep) && - pfn >= pgt_buf_start && pfn < pgt_buf_end)) || + pfn >= pgt_buf_start && pfn < pgt_buf_top)) || (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1))) pte = pte_wrprotect(pte); return pte; } +#endif /* CONFIG_X86_64 */ /* Init-time set_pte while constructing initial pagetables, which doesn't allow RO pagetable pages to be remapped RW */ @@ -2100,6 +2119,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { void __init xen_init_mmu_ops(void) { + x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; pv_mmu_ops = xen_mmu_ops; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index fa0269a..90bac0a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -227,7 +227,7 @@ char * __init xen_memory_setup(void) memcpy(map_raw, map, sizeof(map)); e820.nr_map = 0; - xen_extra_mem_start = mem_end; + xen_extra_mem_start = max((1ULL << 32), mem_end); for (i = 0; i < memmap.nr_entries; i++) { unsigned long long end; |