summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile3
-rw-r--r--arch/x86/kernel/acpi/boot.c7
-rw-r--r--arch/x86/kernel/acpi/cstate.c1
-rw-r--r--arch/x86/kernel/acpi/sleep.c4
-rw-r--r--arch/x86/kernel/acpi/sleep.h4
-rw-r--r--arch/x86/kernel/acpi/wakeup_32.S4
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S4
-rw-r--r--arch/x86/kernel/amd_gart_64.c11
-rw-r--r--arch/x86/kernel/apic/apic.c89
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c4
-rw-r--r--arch/x86/kernel/apic/apic_noop.c2
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c15
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c2
-rw-r--r--arch/x86/kernel/apic/es7000_32.c4
-rw-r--r--arch/x86/kernel/apic/io_apic.c504
-rw-r--r--arch/x86/kernel/apic/numaq_32.c2
-rw-r--r--arch/x86/kernel/apic/probe_32.c2
-rw-r--r--arch/x86/kernel/apic/summit_32.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c8
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c7
-rw-r--r--arch/x86/kernel/apm_32.c14
-rw-r--r--arch/x86/kernel/asm-offsets_64.c6
-rw-r--r--arch/x86/kernel/check.c20
-rw-r--r--arch/x86/kernel/cpu/Makefile1
-rw-r--r--arch/x86/kernel/cpu/amd.c32
-rw-r--r--arch/x86/kernel/cpu/common.c29
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c12
-rw-r--r--arch/x86/kernel/cpu/match.c91
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c26
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c207
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c9
-rw-r--r--arch/x86/kernel/cpu/mcheck/p5.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/winchip.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c10
-rw-r--r--arch/x86/kernel/cpu/perf_event.c191
-rw-r--r--arch/x86/kernel/cpu/perf_event.h51
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c32
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_ibs.c570
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c181
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c28
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c526
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c19
-rw-r--r--arch/x86/kernel/cpu/perf_event_p6.c19
-rw-r--r--arch/x86/kernel/cpu/scattered.c1
-rw-r--r--arch/x86/kernel/cpuid.c1
-rw-r--r--arch/x86/kernel/crash_dump_32.c6
-rw-r--r--arch/x86/kernel/devicetree.c101
-rw-r--r--arch/x86/kernel/dumpstack.c32
-rw-r--r--arch/x86/kernel/dumpstack_32.c4
-rw-r--r--arch/x86/kernel/dumpstack_64.c2
-rw-r--r--arch/x86/kernel/entry_32.S17
-rw-r--r--arch/x86/kernel/entry_64.S117
-rw-r--r--arch/x86/kernel/ftrace.c500
-rw-r--r--arch/x86/kernel/i387.c84
-rw-r--r--arch/x86/kernel/i8259.c1
-rw-r--r--arch/x86/kernel/init_task.c42
-rw-r--r--arch/x86/kernel/irq.c7
-rw-r--r--arch/x86/kernel/irq_32.c19
-rw-r--r--arch/x86/kernel/irqinit.c9
-rw-r--r--arch/x86/kernel/kdebugfs.c9
-rw-r--r--arch/x86/kernel/kgdb.c67
-rw-r--r--arch/x86/kernel/kprobes-common.h102
-rw-r--r--arch/x86/kernel/kprobes-opt.c512
-rw-r--r--arch/x86/kernel/kprobes.c668
-rw-r--r--arch/x86/kernel/kvm.c17
-rw-r--r--arch/x86/kernel/kvmclock.c15
-rw-r--r--arch/x86/kernel/ldt.c1
-rw-r--r--arch/x86/kernel/machine_kexec_32.c1
-rw-r--r--arch/x86/kernel/mca_32.c1
-rw-r--r--arch/x86/kernel/microcode_amd.c12
-rw-r--r--arch/x86/kernel/microcode_core.c34
-rw-r--r--arch/x86/kernel/microcode_intel.c14
-rw-r--r--arch/x86/kernel/module.c1
-rw-r--r--arch/x86/kernel/msr.c1
-rw-r--r--arch/x86/kernel/nmi.c95
-rw-r--r--arch/x86/kernel/nmi_selftest.c50
-rw-r--r--arch/x86/kernel/paravirt.c18
-rw-r--r--arch/x86/kernel/pci-calgary_64.c18
-rw-r--r--arch/x86/kernel/pci-dma.c8
-rw-r--r--arch/x86/kernel/pci-nommu.c6
-rw-r--r--arch/x86/kernel/pci-swiotlb.c17
-rw-r--r--arch/x86/kernel/probe_roms.c1
-rw-r--r--arch/x86/kernel/process.c198
-rw-r--r--arch/x86/kernel/process_32.c65
-rw-r--r--arch/x86/kernel/process_64.c151
-rw-r--r--arch/x86/kernel/ptrace.c111
-rw-r--r--arch/x86/kernel/setup.c32
-rw-r--r--arch/x86/kernel/setup_percpu.c14
-rw-r--r--arch/x86/kernel/signal.c141
-rw-r--r--arch/x86/kernel/smpboot.c223
-rw-r--r--arch/x86/kernel/sys_x86_64.c40
-rw-r--r--arch/x86/kernel/syscall_64.c8
-rw-r--r--arch/x86/kernel/tboot.c9
-rw-r--r--arch/x86/kernel/tce_64.c1
-rw-r--r--arch/x86/kernel/time.c3
-rw-r--r--arch/x86/kernel/tls.c5
-rw-r--r--arch/x86/kernel/traps.c143
-rw-r--r--arch/x86/kernel/tsc.c17
-rw-r--r--arch/x86/kernel/tsc_sync.c29
-rw-r--r--arch/x86/kernel/vm86_32.c4
-rw-r--r--arch/x86/kernel/vsyscall_64.c33
-rw-r--r--arch/x86/kernel/x86_init.c14
-rw-r--r--arch/x86/kernel/xsave.c1
106 files changed, 4371 insertions, 2241 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5369059..56ebd1f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -2,7 +2,7 @@
# Makefile for the linux kernel.
#
-extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds
+extra-y := head_$(BITS).o head$(BITS).o head.o vmlinux.lds
CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
@@ -69,6 +69,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_OPTPROBES) += kprobes-opt.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
obj-$(CONFIG_KGDB) += kgdb.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index ce664f3..7c439fe 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -239,7 +239,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
* to not preallocating memory for all NR_CPUS
* when we use CPU hotplug.
*/
- if (!cpu_has_x2apic && (apic_id >= 0xff) && enabled)
+ if (!apic->apic_id_valid(apic_id) && enabled)
printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
else
acpi_register_lapic(apic_id, enabled);
@@ -593,7 +593,7 @@ void __init acpi_set_irq_model_ioapic(void)
#ifdef CONFIG_ACPI_HOTPLUG_CPU
#include <acpi/processor.h>
-static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+static void __cpuinit acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
{
#ifdef CONFIG_ACPI_NUMA
int nid;
@@ -642,6 +642,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
kfree(buffer.pointer);
buffer.length = ACPI_ALLOCATE_BUFFER;
buffer.pointer = NULL;
+ lapic = NULL;
if (!alloc_cpumask_var(&tmp_map, GFP_KERNEL))
goto out;
@@ -650,7 +651,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
goto free_tmp_map;
cpumask_copy(tmp_map, cpu_present_mask);
- acpi_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED);
+ acpi_register_lapic(physid, ACPI_MADT_ENABLED);
/*
* If mp_register_lapic successfully generates a new logical cpu
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index f50e7fb..d2b7f27 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -14,6 +14,7 @@
#include <acpi/processor.h>
#include <asm/acpi.h>
#include <asm/mwait.h>
+#include <asm/special_insns.h>
/*
* Initialize bm_flags based on the CPU cache properties
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 103b6ab..146a49c 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -24,6 +24,10 @@ unsigned long acpi_realmode_flags;
static char temp_stack[4096];
#endif
+asmlinkage void acpi_enter_s3(void)
+{
+ acpi_enter_sleep_state(3, wake_sleep_flags);
+}
/**
* acpi_suspend_lowlevel - save kernel state
*
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index 416d4be..d68677a 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -3,12 +3,16 @@
*/
#include <asm/trampoline.h>
+#include <linux/linkage.h>
extern unsigned long saved_video_mode;
extern long saved_magic;
extern int wakeup_pmode_return;
+extern u8 wake_sleep_flags;
+extern asmlinkage void acpi_enter_s3(void);
+
extern unsigned long acpi_copy_wakeup_routine(unsigned long);
extern void wakeup_long64(void);
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index 13ab7205..7261083 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -74,9 +74,7 @@ restore_registers:
ENTRY(do_suspend_lowlevel)
call save_processor_state
call save_registers
- pushl $3
- call acpi_enter_sleep_state
- addl $4, %esp
+ call acpi_enter_s3
# In case of S3 failure, we'll emerge here. Jump
# to ret_point to recover
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 8ea5164..014d1d2 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -71,9 +71,7 @@ ENTRY(do_suspend_lowlevel)
movq %rsi, saved_rsi
addq $8, %rsp
- movl $3, %edi
- xorl %eax, %eax
- call acpi_enter_sleep_state
+ call acpi_enter_s3
/* in case something went wrong, restore the machine status and go on */
jmp resume_point
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index b1e7c7f..e663112 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -477,7 +477,7 @@ error:
/* allocate and map a coherent mapping */
static void *
gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
- gfp_t flag)
+ gfp_t flag, struct dma_attrs *attrs)
{
dma_addr_t paddr;
unsigned long align_mask;
@@ -500,7 +500,8 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
}
__free_pages(page, get_order(size));
} else
- return dma_generic_alloc_coherent(dev, size, dma_addr, flag);
+ return dma_generic_alloc_coherent(dev, size, dma_addr, flag,
+ attrs);
return NULL;
}
@@ -508,7 +509,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
/* free a coherent mapping */
static void
gart_free_coherent(struct device *dev, size_t size, void *vaddr,
- dma_addr_t dma_addr)
+ dma_addr_t dma_addr, struct dma_attrs *attrs)
{
gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL);
free_pages((unsigned long)vaddr, get_order(size));
@@ -700,8 +701,8 @@ static struct dma_map_ops gart_dma_ops = {
.unmap_sg = gart_unmap_sg,
.map_page = gart_map_page,
.unmap_page = gart_unmap_page,
- .alloc_coherent = gart_alloc_coherent,
- .free_coherent = gart_free_coherent,
+ .alloc = gart_alloc_coherent,
+ .free = gart_free_coherent,
.mapping_error = gart_mapping_error,
};
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 2eec05b..39a222e 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -35,6 +35,7 @@
#include <linux/smp.h>
#include <linux/mm.h>
+#include <asm/irq_remapping.h>
#include <asm/perf_event.h>
#include <asm/x86_init.h>
#include <asm/pgalloc.h>
@@ -383,20 +384,25 @@ static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new)
static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
{
- unsigned int rsvd; /* 0: uninitialized */
+ unsigned int rsvd, vector;
if (offset >= APIC_EILVT_NR_MAX)
return ~0;
- rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED;
+ rsvd = atomic_read(&eilvt_offsets[offset]);
do {
- if (rsvd &&
- !eilvt_entry_is_changeable(rsvd, new))
+ vector = rsvd & ~APIC_EILVT_MASKED; /* 0: unassigned */
+ if (vector && !eilvt_entry_is_changeable(vector, new))
/* may not change if vectors are different */
return rsvd;
rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new);
} while (rsvd != new);
+ rsvd &= ~APIC_EILVT_MASKED;
+ if (rsvd && rsvd != vector)
+ pr_info("LVT offset %d assigned for vector 0x%02x\n",
+ offset, rsvd);
+
return new;
}
@@ -1320,11 +1326,13 @@ void __cpuinit setup_local_APIC(void)
acked);
break;
}
- if (cpu_has_tsc) {
- rdtscll(ntsc);
- max_loops = (cpu_khz << 10) - (ntsc - tsc);
- } else
- max_loops--;
+ if (queued) {
+ if (cpu_has_tsc) {
+ rdtscll(ntsc);
+ max_loops = (cpu_khz << 10) - (ntsc - tsc);
+ } else
+ max_loops--;
+ }
} while (queued && max_loops > 0);
WARN_ON(max_loops <= 0);
@@ -1436,8 +1444,8 @@ void __init bsp_end_local_APIC_setup(void)
* Now that local APIC setup is completed for BP, configure the fault
* handling for interrupt remapping.
*/
- if (intr_remapping_enabled)
- enable_drhd_fault_handling();
+ if (irq_remapping_enabled)
+ irq_remap_enable_fault_handling();
}
@@ -1512,7 +1520,7 @@ void enable_x2apic(void)
int __init enable_IR(void)
{
#ifdef CONFIG_IRQ_REMAP
- if (!intr_remapping_supported()) {
+ if (!irq_remapping_supported()) {
pr_debug("intr-remapping not supported\n");
return -1;
}
@@ -1523,7 +1531,7 @@ int __init enable_IR(void)
return -1;
}
- return enable_intr_remapping();
+ return irq_remapping_enable();
#endif
return -1;
}
@@ -1532,10 +1540,13 @@ void __init enable_IR_x2apic(void)
{
unsigned long flags;
int ret, x2apic_enabled = 0;
- int dmar_table_init_ret;
+ int hardware_init_ret;
- dmar_table_init_ret = dmar_table_init();
- if (dmar_table_init_ret && !x2apic_supported())
+ /* Make sure irq_remap_ops are initialized */
+ setup_irq_remapping_ops();
+
+ hardware_init_ret = irq_remapping_prepare();
+ if (hardware_init_ret && !x2apic_supported())
return;
ret = save_ioapic_entries();
@@ -1551,7 +1562,7 @@ void __init enable_IR_x2apic(void)
if (x2apic_preenabled && nox2apic)
disable_x2apic();
- if (dmar_table_init_ret)
+ if (hardware_init_ret)
ret = -1;
else
ret = enable_IR();
@@ -1632,9 +1643,11 @@ static int __init apic_verify(void)
mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
/* The BIOS may have set up the APIC at some other address */
- rdmsr(MSR_IA32_APICBASE, l, h);
- if (l & MSR_IA32_APICBASE_ENABLE)
- mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
+ if (boot_cpu_data.x86 >= 6) {
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (l & MSR_IA32_APICBASE_ENABLE)
+ mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
+ }
pr_info("Found and enabled local APIC!\n");
return 0;
@@ -1652,13 +1665,15 @@ int __init apic_force_enable(unsigned long addr)
* MSR. This can only be done in software for Intel P6 or later
* and AMD K7 (Model > 1) or later.
*/
- rdmsr(MSR_IA32_APICBASE, l, h);
- if (!(l & MSR_IA32_APICBASE_ENABLE)) {
- pr_info("Local APIC disabled by BIOS -- reenabling.\n");
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | addr;
- wrmsr(MSR_IA32_APICBASE, l, h);
- enabled_via_apicbase = 1;
+ if (boot_cpu_data.x86 >= 6) {
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (!(l & MSR_IA32_APICBASE_ENABLE)) {
+ pr_info("Local APIC disabled by BIOS -- reenabling.\n");
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | addr;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ enabled_via_apicbase = 1;
+ }
}
return apic_verify();
}
@@ -2167,8 +2182,8 @@ static int lapic_suspend(void)
local_irq_save(flags);
disable_local_APIC();
- if (intr_remapping_enabled)
- disable_intr_remapping();
+ if (irq_remapping_enabled)
+ irq_remapping_disable();
local_irq_restore(flags);
return 0;
@@ -2184,7 +2199,7 @@ static void lapic_resume(void)
return;
local_irq_save(flags);
- if (intr_remapping_enabled) {
+ if (irq_remapping_enabled) {
/*
* IO-APIC and PIC have their own resume routines.
* We just mask them here to make sure the interrupt
@@ -2204,10 +2219,12 @@ static void lapic_resume(void)
* FIXME! This will be wrong if we ever support suspend on
* SMP! We'll need to do this as part of the CPU restore!
*/
- rdmsr(MSR_IA32_APICBASE, l, h);
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
- wrmsr(MSR_IA32_APICBASE, l, h);
+ if (boot_cpu_data.x86 >= 6) {
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ }
}
maxlvt = lapic_get_maxlvt();
@@ -2234,8 +2251,8 @@ static void lapic_resume(void)
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
- if (intr_remapping_enabled)
- reenable_intr_remapping(x2apic_mode);
+ if (irq_remapping_enabled)
+ irq_remapping_reenable(x2apic_mode);
local_irq_restore(flags);
}
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 8c3cdde..0e881c4 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -180,6 +180,7 @@ static struct apic apic_flat = {
.name = "flat",
.probe = flat_probe,
.acpi_madt_oem_check = flat_acpi_madt_oem_check,
+ .apic_id_valid = default_apic_id_valid,
.apic_id_registered = flat_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio,
@@ -226,6 +227,7 @@ static struct apic apic_flat = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
@@ -337,6 +339,7 @@ static struct apic apic_physflat = {
.name = "physical flat",
.probe = physflat_probe,
.acpi_madt_oem_check = physflat_acpi_madt_oem_check,
+ .apic_id_valid = default_apic_id_valid,
.apic_id_registered = flat_apic_id_registered,
.irq_delivery_mode = dest_Fixed,
@@ -384,6 +387,7 @@ static struct apic apic_physflat = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 775b82b..a6e4c6e 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -124,6 +124,7 @@ struct apic apic_noop = {
.probe = noop_probe,
.acpi_madt_oem_check = NULL,
+ .apic_id_valid = default_apic_id_valid,
.apic_id_registered = noop_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio,
@@ -180,6 +181,7 @@ struct apic apic_noop = {
.read = noop_apic_read,
.write = noop_apic_write,
+ .eoi_write = noop_apic_write,
.icr_read = noop_apic_icr_read,
.icr_write = noop_apic_icr_write,
.wait_icr_idle = noop_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 09d3d8c..6ec6d5d 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -56,6 +56,12 @@ static unsigned int read_xapic_id(void)
return get_apic_id(apic_read(APIC_ID));
}
+static int numachip_apic_id_valid(int apicid)
+{
+ /* Trust what bootloader passes in MADT */
+ return 1;
+}
+
static int numachip_apic_id_registered(void)
{
return physid_isset(read_xapic_id(), phys_cpu_present_map);
@@ -201,8 +207,11 @@ static void __init map_csrs(void)
static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
{
- c->phys_proc_id = node;
- per_cpu(cpu_llc_id, smp_processor_id()) = node;
+
+ if (c->phys_proc_id != node) {
+ c->phys_proc_id = node;
+ per_cpu(cpu_llc_id, smp_processor_id()) = node;
+ }
}
static int __init numachip_system_init(void)
@@ -238,6 +247,7 @@ static struct apic apic_numachip __refconst = {
.name = "NumaConnect system",
.probe = numachip_probe,
.acpi_madt_oem_check = numachip_acpi_madt_oem_check,
+ .apic_id_valid = numachip_apic_id_valid,
.apic_id_registered = numachip_apic_id_registered,
.irq_delivery_mode = dest_Fixed,
@@ -285,6 +295,7 @@ static struct apic apic_numachip __refconst = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 521bead..31fbdbf 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -198,6 +198,7 @@ static struct apic apic_bigsmp = {
.name = "bigsmp",
.probe = probe_bigsmp,
.acpi_madt_oem_check = NULL,
+ .apic_id_valid = default_apic_id_valid,
.apic_id_registered = bigsmp_apic_id_registered,
.irq_delivery_mode = dest_Fixed,
@@ -247,6 +248,7 @@ static struct apic apic_bigsmp = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 5d513bc..db4ab1b 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -625,6 +625,7 @@ static struct apic __refdata apic_es7000_cluster = {
.name = "es7000",
.probe = probe_es7000,
.acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster,
+ .apic_id_valid = default_apic_id_valid,
.apic_id_registered = es7000_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio,
@@ -677,6 +678,7 @@ static struct apic __refdata apic_es7000_cluster = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
@@ -690,6 +692,7 @@ static struct apic __refdata apic_es7000 = {
.name = "es7000",
.probe = probe_es7000,
.acpi_madt_oem_check = es7000_acpi_madt_oem_check,
+ .apic_id_valid = default_apic_id_valid,
.apic_id_registered = es7000_apic_id_registered,
.irq_delivery_mode = dest_Fixed,
@@ -740,6 +743,7 @@ static struct apic __refdata apic_es7000 = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index fb07275..ffdc152 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -64,9 +64,26 @@
#include <asm/apic.h>
#define __apicdebuginit(type) static type __init
+
#define for_each_irq_pin(entry, head) \
for (entry = head; entry; entry = entry->next)
+#ifdef CONFIG_IRQ_REMAP
+static void irq_remap_modify_chip_defaults(struct irq_chip *chip);
+static inline bool irq_remapped(struct irq_cfg *cfg)
+{
+ return cfg->irq_2_iommu.iommu != NULL;
+}
+#else
+static inline bool irq_remapped(struct irq_cfg *cfg)
+{
+ return false;
+}
+static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip)
+{
+}
+#endif
+
/*
* Is the SiS APIC rmw bug present ?
* -1 = don't know, 0 = no, 1 = yes
@@ -294,6 +311,7 @@ static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
irq_free_desc(at);
}
+
struct io_apic {
unsigned int index;
unsigned int unused[3];
@@ -314,16 +332,17 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
writel(vector, &io_apic->eoi);
}
-static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+unsigned int native_io_apic_read(unsigned int apic, unsigned int reg)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
writel(reg, &io_apic->index);
return readl(&io_apic->data);
}
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
+
writel(reg, &io_apic->index);
writel(value, &io_apic->data);
}
@@ -334,7 +353,7 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i
*
* Older SiS APIC requires we rewrite the index register
*/
-static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
@@ -343,29 +362,6 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned
writel(value, &io_apic->data);
}
-static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
-{
- struct irq_pin_list *entry;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- for_each_irq_pin(entry, cfg->irq_2_pin) {
- unsigned int reg;
- int pin;
-
- pin = entry->pin;
- reg = io_apic_read(entry->apic, 0x10 + pin*2);
- /* Is the remote IRR bit set? */
- if (reg & IO_APIC_REDIR_REMOTE_IRR) {
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- return true;
- }
- }
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return false;
-}
-
union entry_union {
struct { u32 w1, w2; };
struct IO_APIC_route_entry entry;
@@ -377,6 +373,7 @@ static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin)
eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+
return eu.entry;
}
@@ -384,9 +381,11 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
{
union entry_union eu;
unsigned long flags;
+
raw_spin_lock_irqsave(&ioapic_lock, flags);
eu.entry = __ioapic_read_entry(apic, pin);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
return eu.entry;
}
@@ -396,8 +395,7 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
* the interrupt, and we need to make sure the entry is fully populated
* before that happens.
*/
-static void
-__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
union entry_union eu = {{0, 0}};
@@ -409,6 +407,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
unsigned long flags;
+
raw_spin_lock_irqsave(&ioapic_lock, flags);
__ioapic_write_entry(apic, pin, e);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -435,8 +434,7 @@ static void ioapic_mask_entry(int apic, int pin)
* shared ISA-space IRQs, so we have to support them. We are super
* fast in the common case, and fast for shared ISA-space IRQs.
*/
-static int
-__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
{
struct irq_pin_list **last, *entry;
@@ -521,6 +519,7 @@ static void io_apic_sync(struct irq_pin_list *entry)
* a dummy read from the IO-APIC
*/
struct io_apic __iomem *io_apic;
+
io_apic = io_apic_base(entry->apic);
readl(&io_apic->data);
}
@@ -1322,77 +1321,13 @@ static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
fasteoi ? "fasteoi" : "edge");
}
-
-static int setup_ir_ioapic_entry(int irq,
- struct IR_IO_APIC_route_entry *entry,
- unsigned int destination, int vector,
- struct io_apic_irq_attr *attr)
-{
- int index;
- struct irte irte;
- int ioapic_id = mpc_ioapic_id(attr->ioapic);
- struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id);
-
- if (!iommu) {
- pr_warn("No mapping iommu for ioapic %d\n", ioapic_id);
- return -ENODEV;
- }
-
- index = alloc_irte(iommu, irq, 1);
- if (index < 0) {
- pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id);
- return -ENOMEM;
- }
-
- prepare_irte(&irte, vector, destination);
-
- /* Set source-id of interrupt request */
- set_ioapic_sid(&irte, ioapic_id);
-
- modify_irte(irq, &irte);
-
- apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
- "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
- "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
- "Avail:%X Vector:%02X Dest:%08X "
- "SID:%04X SQ:%X SVT:%X)\n",
- attr->ioapic, irte.present, irte.fpd, irte.dst_mode,
- irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
- irte.avail, irte.vector, irte.dest_id,
- irte.sid, irte.sq, irte.svt);
-
- memset(entry, 0, sizeof(*entry));
-
- entry->index2 = (index >> 15) & 0x1;
- entry->zero = 0;
- entry->format = 1;
- entry->index = (index & 0x7fff);
- /*
- * IO-APIC RTE will be configured with virtual vector.
- * irq handler will do the explicit EOI to the io-apic.
- */
- entry->vector = attr->ioapic_pin;
- entry->mask = 0; /* enable IRQ */
- entry->trigger = attr->trigger;
- entry->polarity = attr->polarity;
-
- /* Mask level triggered irqs.
- * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
- */
- if (attr->trigger)
- entry->mask = 1;
-
- return 0;
-}
-
static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
unsigned int destination, int vector,
struct io_apic_irq_attr *attr)
{
- if (intr_remapping_enabled)
- return setup_ir_ioapic_entry(irq,
- (struct IR_IO_APIC_route_entry *)entry,
- destination, vector, attr);
+ if (irq_remapping_enabled)
+ return setup_ioapic_remapped_entry(irq, entry, destination,
+ vector, attr);
memset(entry, 0, sizeof(*entry));
@@ -1549,7 +1484,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
{
struct IO_APIC_route_entry entry;
- if (intr_remapping_enabled)
+ if (irq_remapping_enabled)
return;
memset(&entry, 0, sizeof(entry));
@@ -1635,7 +1570,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
printk(KERN_DEBUG ".... IRQ redirection table:\n");
- if (intr_remapping_enabled) {
+ if (irq_remapping_enabled) {
printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR"
" Pol Stat Indx2 Zero Vect:\n");
} else {
@@ -1644,7 +1579,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
}
for (i = 0; i <= reg_01.bits.entries; i++) {
- if (intr_remapping_enabled) {
+ if (irq_remapping_enabled) {
struct IO_APIC_route_entry entry;
struct IR_IO_APIC_route_entry *ir_entry;
@@ -2011,7 +1946,7 @@ void disable_IO_APIC(void)
* IOAPIC RTE as well as interrupt-remapping table entry).
* As this gets called during crash dump, keep this simple for now.
*/
- if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
+ if (ioapic_i8259.pin != -1 && !irq_remapping_enabled) {
struct IO_APIC_route_entry entry;
memset(&entry, 0, sizeof(entry));
@@ -2035,7 +1970,7 @@ void disable_IO_APIC(void)
* Use virtual wire A mode when interrupt remapping is enabled.
*/
if (cpu_has_apic || apic_from_smp_config())
- disconnect_bsp_APIC(!intr_remapping_enabled &&
+ disconnect_bsp_APIC(!irq_remapping_enabled &&
ioapic_i8259.pin != -1);
}
@@ -2351,71 +2286,6 @@ ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
return ret;
}
-#ifdef CONFIG_IRQ_REMAP
-
-/*
- * Migrate the IO-APIC irq in the presence of intr-remapping.
- *
- * For both level and edge triggered, irq migration is a simple atomic
- * update(of vector and cpu destination) of IRTE and flush the hardware cache.
- *
- * For level triggered, we eliminate the io-apic RTE modification (with the
- * updated vector information), by using a virtual vector (io-apic pin number).
- * Real vector that is used for interrupting cpu will be coming from
- * the interrupt-remapping table entry.
- *
- * As the migration is a simple atomic update of IRTE, the same mechanism
- * is used to migrate MSI irq's in the presence of interrupt-remapping.
- */
-static int
-ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
- bool force)
-{
- struct irq_cfg *cfg = data->chip_data;
- unsigned int dest, irq = data->irq;
- struct irte irte;
-
- if (!cpumask_intersects(mask, cpu_online_mask))
- return -EINVAL;
-
- if (get_irte(irq, &irte))
- return -EBUSY;
-
- if (assign_irq_vector(irq, cfg, mask))
- return -EBUSY;
-
- dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
-
- irte.vector = cfg->vector;
- irte.dest_id = IRTE_DEST(dest);
-
- /*
- * Atomically updates the IRTE with the new destination, vector
- * and flushes the interrupt entry cache.
- */
- modify_irte(irq, &irte);
-
- /*
- * After this point, all the interrupts will start arriving
- * at the new destination. So, time to cleanup the previous
- * vector allocation.
- */
- if (cfg->move_in_progress)
- send_cleanup_vector(cfg);
-
- cpumask_copy(data->affinity, mask);
- return 0;
-}
-
-#else
-static inline int
-ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
- bool force)
-{
- return 0;
-}
-#endif
-
asmlinkage void smp_irq_move_cleanup_interrupt(void)
{
unsigned vector, me;
@@ -2512,21 +2382,96 @@ static void ack_apic_edge(struct irq_data *data)
atomic_t irq_mis_count;
-static void ack_apic_level(struct irq_data *data)
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
{
- struct irq_cfg *cfg = data->chip_data;
- int i, do_unmask_irq = 0, irq = data->irq;
- unsigned long v;
+ struct irq_pin_list *entry;
+ unsigned long flags;
- irq_complete_move(cfg);
-#ifdef CONFIG_GENERIC_PENDING_IRQ
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ for_each_irq_pin(entry, cfg->irq_2_pin) {
+ unsigned int reg;
+ int pin;
+
+ pin = entry->pin;
+ reg = io_apic_read(entry->apic, 0x10 + pin*2);
+ /* Is the remote IRR bit set? */
+ if (reg & IO_APIC_REDIR_REMOTE_IRR) {
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ return true;
+ }
+ }
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return false;
+}
+
+static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+{
/* If we are moving the irq we need to mask it */
if (unlikely(irqd_is_setaffinity_pending(data))) {
- do_unmask_irq = 1;
mask_ioapic(cfg);
+ return true;
}
+ return false;
+}
+
+static inline void ioapic_irqd_unmask(struct irq_data *data,
+ struct irq_cfg *cfg, bool masked)
+{
+ if (unlikely(masked)) {
+ /* Only migrate the irq if the ack has been received.
+ *
+ * On rare occasions the broadcast level triggered ack gets
+ * delayed going to ioapics, and if we reprogram the
+ * vector while Remote IRR is still set the irq will never
+ * fire again.
+ *
+ * To prevent this scenario we read the Remote IRR bit
+ * of the ioapic. This has two effects.
+ * - On any sane system the read of the ioapic will
+ * flush writes (and acks) going to the ioapic from
+ * this cpu.
+ * - We get to see if the ACK has actually been delivered.
+ *
+ * Based on failed experiments of reprogramming the
+ * ioapic entry from outside of irq context starting
+ * with masking the ioapic entry and then polling until
+ * Remote IRR was clear before reprogramming the
+ * ioapic I don't trust the Remote IRR bit to be
+ * completey accurate.
+ *
+ * However there appears to be no other way to plug
+ * this race, so if the Remote IRR bit is not
+ * accurate and is causing problems then it is a hardware bug
+ * and you can go talk to the chipset vendor about it.
+ */
+ if (!io_apic_level_ack_pending(cfg))
+ irq_move_masked_irq(data);
+ unmask_ioapic(cfg);
+ }
+}
+#else
+static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+{
+ return false;
+}
+static inline void ioapic_irqd_unmask(struct irq_data *data,
+ struct irq_cfg *cfg, bool masked)
+{
+}
#endif
+static void ack_apic_level(struct irq_data *data)
+{
+ struct irq_cfg *cfg = data->chip_data;
+ int i, irq = data->irq;
+ unsigned long v;
+ bool masked;
+
+ irq_complete_move(cfg);
+ masked = ioapic_irqd_mask(data, cfg);
+
/*
* It appears there is an erratum which affects at least version 0x11
* of I/O APIC (that's the 82093AA and cores integrated into various
@@ -2581,38 +2526,7 @@ static void ack_apic_level(struct irq_data *data)
eoi_ioapic_irq(irq, cfg);
}
- /* Now we can move and renable the irq */
- if (unlikely(do_unmask_irq)) {
- /* Only migrate the irq if the ack has been received.
- *
- * On rare occasions the broadcast level triggered ack gets
- * delayed going to ioapics, and if we reprogram the
- * vector while Remote IRR is still set the irq will never
- * fire again.
- *
- * To prevent this scenario we read the Remote IRR bit
- * of the ioapic. This has two effects.
- * - On any sane system the read of the ioapic will
- * flush writes (and acks) going to the ioapic from
- * this cpu.
- * - We get to see if the ACK has actually been delivered.
- *
- * Based on failed experiments of reprogramming the
- * ioapic entry from outside of irq context starting
- * with masking the ioapic entry and then polling until
- * Remote IRR was clear before reprogramming the
- * ioapic I don't trust the Remote IRR bit to be
- * completey accurate.
- *
- * However there appears to be no other way to plug
- * this race, so if the Remote IRR bit is not
- * accurate and is causing problems then it is a hardware bug
- * and you can go talk to the chipset vendor about it.
- */
- if (!io_apic_level_ack_pending(cfg))
- irq_move_masked_irq(data);
- unmask_ioapic(cfg);
- }
+ ioapic_irqd_unmask(data, cfg, masked);
}
#ifdef CONFIG_IRQ_REMAP
@@ -2639,7 +2553,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip)
chip->irq_eoi = ir_ack_apic_level;
#ifdef CONFIG_SMP
- chip->irq_set_affinity = ir_ioapic_set_affinity;
+ chip->irq_set_affinity = set_remapped_irq_affinity;
#endif
}
#endif /* CONFIG_IRQ_REMAP */
@@ -2852,7 +2766,7 @@ static inline void __init check_timer(void)
* 8259A.
*/
if (pin1 == -1) {
- if (intr_remapping_enabled)
+ if (irq_remapping_enabled)
panic("BIOS bug: timer not connected to IO-APIC");
pin1 = pin2;
apic1 = apic2;
@@ -2885,7 +2799,7 @@ static inline void __init check_timer(void)
clear_IO_APIC_pin(0, pin1);
goto out;
}
- if (intr_remapping_enabled)
+ if (irq_remapping_enabled)
panic("timer doesn't work through Interrupt-remapped IO-APIC");
local_irq_disable();
clear_IO_APIC_pin(apic1, pin1);
@@ -3109,7 +3023,7 @@ void destroy_irq(unsigned int irq)
irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
if (irq_remapped(cfg))
- free_irte(irq);
+ free_remapped_irq(irq);
raw_spin_lock_irqsave(&vector_lock, flags);
__clear_irq_vector(irq, cfg);
raw_spin_unlock_irqrestore(&vector_lock, flags);
@@ -3138,54 +3052,34 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
if (irq_remapped(cfg)) {
- struct irte irte;
- int ir_index;
- u16 sub_handle;
-
- ir_index = map_irq_to_irte_handle(irq, &sub_handle);
- BUG_ON(ir_index == -1);
-
- prepare_irte(&irte, cfg->vector, dest);
-
- /* Set source-id of interrupt request */
- if (pdev)
- set_msi_sid(&irte, pdev);
- else
- set_hpet_sid(&irte, hpet_id);
-
- modify_irte(irq, &irte);
+ compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id);
+ return err;
+ }
+ if (x2apic_enabled())
+ msg->address_hi = MSI_ADDR_BASE_HI |
+ MSI_ADDR_EXT_DEST_ID(dest);
+ else
msg->address_hi = MSI_ADDR_BASE_HI;
- msg->data = sub_handle;
- msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
- MSI_ADDR_IR_SHV |
- MSI_ADDR_IR_INDEX1(ir_index) |
- MSI_ADDR_IR_INDEX2(ir_index);
- } else {
- if (x2apic_enabled())
- msg->address_hi = MSI_ADDR_BASE_HI |
- MSI_ADDR_EXT_DEST_ID(dest);
- else
- msg->address_hi = MSI_ADDR_BASE_HI;
- msg->address_lo =
- MSI_ADDR_BASE_LO |
- ((apic->irq_dest_mode == 0) ?
- MSI_ADDR_DEST_MODE_PHYSICAL:
- MSI_ADDR_DEST_MODE_LOGICAL) |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- MSI_ADDR_REDIRECTION_CPU:
- MSI_ADDR_REDIRECTION_LOWPRI) |
- MSI_ADDR_DEST_ID(dest);
+ msg->address_lo =
+ MSI_ADDR_BASE_LO |
+ ((apic->irq_dest_mode == 0) ?
+ MSI_ADDR_DEST_MODE_PHYSICAL:
+ MSI_ADDR_DEST_MODE_LOGICAL) |
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
+ MSI_ADDR_REDIRECTION_CPU:
+ MSI_ADDR_REDIRECTION_LOWPRI) |
+ MSI_ADDR_DEST_ID(dest);
+
+ msg->data =
+ MSI_DATA_TRIGGER_EDGE |
+ MSI_DATA_LEVEL_ASSERT |
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
+ MSI_DATA_DELIVERY_FIXED:
+ MSI_DATA_DELIVERY_LOWPRI) |
+ MSI_DATA_VECTOR(cfg->vector);
- msg->data =
- MSI_DATA_TRIGGER_EDGE |
- MSI_DATA_LEVEL_ASSERT |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- MSI_DATA_DELIVERY_FIXED:
- MSI_DATA_DELIVERY_LOWPRI) |
- MSI_DATA_VECTOR(cfg->vector);
- }
return err;
}
@@ -3228,33 +3122,6 @@ static struct irq_chip msi_chip = {
.irq_retrigger = ioapic_retrigger_irq,
};
-/*
- * Map the PCI dev to the corresponding remapping hardware unit
- * and allocate 'nvec' consecutive interrupt-remapping table entries
- * in it.
- */
-static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
-{
- struct intel_iommu *iommu;
- int index;
-
- iommu = map_dev_to_ir(dev);
- if (!iommu) {
- printk(KERN_ERR
- "Unable to map PCI %s to iommu\n", pci_name(dev));
- return -ENOENT;
- }
-
- index = alloc_irte(iommu, irq, nvec);
- if (index < 0) {
- printk(KERN_ERR
- "Unable to allocate %d IRTE for PCI %s\n", nvec,
- pci_name(dev));
- return -ENOSPC;
- }
- return index;
-}
-
static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
{
struct irq_chip *chip = &msi_chip;
@@ -3285,7 +3152,6 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
int node, ret, sub_handle, index = 0;
unsigned int irq, irq_want;
struct msi_desc *msidesc;
- struct intel_iommu *iommu = NULL;
/* x86 doesn't support multiple MSI yet */
if (type == PCI_CAP_ID_MSI && nvec > 1)
@@ -3299,7 +3165,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
if (irq == 0)
return -1;
irq_want = irq + 1;
- if (!intr_remapping_enabled)
+ if (!irq_remapping_enabled)
goto no_ir;
if (!sub_handle) {
@@ -3307,23 +3173,16 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
* allocate the consecutive block of IRTE's
* for 'nvec'
*/
- index = msi_alloc_irte(dev, irq, nvec);
+ index = msi_alloc_remapped_irq(dev, irq, nvec);
if (index < 0) {
ret = index;
goto error;
}
} else {
- iommu = map_dev_to_ir(dev);
- if (!iommu) {
- ret = -ENOENT;
+ ret = msi_setup_remapped_irq(dev, irq, index,
+ sub_handle);
+ if (ret < 0)
goto error;
- }
- /*
- * setup the mapping between the irq and the IRTE
- * base index, the sub_handle pointing to the
- * appropriate interrupt remap table entry.
- */
- set_irte_irq(irq, iommu, index, sub_handle);
}
no_ir:
ret = setup_msi_irq(dev, msidesc, irq);
@@ -3441,15 +3300,8 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
struct msi_msg msg;
int ret;
- if (intr_remapping_enabled) {
- struct intel_iommu *iommu = map_hpet_to_ir(id);
- int index;
-
- if (!iommu)
- return -1;
-
- index = alloc_irte(iommu, irq, 1);
- if (index < 0)
+ if (irq_remapping_enabled) {
+ if (!setup_hpet_msi_remapped(irq, id))
return -1;
}
@@ -3828,8 +3680,8 @@ void __init setup_ioapic_dest(void)
else
mask = apic->target_cpus();
- if (intr_remapping_enabled)
- ir_ioapic_set_affinity(idata, mask, false);
+ if (irq_remapping_enabled)
+ set_remapped_irq_affinity(idata, mask, false);
else
ioapic_set_affinity(idata, mask, false);
}
@@ -3871,7 +3723,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
return res;
}
-void __init ioapic_and_gsi_init(void)
+void __init native_io_apic_init_mappings(void)
{
unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
struct resource *ioapic_res;
@@ -3967,18 +3819,36 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi)
static __init int bad_ioapic(unsigned long address)
{
if (nr_ioapics >= MAX_IO_APICS) {
- printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded "
- "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics);
+ pr_warn("WARNING: Max # of I/O APICs (%d) exceeded (found %d), skipping\n",
+ MAX_IO_APICS, nr_ioapics);
return 1;
}
if (!address) {
- printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address"
- " found in table, skipping!\n");
+ pr_warn("WARNING: Bogus (zero) I/O APIC address found in table, skipping!\n");
return 1;
}
return 0;
}
+static __init int bad_ioapic_register(int idx)
+{
+ union IO_APIC_reg_00 reg_00;
+ union IO_APIC_reg_01 reg_01;
+ union IO_APIC_reg_02 reg_02;
+
+ reg_00.raw = io_apic_read(idx, 0);
+ reg_01.raw = io_apic_read(idx, 1);
+ reg_02.raw = io_apic_read(idx, 2);
+
+ if (reg_00.raw == -1 && reg_01.raw == -1 && reg_02.raw == -1) {
+ pr_warn("I/O APIC 0x%x registers return all ones, skipping!\n",
+ mpc_ioapic_addr(idx));
+ return 1;
+ }
+
+ return 0;
+}
+
void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
{
int idx = 0;
@@ -3995,6 +3865,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
ioapics[idx].mp_config.apicaddr = address;
set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+
+ if (bad_ioapic_register(idx)) {
+ clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
+ return;
+ }
+
ioapics[idx].mp_config.apicid = io_apic_unique_id(id);
ioapics[idx].mp_config.apicver = io_apic_get_version(idx);
@@ -4015,10 +3891,10 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
if (gsi_cfg->gsi_end >= gsi_top)
gsi_top = gsi_cfg->gsi_end + 1;
- printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
- "GSI %d-%d\n", idx, mpc_ioapic_id(idx),
- mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
- gsi_cfg->gsi_base, gsi_cfg->gsi_end);
+ pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n",
+ idx, mpc_ioapic_id(idx),
+ mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
+ gsi_cfg->gsi_base, gsi_cfg->gsi_end);
nr_ioapics++;
}
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index c4a61ca..f00a68c 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -478,6 +478,7 @@ static struct apic __refdata apic_numaq = {
.name = "NUMAQ",
.probe = probe_numaq,
.acpi_madt_oem_check = NULL,
+ .apic_id_valid = default_apic_id_valid,
.apic_id_registered = numaq_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio,
@@ -529,6 +530,7 @@ static struct apic __refdata apic_numaq = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 0787bb3..1b291da 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -92,6 +92,7 @@ static struct apic apic_default = {
.name = "default",
.probe = probe_default,
.acpi_madt_oem_check = NULL,
+ .apic_id_valid = default_apic_id_valid,
.apic_id_registered = default_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio,
@@ -141,6 +142,7 @@ static struct apic apic_default = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 1911442..659897c 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -496,6 +496,7 @@ static struct apic apic_summit = {
.name = "summit",
.probe = probe_summit,
.acpi_madt_oem_check = summit_acpi_madt_oem_check,
+ .apic_id_valid = default_apic_id_valid,
.apic_id_registered = summit_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio,
@@ -545,6 +546,7 @@ static struct apic apic_summit = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 5007958..ff35cff 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -213,6 +213,7 @@ static struct apic apic_x2apic_cluster = {
.name = "cluster x2apic",
.probe = x2apic_cluster_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+ .apic_id_valid = x2apic_apic_id_valid,
.apic_id_registered = x2apic_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio,
@@ -259,6 +260,7 @@ static struct apic apic_x2apic_cluster = {
.read = native_apic_msr_read,
.write = native_apic_msr_write,
+ .eoi_write = native_apic_msr_eoi_write,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
.wait_icr_idle = native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index f5373df..c17e982 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -24,6 +24,12 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
if (x2apic_phys)
return x2apic_enabled();
+ else if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) &&
+ (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) &&
+ x2apic_enabled()) {
+ printk(KERN_DEBUG "System requires x2apic physical mode\n");
+ return 1;
+ }
else
return 0;
}
@@ -119,6 +125,7 @@ static struct apic apic_x2apic_phys = {
.name = "physical x2apic",
.probe = x2apic_phys_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+ .apic_id_valid = x2apic_apic_id_valid,
.apic_id_registered = x2apic_apic_id_registered,
.irq_delivery_mode = dest_Fixed,
@@ -165,6 +172,7 @@ static struct apic apic_x2apic_phys = {
.read = native_apic_msr_read,
.write = native_apic_msr_write,
+ .eoi_write = native_apic_msr_eoi_write,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
.wait_icr_idle = native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 79b05b8..c6d03f7 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -266,6 +266,11 @@ static void uv_send_IPI_all(int vector)
uv_send_IPI_mask(cpu_online_mask, vector);
}
+static int uv_apic_id_valid(int apicid)
+{
+ return 1;
+}
+
static int uv_apic_id_registered(void)
{
return 1;
@@ -351,6 +356,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
.name = "UV large system",
.probe = uv_probe,
.acpi_madt_oem_check = uv_acpi_madt_oem_check,
+ .apic_id_valid = uv_apic_id_valid,
.apic_id_registered = uv_apic_id_registered,
.irq_delivery_mode = dest_Fixed,
@@ -398,6 +404,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
.read = native_apic_msr_read,
.write = native_apic_msr_write,
+ .eoi_write = native_apic_msr_eoi_write,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
.wait_icr_idle = native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index f76623c..07b0c0d 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -231,7 +231,6 @@
#include <linux/syscore_ops.h>
#include <linux/i8253.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/desc.h>
#include <asm/olpc.h>
@@ -1234,8 +1233,7 @@ static int suspend(int vetoable)
struct apm_user *as;
dpm_suspend_start(PMSG_SUSPEND);
-
- dpm_suspend_noirq(PMSG_SUSPEND);
+ dpm_suspend_end(PMSG_SUSPEND);
local_irq_disable();
syscore_suspend();
@@ -1259,9 +1257,9 @@ static int suspend(int vetoable)
syscore_resume();
local_irq_enable();
- dpm_resume_noirq(PMSG_RESUME);
-
+ dpm_resume_start(PMSG_RESUME);
dpm_resume_end(PMSG_RESUME);
+
queue_event(APM_NORMAL_RESUME, NULL);
spin_lock(&user_list_lock);
for (as = user_list; as != NULL; as = as->next) {
@@ -1277,7 +1275,7 @@ static void standby(void)
{
int err;
- dpm_suspend_noirq(PMSG_SUSPEND);
+ dpm_suspend_end(PMSG_SUSPEND);
local_irq_disable();
syscore_suspend();
@@ -1291,7 +1289,7 @@ static void standby(void)
syscore_resume();
local_irq_enable();
- dpm_resume_noirq(PMSG_RESUME);
+ dpm_resume_start(PMSG_RESUME);
}
static apm_event_t get_event(void)
@@ -2403,7 +2401,7 @@ static void __exit apm_exit(void)
* (pm_idle), Wait for all processors to update cached/local
* copies of pm_idle before proceeding.
*/
- cpu_idle_wait();
+ kick_all_cpus_sync();
}
if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
&& (apm_info.connection_version > 0x0100)) {
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 834e897..1b4754f 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,6 +1,12 @@
#include <asm/ia32.h>
#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
+#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1,
+#ifdef CONFIG_X86_X32_ABI
+# define __SYSCALL_X32(nr, sym, compat) [nr] = 1,
+#else
+# define __SYSCALL_X32(nr, sym, compat) /* nothing */
+#endif
static char syscalls_64[] = {
#include <asm/syscalls_64.h>
};
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 5da1269..e2dbcb7 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -27,21 +27,29 @@ static int num_scan_areas;
static __init int set_corruption_check(char *arg)
{
- char *end;
+ ssize_t ret;
+ unsigned long val;
- memory_corruption_check = simple_strtol(arg, &end, 10);
+ ret = kstrtoul(arg, 10, &val);
+ if (ret)
+ return ret;
- return (*end == 0) ? 0 : -EINVAL;
+ memory_corruption_check = val;
+ return 0;
}
early_param("memory_corruption_check", set_corruption_check);
static __init int set_corruption_check_period(char *arg)
{
- char *end;
+ ssize_t ret;
+ unsigned long val;
- corruption_check_period = simple_strtoul(arg, &end, 10);
+ ret = kstrtoul(arg, 10, &val);
+ if (ret)
+ return ret;
- return (*end == 0) ? 0 : -EINVAL;
+ corruption_check_period = val;
+ return 0;
}
early_param("memory_corruption_check_period", set_corruption_check_period);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 25f24dc..6ab6aa2 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -16,6 +16,7 @@ obj-y := intel_cacheinfo.o scattered.o topology.o
obj-y += proc.o capflags.o powerflags.o common.o
obj-y += vmware.o hypervisor.o sched.o mshyperv.o
obj-y += rdrand.o
+obj-y += match.o
obj-$(CONFIG_X86_32) += bugs.o
obj-$(CONFIG_X86_64) += bugs_64.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index f4773f4..146bb62 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
#include <linux/mm.h>
#include <linux/io.h>
+#include <linux/sched.h>
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cpu.h>
@@ -25,7 +26,8 @@
* contact AMD for precise details and a CPU swap.
*
* See http://www.multimania.com/poulot/k6bug.html
- * http://www.amd.com/K6/k6docs/revgd.html
+ * and section 2.6.2 of "AMD-K6 Processor Revision Guide - Model 6"
+ * (Publication # 21266 Issue Date: August 1998)
*
* The following test is erm.. interesting. AMD neglected to up
* the chip setting when fixing the bug but they also tweaked some
@@ -93,7 +95,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
"system stability may be impaired when more than 32 MB are used.\n");
else
printk(KERN_CONT "probably OK (after B9730xxxx).\n");
- printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
}
/* K6 with old style WHCR */
@@ -352,10 +353,11 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
node = per_cpu(cpu_llc_id, cpu);
/*
- * If core numbers are inconsistent, it's likely a multi-fabric platform,
- * so invoke platform-specific handler
+ * On multi-fabric platform (e.g. Numascale NumaChip) a
+ * platform-specific handler needs to be called to fixup some
+ * IDs of the CPU.
*/
- if (c->phys_proc_id != node)
+ if (x86_cpuinit.fixup_cpu_id)
x86_cpuinit.fixup_cpu_id(c, node);
if (!node_online(node)) {
@@ -456,6 +458,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ if (!check_tsc_unstable())
+ sched_clock_stable = 1;
}
#ifdef CONFIG_X86_64
@@ -576,6 +580,24 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
}
}
+ /* re-enable TopologyExtensions if switched off by BIOS */
+ if ((c->x86 == 0x15) &&
+ (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&
+ !cpu_has(c, X86_FEATURE_TOPOEXT)) {
+ u64 val;
+
+ if (!rdmsrl_amd_safe(0xc0011005, &val)) {
+ val |= 1ULL << 54;
+ wrmsrl_amd_safe(0xc0011005, val);
+ rdmsrl(0xc0011005, val);
+ if (val & (1ULL << 54)) {
+ set_cpu_cap(c, X86_FEATURE_TOPOEXT);
+ printk(KERN_INFO FW_INFO "CPU: Re-enabling "
+ "disabled Topology Extensions Support\n");
+ }
+ }
+ }
+
cpu_detect_cache_sizes(c);
/* Multi core CPU? */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c0f7d68..82f29e7 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -18,6 +18,7 @@
#include <asm/archrandom.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
+#include <asm/debugreg.h>
#include <asm/sections.h>
#include <linux/topology.h>
#include <linux/cpumask.h>
@@ -28,6 +29,7 @@
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/i387.h>
+#include <asm/fpu-internal.h>
#include <asm/mtrr.h>
#include <linux/numa.h>
#include <asm/asm.h>
@@ -933,7 +935,7 @@ static const struct msr_range msr_range_array[] __cpuinitconst = {
{ 0xc0011000, 0xc001103b},
};
-static void __cpuinit print_cpu_msr(void)
+static void __cpuinit __print_cpu_msr(void)
{
unsigned index_min, index_max;
unsigned index;
@@ -997,13 +999,13 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
else
printk(KERN_CONT "\n");
-#ifdef CONFIG_SMP
+ print_cpu_msr(c);
+}
+
+void __cpuinit print_cpu_msr(struct cpuinfo_x86 *c)
+{
if (c->cpu_index < show_msr)
- print_cpu_msr();
-#else
- if (show_msr)
- print_cpu_msr();
-#endif
+ __print_cpu_msr();
}
static __init int setup_disablecpuid(char *arg)
@@ -1045,7 +1047,6 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =
DEFINE_PER_CPU(unsigned int, irq_count) = -1;
DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
-EXPORT_PER_CPU_SYMBOL(fpu_owner_task);
/*
* Special IST stacks which the CPU switches to when it calls
@@ -1115,7 +1116,6 @@ void debug_stack_reset(void)
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
-EXPORT_PER_CPU_SYMBOL(fpu_owner_task);
#ifdef CONFIG_CC_STACKPROTECTOR
DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
@@ -1163,15 +1163,6 @@ static void dbg_restore_debug_regs(void)
#endif /* ! CONFIG_KGDB */
/*
- * Prints an error where the NUMA and configured core-number mismatch and the
- * platform didn't override this to fix it up
- */
-void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node)
-{
- pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id);
-}
-
-/*
* cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT
* and IDT. We reload them nevertheless, this function acts as a
@@ -1194,7 +1185,7 @@ void __cpuinit cpu_init(void)
oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA
- if (cpu != 0 && percpu_read(numa_node) == 0 &&
+ if (cpu != 0 && this_cpu_read(numa_node) == 0 &&
early_cpu_to_node(cpu) != NUMA_NO_NODE)
set_numa_node(early_cpu_to_node(cpu));
#endif
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 73d08ed..9a7c90d 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -433,14 +433,14 @@ int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot,
/* check if @slot is already used or the index is already disabled */
ret = amd_get_l3_disable_slot(nb, slot);
if (ret >= 0)
- return -EINVAL;
+ return -EEXIST;
if (index > nb->l3_cache.indices)
return -EINVAL;
/* check whether the other slot has disabled the same index already */
if (index == amd_get_l3_disable_slot(nb, !slot))
- return -EINVAL;
+ return -EEXIST;
amd_l3_disable_index(nb, cpu, slot, index);
@@ -468,8 +468,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val);
if (err) {
if (err == -EEXIST)
- printk(KERN_WARNING "L3 disable slot %d in use!\n",
- slot);
+ pr_warning("L3 slot %d in use/index already disabled!\n",
+ slot);
return err;
}
return count;
@@ -615,14 +615,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
new_l2 = this_leaf.size/1024;
num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
index_msb = get_count_order(num_threads_sharing);
- l2_id = c->apicid >> index_msb;
+ l2_id = c->apicid & ~((1 << index_msb) - 1);
break;
case 3:
new_l3 = this_leaf.size/1024;
num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
index_msb = get_count_order(
num_threads_sharing);
- l3_id = c->apicid >> index_msb;
+ l3_id = c->apicid & ~((1 << index_msb) - 1);
break;
default:
break;
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
new file mode 100644
index 0000000..3656537
--- /dev/null
+++ b/arch/x86/kernel/cpu/match.c
@@ -0,0 +1,91 @@
+#include <asm/cpu_device_id.h>
+#include <asm/processor.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+/**
+ * x86_match_cpu - match current CPU again an array of x86_cpu_ids
+ * @match: Pointer to array of x86_cpu_ids. Last entry terminated with
+ * {}.
+ *
+ * Return the entry if the current CPU matches the entries in the
+ * passed x86_cpu_id match table. Otherwise NULL. The match table
+ * contains vendor (X86_VENDOR_*), family, model and feature bits or
+ * respective wildcard entries.
+ *
+ * A typical table entry would be to match a specific CPU
+ * { X86_VENDOR_INTEL, 6, 0x12 }
+ * or to match a specific CPU feature
+ * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) }
+ *
+ * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
+ * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)
+ *
+ * Arrays used to match for this should also be declared using
+ * MODULE_DEVICE_TABLE(x86cpu, ...)
+ *
+ * This always matches against the boot cpu, assuming models and features are
+ * consistent over all CPUs.
+ */
+const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
+{
+ const struct x86_cpu_id *m;
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ for (m = match; m->vendor | m->family | m->model | m->feature; m++) {
+ if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor)
+ continue;
+ if (m->family != X86_FAMILY_ANY && c->x86 != m->family)
+ continue;
+ if (m->model != X86_MODEL_ANY && c->x86_model != m->model)
+ continue;
+ if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature))
+ continue;
+ return m;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(x86_match_cpu);
+
+ssize_t arch_print_cpu_modalias(struct device *dev,
+ struct device_attribute *attr,
+ char *bufptr)
+{
+ int size = PAGE_SIZE;
+ int i, n;
+ char *buf = bufptr;
+
+ n = snprintf(buf, size, "x86cpu:vendor:%04X:family:%04X:"
+ "model:%04X:feature:",
+ boot_cpu_data.x86_vendor,
+ boot_cpu_data.x86,
+ boot_cpu_data.x86_model);
+ size -= n;
+ buf += n;
+ size -= 1;
+ for (i = 0; i < NCAPINTS*32; i++) {
+ if (boot_cpu_has(i)) {
+ n = snprintf(buf, size, ",%04X", i);
+ if (n >= size) {
+ WARN(1, "x86 features overflow page\n");
+ break;
+ }
+ size -= n;
+ buf += n;
+ }
+ }
+ *buf++ = '\n';
+ return buf - bufptr;
+}
+
+int arch_cpu_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+ char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (buf) {
+ arch_print_cpu_modalias(NULL, NULL, buf);
+ add_uevent_var(env, "MODALIAS=%s", buf);
+ kfree(buf);
+ }
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 7395d5f..0c82091 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -54,7 +54,14 @@ static struct severity {
#define MASK(x, y) .mask = x, .result = y
#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
+#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
#define MCACOD 0xffff
+/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
+#define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */
+#define MCACOD_SCRUBMSK 0xfff0
+#define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */
+#define MCACOD_DATA 0x0134 /* Data Load */
+#define MCACOD_INSTR 0x0150 /* Instruction Fetch */
MCESEV(
NO, "Invalid",
@@ -102,11 +109,24 @@ static struct severity {
SER, BITCLR(MCI_STATUS_S)
),
- /* AR add known MCACODs here */
MCESEV(
PANIC, "Action required with lost events",
SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
),
+
+ /* known AR MCACODs: */
+#ifdef CONFIG_MEMORY_FAILURE
+ MCESEV(
+ KEEP, "HT thread notices Action required: data load error",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ MCGMASK(MCG_STATUS_EIPV, 0)
+ ),
+ MCESEV(
+ AR, "Action required: data load error",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ USER
+ ),
+#endif
MCESEV(
PANIC, "Action required: unknown MCACOD",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
@@ -115,11 +135,11 @@ static struct severity {
/* known AO MCACODs: */
MCESEV(
AO, "Action optional: memory scrubbing error",
- SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0)
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB)
),
MCESEV(
AO, "Action optional: last level cache writeback error",
- SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a)
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB)
),
MCESEV(
SOME, "Action optional: unknown MCACOD",
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5a11ae2..297edb1 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -191,7 +191,7 @@ static void drain_mcelog_buffer(void)
{
unsigned int next, i, prev = 0;
- next = rcu_dereference_check_mce(mcelog.next);
+ next = ACCESS_ONCE(mcelog.next);
do {
struct mce *m;
@@ -540,6 +540,27 @@ static void mce_report_event(struct pt_regs *regs)
irq_work_queue(&__get_cpu_var(mce_irq_work));
}
+/*
+ * Read ADDR and MISC registers.
+ */
+static void mce_read_aux(struct mce *m, int i)
+{
+ if (m->status & MCI_STATUS_MISCV)
+ m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
+ if (m->status & MCI_STATUS_ADDRV) {
+ m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+
+ /*
+ * Mask the reported address by the reported granularity.
+ */
+ if (mce_ser && (m->status & MCI_STATUS_MISCV)) {
+ u8 shift = MCI_MISC_ADDR_LSB(m->misc);
+ m->addr >>= shift;
+ m->addr <<= shift;
+ }
+ }
+}
+
DEFINE_PER_CPU(unsigned, mce_poll_count);
/*
@@ -562,7 +583,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
struct mce m;
int i;
- percpu_inc(mce_poll_count);
+ this_cpu_inc(mce_poll_count);
mce_gather_info(&m, NULL);
@@ -590,10 +611,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
continue;
- if (m.status & MCI_STATUS_MISCV)
- m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
- if (m.status & MCI_STATUS_ADDRV)
- m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+ mce_read_aux(&m, i);
if (!(flags & MCP_TIMESTAMP))
m.tsc = 0;
@@ -917,6 +935,51 @@ static void mce_clear_state(unsigned long *toclear)
}
/*
+ * Need to save faulting physical address associated with a process
+ * in the machine check handler some place where we can grab it back
+ * later in mce_notify_process()
+ */
+#define MCE_INFO_MAX 16
+
+struct mce_info {
+ atomic_t inuse;
+ struct task_struct *t;
+ __u64 paddr;
+ int restartable;
+} mce_info[MCE_INFO_MAX];
+
+static void mce_save_info(__u64 addr, int c)
+{
+ struct mce_info *mi;
+
+ for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
+ if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
+ mi->t = current;
+ mi->paddr = addr;
+ mi->restartable = c;
+ return;
+ }
+ }
+
+ mce_panic("Too many concurrent recoverable errors", NULL, NULL);
+}
+
+static struct mce_info *mce_find_info(void)
+{
+ struct mce_info *mi;
+
+ for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
+ if (atomic_read(&mi->inuse) && mi->t == current)
+ return mi;
+ return NULL;
+}
+
+static void mce_clear_info(struct mce_info *mi)
+{
+ atomic_set(&mi->inuse, 0);
+}
+
+/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
*
@@ -954,7 +1017,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
atomic_inc(&mce_entry);
- percpu_inc(mce_exception_count);
+ this_cpu_inc(mce_exception_count);
if (!banks)
goto out;
@@ -969,7 +1032,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
barrier();
/*
- * When no restart IP must always kill or panic.
+ * When no restart IP might need to kill or panic.
+ * Assume the worst for now, but if we find the
+ * severity is MCE_AR_SEVERITY we have other options.
*/
if (!(m.mcgstatus & MCG_STATUS_RIPV))
kill_it = 1;
@@ -1023,16 +1088,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
continue;
}
- /*
- * Kill on action required.
- */
- if (severity == MCE_AR_SEVERITY)
- kill_it = 1;
-
- if (m.status & MCI_STATUS_MISCV)
- m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
- if (m.status & MCI_STATUS_ADDRV)
- m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+ mce_read_aux(&m, i);
/*
* Action optional error. Queue address for later processing.
@@ -1052,6 +1108,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
}
}
+ /* mce_clear_state will clear *final, save locally for use later */
+ m = *final;
+
if (!no_way_out)
mce_clear_state(toclear);
@@ -1063,27 +1122,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)
no_way_out = worst >= MCE_PANIC_SEVERITY;
/*
- * If we have decided that we just CAN'T continue, and the user
- * has not set tolerant to an insane level, give up and die.
- *
- * This is mainly used in the case when the system doesn't
- * support MCE broadcasting or it has been disabled.
- */
- if (no_way_out && tolerant < 3)
- mce_panic("Fatal machine check on current CPU", final, msg);
-
- /*
- * If the error seems to be unrecoverable, something should be
- * done. Try to kill as little as possible. If we can kill just
- * one task, do that. If the user has set the tolerance very
- * high, don't try to do anything at all.
+ * At insane "tolerant" levels we take no action. Otherwise
+ * we only die if we have no other choice. For less serious
+ * issues we try to recover, or limit damage to the current
+ * process.
*/
-
- if (kill_it && tolerant < 3)
- force_sig(SIGBUS, current);
-
- /* notify userspace ASAP */
- set_thread_flag(TIF_MCE_NOTIFY);
+ if (tolerant < 3) {
+ if (no_way_out)
+ mce_panic("Fatal machine check on current CPU", &m, msg);
+ if (worst == MCE_AR_SEVERITY) {
+ /* schedule action before return to userland */
+ mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
+ set_thread_flag(TIF_MCE_NOTIFY);
+ } else if (kill_it) {
+ force_sig(SIGBUS, current);
+ }
+ }
if (worst > 0)
mce_report_event(regs);
@@ -1094,34 +1148,63 @@ out:
}
EXPORT_SYMBOL_GPL(do_machine_check);
-/* dummy to break dependency. actual code is in mm/memory-failure.c */
-void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
+#ifndef CONFIG_MEMORY_FAILURE
+int memory_failure(unsigned long pfn, int vector, int flags)
{
- printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
+ /* mce_severity() should not hand us an ACTION_REQUIRED error */
+ BUG_ON(flags & MF_ACTION_REQUIRED);
+ printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"
+ "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);
+
+ return 0;
}
+#endif
/*
- * Called after mce notification in process context. This code
- * is allowed to sleep. Call the high level VM handler to process
- * any corrupted pages.
- * Assume that the work queue code only calls this one at a time
- * per CPU.
- * Note we don't disable preemption, so this code might run on the wrong
- * CPU. In this case the event is picked up by the scheduled work queue.
- * This is merely a fast path to expedite processing in some common
- * cases.
+ * Called in process context that interrupted by MCE and marked with
+ * TIF_MCE_NOTIFY, just before returning to erroneous userland.
+ * This code is allowed to sleep.
+ * Attempt possible recovery such as calling the high level VM handler to
+ * process any corrupted pages, and kill/signal current process if required.
+ * Action required errors are handled here.
*/
void mce_notify_process(void)
{
unsigned long pfn;
- mce_notify_irq();
- while (mce_ring_get(&pfn))
- memory_failure(pfn, MCE_VECTOR);
+ struct mce_info *mi = mce_find_info();
+
+ if (!mi)
+ mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
+ pfn = mi->paddr >> PAGE_SHIFT;
+
+ clear_thread_flag(TIF_MCE_NOTIFY);
+
+ pr_err("Uncorrected hardware memory error in user-access at %llx",
+ mi->paddr);
+ /*
+ * We must call memory_failure() here even if the current process is
+ * doomed. We still need to mark the page as poisoned and alert any
+ * other users of the page.
+ */
+ if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 ||
+ mi->restartable == 0) {
+ pr_err("Memory error not recovered");
+ force_sig(SIGBUS, current);
+ }
+ mce_clear_info(mi);
}
+/*
+ * Action optional processing happens here (picking up
+ * from the list of faulting pages that do_machine_check()
+ * placed into the "ring").
+ */
static void mce_process_work(struct work_struct *dummy)
{
- mce_notify_process();
+ unsigned long pfn;
+
+ while (mce_ring_get(&pfn))
+ memory_failure(pfn, MCE_VECTOR, 0);
}
#ifdef CONFIG_X86_MCE_INTEL
@@ -1211,8 +1294,6 @@ int mce_notify_irq(void)
/* Not more than two messages every minute */
static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
- clear_thread_flag(TIF_MCE_NOTIFY);
-
if (test_and_clear_bit(0, &mce_need_notify)) {
/* wake processes polling /dev/mcelog */
wake_up_interruptible(&mce_chrdev_wait);
@@ -1541,6 +1622,12 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
/* Error or no more MCE record */
if (rc <= 0) {
mce_apei_read_done = 1;
+ /*
+ * When ERST is disabled, mce_chrdev_read() should return
+ * "no record" instead of "no device."
+ */
+ if (rc == -ENODEV)
+ return 0;
return rc;
}
rc = -EFAULT;
@@ -1859,7 +1946,7 @@ static struct bus_type mce_subsys = {
.dev_name = "machinecheck",
};
-struct device *mce_device[CONFIG_NR_CPUS];
+DEFINE_PER_CPU(struct device *, mce_device);
__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -2038,7 +2125,7 @@ static __cpuinit int mce_device_create(unsigned int cpu)
goto error2;
}
cpumask_set_cpu(cpu, mce_device_initialized);
- mce_device[cpu] = dev;
+ per_cpu(mce_device, cpu) = dev;
return 0;
error2:
@@ -2055,7 +2142,7 @@ error:
static __cpuinit void mce_device_remove(unsigned int cpu)
{
- struct device *dev = mce_device[cpu];
+ struct device *dev = per_cpu(mce_device, cpu);
int i;
if (!cpumask_test_cpu(cpu, mce_device_initialized))
@@ -2069,7 +2156,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu)
device_unregister(dev);
cpumask_clear_cpu(cpu, mce_device_initialized);
- mce_device[cpu] = NULL;
+ per_cpu(mce_device, cpu) = NULL;
}
/* Make sure there are no machine checks on offlined CPUs. */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index e4eeaaf..99b5717 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -523,7 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
{
int i, err = 0;
struct threshold_bank *b = NULL;
- struct device *dev = mce_device[cpu];
+ struct device *dev = per_cpu(mce_device, cpu);
char name[32];
sprintf(name, "threshold_bank%i", bank);
@@ -587,7 +587,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (i == cpu)
continue;
- dev = mce_device[i];
+ dev = per_cpu(mce_device, i);
if (dev)
err = sysfs_create_link(&dev->kobj,b->kobj, name);
if (err)
@@ -667,7 +667,8 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
#ifdef CONFIG_SMP
/* sibling symlink */
if (shared_bank[bank] && b->blocks->cpu != cpu) {
- sysfs_remove_link(&mce_device[cpu]->kobj, name);
+ dev = per_cpu(mce_device, cpu);
+ sysfs_remove_link(&dev->kobj, name);
per_cpu(threshold_banks, cpu)[bank] = NULL;
return;
@@ -679,7 +680,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
if (i == cpu)
continue;
- dev = mce_device[i];
+ dev = per_cpu(mce_device, i);
if (dev)
sysfs_remove_link(&dev->kobj, name);
per_cpu(threshold_banks, i)[bank] = NULL;
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 5c0e653..2d5454c 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -9,7 +9,6 @@
#include <linux/smp.h>
#include <asm/processor.h>
-#include <asm/system.h>
#include <asm/mce.h>
#include <asm/msr.h>
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 67bb17a..47a1870 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -25,7 +25,6 @@
#include <linux/cpu.h>
#include <asm/processor.h>
-#include <asm/system.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/mce.h>
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 54060f5..2d7998f 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -8,7 +8,6 @@
#include <linux/init.h>
#include <asm/processor.h>
-#include <asm/system.h>
#include <asm/mce.h>
#include <asm/msr.h>
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 97b2635..75772ae 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -12,7 +12,6 @@
#include <asm/processor-flags.h>
#include <asm/cpufeature.h>
#include <asm/tlbflush.h>
-#include <asm/system.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
#include <asm/pat.h>
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 7928963..a041e09 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -167,6 +167,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
{
int err = 0;
mtrr_type type;
+ unsigned long base;
unsigned long size;
struct mtrr_sentry sentry;
struct mtrr_gentry gentry;
@@ -267,14 +268,14 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#endif
if (gentry.regnum >= num_var_ranges)
return -EINVAL;
- mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
+ mtrr_if->get(gentry.regnum, &base, &size, &type);
/* Hide entries that go above 4GB */
- if (gentry.base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
+ if (base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
|| size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)))
gentry.base = gentry.size = gentry.type = 0;
else {
- gentry.base <<= PAGE_SHIFT;
+ gentry.base = base << PAGE_SHIFT;
gentry.size = size << PAGE_SHIFT;
gentry.type = type;
}
@@ -321,11 +322,12 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#endif
if (gentry.regnum >= num_var_ranges)
return -EINVAL;
- mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
+ mtrr_if->get(gentry.regnum, &base, &size, &type);
/* Hide entries that would overflow */
if (size != (__typeof__(gentry.size))size)
gentry.base = gentry.size = gentry.type = 0;
else {
+ gentry.base = base;
gentry.size = size;
gentry.type = type;
}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5adce10..e049d6d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -24,13 +24,14 @@
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
+#include <linux/device.h>
#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
-#include <asm/compat.h>
#include <asm/smp.h>
#include <asm/alternative.h>
+#include <asm/timer.h>
#include "perf_event.h"
@@ -351,6 +352,36 @@ int x86_setup_perfctr(struct perf_event *event)
return 0;
}
+/*
+ * check that branch_sample_type is compatible with
+ * settings needed for precise_ip > 1 which implies
+ * using the LBR to capture ALL taken branches at the
+ * priv levels of the measurement
+ */
+static inline int precise_br_compat(struct perf_event *event)
+{
+ u64 m = event->attr.branch_sample_type;
+ u64 b = 0;
+
+ /* must capture all branches */
+ if (!(m & PERF_SAMPLE_BRANCH_ANY))
+ return 0;
+
+ m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
+
+ if (!event->attr.exclude_user)
+ b |= PERF_SAMPLE_BRANCH_USER;
+
+ if (!event->attr.exclude_kernel)
+ b |= PERF_SAMPLE_BRANCH_KERNEL;
+
+ /*
+ * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
+ */
+
+ return m == b;
+}
+
int x86_pmu_hw_config(struct perf_event *event)
{
if (event->attr.precise_ip) {
@@ -367,6 +398,36 @@ int x86_pmu_hw_config(struct perf_event *event)
if (event->attr.precise_ip > precise)
return -EOPNOTSUPP;
+ /*
+ * check that PEBS LBR correction does not conflict with
+ * whatever the user is asking with attr->branch_sample_type
+ */
+ if (event->attr.precise_ip > 1) {
+ u64 *br_type = &event->attr.branch_sample_type;
+
+ if (has_branch_stack(event)) {
+ if (!precise_br_compat(event))
+ return -EOPNOTSUPP;
+
+ /* branch_sample_type is compatible */
+
+ } else {
+ /*
+ * user did not specify branch_sample_type
+ *
+ * For PEBS fixups, we capture all
+ * the branches at the priv level of the
+ * event.
+ */
+ *br_type = PERF_SAMPLE_BRANCH_ANY;
+
+ if (!event->attr.exclude_user)
+ *br_type |= PERF_SAMPLE_BRANCH_USER;
+
+ if (!event->attr.exclude_kernel)
+ *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
+ }
+ }
}
/*
@@ -423,6 +484,7 @@ static int __x86_pmu_event_init(struct perf_event *event)
/* mark unused */
event->hw.extra_reg.idx = EXTRA_REG_NONE;
+ event->hw.branch_reg.idx = EXTRA_REG_NONE;
return x86_pmu.hw_config(event);
}
@@ -577,14 +639,14 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
/* Prefer fixed purpose counters */
if (x86_pmu.num_counters_fixed) {
idx = X86_PMC_IDX_FIXED;
- for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) {
+ for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
if (!__test_and_set_bit(idx, sched->state.used))
goto done;
}
}
/* Grab the first unused counter starting with idx */
idx = sched->state.counter;
- for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) {
+ for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) {
if (!__test_and_set_bit(idx, sched->state.used))
goto done;
}
@@ -1121,8 +1183,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
int idx, handled = 0;
u64 val;
- perf_sample_data_init(&data, 0);
-
cpuc = &__get_cpu_var(cpu_hw_events);
/*
@@ -1157,7 +1217,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
* event overflow
*/
handled++;
- data.period = event->hw.last_period;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
if (!x86_perf_event_set_period(event))
continue;
@@ -1210,6 +1270,8 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
break;
case CPU_STARTING:
+ if (x86_pmu.attr_rdpmc)
+ set_in_cr4(X86_CR4_PCE);
if (x86_pmu.cpu_starting)
x86_pmu.cpu_starting(cpu);
break;
@@ -1246,6 +1308,11 @@ static void __init pmu_check_apic(void)
pr_info("no hardware sampling interrupt available.\n");
}
+static struct attribute_group x86_pmu_format_group = {
+ .name = "format",
+ .attrs = NULL,
+};
+
static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
@@ -1319,6 +1386,9 @@ static int __init init_hw_perf_events(void)
}
}
+ x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
+ x86_pmu_format_group.attrs = x86_pmu.format_attrs;
+
pr_info("... version: %d\n", x86_pmu.version);
pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
pr_info("... generic registers: %d\n", x86_pmu.num_counters);
@@ -1542,23 +1612,115 @@ static int x86_pmu_event_init(struct perf_event *event)
return err;
}
+static int x86_pmu_event_idx(struct perf_event *event)
+{
+ int idx = event->hw.idx;
+
+ if (!x86_pmu.attr_rdpmc)
+ return 0;
+
+ if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) {
+ idx -= X86_PMC_IDX_FIXED;
+ idx |= 1 << 30;
+ }
+
+ return idx + 1;
+}
+
+static ssize_t get_attr_rdpmc(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
+}
+
+static void change_rdpmc(void *info)
+{
+ bool enable = !!(unsigned long)info;
+
+ if (enable)
+ set_in_cr4(X86_CR4_PCE);
+ else
+ clear_in_cr4(X86_CR4_PCE);
+}
+
+static ssize_t set_attr_rdpmc(struct device *cdev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long val = simple_strtoul(buf, NULL, 0);
+
+ if (!!val != !!x86_pmu.attr_rdpmc) {
+ x86_pmu.attr_rdpmc = !!val;
+ smp_call_function(change_rdpmc, (void *)val, 1);
+ }
+
+ return count;
+}
+
+static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
+
+static struct attribute *x86_pmu_attrs[] = {
+ &dev_attr_rdpmc.attr,
+ NULL,
+};
+
+static struct attribute_group x86_pmu_attr_group = {
+ .attrs = x86_pmu_attrs,
+};
+
+static const struct attribute_group *x86_pmu_attr_groups[] = {
+ &x86_pmu_attr_group,
+ &x86_pmu_format_group,
+ NULL,
+};
+
+static void x86_pmu_flush_branch_stack(void)
+{
+ if (x86_pmu.flush_branch_stack)
+ x86_pmu.flush_branch_stack();
+}
+
static struct pmu pmu = {
- .pmu_enable = x86_pmu_enable,
- .pmu_disable = x86_pmu_disable,
+ .pmu_enable = x86_pmu_enable,
+ .pmu_disable = x86_pmu_disable,
+
+ .attr_groups = x86_pmu_attr_groups,
.event_init = x86_pmu_event_init,
- .add = x86_pmu_add,
- .del = x86_pmu_del,
- .start = x86_pmu_start,
- .stop = x86_pmu_stop,
- .read = x86_pmu_read,
+ .add = x86_pmu_add,
+ .del = x86_pmu_del,
+ .start = x86_pmu_start,
+ .stop = x86_pmu_stop,
+ .read = x86_pmu_read,
.start_txn = x86_pmu_start_txn,
.cancel_txn = x86_pmu_cancel_txn,
.commit_txn = x86_pmu_commit_txn,
+
+ .event_idx = x86_pmu_event_idx,
+ .flush_branch_stack = x86_pmu_flush_branch_stack,
};
+void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
+{
+ userpg->cap_usr_time = 0;
+ userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc;
+ userpg->pmc_width = x86_pmu.cntval_bits;
+
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ return;
+
+ if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
+ return;
+
+ userpg->cap_usr_time = 1;
+ userpg->time_mult = this_cpu_read(cyc2ns);
+ userpg->time_shift = CYC2NS_SCALE_FACTOR;
+ userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
+}
+
/*
* callchain support
*/
@@ -1595,6 +1757,9 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
}
#ifdef CONFIG_COMPAT
+
+#include <asm/compat.h>
+
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index c30c807..6638aaf 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -33,6 +33,7 @@ enum extra_reg_type {
EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
+ EXTRA_REG_LBR = 2, /* lbr_select */
EXTRA_REG_MAX /* number of entries needed */
};
@@ -130,6 +131,8 @@ struct cpu_hw_events {
void *lbr_context;
struct perf_branch_stack lbr_stack;
struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
+ struct er_account *lbr_sel;
+ u64 br_sel;
/*
* Intel host/guest exclude bits
@@ -268,6 +271,29 @@ struct x86_pmu_quirk {
void (*func)(void);
};
+union x86_pmu_config {
+ struct {
+ u64 event:8,
+ umask:8,
+ usr:1,
+ os:1,
+ edge:1,
+ pc:1,
+ interrupt:1,
+ __reserved1:1,
+ en:1,
+ inv:1,
+ cmask:8,
+ event2:4,
+ __reserved2:4,
+ go:1,
+ ho:1;
+ } bits;
+ u64 value;
+};
+
+#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
+
/*
* struct x86_pmu - generic x86 pmu
*/
@@ -309,10 +335,20 @@ struct x86_pmu {
struct x86_pmu_quirk *quirks;
int perfctr_second_write;
+ /*
+ * sysfs attrs
+ */
+ int attr_rdpmc;
+ struct attribute **format_attrs;
+
+ /*
+ * CPU Hotplug hooks
+ */
int (*cpu_prepare)(int cpu);
void (*cpu_starting)(int cpu);
void (*cpu_dying)(int cpu);
void (*cpu_dead)(int cpu);
+ void (*flush_branch_stack)(void);
/*
* Intel Arch Perfmon v2+
@@ -334,6 +370,8 @@ struct x86_pmu {
*/
unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
int lbr_nr; /* hardware stack size */
+ u64 lbr_sel_mask; /* LBR_SELECT valid bits */
+ const int *lbr_sel_map; /* lbr_select mappings */
/*
* Extra registers for events
@@ -447,6 +485,15 @@ extern struct event_constraint emptyconstraint;
extern struct event_constraint unconstrained;
+static inline bool kernel_ip(unsigned long ip)
+{
+#ifdef CONFIG_X86_32
+ return ip > PAGE_OFFSET;
+#else
+ return (long)ip < 0;
+#endif
+}
+
#ifdef CONFIG_CPU_SUP_AMD
int amd_pmu_init(void);
@@ -527,6 +574,10 @@ void intel_pmu_lbr_init_nhm(void);
void intel_pmu_lbr_init_atom(void);
+void intel_pmu_lbr_init_snb(void);
+
+int intel_pmu_setup_lbr_filter(struct perf_event *event);
+
int p4_pmu_init(void);
int p6_pmu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 67250a5..6565226 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -134,11 +134,19 @@ static u64 amd_pmu_event_map(int hw_event)
static int amd_pmu_hw_config(struct perf_event *event)
{
- int ret = x86_pmu_hw_config(event);
+ int ret;
+ /* pass precise event sampling to ibs: */
+ if (event->attr.precise_ip && get_ibs_caps())
+ return -ENOENT;
+
+ ret = x86_pmu_hw_config(event);
if (ret)
return ret;
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
if (event->attr.exclude_host && event->attr.exclude_guest)
/*
* When HO == GO == 1 the hardware treats that as GO == HO == 0
@@ -202,10 +210,8 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
* when we come here
*/
for (i = 0; i < x86_pmu.num_counters; i++) {
- if (nb->owners[i] == event) {
- cmpxchg(nb->owners+i, event, NULL);
+ if (cmpxchg(nb->owners + i, event, NULL) == event)
break;
- }
}
}
@@ -401,6 +407,21 @@ static void amd_pmu_cpu_dead(int cpu)
}
}
+PMU_FORMAT_ATTR(event, "config:0-7,32-35");
+PMU_FORMAT_ATTR(umask, "config:8-15" );
+PMU_FORMAT_ATTR(edge, "config:18" );
+PMU_FORMAT_ATTR(inv, "config:23" );
+PMU_FORMAT_ATTR(cmask, "config:24-31" );
+
+static struct attribute *amd_format_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask.attr,
+ NULL,
+};
+
static __initconst const struct x86_pmu amd_pmu = {
.name = "AMD",
.handle_irq = x86_pmu_handle_irq,
@@ -423,6 +444,8 @@ static __initconst const struct x86_pmu amd_pmu = {
.get_event_constraints = amd_get_event_constraints,
.put_event_constraints = amd_put_event_constraints,
+ .format_attrs = amd_format_attr,
+
.cpu_prepare = amd_pmu_cpu_prepare,
.cpu_starting = amd_pmu_cpu_starting,
.cpu_dead = amd_pmu_cpu_dead,
@@ -593,6 +616,7 @@ static __initconst const struct x86_pmu amd_pmu_f15h = {
.cpu_dead = amd_pmu_cpu_dead,
#endif
.cpu_starting = amd_pmu_cpu_starting,
+ .format_attrs = amd_format_attr,
};
__init int amd_pmu_init(void)
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 3b8a2d3..da9bcdc 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -9,6 +9,7 @@
#include <linux/perf_event.h>
#include <linux/module.h>
#include <linux/pci.h>
+#include <linux/ptrace.h>
#include <asm/apic.h>
@@ -16,36 +17,591 @@ static u32 ibs_caps;
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
-static struct pmu perf_ibs;
+#include <linux/kprobes.h>
+#include <linux/hardirq.h>
+
+#include <asm/nmi.h>
+
+#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
+#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT
+
+enum ibs_states {
+ IBS_ENABLED = 0,
+ IBS_STARTED = 1,
+ IBS_STOPPING = 2,
+
+ IBS_MAX_STATES,
+};
+
+struct cpu_perf_ibs {
+ struct perf_event *event;
+ unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)];
+};
+
+struct perf_ibs {
+ struct pmu pmu;
+ unsigned int msr;
+ u64 config_mask;
+ u64 cnt_mask;
+ u64 enable_mask;
+ u64 valid_mask;
+ u64 max_period;
+ unsigned long offset_mask[1];
+ int offset_max;
+ struct cpu_perf_ibs __percpu *pcpu;
+ u64 (*get_count)(u64 config);
+};
+
+struct perf_ibs_data {
+ u32 size;
+ union {
+ u32 data[0]; /* data buffer starts here */
+ u32 caps;
+ };
+ u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX];
+};
+
+static int
+perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
+{
+ s64 left = local64_read(&hwc->period_left);
+ s64 period = hwc->sample_period;
+ int overflow = 0;
+
+ /*
+ * If we are way outside a reasonable range then just skip forward:
+ */
+ if (unlikely(left <= -period)) {
+ left = period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ overflow = 1;
+ }
+
+ if (unlikely(left < (s64)min)) {
+ left += period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ overflow = 1;
+ }
+
+ /*
+ * If the hw period that triggers the sw overflow is too short
+ * we might hit the irq handler. This biases the results.
+ * Thus we shorten the next-to-last period and set the last
+ * period to the max period.
+ */
+ if (left > max) {
+ left -= max;
+ if (left > max)
+ left = max;
+ else if (left < min)
+ left = min;
+ }
+
+ *hw_period = (u64)left;
+
+ return overflow;
+}
+
+static int
+perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int shift = 64 - width;
+ u64 prev_raw_count;
+ u64 delta;
+
+ /*
+ * Careful: an NMI might modify the previous event value.
+ *
+ * Our tactic to handle this is to first atomically read and
+ * exchange a new raw count - then add that new-prev delta
+ * count to the generic event atomically:
+ */
+ prev_raw_count = local64_read(&hwc->prev_count);
+ if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ new_raw_count) != prev_raw_count)
+ return 0;
+
+ /*
+ * Now we have the new raw value and have updated the prev
+ * timestamp already. We can now calculate the elapsed delta
+ * (event-)time and add that to the generic event.
+ *
+ * Careful, not all hw sign-extends above the physical width
+ * of the count.
+ */
+ delta = (new_raw_count << shift) - (prev_raw_count << shift);
+ delta >>= shift;
+
+ local64_add(delta, &event->count);
+ local64_sub(delta, &hwc->period_left);
+
+ return 1;
+}
+
+static struct perf_ibs perf_ibs_fetch;
+static struct perf_ibs perf_ibs_op;
+
+static struct perf_ibs *get_ibs_pmu(int type)
+{
+ if (perf_ibs_fetch.pmu.type == type)
+ return &perf_ibs_fetch;
+ if (perf_ibs_op.pmu.type == type)
+ return &perf_ibs_op;
+ return NULL;
+}
+
+/*
+ * Use IBS for precise event sampling:
+ *
+ * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count
+ * perf record -a -e r076:p ... # same as -e cpu-cycles:p
+ * perf record -a -e r0C1:p ... # use ibs op counting micro-ops
+ *
+ * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
+ * MSRC001_1033) is used to select either cycle or micro-ops counting
+ * mode.
+ *
+ * The rip of IBS samples has skid 0. Thus, IBS supports precise
+ * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
+ * rip is invalid when IBS was not able to record the rip correctly.
+ * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
+ *
+ */
+static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
+{
+ switch (event->attr.precise_ip) {
+ case 0:
+ return -ENOENT;
+ case 1:
+ case 2:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ switch (event->attr.type) {
+ case PERF_TYPE_HARDWARE:
+ switch (event->attr.config) {
+ case PERF_COUNT_HW_CPU_CYCLES:
+ *config = 0;
+ return 0;
+ }
+ break;
+ case PERF_TYPE_RAW:
+ switch (event->attr.config) {
+ case 0x0076:
+ *config = 0;
+ return 0;
+ case 0x00C1:
+ *config = IBS_OP_CNT_CTL;
+ return 0;
+ }
+ break;
+ default:
+ return -ENOENT;
+ }
+
+ return -EOPNOTSUPP;
+}
static int perf_ibs_init(struct perf_event *event)
{
- if (perf_ibs.type != event->attr.type)
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_ibs *perf_ibs;
+ u64 max_cnt, config;
+ int ret;
+
+ perf_ibs = get_ibs_pmu(event->attr.type);
+ if (perf_ibs) {
+ config = event->attr.config;
+ } else {
+ perf_ibs = &perf_ibs_op;
+ ret = perf_ibs_precise_event(event, &config);
+ if (ret)
+ return ret;
+ }
+
+ if (event->pmu != &perf_ibs->pmu)
return -ENOENT;
+
+ if (config & ~perf_ibs->config_mask)
+ return -EINVAL;
+
+ if (hwc->sample_period) {
+ if (config & perf_ibs->cnt_mask)
+ /* raw max_cnt may not be set */
+ return -EINVAL;
+ if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
+ /*
+ * lower 4 bits can not be set in ibs max cnt,
+ * but allowing it in case we adjust the
+ * sample period to set a frequency.
+ */
+ return -EINVAL;
+ hwc->sample_period &= ~0x0FULL;
+ if (!hwc->sample_period)
+ hwc->sample_period = 0x10;
+ } else {
+ max_cnt = config & perf_ibs->cnt_mask;
+ config &= ~perf_ibs->cnt_mask;
+ event->attr.sample_period = max_cnt << 4;
+ hwc->sample_period = event->attr.sample_period;
+ }
+
+ if (!hwc->sample_period)
+ return -EINVAL;
+
+ /*
+ * If we modify hwc->sample_period, we also need to update
+ * hwc->last_period and hwc->period_left.
+ */
+ hwc->last_period = hwc->sample_period;
+ local64_set(&hwc->period_left, hwc->sample_period);
+
+ hwc->config_base = perf_ibs->msr;
+ hwc->config = config;
+
return 0;
}
+static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
+ struct hw_perf_event *hwc, u64 *period)
+{
+ int overflow;
+
+ /* ignore lower 4 bits in min count: */
+ overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
+ local64_set(&hwc->prev_count, 0);
+
+ return overflow;
+}
+
+static u64 get_ibs_fetch_count(u64 config)
+{
+ return (config & IBS_FETCH_CNT) >> 12;
+}
+
+static u64 get_ibs_op_count(u64 config)
+{
+ u64 count = 0;
+
+ if (config & IBS_OP_VAL)
+ count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */
+
+ if (ibs_caps & IBS_CAPS_RDWROPCNT)
+ count += (config & IBS_OP_CUR_CNT) >> 32;
+
+ return count;
+}
+
+static void
+perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
+ u64 *config)
+{
+ u64 count = perf_ibs->get_count(*config);
+
+ /*
+ * Set width to 64 since we do not overflow on max width but
+ * instead on max count. In perf_ibs_set_period() we clear
+ * prev count manually on overflow.
+ */
+ while (!perf_event_try_update(event, count, 64)) {
+ rdmsrl(event->hw.config_base, *config);
+ count = perf_ibs->get_count(*config);
+ }
+}
+
+static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
+ struct hw_perf_event *hwc, u64 config)
+{
+ wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask);
+}
+
+/*
+ * Erratum #420 Instruction-Based Sampling Engine May Generate
+ * Interrupt that Cannot Be Cleared:
+ *
+ * Must clear counter mask first, then clear the enable bit. See
+ * Revision Guide for AMD Family 10h Processors, Publication #41322.
+ */
+static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
+ struct hw_perf_event *hwc, u64 config)
+{
+ config &= ~perf_ibs->cnt_mask;
+ wrmsrl(hwc->config_base, config);
+ config &= ~perf_ibs->enable_mask;
+ wrmsrl(hwc->config_base, config);
+}
+
+/*
+ * We cannot restore the ibs pmu state, so we always needs to update
+ * the event while stopping it and then reset the state when starting
+ * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in
+ * perf_ibs_start()/perf_ibs_stop() and instead always do it.
+ */
+static void perf_ibs_start(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+ u64 period;
+
+ if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+ return;
+
+ WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+ hwc->state = 0;
+
+ perf_ibs_set_period(perf_ibs, hwc, &period);
+ set_bit(IBS_STARTED, pcpu->state);
+ perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
+
+ perf_event_update_userpage(event);
+}
+
+static void perf_ibs_stop(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+ u64 config;
+ int stopping;
+
+ stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
+
+ if (!stopping && (hwc->state & PERF_HES_UPTODATE))
+ return;
+
+ rdmsrl(hwc->config_base, config);
+
+ if (stopping) {
+ set_bit(IBS_STOPPING, pcpu->state);
+ perf_ibs_disable_event(perf_ibs, hwc, config);
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+ }
+
+ if (hwc->state & PERF_HES_UPTODATE)
+ return;
+
+ /*
+ * Clear valid bit to not count rollovers on update, rollovers
+ * are only updated in the irq handler.
+ */
+ config &= ~perf_ibs->valid_mask;
+
+ perf_ibs_event_update(perf_ibs, event, &config);
+ hwc->state |= PERF_HES_UPTODATE;
+}
+
static int perf_ibs_add(struct perf_event *event, int flags)
{
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+ if (test_and_set_bit(IBS_ENABLED, pcpu->state))
+ return -ENOSPC;
+
+ event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ pcpu->event = event;
+
+ if (flags & PERF_EF_START)
+ perf_ibs_start(event, PERF_EF_RELOAD);
+
return 0;
}
static void perf_ibs_del(struct perf_event *event, int flags)
{
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+ if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
+ return;
+
+ perf_ibs_stop(event, PERF_EF_UPDATE);
+
+ pcpu->event = NULL;
+
+ perf_event_update_userpage(event);
}
-static struct pmu perf_ibs = {
- .event_init= perf_ibs_init,
- .add= perf_ibs_add,
- .del= perf_ibs_del,
+static void perf_ibs_read(struct perf_event *event) { }
+
+static struct perf_ibs perf_ibs_fetch = {
+ .pmu = {
+ .task_ctx_nr = perf_invalid_context,
+
+ .event_init = perf_ibs_init,
+ .add = perf_ibs_add,
+ .del = perf_ibs_del,
+ .start = perf_ibs_start,
+ .stop = perf_ibs_stop,
+ .read = perf_ibs_read,
+ },
+ .msr = MSR_AMD64_IBSFETCHCTL,
+ .config_mask = IBS_FETCH_CONFIG_MASK,
+ .cnt_mask = IBS_FETCH_MAX_CNT,
+ .enable_mask = IBS_FETCH_ENABLE,
+ .valid_mask = IBS_FETCH_VAL,
+ .max_period = IBS_FETCH_MAX_CNT << 4,
+ .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK },
+ .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT,
+
+ .get_count = get_ibs_fetch_count,
};
+static struct perf_ibs perf_ibs_op = {
+ .pmu = {
+ .task_ctx_nr = perf_invalid_context,
+
+ .event_init = perf_ibs_init,
+ .add = perf_ibs_add,
+ .del = perf_ibs_del,
+ .start = perf_ibs_start,
+ .stop = perf_ibs_stop,
+ .read = perf_ibs_read,
+ },
+ .msr = MSR_AMD64_IBSOPCTL,
+ .config_mask = IBS_OP_CONFIG_MASK,
+ .cnt_mask = IBS_OP_MAX_CNT,
+ .enable_mask = IBS_OP_ENABLE,
+ .valid_mask = IBS_OP_VAL,
+ .max_period = IBS_OP_MAX_CNT << 4,
+ .offset_mask = { MSR_AMD64_IBSOP_REG_MASK },
+ .offset_max = MSR_AMD64_IBSOP_REG_COUNT,
+
+ .get_count = get_ibs_op_count,
+};
+
+static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
+{
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+ struct perf_event *event = pcpu->event;
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_sample_data data;
+ struct perf_raw_record raw;
+ struct pt_regs regs;
+ struct perf_ibs_data ibs_data;
+ int offset, size, check_rip, offset_max, throttle = 0;
+ unsigned int msr;
+ u64 *buf, *config, period;
+
+ if (!test_bit(IBS_STARTED, pcpu->state)) {
+ /*
+ * Catch spurious interrupts after stopping IBS: After
+ * disabling IBS there could be still incomming NMIs
+ * with samples that even have the valid bit cleared.
+ * Mark all this NMIs as handled.
+ */
+ return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
+ }
+
+ msr = hwc->config_base;
+ buf = ibs_data.regs;
+ rdmsrl(msr, *buf);
+ if (!(*buf++ & perf_ibs->valid_mask))
+ return 0;
+
+ config = &ibs_data.regs[0];
+ perf_ibs_event_update(perf_ibs, event, config);
+ perf_sample_data_init(&data, 0, hwc->last_period);
+ if (!perf_ibs_set_period(perf_ibs, hwc, &period))
+ goto out; /* no sw counter overflow */
+
+ ibs_data.caps = ibs_caps;
+ size = 1;
+ offset = 1;
+ check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
+ if (event->attr.sample_type & PERF_SAMPLE_RAW)
+ offset_max = perf_ibs->offset_max;
+ else if (check_rip)
+ offset_max = 2;
+ else
+ offset_max = 1;
+ do {
+ rdmsrl(msr + offset, *buf++);
+ size++;
+ offset = find_next_bit(perf_ibs->offset_mask,
+ perf_ibs->offset_max,
+ offset + 1);
+ } while (offset < offset_max);
+ ibs_data.size = sizeof(u64) * size;
+
+ regs = *iregs;
+ if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
+ regs.flags &= ~PERF_EFLAGS_EXACT;
+ } else {
+ instruction_pointer_set(&regs, ibs_data.regs[1]);
+ regs.flags |= PERF_EFLAGS_EXACT;
+ }
+
+ if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+ raw.size = sizeof(u32) + ibs_data.size;
+ raw.data = ibs_data.data;
+ data.raw = &raw;
+ }
+
+ throttle = perf_event_overflow(event, &data, &regs);
+out:
+ if (throttle)
+ perf_ibs_disable_event(perf_ibs, hwc, *config);
+ else
+ perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
+
+ perf_event_update_userpage(event);
+
+ return 1;
+}
+
+static int __kprobes
+perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
+{
+ int handled = 0;
+
+ handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
+ handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
+
+ if (handled)
+ inc_irq_stat(apic_perf_irqs);
+
+ return handled;
+}
+
+static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
+{
+ struct cpu_perf_ibs __percpu *pcpu;
+ int ret;
+
+ pcpu = alloc_percpu(struct cpu_perf_ibs);
+ if (!pcpu)
+ return -ENOMEM;
+
+ perf_ibs->pcpu = pcpu;
+
+ ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
+ if (ret) {
+ perf_ibs->pcpu = NULL;
+ free_percpu(pcpu);
+ }
+
+ return ret;
+}
+
static __init int perf_event_ibs_init(void)
{
if (!ibs_caps)
return -ENODEV; /* ibs not supported by the cpu */
- perf_pmu_register(&perf_ibs, "ibs", -1);
+ perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
+ if (ibs_caps & IBS_CAPS_OPCNT)
+ perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
+ perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
+ register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
return 0;
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 61d4f79..166546e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -728,6 +728,19 @@ static __initconst const u64 atom_hw_cache_event_ids
},
};
+static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
+{
+ /* user explicitly requested branch sampling */
+ if (has_branch_stack(event))
+ return true;
+
+ /* implicit branch sampling to correct PEBS skid */
+ if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
+ return true;
+
+ return false;
+}
+
static void intel_pmu_disable_all(void)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -882,6 +895,13 @@ static void intel_pmu_disable_event(struct perf_event *event)
cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
+ /*
+ * must disable before any actual event
+ * because any event may be combined with LBR
+ */
+ if (intel_pmu_needs_lbr_smpl(event))
+ intel_pmu_lbr_disable(event);
+
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
intel_pmu_disable_fixed(hwc);
return;
@@ -936,6 +956,12 @@ static void intel_pmu_enable_event(struct perf_event *event)
intel_pmu_enable_bts(hwc->config);
return;
}
+ /*
+ * must enabled before any actual event
+ * because any event may be combined with LBR
+ */
+ if (intel_pmu_needs_lbr_smpl(event))
+ intel_pmu_lbr_enable(event);
if (event->attr.exclude_host)
cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
@@ -1001,8 +1027,6 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
u64 status;
int handled;
- perf_sample_data_init(&data, 0);
-
cpuc = &__get_cpu_var(cpu_hw_events);
/*
@@ -1056,7 +1080,10 @@ again:
if (!intel_pmu_save_and_restart(event))
continue;
- data.period = event->hw.last_period;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+
+ if (has_branch_stack(event))
+ data.br_stack = &cpuc->lbr_stack;
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
@@ -1124,17 +1151,17 @@ static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
*/
static struct event_constraint *
__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
- struct perf_event *event)
+ struct perf_event *event,
+ struct hw_perf_event_extra *reg)
{
struct event_constraint *c = &emptyconstraint;
- struct hw_perf_event_extra *reg = &event->hw.extra_reg;
struct er_account *era;
unsigned long flags;
int orig_idx = reg->idx;
/* already allocated shared msr */
if (reg->alloc)
- return &unconstrained;
+ return NULL; /* call x86_get_event_constraint() */
again:
era = &cpuc->shared_regs->regs[reg->idx];
@@ -1157,14 +1184,10 @@ again:
reg->alloc = 1;
/*
- * All events using extra_reg are unconstrained.
- * Avoids calling x86_get_event_constraints()
- *
- * Must revisit if extra_reg controlling events
- * ever have constraints. Worst case we go through
- * the regular event constraint table.
+ * need to call x86_get_event_constraint()
+ * to check if associated event has constraints
*/
- c = &unconstrained;
+ c = NULL;
} else if (intel_try_alt_er(event, orig_idx)) {
raw_spin_unlock_irqrestore(&era->lock, flags);
goto again;
@@ -1201,11 +1224,23 @@ static struct event_constraint *
intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
struct perf_event *event)
{
- struct event_constraint *c = NULL;
-
- if (event->hw.extra_reg.idx != EXTRA_REG_NONE)
- c = __intel_shared_reg_get_constraints(cpuc, event);
-
+ struct event_constraint *c = NULL, *d;
+ struct hw_perf_event_extra *xreg, *breg;
+
+ xreg = &event->hw.extra_reg;
+ if (xreg->idx != EXTRA_REG_NONE) {
+ c = __intel_shared_reg_get_constraints(cpuc, event, xreg);
+ if (c == &emptyconstraint)
+ return c;
+ }
+ breg = &event->hw.branch_reg;
+ if (breg->idx != EXTRA_REG_NONE) {
+ d = __intel_shared_reg_get_constraints(cpuc, event, breg);
+ if (d == &emptyconstraint) {
+ __intel_shared_reg_put_constraints(cpuc, xreg);
+ c = d;
+ }
+ }
return c;
}
@@ -1253,6 +1288,10 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
reg = &event->hw.extra_reg;
if (reg->idx != EXTRA_REG_NONE)
__intel_shared_reg_put_constraints(cpuc, reg);
+
+ reg = &event->hw.branch_reg;
+ if (reg->idx != EXTRA_REG_NONE)
+ __intel_shared_reg_put_constraints(cpuc, reg);
}
static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
@@ -1288,12 +1327,19 @@ static int intel_pmu_hw_config(struct perf_event *event)
*
* Thereby we gain a PEBS capable cycle counter.
*/
- u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */
+ u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
+
alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
event->hw.config = alt_config;
}
+ if (intel_pmu_needs_lbr_smpl(event)) {
+ ret = intel_pmu_setup_lbr_filter(event);
+ if (ret)
+ return ret;
+ }
+
if (event->attr.type != PERF_TYPE_RAW)
return 0;
@@ -1383,6 +1429,24 @@ static void core_pmu_enable_all(int added)
}
}
+PMU_FORMAT_ATTR(event, "config:0-7" );
+PMU_FORMAT_ATTR(umask, "config:8-15" );
+PMU_FORMAT_ATTR(edge, "config:18" );
+PMU_FORMAT_ATTR(pc, "config:19" );
+PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */
+PMU_FORMAT_ATTR(inv, "config:23" );
+PMU_FORMAT_ATTR(cmask, "config:24-31" );
+
+static struct attribute *intel_arch_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_pc.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask.attr,
+ NULL,
+};
+
static __initconst const struct x86_pmu core_pmu = {
.name = "core",
.handle_irq = x86_pmu_handle_irq,
@@ -1407,6 +1471,7 @@ static __initconst const struct x86_pmu core_pmu = {
.put_event_constraints = intel_put_event_constraints,
.event_constraints = intel_core_event_constraints,
.guest_get_msrs = core_guest_get_msrs,
+ .format_attrs = intel_arch_formats_attr,
};
struct intel_shared_regs *allocate_shared_regs(int cpu)
@@ -1432,7 +1497,7 @@ static int intel_pmu_cpu_prepare(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- if (!x86_pmu.extra_regs)
+ if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map))
return NOTIFY_OK;
cpuc->shared_regs = allocate_shared_regs(cpu);
@@ -1454,22 +1519,28 @@ static void intel_pmu_cpu_starting(int cpu)
*/
intel_pmu_lbr_reset();
- if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING))
+ cpuc->lbr_sel = NULL;
+
+ if (!cpuc->shared_regs)
return;
- for_each_cpu(i, topology_thread_cpumask(cpu)) {
- struct intel_shared_regs *pc;
+ if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) {
+ for_each_cpu(i, topology_thread_cpumask(cpu)) {
+ struct intel_shared_regs *pc;
- pc = per_cpu(cpu_hw_events, i).shared_regs;
- if (pc && pc->core_id == core_id) {
- cpuc->kfree_on_online = cpuc->shared_regs;
- cpuc->shared_regs = pc;
- break;
+ pc = per_cpu(cpu_hw_events, i).shared_regs;
+ if (pc && pc->core_id == core_id) {
+ cpuc->kfree_on_online = cpuc->shared_regs;
+ cpuc->shared_regs = pc;
+ break;
+ }
}
+ cpuc->shared_regs->core_id = core_id;
+ cpuc->shared_regs->refcnt++;
}
- cpuc->shared_regs->core_id = core_id;
- cpuc->shared_regs->refcnt++;
+ if (x86_pmu.lbr_sel_map)
+ cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
}
static void intel_pmu_cpu_dying(int cpu)
@@ -1487,6 +1558,33 @@ static void intel_pmu_cpu_dying(int cpu)
fini_debug_store_on_cpu(cpu);
}
+static void intel_pmu_flush_branch_stack(void)
+{
+ /*
+ * Intel LBR does not tag entries with the
+ * PID of the current task, then we need to
+ * flush it on ctxsw
+ * For now, we simply reset it
+ */
+ if (x86_pmu.lbr_nr)
+ intel_pmu_lbr_reset();
+}
+
+PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
+
+static struct attribute *intel_arch3_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_pc.attr,
+ &format_attr_any.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask.attr,
+
+ &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */
+ NULL,
+};
+
static __initconst const struct x86_pmu intel_pmu = {
.name = "Intel",
.handle_irq = intel_pmu_handle_irq,
@@ -1510,10 +1608,13 @@ static __initconst const struct x86_pmu intel_pmu = {
.get_event_constraints = intel_get_event_constraints,
.put_event_constraints = intel_put_event_constraints,
+ .format_attrs = intel_arch3_formats_attr,
+
.cpu_prepare = intel_pmu_cpu_prepare,
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
.guest_get_msrs = intel_guest_get_msrs,
+ .flush_branch_stack = intel_pmu_flush_branch_stack,
};
static __init void intel_clovertown_quirk(void)
@@ -1690,9 +1791,11 @@ __init int intel_pmu_init(void)
x86_pmu.extra_regs = intel_nehalem_extra_regs;
/* UOPS_ISSUED.STALLED_CYCLES */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+ X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+ X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
x86_add_quirk(intel_nehalem_quirk);
@@ -1727,9 +1830,11 @@ __init int intel_pmu_init(void)
x86_pmu.er_flags |= ERF_HAS_RSP_1;
/* UOPS_ISSUED.STALLED_CYCLES */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+ X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+ X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
pr_cont("Westmere events, ");
break;
@@ -1740,7 +1845,7 @@ __init int intel_pmu_init(void)
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
- intel_pmu_lbr_init_nhm();
+ intel_pmu_lbr_init_snb();
x86_pmu.event_constraints = intel_snb_event_constraints;
x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
@@ -1750,9 +1855,11 @@ __init int intel_pmu_init(void)
x86_pmu.er_flags |= ERF_NO_HT_SHARING;
/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+ X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
/* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1;
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+ X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
pr_cont("SandyBridge events, ");
break;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index d6bd49f..5a3edc2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -3,6 +3,7 @@
#include <linux/slab.h>
#include <asm/perf_event.h>
+#include <asm/insn.h>
#include "perf_event.h"
@@ -315,8 +316,7 @@ int intel_pmu_drain_bts_buffer(void)
ds->bts_index = ds->bts_buffer_base;
- perf_sample_data_init(&data, 0);
- data.period = event->hw.last_period;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
regs.ip = 0;
/*
@@ -439,9 +439,6 @@ void intel_pmu_pebs_enable(struct perf_event *event)
hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
cpuc->pebs_enabled |= 1ULL << hwc->idx;
-
- if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
- intel_pmu_lbr_enable(event);
}
void intel_pmu_pebs_disable(struct perf_event *event)
@@ -454,9 +451,6 @@ void intel_pmu_pebs_disable(struct perf_event *event)
wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
-
- if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
- intel_pmu_lbr_disable(event);
}
void intel_pmu_pebs_enable_all(void)
@@ -475,17 +469,6 @@ void intel_pmu_pebs_disable_all(void)
wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
}
-#include <asm/insn.h>
-
-static inline bool kernel_ip(unsigned long ip)
-{
-#ifdef CONFIG_X86_32
- return ip > PAGE_OFFSET;
-#else
- return (long)ip < 0;
-#endif
-}
-
static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -572,6 +555,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
* both formats and we don't use the other fields in this
* routine.
*/
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct pebs_record_core *pebs = __pebs;
struct perf_sample_data data;
struct pt_regs regs;
@@ -579,8 +563,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
if (!intel_pmu_save_and_restart(event))
return;
- perf_sample_data_init(&data, 0);
- data.period = event->hw.last_period;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
/*
* We use the interrupt regs as a base because the PEBS record
@@ -602,6 +585,9 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
else
regs.flags &= ~PERF_EFLAGS_EXACT;
+ if (has_branch_stack(event))
+ data.br_stack = &cpuc->lbr_stack;
+
if (perf_event_overflow(event, &data, &regs))
x86_pmu_stop(event, 0);
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 47a7e63..520b426 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -3,6 +3,7 @@
#include <asm/perf_event.h>
#include <asm/msr.h>
+#include <asm/insn.h>
#include "perf_event.h"
@@ -14,6 +15,100 @@ enum {
};
/*
+ * Intel LBR_SELECT bits
+ * Intel Vol3a, April 2011, Section 16.7 Table 16-10
+ *
+ * Hardware branch filter (not available on all CPUs)
+ */
+#define LBR_KERNEL_BIT 0 /* do not capture at ring0 */
+#define LBR_USER_BIT 1 /* do not capture at ring > 0 */
+#define LBR_JCC_BIT 2 /* do not capture conditional branches */
+#define LBR_REL_CALL_BIT 3 /* do not capture relative calls */
+#define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */
+#define LBR_RETURN_BIT 5 /* do not capture near returns */
+#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */
+#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */
+#define LBR_FAR_BIT 8 /* do not capture far branches */
+
+#define LBR_KERNEL (1 << LBR_KERNEL_BIT)
+#define LBR_USER (1 << LBR_USER_BIT)
+#define LBR_JCC (1 << LBR_JCC_BIT)
+#define LBR_REL_CALL (1 << LBR_REL_CALL_BIT)
+#define LBR_IND_CALL (1 << LBR_IND_CALL_BIT)
+#define LBR_RETURN (1 << LBR_RETURN_BIT)
+#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT)
+#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
+#define LBR_FAR (1 << LBR_FAR_BIT)
+
+#define LBR_PLM (LBR_KERNEL | LBR_USER)
+
+#define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */
+#define LBR_NOT_SUPP -1 /* LBR filter not supported */
+#define LBR_IGN 0 /* ignored */
+
+#define LBR_ANY \
+ (LBR_JCC |\
+ LBR_REL_CALL |\
+ LBR_IND_CALL |\
+ LBR_RETURN |\
+ LBR_REL_JMP |\
+ LBR_IND_JMP |\
+ LBR_FAR)
+
+#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
+
+#define for_each_branch_sample_type(x) \
+ for ((x) = PERF_SAMPLE_BRANCH_USER; \
+ (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
+
+/*
+ * x86control flow change classification
+ * x86control flow changes include branches, interrupts, traps, faults
+ */
+enum {
+ X86_BR_NONE = 0, /* unknown */
+
+ X86_BR_USER = 1 << 0, /* branch target is user */
+ X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
+
+ X86_BR_CALL = 1 << 2, /* call */
+ X86_BR_RET = 1 << 3, /* return */
+ X86_BR_SYSCALL = 1 << 4, /* syscall */
+ X86_BR_SYSRET = 1 << 5, /* syscall return */
+ X86_BR_INT = 1 << 6, /* sw interrupt */
+ X86_BR_IRET = 1 << 7, /* return from interrupt */
+ X86_BR_JCC = 1 << 8, /* conditional */
+ X86_BR_JMP = 1 << 9, /* jump */
+ X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
+ X86_BR_IND_CALL = 1 << 11,/* indirect calls */
+};
+
+#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
+
+#define X86_BR_ANY \
+ (X86_BR_CALL |\
+ X86_BR_RET |\
+ X86_BR_SYSCALL |\
+ X86_BR_SYSRET |\
+ X86_BR_INT |\
+ X86_BR_IRET |\
+ X86_BR_JCC |\
+ X86_BR_JMP |\
+ X86_BR_IRQ |\
+ X86_BR_IND_CALL)
+
+#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
+
+#define X86_BR_ANY_CALL \
+ (X86_BR_CALL |\
+ X86_BR_IND_CALL |\
+ X86_BR_SYSCALL |\
+ X86_BR_IRQ |\
+ X86_BR_INT)
+
+static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
+
+/*
* We only support LBR implementations that have FREEZE_LBRS_ON_PMI
* otherwise it becomes near impossible to get a reliable stack.
*/
@@ -21,6 +116,10 @@ enum {
static void __intel_pmu_lbr_enable(void)
{
u64 debugctl;
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ if (cpuc->lbr_sel)
+ wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
@@ -76,11 +175,11 @@ void intel_pmu_lbr_enable(struct perf_event *event)
* Reset the LBR stack if we changed task context to
* avoid data leaks.
*/
-
if (event->ctx->task && cpuc->lbr_context != event->ctx) {
intel_pmu_lbr_reset();
cpuc->lbr_context = event->ctx;
}
+ cpuc->br_sel = event->hw.branch_reg.reg;
cpuc->lbr_users++;
}
@@ -95,8 +194,11 @@ void intel_pmu_lbr_disable(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
- if (cpuc->enabled && !cpuc->lbr_users)
+ if (cpuc->enabled && !cpuc->lbr_users) {
__intel_pmu_lbr_disable();
+ /* avoid stale pointer */
+ cpuc->lbr_context = NULL;
+ }
}
void intel_pmu_lbr_enable_all(void)
@@ -115,6 +217,9 @@ void intel_pmu_lbr_disable_all(void)
__intel_pmu_lbr_disable();
}
+/*
+ * TOS = most recently recorded branch
+ */
static inline u64 intel_pmu_lbr_tos(void)
{
u64 tos;
@@ -142,15 +247,15 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
- cpuc->lbr_entries[i].from = msr_lastbranch.from;
- cpuc->lbr_entries[i].to = msr_lastbranch.to;
- cpuc->lbr_entries[i].flags = 0;
+ cpuc->lbr_entries[i].from = msr_lastbranch.from;
+ cpuc->lbr_entries[i].to = msr_lastbranch.to;
+ cpuc->lbr_entries[i].mispred = 0;
+ cpuc->lbr_entries[i].predicted = 0;
+ cpuc->lbr_entries[i].reserved = 0;
}
cpuc->lbr_stack.nr = i;
}
-#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
-
/*
* Due to lack of segmentation in Linux the effective address (offset)
* is the same as the linear address, allowing us to merge the LIP and EIP
@@ -165,19 +270,22 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
for (i = 0; i < x86_pmu.lbr_nr; i++) {
unsigned long lbr_idx = (tos - i) & mask;
- u64 from, to, flags = 0;
+ u64 from, to, mis = 0, pred = 0;
rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
- flags = !!(from & LBR_FROM_FLAG_MISPRED);
+ mis = !!(from & LBR_FROM_FLAG_MISPRED);
+ pred = !mis;
from = (u64)((((s64)from) << 1) >> 1);
}
- cpuc->lbr_entries[i].from = from;
- cpuc->lbr_entries[i].to = to;
- cpuc->lbr_entries[i].flags = flags;
+ cpuc->lbr_entries[i].from = from;
+ cpuc->lbr_entries[i].to = to;
+ cpuc->lbr_entries[i].mispred = mis;
+ cpuc->lbr_entries[i].predicted = pred;
+ cpuc->lbr_entries[i].reserved = 0;
}
cpuc->lbr_stack.nr = i;
}
@@ -193,28 +301,404 @@ void intel_pmu_lbr_read(void)
intel_pmu_lbr_read_32(cpuc);
else
intel_pmu_lbr_read_64(cpuc);
+
+ intel_pmu_lbr_filter(cpuc);
+}
+
+/*
+ * SW filter is used:
+ * - in case there is no HW filter
+ * - in case the HW filter has errata or limitations
+ */
+static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
+{
+ u64 br_type = event->attr.branch_sample_type;
+ int mask = 0;
+
+ if (br_type & PERF_SAMPLE_BRANCH_USER)
+ mask |= X86_BR_USER;
+
+ if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
+ mask |= X86_BR_KERNEL;
+
+ /* we ignore BRANCH_HV here */
+
+ if (br_type & PERF_SAMPLE_BRANCH_ANY)
+ mask |= X86_BR_ANY;
+
+ if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
+ mask |= X86_BR_ANY_CALL;
+
+ if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
+ mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
+
+ if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
+ mask |= X86_BR_IND_CALL;
+ /*
+ * stash actual user request into reg, it may
+ * be used by fixup code for some CPU
+ */
+ event->hw.branch_reg.reg = mask;
+}
+
+/*
+ * setup the HW LBR filter
+ * Used only when available, may not be enough to disambiguate
+ * all branches, may need the help of the SW filter
+ */
+static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg;
+ u64 br_type = event->attr.branch_sample_type;
+ u64 mask = 0, m;
+ u64 v;
+
+ for_each_branch_sample_type(m) {
+ if (!(br_type & m))
+ continue;
+
+ v = x86_pmu.lbr_sel_map[m];
+ if (v == LBR_NOT_SUPP)
+ return -EOPNOTSUPP;
+
+ if (v != LBR_IGN)
+ mask |= v;
+ }
+ reg = &event->hw.branch_reg;
+ reg->idx = EXTRA_REG_LBR;
+
+ /* LBR_SELECT operates in suppress mode so invert mask */
+ reg->config = ~mask & x86_pmu.lbr_sel_mask;
+
+ return 0;
+}
+
+int intel_pmu_setup_lbr_filter(struct perf_event *event)
+{
+ int ret = 0;
+
+ /*
+ * no LBR on this PMU
+ */
+ if (!x86_pmu.lbr_nr)
+ return -EOPNOTSUPP;
+
+ /*
+ * setup SW LBR filter
+ */
+ intel_pmu_setup_sw_lbr_filter(event);
+
+ /*
+ * setup HW LBR filter, if any
+ */
+ if (x86_pmu.lbr_sel_map)
+ ret = intel_pmu_setup_hw_lbr_filter(event);
+
+ return ret;
}
+/*
+ * return the type of control flow change at address "from"
+ * intruction is not necessarily a branch (in case of interrupt).
+ *
+ * The branch type returned also includes the priv level of the
+ * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
+ *
+ * If a branch type is unknown OR the instruction cannot be
+ * decoded (e.g., text page not present), then X86_BR_NONE is
+ * returned.
+ */
+static int branch_type(unsigned long from, unsigned long to)
+{
+ struct insn insn;
+ void *addr;
+ int bytes, size = MAX_INSN_SIZE;
+ int ret = X86_BR_NONE;
+ int ext, to_plm, from_plm;
+ u8 buf[MAX_INSN_SIZE];
+ int is64 = 0;
+
+ to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
+ from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
+
+ /*
+ * maybe zero if lbr did not fill up after a reset by the time
+ * we get a PMU interrupt
+ */
+ if (from == 0 || to == 0)
+ return X86_BR_NONE;
+
+ if (from_plm == X86_BR_USER) {
+ /*
+ * can happen if measuring at the user level only
+ * and we interrupt in a kernel thread, e.g., idle.
+ */
+ if (!current->mm)
+ return X86_BR_NONE;
+
+ /* may fail if text not present */
+ bytes = copy_from_user_nmi(buf, (void __user *)from, size);
+ if (bytes != size)
+ return X86_BR_NONE;
+
+ addr = buf;
+ } else
+ addr = (void *)from;
+
+ /*
+ * decoder needs to know the ABI especially
+ * on 64-bit systems running 32-bit apps
+ */
+#ifdef CONFIG_X86_64
+ is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
+#endif
+ insn_init(&insn, addr, is64);
+ insn_get_opcode(&insn);
+
+ switch (insn.opcode.bytes[0]) {
+ case 0xf:
+ switch (insn.opcode.bytes[1]) {
+ case 0x05: /* syscall */
+ case 0x34: /* sysenter */
+ ret = X86_BR_SYSCALL;
+ break;
+ case 0x07: /* sysret */
+ case 0x35: /* sysexit */
+ ret = X86_BR_SYSRET;
+ break;
+ case 0x80 ... 0x8f: /* conditional */
+ ret = X86_BR_JCC;
+ break;
+ default:
+ ret = X86_BR_NONE;
+ }
+ break;
+ case 0x70 ... 0x7f: /* conditional */
+ ret = X86_BR_JCC;
+ break;
+ case 0xc2: /* near ret */
+ case 0xc3: /* near ret */
+ case 0xca: /* far ret */
+ case 0xcb: /* far ret */
+ ret = X86_BR_RET;
+ break;
+ case 0xcf: /* iret */
+ ret = X86_BR_IRET;
+ break;
+ case 0xcc ... 0xce: /* int */
+ ret = X86_BR_INT;
+ break;
+ case 0xe8: /* call near rel */
+ case 0x9a: /* call far absolute */
+ ret = X86_BR_CALL;
+ break;
+ case 0xe0 ... 0xe3: /* loop jmp */
+ ret = X86_BR_JCC;
+ break;
+ case 0xe9 ... 0xeb: /* jmp */
+ ret = X86_BR_JMP;
+ break;
+ case 0xff: /* call near absolute, call far absolute ind */
+ insn_get_modrm(&insn);
+ ext = (insn.modrm.bytes[0] >> 3) & 0x7;
+ switch (ext) {
+ case 2: /* near ind call */
+ case 3: /* far ind call */
+ ret = X86_BR_IND_CALL;
+ break;
+ case 4:
+ case 5:
+ ret = X86_BR_JMP;
+ break;
+ }
+ break;
+ default:
+ ret = X86_BR_NONE;
+ }
+ /*
+ * interrupts, traps, faults (and thus ring transition) may
+ * occur on any instructions. Thus, to classify them correctly,
+ * we need to first look at the from and to priv levels. If they
+ * are different and to is in the kernel, then it indicates
+ * a ring transition. If the from instruction is not a ring
+ * transition instr (syscall, systenter, int), then it means
+ * it was a irq, trap or fault.
+ *
+ * we have no way of detecting kernel to kernel faults.
+ */
+ if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
+ && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
+ ret = X86_BR_IRQ;
+
+ /*
+ * branch priv level determined by target as
+ * is done by HW when LBR_SELECT is implemented
+ */
+ if (ret != X86_BR_NONE)
+ ret |= to_plm;
+
+ return ret;
+}
+
+/*
+ * implement actual branch filter based on user demand.
+ * Hardware may not exactly satisfy that request, thus
+ * we need to inspect opcodes. Mismatched branches are
+ * discarded. Therefore, the number of branches returned
+ * in PERF_SAMPLE_BRANCH_STACK sample may vary.
+ */
+static void
+intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
+{
+ u64 from, to;
+ int br_sel = cpuc->br_sel;
+ int i, j, type;
+ bool compress = false;
+
+ /* if sampling all branches, then nothing to filter */
+ if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
+ return;
+
+ for (i = 0; i < cpuc->lbr_stack.nr; i++) {
+
+ from = cpuc->lbr_entries[i].from;
+ to = cpuc->lbr_entries[i].to;
+
+ type = branch_type(from, to);
+
+ /* if type does not correspond, then discard */
+ if (type == X86_BR_NONE || (br_sel & type) != type) {
+ cpuc->lbr_entries[i].from = 0;
+ compress = true;
+ }
+ }
+
+ if (!compress)
+ return;
+
+ /* remove all entries with from=0 */
+ for (i = 0; i < cpuc->lbr_stack.nr; ) {
+ if (!cpuc->lbr_entries[i].from) {
+ j = i;
+ while (++j < cpuc->lbr_stack.nr)
+ cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
+ cpuc->lbr_stack.nr--;
+ if (!cpuc->lbr_entries[i].from)
+ continue;
+ }
+ i++;
+ }
+}
+
+/*
+ * Map interface branch filters onto LBR filters
+ */
+static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
+ [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP
+ | LBR_IND_JMP | LBR_FAR,
+ /*
+ * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
+ */
+ [PERF_SAMPLE_BRANCH_ANY_CALL] =
+ LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
+ /*
+ * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
+ */
+ [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
+};
+
+static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
+ [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL,
+};
+
+/* core */
void intel_pmu_lbr_init_core(void)
{
x86_pmu.lbr_nr = 4;
- x86_pmu.lbr_tos = 0x01c9;
- x86_pmu.lbr_from = 0x40;
- x86_pmu.lbr_to = 0x60;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
+ x86_pmu.lbr_to = MSR_LBR_CORE_TO;
+
+ /*
+ * SW branch filter usage:
+ * - compensate for lack of HW filter
+ */
+ pr_cont("4-deep LBR, ");
}
+/* nehalem/westmere */
void intel_pmu_lbr_init_nhm(void)
{
x86_pmu.lbr_nr = 16;
- x86_pmu.lbr_tos = 0x01c9;
- x86_pmu.lbr_from = 0x680;
- x86_pmu.lbr_to = 0x6c0;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = nhm_lbr_sel_map;
+
+ /*
+ * SW branch filter usage:
+ * - workaround LBR_SEL errata (see above)
+ * - support syscall, sysret capture.
+ * That requires LBR_FAR but that means far
+ * jmp need to be filtered out
+ */
+ pr_cont("16-deep LBR, ");
+}
+
+/* sandy bridge */
+void intel_pmu_lbr_init_snb(void)
+{
+ x86_pmu.lbr_nr = 16;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = snb_lbr_sel_map;
+
+ /*
+ * SW branch filter usage:
+ * - support syscall, sysret capture.
+ * That requires LBR_FAR but that means far
+ * jmp need to be filtered out
+ */
+ pr_cont("16-deep LBR, ");
}
+/* atom */
void intel_pmu_lbr_init_atom(void)
{
+ /*
+ * only models starting at stepping 10 seems
+ * to have an operational LBR which can freeze
+ * on PMU interrupt
+ */
+ if (boot_cpu_data.x86_mask < 10) {
+ pr_cont("LBR disabled due to erratum");
+ return;
+ }
+
x86_pmu.lbr_nr = 8;
- x86_pmu.lbr_tos = 0x01c9;
- x86_pmu.lbr_from = 0x40;
- x86_pmu.lbr_to = 0x60;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
+ x86_pmu.lbr_to = MSR_LBR_CORE_TO;
+
+ /*
+ * SW branch filter usage:
+ * - compensate for lack of HW filter
+ */
+ pr_cont("8-deep LBR, ");
}
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ef484d9..47124a7 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -1005,8 +1005,6 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
int idx, handled = 0;
u64 val;
- perf_sample_data_init(&data, 0);
-
cpuc = &__get_cpu_var(cpu_hw_events);
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
@@ -1034,10 +1032,12 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
handled += overflow;
/* event overflow for sure */
- data.period = event->hw.last_period;
+ perf_sample_data_init(&data, 0, hwc->last_period);
if (!x86_perf_event_set_period(event))
continue;
+
+
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
}
@@ -1271,6 +1271,17 @@ done:
return num ? -EINVAL : 0;
}
+PMU_FORMAT_ATTR(cccr, "config:0-31" );
+PMU_FORMAT_ATTR(escr, "config:32-62");
+PMU_FORMAT_ATTR(ht, "config:63" );
+
+static struct attribute *intel_p4_formats_attr[] = {
+ &format_attr_cccr.attr,
+ &format_attr_escr.attr,
+ &format_attr_ht.attr,
+ NULL,
+};
+
static __initconst const struct x86_pmu p4_pmu = {
.name = "Netburst P4/Xeon",
.handle_irq = p4_pmu_handle_irq,
@@ -1305,6 +1316,8 @@ static __initconst const struct x86_pmu p4_pmu = {
* the former idea is taken from OProfile code
*/
.perfctr_second_write = 1,
+
+ .format_attrs = intel_p4_formats_attr,
};
__init int p4_pmu_init(void)
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index c7181be..32bcfc7 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -87,6 +87,23 @@ static void p6_pmu_enable_event(struct perf_event *event)
(void)checking_wrmsrl(hwc->config_base, val);
}
+PMU_FORMAT_ATTR(event, "config:0-7" );
+PMU_FORMAT_ATTR(umask, "config:8-15" );
+PMU_FORMAT_ATTR(edge, "config:18" );
+PMU_FORMAT_ATTR(pc, "config:19" );
+PMU_FORMAT_ATTR(inv, "config:23" );
+PMU_FORMAT_ATTR(cmask, "config:24-31" );
+
+static struct attribute *intel_p6_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_pc.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask.attr,
+ NULL,
+};
+
static __initconst const struct x86_pmu p6_pmu = {
.name = "p6",
.handle_irq = x86_pmu_handle_irq,
@@ -115,6 +132,8 @@ static __initconst const struct x86_pmu p6_pmu = {
.cntval_mask = (1ULL << 32) - 1,
.get_event_constraints = x86_get_event_constraints,
.event_constraints = p6_event_constraints,
+
+ .format_attrs = intel_p6_formats_attr,
};
__init int p6_pmu_init(void)
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index c7f64e6..addf9e8 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -40,6 +40,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{ X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
{ X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
{ X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
+ { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
{ X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
{ X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index a524353..39472dd 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -43,7 +43,6 @@
#include <asm/processor.h>
#include <asm/msr.h>
-#include <asm/system.h>
static struct class *cpuid_class;
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 642f75a..11891ca 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -62,16 +62,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
if (!userbuf) {
memcpy(buf, (vaddr + offset), csize);
- kunmap_atomic(vaddr, KM_PTE0);
+ kunmap_atomic(vaddr);
} else {
if (!kdump_buf_page) {
printk(KERN_WARNING "Kdump: Kdump buffer page not"
" allocated\n");
- kunmap_atomic(vaddr, KM_PTE0);
+ kunmap_atomic(vaddr);
return -EFAULT;
}
copy_page(kdump_buf_page, vaddr);
- kunmap_atomic(vaddr, KM_PTE0);
+ kunmap_atomic(vaddr);
if (copy_to_user(buf, (kdump_buf_page + offset), csize))
return -EFAULT;
}
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 5282179..3ae2ced 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -4,6 +4,7 @@
#include <linux/bootmem.h>
#include <linux/export.h>
#include <linux/io.h>
+#include <linux/irqdomain.h>
#include <linux/interrupt.h>
#include <linux/list.h>
#include <linux/of.h>
@@ -17,64 +18,14 @@
#include <linux/initrd.h>
#include <asm/hpet.h>
-#include <asm/irq_controller.h>
#include <asm/apic.h>
#include <asm/pci_x86.h>
__initdata u64 initial_dtb;
char __initdata cmd_line[COMMAND_LINE_SIZE];
-static LIST_HEAD(irq_domains);
-static DEFINE_RAW_SPINLOCK(big_irq_lock);
int __initdata of_ioapic;
-#ifdef CONFIG_X86_IO_APIC
-static void add_interrupt_host(struct irq_domain *ih)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&big_irq_lock, flags);
- list_add(&ih->l, &irq_domains);
- raw_spin_unlock_irqrestore(&big_irq_lock, flags);
-}
-#endif
-
-static struct irq_domain *get_ih_from_node(struct device_node *controller)
-{
- struct irq_domain *ih, *found = NULL;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&big_irq_lock, flags);
- list_for_each_entry(ih, &irq_domains, l) {
- if (ih->controller == controller) {
- found = ih;
- break;
- }
- }
- raw_spin_unlock_irqrestore(&big_irq_lock, flags);
- return found;
-}
-
-unsigned int irq_create_of_mapping(struct device_node *controller,
- const u32 *intspec, unsigned int intsize)
-{
- struct irq_domain *ih;
- u32 virq, type;
- int ret;
-
- ih = get_ih_from_node(controller);
- if (!ih)
- return 0;
- ret = ih->xlate(ih, intspec, intsize, &virq, &type);
- if (ret)
- return 0;
- if (type == IRQ_TYPE_NONE)
- return virq;
- irq_set_irq_type(virq, type);
- return virq;
-}
-EXPORT_SYMBOL_GPL(irq_create_of_mapping);
-
unsigned long pci_address_to_pio(phys_addr_t address)
{
/*
@@ -354,36 +305,43 @@ static struct of_ioapic_type of_ioapic_type[] =
},
};
-static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
- u32 *out_hwirq, u32 *out_type)
+static int ioapic_xlate(struct irq_domain *domain,
+ struct device_node *controller,
+ const u32 *intspec, u32 intsize,
+ irq_hw_number_t *out_hwirq, u32 *out_type)
{
- struct mp_ioapic_gsi *gsi_cfg;
struct io_apic_irq_attr attr;
struct of_ioapic_type *it;
- u32 line, idx, type;
+ u32 line, idx;
+ int rc;
- if (intsize < 2)
+ if (WARN_ON(intsize < 2))
return -EINVAL;
- line = *intspec;
- idx = (u32) id->priv;
- gsi_cfg = mp_ioapic_gsi_routing(idx);
- *out_hwirq = line + gsi_cfg->gsi_base;
-
- intspec++;
- type = *intspec;
+ line = intspec[0];
- if (type >= ARRAY_SIZE(of_ioapic_type))
+ if (intspec[1] >= ARRAY_SIZE(of_ioapic_type))
return -EINVAL;
- it = of_ioapic_type + type;
- *out_type = it->out_type;
+ it = &of_ioapic_type[intspec[1]];
+ idx = (u32) domain->host_data;
set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
- return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr);
+ rc = io_apic_setup_irq_pin_once(irq_find_mapping(domain, line),
+ cpu_to_node(0), &attr);
+ if (rc)
+ return rc;
+
+ *out_hwirq = line;
+ *out_type = it->out_type;
+ return 0;
}
+const struct irq_domain_ops ioapic_irq_domain_ops = {
+ .xlate = ioapic_xlate,
+};
+
static void __init ioapic_add_ofnode(struct device_node *np)
{
struct resource r;
@@ -399,13 +357,14 @@ static void __init ioapic_add_ofnode(struct device_node *np)
for (i = 0; i < nr_ioapics; i++) {
if (r.start == mpc_ioapic_addr(i)) {
struct irq_domain *id;
+ struct mp_ioapic_gsi *gsi_cfg;
+
+ gsi_cfg = mp_ioapic_gsi_routing(i);
- id = kzalloc(sizeof(*id), GFP_KERNEL);
+ id = irq_domain_add_legacy(np, 32, gsi_cfg->gsi_base, 0,
+ &ioapic_irq_domain_ops,
+ (void*)i);
BUG_ON(!id);
- id->controller = np;
- id->xlate = ioapic_xlate;
- id->priv = (void *)i;
- add_interrupt_host(id);
return;
}
}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 4025fe4..571246d 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -37,13 +37,16 @@ print_ftrace_graph_addr(unsigned long addr, void *data,
const struct stacktrace_ops *ops,
struct thread_info *tinfo, int *graph)
{
- struct task_struct *task = tinfo->task;
+ struct task_struct *task;
unsigned long ret_addr;
- int index = task->curr_ret_stack;
+ int index;
if (addr != (unsigned long)return_to_handler)
return;
+ task = tinfo->task;
+ index = task->curr_ret_stack;
+
if (!task->ret_stack || index < *graph)
return;
@@ -265,10 +268,10 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
#endif
printk("\n");
if (notify_die(DIE_OOPS, str, regs, err,
- current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+ current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
return 1;
- show_registers(regs);
+ show_regs(regs);
#ifdef CONFIG_X86_32
if (user_mode_vm(regs)) {
sp = regs->sp;
@@ -308,16 +311,33 @@ void die(const char *str, struct pt_regs *regs, long err)
static int __init kstack_setup(char *s)
{
+ ssize_t ret;
+ unsigned long val;
+
if (!s)
return -EINVAL;
- kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+
+ ret = kstrtoul(s, 0, &val);
+ if (ret)
+ return ret;
+ kstack_depth_to_print = val;
return 0;
}
early_param("kstack", kstack_setup);
static int __init code_bytes_setup(char *s)
{
- code_bytes = simple_strtoul(s, NULL, 0);
+ ssize_t ret;
+ unsigned long val;
+
+ if (!s)
+ return -EINVAL;
+
+ ret = kstrtoul(s, 0, &val);
+ if (ret)
+ return ret;
+
+ code_bytes = val;
if (code_bytes > 8192)
code_bytes = 8192;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index c99f9ed..e0b1d78 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -82,12 +82,12 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
}
-void show_registers(struct pt_regs *regs)
+void show_regs(struct pt_regs *regs)
{
int i;
print_modules();
- __show_regs(regs, 0);
+ __show_regs(regs, !user_mode_vm(regs));
printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
TASK_COMM_LEN, current->comm, task_pid_nr(current),
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 17107bd..791b761 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -245,7 +245,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
-void show_registers(struct pt_regs *regs)
+void show_regs(struct pt_regs *regs)
{
int i;
unsigned long sp;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 79d97e6..7b784f4 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -98,12 +98,6 @@
#endif
.endm
-#ifdef CONFIG_VM86
-#define resume_userspace_sig check_userspace
-#else
-#define resume_userspace_sig resume_userspace
-#endif
-
/*
* User gs save/restore
*
@@ -327,10 +321,19 @@ ret_from_exception:
preempt_stop(CLBR_ANY)
ret_from_intr:
GET_THREAD_INFO(%ebp)
-check_userspace:
+resume_userspace_sig:
+#ifdef CONFIG_VM86
movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
movb PT_CS(%esp), %al
andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
+#else
+ /*
+ * We can be coming here from a syscall done in the kernel space,
+ * e.g. a failed kernel_execve().
+ */
+ movl PT_CS(%esp), %eax
+ andl $SEGMENT_RPL_MASK, %eax
+#endif
cmpl $USER_RPL, %eax
jb resume_kernel # not returning to v8086 or userspace
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1333d98..cdc79b5 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -320,7 +320,7 @@ ENDPROC(native_usergs_sysret64)
movq %rsp, %rsi
leaq -RBP(%rsp),%rdi /* arg1 for handler */
- testl $3, CS(%rdi)
+ testl $3, CS-RBP(%rsi)
je 1f
SWAPGS
/*
@@ -330,11 +330,10 @@ ENDPROC(native_usergs_sysret64)
* moving irq_enter into assembly, which would be too much work)
*/
1: incl PER_CPU_VAR(irq_count)
- jne 2f
- mov PER_CPU_VAR(irq_stack_ptr),%rsp
+ cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
CFI_DEF_CFA_REGISTER rsi
-2: /* Store previous stack value */
+ /* Store previous stack value */
pushq %rsi
CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
0x77 /* DW_OP_breg7 */, 0, \
@@ -482,7 +481,12 @@ GLOBAL(system_call_after_swapgs)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jnz tracesys
system_call_fastpath:
+#if __SYSCALL_MASK == ~0
cmpq $__NR_syscall_max,%rax
+#else
+ andl $__SYSCALL_MASK,%eax
+ cmpl $__NR_syscall_max,%eax
+#endif
ja badsys
movq %r10,%rcx
call *sys_call_table(,%rax,8) # XXX: rip relative
@@ -596,7 +600,12 @@ tracesys:
*/
LOAD_ARGS ARGOFFSET, 1
RESTORE_REST
+#if __SYSCALL_MASK == ~0
cmpq $__NR_syscall_max,%rax
+#else
+ andl $__SYSCALL_MASK,%eax
+ cmpl $__NR_syscall_max,%eax
+#endif
ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
movq %r10,%rcx /* fixup for C */
call *sys_call_table(,%rax,8)
@@ -736,6 +745,40 @@ ENTRY(stub_rt_sigreturn)
CFI_ENDPROC
END(stub_rt_sigreturn)
+#ifdef CONFIG_X86_X32_ABI
+ PTREGSCALL stub_x32_sigaltstack, sys32_sigaltstack, %rdx
+
+ENTRY(stub_x32_rt_sigreturn)
+ CFI_STARTPROC
+ addq $8, %rsp
+ PARTIAL_FRAME 0
+ SAVE_REST
+ movq %rsp,%rdi
+ FIXUP_TOP_OF_STACK %r11
+ call sys32_x32_rt_sigreturn
+ movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
+ RESTORE_REST
+ jmp int_ret_from_sys_call
+ CFI_ENDPROC
+END(stub_x32_rt_sigreturn)
+
+ENTRY(stub_x32_execve)
+ CFI_STARTPROC
+ addq $8, %rsp
+ PARTIAL_FRAME 0
+ SAVE_REST
+ FIXUP_TOP_OF_STACK %r11
+ movq %rsp, %rcx
+ call sys32_execve
+ RESTORE_TOP_OF_STACK %r11
+ movq %rax,RAX(%rsp)
+ RESTORE_REST
+ jmp int_ret_from_sys_call
+ CFI_ENDPROC
+END(stub_x32_execve)
+
+#endif
+
/*
* Build the entry stubs and pointer table with some assembler magic.
* We pack 7 stubs into a single 32-byte chunk, which will fit in a
@@ -813,7 +856,7 @@ ret_from_intr:
/* Restore saved previous stack */
popq %rsi
- CFI_DEF_CFA_REGISTER rsi
+ CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */
leaq ARGOFFSET-RBP(%rsi), %rsp
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET
@@ -1530,6 +1573,7 @@ ENTRY(nmi)
/* Use %rdx as out temp variable throughout */
pushq_cfi %rdx
+ CFI_REL_OFFSET rdx, 0
/*
* If %cs was not the kernel segment, then the NMI triggered in user
@@ -1554,6 +1598,7 @@ ENTRY(nmi)
*/
lea 6*8(%rsp), %rdx
test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
+ CFI_REMEMBER_STATE
nested_nmi:
/*
@@ -1585,10 +1630,12 @@ nested_nmi:
nested_nmi_out:
popq_cfi %rdx
+ CFI_RESTORE rdx
/* No need to check faults here */
INTERRUPT_RETURN
+ CFI_RESTORE_STATE
first_nmi:
/*
* Because nested NMIs will use the pushed location that we
@@ -1620,10 +1667,15 @@ first_nmi:
* | pt_regs |
* +-------------------------+
*
- * The saved RIP is used to fix up the copied RIP that a nested
- * NMI may zero out. The original stack frame and the temp storage
+ * The saved stack frame is used to fix up the copied stack frame
+ * that a nested NMI may change to make the interrupted NMI iret jump
+ * to the repeat_nmi. The original stack frame and the temp storage
* is also used by nested NMIs and can not be trusted on exit.
*/
+ /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
+ movq (%rsp), %rdx
+ CFI_RESTORE rdx
+
/* Set the NMI executing variable on the stack. */
pushq_cfi $1
@@ -1631,22 +1683,39 @@ first_nmi:
.rept 5
pushq_cfi 6*8(%rsp)
.endr
+ CFI_DEF_CFA_OFFSET SS+8-RIP
+
+ /* Everything up to here is safe from nested NMIs */
+
+ /*
+ * If there was a nested NMI, the first NMI's iret will return
+ * here. But NMIs are still enabled and we can take another
+ * nested NMI. The nested NMI checks the interrupted RIP to see
+ * if it is between repeat_nmi and end_repeat_nmi, and if so
+ * it will just return, as we are about to repeat an NMI anyway.
+ * This makes it safe to copy to the stack frame that a nested
+ * NMI will update.
+ */
+repeat_nmi:
+ /*
+ * Update the stack variable to say we are still in NMI (the update
+ * is benign for the non-repeat case, where 1 was pushed just above
+ * to this very stack slot).
+ */
+ movq $1, 5*8(%rsp)
/* Make another copy, this one may be modified by nested NMIs */
.rept 5
pushq_cfi 4*8(%rsp)
.endr
-
- /* Do not pop rdx, nested NMIs will corrupt it */
- movq 11*8(%rsp), %rdx
+ CFI_DEF_CFA_OFFSET SS+8-RIP
+end_repeat_nmi:
/*
* Everything below this point can be preempted by a nested
- * NMI if the first NMI took an exception. Repeated NMIs
- * caused by an exception and nested NMI will start here, and
- * can still be preempted by another NMI.
+ * NMI if the first NMI took an exception and reset our iret stack
+ * so that we repeat another NMI.
*/
-restart_nmi:
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
subq $ORIG_RAX-R15, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
@@ -1675,26 +1744,6 @@ nmi_restore:
CFI_ENDPROC
END(nmi)
- /*
- * If an NMI hit an iret because of an exception or breakpoint,
- * it can lose its NMI context, and a nested NMI may come in.
- * In that case, the nested NMI will change the preempted NMI's
- * stack to jump to here when it does the final iret.
- */
-repeat_nmi:
- INTR_FRAME
- /* Update the stack variable to say we are still in NMI */
- movq $1, 5*8(%rsp)
-
- /* copy the saved stack back to copy stack */
- .rept 5
- pushq_cfi 4*8(%rsp)
- .endr
-
- jmp restart_nmi
- CFI_ENDPROC
-end_repeat_nmi:
-
ENTRY(ignore_sysret)
CFI_STARTPROC
mov $-ENOSYS,%eax
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index c9a281f..32ff365 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -24,40 +24,21 @@
#include <trace/syscall.h>
#include <asm/cacheflush.h>
+#include <asm/kprobes.h>
#include <asm/ftrace.h>
#include <asm/nops.h>
-#include <asm/nmi.h>
-
#ifdef CONFIG_DYNAMIC_FTRACE
-/*
- * modifying_code is set to notify NMIs that they need to use
- * memory barriers when entering or exiting. But we don't want
- * to burden NMIs with unnecessary memory barriers when code
- * modification is not being done (which is most of the time).
- *
- * A mutex is already held when ftrace_arch_code_modify_prepare
- * and post_process are called. No locks need to be taken here.
- *
- * Stop machine will make sure currently running NMIs are done
- * and new NMIs will see the updated variable before we need
- * to worry about NMIs doing memory barriers.
- */
-static int modifying_code __read_mostly;
-static DEFINE_PER_CPU(int, save_modifying_code);
-
int ftrace_arch_code_modify_prepare(void)
{
set_kernel_text_rw();
set_all_modules_text_rw();
- modifying_code = 1;
return 0;
}
int ftrace_arch_code_modify_post_process(void)
{
- modifying_code = 0;
set_all_modules_text_ro();
set_kernel_text_ro();
return 0;
@@ -90,134 +71,6 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
return calc.code;
}
-/*
- * Modifying code must take extra care. On an SMP machine, if
- * the code being modified is also being executed on another CPU
- * that CPU will have undefined results and possibly take a GPF.
- * We use kstop_machine to stop other CPUS from exectuing code.
- * But this does not stop NMIs from happening. We still need
- * to protect against that. We separate out the modification of
- * the code to take care of this.
- *
- * Two buffers are added: An IP buffer and a "code" buffer.
- *
- * 1) Put the instruction pointer into the IP buffer
- * and the new code into the "code" buffer.
- * 2) Wait for any running NMIs to finish and set a flag that says
- * we are modifying code, it is done in an atomic operation.
- * 3) Write the code
- * 4) clear the flag.
- * 5) Wait for any running NMIs to finish.
- *
- * If an NMI is executed, the first thing it does is to call
- * "ftrace_nmi_enter". This will check if the flag is set to write
- * and if it is, it will write what is in the IP and "code" buffers.
- *
- * The trick is, it does not matter if everyone is writing the same
- * content to the code location. Also, if a CPU is executing code
- * it is OK to write to that code location if the contents being written
- * are the same as what exists.
- */
-
-#define MOD_CODE_WRITE_FLAG (1 << 31) /* set when NMI should do the write */
-static atomic_t nmi_running = ATOMIC_INIT(0);
-static int mod_code_status; /* holds return value of text write */
-static void *mod_code_ip; /* holds the IP to write to */
-static const void *mod_code_newcode; /* holds the text to write to the IP */
-
-static unsigned nmi_wait_count;
-static atomic_t nmi_update_count = ATOMIC_INIT(0);
-
-int ftrace_arch_read_dyn_info(char *buf, int size)
-{
- int r;
-
- r = snprintf(buf, size, "%u %u",
- nmi_wait_count,
- atomic_read(&nmi_update_count));
- return r;
-}
-
-static void clear_mod_flag(void)
-{
- int old = atomic_read(&nmi_running);
-
- for (;;) {
- int new = old & ~MOD_CODE_WRITE_FLAG;
-
- if (old == new)
- break;
-
- old = atomic_cmpxchg(&nmi_running, old, new);
- }
-}
-
-static void ftrace_mod_code(void)
-{
- /*
- * Yes, more than one CPU process can be writing to mod_code_status.
- * (and the code itself)
- * But if one were to fail, then they all should, and if one were
- * to succeed, then they all should.
- */
- mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
- MCOUNT_INSN_SIZE);
-
- /* if we fail, then kill any new writers */
- if (mod_code_status)
- clear_mod_flag();
-}
-
-void ftrace_nmi_enter(void)
-{
- __this_cpu_write(save_modifying_code, modifying_code);
-
- if (!__this_cpu_read(save_modifying_code))
- return;
-
- if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
- smp_rmb();
- ftrace_mod_code();
- atomic_inc(&nmi_update_count);
- }
- /* Must have previous changes seen before executions */
- smp_mb();
-}
-
-void ftrace_nmi_exit(void)
-{
- if (!__this_cpu_read(save_modifying_code))
- return;
-
- /* Finish all executions before clearing nmi_running */
- smp_mb();
- atomic_dec(&nmi_running);
-}
-
-static void wait_for_nmi_and_set_mod_flag(void)
-{
- if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG))
- return;
-
- do {
- cpu_relax();
- } while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG));
-
- nmi_wait_count++;
-}
-
-static void wait_for_nmi(void)
-{
- if (!atomic_read(&nmi_running))
- return;
-
- do {
- cpu_relax();
- } while (atomic_read(&nmi_running));
-
- nmi_wait_count++;
-}
-
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
@@ -238,26 +91,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code)
if (within(ip, (unsigned long)_text, (unsigned long)_etext))
ip = (unsigned long)__va(__pa(ip));
- mod_code_ip = (void *)ip;
- mod_code_newcode = new_code;
-
- /* The buffers need to be visible before we let NMIs write them */
- smp_mb();
-
- wait_for_nmi_and_set_mod_flag();
-
- /* Make sure all running NMIs have finished before we write the code */
- smp_mb();
-
- ftrace_mod_code();
-
- /* Make sure the write happens before clearing the bit */
- smp_mb();
-
- clear_mod_flag();
- wait_for_nmi();
-
- return mod_code_status;
+ return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE);
}
static const unsigned char *ftrace_nop_replace(void)
@@ -334,6 +168,336 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
return ret;
}
+int modifying_ftrace_code __read_mostly;
+
+/*
+ * A breakpoint was added to the code address we are about to
+ * modify, and this is the handle that will just skip over it.
+ * We are either changing a nop into a trace call, or a trace
+ * call to a nop. While the change is taking place, we treat
+ * it just like it was a nop.
+ */
+int ftrace_int3_handler(struct pt_regs *regs)
+{
+ if (WARN_ON_ONCE(!regs))
+ return 0;
+
+ if (!ftrace_location(regs->ip - 1))
+ return 0;
+
+ regs->ip += MCOUNT_INSN_SIZE - 1;
+
+ return 1;
+}
+
+static int ftrace_write(unsigned long ip, const char *val, int size)
+{
+ /*
+ * On x86_64, kernel text mappings are mapped read-only with
+ * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
+ * of the kernel text mapping to modify the kernel text.
+ *
+ * For 32bit kernels, these mappings are same and we can use
+ * kernel identity mapping to modify code.
+ */
+ if (within(ip, (unsigned long)_text, (unsigned long)_etext))
+ ip = (unsigned long)__va(__pa(ip));
+
+ return probe_kernel_write((void *)ip, val, size);
+}
+
+static int add_break(unsigned long ip, const char *old)
+{
+ unsigned char replaced[MCOUNT_INSN_SIZE];
+ unsigned char brk = BREAKPOINT_INSTRUCTION;
+
+ if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
+ return -EFAULT;
+
+ /* Make sure it is what we expect it to be */
+ if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
+ return -EINVAL;
+
+ if (ftrace_write(ip, &brk, 1))
+ return -EPERM;
+
+ return 0;
+}
+
+static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned const char *old;
+ unsigned long ip = rec->ip;
+
+ old = ftrace_call_replace(ip, addr);
+
+ return add_break(rec->ip, old);
+}
+
+
+static int add_brk_on_nop(struct dyn_ftrace *rec)
+{
+ unsigned const char *old;
+
+ old = ftrace_nop_replace();
+
+ return add_break(rec->ip, old);
+}
+
+static int add_breakpoints(struct dyn_ftrace *rec, int enable)
+{
+ unsigned long ftrace_addr;
+ int ret;
+
+ ret = ftrace_test_record(rec, enable);
+
+ ftrace_addr = (unsigned long)FTRACE_ADDR;
+
+ switch (ret) {
+ case FTRACE_UPDATE_IGNORE:
+ return 0;
+
+ case FTRACE_UPDATE_MAKE_CALL:
+ /* converting nop to call */
+ return add_brk_on_nop(rec);
+
+ case FTRACE_UPDATE_MAKE_NOP:
+ /* converting a call to a nop */
+ return add_brk_on_call(rec, ftrace_addr);
+ }
+ return 0;
+}
+
+/*
+ * On error, we need to remove breakpoints. This needs to
+ * be done caefully. If the address does not currently have a
+ * breakpoint, we know we are done. Otherwise, we look at the
+ * remaining 4 bytes of the instruction. If it matches a nop
+ * we replace the breakpoint with the nop. Otherwise we replace
+ * it with the call instruction.
+ */
+static int remove_breakpoint(struct dyn_ftrace *rec)
+{
+ unsigned char ins[MCOUNT_INSN_SIZE];
+ unsigned char brk = BREAKPOINT_INSTRUCTION;
+ const unsigned char *nop;
+ unsigned long ftrace_addr;
+ unsigned long ip = rec->ip;
+
+ /* If we fail the read, just give up */
+ if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE))
+ return -EFAULT;
+
+ /* If this does not have a breakpoint, we are done */
+ if (ins[0] != brk)
+ return -1;
+
+ nop = ftrace_nop_replace();
+
+ /*
+ * If the last 4 bytes of the instruction do not match
+ * a nop, then we assume that this is a call to ftrace_addr.
+ */
+ if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) {
+ /*
+ * For extra paranoidism, we check if the breakpoint is on
+ * a call that would actually jump to the ftrace_addr.
+ * If not, don't touch the breakpoint, we make just create
+ * a disaster.
+ */
+ ftrace_addr = (unsigned long)FTRACE_ADDR;
+ nop = ftrace_call_replace(ip, ftrace_addr);
+
+ if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
+ return -EINVAL;
+ }
+
+ return probe_kernel_write((void *)ip, &nop[0], 1);
+}
+
+static int add_update_code(unsigned long ip, unsigned const char *new)
+{
+ /* skip breakpoint */
+ ip++;
+ new++;
+ if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1))
+ return -EPERM;
+ return 0;
+}
+
+static int add_update_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned long ip = rec->ip;
+ unsigned const char *new;
+
+ new = ftrace_call_replace(ip, addr);
+ return add_update_code(ip, new);
+}
+
+static int add_update_nop(struct dyn_ftrace *rec)
+{
+ unsigned long ip = rec->ip;
+ unsigned const char *new;
+
+ new = ftrace_nop_replace();
+ return add_update_code(ip, new);
+}
+
+static int add_update(struct dyn_ftrace *rec, int enable)
+{
+ unsigned long ftrace_addr;
+ int ret;
+
+ ret = ftrace_test_record(rec, enable);
+
+ ftrace_addr = (unsigned long)FTRACE_ADDR;
+
+ switch (ret) {
+ case FTRACE_UPDATE_IGNORE:
+ return 0;
+
+ case FTRACE_UPDATE_MAKE_CALL:
+ /* converting nop to call */
+ return add_update_call(rec, ftrace_addr);
+
+ case FTRACE_UPDATE_MAKE_NOP:
+ /* converting a call to a nop */
+ return add_update_nop(rec);
+ }
+
+ return 0;
+}
+
+static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned long ip = rec->ip;
+ unsigned const char *new;
+
+ new = ftrace_call_replace(ip, addr);
+
+ if (ftrace_write(ip, new, 1))
+ return -EPERM;
+
+ return 0;
+}
+
+static int finish_update_nop(struct dyn_ftrace *rec)
+{
+ unsigned long ip = rec->ip;
+ unsigned const char *new;
+
+ new = ftrace_nop_replace();
+
+ if (ftrace_write(ip, new, 1))
+ return -EPERM;
+ return 0;
+}
+
+static int finish_update(struct dyn_ftrace *rec, int enable)
+{
+ unsigned long ftrace_addr;
+ int ret;
+
+ ret = ftrace_update_record(rec, enable);
+
+ ftrace_addr = (unsigned long)FTRACE_ADDR;
+
+ switch (ret) {
+ case FTRACE_UPDATE_IGNORE:
+ return 0;
+
+ case FTRACE_UPDATE_MAKE_CALL:
+ /* converting nop to call */
+ return finish_update_call(rec, ftrace_addr);
+
+ case FTRACE_UPDATE_MAKE_NOP:
+ /* converting a call to a nop */
+ return finish_update_nop(rec);
+ }
+
+ return 0;
+}
+
+static void do_sync_core(void *data)
+{
+ sync_core();
+}
+
+static void run_sync(void)
+{
+ int enable_irqs = irqs_disabled();
+
+ /* We may be called with interrupts disbled (on bootup). */
+ if (enable_irqs)
+ local_irq_enable();
+ on_each_cpu(do_sync_core, NULL, 1);
+ if (enable_irqs)
+ local_irq_disable();
+}
+
+void ftrace_replace_code(int enable)
+{
+ struct ftrace_rec_iter *iter;
+ struct dyn_ftrace *rec;
+ const char *report = "adding breakpoints";
+ int count = 0;
+ int ret;
+
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
+
+ ret = add_breakpoints(rec, enable);
+ if (ret)
+ goto remove_breakpoints;
+ count++;
+ }
+
+ run_sync();
+
+ report = "updating code";
+
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
+
+ ret = add_update(rec, enable);
+ if (ret)
+ goto remove_breakpoints;
+ }
+
+ run_sync();
+
+ report = "removing breakpoints";
+
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
+
+ ret = finish_update(rec, enable);
+ if (ret)
+ goto remove_breakpoints;
+ }
+
+ run_sync();
+
+ return;
+
+ remove_breakpoints:
+ ftrace_bug(ret, rec ? rec->ip : 0);
+ printk(KERN_WARNING "Failed on %s (%d):\n", report, count);
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
+ remove_breakpoint(rec);
+ }
+}
+
+void arch_ftrace_update_code(int command)
+{
+ modifying_ftrace_code++;
+
+ ftrace_modify_all_code(command);
+
+ modifying_ftrace_code--;
+}
+
int __init ftrace_dyn_arch_init(void *data)
{
/* The return code is retured via data */
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 739d859..f250431 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -16,6 +16,7 @@
#include <asm/uaccess.h>
#include <asm/ptrace.h>
#include <asm/i387.h>
+#include <asm/fpu-internal.h>
#include <asm/user.h>
#ifdef CONFIG_X86_64
@@ -32,6 +33,86 @@
# define user32_fxsr_struct user_fxsr_struct
#endif
+/*
+ * Were we in an interrupt that interrupted kernel mode?
+ *
+ * We can do a kernel_fpu_begin/end() pair *ONLY* if that
+ * pair does nothing at all: the thread must not have fpu (so
+ * that we don't try to save the FPU state), and TS must
+ * be set (so that the clts/stts pair does nothing that is
+ * visible in the interrupted kernel thread).
+ */
+static inline bool interrupted_kernel_fpu_idle(void)
+{
+ return !__thread_has_fpu(current) &&
+ (read_cr0() & X86_CR0_TS);
+}
+
+/*
+ * Were we in user mode (or vm86 mode) when we were
+ * interrupted?
+ *
+ * Doing kernel_fpu_begin/end() is ok if we are running
+ * in an interrupt context from user mode - we'll just
+ * save the FPU state as required.
+ */
+static inline bool interrupted_user_mode(void)
+{
+ struct pt_regs *regs = get_irq_regs();
+ return regs && user_mode_vm(regs);
+}
+
+/*
+ * Can we use the FPU in kernel mode with the
+ * whole "kernel_fpu_begin/end()" sequence?
+ *
+ * It's always ok in process context (ie "not interrupt")
+ * but it is sometimes ok even from an irq.
+ */
+bool irq_fpu_usable(void)
+{
+ return !in_interrupt() ||
+ interrupted_user_mode() ||
+ interrupted_kernel_fpu_idle();
+}
+EXPORT_SYMBOL(irq_fpu_usable);
+
+void kernel_fpu_begin(void)
+{
+ struct task_struct *me = current;
+
+ WARN_ON_ONCE(!irq_fpu_usable());
+ preempt_disable();
+ if (__thread_has_fpu(me)) {
+ __save_init_fpu(me);
+ __thread_clear_has_fpu(me);
+ /* We do 'stts()' in kernel_fpu_end() */
+ } else {
+ this_cpu_write(fpu_owner_task, NULL);
+ clts();
+ }
+}
+EXPORT_SYMBOL(kernel_fpu_begin);
+
+void kernel_fpu_end(void)
+{
+ stts();
+ preempt_enable();
+}
+EXPORT_SYMBOL(kernel_fpu_end);
+
+void unlazy_fpu(struct task_struct *tsk)
+{
+ preempt_disable();
+ if (__thread_has_fpu(tsk)) {
+ __save_init_fpu(tsk);
+ __thread_fpu_end(tsk);
+ } else
+ tsk->fpu_counter = 0;
+ preempt_enable();
+}
+EXPORT_SYMBOL(unlazy_fpu);
+
#ifdef CONFIG_MATH_EMULATION
# define HAVE_HWFP (boot_cpu_data.hard_math)
#else
@@ -44,7 +125,7 @@ EXPORT_SYMBOL_GPL(xstate_size);
unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
static struct i387_fxsave_struct fx_scratch __cpuinitdata;
-void __cpuinit mxcsr_feature_mask_init(void)
+static void __cpuinit mxcsr_feature_mask_init(void)
{
unsigned long mask = 0;
@@ -154,6 +235,7 @@ int init_fpu(struct task_struct *tsk)
if (tsk_used_math(tsk)) {
if (HAVE_HWFP && tsk == current)
unlazy_fpu(tsk);
+ tsk->thread.fpu.last_cpu = ~0;
return 0;
}
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 6104852..36d1853 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -15,7 +15,6 @@
#include <linux/delay.h>
#include <linux/atomic.h>
-#include <asm/system.h>
#include <asm/timer.h>
#include <asm/hw_irq.h>
#include <asm/pgtable.h>
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
deleted file mode 100644
index 43e9ccf..0000000
--- a/arch/x86/kernel/init_task.c
+++ /dev/null
@@ -1,42 +0,0 @@
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/init_task.h>
-#include <linux/fs.h>
-#include <linux/mqueue.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-
-static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
-static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
-
-/*
- * Initial thread structure.
- *
- * We need to make sure that this is THREAD_SIZE aligned due to the
- * way process stacks are handled. This is done by having a special
- * "init_task" linker map entry..
- */
-union thread_union init_thread_union __init_task_data =
- { INIT_THREAD_INFO(init_task) };
-
-/*
- * Initial task structure.
- *
- * All other task structs will be allocated on slabs in fork.c
- */
-struct task_struct init_task = INIT_TASK(init_task);
-EXPORT_SYMBOL(init_task);
-
-/*
- * per-CPU TSS segments. Threads are completely 'soft' on Linux,
- * no more per-task TSS's. The TSS size is kept cacheline-aligned
- * so they are allowed to end up in the .data..cacheline_aligned
- * section. Since TSS's are completely CPU-local, we want them
- * on exact cacheline boundaries, to eliminate cacheline ping-pong.
- */
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
-
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 7943e0c..3dafc60 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -282,8 +282,13 @@ void fixup_irqs(void)
else if (!(warned++))
set_affinity = 0;
+ /*
+ * We unmask if the irq was not marked masked by the
+ * core code. That respects the lazy irq disable
+ * behaviour.
+ */
if (!irqd_can_move_in_process_context(data) &&
- !irqd_irq_disabled(data) && chip->irq_unmask)
+ !irqd_irq_masked(data) && chip->irq_unmask)
chip->irq_unmask(data);
raw_spin_unlock(&desc->lock);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 40fc861..344faf8 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -100,13 +100,8 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
irqctx->tinfo.task = curctx->tinfo.task;
irqctx->tinfo.previous_esp = current_stack_pointer;
- /*
- * Copy the softirq bits in preempt_count so that the
- * softirq checks work in the hardirq context.
- */
- irqctx->tinfo.preempt_count =
- (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
- (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
+ /* Copy the preempt_count so that the [soft]irq checks work. */
+ irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count;
if (unlikely(overflow))
call_on_stack(print_stack_overflow, isp);
@@ -132,8 +127,8 @@ void __cpuinit irq_ctx_init(int cpu)
return;
irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
- THREAD_FLAGS,
- THREAD_ORDER));
+ THREADINFO_GFP,
+ THREAD_SIZE_ORDER));
memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
irqctx->tinfo.cpu = cpu;
irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
@@ -142,8 +137,8 @@ void __cpuinit irq_ctx_init(int cpu)
per_cpu(hardirq_ctx, cpu) = irqctx;
irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
- THREAD_FLAGS,
- THREAD_ORDER));
+ THREADINFO_GFP,
+ THREAD_SIZE_ORDER));
memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
irqctx->tinfo.cpu = cpu;
irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
@@ -196,7 +191,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
if (unlikely(!desc))
return false;
- if (!execute_on_irq_stack(overflow, desc, irq)) {
+ if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
if (unlikely(overflow))
print_stack_overflow();
desc->handle_irq(irq, desc);
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 313fb5c..252981a 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -16,7 +16,6 @@
#include <linux/delay.h>
#include <linux/atomic.h>
-#include <asm/system.h>
#include <asm/timer.h>
#include <asm/hw_irq.h>
#include <asm/pgtable.h>
@@ -61,7 +60,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
outb(0, 0xF0);
if (ignore_fpu_irq || !boot_cpu_data.hard_math)
return IRQ_NONE;
- math_error(get_irq_regs(), 0, 16);
+ math_error(get_irq_regs(), 0, X86_TRAP_MF);
return IRQ_HANDLED;
}
@@ -306,10 +305,10 @@ void __init native_init_IRQ(void)
* us. (some of these will be overridden and become
* 'special' SMP interrupts)
*/
- for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+ i = FIRST_EXTERNAL_VECTOR;
+ for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {
/* IA32_SYSCALL_VECTOR could be used in trap_init already. */
- if (!test_bit(i, used_vectors))
- set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+ set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
}
if (!acpi_ioapic && !of_ioapic)
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 90fcf62..1d5d31e 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -68,16 +68,9 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
return count;
}
-static int setup_data_open(struct inode *inode, struct file *file)
-{
- file->private_data = inode->i_private;
-
- return 0;
-}
-
static const struct file_operations fops_setup_data = {
.read = setup_data_read,
- .open = setup_data_open,
+ .open = simple_open,
.llseek = default_llseek,
};
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index faba577..8bfb614 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -43,10 +43,11 @@
#include <linux/smp.h>
#include <linux/nmi.h>
#include <linux/hw_breakpoint.h>
+#include <linux/uaccess.h>
+#include <linux/memory.h>
#include <asm/debugreg.h>
#include <asm/apicdef.h>
-#include <asm/system.h>
#include <asm/apic.h>
#include <asm/nmi.h>
@@ -67,8 +68,6 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
{ "ss", 4, offsetof(struct pt_regs, ss) },
{ "ds", 4, offsetof(struct pt_regs, ds) },
{ "es", 4, offsetof(struct pt_regs, es) },
- { "fs", 4, -1 },
- { "gs", 4, -1 },
#else
{ "ax", 8, offsetof(struct pt_regs, ax) },
{ "bx", 8, offsetof(struct pt_regs, bx) },
@@ -90,7 +89,11 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
{ "flags", 4, offsetof(struct pt_regs, flags) },
{ "cs", 4, offsetof(struct pt_regs, cs) },
{ "ss", 4, offsetof(struct pt_regs, ss) },
+ { "ds", 4, -1 },
+ { "es", 4, -1 },
#endif
+ { "fs", 4, -1 },
+ { "gs", 4, -1 },
};
int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
@@ -740,6 +743,64 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
regs->ip = ip;
}
+int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
+{
+ int err;
+ char opc[BREAK_INSTR_SIZE];
+
+ bpt->type = BP_BREAKPOINT;
+ err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
+ BREAK_INSTR_SIZE);
+ if (err)
+ return err;
+ err = probe_kernel_write((char *)bpt->bpt_addr,
+ arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
+#ifdef CONFIG_DEBUG_RODATA
+ if (!err)
+ return err;
+ /*
+ * It is safe to call text_poke() because normal kernel execution
+ * is stopped on all cores, so long as the text_mutex is not locked.
+ */
+ if (mutex_is_locked(&text_mutex))
+ return -EBUSY;
+ text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
+ BREAK_INSTR_SIZE);
+ err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
+ if (err)
+ return err;
+ if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE))
+ return -EINVAL;
+ bpt->type = BP_POKE_BREAKPOINT;
+#endif /* CONFIG_DEBUG_RODATA */
+ return err;
+}
+
+int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
+{
+#ifdef CONFIG_DEBUG_RODATA
+ int err;
+ char opc[BREAK_INSTR_SIZE];
+
+ if (bpt->type != BP_POKE_BREAKPOINT)
+ goto knl_write;
+ /*
+ * It is safe to call text_poke() because normal kernel execution
+ * is stopped on all cores, so long as the text_mutex is not locked.
+ */
+ if (mutex_is_locked(&text_mutex))
+ goto knl_write;
+ text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE);
+ err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
+ if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
+ goto knl_write;
+ return err;
+knl_write:
+#endif /* CONFIG_DEBUG_RODATA */
+ return probe_kernel_write((char *)bpt->bpt_addr,
+ (char *)bpt->saved_instr, BREAK_INSTR_SIZE);
+}
+
struct kgdb_arch arch_kgdb_ops = {
/* Breakpoint instruction: */
.gdb_bpt_instr = { 0xcc },
diff --git a/arch/x86/kernel/kprobes-common.h b/arch/x86/kernel/kprobes-common.h
new file mode 100644
index 0000000..3230b68
--- /dev/null
+++ b/arch/x86/kernel/kprobes-common.h
@@ -0,0 +1,102 @@
+#ifndef __X86_KERNEL_KPROBES_COMMON_H
+#define __X86_KERNEL_KPROBES_COMMON_H
+
+/* Kprobes and Optprobes common header */
+
+#ifdef CONFIG_X86_64
+#define SAVE_REGS_STRING \
+ /* Skip cs, ip, orig_ax. */ \
+ " subq $24, %rsp\n" \
+ " pushq %rdi\n" \
+ " pushq %rsi\n" \
+ " pushq %rdx\n" \
+ " pushq %rcx\n" \
+ " pushq %rax\n" \
+ " pushq %r8\n" \
+ " pushq %r9\n" \
+ " pushq %r10\n" \
+ " pushq %r11\n" \
+ " pushq %rbx\n" \
+ " pushq %rbp\n" \
+ " pushq %r12\n" \
+ " pushq %r13\n" \
+ " pushq %r14\n" \
+ " pushq %r15\n"
+#define RESTORE_REGS_STRING \
+ " popq %r15\n" \
+ " popq %r14\n" \
+ " popq %r13\n" \
+ " popq %r12\n" \
+ " popq %rbp\n" \
+ " popq %rbx\n" \
+ " popq %r11\n" \
+ " popq %r10\n" \
+ " popq %r9\n" \
+ " popq %r8\n" \
+ " popq %rax\n" \
+ " popq %rcx\n" \
+ " popq %rdx\n" \
+ " popq %rsi\n" \
+ " popq %rdi\n" \
+ /* Skip orig_ax, ip, cs */ \
+ " addq $24, %rsp\n"
+#else
+#define SAVE_REGS_STRING \
+ /* Skip cs, ip, orig_ax and gs. */ \
+ " subl $16, %esp\n" \
+ " pushl %fs\n" \
+ " pushl %es\n" \
+ " pushl %ds\n" \
+ " pushl %eax\n" \
+ " pushl %ebp\n" \
+ " pushl %edi\n" \
+ " pushl %esi\n" \
+ " pushl %edx\n" \
+ " pushl %ecx\n" \
+ " pushl %ebx\n"
+#define RESTORE_REGS_STRING \
+ " popl %ebx\n" \
+ " popl %ecx\n" \
+ " popl %edx\n" \
+ " popl %esi\n" \
+ " popl %edi\n" \
+ " popl %ebp\n" \
+ " popl %eax\n" \
+ /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
+ " addl $24, %esp\n"
+#endif
+
+/* Ensure if the instruction can be boostable */
+extern int can_boost(kprobe_opcode_t *instruction);
+/* Recover instruction if given address is probed */
+extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
+ unsigned long addr);
+/*
+ * Copy an instruction and adjust the displacement if the instruction
+ * uses the %rip-relative addressing mode.
+ */
+extern int __copy_instruction(u8 *dest, u8 *src);
+
+/* Generate a relative-jump/call instruction */
+extern void synthesize_reljump(void *from, void *to);
+extern void synthesize_relcall(void *from, void *to);
+
+#ifdef CONFIG_OPTPROBES
+extern int arch_init_optprobes(void);
+extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter);
+extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr);
+#else /* !CONFIG_OPTPROBES */
+static inline int arch_init_optprobes(void)
+{
+ return 0;
+}
+static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
+{
+ return 0;
+}
+static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
+{
+ return addr;
+}
+#endif
+#endif
diff --git a/arch/x86/kernel/kprobes-opt.c b/arch/x86/kernel/kprobes-opt.c
new file mode 100644
index 0000000..c5e410e
--- /dev/null
+++ b/arch/x86/kernel/kprobes-opt.c
@@ -0,0 +1,512 @@
+/*
+ * Kernel Probes Jump Optimization (Optprobes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ * Copyright (C) Hitachi Ltd., 2012
+ */
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/module.h>
+#include <linux/kdebug.h>
+#include <linux/kallsyms.h>
+#include <linux/ftrace.h>
+
+#include <asm/cacheflush.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/alternative.h>
+#include <asm/insn.h>
+#include <asm/debugreg.h>
+
+#include "kprobes-common.h"
+
+unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
+{
+ struct optimized_kprobe *op;
+ struct kprobe *kp;
+ long offs;
+ int i;
+
+ for (i = 0; i < RELATIVEJUMP_SIZE; i++) {
+ kp = get_kprobe((void *)addr - i);
+ /* This function only handles jump-optimized kprobe */
+ if (kp && kprobe_optimized(kp)) {
+ op = container_of(kp, struct optimized_kprobe, kp);
+ /* If op->list is not empty, op is under optimizing */
+ if (list_empty(&op->list))
+ goto found;
+ }
+ }
+
+ return addr;
+found:
+ /*
+ * If the kprobe can be optimized, original bytes which can be
+ * overwritten by jump destination address. In this case, original
+ * bytes must be recovered from op->optinsn.copied_insn buffer.
+ */
+ memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+ if (addr == (unsigned long)kp->addr) {
+ buf[0] = kp->opcode;
+ memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+ } else {
+ offs = addr - (unsigned long)kp->addr - 1;
+ memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs);
+ }
+
+ return (unsigned long)buf;
+}
+
+/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
+static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
+{
+#ifdef CONFIG_X86_64
+ *addr++ = 0x48;
+ *addr++ = 0xbf;
+#else
+ *addr++ = 0xb8;
+#endif
+ *(unsigned long *)addr = val;
+}
+
+static void __used __kprobes kprobes_optinsn_template_holder(void)
+{
+ asm volatile (
+ ".global optprobe_template_entry\n"
+ "optprobe_template_entry:\n"
+#ifdef CONFIG_X86_64
+ /* We don't bother saving the ss register */
+ " pushq %rsp\n"
+ " pushfq\n"
+ SAVE_REGS_STRING
+ " movq %rsp, %rsi\n"
+ ".global optprobe_template_val\n"
+ "optprobe_template_val:\n"
+ ASM_NOP5
+ ASM_NOP5
+ ".global optprobe_template_call\n"
+ "optprobe_template_call:\n"
+ ASM_NOP5
+ /* Move flags to rsp */
+ " movq 144(%rsp), %rdx\n"
+ " movq %rdx, 152(%rsp)\n"
+ RESTORE_REGS_STRING
+ /* Skip flags entry */
+ " addq $8, %rsp\n"
+ " popfq\n"
+#else /* CONFIG_X86_32 */
+ " pushf\n"
+ SAVE_REGS_STRING
+ " movl %esp, %edx\n"
+ ".global optprobe_template_val\n"
+ "optprobe_template_val:\n"
+ ASM_NOP5
+ ".global optprobe_template_call\n"
+ "optprobe_template_call:\n"
+ ASM_NOP5
+ RESTORE_REGS_STRING
+ " addl $4, %esp\n" /* skip cs */
+ " popf\n"
+#endif
+ ".global optprobe_template_end\n"
+ "optprobe_template_end:\n");
+}
+
+#define TMPL_MOVE_IDX \
+ ((long)&optprobe_template_val - (long)&optprobe_template_entry)
+#define TMPL_CALL_IDX \
+ ((long)&optprobe_template_call - (long)&optprobe_template_entry)
+#define TMPL_END_IDX \
+ ((long)&optprobe_template_end - (long)&optprobe_template_entry)
+
+#define INT3_SIZE sizeof(kprobe_opcode_t)
+
+/* Optimized kprobe call back function: called from optinsn */
+static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
+{
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+ unsigned long flags;
+
+ /* This is possible if op is under delayed unoptimizing */
+ if (kprobe_disabled(&op->kp))
+ return;
+
+ local_irq_save(flags);
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(&op->kp);
+ } else {
+ /* Save skipped registers */
+#ifdef CONFIG_X86_64
+ regs->cs = __KERNEL_CS;
+#else
+ regs->cs = __KERNEL_CS | get_kernel_rpl();
+ regs->gs = 0;
+#endif
+ regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
+ regs->orig_ax = ~0UL;
+
+ __this_cpu_write(current_kprobe, &op->kp);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ opt_pre_handler(&op->kp, regs);
+ __this_cpu_write(current_kprobe, NULL);
+ }
+ local_irq_restore(flags);
+}
+
+static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
+{
+ int len = 0, ret;
+
+ while (len < RELATIVEJUMP_SIZE) {
+ ret = __copy_instruction(dest + len, src + len);
+ if (!ret || !can_boost(dest + len))
+ return -EINVAL;
+ len += ret;
+ }
+ /* Check whether the address range is reserved */
+ if (ftrace_text_reserved(src, src + len - 1) ||
+ alternatives_text_reserved(src, src + len - 1) ||
+ jump_label_text_reserved(src, src + len - 1))
+ return -EBUSY;
+
+ return len;
+}
+
+/* Check whether insn is indirect jump */
+static int __kprobes insn_is_indirect_jump(struct insn *insn)
+{
+ return ((insn->opcode.bytes[0] == 0xff &&
+ (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
+ insn->opcode.bytes[0] == 0xea); /* Segment based jump */
+}
+
+/* Check whether insn jumps into specified address range */
+static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
+{
+ unsigned long target = 0;
+
+ switch (insn->opcode.bytes[0]) {
+ case 0xe0: /* loopne */
+ case 0xe1: /* loope */
+ case 0xe2: /* loop */
+ case 0xe3: /* jcxz */
+ case 0xe9: /* near relative jump */
+ case 0xeb: /* short relative jump */
+ break;
+ case 0x0f:
+ if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
+ break;
+ return 0;
+ default:
+ if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
+ break;
+ return 0;
+ }
+ target = (unsigned long)insn->next_byte + insn->immediate.value;
+
+ return (start <= target && target <= start + len);
+}
+
+/* Decode whole function to ensure any instructions don't jump into target */
+static int __kprobes can_optimize(unsigned long paddr)
+{
+ unsigned long addr, size = 0, offset = 0;
+ struct insn insn;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+
+ /* Lookup symbol including addr */
+ if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
+ return 0;
+
+ /*
+ * Do not optimize in the entry code due to the unstable
+ * stack handling.
+ */
+ if ((paddr >= (unsigned long)__entry_text_start) &&
+ (paddr < (unsigned long)__entry_text_end))
+ return 0;
+
+ /* Check there is enough space for a relative jump. */
+ if (size - offset < RELATIVEJUMP_SIZE)
+ return 0;
+
+ /* Decode instructions */
+ addr = paddr - offset;
+ while (addr < paddr - offset + size) { /* Decode until function end */
+ if (search_exception_tables(addr))
+ /*
+ * Since some fixup code will jumps into this function,
+ * we can't optimize kprobe in this function.
+ */
+ return 0;
+ kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr));
+ insn_get_length(&insn);
+ /* Another subsystem puts a breakpoint */
+ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ return 0;
+ /* Recover address */
+ insn.kaddr = (void *)addr;
+ insn.next_byte = (void *)(addr + insn.length);
+ /* Check any instructions don't jump into target */
+ if (insn_is_indirect_jump(&insn) ||
+ insn_jump_into_range(&insn, paddr + INT3_SIZE,
+ RELATIVE_ADDR_SIZE))
+ return 0;
+ addr += insn.length;
+ }
+
+ return 1;
+}
+
+/* Check optimized_kprobe can actually be optimized. */
+int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
+{
+ int i;
+ struct kprobe *p;
+
+ for (i = 1; i < op->optinsn.size; i++) {
+ p = get_kprobe(op->kp.addr + i);
+ if (p && !kprobe_disabled(p))
+ return -EEXIST;
+ }
+
+ return 0;
+}
+
+/* Check the addr is within the optimized instructions. */
+int __kprobes
+arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr)
+{
+ return ((unsigned long)op->kp.addr <= addr &&
+ (unsigned long)op->kp.addr + op->optinsn.size > addr);
+}
+
+/* Free optimized instruction slot */
+static __kprobes
+void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
+{
+ if (op->optinsn.insn) {
+ free_optinsn_slot(op->optinsn.insn, dirty);
+ op->optinsn.insn = NULL;
+ op->optinsn.size = 0;
+ }
+}
+
+void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+{
+ __arch_remove_optimized_kprobe(op, 1);
+}
+
+/*
+ * Copy replacing target instructions
+ * Target instructions MUST be relocatable (checked inside)
+ * This is called when new aggr(opt)probe is allocated or reused.
+ */
+int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
+{
+ u8 *buf;
+ int ret;
+ long rel;
+
+ if (!can_optimize((unsigned long)op->kp.addr))
+ return -EILSEQ;
+
+ op->optinsn.insn = get_optinsn_slot();
+ if (!op->optinsn.insn)
+ return -ENOMEM;
+
+ /*
+ * Verify if the address gap is in 2GB range, because this uses
+ * a relative jump.
+ */
+ rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
+ if (abs(rel) > 0x7fffffff)
+ return -ERANGE;
+
+ buf = (u8 *)op->optinsn.insn;
+
+ /* Copy instructions into the out-of-line buffer */
+ ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
+ if (ret < 0) {
+ __arch_remove_optimized_kprobe(op, 0);
+ return ret;
+ }
+ op->optinsn.size = ret;
+
+ /* Copy arch-dep-instance from template */
+ memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
+
+ /* Set probe information */
+ synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
+
+ /* Set probe function call */
+ synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
+
+ /* Set returning jmp instruction at the tail of out-of-line buffer */
+ synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
+ (u8 *)op->kp.addr + op->optinsn.size);
+
+ flush_icache_range((unsigned long) buf,
+ (unsigned long) buf + TMPL_END_IDX +
+ op->optinsn.size + RELATIVEJUMP_SIZE);
+ return 0;
+}
+
+#define MAX_OPTIMIZE_PROBES 256
+static struct text_poke_param *jump_poke_params;
+static struct jump_poke_buffer {
+ u8 buf[RELATIVEJUMP_SIZE];
+} *jump_poke_bufs;
+
+static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
+ u8 *insn_buf,
+ struct optimized_kprobe *op)
+{
+ s32 rel = (s32)((long)op->optinsn.insn -
+ ((long)op->kp.addr + RELATIVEJUMP_SIZE));
+
+ /* Backup instructions which will be replaced by jump address */
+ memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
+ RELATIVE_ADDR_SIZE);
+
+ insn_buf[0] = RELATIVEJUMP_OPCODE;
+ *(s32 *)(&insn_buf[1]) = rel;
+
+ tprm->addr = op->kp.addr;
+ tprm->opcode = insn_buf;
+ tprm->len = RELATIVEJUMP_SIZE;
+}
+
+/*
+ * Replace breakpoints (int3) with relative jumps.
+ * Caller must call with locking kprobe_mutex and text_mutex.
+ */
+void __kprobes arch_optimize_kprobes(struct list_head *oplist)
+{
+ struct optimized_kprobe *op, *tmp;
+ int c = 0;
+
+ list_for_each_entry_safe(op, tmp, oplist, list) {
+ WARN_ON(kprobe_disabled(&op->kp));
+ /* Setup param */
+ setup_optimize_kprobe(&jump_poke_params[c],
+ jump_poke_bufs[c].buf, op);
+ list_del_init(&op->list);
+ if (++c >= MAX_OPTIMIZE_PROBES)
+ break;
+ }
+
+ /*
+ * text_poke_smp doesn't support NMI/MCE code modifying.
+ * However, since kprobes itself also doesn't support NMI/MCE
+ * code probing, it's not a problem.
+ */
+ text_poke_smp_batch(jump_poke_params, c);
+}
+
+static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
+ u8 *insn_buf,
+ struct optimized_kprobe *op)
+{
+ /* Set int3 to first byte for kprobes */
+ insn_buf[0] = BREAKPOINT_INSTRUCTION;
+ memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+
+ tprm->addr = op->kp.addr;
+ tprm->opcode = insn_buf;
+ tprm->len = RELATIVEJUMP_SIZE;
+}
+
+/*
+ * Recover original instructions and breakpoints from relative jumps.
+ * Caller must call with locking kprobe_mutex.
+ */
+extern void arch_unoptimize_kprobes(struct list_head *oplist,
+ struct list_head *done_list)
+{
+ struct optimized_kprobe *op, *tmp;
+ int c = 0;
+
+ list_for_each_entry_safe(op, tmp, oplist, list) {
+ /* Setup param */
+ setup_unoptimize_kprobe(&jump_poke_params[c],
+ jump_poke_bufs[c].buf, op);
+ list_move(&op->list, done_list);
+ if (++c >= MAX_OPTIMIZE_PROBES)
+ break;
+ }
+
+ /*
+ * text_poke_smp doesn't support NMI/MCE code modifying.
+ * However, since kprobes itself also doesn't support NMI/MCE
+ * code probing, it's not a problem.
+ */
+ text_poke_smp_batch(jump_poke_params, c);
+}
+
+/* Replace a relative jump with a breakpoint (int3). */
+void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+ u8 buf[RELATIVEJUMP_SIZE];
+
+ /* Set int3 to first byte for kprobes */
+ buf[0] = BREAKPOINT_INSTRUCTION;
+ memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+ text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
+}
+
+int __kprobes
+setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
+{
+ struct optimized_kprobe *op;
+
+ if (p->flags & KPROBE_FLAG_OPTIMIZED) {
+ /* This kprobe is really able to run optimized path. */
+ op = container_of(p, struct optimized_kprobe, kp);
+ /* Detour through copied instructions */
+ regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
+ if (!reenter)
+ reset_current_kprobe();
+ preempt_enable_no_resched();
+ return 1;
+ }
+ return 0;
+}
+
+int __kprobes arch_init_optprobes(void)
+{
+ /* Allocate code buffer and parameter array */
+ jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
+ MAX_OPTIMIZE_PROBES, GFP_KERNEL);
+ if (!jump_poke_bufs)
+ return -ENOMEM;
+
+ jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
+ MAX_OPTIMIZE_PROBES, GFP_KERNEL);
+ if (!jump_poke_params) {
+ kfree(jump_poke_bufs);
+ jump_poke_bufs = NULL;
+ return -ENOMEM;
+ }
+
+ return 0;
+}
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7da647d..e2f751e 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -30,16 +30,15 @@
* <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
* <prasanna@in.ibm.com> added function-return probes.
* 2005-May Rusty Lynch <rusty.lynch@intel.com>
- * Added function return probes functionality
+ * Added function return probes functionality
* 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
- * kprobe-booster and kretprobe-booster for i386.
+ * kprobe-booster and kretprobe-booster for i386.
* 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
- * and kretprobe-booster for x86-64
+ * and kretprobe-booster for x86-64
* 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
- * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
- * unified x86 kprobes code.
+ * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
+ * unified x86 kprobes code.
*/
-
#include <linux/kprobes.h>
#include <linux/ptrace.h>
#include <linux/string.h>
@@ -59,6 +58,8 @@
#include <asm/insn.h>
#include <asm/debugreg.h>
+#include "kprobes-common.h"
+
void jprobe_return_end(void);
DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
@@ -108,6 +109,7 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
doesn't switch kernel stack.*/
{NULL, NULL} /* Terminator */
};
+
const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
@@ -123,11 +125,17 @@ static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
}
/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
-static void __kprobes synthesize_reljump(void *from, void *to)
+void __kprobes synthesize_reljump(void *from, void *to)
{
__synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
}
+/* Insert a call instruction at address 'from', which calls address 'to'.*/
+void __kprobes synthesize_relcall(void *from, void *to)
+{
+ __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
+}
+
/*
* Skip the prefixes of the instruction.
*/
@@ -151,7 +159,7 @@ static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
* Returns non-zero if opcode is boostable.
* RIP relative instructions are adjusted at copying time in 64 bits mode
*/
-static int __kprobes can_boost(kprobe_opcode_t *opcodes)
+int __kprobes can_boost(kprobe_opcode_t *opcodes)
{
kprobe_opcode_t opcode;
kprobe_opcode_t *orig_opcodes = opcodes;
@@ -207,13 +215,15 @@ retry:
}
}
-/* Recover the probed instruction at addr for further analysis. */
-static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
+static unsigned long
+__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
{
struct kprobe *kp;
+
kp = get_kprobe((void *)addr);
+ /* There is no probe, return original address */
if (!kp)
- return -EINVAL;
+ return addr;
/*
* Basically, kp->ainsn.insn has an original instruction.
@@ -230,14 +240,29 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
*/
memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
buf[0] = kp->opcode;
- return 0;
+ return (unsigned long)buf;
+}
+
+/*
+ * Recover the probed instruction at addr for further analysis.
+ * Caller must lock kprobes by kprobe_mutex, or disable preemption
+ * for preventing to release referencing kprobes.
+ */
+unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
+{
+ unsigned long __addr;
+
+ __addr = __recover_optprobed_insn(buf, addr);
+ if (__addr != addr)
+ return __addr;
+
+ return __recover_probed_insn(buf, addr);
}
/* Check if paddr is at an instruction boundary */
static int __kprobes can_probe(unsigned long paddr)
{
- int ret;
- unsigned long addr, offset = 0;
+ unsigned long addr, __addr, offset = 0;
struct insn insn;
kprobe_opcode_t buf[MAX_INSN_SIZE];
@@ -247,26 +272,24 @@ static int __kprobes can_probe(unsigned long paddr)
/* Decode instructions */
addr = paddr - offset;
while (addr < paddr) {
- kernel_insn_init(&insn, (void *)addr);
- insn_get_opcode(&insn);
-
/*
* Check if the instruction has been modified by another
* kprobe, in which case we replace the breakpoint by the
* original instruction in our buffer.
+ * Also, jump optimization will change the breakpoint to
+ * relative-jump. Since the relative-jump itself is
+ * normally used, we just go through if there is no kprobe.
*/
- if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
- ret = recover_probed_instruction(buf, addr);
- if (ret)
- /*
- * Another debugging subsystem might insert
- * this breakpoint. In that case, we can't
- * recover it.
- */
- return 0;
- kernel_insn_init(&insn, buf);
- }
+ __addr = recover_probed_instruction(buf, addr);
+ kernel_insn_init(&insn, (void *)__addr);
insn_get_length(&insn);
+
+ /*
+ * Another debugging subsystem might insert this breakpoint.
+ * In that case, we can't recover it.
+ */
+ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ return 0;
addr += insn.length;
}
@@ -299,24 +322,16 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
* If not, return null.
* Only applicable to 64-bit x86.
*/
-static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
+int __kprobes __copy_instruction(u8 *dest, u8 *src)
{
struct insn insn;
- int ret;
kprobe_opcode_t buf[MAX_INSN_SIZE];
- kernel_insn_init(&insn, src);
- if (recover) {
- insn_get_opcode(&insn);
- if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
- ret = recover_probed_instruction(buf,
- (unsigned long)src);
- if (ret)
- return 0;
- kernel_insn_init(&insn, buf);
- }
- }
+ kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src));
insn_get_length(&insn);
+ /* Another subsystem puts a breakpoint, failed to recover */
+ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ return 0;
memcpy(dest, insn.kaddr, insn.length);
#ifdef CONFIG_X86_64
@@ -337,8 +352,7 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
* extension of the original signed 32-bit displacement would
* have given.
*/
- newdisp = (u8 *) src + (s64) insn.displacement.value -
- (u8 *) dest;
+ newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest;
BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
disp = (u8 *) dest + insn_offset_displacement(&insn);
*(s32 *) disp = (s32) newdisp;
@@ -349,18 +363,20 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
static void __kprobes arch_copy_kprobe(struct kprobe *p)
{
+ /* Copy an instruction with recovering if other optprobe modifies it.*/
+ __copy_instruction(p->ainsn.insn, p->addr);
+
/*
- * Copy an instruction without recovering int3, because it will be
- * put by another subsystem.
+ * __copy_instruction can modify the displacement of the instruction,
+ * but it doesn't affect boostable check.
*/
- __copy_instruction(p->ainsn.insn, p->addr, 0);
-
- if (can_boost(p->addr))
+ if (can_boost(p->ainsn.insn))
p->ainsn.boostable = 0;
else
p->ainsn.boostable = -1;
- p->opcode = *p->addr;
+ /* Also, displacement change doesn't affect the first byte */
+ p->opcode = p->ainsn.insn[0];
}
int __kprobes arch_prepare_kprobe(struct kprobe *p)
@@ -442,8 +458,8 @@ static void __kprobes restore_btf(void)
}
}
-void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
- struct pt_regs *regs)
+void __kprobes
+arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
{
unsigned long *sara = stack_addr(regs);
@@ -453,16 +469,8 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
*sara = (unsigned long) &kretprobe_trampoline;
}
-#ifdef CONFIG_OPTPROBES
-static int __kprobes setup_detour_execution(struct kprobe *p,
- struct pt_regs *regs,
- int reenter);
-#else
-#define setup_detour_execution(p, regs, reenter) (0)
-#endif
-
-static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb, int reenter)
+static void __kprobes
+setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter)
{
if (setup_detour_execution(p, regs, reenter))
return;
@@ -504,8 +512,8 @@ static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
* within the handler. We save the original kprobes variables and just single
* step on the instruction of the new probe without calling any user handlers.
*/
-static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
+static int __kprobes
+reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)
{
switch (kcb->kprobe_status) {
case KPROBE_HIT_SSDONE:
@@ -600,69 +608,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
return 0;
}
-#ifdef CONFIG_X86_64
-#define SAVE_REGS_STRING \
- /* Skip cs, ip, orig_ax. */ \
- " subq $24, %rsp\n" \
- " pushq %rdi\n" \
- " pushq %rsi\n" \
- " pushq %rdx\n" \
- " pushq %rcx\n" \
- " pushq %rax\n" \
- " pushq %r8\n" \
- " pushq %r9\n" \
- " pushq %r10\n" \
- " pushq %r11\n" \
- " pushq %rbx\n" \
- " pushq %rbp\n" \
- " pushq %r12\n" \
- " pushq %r13\n" \
- " pushq %r14\n" \
- " pushq %r15\n"
-#define RESTORE_REGS_STRING \
- " popq %r15\n" \
- " popq %r14\n" \
- " popq %r13\n" \
- " popq %r12\n" \
- " popq %rbp\n" \
- " popq %rbx\n" \
- " popq %r11\n" \
- " popq %r10\n" \
- " popq %r9\n" \
- " popq %r8\n" \
- " popq %rax\n" \
- " popq %rcx\n" \
- " popq %rdx\n" \
- " popq %rsi\n" \
- " popq %rdi\n" \
- /* Skip orig_ax, ip, cs */ \
- " addq $24, %rsp\n"
-#else
-#define SAVE_REGS_STRING \
- /* Skip cs, ip, orig_ax and gs. */ \
- " subl $16, %esp\n" \
- " pushl %fs\n" \
- " pushl %es\n" \
- " pushl %ds\n" \
- " pushl %eax\n" \
- " pushl %ebp\n" \
- " pushl %edi\n" \
- " pushl %esi\n" \
- " pushl %edx\n" \
- " pushl %ecx\n" \
- " pushl %ebx\n"
-#define RESTORE_REGS_STRING \
- " popl %ebx\n" \
- " popl %ecx\n" \
- " popl %edx\n" \
- " popl %esi\n" \
- " popl %edi\n" \
- " popl %ebp\n" \
- " popl %eax\n" \
- /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
- " addl $24, %esp\n"
-#endif
-
/*
* When a retprobed function returns, this code saves registers and
* calls trampoline_handler() runs, which calls the kretprobe's handler.
@@ -816,8 +761,8 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
* jump instruction after the copied instruction, that jumps to the next
* instruction after the probepoint.
*/
-static void __kprobes resume_execution(struct kprobe *p,
- struct pt_regs *regs, struct kprobe_ctlblk *kcb)
+static void __kprobes
+resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)
{
unsigned long *tos = stack_addr(regs);
unsigned long copy_ip = (unsigned long)p->ainsn.insn;
@@ -996,8 +941,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
/*
* Wrapper routine for handling exceptions.
*/
-int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
- unsigned long val, void *data)
+int __kprobes
+kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data)
{
struct die_args *args = data;
int ret = NOTIFY_DONE;
@@ -1092,9 +1037,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
"current sp %p does not match saved sp %p\n",
stack_addr(regs), kcb->jprobe_saved_sp);
printk(KERN_ERR "Saved registers for jprobe %p\n", jp);
- show_registers(saved_regs);
+ show_regs(saved_regs);
printk(KERN_ERR "Current registers\n");
- show_registers(regs);
+ show_regs(regs);
BUG();
}
*regs = kcb->jprobe_saved_regs;
@@ -1107,466 +1052,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
return 0;
}
-
-#ifdef CONFIG_OPTPROBES
-
-/* Insert a call instruction at address 'from', which calls address 'to'.*/
-static void __kprobes synthesize_relcall(void *from, void *to)
-{
- __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
-}
-
-/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
-static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
- unsigned long val)
-{
-#ifdef CONFIG_X86_64
- *addr++ = 0x48;
- *addr++ = 0xbf;
-#else
- *addr++ = 0xb8;
-#endif
- *(unsigned long *)addr = val;
-}
-
-static void __used __kprobes kprobes_optinsn_template_holder(void)
-{
- asm volatile (
- ".global optprobe_template_entry\n"
- "optprobe_template_entry: \n"
-#ifdef CONFIG_X86_64
- /* We don't bother saving the ss register */
- " pushq %rsp\n"
- " pushfq\n"
- SAVE_REGS_STRING
- " movq %rsp, %rsi\n"
- ".global optprobe_template_val\n"
- "optprobe_template_val: \n"
- ASM_NOP5
- ASM_NOP5
- ".global optprobe_template_call\n"
- "optprobe_template_call: \n"
- ASM_NOP5
- /* Move flags to rsp */
- " movq 144(%rsp), %rdx\n"
- " movq %rdx, 152(%rsp)\n"
- RESTORE_REGS_STRING
- /* Skip flags entry */
- " addq $8, %rsp\n"
- " popfq\n"
-#else /* CONFIG_X86_32 */
- " pushf\n"
- SAVE_REGS_STRING
- " movl %esp, %edx\n"
- ".global optprobe_template_val\n"
- "optprobe_template_val: \n"
- ASM_NOP5
- ".global optprobe_template_call\n"
- "optprobe_template_call: \n"
- ASM_NOP5
- RESTORE_REGS_STRING
- " addl $4, %esp\n" /* skip cs */
- " popf\n"
-#endif
- ".global optprobe_template_end\n"
- "optprobe_template_end: \n");
-}
-
-#define TMPL_MOVE_IDX \
- ((long)&optprobe_template_val - (long)&optprobe_template_entry)
-#define TMPL_CALL_IDX \
- ((long)&optprobe_template_call - (long)&optprobe_template_entry)
-#define TMPL_END_IDX \
- ((long)&optprobe_template_end - (long)&optprobe_template_entry)
-
-#define INT3_SIZE sizeof(kprobe_opcode_t)
-
-/* Optimized kprobe call back function: called from optinsn */
-static void __kprobes optimized_callback(struct optimized_kprobe *op,
- struct pt_regs *regs)
-{
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
- unsigned long flags;
-
- /* This is possible if op is under delayed unoptimizing */
- if (kprobe_disabled(&op->kp))
- return;
-
- local_irq_save(flags);
- if (kprobe_running()) {
- kprobes_inc_nmissed_count(&op->kp);
- } else {
- /* Save skipped registers */
-#ifdef CONFIG_X86_64
- regs->cs = __KERNEL_CS;
-#else
- regs->cs = __KERNEL_CS | get_kernel_rpl();
- regs->gs = 0;
-#endif
- regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
- regs->orig_ax = ~0UL;
-
- __this_cpu_write(current_kprobe, &op->kp);
- kcb->kprobe_status = KPROBE_HIT_ACTIVE;
- opt_pre_handler(&op->kp, regs);
- __this_cpu_write(current_kprobe, NULL);
- }
- local_irq_restore(flags);
-}
-
-static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
-{
- int len = 0, ret;
-
- while (len < RELATIVEJUMP_SIZE) {
- ret = __copy_instruction(dest + len, src + len, 1);
- if (!ret || !can_boost(dest + len))
- return -EINVAL;
- len += ret;
- }
- /* Check whether the address range is reserved */
- if (ftrace_text_reserved(src, src + len - 1) ||
- alternatives_text_reserved(src, src + len - 1) ||
- jump_label_text_reserved(src, src + len - 1))
- return -EBUSY;
-
- return len;
-}
-
-/* Check whether insn is indirect jump */
-static int __kprobes insn_is_indirect_jump(struct insn *insn)
-{
- return ((insn->opcode.bytes[0] == 0xff &&
- (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
- insn->opcode.bytes[0] == 0xea); /* Segment based jump */
-}
-
-/* Check whether insn jumps into specified address range */
-static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
-{
- unsigned long target = 0;
-
- switch (insn->opcode.bytes[0]) {
- case 0xe0: /* loopne */
- case 0xe1: /* loope */
- case 0xe2: /* loop */
- case 0xe3: /* jcxz */
- case 0xe9: /* near relative jump */
- case 0xeb: /* short relative jump */
- break;
- case 0x0f:
- if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
- break;
- return 0;
- default:
- if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
- break;
- return 0;
- }
- target = (unsigned long)insn->next_byte + insn->immediate.value;
-
- return (start <= target && target <= start + len);
-}
-
-/* Decode whole function to ensure any instructions don't jump into target */
-static int __kprobes can_optimize(unsigned long paddr)
-{
- int ret;
- unsigned long addr, size = 0, offset = 0;
- struct insn insn;
- kprobe_opcode_t buf[MAX_INSN_SIZE];
-
- /* Lookup symbol including addr */
- if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
- return 0;
-
- /*
- * Do not optimize in the entry code due to the unstable
- * stack handling.
- */
- if ((paddr >= (unsigned long )__entry_text_start) &&
- (paddr < (unsigned long )__entry_text_end))
- return 0;
-
- /* Check there is enough space for a relative jump. */
- if (size - offset < RELATIVEJUMP_SIZE)
- return 0;
-
- /* Decode instructions */
- addr = paddr - offset;
- while (addr < paddr - offset + size) { /* Decode until function end */
- if (search_exception_tables(addr))
- /*
- * Since some fixup code will jumps into this function,
- * we can't optimize kprobe in this function.
- */
- return 0;
- kernel_insn_init(&insn, (void *)addr);
- insn_get_opcode(&insn);
- if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
- ret = recover_probed_instruction(buf, addr);
- if (ret)
- return 0;
- kernel_insn_init(&insn, buf);
- }
- insn_get_length(&insn);
- /* Recover address */
- insn.kaddr = (void *)addr;
- insn.next_byte = (void *)(addr + insn.length);
- /* Check any instructions don't jump into target */
- if (insn_is_indirect_jump(&insn) ||
- insn_jump_into_range(&insn, paddr + INT3_SIZE,
- RELATIVE_ADDR_SIZE))
- return 0;
- addr += insn.length;
- }
-
- return 1;
-}
-
-/* Check optimized_kprobe can actually be optimized. */
-int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
-{
- int i;
- struct kprobe *p;
-
- for (i = 1; i < op->optinsn.size; i++) {
- p = get_kprobe(op->kp.addr + i);
- if (p && !kprobe_disabled(p))
- return -EEXIST;
- }
-
- return 0;
-}
-
-/* Check the addr is within the optimized instructions. */
-int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
- unsigned long addr)
-{
- return ((unsigned long)op->kp.addr <= addr &&
- (unsigned long)op->kp.addr + op->optinsn.size > addr);
-}
-
-/* Free optimized instruction slot */
-static __kprobes
-void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
-{
- if (op->optinsn.insn) {
- free_optinsn_slot(op->optinsn.insn, dirty);
- op->optinsn.insn = NULL;
- op->optinsn.size = 0;
- }
-}
-
-void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
-{
- __arch_remove_optimized_kprobe(op, 1);
-}
-
-/*
- * Copy replacing target instructions
- * Target instructions MUST be relocatable (checked inside)
- */
-int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
-{
- u8 *buf;
- int ret;
- long rel;
-
- if (!can_optimize((unsigned long)op->kp.addr))
- return -EILSEQ;
-
- op->optinsn.insn = get_optinsn_slot();
- if (!op->optinsn.insn)
- return -ENOMEM;
-
- /*
- * Verify if the address gap is in 2GB range, because this uses
- * a relative jump.
- */
- rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
- if (abs(rel) > 0x7fffffff)
- return -ERANGE;
-
- buf = (u8 *)op->optinsn.insn;
-
- /* Copy instructions into the out-of-line buffer */
- ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
- if (ret < 0) {
- __arch_remove_optimized_kprobe(op, 0);
- return ret;
- }
- op->optinsn.size = ret;
-
- /* Copy arch-dep-instance from template */
- memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
-
- /* Set probe information */
- synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
-
- /* Set probe function call */
- synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
-
- /* Set returning jmp instruction at the tail of out-of-line buffer */
- synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
- (u8 *)op->kp.addr + op->optinsn.size);
-
- flush_icache_range((unsigned long) buf,
- (unsigned long) buf + TMPL_END_IDX +
- op->optinsn.size + RELATIVEJUMP_SIZE);
- return 0;
-}
-
-#define MAX_OPTIMIZE_PROBES 256
-static struct text_poke_param *jump_poke_params;
-static struct jump_poke_buffer {
- u8 buf[RELATIVEJUMP_SIZE];
-} *jump_poke_bufs;
-
-static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
- u8 *insn_buf,
- struct optimized_kprobe *op)
-{
- s32 rel = (s32)((long)op->optinsn.insn -
- ((long)op->kp.addr + RELATIVEJUMP_SIZE));
-
- /* Backup instructions which will be replaced by jump address */
- memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
- RELATIVE_ADDR_SIZE);
-
- insn_buf[0] = RELATIVEJUMP_OPCODE;
- *(s32 *)(&insn_buf[1]) = rel;
-
- tprm->addr = op->kp.addr;
- tprm->opcode = insn_buf;
- tprm->len = RELATIVEJUMP_SIZE;
-}
-
-/*
- * Replace breakpoints (int3) with relative jumps.
- * Caller must call with locking kprobe_mutex and text_mutex.
- */
-void __kprobes arch_optimize_kprobes(struct list_head *oplist)
-{
- struct optimized_kprobe *op, *tmp;
- int c = 0;
-
- list_for_each_entry_safe(op, tmp, oplist, list) {
- WARN_ON(kprobe_disabled(&op->kp));
- /* Setup param */
- setup_optimize_kprobe(&jump_poke_params[c],
- jump_poke_bufs[c].buf, op);
- list_del_init(&op->list);
- if (++c >= MAX_OPTIMIZE_PROBES)
- break;
- }
-
- /*
- * text_poke_smp doesn't support NMI/MCE code modifying.
- * However, since kprobes itself also doesn't support NMI/MCE
- * code probing, it's not a problem.
- */
- text_poke_smp_batch(jump_poke_params, c);
-}
-
-static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
- u8 *insn_buf,
- struct optimized_kprobe *op)
-{
- /* Set int3 to first byte for kprobes */
- insn_buf[0] = BREAKPOINT_INSTRUCTION;
- memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
-
- tprm->addr = op->kp.addr;
- tprm->opcode = insn_buf;
- tprm->len = RELATIVEJUMP_SIZE;
-}
-
-/*
- * Recover original instructions and breakpoints from relative jumps.
- * Caller must call with locking kprobe_mutex.
- */
-extern void arch_unoptimize_kprobes(struct list_head *oplist,
- struct list_head *done_list)
-{
- struct optimized_kprobe *op, *tmp;
- int c = 0;
-
- list_for_each_entry_safe(op, tmp, oplist, list) {
- /* Setup param */
- setup_unoptimize_kprobe(&jump_poke_params[c],
- jump_poke_bufs[c].buf, op);
- list_move(&op->list, done_list);
- if (++c >= MAX_OPTIMIZE_PROBES)
- break;
- }
-
- /*
- * text_poke_smp doesn't support NMI/MCE code modifying.
- * However, since kprobes itself also doesn't support NMI/MCE
- * code probing, it's not a problem.
- */
- text_poke_smp_batch(jump_poke_params, c);
-}
-
-/* Replace a relative jump with a breakpoint (int3). */
-void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
-{
- u8 buf[RELATIVEJUMP_SIZE];
-
- /* Set int3 to first byte for kprobes */
- buf[0] = BREAKPOINT_INSTRUCTION;
- memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
- text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
-}
-
-static int __kprobes setup_detour_execution(struct kprobe *p,
- struct pt_regs *regs,
- int reenter)
-{
- struct optimized_kprobe *op;
-
- if (p->flags & KPROBE_FLAG_OPTIMIZED) {
- /* This kprobe is really able to run optimized path. */
- op = container_of(p, struct optimized_kprobe, kp);
- /* Detour through copied instructions */
- regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
- if (!reenter)
- reset_current_kprobe();
- preempt_enable_no_resched();
- return 1;
- }
- return 0;
-}
-
-static int __kprobes init_poke_params(void)
-{
- /* Allocate code buffer and parameter array */
- jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
- MAX_OPTIMIZE_PROBES, GFP_KERNEL);
- if (!jump_poke_bufs)
- return -ENOMEM;
-
- jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
- MAX_OPTIMIZE_PROBES, GFP_KERNEL);
- if (!jump_poke_params) {
- kfree(jump_poke_bufs);
- jump_poke_bufs = NULL;
- return -ENOMEM;
- }
-
- return 0;
-}
-#else /* !CONFIG_OPTPROBES */
-static int __kprobes init_poke_params(void)
-{
- return 0;
-}
-#endif
-
int __init arch_init_kprobes(void)
{
- return init_poke_params();
+ return arch_init_optprobes();
}
int __kprobes arch_trampoline_kprobe(struct kprobe *p)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index f0c6fd6..e554e5a 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -38,6 +38,7 @@
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>
+#include <asm/idle.h>
static int kvmapf = 1;
@@ -78,7 +79,6 @@ struct kvm_task_sleep_node {
u32 token;
int cpu;
bool halted;
- struct mm_struct *mm;
};
static struct kvm_task_sleep_head {
@@ -125,9 +125,7 @@ void kvm_async_pf_task_wait(u32 token)
n.token = token;
n.cpu = smp_processor_id();
- n.mm = current->active_mm;
n.halted = idle || preempt_count() > 1;
- atomic_inc(&n.mm->mm_count);
init_waitqueue_head(&n.wq);
hlist_add_head(&n.link, &b->list);
spin_unlock(&b->lock);
@@ -160,9 +158,6 @@ EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
hlist_del_init(&n->link);
- if (!n->mm)
- return;
- mmdrop(n->mm);
if (n->halted)
smp_send_reschedule(n->cpu);
else if (waitqueue_active(&n->wq))
@@ -206,7 +201,7 @@ again:
* async PF was not yet handled.
* Add dummy entry for the token.
*/
- n = kmalloc(sizeof(*n), GFP_ATOMIC);
+ n = kzalloc(sizeof(*n), GFP_ATOMIC);
if (!n) {
/*
* Allocation failed! Busy wait while other cpu
@@ -218,7 +213,6 @@ again:
}
n->token = token;
n->cpu = smp_processor_id();
- n->mm = NULL;
init_waitqueue_head(&n->wq);
hlist_add_head(&n->link, &b->list);
} else
@@ -253,7 +247,10 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
kvm_async_pf_task_wait((u32)read_cr2());
break;
case KVM_PV_REASON_PAGE_READY:
+ rcu_irq_enter();
+ exit_idle();
kvm_async_pf_task_wake((u32)read_cr2());
+ rcu_irq_exit();
break;
}
}
@@ -438,9 +435,9 @@ void __init kvm_guest_init(void)
static __init int activate_jump_labels(void)
{
if (has_steal_clock) {
- jump_label_inc(&paravirt_steal_enabled);
+ static_key_slow_inc(&paravirt_steal_enabled);
if (steal_acc)
- jump_label_inc(&paravirt_steal_rq_enabled);
+ static_key_slow_inc(&paravirt_steal_rq_enabled);
}
return 0;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 44842d7..f8492da6 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -136,6 +136,15 @@ int kvm_register_clock(char *txt)
return ret;
}
+static void kvm_save_sched_clock_state(void)
+{
+}
+
+static void kvm_restore_sched_clock_state(void)
+{
+ kvm_register_clock("primary cpu clock, resume");
+}
+
#ifdef CONFIG_X86_LOCAL_APIC
static void __cpuinit kvm_setup_secondary_clock(void)
{
@@ -144,8 +153,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
* we shouldn't fail.
*/
WARN_ON(kvm_register_clock("secondary cpu clock"));
- /* ok, done with our trickery, call native */
- setup_secondary_APIC_clock();
}
#endif
@@ -194,9 +201,11 @@ void __init kvmclock_init(void)
x86_platform.get_wallclock = kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
- x86_cpuinit.setup_percpu_clockev =
+ x86_cpuinit.early_percpu_clock_init =
kvm_setup_secondary_clock;
#endif
+ x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
+ x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC
machine_ops.crash_shutdown = kvm_crash_shutdown;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ea69726..ebc9873 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -15,7 +15,6 @@
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
-#include <asm/system.h>
#include <asm/ldt.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index a3fa43b..5b19e4d 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -23,7 +23,6 @@
#include <asm/apic.h>
#include <asm/cpufeature.h>
#include <asm/desc.h>
-#include <asm/system.h>
#include <asm/cacheflush.h>
#include <asm/debugreg.h>
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 177183c..7eb1e2b 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -43,7 +43,6 @@
#include <linux/mca.h>
#include <linux/kprobes.h>
#include <linux/slab.h>
-#include <asm/system.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/mman.h>
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 73465aa..8a2ce8f 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -82,11 +82,6 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
- if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
- pr_warning("CPU%d: family %d not supported\n", cpu, c->x86);
- return -1;
- }
-
csig->rev = c->microcode;
pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
@@ -380,6 +375,13 @@ static struct microcode_ops microcode_amd_ops = {
struct microcode_ops * __init init_amd_microcode(void)
{
+ struct cpuinfo_x86 *c = &cpu_data(0);
+
+ if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
+ pr_warning("AMD CPU family 0x%x not supported\n", c->x86);
+ return NULL;
+ }
+
patch = (void *)get_zeroed_page(GFP_KERNEL);
if (!patch)
return NULL;
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index fda91c3..fbdfc69 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -86,6 +86,7 @@
#include <asm/microcode.h>
#include <asm/processor.h>
+#include <asm/cpu_device_id.h>
MODULE_DESCRIPTION("Microcode Update Driver");
MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -298,12 +299,11 @@ static ssize_t reload_store(struct device *dev,
{
unsigned long val;
int cpu = dev->id;
- int ret = 0;
- char *end;
+ ssize_t ret = 0;
- val = simple_strtoul(buf, &end, 0);
- if (end == buf)
- return -EINVAL;
+ ret = kstrtoul(buf, 0, &val);
+ if (ret)
+ return ret;
if (val == 1) {
get_online_cpus();
@@ -418,10 +418,8 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif)
if (err)
return err;
- if (microcode_init_cpu(cpu) == UCODE_ERROR) {
- sysfs_remove_group(&dev->kobj, &mc_attr_group);
+ if (microcode_init_cpu(cpu) == UCODE_ERROR)
return -EINVAL;
- }
return err;
}
@@ -504,6 +502,20 @@ static struct notifier_block __refdata mc_cpu_notifier = {
.notifier_call = mc_cpu_callback,
};
+#ifdef MODULE
+/* Autoload on Intel and AMD systems */
+static const struct x86_cpu_id microcode_id[] = {
+#ifdef CONFIG_MICROCODE_INTEL
+ { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, },
+#endif
+#ifdef CONFIG_MICROCODE_AMD
+ { X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, },
+#endif
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, microcode_id);
+#endif
+
static int __init microcode_init(void)
{
struct cpuinfo_x86 *c = &cpu_data(0);
@@ -513,11 +525,11 @@ static int __init microcode_init(void)
microcode_ops = init_intel_microcode();
else if (c->x86_vendor == X86_VENDOR_AMD)
microcode_ops = init_amd_microcode();
-
- if (!microcode_ops) {
+ else
pr_err("no support for this CPU vendor\n");
+
+ if (!microcode_ops)
return -ENODEV;
- }
microcode_pdev = platform_device_register_simple("microcode", -1,
NULL, 0);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 3ca42d0..0327e2b 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -147,12 +147,6 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
memset(csig, 0, sizeof(*csig));
- if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
- cpu_has(c, X86_FEATURE_IA64)) {
- pr_err("CPU%d not a capable Intel processor\n", cpu_num);
- return -1;
- }
-
csig->sig = cpuid_eax(0x00000001);
if ((c->x86_model >= 5) || (c->x86 > 6)) {
@@ -463,6 +457,14 @@ static struct microcode_ops microcode_intel_ops = {
struct microcode_ops * __init init_intel_microcode(void)
{
+ struct cpuinfo_x86 *c = &cpu_data(0);
+
+ if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
+ cpu_has(c, X86_FEATURE_IA64)) {
+ pr_err("Intel CPU family 0x%x not supported\n", c->x86);
+ return NULL;
+ }
+
return &microcode_intel_ops;
}
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 925179f..f21fd94 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -26,7 +26,6 @@
#include <linux/gfp.h>
#include <linux/jump_label.h>
-#include <asm/system.h>
#include <asm/page.h>
#include <asm/pgtable.h>
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 9635676..eb11369 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -40,7 +40,6 @@
#include <asm/processor.h>
#include <asm/msr.h>
-#include <asm/system.h>
static struct class *msr_class;
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 47acaf3..bffdfd4 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -31,14 +31,6 @@
#include <asm/nmi.h>
#include <asm/x86_init.h>
-#define NMI_MAX_NAMELEN 16
-struct nmiaction {
- struct list_head list;
- nmi_handler_t handler;
- unsigned int flags;
- char *name;
-};
-
struct nmi_desc {
spinlock_t lock;
struct list_head head;
@@ -54,6 +46,14 @@ static struct nmi_desc nmi_desc[NMI_MAX] =
.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
.head = LIST_HEAD_INIT(nmi_desc[1].head),
},
+ {
+ .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
+ .head = LIST_HEAD_INIT(nmi_desc[2].head),
+ },
+ {
+ .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
+ .head = LIST_HEAD_INIT(nmi_desc[3].head),
+ },
};
@@ -84,7 +84,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic);
#define nmi_to_desc(type) (&nmi_desc[type])
-static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
+static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
{
struct nmi_desc *desc = nmi_to_desc(type);
struct nmiaction *a;
@@ -107,11 +107,14 @@ static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs,
return handled;
}
-static int __setup_nmi(unsigned int type, struct nmiaction *action)
+int __register_nmi_handler(unsigned int type, struct nmiaction *action)
{
struct nmi_desc *desc = nmi_to_desc(type);
unsigned long flags;
+ if (!action->handler)
+ return -EINVAL;
+
spin_lock_irqsave(&desc->lock, flags);
/*
@@ -120,6 +123,8 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action)
* to manage expectations
*/
WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head));
+ WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
+ WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));
/*
* some handlers need to be executed first otherwise a fake
@@ -133,8 +138,9 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action)
spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
+EXPORT_SYMBOL(__register_nmi_handler);
-static struct nmiaction *__free_nmi(unsigned int type, const char *name)
+void unregister_nmi_handler(unsigned int type, const char *name)
{
struct nmi_desc *desc = nmi_to_desc(type);
struct nmiaction *n;
@@ -157,61 +163,16 @@ static struct nmiaction *__free_nmi(unsigned int type, const char *name)
spin_unlock_irqrestore(&desc->lock, flags);
synchronize_rcu();
- return (n);
}
-
-int register_nmi_handler(unsigned int type, nmi_handler_t handler,
- unsigned long nmiflags, const char *devname)
-{
- struct nmiaction *action;
- int retval = -ENOMEM;
-
- if (!handler)
- return -EINVAL;
-
- action = kzalloc(sizeof(struct nmiaction), GFP_KERNEL);
- if (!action)
- goto fail_action;
-
- action->handler = handler;
- action->flags = nmiflags;
- action->name = kstrndup(devname, NMI_MAX_NAMELEN, GFP_KERNEL);
- if (!action->name)
- goto fail_action_name;
-
- retval = __setup_nmi(type, action);
-
- if (retval)
- goto fail_setup_nmi;
-
- return retval;
-
-fail_setup_nmi:
- kfree(action->name);
-fail_action_name:
- kfree(action);
-fail_action:
-
- return retval;
-}
-EXPORT_SYMBOL_GPL(register_nmi_handler);
-
-void unregister_nmi_handler(unsigned int type, const char *name)
-{
- struct nmiaction *a;
-
- a = __free_nmi(type, name);
- if (a) {
- kfree(a->name);
- kfree(a);
- }
-}
-
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
-static notrace __kprobes void
+static __kprobes void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
+ /* check to see if anyone registered against these types of errors */
+ if (nmi_handle(NMI_SERR, regs, false))
+ return;
+
pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
reason, smp_processor_id());
@@ -236,15 +197,19 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
outb(reason, NMI_REASON_PORT);
}
-static notrace __kprobes void
+static __kprobes void
io_check_error(unsigned char reason, struct pt_regs *regs)
{
unsigned long i;
+ /* check to see if anyone registered against these types of errors */
+ if (nmi_handle(NMI_IO_CHECK, regs, false))
+ return;
+
pr_emerg(
"NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
reason, smp_processor_id());
- show_registers(regs);
+ show_regs(regs);
if (panic_on_io_nmi)
panic("NMI IOCK error: Not continuing");
@@ -263,7 +228,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
outb(reason, NMI_REASON_PORT);
}
-static notrace __kprobes void
+static __kprobes void
unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
int handled;
@@ -305,7 +270,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
static DEFINE_PER_CPU(bool, swallow_nmi);
static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
-static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
+static __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
int handled;
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 0d01a8e..e31bf8d 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -12,6 +12,8 @@
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/percpu.h>
#include <asm/apic.h>
#include <asm/nmi.h>
@@ -20,35 +22,35 @@
#define FAILURE 1
#define TIMEOUT 2
-static int nmi_fail;
+static int __initdata nmi_fail;
/* check to see if NMI IPIs work on this machine */
-static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly;
+static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata;
-static int testcase_total;
-static int testcase_successes;
-static int expected_testcase_failures;
-static int unexpected_testcase_failures;
-static int unexpected_testcase_unknowns;
+static int __initdata testcase_total;
+static int __initdata testcase_successes;
+static int __initdata expected_testcase_failures;
+static int __initdata unexpected_testcase_failures;
+static int __initdata unexpected_testcase_unknowns;
-static int nmi_unk_cb(unsigned int val, struct pt_regs *regs)
+static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs)
{
unexpected_testcase_unknowns++;
return NMI_HANDLED;
}
-static void init_nmi_testsuite(void)
+static void __init init_nmi_testsuite(void)
{
/* trap all the unknown NMIs we may generate */
register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
}
-static void cleanup_nmi_testsuite(void)
+static void __init cleanup_nmi_testsuite(void)
{
unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk");
}
-static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
+static int __init test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
{
int cpu = raw_smp_processor_id();
@@ -58,7 +60,7 @@ static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
return NMI_DONE;
}
-static void test_nmi_ipi(struct cpumask *mask)
+static void __init test_nmi_ipi(struct cpumask *mask)
{
unsigned long timeout;
@@ -86,7 +88,7 @@ static void test_nmi_ipi(struct cpumask *mask)
return;
}
-static void remote_ipi(void)
+static void __init remote_ipi(void)
{
cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
@@ -94,19 +96,19 @@ static void remote_ipi(void)
test_nmi_ipi(to_cpumask(nmi_ipi_mask));
}
-static void local_ipi(void)
+static void __init local_ipi(void)
{
cpumask_clear(to_cpumask(nmi_ipi_mask));
cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
test_nmi_ipi(to_cpumask(nmi_ipi_mask));
}
-static void reset_nmi(void)
+static void __init reset_nmi(void)
{
nmi_fail = 0;
}
-static void dotest(void (*testcase_fn)(void), int expected)
+static void __init dotest(void (*testcase_fn)(void), int expected)
{
testcase_fn();
/*
@@ -116,27 +118,27 @@ static void dotest(void (*testcase_fn)(void), int expected)
unexpected_testcase_failures++;
if (nmi_fail == FAILURE)
- printk("FAILED |");
+ printk(KERN_CONT "FAILED |");
else if (nmi_fail == TIMEOUT)
- printk("TIMEOUT|");
+ printk(KERN_CONT "TIMEOUT|");
else
- printk("ERROR |");
+ printk(KERN_CONT "ERROR |");
dump_stack();
} else {
testcase_successes++;
- printk(" ok |");
+ printk(KERN_CONT " ok |");
}
testcase_total++;
reset_nmi();
}
-static inline void print_testname(const char *testname)
+static inline void __init print_testname(const char *testname)
{
printk("%12s:", testname);
}
-void nmi_selftest(void)
+void __init nmi_selftest(void)
{
init_nmi_testsuite();
@@ -149,10 +151,10 @@ void nmi_selftest(void)
print_testname("remote IPI");
dotest(remote_ipi, SUCCESS);
- printk("\n");
+ printk(KERN_CONT "\n");
print_testname("local IPI");
dotest(local_ipi, SUCCESS);
- printk("\n");
+ printk(KERN_CONT "\n");
cleanup_nmi_testsuite();
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index d90272e..9ce8859 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -26,6 +26,7 @@
#include <asm/bug.h>
#include <asm/paravirt.h>
+#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/pgtable.h>
@@ -37,6 +38,7 @@
#include <asm/apic.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
+#include <asm/special_insns.h>
/* nop stub */
void _paravirt_nop(void)
@@ -202,8 +204,8 @@ static void native_flush_tlb_single(unsigned long addr)
__native_flush_tlb_single(addr);
}
-struct jump_label_key paravirt_steal_enabled;
-struct jump_label_key paravirt_steal_rq_enabled;
+struct static_key paravirt_steal_enabled;
+struct static_key paravirt_steal_rq_enabled;
static u64 native_steal_clock(int cpu)
{
@@ -239,16 +241,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
static inline void enter_lazy(enum paravirt_lazy_mode mode)
{
- BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
+ BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
- percpu_write(paravirt_lazy_mode, mode);
+ this_cpu_write(paravirt_lazy_mode, mode);
}
static void leave_lazy(enum paravirt_lazy_mode mode)
{
- BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
+ BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode);
- percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
+ this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
}
void paravirt_enter_lazy_mmu(void)
@@ -265,7 +267,7 @@ void paravirt_start_context_switch(struct task_struct *prev)
{
BUG_ON(preemptible());
- if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
+ if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
arch_leave_lazy_mmu_mode();
set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
}
@@ -287,7 +289,7 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
if (in_interrupt())
return PARAVIRT_LAZY_NONE;
- return percpu_read(paravirt_lazy_mode);
+ return this_cpu_read(paravirt_lazy_mode);
}
void arch_flush_lazy_mmu_mode(void)
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 726494b..b72838b 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -42,7 +42,6 @@
#include <asm/calgary.h>
#include <asm/tce.h>
#include <asm/pci-direct.h>
-#include <asm/system.h>
#include <asm/dma.h>
#include <asm/rio.h>
#include <asm/bios_ebda.h>
@@ -431,7 +430,7 @@ static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr,
}
static void* calgary_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t flag)
+ dma_addr_t *dma_handle, gfp_t flag, struct dma_attrs *attrs)
{
void *ret = NULL;
dma_addr_t mapping;
@@ -464,7 +463,8 @@ error:
}
static void calgary_free_coherent(struct device *dev, size_t size,
- void *vaddr, dma_addr_t dma_handle)
+ void *vaddr, dma_addr_t dma_handle,
+ struct dma_attrs *attrs)
{
unsigned int npages;
struct iommu_table *tbl = find_iommu_table(dev);
@@ -477,8 +477,8 @@ static void calgary_free_coherent(struct device *dev, size_t size,
}
static struct dma_map_ops calgary_dma_ops = {
- .alloc_coherent = calgary_alloc_coherent,
- .free_coherent = calgary_free_coherent,
+ .alloc = calgary_alloc_coherent,
+ .free = calgary_free_coherent,
.map_sg = calgary_map_sg,
.unmap_sg = calgary_unmap_sg,
.map_page = calgary_map_page,
@@ -1480,8 +1480,9 @@ cleanup:
static int __init calgary_parse_options(char *p)
{
unsigned int bridge;
+ unsigned long val;
size_t len;
- char* endp;
+ ssize_t ret;
while (*p) {
if (!strncmp(p, "64k", 3))
@@ -1512,10 +1513,11 @@ static int __init calgary_parse_options(char *p)
++p;
if (*p == '\0')
break;
- bridge = simple_strtoul(p, &endp, 0);
- if (p == endp)
+ ret = kstrtoul(p, 0, &val);
+ if (ret)
break;
+ bridge = val;
if (bridge < MAX_PHB_BUS_NUM) {
printk(KERN_INFO "Calgary: disabling "
"translation for PHB %#x\n", bridge);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 1c4d769..3003250 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -96,7 +96,8 @@ void __init pci_iommu_alloc(void)
}
}
void *dma_generic_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_addr, gfp_t flag)
+ dma_addr_t *dma_addr, gfp_t flag,
+ struct dma_attrs *attrs)
{
unsigned long dma_mask;
struct page *page;
@@ -262,10 +263,11 @@ rootfs_initcall(pci_iommu_init);
static __devinit void via_no_dac(struct pci_dev *dev)
{
- if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
+ if (forbid_dac == 0) {
dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
forbid_dac = 1;
}
}
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID,
+ PCI_CLASS_BRIDGE_PCI, 8, via_no_dac);
#endif
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 3af4af8..f960506 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -75,7 +75,7 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
}
static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
- dma_addr_t dma_addr)
+ dma_addr_t dma_addr, struct dma_attrs *attrs)
{
free_pages((unsigned long)vaddr, get_order(size));
}
@@ -96,8 +96,8 @@ static void nommu_sync_sg_for_device(struct device *dev,
}
struct dma_map_ops nommu_dma_ops = {
- .alloc_coherent = dma_generic_alloc_coherent,
- .free_coherent = nommu_free_coherent,
+ .alloc = dma_generic_alloc_coherent,
+ .free = nommu_free_coherent,
.map_sg = nommu_map_sg,
.map_page = nommu_map_page,
.sync_single_for_device = nommu_sync_single_for_device,
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 8f972cb..6c483ba 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -15,21 +15,30 @@
int swiotlb __read_mostly;
static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
- dma_addr_t *dma_handle, gfp_t flags)
+ dma_addr_t *dma_handle, gfp_t flags,
+ struct dma_attrs *attrs)
{
void *vaddr;
- vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags);
+ vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags,
+ attrs);
if (vaddr)
return vaddr;
return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags);
}
+static void x86_swiotlb_free_coherent(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t dma_addr,
+ struct dma_attrs *attrs)
+{
+ swiotlb_free_coherent(dev, size, vaddr, dma_addr);
+}
+
static struct dma_map_ops swiotlb_dma_ops = {
.mapping_error = swiotlb_dma_mapping_error,
- .alloc_coherent = x86_swiotlb_alloc_coherent,
- .free_coherent = swiotlb_free_coherent,
+ .alloc = x86_swiotlb_alloc_coherent,
+ .free = x86_swiotlb_free_coherent,
.sync_single_for_cpu = swiotlb_sync_single_for_cpu,
.sync_single_for_device = swiotlb_sync_single_for_device,
.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index 34e06e8..0bc72e2 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -12,6 +12,7 @@
#include <linux/pci.h>
#include <linux/export.h>
+#include <asm/probe_roms.h>
#include <asm/pci-direct.h>
#include <asm/e820.h>
#include <asm/mmzone.h>
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 15763af..f2e8c0f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -12,16 +12,46 @@
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
+#include <linux/stackprotector.h>
+#include <linux/tick.h>
+#include <linux/cpuidle.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
-#include <asm/system.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
+#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
+#include <asm/nmi.h>
+
+/*
+ * per-CPU TSS segments. Threads are completely 'soft' on Linux,
+ * no more per-task TSS's. The TSS size is kept cacheline-aligned
+ * so they are allowed to end up in the .data..cacheline_aligned
+ * section. Since TSS's are completely CPU-local, we want them
+ * on exact cacheline boundaries, to eliminate cacheline ping-pong.
+ */
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
+
+#ifdef CONFIG_X86_64
+static DEFINE_PER_CPU(unsigned char, is_idle);
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+
+void idle_notifier_register(struct notifier_block *n)
+{
+ atomic_notifier_chain_register(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_register);
+
+void idle_notifier_unregister(struct notifier_block *n)
+{
+ atomic_notifier_chain_unregister(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_unregister);
+#endif
struct kmem_cache *task_xstate_cachep;
EXPORT_SYMBOL_GPL(task_xstate_cachep);
@@ -46,10 +76,9 @@ void free_thread_xstate(struct task_struct *tsk)
fpu_free(&tsk->thread.fpu);
}
-void free_thread_info(struct thread_info *ti)
+void arch_release_task_struct(struct task_struct *tsk)
{
- free_thread_xstate(ti->task);
- free_pages((unsigned long)ti, THREAD_ORDER);
+ free_thread_xstate(tsk);
}
void arch_task_cache_init(void)
@@ -84,12 +113,6 @@ void exit_thread(void)
}
}
-void show_regs(struct pt_regs *regs)
-{
- show_registers(regs);
- show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0);
-}
-
void show_regs_common(void)
{
const char *vendor, *product, *board;
@@ -341,44 +364,113 @@ void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
#endif
-#ifdef CONFIG_X86_32
-/*
- * This halt magic was a workaround for ancient floppy DMA
- * wreckage. It should be safe to remove.
- */
-static int hlt_counter;
-void disable_hlt(void)
+static inline int hlt_use_halt(void)
{
- hlt_counter++;
+ return 1;
}
-EXPORT_SYMBOL(disable_hlt);
-void enable_hlt(void)
+#ifndef CONFIG_SMP
+static inline void play_dead(void)
{
- hlt_counter--;
+ BUG();
}
-EXPORT_SYMBOL(enable_hlt);
+#endif
-static inline int hlt_use_halt(void)
+#ifdef CONFIG_X86_64
+void enter_idle(void)
{
- return (!hlt_counter && boot_cpu_data.hlt_works_ok);
+ this_cpu_write(is_idle, 1);
+ atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
-#else
-static inline int hlt_use_halt(void)
+
+static void __exit_idle(void)
{
- return 1;
+ if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
+ return;
+ atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
+}
+
+/* Called from interrupts to signify idle end */
+void exit_idle(void)
+{
+ /* idle loop has pid 0 */
+ if (current->pid)
+ return;
+ __exit_idle();
}
#endif
/*
+ * The idle thread. There's no useful work to be
+ * done, so just try to conserve power and have a
+ * low exit latency (ie sit in a loop waiting for
+ * somebody to say that they'd like to reschedule)
+ */
+void cpu_idle(void)
+{
+ /*
+ * If we're the non-boot CPU, nothing set the stack canary up
+ * for us. CPU0 already has it initialized but no harm in
+ * doing it again. This is a good place for updating it, as
+ * we wont ever return from this function (so the invalid
+ * canaries already on the stack wont ever trigger).
+ */
+ boot_init_stack_canary();
+ current_thread_info()->status |= TS_POLLING;
+
+ while (1) {
+ tick_nohz_idle_enter();
+
+ while (!need_resched()) {
+ rmb();
+
+ if (cpu_is_offline(smp_processor_id()))
+ play_dead();
+
+ /*
+ * Idle routines should keep interrupts disabled
+ * from here on, until they go to idle.
+ * Otherwise, idle callbacks can misfire.
+ */
+ local_touch_nmi();
+ local_irq_disable();
+
+ enter_idle();
+
+ /* Don't trace irqs off for idle */
+ stop_critical_timings();
+
+ /* enter_idle() needs rcu for notifiers */
+ rcu_idle_enter();
+
+ if (cpuidle_idle_call())
+ pm_idle();
+
+ rcu_idle_exit();
+ start_critical_timings();
+
+ /* In many cases the interrupt that ended idle
+ has already called exit_idle. But some idle
+ loops can be woken up without interrupt. */
+ __exit_idle();
+ }
+
+ tick_nohz_idle_exit();
+ preempt_enable_no_resched();
+ schedule();
+ preempt_disable();
+ }
+}
+
+/*
* We use this if we don't have any better
* idle routine..
*/
void default_idle(void)
{
if (hlt_use_halt()) {
- trace_power_start(POWER_CSTATE, 1, smp_processor_id());
- trace_cpu_idle(1, smp_processor_id());
+ trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
+ trace_cpu_idle_rcuidle(1, smp_processor_id());
current_thread_info()->status &= ~TS_POLLING;
/*
* TS_POLLING-cleared state must be visible before we
@@ -391,8 +483,8 @@ void default_idle(void)
else
local_irq_enable();
current_thread_info()->status |= TS_POLLING;
- trace_power_end(smp_processor_id());
- trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+ trace_power_end_rcuidle(smp_processor_id());
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
} else {
local_irq_enable();
/* loop is done by the caller */
@@ -426,32 +518,12 @@ void stop_this_cpu(void *dummy)
}
}
-static void do_nothing(void *unused)
-{
-}
-
-/*
- * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
- * pm_idle and update to new pm_idle value. Required while changing pm_idle
- * handler on SMP systems.
- *
- * Caller must have changed pm_idle to the new value before the call. Old
- * pm_idle value will not be used by any CPU after the return of this function.
- */
-void cpu_idle_wait(void)
-{
- smp_mb();
- /* kick all the CPUs so that they exit out of pm_idle */
- smp_call_function(do_nothing, NULL, 1);
-}
-EXPORT_SYMBOL_GPL(cpu_idle_wait);
-
/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
if (!need_resched()) {
- trace_power_start(POWER_CSTATE, 1, smp_processor_id());
- trace_cpu_idle(1, smp_processor_id());
+ trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
+ trace_cpu_idle_rcuidle(1, smp_processor_id());
if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
@@ -461,8 +533,8 @@ static void mwait_idle(void)
__sti_mwait(0, 0);
else
local_irq_enable();
- trace_power_end(smp_processor_id());
- trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+ trace_power_end_rcuidle(smp_processor_id());
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
} else
local_irq_enable();
}
@@ -474,13 +546,13 @@ static void mwait_idle(void)
*/
static void poll_idle(void)
{
- trace_power_start(POWER_CSTATE, 0, smp_processor_id());
- trace_cpu_idle(0, smp_processor_id());
+ trace_power_start_rcuidle(POWER_CSTATE, 0, smp_processor_id());
+ trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
while (!need_resched())
cpu_relax();
- trace_power_end(smp_processor_id());
- trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+ trace_power_end_rcuidle(smp_processor_id());
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
/*
@@ -504,9 +576,17 @@ int mwait_usable(const struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
+ /* Use mwait if idle=mwait boot option is given */
if (boot_option_idle_override == IDLE_FORCE_MWAIT)
return 1;
+ /*
+ * Any idle= boot option other than idle=mwait means that we must not
+ * use mwait. Eg: idle=halt or idle=poll or idle=nomwait
+ */
+ if (boot_option_idle_override != IDLE_NO_OVERRIDE)
+ return 0;
+
if (c->cpuid_level < MWAIT_INFO)
return 0;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index c08d1ff..01d8d40 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -9,7 +9,6 @@
* This file handles the architecture-dependent parts of process handling..
*/
-#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
@@ -31,20 +30,18 @@
#include <linux/kallsyms.h>
#include <linux/ptrace.h>
#include <linux/personality.h>
-#include <linux/tick.h>
#include <linux/percpu.h>
#include <linux/prctl.h>
#include <linux/ftrace.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/kdebug.h>
-#include <linux/cpuidle.h>
#include <asm/pgtable.h>
-#include <asm/system.h>
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/i387.h>
+#include <asm/fpu-internal.h>
#include <asm/desc.h>
#ifdef CONFIG_MATH_EMULATION
#include <asm/math_emu.h>
@@ -57,7 +54,7 @@
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
-#include <asm/nmi.h>
+#include <asm/switch_to.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
@@ -69,62 +66,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
return ((unsigned long *)tsk->thread.sp)[3];
}
-#ifndef CONFIG_SMP
-static inline void play_dead(void)
-{
- BUG();
-}
-#endif
-
-/*
- * The idle thread. There's no useful work to be
- * done, so just try to conserve power and have a
- * low exit latency (ie sit in a loop waiting for
- * somebody to say that they'd like to reschedule)
- */
-void cpu_idle(void)
-{
- int cpu = smp_processor_id();
-
- /*
- * If we're the non-boot CPU, nothing set the stack canary up
- * for us. CPU0 already has it initialized but no harm in
- * doing it again. This is a good place for updating it, as
- * we wont ever return from this function (so the invalid
- * canaries already on the stack wont ever trigger).
- */
- boot_init_stack_canary();
-
- current_thread_info()->status |= TS_POLLING;
-
- /* endless idle loop with no priority at all */
- while (1) {
- tick_nohz_idle_enter();
- rcu_idle_enter();
- while (!need_resched()) {
-
- check_pgt_cache();
- rmb();
-
- if (cpu_is_offline(cpu))
- play_dead();
-
- local_touch_nmi();
- local_irq_disable();
- /* Don't trace irqs off for idle */
- stop_critical_timings();
- if (cpuidle_idle_call())
- pm_idle();
- start_critical_timings();
- }
- rcu_idle_exit();
- tick_nohz_idle_exit();
- preempt_enable_no_resched();
- schedule();
- preempt_disable();
- }
-}
-
void __show_regs(struct pt_regs *regs, int all)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
@@ -361,7 +302,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
switch_fpu_finish(next_p, fpu);
- percpu_write(current_task, next_p);
+ this_cpu_write(current_task, next_p);
return prev_p;
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index cfa5c90..28e8102 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -14,7 +14,6 @@
* This file handles the architecture-dependent parts of process handling..
*/
-#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
@@ -32,17 +31,15 @@
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
-#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
-#include <linux/cpuidle.h>
#include <asm/pgtable.h>
-#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
+#include <asm/fpu-internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
@@ -51,116 +48,11 @@
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
-#include <asm/nmi.h>
+#include <asm/switch_to.h>
asmlinkage extern void ret_from_fork(void);
DEFINE_PER_CPU(unsigned long, old_rsp);
-static DEFINE_PER_CPU(unsigned char, is_idle);
-
-static ATOMIC_NOTIFIER_HEAD(idle_notifier);
-
-void idle_notifier_register(struct notifier_block *n)
-{
- atomic_notifier_chain_register(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_register);
-
-void idle_notifier_unregister(struct notifier_block *n)
-{
- atomic_notifier_chain_unregister(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_unregister);
-
-void enter_idle(void)
-{
- percpu_write(is_idle, 1);
- atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
-}
-
-static void __exit_idle(void)
-{
- if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
- return;
- atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
-}
-
-/* Called from interrupts to signify idle end */
-void exit_idle(void)
-{
- /* idle loop has pid 0 */
- if (current->pid)
- return;
- __exit_idle();
-}
-
-#ifndef CONFIG_SMP
-static inline void play_dead(void)
-{
- BUG();
-}
-#endif
-
-/*
- * The idle thread. There's no useful work to be
- * done, so just try to conserve power and have a
- * low exit latency (ie sit in a loop waiting for
- * somebody to say that they'd like to reschedule)
- */
-void cpu_idle(void)
-{
- current_thread_info()->status |= TS_POLLING;
-
- /*
- * If we're the non-boot CPU, nothing set the stack canary up
- * for us. CPU0 already has it initialized but no harm in
- * doing it again. This is a good place for updating it, as
- * we wont ever return from this function (so the invalid
- * canaries already on the stack wont ever trigger).
- */
- boot_init_stack_canary();
-
- /* endless idle loop with no priority at all */
- while (1) {
- tick_nohz_idle_enter();
- while (!need_resched()) {
-
- rmb();
-
- if (cpu_is_offline(smp_processor_id()))
- play_dead();
- /*
- * Idle routines should keep interrupts disabled
- * from here on, until they go to idle.
- * Otherwise, idle callbacks can misfire.
- */
- local_touch_nmi();
- local_irq_disable();
- enter_idle();
- /* Don't trace irqs off for idle */
- stop_critical_timings();
-
- /* enter_idle() needs rcu for notifiers */
- rcu_idle_enter();
-
- if (cpuidle_idle_call())
- pm_idle();
-
- rcu_idle_exit();
- start_critical_timings();
-
- /* In many cases the interrupt that ended idle
- has already called exit_idle. But some idle
- loops can be woken up without interrupt. */
- __exit_idle();
- }
-
- tick_nohz_idle_exit();
- preempt_enable_no_resched();
- schedule();
- preempt_disable();
- }
-}
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
@@ -342,9 +234,10 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
loadsegment(es, _ds);
loadsegment(ds, _ds);
load_gs_index(0);
+ current->thread.usersp = new_sp;
regs->ip = new_ip;
regs->sp = new_sp;
- percpu_write(old_rsp, new_sp);
+ this_cpu_write(old_rsp, new_sp);
regs->cs = _cs;
regs->ss = _ss;
regs->flags = X86_EFLAGS_IF;
@@ -365,7 +258,9 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
start_thread_common(regs, new_ip, new_sp,
- __USER32_CS, __USER32_DS, __USER32_DS);
+ test_thread_flag(TIF_X32)
+ ? __USER_CS : __USER32_CS,
+ __USER_DS, __USER_DS);
}
#endif
@@ -464,11 +359,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Switch the PDA and FPU contexts.
*/
- prev->usersp = percpu_read(old_rsp);
- percpu_write(old_rsp, next->usersp);
- percpu_write(current_task, next_p);
+ prev->usersp = this_cpu_read(old_rsp);
+ this_cpu_write(old_rsp, next->usersp);
+ this_cpu_write(current_task, next_p);
- percpu_write(kernel_stack,
+ this_cpu_write(kernel_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE - KERNEL_STACK_OFFSET);
@@ -488,6 +383,8 @@ void set_personality_64bit(void)
/* Make sure to be in 64bit mode */
clear_thread_flag(TIF_IA32);
+ clear_thread_flag(TIF_ADDR32);
+ clear_thread_flag(TIF_X32);
/* Ensure the corresponding mm is not marked. */
if (current->mm)
@@ -500,21 +397,33 @@ void set_personality_64bit(void)
current->personality &= ~READ_IMPLIES_EXEC;
}
-void set_personality_ia32(void)
+void set_personality_ia32(bool x32)
{
/* inherit personality from parent */
/* Make sure to be in 32bit mode */
- set_thread_flag(TIF_IA32);
- current->personality |= force_personality32;
+ set_thread_flag(TIF_ADDR32);
/* Mark the associated mm as containing 32-bit tasks. */
if (current->mm)
current->mm->context.ia32_compat = 1;
- /* Prepare the first "return" to user space */
- current_thread_info()->status |= TS_COMPAT;
+ if (x32) {
+ clear_thread_flag(TIF_IA32);
+ set_thread_flag(TIF_X32);
+ current->personality &= ~READ_IMPLIES_EXEC;
+ /* is_compat_task() uses the presence of the x32
+ syscall bit flag to determine compat status */
+ current_thread_info()->status &= ~TS_COMPAT;
+ } else {
+ set_thread_flag(TIF_IA32);
+ clear_thread_flag(TIF_X32);
+ current->personality |= force_personality32;
+ /* Prepare the first "return" to user space */
+ current_thread_info()->status |= TS_COMPAT;
+ }
}
+EXPORT_SYMBOL_GPL(set_personality_ia32);
unsigned long get_wchan(struct task_struct *p)
{
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 5026738..13b1990 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -24,15 +24,16 @@
#include <asm/uaccess.h>
#include <asm/pgtable.h>
-#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
+#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
#include <asm/ldt.h>
#include <asm/desc.h>
#include <asm/prctl.h>
#include <asm/proto.h>
#include <asm/hw_breakpoint.h>
+#include <asm/traps.h>
#include "tls.h"
@@ -1130,6 +1131,100 @@ static int genregs32_set(struct task_struct *target,
return ret;
}
+#ifdef CONFIG_X86_X32_ABI
+static long x32_arch_ptrace(struct task_struct *child,
+ compat_long_t request, compat_ulong_t caddr,
+ compat_ulong_t cdata)
+{
+ unsigned long addr = caddr;
+ unsigned long data = cdata;
+ void __user *datap = compat_ptr(data);
+ int ret;
+
+ switch (request) {
+ /* Read 32bits at location addr in the USER area. Only allow
+ to return the lower 32bits of segment and debug registers. */
+ case PTRACE_PEEKUSR: {
+ u32 tmp;
+
+ ret = -EIO;
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) ||
+ addr < offsetof(struct user_regs_struct, cs))
+ break;
+
+ tmp = 0; /* Default return condition */
+ if (addr < sizeof(struct user_regs_struct))
+ tmp = getreg(child, addr);
+ else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+ addr <= offsetof(struct user, u_debugreg[7])) {
+ addr -= offsetof(struct user, u_debugreg[0]);
+ tmp = ptrace_get_debugreg(child, addr / sizeof(data));
+ }
+ ret = put_user(tmp, (__u32 __user *)datap);
+ break;
+ }
+
+ /* Write the word at location addr in the USER area. Only allow
+ to update segment and debug registers with the upper 32bits
+ zero-extended. */
+ case PTRACE_POKEUSR:
+ ret = -EIO;
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) ||
+ addr < offsetof(struct user_regs_struct, cs))
+ break;
+
+ if (addr < sizeof(struct user_regs_struct))
+ ret = putreg(child, addr, data);
+ else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+ addr <= offsetof(struct user, u_debugreg[7])) {
+ addr -= offsetof(struct user, u_debugreg[0]);
+ ret = ptrace_set_debugreg(child,
+ addr / sizeof(data), data);
+ }
+ break;
+
+ case PTRACE_GETREGS: /* Get all gp regs from the child. */
+ return copy_regset_to_user(child,
+ task_user_regset_view(current),
+ REGSET_GENERAL,
+ 0, sizeof(struct user_regs_struct),
+ datap);
+
+ case PTRACE_SETREGS: /* Set all gp regs in the child. */
+ return copy_regset_from_user(child,
+ task_user_regset_view(current),
+ REGSET_GENERAL,
+ 0, sizeof(struct user_regs_struct),
+ datap);
+
+ case PTRACE_GETFPREGS: /* Get the child FPU state. */
+ return copy_regset_to_user(child,
+ task_user_regset_view(current),
+ REGSET_FP,
+ 0, sizeof(struct user_i387_struct),
+ datap);
+
+ case PTRACE_SETFPREGS: /* Set the child FPU state. */
+ return copy_regset_from_user(child,
+ task_user_regset_view(current),
+ REGSET_FP,
+ 0, sizeof(struct user_i387_struct),
+ datap);
+
+ /* normal 64bit interface to access TLS data.
+ Works just like arch_prctl, except that the arguments
+ are reversed. */
+ case PTRACE_ARCH_PRCTL:
+ return do_arch_prctl(child, data, addr);
+
+ default:
+ return compat_ptrace_request(child, request, addr, data);
+ }
+
+ return ret;
+}
+#endif
+
long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
compat_ulong_t caddr, compat_ulong_t cdata)
{
@@ -1139,6 +1234,11 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
int ret;
__u32 val;
+#ifdef CONFIG_X86_X32_ABI
+ if (!is_ia32_task())
+ return x32_arch_ptrace(child, request, caddr, cdata);
+#endif
+
switch (request) {
case PTRACE_PEEKUSR:
ret = getreg32(child, addr, &val);
@@ -1326,7 +1426,7 @@ static void fill_sigtrap_info(struct task_struct *tsk,
int error_code, int si_code,
struct siginfo *info)
{
- tsk->thread.trap_no = 1;
+ tsk->thread.trap_nr = X86_TRAP_DB;
tsk->thread.error_code = error_code;
memset(info, 0, sizeof(*info));
@@ -1380,7 +1480,11 @@ long syscall_trace_enter(struct pt_regs *regs)
regs->flags |= X86_EFLAGS_TF;
/* do the secure computing check first */
- secure_computing(regs->orig_ax);
+ if (secure_computing(regs->orig_ax)) {
+ /* seccomp failures shouldn't expose any additional code. */
+ ret = -1L;
+ goto out;
+ }
if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
ret = -1L;
@@ -1405,6 +1509,7 @@ long syscall_trace_enter(struct pt_regs *regs)
regs->dx, regs->r10);
#endif
+out:
return ret ?: regs->orig_ax;
}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d7d5099..9b4204e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -90,7 +90,6 @@
#include <asm/processor.h>
#include <asm/bugs.h>
-#include <asm/system.h>
#include <asm/vsyscall.h>
#include <asm/cpu.h>
#include <asm/desc.h>
@@ -394,10 +393,9 @@ static void __init reserve_initrd(void)
initrd_start = 0;
if (ramdisk_size >= (end_of_lowmem>>1)) {
- memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
- printk(KERN_ERR "initrd too large to handle, "
- "disabling initrd\n");
- return;
+ panic("initrd too large to handle, "
+ "disabling initrd (%lld needed, %lld available)\n",
+ ramdisk_size, end_of_lowmem>>1);
}
printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image,
@@ -509,15 +507,6 @@ static void __init memblock_x86_reserve_range_setup_data(void)
#ifdef CONFIG_KEXEC
-static inline unsigned long long get_total_mem(void)
-{
- unsigned long long total;
-
- total = max_pfn - min_low_pfn;
-
- return total << PAGE_SHIFT;
-}
-
/*
* Keep the crash kernel below this limit. On 32 bits earlier kernels
* would limit the kernel to the low 512 MiB due to mapping restrictions.
@@ -536,7 +525,7 @@ static void __init reserve_crashkernel(void)
unsigned long long crash_size, crash_base;
int ret;
- total_mem = get_total_mem();
+ total_mem = memblock_phys_mem_size();
ret = parse_crashkernel(boot_command_line, total_mem,
&crash_size, &crash_base);
@@ -749,10 +738,16 @@ void __init setup_arch(char **cmdline_p)
#endif
#ifdef CONFIG_EFI
if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- EFI_LOADER_SIGNATURE, 4)) {
+ "EL32", 4)) {
+ efi_enabled = 1;
+ efi_64bit = false;
+ } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+ "EL64", 4)) {
efi_enabled = 1;
- efi_memblock_x86_reserve_range();
+ efi_64bit = true;
}
+ if (efi_enabled && efi_memblock_x86_reserve_range())
+ efi_enabled = 0;
#endif
x86_init.oem.arch_setup();
@@ -1016,7 +1011,8 @@ void __init setup_arch(char **cmdline_p)
init_cpu_to_node();
init_apic_mappings();
- ioapic_and_gsi_init();
+ if (x86_io_apic_ops.init)
+ x86_io_apic_ops.init();
kvm_guest_init();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 71f4727..5a98aa2 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -185,10 +185,22 @@ void __init setup_per_cpu_areas(void)
#endif
rc = -EINVAL;
if (pcpu_chosen_fc != PCPU_FC_PAGE) {
- const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
const size_t dyn_size = PERCPU_MODULE_RESERVE +
PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
+ size_t atom_size;
+ /*
+ * On 64bit, use PMD_SIZE for atom_size so that embedded
+ * percpu areas are aligned to PMD. This, in the future,
+ * can also allow using PMD mappings in vmalloc area. Use
+ * PAGE_SIZE on 32bit as vmalloc space is highly contended
+ * and large vmalloc area allocs can easily fail.
+ */
+#ifdef CONFIG_X86_64
+ atom_size = PMD_SIZE;
+#else
+ atom_size = PAGE_SIZE;
+#endif
rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
dyn_size, atom_size,
pcpu_cpu_distance,
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 46a01bdc..115eac4 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -10,10 +10,8 @@
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/kernel.h>
-#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/wait.h>
-#include <linux/ptrace.h>
#include <linux/tracehook.h>
#include <linux/unistd.h>
#include <linux/stddef.h>
@@ -24,12 +22,15 @@
#include <asm/processor.h>
#include <asm/ucontext.h>
#include <asm/i387.h>
+#include <asm/fpu-internal.h>
#include <asm/vdso.h>
#include <asm/mce.h>
+#include <asm/sighandling.h>
#ifdef CONFIG_X86_64
#include <asm/proto.h>
#include <asm/ia32_unistd.h>
+#include <asm/sys_ia32.h>
#endif /* CONFIG_X86_64 */
#include <asm/syscall.h>
@@ -37,13 +38,6 @@
#include <asm/sigframe.h>
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
-#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
- X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
- X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
- X86_EFLAGS_CF)
-
#ifdef CONFIG_X86_32
# define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF)
#else
@@ -68,9 +62,8 @@
regs->seg = GET_SEG(seg) | 3; \
} while (0)
-static int
-restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
- unsigned long *pax)
+int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
+ unsigned long *pax)
{
void __user *buf;
unsigned int tmpflags;
@@ -125,9 +118,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
return err;
}
-static int
-setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
- struct pt_regs *regs, unsigned long mask)
+int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
+ struct pt_regs *regs, unsigned long mask)
{
int err = 0;
@@ -159,7 +151,7 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
put_user_ex(regs->r15, &sc->r15);
#endif /* CONFIG_X86_64 */
- put_user_ex(current->thread.trap_no, &sc->trapno);
+ put_user_ex(current->thread.trap_nr, &sc->trapno);
put_user_ex(current->thread.error_code, &sc->err);
put_user_ex(regs->ip, &sc->ip);
#ifdef CONFIG_X86_32
@@ -642,6 +634,16 @@ static int signr_convert(int sig)
#define is_ia32 0
#endif /* CONFIG_IA32_EMULATION */
+#ifdef CONFIG_X86_X32_ABI
+#define is_x32 test_thread_flag(TIF_X32)
+
+static int x32_setup_rt_frame(int sig, struct k_sigaction *ka,
+ siginfo_t *info, compat_sigset_t *set,
+ struct pt_regs *regs);
+#else /* !CONFIG_X86_X32_ABI */
+#define is_x32 0
+#endif /* CONFIG_X86_X32_ABI */
+
int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
sigset_t *set, struct pt_regs *regs);
int ia32_setup_frame(int sig, struct k_sigaction *ka,
@@ -666,8 +668,14 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
else
ret = ia32_setup_frame(usig, ka, set, regs);
- } else
+#ifdef CONFIG_X86_X32_ABI
+ } else if (is_x32) {
+ ret = x32_setup_rt_frame(usig, ka, info,
+ (compat_sigset_t *)set, regs);
+#endif
+ } else {
ret = __setup_rt_frame(sig, ka, info, set, regs);
+ }
if (ret) {
force_sigsegv(sig, current);
@@ -850,3 +858,102 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
force_sig(SIGSEGV, me);
}
+
+#ifdef CONFIG_X86_X32_ABI
+static int x32_setup_rt_frame(int sig, struct k_sigaction *ka,
+ siginfo_t *info, compat_sigset_t *set,
+ struct pt_regs *regs)
+{
+ struct rt_sigframe_x32 __user *frame;
+ void __user *restorer;
+ int err = 0;
+ void __user *fpstate = NULL;
+
+ frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
+
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ return -EFAULT;
+
+ if (ka->sa.sa_flags & SA_SIGINFO) {
+ if (copy_siginfo_to_user32(&frame->info, info))
+ return -EFAULT;
+ }
+
+ put_user_try {
+ /* Create the ucontext. */
+ if (cpu_has_xsave)
+ put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
+ else
+ put_user_ex(0, &frame->uc.uc_flags);
+ put_user_ex(0, &frame->uc.uc_link);
+ put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
+ put_user_ex(sas_ss_flags(regs->sp),
+ &frame->uc.uc_stack.ss_flags);
+ put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
+ put_user_ex(0, &frame->uc.uc__pad0);
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
+ regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+
+ if (ka->sa.sa_flags & SA_RESTORER) {
+ restorer = ka->sa.sa_restorer;
+ } else {
+ /* could use a vstub here */
+ restorer = NULL;
+ err |= -EFAULT;
+ }
+ put_user_ex(restorer, &frame->pretcode);
+ } put_user_catch(err);
+
+ if (err)
+ return -EFAULT;
+
+ /* Set up registers for signal handler */
+ regs->sp = (unsigned long) frame;
+ regs->ip = (unsigned long) ka->sa.sa_handler;
+
+ /* We use the x32 calling convention here... */
+ regs->di = sig;
+ regs->si = (unsigned long) &frame->info;
+ regs->dx = (unsigned long) &frame->uc;
+
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+
+ regs->cs = __USER_CS;
+ regs->ss = __USER_DS;
+
+ return 0;
+}
+
+asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs)
+{
+ struct rt_sigframe_x32 __user *frame;
+ sigset_t set;
+ unsigned long ax;
+ struct pt_regs tregs;
+
+ frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
+
+ if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+ goto badframe;
+ if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
+ goto badframe;
+
+ sigdelsetmask(&set, ~_BLOCKABLE);
+ set_current_blocked(&set);
+
+ if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
+ goto badframe;
+
+ tregs = *regs;
+ if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT)
+ goto badframe;
+
+ return ax;
+
+badframe:
+ signal_fault(regs, frame, "x32 rt_sigreturn");
+ return 0;
+}
+#endif
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 66d250c..433529e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -50,6 +50,7 @@
#include <linux/tboot.h>
#include <linux/stackprotector.h>
#include <linux/gfp.h>
+#include <linux/cpuidle.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -75,20 +76,8 @@
/* State of each CPU */
DEFINE_PER_CPU(int, cpu_state) = { 0 };
-/* Store all idle threads, this can be reused instead of creating
-* a new thread. Also avoids complicated thread destroy functionality
-* for idle threads.
-*/
#ifdef CONFIG_HOTPLUG_CPU
/*
- * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is
- * removed after init for !CONFIG_HOTPLUG_CPU.
- */
-static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
-#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
-#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
-
-/*
* We need this for trampoline_base protection from concurrent accesses when
* off- and onlining cores wildly.
*/
@@ -96,20 +85,16 @@ static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
void cpu_hotplug_driver_lock(void)
{
- mutex_lock(&x86_cpu_hotplug_driver_mutex);
+ mutex_lock(&x86_cpu_hotplug_driver_mutex);
}
void cpu_hotplug_driver_unlock(void)
{
- mutex_unlock(&x86_cpu_hotplug_driver_mutex);
+ mutex_unlock(&x86_cpu_hotplug_driver_mutex);
}
ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
-#else
-static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
-#define get_idle_for_cpu(x) (idle_thread_array[(x)])
-#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p))
#endif
/* Number of siblings per CPU package */
@@ -219,14 +204,9 @@ static void __cpuinit smp_callin(void)
* Update loops_per_jiffy in cpu_data. Previous call to
* smp_store_cpu_info() stored a value that is close but not as
* accurate as the value just calculated.
- *
- * Need to enable IRQs because it can take longer and then
- * the NMI watchdog might kill us.
*/
- local_irq_enable();
calibrate_delay();
cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
- local_irq_disable();
pr_debug("Stack at about %p\n", &cpuid);
/*
@@ -255,6 +235,7 @@ notrace static void __cpuinit start_secondary(void *unused)
* most necessary things.
*/
cpu_init();
+ x86_cpuinit.early_percpu_clock_init();
preempt_disable();
smp_callin();
@@ -291,19 +272,6 @@ notrace static void __cpuinit start_secondary(void *unused)
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
x86_platform.nmi_init();
- /*
- * Wait until the cpu which brought this one up marked it
- * online before enabling interrupts. If we don't do that then
- * we can end up waking up the softirq thread before this cpu
- * reached the active state, which makes the scheduler unhappy
- * and schedule the softirq thread on the wrong cpu. This is
- * only observable with forced threaded interrupts, but in
- * theory it could also happen w/o them. It's just way harder
- * to achieve.
- */
- while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask))
- cpu_relax();
-
/* enable local interrupts */
local_irq_enable();
@@ -331,59 +299,90 @@ void __cpuinit smp_store_cpu_info(int id)
identify_secondary_cpu(c);
}
-static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
+static bool __cpuinit
+topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
{
- cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
- cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
- cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
- cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
- cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
- cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2),
+ "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
+ "[node: %d != %d]. Ignoring dependency.\n",
+ cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
}
+#define link_mask(_m, c1, c2) \
+do { \
+ cpumask_set_cpu((c1), cpu_##_m##_mask(c2)); \
+ cpumask_set_cpu((c2), cpu_##_m##_mask(c1)); \
+} while (0)
+
+static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ if (c->phys_proc_id == o->phys_proc_id &&
+ per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) &&
+ c->compute_unit_id == o->compute_unit_id)
+ return topology_sane(c, o, "smt");
+
+ } else if (c->phys_proc_id == o->phys_proc_id &&
+ c->cpu_core_id == o->cpu_core_id) {
+ return topology_sane(c, o, "smt");
+ }
+
+ return false;
+}
+
+static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID &&
+ per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2))
+ return topology_sane(c, o, "llc");
+
+ return false;
+}
+
+static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ if (c->phys_proc_id == o->phys_proc_id)
+ return topology_sane(c, o, "mc");
+
+ return false;
+}
void __cpuinit set_cpu_sibling_map(int cpu)
{
- int i;
+ bool has_mc = boot_cpu_data.x86_max_cores > 1;
+ bool has_smt = smp_num_siblings > 1;
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct cpuinfo_x86 *o;
+ int i;
cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
- if (smp_num_siblings > 1) {
- for_each_cpu(i, cpu_sibling_setup_mask) {
- struct cpuinfo_x86 *o = &cpu_data(i);
-
- if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
- if (c->phys_proc_id == o->phys_proc_id &&
- per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
- c->compute_unit_id == o->compute_unit_id)
- link_thread_siblings(cpu, i);
- } else if (c->phys_proc_id == o->phys_proc_id &&
- c->cpu_core_id == o->cpu_core_id) {
- link_thread_siblings(cpu, i);
- }
- }
- } else {
+ if (!has_smt && !has_mc) {
cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
- }
-
- cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
-
- if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
- cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
+ cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+ cpumask_set_cpu(cpu, cpu_core_mask(cpu));
c->booted_cores = 1;
return;
}
for_each_cpu(i, cpu_sibling_setup_mask) {
- if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
- per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
- cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
- cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
- }
- if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
- cpumask_set_cpu(i, cpu_core_mask(cpu));
- cpumask_set_cpu(cpu, cpu_core_mask(i));
+ o = &cpu_data(i);
+
+ if ((i == cpu) || (has_smt && match_smt(c, o)))
+ link_mask(sibling, cpu, i);
+
+ if ((i == cpu) || (has_mc && match_llc(c, o)))
+ link_mask(llc_shared, cpu, i);
+
+ if ((i == cpu) || (has_mc && match_mc(c, o))) {
+ link_mask(core, cpu, i);
+
/*
* Does this new cpu bringup a new core?
*/
@@ -414,8 +413,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
* For perf, we return last level cache shared map.
* And for power savings, we return cpu_core_map
*/
- if ((sched_mc_power_savings || sched_smt_power_savings) &&
- !(cpu_has(c, X86_FEATURE_AMD_DCM)))
+ if (!(cpu_has(c, X86_FEATURE_AMD_DCM)))
return cpu_core_mask(cpu);
else
return cpu_llc_shared_mask(cpu);
@@ -634,22 +632,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
return (send_status | accept_status);
}
-struct create_idle {
- struct work_struct work;
- struct task_struct *idle;
- struct completion done;
- int cpu;
-};
-
-static void __cpuinit do_fork_idle(struct work_struct *work)
-{
- struct create_idle *c_idle =
- container_of(work, struct create_idle, work);
-
- c_idle->idle = fork_idle(c_idle->cpu);
- complete(&c_idle->done);
-}
-
/* reduce the number of lines printed when booting a large cpu count system */
static void __cpuinit announce_cpu(int cpu, int apicid)
{
@@ -676,58 +658,31 @@ static void __cpuinit announce_cpu(int cpu, int apicid)
* Returns zero if CPU booted OK, else error code from
* ->wakeup_secondary_cpu.
*/
-static int __cpuinit do_boot_cpu(int apicid, int cpu)
+static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
{
unsigned long boot_error = 0;
unsigned long start_ip;
int timeout;
- struct create_idle c_idle = {
- .cpu = cpu,
- .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
- };
-
- INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle);
alternatives_smp_switch(1);
- c_idle.idle = get_idle_for_cpu(cpu);
-
- /*
- * We can't use kernel_thread since we must avoid to
- * reschedule the child.
- */
- if (c_idle.idle) {
- c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
- (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1);
- init_idle(c_idle.idle, cpu);
- goto do_rest;
- }
-
- schedule_work(&c_idle.work);
- wait_for_completion(&c_idle.done);
+ idle->thread.sp = (unsigned long) (((struct pt_regs *)
+ (THREAD_SIZE + task_stack_page(idle))) - 1);
+ per_cpu(current_task, cpu) = idle;
- if (IS_ERR(c_idle.idle)) {
- printk("failed fork for CPU %d\n", cpu);
- destroy_work_on_stack(&c_idle.work);
- return PTR_ERR(c_idle.idle);
- }
-
- set_idle_for_cpu(cpu, c_idle.idle);
-do_rest:
- per_cpu(current_task, cpu) = c_idle.idle;
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
#else
- clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
+ clear_tsk_thread_flag(idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
per_cpu(kernel_stack, cpu) =
- (unsigned long)task_stack_page(c_idle.idle) -
+ (unsigned long)task_stack_page(idle) -
KERNEL_STACK_OFFSET + THREAD_SIZE;
#endif
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
- stack_start = c_idle.idle->thread.sp;
+ stack_start = idle->thread.sp;
/* start_ip had better be page-aligned! */
start_ip = trampoline_address();
@@ -740,8 +695,6 @@ do_rest:
* the targeted processor.
*/
- printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip);
-
atomic_set(&init_deasserted, 0);
if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
@@ -791,9 +744,10 @@ do_rest:
schedule();
}
- if (cpumask_test_cpu(cpu, cpu_callin_mask))
+ if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
+ print_cpu_msr(&cpu_data(cpu));
pr_debug("CPU%d: has booted.\n", cpu);
- else {
+ } else {
boot_error = 1;
if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status)
== 0xA5A5A5A5)
@@ -830,12 +784,10 @@ do_rest:
*/
smpboot_restore_warm_reset_vector();
}
-
- destroy_work_on_stack(&c_idle.work);
return boot_error;
}
-int __cpuinit native_cpu_up(unsigned int cpu)
+int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle)
{
int apicid = apic->cpu_present_to_apicid(cpu);
unsigned long flags;
@@ -847,7 +799,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
!physid_isset(apicid, phys_cpu_present_map) ||
- (!x2apic_mode && apicid >= 255)) {
+ !apic->apic_id_valid(apicid)) {
printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);
return -EINVAL;
}
@@ -868,7 +820,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
- err = do_boot_cpu(apicid, cpu);
+ err = do_boot_cpu(apicid, cpu, tidle);
if (err) {
pr_debug("do_boot_cpu failed %d\n", err);
return -EIO;
@@ -1422,7 +1374,8 @@ void native_play_dead(void)
tboot_shutdown(TB_SHUTDOWN_WFS);
mwait_play_dead(); /* Only returns on failure */
- hlt_play_dead();
+ if (cpuidle_play_dead())
+ hlt_play_dead();
}
#else /* ... !CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 0514890..b4d3c39 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -98,7 +98,7 @@ out:
static void find_start_end(unsigned long flags, unsigned long *begin,
unsigned long *end)
{
- if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) {
+ if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) {
unsigned long new_begin;
/* This is usually used needed to map code in small
model, so it needs to be in the first 31bit. Limit
@@ -144,7 +144,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
(!vma || addr + len <= vma->vm_start))
return addr;
}
- if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
+ if (((flags & MAP_32BIT) || test_thread_flag(TIF_ADDR32))
&& len <= mm->cached_hole_size) {
mm->cached_hole_size = 0;
mm->free_area_cache = begin;
@@ -195,7 +195,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
- unsigned long addr = addr0;
+ unsigned long addr = addr0, start_addr;
/* requested length too big for entire address space */
if (len > TASK_SIZE)
@@ -205,7 +205,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
return addr;
/* for MAP_32BIT mappings we force the legact mmap base */
- if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT))
+ if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT))
goto bottomup;
/* requesting a specific address */
@@ -223,25 +223,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
mm->free_area_cache = mm->mmap_base;
}
+try_again:
/* either no address requested or can't fit in requested address hole */
- addr = mm->free_area_cache;
-
- /* make sure it can fit in the remaining address space */
- if (addr > len) {
- unsigned long tmp_addr = align_addr(addr - len, filp,
- ALIGN_TOPDOWN);
-
- vma = find_vma(mm, tmp_addr);
- if (!vma || tmp_addr + len <= vma->vm_start)
- /* remember the address as a hint for next time */
- return mm->free_area_cache = tmp_addr;
- }
-
- if (mm->mmap_base < len)
- goto bottomup;
+ start_addr = addr = mm->free_area_cache;
- addr = mm->mmap_base-len;
+ if (addr < len)
+ goto fail;
+ addr -= len;
do {
addr = align_addr(addr, filp, ALIGN_TOPDOWN);
@@ -263,6 +252,17 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
addr = vma->vm_start-len;
} while (len < vma->vm_start);
+fail:
+ /*
+ * if hint left us with no space for the requested
+ * mapping then try again:
+ */
+ if (start_addr != mm->mmap_base) {
+ mm->free_area_cache = mm->mmap_base;
+ mm->cached_hole_size = 0;
+ goto try_again;
+ }
+
bottomup:
/*
* A failed mmap() very likely causes application failure,
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index 7ac7943..5c7f8c2 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -5,6 +5,14 @@
#include <linux/cache.h>
#include <asm/asm-offsets.h>
+#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
+
+#ifdef CONFIG_X86_X32_ABI
+# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
+#else
+# define __SYSCALL_X32(nr, sym, compat) /* nothing */
+#endif
+
#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
#include <asm/syscalls_64.h>
#undef __SYSCALL_64
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index e2410e2..6410744 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -272,7 +272,7 @@ static void tboot_copy_fadt(const struct acpi_table_fadt *fadt)
offsetof(struct acpi_table_facs, firmware_waking_vector);
}
-void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
+static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
{
static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = {
/* S0,1,2: */ -1, -1, -1,
@@ -281,7 +281,7 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
/* S5: */ TB_SHUTDOWN_S5 };
if (!tboot_enabled())
- return;
+ return 0;
tboot_copy_fadt(&acpi_gbl_FADT);
tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control;
@@ -292,10 +292,11 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
if (sleep_state >= ACPI_S_STATE_COUNT ||
acpi_shutdown_map[sleep_state] == -1) {
pr_warning("unsupported sleep state 0x%x\n", sleep_state);
- return;
+ return -1;
}
tboot_shutdown(acpi_shutdown_map[sleep_state]);
+ return 0;
}
static atomic_t ap_wfs_count;
@@ -345,6 +346,8 @@ static __init int tboot_late_init(void)
atomic_set(&ap_wfs_count, 0);
register_hotcpu_notifier(&tboot_cpu_notifier);
+
+ acpi_os_set_prepare_sleep(&tboot_sleep);
return 0;
}
diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c
index 9e540fe..ab40954 100644
--- a/arch/x86/kernel/tce_64.c
+++ b/arch/x86/kernel/tce_64.c
@@ -34,6 +34,7 @@
#include <asm/tce.h>
#include <asm/calgary.h>
#include <asm/proto.h>
+#include <asm/cacheflush.h>
/* flush a tce at 'tceaddr' to main memory */
static inline void flush_tce(void* tceaddr)
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index dd5fbf4..c6eba2b 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -57,9 +57,6 @@ EXPORT_SYMBOL(profile_pc);
*/
static irqreturn_t timer_interrupt(int irq, void *dev_id)
{
- /* Keep nmi watchdog up to date */
- inc_irq_stat(irq0_irqs);
-
global_clock_event->event_handler(global_clock_event);
/* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 6bb7b85..9d9d2f9 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -6,7 +6,6 @@
#include <asm/uaccess.h>
#include <asm/desc.h>
-#include <asm/system.h>
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/proto.h>
@@ -163,7 +162,7 @@ int regset_tls_get(struct task_struct *target, const struct user_regset *regset,
{
const struct desc_struct *tls;
- if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
+ if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
(pos % sizeof(struct user_desc)) != 0 ||
(count % sizeof(struct user_desc)) != 0)
return -EINVAL;
@@ -198,7 +197,7 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES];
const struct user_desc *info;
- if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
+ if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
(pos % sizeof(struct user_desc)) != 0 ||
(count % sizeof(struct user_desc)) != 0)
return -EINVAL;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4bbe04d..92d5756 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -50,10 +50,11 @@
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <linux/atomic.h>
-#include <asm/system.h>
+#include <asm/ftrace.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/i387.h>
+#include <asm/fpu-internal.h>
#include <asm/mce.h>
#include <asm/mach_traps.h>
@@ -119,7 +120,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
* traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
* On nmi (interrupt 2), do_trap should not be called.
*/
- if (trapnr < 6)
+ if (trapnr < X86_TRAP_UD)
goto vm86_trap;
goto trap_signal;
}
@@ -132,7 +133,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
trap_signal:
#endif
/*
- * We want error_code and trap_no set for userspace faults and
+ * We want error_code and trap_nr set for userspace faults and
* kernelspace faults which result in die(), but not
* kernelspace faults which are fixed up. die() gives the
* process no chance to handle the signal and notice the
@@ -141,7 +142,7 @@ trap_signal:
* delivered, faults. See also do_general_protection below.
*/
tsk->thread.error_code = error_code;
- tsk->thread.trap_no = trapnr;
+ tsk->thread.trap_nr = trapnr;
#ifdef CONFIG_X86_64
if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
@@ -164,7 +165,7 @@ trap_signal:
kernel_trap:
if (!fixup_exception(regs)) {
tsk->thread.error_code = error_code;
- tsk->thread.trap_no = trapnr;
+ tsk->thread.trap_nr = trapnr;
die(str, regs, error_code);
}
return;
@@ -203,27 +204,31 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
do_trap(trapnr, signr, str, regs, error_code, &info); \
}
-DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
-DO_ERROR(4, SIGSEGV, "overflow", overflow)
-DO_ERROR(5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
-DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
-DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
+DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV,
+ regs->ip)
+DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow)
+DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds)
+DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN,
+ regs->ip)
+DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",
+ coprocessor_segment_overrun)
+DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
+DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
#ifdef CONFIG_X86_32
-DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
+DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
#endif
-DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
+DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check,
+ BUS_ADRALN, 0)
#ifdef CONFIG_X86_64
/* Runs on IST stack */
dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
{
if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
- 12, SIGBUS) == NOTIFY_STOP)
+ X86_TRAP_SS, SIGBUS) == NOTIFY_STOP)
return;
preempt_conditional_sti(regs);
- do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
+ do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL);
preempt_conditional_cli(regs);
}
@@ -233,10 +238,10 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
struct task_struct *tsk = current;
/* Return not checked because double check cannot be ignored */
- notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
+ notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 8;
+ tsk->thread.trap_nr = X86_TRAP_DF;
/*
* This is always a kernel trap and never fixable (and thus must
@@ -264,7 +269,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
goto gp_in_kernel;
tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 13;
+ tsk->thread.trap_nr = X86_TRAP_GP;
if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
printk_ratelimit()) {
@@ -291,24 +296,29 @@ gp_in_kernel:
return;
tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 13;
- if (notify_die(DIE_GPF, "general protection fault", regs,
- error_code, 13, SIGSEGV) == NOTIFY_STOP)
+ tsk->thread.trap_nr = X86_TRAP_GP;
+ if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
+ X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
return;
die("general protection fault", regs, error_code);
}
/* May run on IST stack. */
-dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
+dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code)
{
+#ifdef CONFIG_DYNAMIC_FTRACE
+ /* ftrace must be first, everything else may cause a recursive crash */
+ if (unlikely(modifying_ftrace_code) && ftrace_int3_handler(regs))
+ return;
+#endif
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
- if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
- == NOTIFY_STOP)
+ if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
+ SIGTRAP) == NOTIFY_STOP)
return;
#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
- if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
- == NOTIFY_STOP)
+ if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
+ SIGTRAP) == NOTIFY_STOP)
return;
/*
@@ -317,7 +327,7 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
*/
debug_stack_usage_inc();
preempt_conditional_sti(regs);
- do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
+ do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
preempt_conditional_cli(regs);
debug_stack_usage_dec();
}
@@ -422,8 +432,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
preempt_conditional_sti(regs);
if (regs->flags & X86_VM_MASK) {
- handle_vm86_trap((struct kernel_vm86_regs *) regs,
- error_code, 1);
+ handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
+ X86_TRAP_DB);
preempt_conditional_cli(regs);
debug_stack_usage_dec();
return;
@@ -460,7 +470,8 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
struct task_struct *task = current;
siginfo_t info;
unsigned short err;
- char *str = (trapnr == 16) ? "fpu exception" : "simd exception";
+ char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
+ "simd exception";
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
return;
@@ -470,7 +481,7 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
{
if (!fixup_exception(regs)) {
task->thread.error_code = error_code;
- task->thread.trap_no = trapnr;
+ task->thread.trap_nr = trapnr;
die(str, regs, error_code);
}
return;
@@ -480,12 +491,12 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
* Save the info for the exception handler and clear the error.
*/
save_init_fpu(task);
- task->thread.trap_no = trapnr;
+ task->thread.trap_nr = trapnr;
task->thread.error_code = error_code;
info.si_signo = SIGFPE;
info.si_errno = 0;
info.si_addr = (void __user *)regs->ip;
- if (trapnr == 16) {
+ if (trapnr == X86_TRAP_MF) {
unsigned short cwd, swd;
/*
* (~cwd & swd) will mask out exceptions that are not set to unmasked
@@ -529,10 +540,11 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
info.si_code = FPE_FLTRES;
} else {
/*
- * If we're using IRQ 13, or supposedly even some trap 16
- * implementations, it's possible we get a spurious trap...
+ * If we're using IRQ 13, or supposedly even some trap
+ * X86_TRAP_MF implementations, it's possible
+ * we get a spurious trap, which is not an error.
*/
- return; /* Spurious trap, no error */
+ return;
}
force_sig_info(SIGFPE, &info, task);
}
@@ -543,13 +555,13 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
ignore_fpu_irq = 1;
#endif
- math_error(regs, error_code, 16);
+ math_error(regs, error_code, X86_TRAP_MF);
}
dotraplinkage void
do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
{
- math_error(regs, error_code, 19);
+ math_error(regs, error_code, X86_TRAP_XF);
}
dotraplinkage void
@@ -643,20 +655,21 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
info.si_errno = 0;
info.si_code = ILL_BADSTK;
info.si_addr = NULL;
- if (notify_die(DIE_TRAP, "iret exception",
- regs, error_code, 32, SIGILL) == NOTIFY_STOP)
+ if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
+ X86_TRAP_IRET, SIGILL) == NOTIFY_STOP)
return;
- do_trap(32, SIGILL, "iret exception", regs, error_code, &info);
+ do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
+ &info);
}
#endif
/* Set of traps needed for early debugging. */
void __init early_trap_init(void)
{
- set_intr_gate_ist(1, &debug, DEBUG_STACK);
+ set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
/* int3 can be called from all */
- set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
- set_intr_gate(14, &page_fault);
+ set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+ set_intr_gate(X86_TRAP_PF, &page_fault);
load_idt(&idt_descr);
}
@@ -672,30 +685,30 @@ void __init trap_init(void)
early_iounmap(p, 4);
#endif
- set_intr_gate(0, &divide_error);
- set_intr_gate_ist(2, &nmi, NMI_STACK);
+ set_intr_gate(X86_TRAP_DE, &divide_error);
+ set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
/* int4 can be called from all */
- set_system_intr_gate(4, &overflow);
- set_intr_gate(5, &bounds);
- set_intr_gate(6, &invalid_op);
- set_intr_gate(7, &device_not_available);
+ set_system_intr_gate(X86_TRAP_OF, &overflow);
+ set_intr_gate(X86_TRAP_BR, &bounds);
+ set_intr_gate(X86_TRAP_UD, &invalid_op);
+ set_intr_gate(X86_TRAP_NM, &device_not_available);
#ifdef CONFIG_X86_32
- set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
+ set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);
#else
- set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
+ set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
#endif
- set_intr_gate(9, &coprocessor_segment_overrun);
- set_intr_gate(10, &invalid_TSS);
- set_intr_gate(11, &segment_not_present);
- set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
- set_intr_gate(13, &general_protection);
- set_intr_gate(15, &spurious_interrupt_bug);
- set_intr_gate(16, &coprocessor_error);
- set_intr_gate(17, &alignment_check);
+ set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun);
+ set_intr_gate(X86_TRAP_TS, &invalid_TSS);
+ set_intr_gate(X86_TRAP_NP, &segment_not_present);
+ set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK);
+ set_intr_gate(X86_TRAP_GP, &general_protection);
+ set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug);
+ set_intr_gate(X86_TRAP_MF, &coprocessor_error);
+ set_intr_gate(X86_TRAP_AC, &alignment_check);
#ifdef CONFIG_X86_MCE
- set_intr_gate_ist(18, &machine_check, MCE_STACK);
+ set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);
#endif
- set_intr_gate(19, &simd_coprocessor_error);
+ set_intr_gate(X86_TRAP_XF, &simd_coprocessor_error);
/* Reserve all the builtin and the syscall vector: */
for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
@@ -720,7 +733,7 @@ void __init trap_init(void)
#ifdef CONFIG_X86_64
memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);
- set_nmi_gate(1, &debug);
- set_nmi_gate(3, &int3);
+ set_nmi_gate(X86_TRAP_DB, &debug);
+ set_nmi_gate(X86_TRAP_BP, &int3);
#endif
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index a62c201..fc0a147 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -620,7 +620,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
if (cpu_khz) {
*scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
- *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR);
+ *offset = ns_now - mult_frac(tsc_now, *scale,
+ (1UL << CYC2NS_SCALE_FACTOR));
}
sched_clock_idle_wakeup_event(0);
@@ -629,7 +630,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
static unsigned long long cyc2ns_suspend;
-void save_sched_clock_state(void)
+void tsc_save_sched_clock_state(void)
{
if (!sched_clock_stable)
return;
@@ -645,7 +646,7 @@ void save_sched_clock_state(void)
* that sched_clock() continues from the point where it was left off during
* suspend.
*/
-void restore_sched_clock_state(void)
+void tsc_restore_sched_clock_state(void)
{
unsigned long long offset;
unsigned long flags;
@@ -932,6 +933,16 @@ static int __init init_tsc_clocksource(void)
clocksource_tsc.rating = 0;
clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
}
+
+ /*
+ * Trust the results of the earlier calibration on systems
+ * exporting a reliable TSC.
+ */
+ if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
+ clocksource_register_khz(&clocksource_tsc, tsc_khz);
+ return 0;
+ }
+
schedule_delayed_work(&tsc_irqwork, 0);
return 0;
}
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 9eba29b..fc25e60 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -42,7 +42,7 @@ static __cpuinitdata int nr_warps;
/*
* TSC-warp measurement loop running on both CPUs:
*/
-static __cpuinit void check_tsc_warp(void)
+static __cpuinit void check_tsc_warp(unsigned int timeout)
{
cycles_t start, now, prev, end;
int i;
@@ -51,9 +51,9 @@ static __cpuinit void check_tsc_warp(void)
start = get_cycles();
rdtsc_barrier();
/*
- * The measurement runs for 20 msecs:
+ * The measurement runs for 'timeout' msecs:
*/
- end = start + tsc_khz * 20ULL;
+ end = start + (cycles_t) tsc_khz * timeout;
now = start;
for (i = 0; ; i++) {
@@ -99,6 +99,25 @@ static __cpuinit void check_tsc_warp(void)
}
/*
+ * If the target CPU coming online doesn't have any of its core-siblings
+ * online, a timeout of 20msec will be used for the TSC-warp measurement
+ * loop. Otherwise a smaller timeout of 2msec will be used, as we have some
+ * information about this socket already (and this information grows as we
+ * have more and more logical-siblings in that socket).
+ *
+ * Ideally we should be able to skip the TSC sync check on the other
+ * core-siblings, if the first logical CPU in a socket passed the sync test.
+ * But as the TSC is per-logical CPU and can potentially be modified wrongly
+ * by the bios, TSC sync test for smaller duration should be able
+ * to catch such errors. Also this will catch the condition where all the
+ * cores in the socket doesn't get reset at the same time.
+ */
+static inline unsigned int loop_timeout(int cpu)
+{
+ return (cpumask_weight(cpu_core_mask(cpu)) > 1) ? 2 : 20;
+}
+
+/*
* Source CPU calls into this - it waits for the freshly booted
* target CPU to arrive and then starts the measurement:
*/
@@ -135,7 +154,7 @@ void __cpuinit check_tsc_sync_source(int cpu)
*/
atomic_inc(&start_count);
- check_tsc_warp();
+ check_tsc_warp(loop_timeout(cpu));
while (atomic_read(&stop_count) != cpus-1)
cpu_relax();
@@ -183,7 +202,7 @@ void __cpuinit check_tsc_sync_target(void)
while (atomic_read(&start_count) != cpus)
cpu_relax();
- check_tsc_warp();
+ check_tsc_warp(loop_timeout(smp_processor_id()));
/*
* Ok, we are done:
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index b466cab..255f58a 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -172,6 +172,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
spinlock_t *ptl;
int i;
+ down_write(&mm->mmap_sem);
pgd = pgd_offset(mm, 0xA0000);
if (pgd_none_or_clear_bad(pgd))
goto out;
@@ -190,6 +191,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
}
pte_unmap_unlock(pte, ptl);
out:
+ up_write(&mm->mmap_sem);
flush_tlb();
}
@@ -567,7 +569,7 @@ int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
}
if (trapno != 1)
return 1; /* we let this handle by the calling routine */
- current->thread.trap_no = trapno;
+ current->thread.trap_nr = trapno;
current->thread.error_code = error_code;
force_sig(SIGTRAP, current);
return 0;
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index b07ba93..7515cf0e 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -52,10 +52,7 @@
#include "vsyscall_trace.h"
DEFINE_VVAR(int, vgetcpu_mode);
-DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
-{
- .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
-};
+DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
@@ -80,20 +77,15 @@ early_param("vsyscall", vsyscall_setup);
void update_vsyscall_tz(void)
{
- unsigned long flags;
-
- write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
- /* sys_tz has changed */
vsyscall_gtod_data.sys_tz = sys_tz;
- write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
struct clocksource *clock, u32 mult)
{
- unsigned long flags;
+ struct timespec monotonic;
- write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+ write_seqcount_begin(&vsyscall_gtod_data.seq);
/* copy vsyscall data */
vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode;
@@ -101,12 +93,19 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
vsyscall_gtod_data.clock.mask = clock->mask;
vsyscall_gtod_data.clock.mult = mult;
vsyscall_gtod_data.clock.shift = clock->shift;
+
vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
- vsyscall_gtod_data.wall_to_monotonic = *wtm;
+
+ monotonic = timespec_add(*wall_time, *wtm);
+ vsyscall_gtod_data.monotonic_time_sec = monotonic.tv_sec;
+ vsyscall_gtod_data.monotonic_time_nsec = monotonic.tv_nsec;
+
vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
+ vsyscall_gtod_data.monotonic_time_coarse =
+ timespec_add(vsyscall_gtod_data.wall_time_coarse, *wtm);
- write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
+ write_seqcount_end(&vsyscall_gtod_data.seq);
}
static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
@@ -153,7 +152,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
thread->error_code = 6; /* user fault, no page, write */
thread->cr2 = ptr;
- thread->trap_no = 14;
+ thread->trap_nr = X86_TRAP_PF;
memset(&info, 0, sizeof(info));
info.si_signo = SIGSEGV;
@@ -217,9 +216,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
current_thread_info()->sig_on_uaccess_error = 1;
/*
- * 0 is a valid user pointer (in the access_ok sense) on 32-bit and
+ * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
* 64-bit, so we don't need to special-case it here. For all the
- * vsyscalls, 0 means "don't write anything" not "write it at
+ * vsyscalls, NULL means "don't write anything" not "write it at
* address 0".
*/
ret = -EFAULT;
@@ -248,7 +247,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
ret = sys_getcpu((unsigned __user *)regs->di,
(unsigned __user *)regs->si,
- 0);
+ NULL);
break;
}
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 947a06c..35c5e54 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -18,6 +18,7 @@
#include <asm/e820.h>
#include <asm/time.h>
#include <asm/irq.h>
+#include <asm/io_apic.h>
#include <asm/pat.h>
#include <asm/tsc.h>
#include <asm/iommu.h>
@@ -91,8 +92,8 @@ struct x86_init_ops x86_init __initdata = {
};
struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
+ .early_percpu_clock_init = x86_init_noop,
.setup_percpu_clockev = setup_secondary_APIC_clock,
- .fixup_cpu_id = x86_default_fixup_cpu_id,
};
static void default_nmi_init(void) { };
@@ -107,7 +108,9 @@ struct x86_platform_ops x86_platform = {
.is_untracked_pat_range = is_ISA_range,
.nmi_init = default_nmi_init,
.get_nmi_reason = default_get_nmi_reason,
- .i8042_detect = default_i8042_detect
+ .i8042_detect = default_i8042_detect,
+ .save_sched_clock_state = tsc_save_sched_clock_state,
+ .restore_sched_clock_state = tsc_restore_sched_clock_state,
};
EXPORT_SYMBOL_GPL(x86_platform);
@@ -117,3 +120,10 @@ struct x86_msi_ops x86_msi = {
.teardown_msi_irqs = default_teardown_msi_irqs,
.restore_msi_irqs = default_restore_msi_irqs,
};
+
+struct x86_io_apic_ops x86_io_apic_ops = {
+ .init = native_io_apic_init_mappings,
+ .read = native_io_apic_read,
+ .write = native_io_apic_write,
+ .modify = native_io_apic_modify,
+};
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 7110911..e62728e 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -6,6 +6,7 @@
#include <linux/bootmem.h>
#include <linux/compat.h>
#include <asm/i387.h>
+#include <asm/fpu-internal.h>
#ifdef CONFIG_IA32_EMULATION
#include <asm/sigcontext32.h>
#endif