From 250c22777fe1ccd7ac588579a6c16db4c0161cc5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 11 Oct 2007 11:17:24 +0200 Subject: x86_64: move kernel Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index cff3d1d..4946764 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -174,7 +174,7 @@ no_longmode: hlt jmp 1b -#include "../../../x86_64/kernel/verify_cpu_64.S" +#include "../../kernel/verify_cpu_64.S" /* Be careful here startup_64 needs to be at a predictable * address so I can export it in an ELF header. Bootloaders diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 577d08f..45855c9 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -1,5 +1,5 @@ ifeq ($(CONFIG_X86_32),y) include ${srctree}/arch/x86/kernel/Makefile_32 else -include ${srctree}/arch/x86_64/kernel/Makefile_64 +include ${srctree}/arch/x86/kernel/Makefile_64 endif diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32 index 5096f48..cb25523 100644 --- a/arch/x86/kernel/Makefile_32 +++ b/arch/x86/kernel/Makefile_32 @@ -83,6 +83,4 @@ $(obj)/vsyscall-syms.o: $(src)/vsyscall_32.lds \ $(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE $(call if_changed,syscall) -k8-y += ../../x86_64/kernel/k8.o -stacktrace-y += ../../x86_64/kernel/stacktrace.o diff --git a/arch/x86/kernel/Makefile_64 b/arch/x86/kernel/Makefile_64 new file mode 100644 index 0000000..6e6b590 --- /dev/null +++ b/arch/x86/kernel/Makefile_64 @@ -0,0 +1,54 @@ +# +# Makefile for the linux kernel. +# + +extra-y := head_64.o head64.o init_task_64.o vmlinux.lds +EXTRA_AFLAGS := -traditional +obj-y := process_64.o signal_64.o entry_64.o traps_64.o irq_64.o \ + ptrace_64.o time_64.o ioport_64.o ldt_64.o setup_64.o i8259_64.o sys_x86_64.o \ + x8664_ksyms_64.o i387_64.o syscall_64.o vsyscall_64.o \ + setup64.o bootflag.o e820_64.o reboot_64.o quirks.o i8237.o \ + pci-dma_64.o pci-nommu_64.o alternative.o hpet_64.o tsc_64.o bugs_64.o \ + perfctr-watchdog.o + +obj-$(CONFIG_STACKTRACE) += stacktrace.o +obj-$(CONFIG_X86_MCE) += mce_64.o therm_throt.o +obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o +obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o +obj-$(CONFIG_MTRR) += ../../x86/kernel/cpu/mtrr/ +obj-$(CONFIG_ACPI) += ../../x86/kernel/acpi/ +obj-$(CONFIG_X86_MSR) += msr.o +obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_X86_CPUID) += cpuid.o +obj-$(CONFIG_SMP) += smp_64.o smpboot_64.o trampoline_64.o tsc_sync.o +obj-y += apic_64.o nmi_64.o +obj-y += io_apic_64.o mpparse_64.o genapic_64.o genapic_flat_64.o +obj-$(CONFIG_KEXEC) += machine_kexec_64.o relocate_kernel_64.o crash_64.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump_64.o +obj-$(CONFIG_PM) += suspend_64.o +obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o +obj-$(CONFIG_CPU_FREQ) += ../../x86/kernel/cpu/cpufreq/ +obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_IOMMU) += pci-gart_64.o aperture_64.o +obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o +obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o +obj-$(CONFIG_KPROBES) += kprobes_64.o +obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o +obj-$(CONFIG_X86_VSMP) += vsmp_64.o +obj-$(CONFIG_K8_NB) += k8.o +obj-$(CONFIG_AUDIT) += audit_64.o + +obj-$(CONFIG_MODULES) += module_64.o +obj-$(CONFIG_PCI) += early-quirks_64.o + +obj-y += topology.o +obj-y += intel_cacheinfo.o +obj-y += addon_cpuid_features.o +obj-y += pcspeaker.o + +CFLAGS_vsyscall_64.o := $(PROFILING) -g0 + +therm_throt-y += ../../x86/kernel/cpu/mcheck/therm_throt.o +intel_cacheinfo-y += ../../x86/kernel/cpu/intel_cacheinfo.o +addon_cpuid_features-y += ../../x86/kernel/cpu/addon_cpuid_features.o +perfctr-watchdog-y += ../../x86/kernel/cpu/perfctr-watchdog.o diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 5e3b3f5..8b4357e 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -269,7 +269,7 @@ no_longmode: movb $0xbc,%al ; outb %al,$0x80 jmp no_longmode -#include "../../../x86_64/kernel/verify_cpu_64.S" +#include "../verify_cpu_64.S" /* This code uses an extended set of video mode numbers. These include: * Aliases for standard modes diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c new file mode 100644 index 0000000..8f681ca --- /dev/null +++ b/arch/x86/kernel/aperture_64.c @@ -0,0 +1,298 @@ +/* + * Firmware replacement code. + * + * Work around broken BIOSes that don't set an aperture or only set the + * aperture in the AGP bridge. + * If all fails map the aperture over some low memory. This is cheaper than + * doing bounce buffering. The memory is lost. This is done at early boot + * because only the bootmem allocator can allocate 32+MB. + * + * Copyright 2002 Andi Kleen, SuSE Labs. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int iommu_aperture; +int iommu_aperture_disabled __initdata = 0; +int iommu_aperture_allowed __initdata = 0; + +int fallback_aper_order __initdata = 1; /* 64MB */ +int fallback_aper_force __initdata = 0; + +int fix_aperture __initdata = 1; + +static struct resource gart_resource = { + .name = "GART", + .flags = IORESOURCE_MEM, +}; + +static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) +{ + gart_resource.start = aper_base; + gart_resource.end = aper_base + aper_size - 1; + insert_resource(&iomem_resource, &gart_resource); +} + +/* This code runs before the PCI subsystem is initialized, so just + access the northbridge directly. */ + +static u32 __init allocate_aperture(void) +{ + u32 aper_size; + void *p; + + if (fallback_aper_order > 7) + fallback_aper_order = 7; + aper_size = (32 * 1024 * 1024) << fallback_aper_order; + + /* + * Aperture has to be naturally aligned. This means an 2GB aperture won't + * have much chance of finding a place in the lower 4GB of memory. + * Unfortunately we cannot move it up because that would make the + * IOMMU useless. + */ + p = __alloc_bootmem_nopanic(aper_size, aper_size, 0); + if (!p || __pa(p)+aper_size > 0xffffffff) { + printk("Cannot allocate aperture memory hole (%p,%uK)\n", + p, aper_size>>10); + if (p) + free_bootmem(__pa(p), aper_size); + return 0; + } + printk("Mapping aperture over %d KB of RAM @ %lx\n", + aper_size >> 10, __pa(p)); + insert_aperture_resource((u32)__pa(p), aper_size); + return (u32)__pa(p); +} + +static int __init aperture_valid(u64 aper_base, u32 aper_size) +{ + if (!aper_base) + return 0; + if (aper_size < 64*1024*1024) { + printk("Aperture too small (%d MB)\n", aper_size>>20); + return 0; + } + if (aper_base + aper_size > 0x100000000UL) { + printk("Aperture beyond 4GB. Ignoring.\n"); + return 0; + } + if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { + printk("Aperture pointing to e820 RAM. Ignoring.\n"); + return 0; + } + return 1; +} + +/* Find a PCI capability */ +static __u32 __init find_cap(int num, int slot, int func, int cap) +{ + u8 pos; + int bytes; + if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST)) + return 0; + pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST); + for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { + u8 id; + pos &= ~3; + id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID); + if (id == 0xff) + break; + if (id == cap) + return pos; + pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); + } + return 0; +} + +/* Read a standard AGPv3 bridge header */ +static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) +{ + u32 apsize; + u32 apsizereg; + int nbits; + u32 aper_low, aper_hi; + u64 aper; + + printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func); + apsizereg = read_pci_config_16(num,slot,func, cap + 0x14); + if (apsizereg == 0xffffffff) { + printk("APSIZE in AGP bridge unreadable\n"); + return 0; + } + + apsize = apsizereg & 0xfff; + /* Some BIOS use weird encodings not in the AGPv3 table. */ + if (apsize & 0xff) + apsize |= 0xf00; + nbits = hweight16(apsize); + *order = 7 - nbits; + if ((int)*order < 0) /* < 32MB */ + *order = 0; + + aper_low = read_pci_config(num,slot,func, 0x10); + aper_hi = read_pci_config(num,slot,func,0x14); + aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); + + printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", + aper, 32 << *order, apsizereg); + + if (!aperture_valid(aper, (32*1024*1024) << *order)) + return 0; + return (u32)aper; +} + +/* Look for an AGP bridge. Windows only expects the aperture in the + AGP bridge and some BIOS forget to initialize the Northbridge too. + Work around this here. + + Do an PCI bus scan by hand because we're running before the PCI + subsystem. + + All K8 AGP bridges are AGPv3 compliant, so we can do this scan + generically. It's probably overkill to always scan all slots because + the AGP bridges should be always an own bus on the HT hierarchy, + but do it here for future safety. */ +static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) +{ + int num, slot, func; + + /* Poor man's PCI discovery */ + for (num = 0; num < 256; num++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + u32 class, cap; + u8 type; + class = read_pci_config(num,slot,func, + PCI_CLASS_REVISION); + if (class == 0xffffffff) + break; + + switch (class >> 16) { + case PCI_CLASS_BRIDGE_HOST: + case PCI_CLASS_BRIDGE_OTHER: /* needed? */ + /* AGP bridge? */ + cap = find_cap(num,slot,func,PCI_CAP_ID_AGP); + if (!cap) + break; + *valid_agp = 1; + return read_agp(num,slot,func,cap,order); + } + + /* No multi-function device? */ + type = read_pci_config_byte(num,slot,func, + PCI_HEADER_TYPE); + if (!(type & 0x80)) + break; + } + } + } + printk("No AGP bridge found\n"); + return 0; +} + +void __init iommu_hole_init(void) +{ + int fix, num; + u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; + u64 aper_base, last_aper_base = 0; + int valid_agp = 0; + + if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) + return; + + printk(KERN_INFO "Checking aperture...\n"); + + fix = 0; + for (num = 24; num < 32; num++) { + if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) + continue; + + iommu_detected = 1; + iommu_aperture = 1; + + aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; + aper_size = (32 * 1024 * 1024) << aper_order; + aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; + aper_base <<= 25; + + printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, + aper_base, aper_size>>20); + + if (!aperture_valid(aper_base, aper_size)) { + fix = 1; + break; + } + + if ((last_aper_order && aper_order != last_aper_order) || + (last_aper_base && aper_base != last_aper_base)) { + fix = 1; + break; + } + last_aper_order = aper_order; + last_aper_base = aper_base; + } + + if (!fix && !fallback_aper_force) { + if (last_aper_base) { + unsigned long n = (32 * 1024 * 1024) << last_aper_order; + insert_aperture_resource((u32)last_aper_base, n); + } + return; + } + + if (!fallback_aper_force) + aper_alloc = search_agp_bridge(&aper_order, &valid_agp); + + if (aper_alloc) { + /* Got the aperture from the AGP bridge */ + } else if (swiotlb && !valid_agp) { + /* Do nothing */ + } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) || + force_iommu || + valid_agp || + fallback_aper_force) { + printk("Your BIOS doesn't leave a aperture memory hole\n"); + printk("Please enable the IOMMU option in the BIOS setup\n"); + printk("This costs you %d MB of RAM\n", + 32 << fallback_aper_order); + + aper_order = fallback_aper_order; + aper_alloc = allocate_aperture(); + if (!aper_alloc) { + /* Could disable AGP and IOMMU here, but it's probably + not worth it. But the later users cannot deal with + bad apertures and turning on the aperture over memory + causes very strange problems, so it's better to + panic early. */ + panic("Not enough memory for aperture"); + } + } else { + return; + } + + /* Fix up the north bridges */ + for (num = 24; num < 32; num++) { + if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) + continue; + + /* Don't enable translation yet. That is done later. + Assume this BIOS didn't initialise the GART so + just overwrite all previous bits */ + write_pci_config(0, num, 3, 0x90, aper_order<<1); + write_pci_config(0, num, 3, 0x94, aper_alloc>>25); + } +} diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c new file mode 100644 index 0000000..925758d --- /dev/null +++ b/arch/x86/kernel/apic_64.c @@ -0,0 +1,1253 @@ +/* + * Local APIC handling, local APIC timers + * + * (c) 1999, 2000 Ingo Molnar + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively. + * Maciej W. Rozycki : Various updates and fixes. + * Mikael Pettersson : Power Management for UP-APIC. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int apic_mapped; +int apic_verbosity; +int apic_runs_main_timer; +int apic_calibrate_pmtmr __initdata; + +int disable_apic_timer __initdata; + +/* Local APIC timer works in C2? */ +int local_apic_timer_c2_ok; +EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); + +static struct resource *ioapic_resources; +static struct resource lapic_resource = { + .name = "Local APIC", + .flags = IORESOURCE_MEM | IORESOURCE_BUSY, +}; + +/* + * cpu_mask that denotes the CPUs that needs timer interrupt coming in as + * IPIs in place of local APIC timers + */ +static cpumask_t timer_interrupt_broadcast_ipi_mask; + +/* Using APIC to generate smp_local_timer_interrupt? */ +int using_apic_timer __read_mostly = 0; + +static void apic_pm_activate(void); + +void apic_wait_icr_idle(void) +{ + while (apic_read(APIC_ICR) & APIC_ICR_BUSY) + cpu_relax(); +} + +unsigned int safe_apic_wait_icr_idle(void) +{ + unsigned int send_status; + int timeout; + + timeout = 0; + do { + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + if (!send_status) + break; + udelay(100); + } while (timeout++ < 1000); + + return send_status; +} + +void enable_NMI_through_LVT0 (void * dummy) +{ + unsigned int v; + + /* unmask and set to NMI */ + v = APIC_DM_NMI; + apic_write(APIC_LVT0, v); +} + +int get_maxlvt(void) +{ + unsigned int v, maxlvt; + + v = apic_read(APIC_LVR); + maxlvt = GET_APIC_MAXLVT(v); + return maxlvt; +} + +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + printk("unexpected IRQ trap at vector %02x\n", irq); + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But don't ack when the APIC is disabled. -AK + */ + if (!disable_apic) + ack_APIC_irq(); +} + +void clear_local_APIC(void) +{ + int maxlvt; + unsigned int v; + + maxlvt = get_maxlvt(); + + /* + * Masking an LVT entry can trigger a local APIC error + * if the vector is zero. Mask LVTERR first to prevent this. + */ + if (maxlvt >= 3) { + v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ + apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); + } + /* + * Careful: we have to set masks only first to deassert + * any level-triggered sources. + */ + v = apic_read(APIC_LVTT); + apic_write(APIC_LVTT, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT1); + apic_write(APIC_LVT1, v | APIC_LVT_MASKED); + if (maxlvt >= 4) { + v = apic_read(APIC_LVTPC); + apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); + } + + /* + * Clean APIC state for other OSs: + */ + apic_write(APIC_LVTT, APIC_LVT_MASKED); + apic_write(APIC_LVT0, APIC_LVT_MASKED); + apic_write(APIC_LVT1, APIC_LVT_MASKED); + if (maxlvt >= 3) + apic_write(APIC_LVTERR, APIC_LVT_MASKED); + if (maxlvt >= 4) + apic_write(APIC_LVTPC, APIC_LVT_MASKED); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); +} + +void disconnect_bsp_APIC(int virt_wire_setup) +{ + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write(APIC_SPIV, value); + + if (!virt_wire_setup) { + /* For LVT0 make it edge triggered, active high, external and enabled */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write(APIC_LVT0, value); + } else { + /* Disable LVT0 */ + apic_write(APIC_LVT0, APIC_LVT_MASKED); + } + + /* For LVT1 make it edge triggered, active high, nmi and enabled */ + value = apic_read(APIC_LVT1); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write(APIC_LVT1, value); +} + +void disable_local_APIC(void) +{ + unsigned int value; + + clear_local_APIC(); + + /* + * Disable APIC (implies clearing of registers + * for 82489DX!). + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_SPIV_APIC_ENABLED; + apic_write(APIC_SPIV, value); +} + +/* + * This is to verify that we're looking at a real local APIC. + * Check these against your board if the CPUs aren't getting + * started for no apparent reason. + */ +int __init verify_local_APIC(void) +{ + unsigned int reg0, reg1; + + /* + * The version register is read-only in a real APIC. + */ + reg0 = apic_read(APIC_LVR); + apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); + apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); + reg1 = apic_read(APIC_LVR); + apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); + + /* + * The two version reads above should print the same + * numbers. If the second one is different, then we + * poke at a non-APIC. + */ + if (reg1 != reg0) + return 0; + + /* + * Check if the version looks reasonably. + */ + reg1 = GET_APIC_VERSION(reg0); + if (reg1 == 0x00 || reg1 == 0xff) + return 0; + reg1 = get_maxlvt(); + if (reg1 < 0x02 || reg1 == 0xff) + return 0; + + /* + * The ID register is read/write in a real APIC. + */ + reg0 = apic_read(APIC_ID); + apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); + apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); + reg1 = apic_read(APIC_ID); + apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); + apic_write(APIC_ID, reg0); + if (reg1 != (reg0 ^ APIC_ID_MASK)) + return 0; + + /* + * The next two are just to see if we have sane values. + * They're only really relevant if we're in Virtual Wire + * compatibility mode, but most boxes are anymore. + */ + reg0 = apic_read(APIC_LVT0); + apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0); + reg1 = apic_read(APIC_LVT1); + apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); + + return 1; +} + +void __init sync_Arb_IDs(void) +{ + /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ + unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); + if (ver >= 0x14) /* P4 or higher */ + return; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + + apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); + apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG + | APIC_DM_INIT); +} + +/* + * An initial setup of the virtual wire mode. + */ +void __init init_bsp_APIC(void) +{ + unsigned int value; + + /* + * Don't do the setup now if we have a SMP BIOS as the + * through-I/O-APIC virtual wire mode might be active. + */ + if (smp_found_config || !cpu_has_apic) + return; + + value = apic_read(APIC_LVR); + + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + + /* + * Enable APIC. + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= APIC_SPIV_FOCUS_DISABLED; + value |= SPURIOUS_APIC_VECTOR; + apic_write(APIC_SPIV, value); + + /* + * Set up the virtual wire mode. + */ + apic_write(APIC_LVT0, APIC_DM_EXTINT); + value = APIC_DM_NMI; + apic_write(APIC_LVT1, value); +} + +void __cpuinit setup_local_APIC (void) +{ + unsigned int value, maxlvt; + int i, j; + + value = apic_read(APIC_LVR); + + BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f); + + /* + * Double-check whether this APIC is really registered. + * This is meaningless in clustered apic mode, so we skip it. + */ + if (!apic_id_registered()) + BUG(); + + /* + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... + */ + init_apic_ldr(); + + /* + * Set Task Priority to 'accept all'. We never change this + * later on. + */ + value = apic_read(APIC_TASKPRI); + value &= ~APIC_TPRI_MASK; + apic_write(APIC_TASKPRI, value); + + /* + * After a crash, we no longer service the interrupts and a pending + * interrupt from previous kernel might still have ISR bit set. + * + * Most probably by now CPU has serviced that pending interrupt and + * it might not have done the ack_APIC_irq() because it thought, + * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it + * does not clear the ISR bit and cpu thinks it has already serivced + * the interrupt. Hence a vector might get locked. It was noticed + * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. + */ + for (i = APIC_ISR_NR - 1; i >= 0; i--) { + value = apic_read(APIC_ISR + i*0x10); + for (j = 31; j >= 0; j--) { + if (value & (1< 3) + apic_write(APIC_ESR, 0); + value = apic_read(APIC_ESR); + if (value != oldvalue) + apic_printk(APIC_VERBOSE, + "ESR value after enabling vector: %08x, after %08x\n", + oldvalue, value); + } + + nmi_watchdog_default(); + setup_apic_nmi_watchdog(NULL); + apic_pm_activate(); +} + +#ifdef CONFIG_PM + +static struct { + /* 'active' is true if the local APIC was enabled by us and + not the BIOS; this signifies that we are also responsible + for disabling it before entering apm/acpi suspend */ + int active; + /* r/w apic fields */ + unsigned int apic_id; + unsigned int apic_taskpri; + unsigned int apic_ldr; + unsigned int apic_dfr; + unsigned int apic_spiv; + unsigned int apic_lvtt; + unsigned int apic_lvtpc; + unsigned int apic_lvt0; + unsigned int apic_lvt1; + unsigned int apic_lvterr; + unsigned int apic_tmict; + unsigned int apic_tdcr; + unsigned int apic_thmr; +} apic_pm_state; + +static int lapic_suspend(struct sys_device *dev, pm_message_t state) +{ + unsigned long flags; + int maxlvt; + + if (!apic_pm_state.active) + return 0; + + maxlvt = get_maxlvt(); + + apic_pm_state.apic_id = apic_read(APIC_ID); + apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); + apic_pm_state.apic_ldr = apic_read(APIC_LDR); + apic_pm_state.apic_dfr = apic_read(APIC_DFR); + apic_pm_state.apic_spiv = apic_read(APIC_SPIV); + apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); + if (maxlvt >= 4) + apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); + apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); + apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); + apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); + apic_pm_state.apic_tmict = apic_read(APIC_TMICT); + apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 5) + apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); +#endif + local_irq_save(flags); + disable_local_APIC(); + local_irq_restore(flags); + return 0; +} + +static int lapic_resume(struct sys_device *dev) +{ + unsigned int l, h; + unsigned long flags; + int maxlvt; + + if (!apic_pm_state.active) + return 0; + + maxlvt = get_maxlvt(); + + local_irq_save(flags); + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; + wrmsr(MSR_IA32_APICBASE, l, h); + apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); + apic_write(APIC_ID, apic_pm_state.apic_id); + apic_write(APIC_DFR, apic_pm_state.apic_dfr); + apic_write(APIC_LDR, apic_pm_state.apic_ldr); + apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); + apic_write(APIC_SPIV, apic_pm_state.apic_spiv); + apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); + apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 5) + apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); +#endif + if (maxlvt >= 4) + apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); + apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); + apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); + apic_write(APIC_TMICT, apic_pm_state.apic_tmict); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + local_irq_restore(flags); + return 0; +} + +static struct sysdev_class lapic_sysclass = { + set_kset_name("lapic"), + .resume = lapic_resume, + .suspend = lapic_suspend, +}; + +static struct sys_device device_lapic = { + .id = 0, + .cls = &lapic_sysclass, +}; + +static void __cpuinit apic_pm_activate(void) +{ + apic_pm_state.active = 1; +} + +static int __init init_lapic_sysfs(void) +{ + int error; + if (!cpu_has_apic) + return 0; + /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ + error = sysdev_class_register(&lapic_sysclass); + if (!error) + error = sysdev_register(&device_lapic); + return error; +} +device_initcall(init_lapic_sysfs); + +#else /* CONFIG_PM */ + +static void apic_pm_activate(void) { } + +#endif /* CONFIG_PM */ + +static int __init apic_set_verbosity(char *str) +{ + if (str == NULL) { + skip_ioapic_setup = 0; + ioapic_force = 1; + return 0; + } + if (strcmp("debug", str) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", str) == 0) + apic_verbosity = APIC_VERBOSE; + else { + printk(KERN_WARNING "APIC Verbosity level %s not recognised" + " use apic=verbose or apic=debug\n", str); + return -EINVAL; + } + + return 0; +} +early_param("apic", apic_set_verbosity); + +/* + * Detect and enable local APICs on non-SMP boards. + * Original code written by Keir Fraser. + * On AMD64 we trust the BIOS - if it says no APIC it is likely + * not correctly set up (usually the APIC timer won't work etc.) + */ + +static int __init detect_init_APIC (void) +{ + if (!cpu_has_apic) { + printk(KERN_INFO "No local APIC present\n"); + return -1; + } + + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + boot_cpu_id = 0; + return 0; +} + +#ifdef CONFIG_X86_IO_APIC +static struct resource * __init ioapic_setup_resources(void) +{ +#define IOAPIC_RESOURCE_NAME_SIZE 11 + unsigned long n; + struct resource *res; + char *mem; + int i; + + if (nr_ioapics <= 0) + return NULL; + + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); + n *= nr_ioapics; + + mem = alloc_bootmem(n); + res = (void *)mem; + + if (mem != NULL) { + memset(mem, 0, n); + mem += sizeof(struct resource) * nr_ioapics; + + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + sprintf(mem, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; + } + } + + ioapic_resources = res; + + return res; +} + +static int __init ioapic_insert_resources(void) +{ + int i; + struct resource *r = ioapic_resources; + + if (!r) { + printk("IO APIC resources could be not be allocated.\n"); + return -1; + } + + for (i = 0; i < nr_ioapics; i++) { + insert_resource(&iomem_resource, r); + r++; + } + + return 0; +} + +/* Insert the IO APIC resources after PCI initialization has occured to handle + * IO APICS that are mapped in on a BAR in PCI space. */ +late_initcall(ioapic_insert_resources); +#endif + +void __init init_apic_mappings(void) +{ + unsigned long apic_phys; + + /* + * If no local APIC can be found then set up a fake all + * zeroes page to simulate the local APIC and another + * one for the IO-APIC. + */ + if (!smp_found_config && detect_init_APIC()) { + apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); + apic_phys = __pa(apic_phys); + } else + apic_phys = mp_lapic_addr; + + set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + apic_mapped = 1; + apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys); + + /* Put local APIC into the resource map. */ + lapic_resource.start = apic_phys; + lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; + insert_resource(&iomem_resource, &lapic_resource); + + /* + * Fetch the APIC ID of the BSP in case we have a + * default configuration (or the MP table is broken). + */ + boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); + + { + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; + int i; + struct resource *ioapic_res; + + ioapic_res = ioapic_setup_resources(); + for (i = 0; i < nr_ioapics; i++) { + if (smp_found_config) { + ioapic_phys = mp_ioapics[i].mpc_apicaddr; + } else { + ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = __pa(ioapic_phys); + } + set_fixmap_nocache(idx, ioapic_phys); + apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n", + __fix_to_virt(idx), ioapic_phys); + idx++; + + if (ioapic_res != NULL) { + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res++; + } + } + } +} + +/* + * This function sets up the local APIC timer, with a timeout of + * 'clocks' APIC bus clock. During calibration we actually call + * this function twice on the boot CPU, once with a bogus timeout + * value, second time for real. The other (noncalibrating) CPUs + * call this function only once, with the real, calibrated value. + * + * We do reads before writes even if unnecessary, to get around the + * P5 APIC double write bug. + */ + +#define APIC_DIVISOR 16 + +static void __setup_APIC_LVTT(unsigned int clocks) +{ + unsigned int lvtt_value, tmp_value; + int cpu = smp_processor_id(); + + lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; + + if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) + lvtt_value |= APIC_LVT_MASKED; + + apic_write(APIC_LVTT, lvtt_value); + + /* + * Divide PICLK by 16 + */ + tmp_value = apic_read(APIC_TDCR); + apic_write(APIC_TDCR, (tmp_value + & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) + | APIC_TDR_DIV_16); + + apic_write(APIC_TMICT, clocks/APIC_DIVISOR); +} + +static void setup_APIC_timer(unsigned int clocks) +{ + unsigned long flags; + + local_irq_save(flags); + + /* wait for irq slice */ + if (hpet_address && hpet_use_timer) { + u32 trigger = hpet_readl(HPET_T0_CMP); + while (hpet_readl(HPET_T0_CMP) == trigger) + /* do nothing */ ; + } else { + int c1, c2; + outb_p(0x00, 0x43); + c2 = inb_p(0x40); + c2 |= inb_p(0x40) << 8; + do { + c1 = c2; + outb_p(0x00, 0x43); + c2 = inb_p(0x40); + c2 |= inb_p(0x40) << 8; + } while (c2 - c1 < 300); + } + __setup_APIC_LVTT(clocks); + /* Turn off PIT interrupt if we use APIC timer as main timer. + Only works with the PM timer right now + TBD fix it for HPET too. */ + if ((pmtmr_ioport != 0) && + smp_processor_id() == boot_cpu_id && + apic_runs_main_timer == 1 && + !cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) { + stop_timer_interrupt(); + apic_runs_main_timer++; + } + local_irq_restore(flags); +} + +/* + * In this function we calibrate APIC bus clocks to the external + * timer. Unfortunately we cannot use jiffies and the timer irq + * to calibrate, since some later bootup code depends on getting + * the first irq? Ugh. + * + * We want to do the calibration only once since we + * want to have local timer irqs syncron. CPUs connected + * by the same APIC bus have the very same bus frequency. + * And we want to have irqs off anyways, no accidental + * APIC irq that way. + */ + +#define TICK_COUNT 100000000 + +static int __init calibrate_APIC_clock(void) +{ + unsigned apic, apic_start; + unsigned long tsc, tsc_start; + int result; + /* + * Put whatever arbitrary (but long enough) timeout + * value into the APIC clock, we just want to get the + * counter running for calibration. + */ + __setup_APIC_LVTT(4000000000); + + apic_start = apic_read(APIC_TMCCT); +#ifdef CONFIG_X86_PM_TIMER + if (apic_calibrate_pmtmr && pmtmr_ioport) { + pmtimer_wait(5000); /* 5ms wait */ + apic = apic_read(APIC_TMCCT); + result = (apic_start - apic) * 1000L / 5; + } else +#endif + { + rdtscll(tsc_start); + + do { + apic = apic_read(APIC_TMCCT); + rdtscll(tsc); + } while ((tsc - tsc_start) < TICK_COUNT && + (apic_start - apic) < TICK_COUNT); + + result = (apic_start - apic) * 1000L * tsc_khz / + (tsc - tsc_start); + } + printk("result %d\n", result); + + + printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", + result / 1000 / 1000, result / 1000 % 1000); + + return result * APIC_DIVISOR / HZ; +} + +static unsigned int calibration_result; + +void __init setup_boot_APIC_clock (void) +{ + if (disable_apic_timer) { + printk(KERN_INFO "Disabling APIC timer\n"); + return; + } + + printk(KERN_INFO "Using local APIC timer interrupts.\n"); + using_apic_timer = 1; + + local_irq_disable(); + + calibration_result = calibrate_APIC_clock(); + /* + * Now set up the timer for real. + */ + setup_APIC_timer(calibration_result); + + local_irq_enable(); +} + +void __cpuinit setup_secondary_APIC_clock(void) +{ + local_irq_disable(); /* FIXME: Do we need this? --RR */ + setup_APIC_timer(calibration_result); + local_irq_enable(); +} + +void disable_APIC_timer(void) +{ + if (using_apic_timer) { + unsigned long v; + + v = apic_read(APIC_LVTT); + /* + * When an illegal vector value (0-15) is written to an LVT + * entry and delivery mode is Fixed, the APIC may signal an + * illegal vector error, with out regard to whether the mask + * bit is set or whether an interrupt is actually seen on input. + * + * Boot sequence might call this function when the LVTT has + * '0' vector value. So make sure vector field is set to + * valid value. + */ + v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, v); + } +} + +void enable_APIC_timer(void) +{ + int cpu = smp_processor_id(); + + if (using_apic_timer && + !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { + unsigned long v; + + v = apic_read(APIC_LVTT); + apic_write(APIC_LVTT, v & ~APIC_LVT_MASKED); + } +} + +void switch_APIC_timer_to_ipi(void *cpumask) +{ + cpumask_t mask = *(cpumask_t *)cpumask; + int cpu = smp_processor_id(); + + if (cpu_isset(cpu, mask) && + !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { + disable_APIC_timer(); + cpu_set(cpu, timer_interrupt_broadcast_ipi_mask); + } +} +EXPORT_SYMBOL(switch_APIC_timer_to_ipi); + +void smp_send_timer_broadcast_ipi(void) +{ + int cpu = smp_processor_id(); + cpumask_t mask; + + cpus_and(mask, cpu_online_map, timer_interrupt_broadcast_ipi_mask); + + if (cpu_isset(cpu, mask)) { + cpu_clear(cpu, mask); + add_pda(apic_timer_irqs, 1); + smp_local_timer_interrupt(); + } + + if (!cpus_empty(mask)) { + send_IPI_mask(mask, LOCAL_TIMER_VECTOR); + } +} + +void switch_ipi_to_APIC_timer(void *cpumask) +{ + cpumask_t mask = *(cpumask_t *)cpumask; + int cpu = smp_processor_id(); + + if (cpu_isset(cpu, mask) && + cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { + cpu_clear(cpu, timer_interrupt_broadcast_ipi_mask); + enable_APIC_timer(); + } +} +EXPORT_SYMBOL(switch_ipi_to_APIC_timer); + +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; +} + +void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector, + unsigned char msg_type, unsigned char mask) +{ + unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE; + unsigned int v = (mask << 16) | (msg_type << 8) | vector; + apic_write(reg, v); +} + +#undef APIC_DIVISOR + +/* + * Local timer interrupt handler. It does both profiling and + * process statistics/rescheduling. + * + * We do profiling in every local tick, statistics/rescheduling + * happen only every 'profiling multiplier' ticks. The default + * multiplier is 1 and it can be changed by writing the new multiplier + * value into /proc/profile. + */ + +void smp_local_timer_interrupt(void) +{ + profile_tick(CPU_PROFILING); +#ifdef CONFIG_SMP + update_process_times(user_mode(get_irq_regs())); +#endif + if (apic_runs_main_timer > 1 && smp_processor_id() == boot_cpu_id) + main_timer_handler(); + /* + * We take the 'long' return path, and there every subsystem + * grabs the appropriate locks (kernel lock/ irq lock). + * + * We might want to decouple profiling from the 'long path', + * and do the profiling totally in assembly. + * + * Currently this isn't too much of an issue (performance wise), + * we can take more than 100K local irqs per second on a 100 MHz P5. + */ +} + +/* + * Local APIC timer interrupt. This is the most natural way for doing + * local interrupts, but local timer interrupts can be emulated by + * broadcast interrupts too. [in case the hw doesn't support APIC timers] + * + * [ if a single-CPU system runs an SMP kernel then we call the local + * interrupt as well. Thus we cannot inline the local irq ... ] + */ +void smp_apic_timer_interrupt(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* + * the NMI deadlock-detector uses this. + */ + add_pda(apic_timer_irqs, 1); + + /* + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. + */ + ack_APIC_irq(); + /* + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. + */ + exit_idle(); + irq_enter(); + smp_local_timer_interrupt(); + irq_exit(); + set_irq_regs(old_regs); +} + +/* + * apic_is_clustered_box() -- Check if we can expect good TSC + * + * Thus far, the major user of this is IBM's Summit2 series: + * + * Clustered boxes may have unsynced TSC problems if they are + * multi-chassis. Use available data to take a good guess. + * If in doubt, go HPET. + */ +__cpuinit int apic_is_clustered_box(void) +{ + int i, clusters, zeros; + unsigned id; + DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); + + bitmap_zero(clustermap, NUM_APIC_CLUSTERS); + + for (i = 0; i < NR_CPUS; i++) { + id = bios_cpu_apicid[i]; + if (id != BAD_APICID) + __set_bit(APIC_CLUSTERID(id), clustermap); + } + + /* Problem: Partially populated chassis may not have CPUs in some of + * the APIC clusters they have been allocated. Only present CPUs have + * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since + * clusters are allocated sequentially, count zeros only if they are + * bounded by ones. + */ + clusters = 0; + zeros = 0; + for (i = 0; i < NUM_APIC_CLUSTERS; i++) { + if (test_bit(i, clustermap)) { + clusters += 1 + zeros; + zeros = 0; + } else + ++zeros; + } + + /* + * If clusters > 2, then should be multi-chassis. + * May have to revisit this when multi-core + hyperthreaded CPUs come + * out, but AFAIK this will work even for them. + */ + return (clusters > 2); +} + +/* + * This interrupt should _never_ happen with our APIC/SMP architecture + */ +asmlinkage void smp_spurious_interrupt(void) +{ + unsigned int v; + exit_idle(); + irq_enter(); + /* + * Check if this really is a spurious interrupt and ACK it + * if it is a vectored one. Just in case... + * Spurious interrupts should not be ACKed. + */ + v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); + if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) + ack_APIC_irq(); + + irq_exit(); +} + +/* + * This interrupt should never happen with our APIC/SMP architecture + */ + +asmlinkage void smp_error_interrupt(void) +{ + unsigned int v, v1; + + exit_idle(); + irq_enter(); + /* First tickle the hardware, only then report what went on. -- REW */ + v = apic_read(APIC_ESR); + apic_write(APIC_ESR, 0); + v1 = apic_read(APIC_ESR); + ack_APIC_irq(); + atomic_inc(&irq_err_count); + + /* Here is what the APIC error bits mean: + 0: Send CS error + 1: Receive CS error + 2: Send accept error + 3: Receive accept error + 4: Reserved + 5: Send illegal vector + 6: Received illegal vector + 7: Illegal register address + */ + printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", + smp_processor_id(), v , v1); + irq_exit(); +} + +int disable_apic; + +/* + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. + */ +int __init APIC_init_uniprocessor (void) +{ + if (disable_apic) { + printk(KERN_INFO "Apic disabled\n"); + return -1; + } + if (!cpu_has_apic) { + disable_apic = 1; + printk(KERN_INFO "Apic disabled by BIOS\n"); + return -1; + } + + verify_local_APIC(); + + phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); + apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id)); + + setup_local_APIC(); + + if (smp_found_config && !skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); + else + nr_ioapics = 0; + setup_boot_APIC_clock(); + check_nmi_watchdog(); + return 0; +} + +static __init int setup_disableapic(char *str) +{ + disable_apic = 1; + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + return 0; +} +early_param("disableapic", setup_disableapic); + +/* same as disableapic, for compatibility */ +static __init int setup_nolapic(char *str) +{ + return setup_disableapic(str); +} +early_param("nolapic", setup_nolapic); + +static int __init parse_lapic_timer_c2_ok(char *arg) +{ + local_apic_timer_c2_ok = 1; + return 0; +} +early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); + +static __init int setup_noapictimer(char *str) +{ + if (str[0] != ' ' && str[0] != 0) + return 0; + disable_apic_timer = 1; + return 1; +} + +static __init int setup_apicmaintimer(char *str) +{ + apic_runs_main_timer = 1; + nohpet = 1; + return 1; +} +__setup("apicmaintimer", setup_apicmaintimer); + +static __init int setup_noapicmaintimer(char *str) +{ + apic_runs_main_timer = -1; + return 1; +} +__setup("noapicmaintimer", setup_noapicmaintimer); + +static __init int setup_apicpmtimer(char *s) +{ + apic_calibrate_pmtmr = 1; + notsc_setup(NULL); + return setup_apicmaintimer(NULL); +} +__setup("apicpmtimer", setup_apicpmtimer); + +__setup("noapictimer", setup_noapictimer); + diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c new file mode 100644 index 0000000..778953b --- /dev/null +++ b/arch/x86/kernel/asm-offsets_64.c @@ -0,0 +1,85 @@ +/* + * Generate definitions needed by assembly language modules. + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEFINE(sym, val) \ + asm volatile("\n->" #sym " %0 " #val : : "i" (val)) + +#define BLANK() asm volatile("\n->" : : ) + +#define __NO_STUBS 1 +#undef __SYSCALL +#undef _ASM_X86_64_UNISTD_H_ +#define __SYSCALL(nr, sym) [nr] = 1, +static char syscalls[] = { +#include +}; + +int main(void) +{ +#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) + ENTRY(state); + ENTRY(flags); + ENTRY(thread); + ENTRY(pid); + BLANK(); +#undef ENTRY +#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry)) + ENTRY(flags); + ENTRY(addr_limit); + ENTRY(preempt_count); + ENTRY(status); + BLANK(); +#undef ENTRY +#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry)) + ENTRY(kernelstack); + ENTRY(oldrsp); + ENTRY(pcurrent); + ENTRY(irqcount); + ENTRY(cpunumber); + ENTRY(irqstackptr); + ENTRY(data_offset); + BLANK(); +#undef ENTRY +#ifdef CONFIG_IA32_EMULATION +#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) + ENTRY(eax); + ENTRY(ebx); + ENTRY(ecx); + ENTRY(edx); + ENTRY(esi); + ENTRY(edi); + ENTRY(ebp); + ENTRY(esp); + ENTRY(eip); + BLANK(); +#undef ENTRY + DEFINE(IA32_RT_SIGFRAME_sigcontext, + offsetof (struct rt_sigframe32, uc.uc_mcontext)); + BLANK(); +#endif + DEFINE(pbe_address, offsetof(struct pbe, address)); + DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); + DEFINE(pbe_next, offsetof(struct pbe, next)); + BLANK(); + DEFINE(TSS_ist, offsetof(struct tss_struct, ist)); + BLANK(); + DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); + BLANK(); + DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); + return 0; +} diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c new file mode 100644 index 0000000..06d3e5a --- /dev/null +++ b/arch/x86/kernel/audit_64.c @@ -0,0 +1,81 @@ +#include +#include +#include +#include + +static unsigned dir_class[] = { +#include +~0U +}; + +static unsigned read_class[] = { +#include +~0U +}; + +static unsigned write_class[] = { +#include +~0U +}; + +static unsigned chattr_class[] = { +#include +~0U +}; + +static unsigned signal_class[] = { +#include +~0U +}; + +int audit_classify_arch(int arch) +{ +#ifdef CONFIG_IA32_EMULATION + if (arch == AUDIT_ARCH_I386) + return 1; +#endif + return 0; +} + +int audit_classify_syscall(int abi, unsigned syscall) +{ +#ifdef CONFIG_IA32_EMULATION + extern int ia32_classify_syscall(unsigned); + if (abi == AUDIT_ARCH_I386) + return ia32_classify_syscall(syscall); +#endif + switch(syscall) { + case __NR_open: + return 2; + case __NR_openat: + return 3; + case __NR_execve: + return 5; + default: + return 0; + } +} + +static int __init audit_classes_init(void) +{ +#ifdef CONFIG_IA32_EMULATION + extern __u32 ia32_dir_class[]; + extern __u32 ia32_write_class[]; + extern __u32 ia32_read_class[]; + extern __u32 ia32_chattr_class[]; + extern __u32 ia32_signal_class[]; + audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class); + audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class); + audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class); + audit_register_class(AUDIT_CLASS_CHATTR_32, ia32_chattr_class); + audit_register_class(AUDIT_CLASS_SIGNAL_32, ia32_signal_class); +#endif + audit_register_class(AUDIT_CLASS_WRITE, write_class); + audit_register_class(AUDIT_CLASS_READ, read_class); + audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class); + audit_register_class(AUDIT_CLASS_CHATTR, chattr_class); + audit_register_class(AUDIT_CLASS_SIGNAL, signal_class); + return 0; +} + +__initcall(audit_classes_init); diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/bugs_64.c new file mode 100644 index 0000000..4e5e9d3 --- /dev/null +++ b/arch/x86/kernel/bugs_64.c @@ -0,0 +1,24 @@ +/* + * arch/x86_64/kernel/bugs.c + * + * Copyright (C) 1994 Linus Torvalds + * Copyright (C) 2000 SuSE + */ + +#include +#include +#include +#include +#include +#include + +void __init check_bugs(void) +{ + identify_cpu(&boot_cpu_data); + mtrr_bp_init(); +#if !defined(CONFIG_SMP) + printk("CPU: "); + print_cpu_info(&boot_cpu_data); +#endif + alternative_instructions(); +} diff --git a/arch/x86/kernel/crash_64.c b/arch/x86/kernel/crash_64.c new file mode 100644 index 0000000..13432a1 --- /dev/null +++ b/arch/x86/kernel/crash_64.c @@ -0,0 +1,135 @@ +/* + * Architecture specific (x86_64) functions for kexec based crash dumps. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * + * Copyright (C) IBM Corporation, 2004. All rights reserved. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* This keeps a track of which one is crashing cpu. */ +static int crashing_cpu; + +#ifdef CONFIG_SMP +static atomic_t waiting_for_crash_ipi; + +static int crash_nmi_callback(struct notifier_block *self, + unsigned long val, void *data) +{ + struct pt_regs *regs; + int cpu; + + if (val != DIE_NMI_IPI) + return NOTIFY_OK; + + regs = ((struct die_args *)data)->regs; + cpu = raw_smp_processor_id(); + + /* + * Don't do anything if this handler is invoked on crashing cpu. + * Otherwise, system will completely hang. Crashing cpu can get + * an NMI if system was initially booted with nmi_watchdog parameter. + */ + if (cpu == crashing_cpu) + return NOTIFY_STOP; + local_irq_disable(); + + crash_save_cpu(regs, cpu); + disable_local_APIC(); + atomic_dec(&waiting_for_crash_ipi); + /* Assume hlt works */ + for(;;) + halt(); + + return 1; +} + +static void smp_send_nmi_allbutself(void) +{ + send_IPI_allbutself(NMI_VECTOR); +} + +/* + * This code is a best effort heuristic to get the + * other cpus to stop executing. So races with + * cpu hotplug shouldn't matter. + */ + +static struct notifier_block crash_nmi_nb = { + .notifier_call = crash_nmi_callback, +}; + +static void nmi_shootdown_cpus(void) +{ + unsigned long msecs; + + atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); + if (register_die_notifier(&crash_nmi_nb)) + return; /* return what? */ + + /* + * Ensure the new callback function is set before sending + * out the NMI + */ + wmb(); + + smp_send_nmi_allbutself(); + + msecs = 1000; /* Wait at most a second for the other cpus to stop */ + while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { + mdelay(1); + msecs--; + } + /* Leave the nmi callback set */ + disable_local_APIC(); +} +#else +static void nmi_shootdown_cpus(void) +{ + /* There are no cpus to shootdown */ +} +#endif + +void machine_crash_shutdown(struct pt_regs *regs) +{ + /* + * This function is only called after the system + * has panicked or is otherwise in a critical state. + * The minimum amount of code to allow a kexec'd kernel + * to run successfully needs to happen here. + * + * In practice this means shooting down the other cpus in + * an SMP system. + */ + /* The kernel is broken so disable interrupts */ + local_irq_disable(); + + /* Make a note of crashing cpu. Will be used in NMI callback.*/ + crashing_cpu = smp_processor_id(); + nmi_shootdown_cpus(); + + if(cpu_has_apic) + disable_local_APIC(); + + disable_IO_APIC(); + + crash_save_cpu(regs, smp_processor_id()); +} diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c new file mode 100644 index 0000000..942deac --- /dev/null +++ b/arch/x86/kernel/crash_dump_64.c @@ -0,0 +1,47 @@ +/* + * kernel/crash_dump.c - Memory preserving reboot related code. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * Copyright (C) IBM Corporation, 2004. All rights reserved + */ + +#include +#include + +#include +#include + +/** + * copy_oldmem_page - copy one page from "oldmem" + * @pfn: page frame number to be copied + * @buf: target memory address for the copy; this can be in kernel address + * space or user address space (see @userbuf) + * @csize: number of bytes to copy + * @offset: offset in bytes into the page (based on pfn) to begin the copy + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * + * Copy a page from "oldmem". For this page, there is no pte mapped + * in the current kernel. We stitch up a pte, similar to kmap_atomic. + */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, + size_t csize, unsigned long offset, int userbuf) +{ + void *vaddr; + + if (!csize) + return 0; + + vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); + + if (userbuf) { + if (copy_to_user(buf, (vaddr + offset), csize)) { + iounmap(vaddr); + return -EFAULT; + } + } else + memcpy(buf, (vaddr + offset), csize); + + iounmap(vaddr); + return csize; +} diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c new file mode 100644 index 0000000..0f4d5e2 --- /dev/null +++ b/arch/x86/kernel/e820_64.c @@ -0,0 +1,725 @@ +/* + * Handle the memory map. + * The functions here do the job until bootmem takes over. + * + * Getting sanitize_e820_map() in sync with i386 version by applying change: + * - Provisions for empty E820 memory regions (reported by certain BIOSes). + * Alex Achenbach , December 2002. + * Venkatesh Pallipadi + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +struct e820map e820; + +/* + * PFN of last memory page. + */ +unsigned long end_pfn; +EXPORT_SYMBOL(end_pfn); + +/* + * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. + * The direct mapping extends to end_pfn_map, so that we can directly access + * apertures, ACPI and other tables without having to play with fixmaps. + */ +unsigned long end_pfn_map; + +/* + * Last pfn which the user wants to use. + */ +static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; + +extern struct resource code_resource, data_resource; + +/* Check for some hardcoded bad areas that early boot is not allowed to touch */ +static inline int bad_addr(unsigned long *addrp, unsigned long size) +{ + unsigned long addr = *addrp, last = addr + size; + + /* various gunk below that needed for SMP startup */ + if (addr < 0x8000) { + *addrp = PAGE_ALIGN(0x8000); + return 1; + } + + /* direct mapping tables of the kernel */ + if (last >= table_start<= INITRD_START && + addr < INITRD_START+INITRD_SIZE) { + *addrp = PAGE_ALIGN(INITRD_START + INITRD_SIZE); + return 1; + } +#endif + /* kernel code */ + if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) { + *addrp = PAGE_ALIGN(__pa_symbol(&_end)); + return 1; + } + + if (last >= ebda_addr && addr < ebda_addr + ebda_size) { + *addrp = PAGE_ALIGN(ebda_addr + ebda_size); + return 1; + } + +#ifdef CONFIG_NUMA + /* NUMA memory to node map */ + if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) { + *addrp = nodemap_addr + nodemap_size; + return 1; + } +#endif + /* XXX ramdisk image here? */ + return 0; +} + +/* + * This function checks if any part of the range is mapped + * with type. + */ +int +e820_any_mapped(unsigned long start, unsigned long end, unsigned type) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + if (type && ei->type != type) + continue; + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(e820_any_mapped); + +/* + * This function checks if the entire range is mapped with type. + * + * Note: this function only works correct if the e820 table is sorted and + * not-overlapping, which is the case + */ +int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + if (type && ei->type != type) + continue; + /* is the region (part) in overlap with the current region ?*/ + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + + /* if the region is at the beginning of we move + * start to the end of the region since it's ok until there + */ + if (ei->addr <= start) + start = ei->addr + ei->size; + /* if start is now at or beyond end, we're done, full coverage */ + if (start >= end) + return 1; /* we're done */ + } + return 0; +} + +/* + * Find a free area in a specific range. + */ +unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long addr = ei->addr, last; + if (ei->type != E820_RAM) + continue; + if (addr < start) + addr = start; + if (addr > ei->addr + ei->size) + continue; + while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) + ; + last = PAGE_ALIGN(addr) + size; + if (last > ei->addr + ei->size) + continue; + if (last > end) + continue; + return addr; + } + return -1UL; +} + +/* + * Find the highest page frame number we have available + */ +unsigned long __init e820_end_of_ram(void) +{ + unsigned long end_pfn = 0; + end_pfn = find_max_pfn_with_active_regions(); + + if (end_pfn > end_pfn_map) + end_pfn_map = end_pfn; + if (end_pfn_map > MAXMEM>>PAGE_SHIFT) + end_pfn_map = MAXMEM>>PAGE_SHIFT; + if (end_pfn > end_user_pfn) + end_pfn = end_user_pfn; + if (end_pfn > end_pfn_map) + end_pfn = end_pfn_map; + + printk("end_pfn_map = %lu\n", end_pfn_map); + return end_pfn; +} + +/* + * Mark e820 reserved areas as busy for the resource manager. + */ +void __init e820_reserve_resources(void) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + struct resource *res; + res = alloc_bootmem_low(sizeof(struct resource)); + switch (e820.map[i].type) { + case E820_RAM: res->name = "System RAM"; break; + case E820_ACPI: res->name = "ACPI Tables"; break; + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; + default: res->name = "reserved"; + } + res->start = e820.map[i].addr; + res->end = res->start + e820.map[i].size - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + request_resource(&iomem_resource, res); + if (e820.map[i].type == E820_RAM) { + /* + * We don't know which RAM region contains kernel data, + * so we try it repeatedly and let the resource manager + * test it. + */ + request_resource(res, &code_resource); + request_resource(res, &data_resource); +#ifdef CONFIG_KEXEC + request_resource(res, &crashk_res); +#endif + } + } +} + +/* + * Find the ranges of physical addresses that do not correspond to + * e820 RAM areas and mark the corresponding pages as nosave for software + * suspend and suspend to RAM. + * + * This function requires the e820 map to be sorted and without any + * overlapping entries and assumes the first e820 area to be RAM. + */ +void __init e820_mark_nosave_regions(void) +{ + int i; + unsigned long paddr; + + paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE); + for (i = 1; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (paddr < ei->addr) + register_nosave_region(PFN_DOWN(paddr), + PFN_UP(ei->addr)); + + paddr = round_down(ei->addr + ei->size, PAGE_SIZE); + if (ei->type != E820_RAM) + register_nosave_region(PFN_UP(ei->addr), + PFN_DOWN(paddr)); + + if (paddr >= (end_pfn << PAGE_SHIFT)) + break; + } +} + +/* + * Finds an active region in the address range from start_pfn to end_pfn and + * returns its range in ei_startpfn and ei_endpfn for the e820 entry. + */ +static int __init e820_find_active_region(const struct e820entry *ei, + unsigned long start_pfn, + unsigned long end_pfn, + unsigned long *ei_startpfn, + unsigned long *ei_endpfn) +{ + *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT; + *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT; + + /* Skip map entries smaller than a page */ + if (*ei_startpfn >= *ei_endpfn) + return 0; + + /* Check if end_pfn_map should be updated */ + if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map) + end_pfn_map = *ei_endpfn; + + /* Skip if map is outside the node */ + if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || + *ei_startpfn >= end_pfn) + return 0; + + /* Check for overlaps */ + if (*ei_startpfn < start_pfn) + *ei_startpfn = start_pfn; + if (*ei_endpfn > end_pfn) + *ei_endpfn = end_pfn; + + /* Obey end_user_pfn to save on memmap */ + if (*ei_startpfn >= end_user_pfn) + return 0; + if (*ei_endpfn > end_user_pfn) + *ei_endpfn = end_user_pfn; + + return 1; +} + +/* Walk the e820 map and register active regions within a node */ +void __init +e820_register_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long ei_startpfn; + unsigned long ei_endpfn; + int i; + + for (i = 0; i < e820.nr_map; i++) + if (e820_find_active_region(&e820.map[i], + start_pfn, end_pfn, + &ei_startpfn, &ei_endpfn)) + add_active_range(nid, ei_startpfn, ei_endpfn); +} + +/* + * Add a memory region to the kernel e820 map. + */ +void __init add_memory_region(unsigned long start, unsigned long size, int type) +{ + int x = e820.nr_map; + + if (x == E820MAX) { + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); + return; + } + + e820.map[x].addr = start; + e820.map[x].size = size; + e820.map[x].type = type; + e820.nr_map++; +} + +/* + * Find the hole size (in bytes) in the memory range. + * @start: starting address of the memory range to scan + * @end: ending address of the memory range to scan + */ +unsigned long __init e820_hole_size(unsigned long start, unsigned long end) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long end_pfn = end >> PAGE_SHIFT; + unsigned long ei_startpfn; + unsigned long ei_endpfn; + unsigned long ram = 0; + int i; + + for (i = 0; i < e820.nr_map; i++) { + if (e820_find_active_region(&e820.map[i], + start_pfn, end_pfn, + &ei_startpfn, &ei_endpfn)) + ram += ei_endpfn - ei_startpfn; + } + return end - start - (ram << PAGE_SHIFT); +} + +void __init e820_print_map(char *who) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + printk(KERN_INFO " %s: %016Lx - %016Lx ", who, + (unsigned long long) e820.map[i].addr, + (unsigned long long) (e820.map[i].addr + e820.map[i].size)); + switch (e820.map[i].type) { + case E820_RAM: printk("(usable)\n"); + break; + case E820_RESERVED: + printk("(reserved)\n"); + break; + case E820_ACPI: + printk("(ACPI data)\n"); + break; + case E820_NVS: + printk("(ACPI NVS)\n"); + break; + default: printk("type %u\n", e820.map[i].type); + break; + } + } +} + +/* + * Sanitize the BIOS e820 map. + * + * Some e820 responses include overlapping entries. The following + * replaces the original e820 map with a new one, removing overlaps. + * + */ +static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) +{ + struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ + }; + static struct change_member change_point_list[2*E820MAX] __initdata; + static struct change_member *change_point[2*E820MAX] __initdata; + static struct e820entry *overlap_list[E820MAX] __initdata; + static struct e820entry new_bios[E820MAX] __initdata; + struct change_member *change_tmp; + unsigned long current_type, last_type; + unsigned long long last_addr; + int chgidx, still_changing; + int overlap_entries; + int new_bios_entry; + int old_nr, new_nr, chg_nr; + int i; + + /* + Visually we're performing the following (1,2,3,4 = memory types)... + + Sample memory map (w/overlaps): + ____22__________________ + ______________________4_ + ____1111________________ + _44_____________________ + 11111111________________ + ____________________33__ + ___________44___________ + __________33333_________ + ______________22________ + ___________________2222_ + _________111111111______ + _____________________11_ + _________________4______ + + Sanitized equivalent (no overlap): + 1_______________________ + _44_____________________ + ___1____________________ + ____22__________________ + ______11________________ + _________1______________ + __________3_____________ + ___________44___________ + _____________33_________ + _______________2________ + ________________1_______ + _________________4______ + ___________________2____ + ____________________33__ + ______________________4_ + */ + + /* if there's only one memory region, don't bother */ + if (*pnr_map < 2) + return -1; + + old_nr = *pnr_map; + + /* bail out if we find any unreasonable addresses in bios map */ + for (i=0; iaddr = biosmap[i].addr; + change_point[chgidx++]->pbios = &biosmap[i]; + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; + change_point[chgidx++]->pbios = &biosmap[i]; + } + } + chg_nr = chgidx; + + /* sort change-point list by memory addresses (low -> high) */ + still_changing = 1; + while (still_changing) { + still_changing = 0; + for (i=1; i < chg_nr; i++) { + /* if > , swap */ + /* or, if current= & last=, swap */ + if ((change_point[i]->addr < change_point[i-1]->addr) || + ((change_point[i]->addr == change_point[i-1]->addr) && + (change_point[i]->addr == change_point[i]->pbios->addr) && + (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) + ) + { + change_tmp = change_point[i]; + change_point[i] = change_point[i-1]; + change_point[i-1] = change_tmp; + still_changing=1; + } + } + } + + /* create a new bios memory map, removing overlaps */ + overlap_entries=0; /* number of entries in the overlap table */ + new_bios_entry=0; /* index for creating new bios map entries */ + last_type = 0; /* start with undefined memory type */ + last_addr = 0; /* start with 0 as last starting address */ + /* loop through change-points, determining affect on the new bios map */ + for (chgidx=0; chgidx < chg_nr; chgidx++) + { + /* keep track of all overlapping bios entries */ + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) + { + /* add map entry to overlap list (> 1 entry implies an overlap) */ + overlap_list[overlap_entries++]=change_point[chgidx]->pbios; + } + else + { + /* remove entry from list (order independent, so swap with last) */ + for (i=0; ipbios) + overlap_list[i] = overlap_list[overlap_entries-1]; + } + overlap_entries--; + } + /* if there are overlapping entries, decide which "type" to use */ + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ + current_type = 0; + for (i=0; itype > current_type) + current_type = overlap_list[i]->type; + /* continue building up new bios map based on this information */ + if (current_type != last_type) { + if (last_type != 0) { + new_bios[new_bios_entry].size = + change_point[chgidx]->addr - last_addr; + /* move forward only if the new size was non-zero */ + if (new_bios[new_bios_entry].size != 0) + if (++new_bios_entry >= E820MAX) + break; /* no more space left for new bios entries */ + } + if (current_type != 0) { + new_bios[new_bios_entry].addr = change_point[chgidx]->addr; + new_bios[new_bios_entry].type = current_type; + last_addr=change_point[chgidx]->addr; + } + last_type = current_type; + } + } + new_nr = new_bios_entry; /* retain count for new bios entries */ + + /* copy new bios mapping into original location */ + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); + *pnr_map = new_nr; + + return 0; +} + +/* + * Copy the BIOS e820 map into a safe place. + * + * Sanity-check it while we're at it.. + * + * If we're lucky and live on a modern system, the setup code + * will have given us a memory map that we can use to properly + * set up memory. If we aren't, we'll fake a memory map. + */ +static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) +{ + /* Only one memory region (or negative)? Ignore it */ + if (nr_map < 2) + return -1; + + do { + unsigned long start = biosmap->addr; + unsigned long size = biosmap->size; + unsigned long end = start + size; + unsigned long type = biosmap->type; + + /* Overflow in 64 bits? Ignore the memory map. */ + if (start > end) + return -1; + + add_memory_region(start, size, type); + } while (biosmap++,--nr_map); + return 0; +} + +void early_panic(char *msg) +{ + early_printk(msg); + panic(msg); +} + +void __init setup_memory_region(void) +{ + /* + * Try to copy the BIOS-supplied E820-map. + * + * Otherwise fake a memory map; one section from 0k->640k, + * the next section from 1mb->appropriate_mem_k + */ + sanitize_e820_map(E820_MAP, &E820_MAP_NR); + if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) + early_panic("Cannot find a valid memory map"); + printk(KERN_INFO "BIOS-provided physical RAM map:\n"); + e820_print_map("BIOS-e820"); +} + +static int __init parse_memopt(char *p) +{ + if (!p) + return -EINVAL; + end_user_pfn = memparse(p, &p); + end_user_pfn >>= PAGE_SHIFT; + return 0; +} +early_param("mem", parse_memopt); + +static int userdef __initdata; + +static int __init parse_memmap_opt(char *p) +{ + char *oldp; + unsigned long long start_at, mem_size; + + if (!strcmp(p, "exactmap")) { +#ifdef CONFIG_CRASH_DUMP + /* If we are doing a crash dump, we + * still need to know the real mem + * size before original memory map is + * reset. + */ + e820_register_active_regions(0, 0, -1UL); + saved_max_pfn = e820_end_of_ram(); + remove_all_active_ranges(); +#endif + end_pfn_map = 0; + e820.nr_map = 0; + userdef = 1; + return 0; + } + + oldp = p; + mem_size = memparse(p, &p); + if (p == oldp) + return -EINVAL; + if (*p == '@') { + start_at = memparse(p+1, &p); + add_memory_region(start_at, mem_size, E820_RAM); + } else if (*p == '#') { + start_at = memparse(p+1, &p); + add_memory_region(start_at, mem_size, E820_ACPI); + } else if (*p == '$') { + start_at = memparse(p+1, &p); + add_memory_region(start_at, mem_size, E820_RESERVED); + } else { + end_user_pfn = (mem_size >> PAGE_SHIFT); + } + return *p == '\0' ? 0 : -EINVAL; +} +early_param("memmap", parse_memmap_opt); + +void __init finish_e820_parsing(void) +{ + if (userdef) { + printk(KERN_INFO "user-defined physical RAM map:\n"); + e820_print_map("user"); + } +} + +unsigned long pci_mem_start = 0xaeedbabe; +EXPORT_SYMBOL(pci_mem_start); + +/* + * Search for the biggest gap in the low 32 bits of the e820 + * memory space. We pass this space to PCI to assign MMIO resources + * for hotplug or unconfigured devices in. + * Hopefully the BIOS let enough space left. + */ +__init void e820_setup_gap(void) +{ + unsigned long gapstart, gapsize, round; + unsigned long last; + int i; + int found = 0; + + last = 0x100000000ull; + gapstart = 0x10000000; + gapsize = 0x400000; + i = e820.nr_map; + while (--i >= 0) { + unsigned long long start = e820.map[i].addr; + unsigned long long end = start + e820.map[i].size; + + /* + * Since "last" is at most 4GB, we know we'll + * fit in 32 bits if this condition is true + */ + if (last > end) { + unsigned long gap = last - end; + + if (gap > gapsize) { + gapsize = gap; + gapstart = end; + found = 1; + } + } + if (start < last) + last = start; + } + + if (!found) { + gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n" + KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n"); + } + + /* + * See how much we want to round up: start off with + * rounding to the next 1MB area. + */ + round = 0x100000; + while ((gapsize >> 4) > round) + round += round; + /* Fun with two's complement */ + pci_mem_start = (gapstart + round) & -round; + + printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", + pci_mem_start, gapstart, gapsize); +} diff --git a/arch/x86/kernel/early-quirks_64.c b/arch/x86/kernel/early-quirks_64.c new file mode 100644 index 0000000..13aa4fd --- /dev/null +++ b/arch/x86/kernel/early-quirks_64.c @@ -0,0 +1,127 @@ +/* Various workarounds for chipset bugs. + This code runs very early and can't use the regular PCI subsystem + The entries are keyed to PCI bridges which usually identify chipsets + uniquely. + This is only for whole classes of chipsets with specific problems which + need early invasive action (e.g. before the timers are initialized). + Most PCI device specific workarounds can be done later and should be + in standard PCI quirks + Mainboard specific bugs should be handled by DMI entries. + CPU specific bugs in setup.c */ + +#include +#include +#include +#include +#include +#include +#include + +static void __init via_bugs(void) +{ +#ifdef CONFIG_IOMMU + if ((end_pfn > MAX_DMA32_PFN || force_iommu) && + !iommu_aperture_allowed) { + printk(KERN_INFO + "Looks like a VIA chipset. Disabling IOMMU. Override with iommu=allowed\n"); + iommu_aperture_disabled = 1; + } +#endif +} + +#ifdef CONFIG_ACPI + +static int __init nvidia_hpet_check(struct acpi_table_header *header) +{ + return 0; +} +#endif + +static void __init nvidia_bugs(void) +{ +#ifdef CONFIG_ACPI + /* + * All timer overrides on Nvidia are + * wrong unless HPET is enabled. + * Unfortunately that's not true on many Asus boards. + * We don't know yet how to detect this automatically, but + * at least allow a command line override. + */ + if (acpi_use_timer_override) + return; + + if (acpi_table_parse(ACPI_SIG_HPET, nvidia_hpet_check)) { + acpi_skip_timer_override = 1; + printk(KERN_INFO "Nvidia board " + "detected. Ignoring ACPI " + "timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); + } +#endif + /* RED-PEN skip them on mptables too? */ + +} + +static void __init ati_bugs(void) +{ + if (timer_over_8254 == 1) { + timer_over_8254 = 0; + printk(KERN_INFO + "ATI board detected. Disabling timer routing over 8254.\n"); + } +} + +struct chipset { + u16 vendor; + void (*f)(void); +}; + +static struct chipset early_qrk[] __initdata = { + { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, + { PCI_VENDOR_ID_VIA, via_bugs }, + { PCI_VENDOR_ID_ATI, ati_bugs }, + {} +}; + +void __init early_quirks(void) +{ + int num, slot, func; + + if (!early_pci_allowed()) + return; + + /* Poor man's PCI discovery */ + for (num = 0; num < 32; num++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + u32 class; + u32 vendor; + u8 type; + int i; + class = read_pci_config(num,slot,func, + PCI_CLASS_REVISION); + if (class == 0xffffffff) + break; + + if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) + continue; + + vendor = read_pci_config(num, slot, func, + PCI_VENDOR_ID); + vendor &= 0xffff; + + for (i = 0; early_qrk[i].f; i++) + if (early_qrk[i].vendor == vendor) { + early_qrk[i].f(); + return; + } + + type = read_pci_config_byte(num, slot, func, + PCI_HEADER_TYPE); + if (!(type & 0x80)) + break; + } + } + } +} diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 92f812b..fd9aff3 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -1,2 +1,259 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include -#include "../../x86_64/kernel/early_printk.c" +/* Simple VGA output */ + +#ifdef __i386__ +#include +#else +#include +#endif +#define VGABASE (__ISA_IO_base + 0xb8000) + +static int max_ypos = 25, max_xpos = 80; +static int current_ypos = 25, current_xpos = 0; + +static void early_vga_write(struct console *con, const char *str, unsigned n) +{ + char c; + int i, k, j; + + while ((c = *str++) != '\0' && n-- > 0) { + if (current_ypos >= max_ypos) { + /* scroll 1 line up */ + for (k = 1, j = 0; k < max_ypos; k++, j++) { + for (i = 0; i < max_xpos; i++) { + writew(readw(VGABASE+2*(max_xpos*k+i)), + VGABASE + 2*(max_xpos*j + i)); + } + } + for (i = 0; i < max_xpos; i++) + writew(0x720, VGABASE + 2*(max_xpos*j + i)); + current_ypos = max_ypos-1; + } + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(max_xpos*current_ypos + + current_xpos++)); + if (current_xpos >= max_xpos) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +static struct console early_vga_console = { + .name = "earlyvga", + .write = early_vga_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ + +static int early_serial_base = 0x3f8; /* ttyS0 */ + +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + +static int early_serial_putc(unsigned char ch) +{ + unsigned timeout = 0xffff; + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + cpu_relax(); + outb(ch, early_serial_base + TXR); + return timeout ? 0 : -1; +} + +static void early_serial_write(struct console *con, const char *s, unsigned n) +{ + while (*s && n-- > 0) { + if (*s == '\n') + early_serial_putc('\r'); + early_serial_putc(*s); + s++; + } +} + +#define DEFAULT_BAUD 9600 + +static __init void early_serial_init(char *s) +{ + unsigned char c; + unsigned divisor; + unsigned baud = DEFAULT_BAUD; + char *e; + + if (*s == ',') + ++s; + + if (*s) { + unsigned port; + if (!strncmp(s,"0x",2)) { + early_serial_base = simple_strtoul(s, &e, 16); + } else { + static int bases[] = { 0x3f8, 0x2f8 }; + + if (!strncmp(s,"ttyS",4)) + s += 4; + port = simple_strtoul(s, &e, 10); + if (port > 1 || s == e) + port = 0; + early_serial_base = bases[port]; + } + s += strcspn(s, ","); + if (*s == ',') + s++; + } + + outb(0x3, early_serial_base + LCR); /* 8n1 */ + outb(0, early_serial_base + IER); /* no interrupt */ + outb(0, early_serial_base + FCR); /* no fifo */ + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + + if (*s) { + baud = simple_strtoul(s, &e, 0); + if (baud == 0 || s == e) + baud = DEFAULT_BAUD; + } + + divisor = 115200 / baud; + c = inb(early_serial_base + LCR); + outb(c | DLAB, early_serial_base + LCR); + outb(divisor & 0xff, early_serial_base + DLL); + outb((divisor >> 8) & 0xff, early_serial_base + DLH); + outb(c & ~DLAB, early_serial_base + LCR); +} + +static struct console early_serial_console = { + .name = "earlyser", + .write = early_serial_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Console interface to a host file on AMD's SimNow! */ + +static int simnow_fd; + +enum { + MAGIC1 = 0xBACCD00A, + MAGIC2 = 0xCA110000, + XOPEN = 5, + XWRITE = 4, +}; + +static noinline long simnow(long cmd, long a, long b, long c) +{ + long ret; + asm volatile("cpuid" : + "=a" (ret) : + "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); + return ret; +} + +static void __init simnow_init(char *str) +{ + char *fn = "klog"; + if (*str == '=') + fn = ++str; + /* error ignored */ + simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644); +} + +static void simnow_write(struct console *con, const char *s, unsigned n) +{ + simnow(XWRITE, simnow_fd, (unsigned long)s, n); +} + +static struct console simnow_console = { + .name = "simnow", + .write = simnow_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Direct interface for emergencies */ +struct console *early_console = &early_vga_console; +static int early_console_initialized = 0; + +void early_printk(const char *fmt, ...) +{ + char buf[512]; + int n; + va_list ap; + + va_start(ap,fmt); + n = vscnprintf(buf,512,fmt,ap); + early_console->write(early_console,buf,n); + va_end(ap); +} + +static int __initdata keep_early; + +static int __init setup_early_printk(char *buf) +{ + if (!buf) + return 0; + + if (early_console_initialized) + return 0; + early_console_initialized = 1; + + if (strstr(buf, "keep")) + keep_early = 1; + + if (!strncmp(buf, "serial", 6)) { + early_serial_init(buf + 6); + early_console = &early_serial_console; + } else if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf); + early_console = &early_serial_console; + } else if (!strncmp(buf, "vga", 3) + && SCREEN_INFO.orig_video_isVGA == 1) { + max_xpos = SCREEN_INFO.orig_video_cols; + max_ypos = SCREEN_INFO.orig_video_lines; + current_ypos = SCREEN_INFO.orig_y; + early_console = &early_vga_console; + } else if (!strncmp(buf, "simnow", 6)) { + simnow_init(buf + 6); + early_console = &simnow_console; + keep_early = 1; +#ifdef CONFIG_HVC_XEN + } else if (!strncmp(buf, "xen", 3)) { + early_console = &xenboot_console; +#endif + } + + if (keep_early) + early_console->flags &= ~CON_BOOT; + else + early_console->flags |= CON_BOOT; + register_console(early_console); + return 0; +} +early_param("earlyprintk", setup_early_printk); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S new file mode 100644 index 0000000..1d232e5 --- /dev/null +++ b/arch/x86/kernel/entry_64.S @@ -0,0 +1,1172 @@ +/* + * linux/arch/x86_64/entry.S + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs + * Copyright (C) 2000 Pavel Machek + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * + * NOTE: This code handles signal-recognition, which happens every time + * after an interrupt and after each system call. + * + * Normal syscalls and interrupts don't save a full stack frame, this is + * only done for syscall tracing, signals or fork/exec et.al. + * + * A note on terminology: + * - top of stack: Architecture defined interrupt frame from SS to RIP + * at the top of the kernel process stack. + * - partial stack frame: partially saved registers upto R11. + * - full stack frame: Like partial stack frame, but all register saved. + * + * Some macro usage: + * - CFI macros are used to generate dwarf2 unwind information for better + * backtraces. They don't change any code. + * - SAVE_ALL/RESTORE_ALL - Save/restore all registers + * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify. + * There are unfortunately lots of special cases where some registers + * not touched. The macro is a big mess that should be cleaned up. + * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS. + * Gives a full stack frame. + * - ENTRY/END Define functions in the symbol table. + * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack + * frame that is otherwise undefined after a SYSCALL + * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. + * - errorentry/paranoidentry/zeroentry - Define exception entry points. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + .code64 + +#ifndef CONFIG_PREEMPT +#define retint_kernel retint_restore_args +#endif + + +.macro TRACE_IRQS_IRETQ offset=ARGOFFSET +#ifdef CONFIG_TRACE_IRQFLAGS + bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ + jnc 1f + TRACE_IRQS_ON +1: +#endif +.endm + +/* + * C code is not supposed to know about undefined top of stack. Every time + * a C function with an pt_regs argument is called from the SYSCALL based + * fast path FIXUP_TOP_OF_STACK is needed. + * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs + * manipulation. + */ + + /* %rsp:at FRAMEEND */ + .macro FIXUP_TOP_OF_STACK tmp + movq %gs:pda_oldrsp,\tmp + movq \tmp,RSP(%rsp) + movq $__USER_DS,SS(%rsp) + movq $__USER_CS,CS(%rsp) + movq $-1,RCX(%rsp) + movq R11(%rsp),\tmp /* get eflags */ + movq \tmp,EFLAGS(%rsp) + .endm + + .macro RESTORE_TOP_OF_STACK tmp,offset=0 + movq RSP-\offset(%rsp),\tmp + movq \tmp,%gs:pda_oldrsp + movq EFLAGS-\offset(%rsp),\tmp + movq \tmp,R11-\offset(%rsp) + .endm + + .macro FAKE_STACK_FRAME child_rip + /* push in order ss, rsp, eflags, cs, rip */ + xorl %eax, %eax + pushq %rax /* ss */ + CFI_ADJUST_CFA_OFFSET 8 + /*CFI_REL_OFFSET ss,0*/ + pushq %rax /* rsp */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rsp,0 + pushq $(1<<9) /* eflags - interrupts on */ + CFI_ADJUST_CFA_OFFSET 8 + /*CFI_REL_OFFSET rflags,0*/ + pushq $__KERNEL_CS /* cs */ + CFI_ADJUST_CFA_OFFSET 8 + /*CFI_REL_OFFSET cs,0*/ + pushq \child_rip /* rip */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rip,0 + pushq %rax /* orig rax */ + CFI_ADJUST_CFA_OFFSET 8 + .endm + + .macro UNFAKE_STACK_FRAME + addq $8*6, %rsp + CFI_ADJUST_CFA_OFFSET -(6*8) + .endm + + .macro CFI_DEFAULT_STACK start=1 + .if \start + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,SS+8 + .else + CFI_DEF_CFA_OFFSET SS+8 + .endif + CFI_REL_OFFSET r15,R15 + CFI_REL_OFFSET r14,R14 + CFI_REL_OFFSET r13,R13 + CFI_REL_OFFSET r12,R12 + CFI_REL_OFFSET rbp,RBP + CFI_REL_OFFSET rbx,RBX + CFI_REL_OFFSET r11,R11 + CFI_REL_OFFSET r10,R10 + CFI_REL_OFFSET r9,R9 + CFI_REL_OFFSET r8,R8 + CFI_REL_OFFSET rax,RAX + CFI_REL_OFFSET rcx,RCX + CFI_REL_OFFSET rdx,RDX + CFI_REL_OFFSET rsi,RSI + CFI_REL_OFFSET rdi,RDI + CFI_REL_OFFSET rip,RIP + /*CFI_REL_OFFSET cs,CS*/ + /*CFI_REL_OFFSET rflags,EFLAGS*/ + CFI_REL_OFFSET rsp,RSP + /*CFI_REL_OFFSET ss,SS*/ + .endm +/* + * A newly forked process directly context switches into this. + */ +/* rdi: prev */ +ENTRY(ret_from_fork) + CFI_DEFAULT_STACK + push kernel_eflags(%rip) + CFI_ADJUST_CFA_OFFSET 4 + popf # reset kernel eflags + CFI_ADJUST_CFA_OFFSET -4 + call schedule_tail + GET_THREAD_INFO(%rcx) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) + jnz rff_trace +rff_action: + RESTORE_REST + testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? + je int_ret_from_sys_call + testl $_TIF_IA32,threadinfo_flags(%rcx) + jnz int_ret_from_sys_call + RESTORE_TOP_OF_STACK %rdi,ARGOFFSET + jmp ret_from_sys_call +rff_trace: + movq %rsp,%rdi + call syscall_trace_leave + GET_THREAD_INFO(%rcx) + jmp rff_action + CFI_ENDPROC +END(ret_from_fork) + +/* + * System call entry. Upto 6 arguments in registers are supported. + * + * SYSCALL does not save anything on the stack and does not change the + * stack pointer. + */ + +/* + * Register setup: + * rax system call number + * rdi arg0 + * rcx return address for syscall/sysret, C arg3 + * rsi arg1 + * rdx arg2 + * r10 arg3 (--> moved to rcx for C) + * r8 arg4 + * r9 arg5 + * r11 eflags for syscall/sysret, temporary for C + * r12-r15,rbp,rbx saved by C code, not touched. + * + * Interrupts are off on entry. + * Only called from user space. + * + * XXX if we had a free scratch register we could save the RSP into the stack frame + * and report it properly in ps. Unfortunately we haven't. + * + * When user can change the frames always force IRET. That is because + * it deals with uncanonical addresses better. SYSRET has trouble + * with them due to bugs in both AMD and Intel CPUs. + */ + +ENTRY(system_call) + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,PDA_STACKOFFSET + CFI_REGISTER rip,rcx + /*CFI_REGISTER rflags,r11*/ + swapgs + movq %rsp,%gs:pda_oldrsp + movq %gs:pda_kernelstack,%rsp + /* + * No need to follow this irqs off/on section - it's straight + * and short: + */ + sti + SAVE_ARGS 8,1 + movq %rax,ORIG_RAX-ARGOFFSET(%rsp) + movq %rcx,RIP-ARGOFFSET(%rsp) + CFI_REL_OFFSET rip,RIP-ARGOFFSET + GET_THREAD_INFO(%rcx) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) + jnz tracesys + cmpq $__NR_syscall_max,%rax + ja badsys + movq %r10,%rcx + call *sys_call_table(,%rax,8) # XXX: rip relative + movq %rax,RAX-ARGOFFSET(%rsp) +/* + * Syscall return path ending with SYSRET (fast path) + * Has incomplete stack frame and undefined top of stack. + */ +ret_from_sys_call: + movl $_TIF_ALLWORK_MASK,%edi + /* edi: flagmask */ +sysret_check: + GET_THREAD_INFO(%rcx) + cli + TRACE_IRQS_OFF + movl threadinfo_flags(%rcx),%edx + andl %edi,%edx + jnz sysret_careful + CFI_REMEMBER_STATE + /* + * sysretq will re-enable interrupts: + */ + TRACE_IRQS_ON + movq RIP-ARGOFFSET(%rsp),%rcx + CFI_REGISTER rip,rcx + RESTORE_ARGS 0,-ARG_SKIP,1 + /*CFI_REGISTER rflags,r11*/ + movq %gs:pda_oldrsp,%rsp + swapgs + sysretq + + CFI_RESTORE_STATE + /* Handle reschedules */ + /* edx: work, edi: workmask */ +sysret_careful: + bt $TIF_NEED_RESCHED,%edx + jnc sysret_signal + TRACE_IRQS_ON + sti + pushq %rdi + CFI_ADJUST_CFA_OFFSET 8 + call schedule + popq %rdi + CFI_ADJUST_CFA_OFFSET -8 + jmp sysret_check + + /* Handle a signal */ +sysret_signal: + TRACE_IRQS_ON + sti + testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx + jz 1f + + /* Really a signal */ + /* edx: work flags (arg3) */ + leaq do_notify_resume(%rip),%rax + leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 + xorl %esi,%esi # oldset -> arg2 + call ptregscall_common +1: movl $_TIF_NEED_RESCHED,%edi + /* Use IRET because user could have changed frame. This + works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ + cli + TRACE_IRQS_OFF + jmp int_with_check + +badsys: + movq $-ENOSYS,RAX-ARGOFFSET(%rsp) + jmp ret_from_sys_call + + /* Do syscall tracing */ +tracesys: + SAVE_REST + movq $-ENOSYS,RAX(%rsp) + FIXUP_TOP_OF_STACK %rdi + movq %rsp,%rdi + call syscall_trace_enter + LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ + RESTORE_REST + cmpq $__NR_syscall_max,%rax + movq $-ENOSYS,%rcx + cmova %rcx,%rax + ja 1f + movq %r10,%rcx /* fixup for C */ + call *sys_call_table(,%rax,8) +1: movq %rax,RAX-ARGOFFSET(%rsp) + /* Use IRET because user could have changed frame */ + +/* + * Syscall return path ending with IRET. + * Has correct top of stack, but partial stack frame. + */ + .globl int_ret_from_sys_call +int_ret_from_sys_call: + cli + TRACE_IRQS_OFF + testl $3,CS-ARGOFFSET(%rsp) + je retint_restore_args + movl $_TIF_ALLWORK_MASK,%edi + /* edi: mask to check */ +int_with_check: + GET_THREAD_INFO(%rcx) + movl threadinfo_flags(%rcx),%edx + andl %edi,%edx + jnz int_careful + andl $~TS_COMPAT,threadinfo_status(%rcx) + jmp retint_swapgs + + /* Either reschedule or signal or syscall exit tracking needed. */ + /* First do a reschedule test. */ + /* edx: work, edi: workmask */ +int_careful: + bt $TIF_NEED_RESCHED,%edx + jnc int_very_careful + TRACE_IRQS_ON + sti + pushq %rdi + CFI_ADJUST_CFA_OFFSET 8 + call schedule + popq %rdi + CFI_ADJUST_CFA_OFFSET -8 + cli + TRACE_IRQS_OFF + jmp int_with_check + + /* handle signals and tracing -- both require a full stack frame */ +int_very_careful: + TRACE_IRQS_ON + sti + SAVE_REST + /* Check for syscall exit trace */ + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx + jz int_signal + pushq %rdi + CFI_ADJUST_CFA_OFFSET 8 + leaq 8(%rsp),%rdi # &ptregs -> arg1 + call syscall_trace_leave + popq %rdi + CFI_ADJUST_CFA_OFFSET -8 + andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi + jmp int_restore_rest + +int_signal: + testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx + jz 1f + movq %rsp,%rdi # &ptregs -> arg1 + xorl %esi,%esi # oldset -> arg2 + call do_notify_resume +1: movl $_TIF_NEED_RESCHED,%edi +int_restore_rest: + RESTORE_REST + cli + TRACE_IRQS_OFF + jmp int_with_check + CFI_ENDPROC +END(system_call) + +/* + * Certain special system calls that need to save a complete full stack frame. + */ + + .macro PTREGSCALL label,func,arg + .globl \label +\label: + leaq \func(%rip),%rax + leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ + jmp ptregscall_common +END(\label) + .endm + + CFI_STARTPROC + + PTREGSCALL stub_clone, sys_clone, %r8 + PTREGSCALL stub_fork, sys_fork, %rdi + PTREGSCALL stub_vfork, sys_vfork, %rdi + PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx + PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx + PTREGSCALL stub_iopl, sys_iopl, %rsi + +ENTRY(ptregscall_common) + popq %r11 + CFI_ADJUST_CFA_OFFSET -8 + CFI_REGISTER rip, r11 + SAVE_REST + movq %r11, %r15 + CFI_REGISTER rip, r15 + FIXUP_TOP_OF_STACK %r11 + call *%rax + RESTORE_TOP_OF_STACK %r11 + movq %r15, %r11 + CFI_REGISTER rip, r11 + RESTORE_REST + pushq %r11 + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rip, 0 + ret + CFI_ENDPROC +END(ptregscall_common) + +ENTRY(stub_execve) + CFI_STARTPROC + popq %r11 + CFI_ADJUST_CFA_OFFSET -8 + CFI_REGISTER rip, r11 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call sys_execve + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_execve) + +/* + * sigreturn is special because it needs to restore all registers on return. + * This cannot be done with SYSRET, so use the IRET return path instead. + */ +ENTRY(stub_rt_sigreturn) + CFI_STARTPROC + addq $8, %rsp + CFI_ADJUST_CFA_OFFSET -8 + SAVE_REST + movq %rsp,%rdi + FIXUP_TOP_OF_STACK %r11 + call sys_rt_sigreturn + movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_rt_sigreturn) + +/* + * initial frame state for interrupts and exceptions + */ + .macro _frame ref + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,SS+8-\ref + /*CFI_REL_OFFSET ss,SS-\ref*/ + CFI_REL_OFFSET rsp,RSP-\ref + /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/ + /*CFI_REL_OFFSET cs,CS-\ref*/ + CFI_REL_OFFSET rip,RIP-\ref + .endm + +/* initial frame state for interrupts (and exceptions without error code) */ +#define INTR_FRAME _frame RIP +/* initial frame state for exceptions with error code (and interrupts with + vector already pushed) */ +#define XCPT_FRAME _frame ORIG_RAX + +/* + * Interrupt entry/exit. + * + * Interrupt entry points save only callee clobbered registers in fast path. + * + * Entry runs with interrupts off. + */ + +/* 0(%rsp): interrupt number */ + .macro interrupt func + cld + SAVE_ARGS + leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler + pushq %rbp + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbp, 0 + movq %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp + testl $3,CS(%rdi) + je 1f + swapgs + /* irqcount is used to check if a CPU is already on an interrupt + stack or not. While this is essentially redundant with preempt_count + it is a little cheaper to use a separate counter in the PDA + (short of moving irq_enter into assembly, which would be too + much work) */ +1: incl %gs:pda_irqcount + cmoveq %gs:pda_irqstackptr,%rsp + push %rbp # backlink for old unwinder + /* + * We entered an interrupt context - irqs are off: + */ + TRACE_IRQS_OFF + call \func + .endm + +ENTRY(common_interrupt) + XCPT_FRAME + interrupt do_IRQ + /* 0(%rsp): oldrsp-ARGOFFSET */ +ret_from_intr: + cli + TRACE_IRQS_OFF + decl %gs:pda_irqcount + leaveq + CFI_DEF_CFA_REGISTER rsp + CFI_ADJUST_CFA_OFFSET -8 +exit_intr: + GET_THREAD_INFO(%rcx) + testl $3,CS-ARGOFFSET(%rsp) + je retint_kernel + + /* Interrupt came from user space */ + /* + * Has a correct top of stack, but a partial stack frame + * %rcx: thread info. Interrupts off. + */ +retint_with_reschedule: + movl $_TIF_WORK_MASK,%edi +retint_check: + movl threadinfo_flags(%rcx),%edx + andl %edi,%edx + CFI_REMEMBER_STATE + jnz retint_careful +retint_swapgs: + /* + * The iretq could re-enable interrupts: + */ + cli + TRACE_IRQS_IRETQ + swapgs + jmp restore_args + +retint_restore_args: + cli + /* + * The iretq could re-enable interrupts: + */ + TRACE_IRQS_IRETQ +restore_args: + RESTORE_ARGS 0,8,0 +iret_label: + iretq + + .section __ex_table,"a" + .quad iret_label,bad_iret + .previous + .section .fixup,"ax" + /* force a signal here? this matches i386 behaviour */ + /* running with kernel gs */ +bad_iret: + movq $11,%rdi /* SIGSEGV */ + TRACE_IRQS_ON + sti + jmp do_exit + .previous + + /* edi: workmask, edx: work */ +retint_careful: + CFI_RESTORE_STATE + bt $TIF_NEED_RESCHED,%edx + jnc retint_signal + TRACE_IRQS_ON + sti + pushq %rdi + CFI_ADJUST_CFA_OFFSET 8 + call schedule + popq %rdi + CFI_ADJUST_CFA_OFFSET -8 + GET_THREAD_INFO(%rcx) + cli + TRACE_IRQS_OFF + jmp retint_check + +retint_signal: + testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx + jz retint_swapgs + TRACE_IRQS_ON + sti + SAVE_REST + movq $-1,ORIG_RAX(%rsp) + xorl %esi,%esi # oldset + movq %rsp,%rdi # &pt_regs + call do_notify_resume + RESTORE_REST + cli + TRACE_IRQS_OFF + movl $_TIF_NEED_RESCHED,%edi + GET_THREAD_INFO(%rcx) + jmp retint_check + +#ifdef CONFIG_PREEMPT + /* Returning to kernel space. Check if we need preemption */ + /* rcx: threadinfo. interrupts off. */ +ENTRY(retint_kernel) + cmpl $0,threadinfo_preempt_count(%rcx) + jnz retint_restore_args + bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) + jnc retint_restore_args + bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ + jnc retint_restore_args + call preempt_schedule_irq + jmp exit_intr +#endif + + CFI_ENDPROC +END(common_interrupt) + +/* + * APIC interrupts. + */ + .macro apicinterrupt num,func + INTR_FRAME + pushq $~(\num) + CFI_ADJUST_CFA_OFFSET 8 + interrupt \func + jmp ret_from_intr + CFI_ENDPROC + .endm + +ENTRY(thermal_interrupt) + apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt +END(thermal_interrupt) + +ENTRY(threshold_interrupt) + apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt +END(threshold_interrupt) + +#ifdef CONFIG_SMP +ENTRY(reschedule_interrupt) + apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt +END(reschedule_interrupt) + + .macro INVALIDATE_ENTRY num +ENTRY(invalidate_interrupt\num) + apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt +END(invalidate_interrupt\num) + .endm + + INVALIDATE_ENTRY 0 + INVALIDATE_ENTRY 1 + INVALIDATE_ENTRY 2 + INVALIDATE_ENTRY 3 + INVALIDATE_ENTRY 4 + INVALIDATE_ENTRY 5 + INVALIDATE_ENTRY 6 + INVALIDATE_ENTRY 7 + +ENTRY(call_function_interrupt) + apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt +END(call_function_interrupt) +ENTRY(irq_move_cleanup_interrupt) + apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt +END(irq_move_cleanup_interrupt) +#endif + +ENTRY(apic_timer_interrupt) + apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt +END(apic_timer_interrupt) + +ENTRY(error_interrupt) + apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt +END(error_interrupt) + +ENTRY(spurious_interrupt) + apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt +END(spurious_interrupt) + +/* + * Exception entry points. + */ + .macro zeroentry sym + INTR_FRAME + pushq $0 /* push error code/oldrax */ + CFI_ADJUST_CFA_OFFSET 8 + pushq %rax /* push real oldrax to the rdi slot */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rax,0 + leaq \sym(%rip),%rax + jmp error_entry + CFI_ENDPROC + .endm + + .macro errorentry sym + XCPT_FRAME + pushq %rax + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rax,0 + leaq \sym(%rip),%rax + jmp error_entry + CFI_ENDPROC + .endm + + /* error code is on the stack already */ + /* handle NMI like exceptions that can happen everywhere */ + .macro paranoidentry sym, ist=0, irqtrace=1 + SAVE_ALL + cld + movl $1,%ebx + movl $MSR_GS_BASE,%ecx + rdmsr + testl %edx,%edx + js 1f + swapgs + xorl %ebx,%ebx +1: + .if \ist + movq %gs:pda_data_offset, %rbp + .endif + movq %rsp,%rdi + movq ORIG_RAX(%rsp),%rsi + movq $-1,ORIG_RAX(%rsp) + .if \ist + subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) + .endif + call \sym + .if \ist + addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) + .endif + cli + .if \irqtrace + TRACE_IRQS_OFF + .endif + .endm + + /* + * "Paranoid" exit path from exception stack. + * Paranoid because this is used by NMIs and cannot take + * any kernel state for granted. + * We don't do kernel preemption checks here, because only + * NMI should be common and it does not enable IRQs and + * cannot get reschedule ticks. + * + * "trace" is 0 for the NMI handler only, because irq-tracing + * is fundamentally NMI-unsafe. (we cannot change the soft and + * hard flags at once, atomically) + */ + .macro paranoidexit trace=1 + /* ebx: no swapgs flag */ +paranoid_exit\trace: + testl %ebx,%ebx /* swapgs needed? */ + jnz paranoid_restore\trace + testl $3,CS(%rsp) + jnz paranoid_userspace\trace +paranoid_swapgs\trace: + .if \trace + TRACE_IRQS_IRETQ 0 + .endif + swapgs +paranoid_restore\trace: + RESTORE_ALL 8 + iretq +paranoid_userspace\trace: + GET_THREAD_INFO(%rcx) + movl threadinfo_flags(%rcx),%ebx + andl $_TIF_WORK_MASK,%ebx + jz paranoid_swapgs\trace + movq %rsp,%rdi /* &pt_regs */ + call sync_regs + movq %rax,%rsp /* switch stack for scheduling */ + testl $_TIF_NEED_RESCHED,%ebx + jnz paranoid_schedule\trace + movl %ebx,%edx /* arg3: thread flags */ + .if \trace + TRACE_IRQS_ON + .endif + sti + xorl %esi,%esi /* arg2: oldset */ + movq %rsp,%rdi /* arg1: &pt_regs */ + call do_notify_resume + cli + .if \trace + TRACE_IRQS_OFF + .endif + jmp paranoid_userspace\trace +paranoid_schedule\trace: + .if \trace + TRACE_IRQS_ON + .endif + sti + call schedule + cli + .if \trace + TRACE_IRQS_OFF + .endif + jmp paranoid_userspace\trace + CFI_ENDPROC + .endm + +/* + * Exception entry point. This expects an error code/orig_rax on the stack + * and the exception handler in %rax. + */ +KPROBE_ENTRY(error_entry) + _frame RDI + CFI_REL_OFFSET rax,0 + /* rdi slot contains rax, oldrax contains error code */ + cld + subq $14*8,%rsp + CFI_ADJUST_CFA_OFFSET (14*8) + movq %rsi,13*8(%rsp) + CFI_REL_OFFSET rsi,RSI + movq 14*8(%rsp),%rsi /* load rax from rdi slot */ + CFI_REGISTER rax,rsi + movq %rdx,12*8(%rsp) + CFI_REL_OFFSET rdx,RDX + movq %rcx,11*8(%rsp) + CFI_REL_OFFSET rcx,RCX + movq %rsi,10*8(%rsp) /* store rax */ + CFI_REL_OFFSET rax,RAX + movq %r8, 9*8(%rsp) + CFI_REL_OFFSET r8,R8 + movq %r9, 8*8(%rsp) + CFI_REL_OFFSET r9,R9 + movq %r10,7*8(%rsp) + CFI_REL_OFFSET r10,R10 + movq %r11,6*8(%rsp) + CFI_REL_OFFSET r11,R11 + movq %rbx,5*8(%rsp) + CFI_REL_OFFSET rbx,RBX + movq %rbp,4*8(%rsp) + CFI_REL_OFFSET rbp,RBP + movq %r12,3*8(%rsp) + CFI_REL_OFFSET r12,R12 + movq %r13,2*8(%rsp) + CFI_REL_OFFSET r13,R13 + movq %r14,1*8(%rsp) + CFI_REL_OFFSET r14,R14 + movq %r15,(%rsp) + CFI_REL_OFFSET r15,R15 + xorl %ebx,%ebx + testl $3,CS(%rsp) + je error_kernelspace +error_swapgs: + swapgs +error_sti: + movq %rdi,RDI(%rsp) + CFI_REL_OFFSET rdi,RDI + movq %rsp,%rdi + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) + call *%rax + /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ +error_exit: + movl %ebx,%eax + RESTORE_REST + cli + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + testl %eax,%eax + jne retint_kernel + movl threadinfo_flags(%rcx),%edx + movl $_TIF_WORK_MASK,%edi + andl %edi,%edx + jnz retint_careful + /* + * The iret might restore flags: + */ + TRACE_IRQS_IRETQ + swapgs + RESTORE_ARGS 0,8,0 + jmp iret_label + CFI_ENDPROC + +error_kernelspace: + incl %ebx + /* There are two places in the kernel that can potentially fault with + usergs. Handle them here. The exception handlers after + iret run with kernel gs again, so don't set the user space flag. + B stepping K8s sometimes report an truncated RIP for IRET + exceptions returning to compat mode. Check for these here too. */ + leaq iret_label(%rip),%rbp + cmpq %rbp,RIP(%rsp) + je error_swapgs + movl %ebp,%ebp /* zero extend */ + cmpq %rbp,RIP(%rsp) + je error_swapgs + cmpq $gs_change,RIP(%rsp) + je error_swapgs + jmp error_sti +KPROBE_END(error_entry) + + /* Reload gs selector with exception handling */ + /* edi: new selector */ +ENTRY(load_gs_index) + CFI_STARTPROC + pushf + CFI_ADJUST_CFA_OFFSET 8 + cli + swapgs +gs_change: + movl %edi,%gs +2: mfence /* workaround */ + swapgs + popf + CFI_ADJUST_CFA_OFFSET -8 + ret + CFI_ENDPROC +ENDPROC(load_gs_index) + + .section __ex_table,"a" + .align 8 + .quad gs_change,bad_gs + .previous + .section .fixup,"ax" + /* running with kernelgs */ +bad_gs: + swapgs /* switch back to user gs */ + xorl %eax,%eax + movl %eax,%gs + jmp 2b + .previous + +/* + * Create a kernel thread. + * + * C extern interface: + * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) + * + * asm input arguments: + * rdi: fn, rsi: arg, rdx: flags + */ +ENTRY(kernel_thread) + CFI_STARTPROC + FAKE_STACK_FRAME $child_rip + SAVE_ALL + + # rdi: flags, rsi: usp, rdx: will be &pt_regs + movq %rdx,%rdi + orq kernel_thread_flags(%rip),%rdi + movq $-1, %rsi + movq %rsp, %rdx + + xorl %r8d,%r8d + xorl %r9d,%r9d + + # clone now + call do_fork + movq %rax,RAX(%rsp) + xorl %edi,%edi + + /* + * It isn't worth to check for reschedule here, + * so internally to the x86_64 port you can rely on kernel_thread() + * not to reschedule the child before returning, this avoids the need + * of hacks for example to fork off the per-CPU idle tasks. + * [Hopefully no generic code relies on the reschedule -AK] + */ + RESTORE_ALL + UNFAKE_STACK_FRAME + ret + CFI_ENDPROC +ENDPROC(kernel_thread) + +child_rip: + pushq $0 # fake return address + CFI_STARTPROC + /* + * Here we are in the child and the registers are set as they were + * at kernel_thread() invocation in the parent. + */ + movq %rdi, %rax + movq %rsi, %rdi + call *%rax + # exit + xorl %edi, %edi + call do_exit + CFI_ENDPROC +ENDPROC(child_rip) + +/* + * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. + * + * C extern interface: + * extern long execve(char *name, char **argv, char **envp) + * + * asm input arguments: + * rdi: name, rsi: argv, rdx: envp + * + * We want to fallback into: + * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs) + * + * do_sys_execve asm fallback arguments: + * rdi: name, rsi: argv, rdx: envp, fake frame on the stack + */ +ENTRY(kernel_execve) + CFI_STARTPROC + FAKE_STACK_FRAME $0 + SAVE_ALL + call sys_execve + movq %rax, RAX(%rsp) + RESTORE_REST + testq %rax,%rax + je int_ret_from_sys_call + RESTORE_ARGS + UNFAKE_STACK_FRAME + ret + CFI_ENDPROC +ENDPROC(kernel_execve) + +KPROBE_ENTRY(page_fault) + errorentry do_page_fault +KPROBE_END(page_fault) + +ENTRY(coprocessor_error) + zeroentry do_coprocessor_error +END(coprocessor_error) + +ENTRY(simd_coprocessor_error) + zeroentry do_simd_coprocessor_error +END(simd_coprocessor_error) + +ENTRY(device_not_available) + zeroentry math_state_restore +END(device_not_available) + + /* runs on exception stack */ +KPROBE_ENTRY(debug) + INTR_FRAME + pushq $0 + CFI_ADJUST_CFA_OFFSET 8 + paranoidentry do_debug, DEBUG_STACK + paranoidexit +KPROBE_END(debug) + + /* runs on exception stack */ +KPROBE_ENTRY(nmi) + INTR_FRAME + pushq $-1 + CFI_ADJUST_CFA_OFFSET 8 + paranoidentry do_nmi, 0, 0 +#ifdef CONFIG_TRACE_IRQFLAGS + paranoidexit 0 +#else + jmp paranoid_exit1 + CFI_ENDPROC +#endif +KPROBE_END(nmi) + +KPROBE_ENTRY(int3) + INTR_FRAME + pushq $0 + CFI_ADJUST_CFA_OFFSET 8 + paranoidentry do_int3, DEBUG_STACK + jmp paranoid_exit1 + CFI_ENDPROC +KPROBE_END(int3) + +ENTRY(overflow) + zeroentry do_overflow +END(overflow) + +ENTRY(bounds) + zeroentry do_bounds +END(bounds) + +ENTRY(invalid_op) + zeroentry do_invalid_op +END(invalid_op) + +ENTRY(coprocessor_segment_overrun) + zeroentry do_coprocessor_segment_overrun +END(coprocessor_segment_overrun) + +ENTRY(reserved) + zeroentry do_reserved +END(reserved) + + /* runs on exception stack */ +ENTRY(double_fault) + XCPT_FRAME + paranoidentry do_double_fault + jmp paranoid_exit1 + CFI_ENDPROC +END(double_fault) + +ENTRY(invalid_TSS) + errorentry do_invalid_TSS +END(invalid_TSS) + +ENTRY(segment_not_present) + errorentry do_segment_not_present +END(segment_not_present) + + /* runs on exception stack */ +ENTRY(stack_segment) + XCPT_FRAME + paranoidentry do_stack_segment + jmp paranoid_exit1 + CFI_ENDPROC +END(stack_segment) + +KPROBE_ENTRY(general_protection) + errorentry do_general_protection +KPROBE_END(general_protection) + +ENTRY(alignment_check) + errorentry do_alignment_check +END(alignment_check) + +ENTRY(divide_error) + zeroentry do_divide_error +END(divide_error) + +ENTRY(spurious_interrupt_bug) + zeroentry do_spurious_interrupt_bug +END(spurious_interrupt_bug) + +#ifdef CONFIG_X86_MCE + /* runs on exception stack */ +ENTRY(machine_check) + INTR_FRAME + pushq $0 + CFI_ADJUST_CFA_OFFSET 8 + paranoidentry do_machine_check + jmp paranoid_exit1 + CFI_ENDPROC +END(machine_check) +#endif + +/* Call softirq on interrupt stack. Interrupts are off. */ +ENTRY(call_softirq) + CFI_STARTPROC + push %rbp + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbp,0 + mov %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp + incl %gs:pda_irqcount + cmove %gs:pda_irqstackptr,%rsp + push %rbp # backlink for old unwinder + call __do_softirq + leaveq + CFI_DEF_CFA_REGISTER rsp + CFI_ADJUST_CFA_OFFSET -8 + decl %gs:pda_irqcount + ret + CFI_ENDPROC +ENDPROC(call_softirq) + +KPROBE_ENTRY(ignore_sysret) + CFI_STARTPROC + mov $-ENOSYS,%eax + sysret + CFI_ENDPROC +ENDPROC(ignore_sysret) diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c new file mode 100644 index 0000000..47496a4 --- /dev/null +++ b/arch/x86/kernel/genapic_64.c @@ -0,0 +1,66 @@ +/* + * Copyright 2004 James Cleverdon, IBM. + * Subject to the GNU Public License, v.2 + * + * Generic APIC sub-arch probe layer. + * + * Hacked for x86-64 by James Cleverdon from i386 architecture code by + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and + * James Cleverdon. + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef CONFIG_ACPI +#include +#endif + +/* which logical CPU number maps to which CPU (physical APIC ID) */ +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly + = { [0 ... NR_CPUS-1] = BAD_APICID }; +EXPORT_SYMBOL(x86_cpu_to_apicid); + +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; + +struct genapic __read_mostly *genapic = &apic_flat; + +/* + * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. + */ +void __init setup_apic_routing(void) +{ +#ifdef CONFIG_ACPI + /* + * Quirk: some x86_64 machines can only use physical APIC mode + * regardless of how many processors are present (x86_64 ES7000 + * is an example). + */ + if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) + genapic = &apic_physflat; + else +#endif + + if (cpus_weight(cpu_possible_map) <= 8) + genapic = &apic_flat; + else + genapic = &apic_physflat; + + printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); +} + +/* Same for both flat and physical. */ + +void send_IPI_self(int vector) +{ + __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); +} diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c new file mode 100644 index 0000000..ecb01ee --- /dev/null +++ b/arch/x86/kernel/genapic_flat_64.c @@ -0,0 +1,194 @@ +/* + * Copyright 2004 James Cleverdon, IBM. + * Subject to the GNU Public License, v.2 + * + * Flat APIC subarch code. + * + * Hacked for x86-64 by James Cleverdon from i386 architecture code by + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and + * James Cleverdon. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static cpumask_t flat_target_cpus(void) +{ + return cpu_online_map; +} + +static cpumask_t flat_vector_allocation_domain(int cpu) +{ + /* Careful. Some cpus do not strictly honor the set of cpus + * specified in the interrupt destination when using lowest + * priority interrupt delivery mode. + * + * In particular there was a hyperthreading cpu observed to + * deliver interrupts to the wrong hyperthread when only one + * hyperthread was specified in the interrupt desitination. + */ + cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; + return domain; +} + +/* + * Set up the logical destination ID. + * + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... + */ +static void flat_init_apic_ldr(void) +{ + unsigned long val; + unsigned long num, id; + + num = smp_processor_id(); + id = 1UL << num; + x86_cpu_to_log_apicid[num] = id; + apic_write(APIC_DFR, APIC_DFR_FLAT); + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + val |= SET_APIC_LOGICAL_ID(id); + apic_write(APIC_LDR, val); +} + +static void flat_send_IPI_mask(cpumask_t cpumask, int vector) +{ + unsigned long mask = cpus_addr(cpumask)[0]; + unsigned long flags; + + local_irq_save(flags); + __send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); + local_irq_restore(flags); +} + +static void flat_send_IPI_allbutself(int vector) +{ +#ifdef CONFIG_HOTPLUG_CPU + int hotplug = 1; +#else + int hotplug = 0; +#endif + if (hotplug || vector == NMI_VECTOR) { + cpumask_t allbutme = cpu_online_map; + + cpu_clear(smp_processor_id(), allbutme); + + if (!cpus_empty(allbutme)) + flat_send_IPI_mask(allbutme, vector); + } else if (num_online_cpus() > 1) { + __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); + } +} + +static void flat_send_IPI_all(int vector) +{ + if (vector == NMI_VECTOR) + flat_send_IPI_mask(cpu_online_map, vector); + else + __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); +} + +static int flat_apic_id_registered(void) +{ + return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map); +} + +static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) +{ + return cpus_addr(cpumask)[0] & APIC_ALL_CPUS; +} + +static unsigned int phys_pkg_id(int index_msb) +{ + return hard_smp_processor_id() >> index_msb; +} + +struct genapic apic_flat = { + .name = "flat", + .int_delivery_mode = dest_LowestPrio, + .int_dest_mode = (APIC_DEST_LOGICAL != 0), + .target_cpus = flat_target_cpus, + .vector_allocation_domain = flat_vector_allocation_domain, + .apic_id_registered = flat_apic_id_registered, + .init_apic_ldr = flat_init_apic_ldr, + .send_IPI_all = flat_send_IPI_all, + .send_IPI_allbutself = flat_send_IPI_allbutself, + .send_IPI_mask = flat_send_IPI_mask, + .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .phys_pkg_id = phys_pkg_id, +}; + +/* + * Physflat mode is used when there are more than 8 CPUs on a AMD system. + * We cannot use logical delivery in this case because the mask + * overflows, so use physical mode. + */ + +static cpumask_t physflat_target_cpus(void) +{ + return cpu_online_map; +} + +static cpumask_t physflat_vector_allocation_domain(int cpu) +{ + cpumask_t domain = CPU_MASK_NONE; + cpu_set(cpu, domain); + return domain; +} + + +static void physflat_send_IPI_mask(cpumask_t cpumask, int vector) +{ + send_IPI_mask_sequence(cpumask, vector); +} + +static void physflat_send_IPI_allbutself(int vector) +{ + cpumask_t allbutme = cpu_online_map; + + cpu_clear(smp_processor_id(), allbutme); + physflat_send_IPI_mask(allbutme, vector); +} + +static void physflat_send_IPI_all(int vector) +{ + physflat_send_IPI_mask(cpu_online_map, vector); +} + +static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + cpu = first_cpu(cpumask); + if ((unsigned)cpu < NR_CPUS) + return x86_cpu_to_apicid[cpu]; + else + return BAD_APICID; +} + +struct genapic apic_physflat = { + .name = "physical flat", + .int_delivery_mode = dest_Fixed, + .int_dest_mode = (APIC_DEST_PHYSICAL != 0), + .target_cpus = physflat_target_cpus, + .vector_allocation_domain = physflat_vector_allocation_domain, + .apic_id_registered = flat_apic_id_registered, + .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/ + .send_IPI_all = physflat_send_IPI_all, + .send_IPI_allbutself = physflat_send_IPI_allbutself, + .send_IPI_mask = physflat_send_IPI_mask, + .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, + .phys_pkg_id = phys_pkg_id, +}; diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c new file mode 100644 index 0000000..6c34bdd --- /dev/null +++ b/arch/x86/kernel/head64.c @@ -0,0 +1,86 @@ +/* + * linux/arch/x86_64/kernel/head64.c -- prepare to run common code + * + * Copyright (C) 2000 Andrea Arcangeli SuSE + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void __init zap_identity_mappings(void) +{ + pgd_t *pgd = pgd_offset_k(0UL); + pgd_clear(pgd); + __flush_tlb(); +} + +/* Don't add a printk in there. printk relies on the PDA which is not initialized + yet. */ +static void __init clear_bss(void) +{ + memset(__bss_start, 0, + (unsigned long) __bss_stop - (unsigned long) __bss_start); +} + +#define NEW_CL_POINTER 0x228 /* Relative to real mode data */ +#define OLD_CL_MAGIC_ADDR 0x20 +#define OLD_CL_MAGIC 0xA33F +#define OLD_CL_OFFSET 0x22 + +static void __init copy_bootdata(char *real_mode_data) +{ + unsigned long new_data; + char * command_line; + + memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE); + new_data = *(u32 *) (x86_boot_params + NEW_CL_POINTER); + if (!new_data) { + if (OLD_CL_MAGIC != *(u16 *)(real_mode_data + OLD_CL_MAGIC_ADDR)) { + return; + } + new_data = __pa(real_mode_data) + *(u16 *)(real_mode_data + OLD_CL_OFFSET); + } + command_line = __va(new_data); + memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); +} + +void __init x86_64_start_kernel(char * real_mode_data) +{ + int i; + + /* clear bss before set_intr_gate with early_idt_handler */ + clear_bss(); + + /* Make NULL pointers segfault */ + zap_identity_mappings(); + + for (i = 0; i < IDT_ENTRIES; i++) + set_intr_gate(i, early_idt_handler); + asm volatile("lidt %0" :: "m" (idt_descr)); + + early_printk("Kernel alive\n"); + + for (i = 0; i < NR_CPUS; i++) + cpu_pda(i) = &boot_cpu_pda[i]; + + pda_init(0); + copy_bootdata(__va(real_mode_data)); +#ifdef CONFIG_SMP + cpu_set(0, cpu_online_map); +#endif + start_kernel(); +} diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S new file mode 100644 index 0000000..b6167fe --- /dev/null +++ b/arch/x86/kernel/head_64.S @@ -0,0 +1,416 @@ +/* + * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit + * + * Copyright (C) 2000 Andrea Arcangeli SuSE + * Copyright (C) 2000 Pavel Machek + * Copyright (C) 2000 Karsten Keil + * Copyright (C) 2001,2002 Andi Kleen + * Copyright (C) 2005 Eric Biederman + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* we are not able to switch in one step to the final KERNEL ADRESS SPACE + * because we need identity-mapped pages. + * + */ + + .text + .section .text.head + .code64 + .globl startup_64 +startup_64: + + /* + * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, + * and someone has loaded an identity mapped page table + * for us. These identity mapped page tables map all of the + * kernel pages and possibly all of memory. + * + * %esi holds a physical pointer to real_mode_data. + * + * We come here either directly from a 64bit bootloader, or from + * arch/x86_64/boot/compressed/head.S. + * + * We only come here initially at boot nothing else comes here. + * + * Since we may be loaded at an address different from what we were + * compiled to run at we first fixup the physical addresses in our page + * tables and then reload them. + */ + + /* Compute the delta between the address I am compiled to run at and the + * address I am actually running at. + */ + leaq _text(%rip), %rbp + subq $_text - __START_KERNEL_map, %rbp + + /* Is the address not 2M aligned? */ + movq %rbp, %rax + andl $~LARGE_PAGE_MASK, %eax + testl %eax, %eax + jnz bad_address + + /* Is the address too large? */ + leaq _text(%rip), %rdx + movq $PGDIR_SIZE, %rax + cmpq %rax, %rdx + jae bad_address + + /* Fixup the physical addresses in the page table + */ + addq %rbp, init_level4_pgt + 0(%rip) + addq %rbp, init_level4_pgt + (258*8)(%rip) + addq %rbp, init_level4_pgt + (511*8)(%rip) + + addq %rbp, level3_ident_pgt + 0(%rip) + + addq %rbp, level3_kernel_pgt + (510*8)(%rip) + addq %rbp, level3_kernel_pgt + (511*8)(%rip) + + addq %rbp, level2_fixmap_pgt + (506*8)(%rip) + + /* Add an Identity mapping if I am above 1G */ + leaq _text(%rip), %rdi + andq $LARGE_PAGE_MASK, %rdi + + movq %rdi, %rax + shrq $PUD_SHIFT, %rax + andq $(PTRS_PER_PUD - 1), %rax + jz ident_complete + + leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx + leaq level3_ident_pgt(%rip), %rbx + movq %rdx, 0(%rbx, %rax, 8) + + movq %rdi, %rax + shrq $PMD_SHIFT, %rax + andq $(PTRS_PER_PMD - 1), %rax + leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx + leaq level2_spare_pgt(%rip), %rbx + movq %rdx, 0(%rbx, %rax, 8) +ident_complete: + + /* Fixup the kernel text+data virtual addresses + */ + leaq level2_kernel_pgt(%rip), %rdi + leaq 4096(%rdi), %r8 + /* See if it is a valid page table entry */ +1: testq $1, 0(%rdi) + jz 2f + addq %rbp, 0(%rdi) + /* Go to the next page */ +2: addq $8, %rdi + cmp %r8, %rdi + jne 1b + + /* Fixup phys_base */ + addq %rbp, phys_base(%rip) + +#ifdef CONFIG_SMP + addq %rbp, trampoline_level4_pgt + 0(%rip) + addq %rbp, trampoline_level4_pgt + (511*8)(%rip) +#endif +#ifdef CONFIG_ACPI_SLEEP + addq %rbp, wakeup_level4_pgt + 0(%rip) + addq %rbp, wakeup_level4_pgt + (511*8)(%rip) +#endif + + /* Due to ENTRY(), sometimes the empty space gets filled with + * zeros. Better take a jmp than relying on empty space being + * filled with 0x90 (nop) + */ + jmp secondary_startup_64 +ENTRY(secondary_startup_64) + /* + * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, + * and someone has loaded a mapped page table. + * + * %esi holds a physical pointer to real_mode_data. + * + * We come here either from startup_64 (using physical addresses) + * or from trampoline.S (using virtual addresses). + * + * Using virtual addresses from trampoline.S removes the need + * to have any identity mapped pages in the kernel page table + * after the boot processor executes this code. + */ + + /* Enable PAE mode and PGE */ + xorq %rax, %rax + btsq $5, %rax + btsq $7, %rax + movq %rax, %cr4 + + /* Setup early boot stage 4 level pagetables. */ + movq $(init_level4_pgt - __START_KERNEL_map), %rax + addq phys_base(%rip), %rax + movq %rax, %cr3 + + /* Ensure I am executing from virtual addresses */ + movq $1f, %rax + jmp *%rax +1: + + /* Check if nx is implemented */ + movl $0x80000001, %eax + cpuid + movl %edx,%edi + + /* Setup EFER (Extended Feature Enable Register) */ + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_SCE, %eax /* Enable System Call */ + btl $20,%edi /* No Execute supported? */ + jnc 1f + btsl $_EFER_NX, %eax +1: wrmsr /* Make changes effective */ + + /* Setup cr0 */ +#define CR0_PM 1 /* protected mode */ +#define CR0_MP (1<<1) +#define CR0_ET (1<<4) +#define CR0_NE (1<<5) +#define CR0_WP (1<<16) +#define CR0_AM (1<<18) +#define CR0_PAGING (1<<31) + movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax + /* Make changes effective */ + movq %rax, %cr0 + + /* Setup a boot time stack */ + movq init_rsp(%rip),%rsp + + /* zero EFLAGS after setting rsp */ + pushq $0 + popfq + + /* + * We must switch to a new descriptor in kernel space for the GDT + * because soon the kernel won't have access anymore to the userspace + * addresses where we're currently running on. We have to do that here + * because in 32bit we couldn't load a 64bit linear address. + */ + lgdt cpu_gdt_descr(%rip) + + /* set up data segments. actually 0 would do too */ + movl $__KERNEL_DS,%eax + movl %eax,%ds + movl %eax,%ss + movl %eax,%es + + /* + * We don't really need to load %fs or %gs, but load them anyway + * to kill any stale realmode selectors. This allows execution + * under VT hardware. + */ + movl %eax,%fs + movl %eax,%gs + + /* + * Setup up a dummy PDA. this is just for some early bootup code + * that does in_interrupt() + */ + movl $MSR_GS_BASE,%ecx + movq $empty_zero_page,%rax + movq %rax,%rdx + shrq $32,%rdx + wrmsr + + /* esi is pointer to real mode structure with interesting info. + pass it to C */ + movl %esi, %edi + + /* Finally jump to run C code and to be on real kernel address + * Since we are running on identity-mapped space we have to jump + * to the full 64bit address, this is only possible as indirect + * jump. In addition we need to ensure %cs is set so we make this + * a far return. + */ + movq initial_code(%rip),%rax + pushq $0 # fake return address to stop unwinder + pushq $__KERNEL_CS # set correct cs + pushq %rax # target address in negative space + lretq + + /* SMP bootup changes these two */ +#ifndef CONFIG_HOTPLUG_CPU + .pushsection .init.data +#endif + .align 8 + .globl initial_code +initial_code: + .quad x86_64_start_kernel +#ifndef CONFIG_HOTPLUG_CPU + .popsection +#endif + .globl init_rsp +init_rsp: + .quad init_thread_union+THREAD_SIZE-8 + +bad_address: + jmp bad_address + +ENTRY(early_idt_handler) + cmpl $2,early_recursion_flag(%rip) + jz 1f + incl early_recursion_flag(%rip) + xorl %eax,%eax + movq 8(%rsp),%rsi # get rip + movq (%rsp),%rdx + movq %cr2,%rcx + leaq early_idt_msg(%rip),%rdi + call early_printk + cmpl $2,early_recursion_flag(%rip) + jz 1f + call dump_stack +#ifdef CONFIG_KALLSYMS + leaq early_idt_ripmsg(%rip),%rdi + movq 8(%rsp),%rsi # get rip again + call __print_symbol +#endif +1: hlt + jmp 1b +early_recursion_flag: + .long 0 + +early_idt_msg: + .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n" +early_idt_ripmsg: + .asciz "RIP %s\n" + +.balign PAGE_SIZE + +#define NEXT_PAGE(name) \ + .balign PAGE_SIZE; \ +ENTRY(name) + +/* Automate the creation of 1 to 1 mapping pmd entries */ +#define PMDS(START, PERM, COUNT) \ + i = 0 ; \ + .rept (COUNT) ; \ + .quad (START) + (i << 21) + (PERM) ; \ + i = i + 1 ; \ + .endr + + /* + * This default setting generates an ident mapping at address 0x100000 + * and a mapping for the kernel that precisely maps virtual address + * 0xffffffff80000000 to physical address 0x000000. (always using + * 2Mbyte large pages provided by PAE mode) + */ +NEXT_PAGE(init_level4_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 257,8,0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 252,8,0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + +NEXT_PAGE(level3_ident_pgt) + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 511,8,0 + +NEXT_PAGE(level3_kernel_pgt) + .fill 510,8,0 + /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ + .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + +NEXT_PAGE(level2_fixmap_pgt) + .fill 506,8,0 + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ + .fill 5,8,0 + +NEXT_PAGE(level1_fixmap_pgt) + .fill 512,8,0 + +NEXT_PAGE(level2_ident_pgt) + /* Since I easily can, map the first 1G. + * Don't set NX because code runs from these pages. + */ + PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) + +NEXT_PAGE(level2_kernel_pgt) + /* 40MB kernel mapping. The kernel code cannot be bigger than that. + When you change this change KERNEL_TEXT_SIZE in page.h too. */ + /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */ + PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, KERNEL_TEXT_SIZE/PMD_SIZE) + /* Module mapping starts here */ + .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0 + +NEXT_PAGE(level2_spare_pgt) + .fill 512,8,0 + +#undef PMDS +#undef NEXT_PAGE + + .data + .align 16 + .globl cpu_gdt_descr +cpu_gdt_descr: + .word gdt_end-cpu_gdt_table-1 +gdt: + .quad cpu_gdt_table +#ifdef CONFIG_SMP + .rept NR_CPUS-1 + .word 0 + .quad 0 + .endr +#endif + +ENTRY(phys_base) + /* This must match the first entry in level2_kernel_pgt */ + .quad 0x0000000000000000 + +/* We need valid kernel segments for data and code in long mode too + * IRET will check the segment types kkeil 2000/10/28 + * Also sysret mandates a special GDT layout + */ + + .section .data.page_aligned, "aw" + .align PAGE_SIZE + +/* The TLS descriptors are currently at a different place compared to i386. + Hopefully nobody expects them at a fixed place (Wine?) */ + +ENTRY(cpu_gdt_table) + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x00cf9b000000ffff /* __KERNEL32_CS */ + .quad 0x00af9b000000ffff /* __KERNEL_CS */ + .quad 0x00cf93000000ffff /* __KERNEL_DS */ + .quad 0x00cffb000000ffff /* __USER32_CS */ + .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */ + .quad 0x00affb000000ffff /* __USER_CS */ + .quad 0x0 /* unused */ + .quad 0,0 /* TSS */ + .quad 0,0 /* LDT */ + .quad 0,0,0 /* three TLS descriptors */ + .quad 0x0000f40000000000 /* node/CPU stored in limit */ +gdt_end: + /* asm/segment.h:GDT_ENTRIES must match this */ + /* This should be a multiple of the cache line size */ + /* GDTs of other CPUs are now dynamically allocated */ + + /* zero the remaining page */ + .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0 + + .section .bss, "aw", @nobits + .align L1_CACHE_BYTES +ENTRY(idt_table) + .skip 256 * 16 + + .section .bss.page_aligned, "aw", @nobits + .align PAGE_SIZE +ENTRY(empty_zero_page) + .skip PAGE_SIZE diff --git a/arch/x86/kernel/hpet_64.c b/arch/x86/kernel/hpet_64.c new file mode 100644 index 0000000..e2d1b91 --- /dev/null +++ b/arch/x86/kernel/hpet_64.c @@ -0,0 +1,493 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define HPET_MASK 0xFFFFFFFF +#define HPET_SHIFT 22 + +/* FSEC = 10^-15 NSEC = 10^-9 */ +#define FSEC_PER_NSEC 1000000 + +int nohpet __initdata; + +unsigned long hpet_address; +unsigned long hpet_period; /* fsecs / HPET clock */ +unsigned long hpet_tick; /* HPET clocks / interrupt */ + +int hpet_use_timer; /* Use counter of hpet for time keeping, + * otherwise PIT + */ + +#ifdef CONFIG_HPET +static __init int late_hpet_init(void) +{ + struct hpet_data hd; + unsigned int ntimer; + + if (!hpet_address) + return 0; + + memset(&hd, 0, sizeof(hd)); + + ntimer = hpet_readl(HPET_ID); + ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; + ntimer++; + + /* + * Register with driver. + * Timer0 and Timer1 is used by platform. + */ + hd.hd_phys_address = hpet_address; + hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); + hd.hd_nirqs = ntimer; + hd.hd_flags = HPET_DATA_PLATFORM; + hpet_reserve_timer(&hd, 0); +#ifdef CONFIG_HPET_EMULATE_RTC + hpet_reserve_timer(&hd, 1); +#endif + hd.hd_irq[0] = HPET_LEGACY_8254; + hd.hd_irq[1] = HPET_LEGACY_RTC; + if (ntimer > 2) { + struct hpet *hpet; + struct hpet_timer *timer; + int i; + + hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); + timer = &hpet->hpet_timers[2]; + for (i = 2; i < ntimer; timer++, i++) + hd.hd_irq[i] = (timer->hpet_config & + Tn_INT_ROUTE_CNF_MASK) >> + Tn_INT_ROUTE_CNF_SHIFT; + + } + + hpet_alloc(&hd); + return 0; +} +fs_initcall(late_hpet_init); +#endif + +int hpet_timer_stop_set_go(unsigned long tick) +{ + unsigned int cfg; + +/* + * Stop the timers and reset the main counter. + */ + + cfg = hpet_readl(HPET_CFG); + cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); + hpet_writel(cfg, HPET_CFG); + hpet_writel(0, HPET_COUNTER); + hpet_writel(0, HPET_COUNTER + 4); + +/* + * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, + * and period also hpet_tick. + */ + if (hpet_use_timer) { + hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | + HPET_TN_32BIT, HPET_T0_CFG); + hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ + hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ + cfg |= HPET_CFG_LEGACY; + } +/* + * Go! + */ + + cfg |= HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); + + return 0; +} + +static cycle_t read_hpet(void) +{ + return (cycle_t)hpet_readl(HPET_COUNTER); +} + +static cycle_t __vsyscall_fn vread_hpet(void) +{ + return readl((void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); +} + +struct clocksource clocksource_hpet = { + .name = "hpet", + .rating = 250, + .read = read_hpet, + .mask = (cycle_t)HPET_MASK, + .mult = 0, /* set below */ + .shift = HPET_SHIFT, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .vread = vread_hpet, +}; + +int __init hpet_arch_init(void) +{ + unsigned int id; + u64 tmp; + + if (!hpet_address) + return -1; + set_fixmap_nocache(FIX_HPET_BASE, hpet_address); + __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); + +/* + * Read the period, compute tick and quotient. + */ + + id = hpet_readl(HPET_ID); + + if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) + return -1; + + hpet_period = hpet_readl(HPET_PERIOD); + if (hpet_period < 100000 || hpet_period > 100000000) + return -1; + + hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; + + hpet_use_timer = (id & HPET_ID_LEGSUP); + + /* + * hpet period is in femto seconds per cycle + * so we need to convert this to ns/cyc units + * aproximated by mult/2^shift + * + * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift + * fsec/cyc * 1ns/1000000fsec * 2^shift = mult + * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult + * (fsec/cyc << shift)/1000000 = mult + * (hpet_period << shift)/FSEC_PER_NSEC = mult + */ + tmp = (u64)hpet_period << HPET_SHIFT; + do_div(tmp, FSEC_PER_NSEC); + clocksource_hpet.mult = (u32)tmp; + clocksource_register(&clocksource_hpet); + + return hpet_timer_stop_set_go(hpet_tick); +} + +int hpet_reenable(void) +{ + return hpet_timer_stop_set_go(hpet_tick); +} + +/* + * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing + * it to the HPET timer of known frequency. + */ + +#define TICK_COUNT 100000000 +#define SMI_THRESHOLD 50000 +#define MAX_TRIES 5 + +/* + * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none + * occurs between the reads of the hpet & TSC. + */ +static void __init read_hpet_tsc(int *hpet, int *tsc) +{ + int tsc1, tsc2, hpet1, i; + + for (i = 0; i < MAX_TRIES; i++) { + tsc1 = get_cycles_sync(); + hpet1 = hpet_readl(HPET_COUNTER); + tsc2 = get_cycles_sync(); + if ((tsc2 - tsc1) < SMI_THRESHOLD) + break; + } + *hpet = hpet1; + *tsc = tsc2; +} + +unsigned int __init hpet_calibrate_tsc(void) +{ + int tsc_start, hpet_start; + int tsc_now, hpet_now; + unsigned long flags; + + local_irq_save(flags); + + read_hpet_tsc(&hpet_start, &tsc_start); + + do { + local_irq_disable(); + read_hpet_tsc(&hpet_now, &tsc_now); + local_irq_restore(flags); + } while ((tsc_now - tsc_start) < TICK_COUNT && + (hpet_now - hpet_start) < TICK_COUNT); + + return (tsc_now - tsc_start) * 1000000000L + / ((hpet_now - hpet_start) * hpet_period / 1000); +} + +#ifdef CONFIG_HPET_EMULATE_RTC +/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET + * is enabled, we support RTC interrupt functionality in software. + * RTC has 3 kinds of interrupts: + * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock + * is updated + * 2) Alarm Interrupt - generate an interrupt at a specific time of day + * 3) Periodic Interrupt - generate periodic interrupt, with frequencies + * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) + * (1) and (2) above are implemented using polling at a frequency of + * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt + * overhead. (DEFAULT_RTC_INT_FREQ) + * For (3), we use interrupts at 64Hz or user specified periodic + * frequency, whichever is higher. + */ +#include + +#define DEFAULT_RTC_INT_FREQ 64 +#define RTC_NUM_INTS 1 + +static unsigned long UIE_on; +static unsigned long prev_update_sec; + +static unsigned long AIE_on; +static struct rtc_time alarm_time; + +static unsigned long PIE_on; +static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; +static unsigned long PIE_count; + +static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ +static unsigned int hpet_t1_cmp; /* cached comparator register */ + +int is_hpet_enabled(void) +{ + return hpet_address != 0; +} + +/* + * Timer 1 for RTC, we do not use periodic interrupt feature, + * even if HPET supports periodic interrupts on Timer 1. + * The reason being, to set up a periodic interrupt in HPET, we need to + * stop the main counter. And if we do that everytime someone diables/enables + * RTC, we will have adverse effect on main kernel timer running on Timer 0. + * So, for the time being, simulate the periodic interrupt in software. + * + * hpet_rtc_timer_init() is called for the first time and during subsequent + * interuppts reinit happens through hpet_rtc_timer_reinit(). + */ +int hpet_rtc_timer_init(void) +{ + unsigned int cfg, cnt; + unsigned long flags; + + if (!is_hpet_enabled()) + return 0; + /* + * Set the counter 1 and enable the interrupts. + */ + if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) + hpet_rtc_int_freq = PIE_freq; + else + hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; + + local_irq_save(flags); + + cnt = hpet_readl(HPET_COUNTER); + cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); + hpet_writel(cnt, HPET_T1_CMP); + hpet_t1_cmp = cnt; + + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T1_CFG); + + local_irq_restore(flags); + + return 1; +} + +static void hpet_rtc_timer_reinit(void) +{ + unsigned int cfg, cnt, ticks_per_int, lost_ints; + + if (unlikely(!(PIE_on | AIE_on | UIE_on))) { + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_T1_CFG); + return; + } + + if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) + hpet_rtc_int_freq = PIE_freq; + else + hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; + + /* It is more accurate to use the comparator value than current count.*/ + ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq; + hpet_t1_cmp += ticks_per_int; + hpet_writel(hpet_t1_cmp, HPET_T1_CMP); + + /* + * If the interrupt handler was delayed too long, the write above tries + * to schedule the next interrupt in the past and the hardware would + * not interrupt until the counter had wrapped around. + * So we have to check that the comparator wasn't set to a past time. + */ + cnt = hpet_readl(HPET_COUNTER); + if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) { + lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1; + /* Make sure that, even with the time needed to execute + * this code, the next scheduled interrupt has been moved + * back to the future: */ + lost_ints++; + + hpet_t1_cmp += lost_ints * ticks_per_int; + hpet_writel(hpet_t1_cmp, HPET_T1_CMP); + + if (PIE_on) + PIE_count += lost_ints; + + if (printk_ratelimit()) + printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", + hpet_rtc_int_freq); + } +} + +/* + * The functions below are called from rtc driver. + * Return 0 if HPET is not being used. + * Otherwise do the necessary changes and return 1. + */ +int hpet_mask_rtc_irq_bit(unsigned long bit_mask) +{ + if (!is_hpet_enabled()) + return 0; + + if (bit_mask & RTC_UIE) + UIE_on = 0; + if (bit_mask & RTC_PIE) + PIE_on = 0; + if (bit_mask & RTC_AIE) + AIE_on = 0; + + return 1; +} + +int hpet_set_rtc_irq_bit(unsigned long bit_mask) +{ + int timer_init_reqd = 0; + + if (!is_hpet_enabled()) + return 0; + + if (!(PIE_on | AIE_on | UIE_on)) + timer_init_reqd = 1; + + if (bit_mask & RTC_UIE) { + UIE_on = 1; + } + if (bit_mask & RTC_PIE) { + PIE_on = 1; + PIE_count = 0; + } + if (bit_mask & RTC_AIE) { + AIE_on = 1; + } + + if (timer_init_reqd) + hpet_rtc_timer_init(); + + return 1; +} + +int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) +{ + if (!is_hpet_enabled()) + return 0; + + alarm_time.tm_hour = hrs; + alarm_time.tm_min = min; + alarm_time.tm_sec = sec; + + return 1; +} + +int hpet_set_periodic_freq(unsigned long freq) +{ + if (!is_hpet_enabled()) + return 0; + + PIE_freq = freq; + PIE_count = 0; + + return 1; +} + +int hpet_rtc_dropped_irq(void) +{ + if (!is_hpet_enabled()) + return 0; + + return 1; +} + +irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) +{ + struct rtc_time curr_time; + unsigned long rtc_int_flag = 0; + int call_rtc_interrupt = 0; + + hpet_rtc_timer_reinit(); + + if (UIE_on | AIE_on) { + rtc_get_rtc_time(&curr_time); + } + if (UIE_on) { + if (curr_time.tm_sec != prev_update_sec) { + /* Set update int info, call real rtc int routine */ + call_rtc_interrupt = 1; + rtc_int_flag = RTC_UF; + prev_update_sec = curr_time.tm_sec; + } + } + if (PIE_on) { + PIE_count++; + if (PIE_count >= hpet_rtc_int_freq/PIE_freq) { + /* Set periodic int info, call real rtc int routine */ + call_rtc_interrupt = 1; + rtc_int_flag |= RTC_PF; + PIE_count = 0; + } + } + if (AIE_on) { + if ((curr_time.tm_sec == alarm_time.tm_sec) && + (curr_time.tm_min == alarm_time.tm_min) && + (curr_time.tm_hour == alarm_time.tm_hour)) { + /* Set alarm int info, call real rtc int routine */ + call_rtc_interrupt = 1; + rtc_int_flag |= RTC_AF; + } + } + if (call_rtc_interrupt) { + rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); + rtc_interrupt(rtc_int_flag, dev_id); + } + return IRQ_HANDLED; +} +#endif + +static int __init nohpet_setup(char *s) +{ + nohpet = 1; + return 1; +} + +__setup("nohpet", nohpet_setup); diff --git a/arch/x86/kernel/i387_64.c b/arch/x86/kernel/i387_64.c new file mode 100644 index 0000000..1d58c13 --- /dev/null +++ b/arch/x86/kernel/i387_64.c @@ -0,0 +1,151 @@ +/* + * linux/arch/x86_64/kernel/i387.c + * + * Copyright (C) 1994 Linus Torvalds + * Copyright (C) 2002 Andi Kleen, SuSE Labs + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes , May 2000 + * + * x86-64 rework 2002 Andi Kleen. + * Does direct fxsave in and out of user space now for signal handlers. + * All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation, + * the 64bit user space sees a FXSAVE frame directly. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned int mxcsr_feature_mask __read_mostly = 0xffffffff; + +void mxcsr_feature_mask_init(void) +{ + unsigned int mask; + clts(); + memset(¤t->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); + asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); + mask = current->thread.i387.fxsave.mxcsr_mask; + if (mask == 0) mask = 0x0000ffbf; + mxcsr_feature_mask &= mask; + stts(); +} + +/* + * Called at bootup to set up the initial FPU state that is later cloned + * into all processes. + */ +void __cpuinit fpu_init(void) +{ + unsigned long oldcr0 = read_cr0(); + extern void __bad_fxsave_alignment(void); + + if (offsetof(struct task_struct, thread.i387.fxsave) & 15) + __bad_fxsave_alignment(); + set_in_cr4(X86_CR4_OSFXSR); + set_in_cr4(X86_CR4_OSXMMEXCPT); + + write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */ + + mxcsr_feature_mask_init(); + /* clean state in init */ + current_thread_info()->status = 0; + clear_used_math(); +} + +void init_fpu(struct task_struct *child) +{ + if (tsk_used_math(child)) { + if (child == current) + unlazy_fpu(child); + return; + } + memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); + child->thread.i387.fxsave.cwd = 0x37f; + child->thread.i387.fxsave.mxcsr = 0x1f80; + /* only the device not available exception or ptrace can call init_fpu */ + set_stopped_child_used_math(child); +} + +/* + * Signal frame handlers. + */ + +int save_i387(struct _fpstate __user *buf) +{ + struct task_struct *tsk = current; + int err = 0; + + BUILD_BUG_ON(sizeof(struct user_i387_struct) != + sizeof(tsk->thread.i387.fxsave)); + + if ((unsigned long)buf % 16) + printk("save_i387: bad fpstate %p\n",buf); + + if (!used_math()) + return 0; + clear_used_math(); /* trigger finit */ + if (task_thread_info(tsk)->status & TS_USEDFPU) { + err = save_i387_checking((struct i387_fxsave_struct __user *)buf); + if (err) return err; + stts(); + } else { + if (__copy_to_user(buf, &tsk->thread.i387.fxsave, + sizeof(struct i387_fxsave_struct))) + return -1; + } + return 1; +} + +/* + * ptrace request handlers. + */ + +int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk) +{ + init_fpu(tsk); + return __copy_to_user(buf, &tsk->thread.i387.fxsave, + sizeof(struct user_i387_struct)) ? -EFAULT : 0; +} + +int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf) +{ + if (__copy_from_user(&tsk->thread.i387.fxsave, buf, + sizeof(struct user_i387_struct))) + return -EFAULT; + return 0; +} + +/* + * FPU state for core dumps. + */ + +int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu ) +{ + struct task_struct *tsk = current; + + if (!used_math()) + return 0; + + unlazy_fpu(tsk); + memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); + return 1; +} + +int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu) +{ + int fpvalid = !!tsk_used_math(tsk); + + if (fpvalid) { + if (tsk == current) + unlazy_fpu(tsk); + memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); +} + return fpvalid; +} diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c new file mode 100644 index 0000000..948cae6 --- /dev/null +++ b/arch/x86/kernel/i8259_64.c @@ -0,0 +1,544 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Common place to define all x86 IRQ vectors + * + * This builds up the IRQ handler stubs using some ugly macros in irq.h + * + * These macros create the low-level assembly IRQ routines that save + * register context and call do_IRQ(). do_IRQ() then does all the + * operations that are needed to keep the AT (or SMP IOAPIC) + * interrupt-controller happy. + */ + +#define BI(x,y) \ + BUILD_IRQ(x##y) + +#define BUILD_16_IRQS(x) \ + BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ + BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ + BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ + BI(x,c) BI(x,d) BI(x,e) BI(x,f) + +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x30-0x3f) + */ + +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. + * + * (these are usually mapped into the 0x30-0xff vector range) + */ + BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) +BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) +BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) +BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) + +#undef BUILD_16_IRQS +#undef BI + + +#define IRQ(x,y) \ + IRQ##x##y##_interrupt + +#define IRQLIST_16(x) \ + IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ + IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ + IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ + IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) + +/* for the irq vectors */ +static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { + IRQLIST_16(0x2), IRQLIST_16(0x3), + IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), + IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), + IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) +}; + +#undef IRQ +#undef IRQLIST_16 + +/* + * This is the 'legacy' 8259A Programmable Interrupt Controller, + * present in the majority of PC/AT boxes. + * plus some generic x86 specific things if generic specifics makes + * any sense at all. + * this file should become arch/i386/kernel/irq.c when the old irq.c + * moves to arch independent land + */ + +static int i8259A_auto_eoi; +DEFINE_SPINLOCK(i8259A_lock); +static void mask_and_ack_8259A(unsigned int); + +static struct irq_chip i8259A_chip = { + .name = "XT-PIC", + .mask = disable_8259A_irq, + .disable = disable_8259A_irq, + .unmask = enable_8259A_irq, + .mask_ack = mask_and_ack_8259A, +}; + +/* + * 8259A PIC functions to handle ISA devices: + */ + +/* + * This contains the irq mask for both 8259A irq controllers, + */ +static unsigned int cached_irq_mask = 0xffff; + +#define __byte(x,y) (((unsigned char *)&(y))[x]) +#define cached_21 (__byte(0,cached_irq_mask)) +#define cached_A1 (__byte(1,cached_irq_mask)) + +/* + * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) + * boards the timer interrupt is not really connected to any IO-APIC pin, + * it's fed to the master 8259A's IR0 line only. + * + * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. + * this 'mixed mode' IRQ handling costs nothing because it's only used + * at IRQ setup time. + */ +unsigned long io_apic_irqs; + +void disable_8259A_irq(unsigned int irq) +{ + unsigned int mask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask |= mask; + if (irq & 8) + outb(cached_A1,0xA1); + else + outb(cached_21,0x21); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +void enable_8259A_irq(unsigned int irq) +{ + unsigned int mask = ~(1 << irq); + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask &= mask; + if (irq & 8) + outb(cached_A1,0xA1); + else + outb(cached_21,0x21); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +int i8259A_irq_pending(unsigned int irq) +{ + unsigned int mask = 1<> 8); + spin_unlock_irqrestore(&i8259A_lock, flags); + + return ret; +} + +void make_8259A_irq(unsigned int irq) +{ + disable_irq_nosync(irq); + io_apic_irqs &= ~(1<> 8); + outb(0x0A,0xA0); /* back to the IRR register */ + return value; +} + +/* + * Careful! The 8259A is a fragile beast, it pretty + * much _has_ to be done exactly like this (mask it + * first, _then_ send the EOI, and the order of EOI + * to the two 8259s is important! + */ +static void mask_and_ack_8259A(unsigned int irq) +{ + unsigned int irqmask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + /* + * Lightweight spurious IRQ detection. We do not want + * to overdo spurious IRQ handling - it's usually a sign + * of hardware problems, so we only do the checks we can + * do without slowing down good hardware unnecessarily. + * + * Note that IRQ7 and IRQ15 (the two spurious IRQs + * usually resulting from the 8259A-1|2 PICs) occur + * even if the IRQ is masked in the 8259A. Thus we + * can check spurious 8259A IRQs without doing the + * quite slow i8259A_irq_real() call for every IRQ. + * This does not cover 100% of spurious interrupts, + * but should be enough to warn the user that there + * is something bad going on ... + */ + if (cached_irq_mask & irqmask) + goto spurious_8259A_irq; + cached_irq_mask |= irqmask; + +handle_real_irq: + if (irq & 8) { + inb(0xA1); /* DUMMY - (do we need this?) */ + outb(cached_A1,0xA1); + outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */ + outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */ + } else { + inb(0x21); /* DUMMY - (do we need this?) */ + outb(cached_21,0x21); + outb(0x60+irq,0x20); /* 'Specific EOI' to master */ + } + spin_unlock_irqrestore(&i8259A_lock, flags); + return; + +spurious_8259A_irq: + /* + * this is the slow path - should happen rarely. + */ + if (i8259A_irq_real(irq)) + /* + * oops, the IRQ _is_ in service according to the + * 8259A - not spurious, go handle it. + */ + goto handle_real_irq; + + { + static int spurious_irq_mask; + /* + * At this point we can be sure the IRQ is spurious, + * lets ACK and report it. [once per IRQ] + */ + if (!(spurious_irq_mask & irqmask)) { + printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); + spurious_irq_mask |= irqmask; + } + atomic_inc(&irq_err_count); + /* + * Theoretically we do not have to handle this IRQ, + * but in Linux this does not cause problems and is + * simpler for us. + */ + goto handle_real_irq; + } +} + +void init_8259A(int auto_eoi) +{ + unsigned long flags; + + i8259A_auto_eoi = auto_eoi; + + spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, 0x21); /* mask all of 8259A-1 */ + outb(0xff, 0xA1); /* mask all of 8259A-2 */ + + /* + * outb_p - this has to work on a wide range of PC hardware. + */ + outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */ + outb_p(IRQ0_VECTOR, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ + outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */ + if (auto_eoi) + outb_p(0x03, 0x21); /* master does Auto EOI */ + else + outb_p(0x01, 0x21); /* master expects normal EOI */ + + outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */ + outb_p(IRQ8_VECTOR, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ + outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */ + outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode + is to be investigated) */ + + if (auto_eoi) + /* + * in AEOI mode we just have to mask the interrupt + * when acking. + */ + i8259A_chip.mask_ack = disable_8259A_irq; + else + i8259A_chip.mask_ack = mask_and_ack_8259A; + + udelay(100); /* wait for 8259A to initialize */ + + outb(cached_21, 0x21); /* restore master IRQ mask */ + outb(cached_A1, 0xA1); /* restore slave IRQ mask */ + + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +static char irq_trigger[2]; +/** + * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ + */ +static void restore_ELCR(char *trigger) +{ + outb(trigger[0], 0x4d0); + outb(trigger[1], 0x4d1); +} + +static void save_ELCR(char *trigger) +{ + /* IRQ 0,1,2,8,13 are marked as reserved */ + trigger[0] = inb(0x4d0) & 0xF8; + trigger[1] = inb(0x4d1) & 0xDE; +} + +static int i8259A_resume(struct sys_device *dev) +{ + init_8259A(i8259A_auto_eoi); + restore_ELCR(irq_trigger); + return 0; +} + +static int i8259A_suspend(struct sys_device *dev, pm_message_t state) +{ + save_ELCR(irq_trigger); + return 0; +} + +static int i8259A_shutdown(struct sys_device *dev) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. + */ + outb(0xff, 0x21); /* mask all of 8259A-1 */ + outb(0xff, 0xA1); /* mask all of 8259A-1 */ + return 0; +} + +static struct sysdev_class i8259_sysdev_class = { + set_kset_name("i8259"), + .suspend = i8259A_suspend, + .resume = i8259A_resume, + .shutdown = i8259A_shutdown, +}; + +static struct sys_device device_i8259A = { + .id = 0, + .cls = &i8259_sysdev_class, +}; + +static int __init i8259A_init_sysfs(void) +{ + int error = sysdev_class_register(&i8259_sysdev_class); + if (!error) + error = sysdev_register(&device_i8259A); + return error; +} + +device_initcall(i8259A_init_sysfs); + +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ + +static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +DEFINE_PER_CPU(vector_irq_t, vector_irq) = { + [0 ... IRQ0_VECTOR - 1] = -1, + [IRQ0_VECTOR] = 0, + [IRQ1_VECTOR] = 1, + [IRQ2_VECTOR] = 2, + [IRQ3_VECTOR] = 3, + [IRQ4_VECTOR] = 4, + [IRQ5_VECTOR] = 5, + [IRQ6_VECTOR] = 6, + [IRQ7_VECTOR] = 7, + [IRQ8_VECTOR] = 8, + [IRQ9_VECTOR] = 9, + [IRQ10_VECTOR] = 10, + [IRQ11_VECTOR] = 11, + [IRQ12_VECTOR] = 12, + [IRQ13_VECTOR] = 13, + [IRQ14_VECTOR] = 14, + [IRQ15_VECTOR] = 15, + [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 +}; + +void __init init_ISA_irqs (void) +{ + int i; + + init_bsp_APIC(); + init_8259A(0); + + for (i = 0; i < NR_IRQS; i++) { + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = NULL; + irq_desc[i].depth = 1; + + if (i < 16) { + /* + * 16 old-style INTA-cycle interrupts: + */ + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); + } else { + /* + * 'high' PCI IRQs filled in on demand + */ + irq_desc[i].chip = &no_irq_chip; + } + } +} + +static void setup_timer_hardware(void) +{ + outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */ + udelay(10); + outb_p(LATCH & 0xff , 0x40); /* LSB */ + udelay(10); + outb(LATCH >> 8 , 0x40); /* MSB */ +} + +static int timer_resume(struct sys_device *dev) +{ + setup_timer_hardware(); + return 0; +} + +void i8254_timer_resume(void) +{ + setup_timer_hardware(); +} + +static struct sysdev_class timer_sysclass = { + set_kset_name("timer_pit"), + .resume = timer_resume, +}; + +static struct sys_device device_timer = { + .id = 0, + .cls = &timer_sysclass, +}; + +static int __init init_timer_sysfs(void) +{ + int error = sysdev_class_register(&timer_sysclass); + if (!error) + error = sysdev_register(&device_timer); + return error; +} + +device_initcall(init_timer_sysfs); + +void __init init_IRQ(void) +{ + int i; + + init_ISA_irqs(); + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (vector != IA32_SYSCALL_VECTOR) + set_intr_gate(vector, interrupt[i]); + } + +#ifdef CONFIG_SMP + /* + * The reschedule interrupt is a CPU-to-CPU reschedule-helper + * IPI, driven by wakeup. + */ + set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + + /* IPIs for invalidation */ + set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); + + /* IPI for generic function call */ + set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + + /* Low priority IPI to cleanup after moving an irq */ + set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); +#endif + set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); + + /* self generated IPI for local APIC timer */ + set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + + /* IPI vectors for APIC spurious and error interrupts */ + set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + + /* + * Set the clock to HZ Hz, we already have a valid + * vector now: + */ + setup_timer_hardware(); + + if (!acpi_ioapic) + setup_irq(2, &irq2); +} diff --git a/arch/x86/kernel/init_task_64.c b/arch/x86/kernel/init_task_64.c new file mode 100644 index 0000000..4ff33d4 --- /dev/null +++ b/arch/x86/kernel/init_task_64.c @@ -0,0 +1,54 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static struct fs_struct init_fs = INIT_FS; +static struct files_struct init_files = INIT_FILES; +static struct signal_struct init_signals = INIT_SIGNALS(init_signals); +static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); +struct mm_struct init_mm = INIT_MM(init_mm); + +EXPORT_SYMBOL(init_mm); + +/* + * Initial task structure. + * + * We need to make sure that this is 8192-byte aligned due to the + * way process stacks are handled. This is done by having a special + * "init_task" linker map entry.. + */ +union thread_union init_thread_union + __attribute__((__section__(".data.init_task"))) = + { INIT_THREAD_INFO(init_task) }; + +/* + * Initial task structure. + * + * All other task structs will be allocated on slabs in fork.c + */ +struct task_struct init_task = INIT_TASK(init_task); + +EXPORT_SYMBOL(init_task); +/* + * per-CPU TSS segments. Threads are completely 'soft' on Linux, + * no more per-task TSS's. The TSS size is kept cacheline-aligned + * so they are allowed to end up in the .data.cacheline_aligned + * section. Since TSS's are completely CPU-local, we want them + * on exact cacheline boundaries, to eliminate cacheline ping-pong. + */ +DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; + +/* Copies of the original ist values from the tss are only accessed during + * debugging, no special alignment required. + */ +DEFINE_PER_CPU(struct orig_ist, orig_ist); + +#define ALIGN_TO_4K __attribute__((section(".data.init_task"))) diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c new file mode 100644 index 0000000..966fa10 --- /dev/null +++ b/arch/x86/kernel/io_apic_64.c @@ -0,0 +1,2202 @@ +/* + * Intel IO-APIC support for multi-Pentium hosts. + * + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo + * + * Many thanks to Stig Venaas for trying out countless experimental + * patches and reporting/debugging problems patiently! + * + * (c) 1999, Multiple IO-APIC support, developed by + * Ken-ichi Yaku and + * Hidemi Kishimoto , + * further tested and cleaned up by Zach Brown + * and Ingo Molnar + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively + * Paul Diefenbaugh : Added full ACPI support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_ACPI +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct irq_cfg { + cpumask_t domain; + cpumask_t old_domain; + unsigned move_cleanup_count; + u8 vector; + u8 move_in_progress : 1; +}; + +/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ +struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { + [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, + [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, + [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, + [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, + [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, + [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, + [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, + [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, + [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, + [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, + [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, + [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, + [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, + [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, + [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, + [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, +}; + +static int assign_irq_vector(int irq, cpumask_t mask); + +#define __apicdebuginit __init + +int sis_apic_bug; /* not actually supported, dummy for compile */ + +static int no_timer_check; + +static int disable_timer_pin_1 __initdata; + +int timer_over_8254 __initdata = 1; + +/* Where if anywhere is the i8259 connect in external int mode */ +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; + +static DEFINE_SPINLOCK(ioapic_lock); +DEFINE_SPINLOCK(vector_lock); + +/* + * # of IRQ routing registers + */ +int nr_ioapic_registers[MAX_IO_APICS]; + +/* + * Rough estimation of how many shared IRQs there are, can + * be changed anytime. + */ +#define MAX_PLUS_SHARED_IRQS NR_IRQS +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) + +/* + * This is performance-critical, we want to do it O(1) + * + * the indexing order of this array favors 1:1 mappings + * between pins and IRQs. + */ + +static struct irq_pin_list { + short apic, pin, next; +} irq_2_pin[PIN_MAP_SIZE]; + +struct io_apic { + unsigned int index; + unsigned int unused[3]; + unsigned int data; +}; + +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) +{ + return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) + + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); +} + +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + return readl(&io_apic->data); +} + +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +} + +/* + * Re-write a value: to be used for read-modify-write + * cycles where the read already set up the index register. + */ +static inline void io_apic_modify(unsigned int apic, unsigned int value) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(value, &io_apic->data); +} + +static int io_apic_level_ack_pending(unsigned int irq) +{ + struct irq_pin_list *entry; + unsigned long flags; + int pending = 0; + + spin_lock_irqsave(&ioapic_lock, flags); + entry = irq_2_pin + irq; + for (;;) { + unsigned int reg; + int pin; + + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + /* Is the remote IRR bit set? */ + pending |= (reg >> 14) & 1; + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + spin_unlock_irqrestore(&ioapic_lock, flags); + return pending; +} + +/* + * Synchronize the IO-APIC and the CPU by doing + * a dummy read from the IO-APIC + */ +static inline void io_apic_sync(unsigned int apic) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + readl(&io_apic->data); +} + +#define __DO_ACTION(R, ACTION, FINAL) \ + \ +{ \ + int pin; \ + struct irq_pin_list *entry = irq_2_pin + irq; \ + \ + BUG_ON(irq >= NR_IRQS); \ + for (;;) { \ + unsigned int reg; \ + pin = entry->pin; \ + if (pin == -1) \ + break; \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + reg ACTION; \ + io_apic_modify(entry->apic, reg); \ + FINAL; \ + if (!entry->next) \ + break; \ + entry = irq_2_pin + entry->next; \ + } \ +} + +union entry_union { + struct { u32 w1, w2; }; + struct IO_APIC_route_entry entry; +}; + +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + return eu.entry; +} + +/* + * When we write a new IO APIC routing entry, we need to write the high + * word first! If the mask bit in the low word is clear, we will enable + * the interrupt, and we need to make sure the entry is fully populated + * before that happens. + */ +static void +__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + union entry_union eu; + eu.entry = e; + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(apic, pin, e); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +/* + * When we mask an IO APIC routing entry, we need to write the low + * word first, in order to set the mask bit before we change the + * high bits! + */ +static void ioapic_mask_entry(int apic, int pin) +{ + unsigned long flags; + union entry_union eu = { .entry.mask = 1 }; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +#ifdef CONFIG_SMP +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) +{ + int apic, pin; + struct irq_pin_list *entry = irq_2_pin + irq; + + BUG_ON(irq >= NR_IRQS); + for (;;) { + unsigned int reg; + apic = entry->apic; + pin = entry->pin; + if (pin == -1) + break; + io_apic_write(apic, 0x11 + pin*2, dest); + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~0x000000ff; + reg |= vector; + io_apic_modify(apic, reg); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg = irq_cfg + irq; + unsigned long flags; + unsigned int dest; + cpumask_t tmp; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + /* + * Only the high 8 bits are valid. + */ + dest = SET_APIC_LOGICAL_ID(dest); + + spin_lock_irqsave(&ioapic_lock, flags); + __target_IO_APIC_irq(irq, dest, cfg->vector); + irq_desc[irq].affinity = mask; + spin_unlock_irqrestore(&ioapic_lock, flags); +} +#endif + +/* + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are + * shared ISA-space IRQs, so we have to support them. We are super + * fast in the common case, and fast for shared ISA-space IRQs. + */ +static void add_pin_to_irq(unsigned int irq, int apic, int pin) +{ + static int first_free_entry = NR_IRQS; + struct irq_pin_list *entry = irq_2_pin + irq; + + BUG_ON(irq >= NR_IRQS); + while (entry->next) + entry = irq_2_pin + entry->next; + + if (entry->pin != -1) { + entry->next = first_free_entry; + entry = irq_2_pin + entry->next; + if (++first_free_entry >= PIN_MAP_SIZE) + panic("io_apic.c: ran out of irq_2_pin entries!"); + } + entry->apic = apic; + entry->pin = pin; +} + + +#define DO_ACTION(name,R,ACTION, FINAL) \ + \ + static void name##_IO_APIC_irq (unsigned int irq) \ + __DO_ACTION(R, ACTION, FINAL) + +DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) + /* mask = 1 */ +DO_ACTION( __unmask, 0, &= 0xfffeffff, ) + /* mask = 0 */ + +static void mask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __mask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void unmask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +{ + struct IO_APIC_route_entry entry; + + /* Check delivery_mode to be sure we're not clearing an SMI pin */ + entry = ioapic_read_entry(apic, pin); + if (entry.delivery_mode == dest_SMI) + return; + /* + * Disable it in the IO-APIC irq-routing table: + */ + ioapic_mask_entry(apic, pin); +} + +static void clear_IO_APIC (void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + clear_IO_APIC_pin(apic, pin); +} + +int skip_ioapic_setup; +int ioapic_force; + +static int __init parse_noapic(char *str) +{ + disable_ioapic_setup(); + return 0; +} +early_param("noapic", parse_noapic); + +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ +static int __init disable_timer_pin_setup(char *arg) +{ + disable_timer_pin_1 = 1; + return 1; +} +__setup("disable_timer_pin_1", disable_timer_pin_setup); + +static int __init setup_disable_8254_timer(char *s) +{ + timer_over_8254 = -1; + return 1; +} +static int __init setup_enable_8254_timer(char *s) +{ + timer_over_8254 = 2; + return 1; +} + +__setup("disable_8254_timer", setup_disable_8254_timer); +__setup("enable_8254_timer", setup_enable_8254_timer); + + +/* + * Find the IRQ entry number of a certain pin. + */ +static int find_irq_entry(int apic, int pin, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mpc_irqtype == type && + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && + mp_irqs[i].mpc_dstirq == pin) + return i; + + return -1; +} + +/* + * Find the pin to which IRQ[irq] (ISA) is connected + */ +static int __init find_isa_irq_pin(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + if (test_bit(lbus, mp_bus_not_pci) && + (mp_irqs[i].mpc_irqtype == type) && + (mp_irqs[i].mpc_srcbusirq == irq)) + + return mp_irqs[i].mpc_dstirq; + } + return -1; +} + +static int __init find_isa_irq_apic(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + if (test_bit(lbus, mp_bus_not_pci) && + (mp_irqs[i].mpc_irqtype == type) && + (mp_irqs[i].mpc_srcbusirq == irq)) + break; + } + if (i < mp_irq_entries) { + int apic; + for(apic = 0; apic < nr_ioapics; apic++) { + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) + return apic; + } + } + + return -1; +} + +/* + * Find a specific PCI IRQ entry. + * Not an __init, possibly needed by modules + */ +static int pin_2_irq(int idx, int apic, int pin); + +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) +{ + int apic, i, best_guess = -1; + + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); + if (mp_bus_id_to_pci_bus[bus] == -1) { + apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + for (apic = 0; apic < nr_ioapics; apic++) + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) + break; + + if (!test_bit(lbus, mp_bus_not_pci) && + !mp_irqs[i].mpc_irqtype && + (bus == lbus) && + (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); + + if (!(apic || IO_APIC_IRQ(irq))) + continue; + + if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) + return irq; + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. + */ + if (best_guess < 0) + best_guess = irq; + } + } + BUG_ON(best_guess >= NR_IRQS); + return best_guess; +} + +/* ISA interrupts are always polarity zero edge triggered, + * when listed as conforming in the MP table. */ + +#define default_ISA_trigger(idx) (0) +#define default_ISA_polarity(idx) (0) + +/* PCI interrupts are always polarity one level triggered, + * when listed as conforming in the MP table. */ + +#define default_PCI_trigger(idx) (1) +#define default_PCI_polarity(idx) (1) + +static int __init MPBIOS_polarity(int idx) +{ + int bus = mp_irqs[idx].mpc_srcbus; + int polarity; + + /* + * Determine IRQ line polarity (high active or low active): + */ + switch (mp_irqs[idx].mpc_irqflag & 3) + { + case 0: /* conforms, ie. bus-type dependent polarity */ + if (test_bit(bus, mp_bus_not_pci)) + polarity = default_ISA_polarity(idx); + else + polarity = default_PCI_polarity(idx); + break; + case 1: /* high active */ + { + polarity = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + case 3: /* low active */ + { + polarity = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + return polarity; +} + +static int MPBIOS_trigger(int idx) +{ + int bus = mp_irqs[idx].mpc_srcbus; + int trigger; + + /* + * Determine IRQ trigger mode (edge or level sensitive): + */ + switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) + { + case 0: /* conforms, ie. bus-type dependent */ + if (test_bit(bus, mp_bus_not_pci)) + trigger = default_ISA_trigger(idx); + else + trigger = default_PCI_trigger(idx); + break; + case 1: /* edge */ + { + trigger = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + case 3: /* level */ + { + trigger = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 0; + break; + } + } + return trigger; +} + +static inline int irq_polarity(int idx) +{ + return MPBIOS_polarity(idx); +} + +static inline int irq_trigger(int idx) +{ + return MPBIOS_trigger(idx); +} + +static int pin_2_irq(int idx, int apic, int pin) +{ + int irq, i; + int bus = mp_irqs[idx].mpc_srcbus; + + /* + * Debugging check, we are in big trouble if this message pops up! + */ + if (mp_irqs[idx].mpc_dstirq != pin) + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); + + if (test_bit(bus, mp_bus_not_pci)) { + irq = mp_irqs[idx].mpc_srcbusirq; + } else { + /* + * PCI IRQs are mapped in order + */ + i = irq = 0; + while (i < apic) + irq += nr_ioapic_registers[i++]; + irq += pin; + } + BUG_ON(irq >= NR_IRQS); + return irq; +} + +static int __assign_irq_vector(int irq, cpumask_t mask) +{ + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; + unsigned int old_vector; + int cpu; + struct irq_cfg *cfg; + + BUG_ON((unsigned)irq >= NR_IRQS); + cfg = &irq_cfg[irq]; + + /* Only try and allocate irqs on cpus that are present */ + cpus_and(mask, mask, cpu_online_map); + + if ((cfg->move_in_progress) || cfg->move_cleanup_count) + return -EBUSY; + + old_vector = cfg->vector; + if (old_vector) { + cpumask_t tmp; + cpus_and(tmp, cfg->domain, mask); + if (!cpus_empty(tmp)) + return 0; + } + + for_each_cpu_mask(cpu, mask) { + cpumask_t domain, new_mask; + int new_cpu; + int vector, offset; + + domain = vector_allocation_domain(cpu); + cpus_and(new_mask, domain, cpu_online_map); + + vector = current_vector; + offset = current_offset; +next: + vector += 8; + if (vector >= FIRST_SYSTEM_VECTOR) { + /* If we run out of vectors on large boxen, must share them. */ + offset = (offset + 1) % 8; + vector = FIRST_DEVICE_VECTOR + offset; + } + if (unlikely(current_vector == vector)) + continue; + if (vector == IA32_SYSCALL_VECTOR) + goto next; + for_each_cpu_mask(new_cpu, new_mask) + if (per_cpu(vector_irq, new_cpu)[vector] != -1) + goto next; + /* Found one! */ + current_vector = vector; + current_offset = offset; + if (old_vector) { + cfg->move_in_progress = 1; + cfg->old_domain = cfg->domain; + } + for_each_cpu_mask(new_cpu, new_mask) + per_cpu(vector_irq, new_cpu)[vector] = irq; + cfg->vector = vector; + cfg->domain = domain; + return 0; + } + return -ENOSPC; +} + +static int assign_irq_vector(int irq, cpumask_t mask) +{ + int err; + unsigned long flags; + + spin_lock_irqsave(&vector_lock, flags); + err = __assign_irq_vector(irq, mask); + spin_unlock_irqrestore(&vector_lock, flags); + return err; +} + +static void __clear_irq_vector(int irq) +{ + struct irq_cfg *cfg; + cpumask_t mask; + int cpu, vector; + + BUG_ON((unsigned)irq >= NR_IRQS); + cfg = &irq_cfg[irq]; + BUG_ON(!cfg->vector); + + vector = cfg->vector; + cpus_and(mask, cfg->domain, cpu_online_map); + for_each_cpu_mask(cpu, mask) + per_cpu(vector_irq, cpu)[vector] = -1; + + cfg->vector = 0; + cfg->domain = CPU_MASK_NONE; +} + +void __setup_vector_irq(int cpu) +{ + /* Initialize vector_irq on a new cpu */ + /* This function must be called with vector_lock held */ + int irq, vector; + + /* Mark the inuse vectors */ + for (irq = 0; irq < NR_IRQS; ++irq) { + if (!cpu_isset(cpu, irq_cfg[irq].domain)) + continue; + vector = irq_cfg[irq].vector; + per_cpu(vector_irq, cpu)[vector] = irq; + } + /* Mark the free vectors */ + for (vector = 0; vector < NR_VECTORS; ++vector) { + irq = per_cpu(vector_irq, cpu)[vector]; + if (irq < 0) + continue; + if (!cpu_isset(cpu, irq_cfg[irq].domain)) + per_cpu(vector_irq, cpu)[vector] = -1; + } +} + + +static struct irq_chip ioapic_chip; + +static void ioapic_register_intr(int irq, unsigned long trigger) +{ + if (trigger) { + irq_desc[irq].status |= IRQ_LEVEL; + set_irq_chip_and_handler_name(irq, &ioapic_chip, + handle_fasteoi_irq, "fasteoi"); + } else { + irq_desc[irq].status &= ~IRQ_LEVEL; + set_irq_chip_and_handler_name(irq, &ioapic_chip, + handle_edge_irq, "edge"); + } +} + +static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, + int trigger, int polarity) +{ + struct irq_cfg *cfg = irq_cfg + irq; + struct IO_APIC_route_entry entry; + cpumask_t mask; + + if (!IO_APIC_IRQ(irq)) + return; + + mask = TARGET_CPUS; + if (assign_irq_vector(irq, mask)) + return; + + cpus_and(mask, cfg->domain, mask); + + apic_printk(APIC_VERBOSE,KERN_DEBUG + "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " + "IRQ %d Mode:%i Active:%i)\n", + apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector, + irq, trigger, polarity); + + /* + * add it to the IO-APIC irq-routing table: + */ + memset(&entry,0,sizeof(entry)); + + entry.delivery_mode = INT_DELIVERY_MODE; + entry.dest_mode = INT_DEST_MODE; + entry.dest = cpu_mask_to_apicid(mask); + entry.mask = 0; /* enable IRQ */ + entry.trigger = trigger; + entry.polarity = polarity; + entry.vector = cfg->vector; + + /* Mask level triggered irqs. + * Use IRQ_DELAYED_DISABLE for edge triggered irqs. + */ + if (trigger) + entry.mask = 1; + + ioapic_register_intr(irq, trigger); + if (irq < 16) + disable_8259A_irq(irq); + + ioapic_write_entry(apic, pin, entry); +} + +static void __init setup_IO_APIC_irqs(void) +{ + int apic, pin, idx, irq, first_notcon = 1; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + + idx = find_irq_entry(apic,pin,mp_INT); + if (idx == -1) { + if (first_notcon) { + apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); + first_notcon = 0; + } else + apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); + continue; + } + + irq = pin_2_irq(idx, apic, pin); + add_pin_to_irq(irq, apic, pin); + + setup_IO_APIC_irq(apic, pin, irq, + irq_trigger(idx), irq_polarity(idx)); + } + } + + if (!first_notcon) + apic_printk(APIC_VERBOSE," not connected.\n"); +} + +/* + * Set up the 8259A-master output pin as broadcast to all + * CPUs. + */ +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry,0,sizeof(entry)); + + disable_8259A_irq(0); + + /* mask LVT0 */ + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + + /* + * We use logical delivery to get the timer IRQ + * to the first CPU. + */ + entry.dest_mode = INT_DEST_MODE; + entry.mask = 0; /* unmask IRQ now */ + entry.dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.delivery_mode = INT_DELIVERY_MODE; + entry.polarity = 0; + entry.trigger = 0; + entry.vector = vector; + + /* + * The timer IRQ doesn't have to know that behind the + * scene we have a 8259A-master in AEOI mode ... + */ + set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + + enable_8259A_irq(0); +} + +void __apicdebuginit print_IO_APIC(void) +{ + int apic, i; + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + unsigned long flags; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); + for (i = 0; i < nr_ioapics; i++) + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", + mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); + + /* + * We are a bit conservative about what we expect. We have to + * know about every hardware change ASAP. + */ + printk(KERN_INFO "testing the IO APIC.......................\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + reg_01.raw = io_apic_read(apic, 1); + if (reg_01.bits.version >= 0x10) + reg_02.raw = io_apic_read(apic, 2); + spin_unlock_irqrestore(&ioapic_lock, flags); + + printk("\n"); + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); + + printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); + + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); + + if (reg_01.bits.version >= 0x10) { + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); + } + + printk(KERN_DEBUG ".... IRQ redirection table:\n"); + + printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" + " Stat Dmod Deli Vect: \n"); + + for (i = 0; i <= reg_01.bits.entries; i++) { + struct IO_APIC_route_entry entry; + + entry = ioapic_read_entry(apic, i); + + printk(KERN_DEBUG " %02x %03X ", + i, + entry.dest + ); + + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", + entry.mask, + entry.trigger, + entry.irr, + entry.polarity, + entry.delivery_status, + entry.dest_mode, + entry.delivery_mode, + entry.vector + ); + } + } + printk(KERN_DEBUG "IRQ to pin mappings:\n"); + for (i = 0; i < NR_IRQS; i++) { + struct irq_pin_list *entry = irq_2_pin + i; + if (entry->pin < 0) + continue; + printk(KERN_DEBUG "IRQ%d ", i); + for (;;) { + printk("-> %d:%d", entry->apic, entry->pin); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + printk("\n"); + } + + printk(KERN_INFO ".................................... done.\n"); + + return; +} + +#if 0 + +static __apicdebuginit void print_APIC_bitfield (int base) +{ + unsigned int v; + int i, j; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); + for (i = 0; i < 8; i++) { + v = apic_read(base + i*0x10); + for (j = 0; j < 32; j++) { + if (v & (1< 3) { /* PC is LVT#4. */ + v = apic_read(APIC_LVTPC); + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); + } + v = apic_read(APIC_LVT0); + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); + v = apic_read(APIC_LVT1); + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); + + if (maxlvt > 2) { /* ERR is LVT#3. */ + v = apic_read(APIC_LVTERR); + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); + } + + v = apic_read(APIC_TMICT); + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); + v = apic_read(APIC_TMCCT); + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); + v = apic_read(APIC_TDCR); + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + printk("\n"); +} + +void print_all_local_APICs (void) +{ + on_each_cpu(print_local_APIC, NULL, 1, 1); +} + +void __apicdebuginit print_PIC(void) +{ + unsigned int v; + unsigned long flags; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "\nprinting PIC contents\n"); + + spin_lock_irqsave(&i8259A_lock, flags); + + v = inb(0xa1) << 8 | inb(0x21); + printk(KERN_DEBUG "... PIC IMR: %04x\n", v); + + v = inb(0xa0) << 8 | inb(0x20); + printk(KERN_DEBUG "... PIC IRR: %04x\n", v); + + outb(0x0b,0xa0); + outb(0x0b,0x20); + v = inb(0xa0) << 8 | inb(0x20); + outb(0x0a,0xa0); + outb(0x0a,0x20); + + spin_unlock_irqrestore(&i8259A_lock, flags); + + printk(KERN_DEBUG "... PIC ISR: %04x\n", v); + + v = inb(0x4d1) << 8 | inb(0x4d0); + printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); +} + +#endif /* 0 */ + +static void __init enable_IO_APIC(void) +{ + union IO_APIC_reg_01 reg_01; + int i8259_apic, i8259_pin; + int i, apic; + unsigned long flags; + + for (i = 0; i < PIN_MAP_SIZE; i++) { + irq_2_pin[i].pin = -1; + irq_2_pin[i].next = 0; + } + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + for (apic = 0; apic < nr_ioapics; apic++) { + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(apic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + nr_ioapic_registers[apic] = reg_01.bits.entries+1; + } + for(apic = 0; apic < nr_ioapics; apic++) { + int pin; + /* See if any of the pins is in ExtINT mode */ + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + struct IO_APIC_route_entry entry; + entry = ioapic_read_entry(apic, pin); + + /* If the interrupt line is enabled and in ExtInt mode + * I have found the pin where the i8259 is connected. + */ + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { + ioapic_i8259.apic = apic; + ioapic_i8259.pin = pin; + goto found_i8259; + } + } + } + found_i8259: + /* Look to see what if the MP table has reported the ExtINT */ + i8259_pin = find_isa_irq_pin(0, mp_ExtINT); + i8259_apic = find_isa_irq_apic(0, mp_ExtINT); + /* Trust the MP table if nothing is setup in the hardware */ + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); + ioapic_i8259.pin = i8259_pin; + ioapic_i8259.apic = i8259_apic; + } + /* Complain if the MP table and the hardware disagree */ + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) + { + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); + } + + /* + * Do not trust the IO-APIC being empty at bootup + */ + clear_IO_APIC(); +} + +/* + * Not an __init, needed by the reboot code + */ +void disable_IO_APIC(void) +{ + /* + * Clear the IO-APIC before rebooting: + */ + clear_IO_APIC(); + + /* + * If the i8259 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy interrupts can be delivered. + */ + if (ioapic_i8259.pin != -1) { + struct IO_APIC_route_entry entry; + + memset(&entry, 0, sizeof(entry)); + entry.mask = 0; /* Enabled */ + entry.trigger = 0; /* Edge */ + entry.irr = 0; + entry.polarity = 0; /* High */ + entry.delivery_status = 0; + entry.dest_mode = 0; /* Physical */ + entry.delivery_mode = dest_ExtINT; /* ExtInt */ + entry.vector = 0; + entry.dest = GET_APIC_ID(apic_read(APIC_ID)); + + /* + * Add it to the IO-APIC irq-routing table: + */ + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); + } + + disconnect_bsp_APIC(ioapic_i8259.pin != -1); +} + +/* + * There is a nasty bug in some older SMP boards, their mptable lies + * about the timer IRQ. We do the following to work around the situation: + * + * - timer IRQ defaults to IO-APIC IRQ + * - if this function detects that timer IRQs are defunct, then we fall + * back to ISA timer IRQs + */ +static int __init timer_irq_works(void) +{ + unsigned long t1 = jiffies; + + local_irq_enable(); + /* Let ten ticks pass... */ + mdelay((10 * 1000) / HZ); + + /* + * Expect a few ticks at least, to be sure some possible + * glue logic does not lock up after one or two first + * ticks in a non-ExtINT mode. Also the local APIC + * might have cached one ExtINT interrupt. Finally, at + * least one tick may be lost due to delays. + */ + + /* jiffies wrap? */ + if (jiffies - t1 > 4) + return 1; + return 0; +} + +/* + * In the SMP+IOAPIC case it might happen that there are an unspecified + * number of pending IRQ events unhandled. These cases are very rare, + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much + * better to do it this way as thus we do not have to be aware of + * 'pending' interrupts in the IRQ path, except at this point. + */ +/* + * Edge triggered needs to resend any interrupt + * that was delayed but this is now handled in the device + * independent code. + */ + +/* + * Starting up a edge-triggered IO-APIC interrupt is + * nasty - we need to make sure that we get the edge. + * If it is already asserted for some reason, we need + * return 1 to indicate that is was pending. + * + * This is not complete - we should be able to fake + * an edge even if it isn't on the 8259A... + */ + +static unsigned int startup_ioapic_irq(unsigned int irq) +{ + int was_pending = 0; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + if (irq < 16) { + disable_8259A_irq(irq); + if (i8259A_irq_pending(irq)) + was_pending = 1; + } + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return was_pending; +} + +static int ioapic_retrigger_irq(unsigned int irq) +{ + struct irq_cfg *cfg = &irq_cfg[irq]; + cpumask_t mask; + unsigned long flags; + + spin_lock_irqsave(&vector_lock, flags); + cpus_clear(mask); + cpu_set(first_cpu(cfg->domain), mask); + + send_IPI_mask(mask, cfg->vector); + spin_unlock_irqrestore(&vector_lock, flags); + + return 1; +} + +/* + * Level and edge triggered IO-APIC interrupts need different handling, + * so we use two separate IRQ descriptors. Edge triggered IRQs can be + * handled with the level-triggered descriptor, but that one has slightly + * more overhead. Level-triggered interrupts cannot be handled with the + * edge-triggered handler, without risking IRQ storms and other ugly + * races. + */ + +#ifdef CONFIG_SMP +asmlinkage void smp_irq_move_cleanup_interrupt(void) +{ + unsigned vector, me; + ack_APIC_irq(); + exit_idle(); + irq_enter(); + + me = smp_processor_id(); + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + unsigned int irq; + struct irq_desc *desc; + struct irq_cfg *cfg; + irq = __get_cpu_var(vector_irq)[vector]; + if (irq >= NR_IRQS) + continue; + + desc = irq_desc + irq; + cfg = irq_cfg + irq; + spin_lock(&desc->lock); + if (!cfg->move_cleanup_count) + goto unlock; + + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) + goto unlock; + + __get_cpu_var(vector_irq)[vector] = -1; + cfg->move_cleanup_count--; +unlock: + spin_unlock(&desc->lock); + } + + irq_exit(); +} + +static void irq_complete_move(unsigned int irq) +{ + struct irq_cfg *cfg = irq_cfg + irq; + unsigned vector, me; + + if (likely(!cfg->move_in_progress)) + return; + + vector = ~get_irq_regs()->orig_rax; + me = smp_processor_id(); + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { + cpumask_t cleanup_mask; + + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + } +} +#else +static inline void irq_complete_move(unsigned int irq) {} +#endif + +static void ack_apic_edge(unsigned int irq) +{ + irq_complete_move(irq); + move_native_irq(irq); + ack_APIC_irq(); +} + +static void ack_apic_level(unsigned int irq) +{ + int do_unmask_irq = 0; + + irq_complete_move(irq); +#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) + /* If we are moving the irq we need to mask it */ + if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { + do_unmask_irq = 1; + mask_IO_APIC_irq(irq); + } +#endif + + /* + * We must acknowledge the irq before we move it or the acknowledge will + * not propagate properly. + */ + ack_APIC_irq(); + + /* Now we can move and renable the irq */ + if (unlikely(do_unmask_irq)) { + /* Only migrate the irq if the ack has been received. + * + * On rare occasions the broadcast level triggered ack gets + * delayed going to ioapics, and if we reprogram the + * vector while Remote IRR is still set the irq will never + * fire again. + * + * To prevent this scenario we read the Remote IRR bit + * of the ioapic. This has two effects. + * - On any sane system the read of the ioapic will + * flush writes (and acks) going to the ioapic from + * this cpu. + * - We get to see if the ACK has actually been delivered. + * + * Based on failed experiments of reprogramming the + * ioapic entry from outside of irq context starting + * with masking the ioapic entry and then polling until + * Remote IRR was clear before reprogramming the + * ioapic I don't trust the Remote IRR bit to be + * completey accurate. + * + * However there appears to be no other way to plug + * this race, so if the Remote IRR bit is not + * accurate and is causing problems then it is a hardware bug + * and you can go talk to the chipset vendor about it. + */ + if (!io_apic_level_ack_pending(irq)) + move_masked_irq(irq); + unmask_IO_APIC_irq(irq); + } +} + +static struct irq_chip ioapic_chip __read_mostly = { + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, +#ifdef CONFIG_SMP + .set_affinity = set_ioapic_affinity_irq, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +static inline void init_IO_APIC_traps(void) +{ + int irq; + + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + for (irq = 0; irq < NR_IRQS ; irq++) { + int tmp = irq; + if (IO_APIC_IRQ(tmp) && !irq_cfg[tmp].vector) { + /* + * Hmm.. We don't have an entry for this, + * so default to an old-fashioned 8259 + * interrupt if we can.. + */ + if (irq < 16) + make_8259A_irq(irq); + else + /* Strange. Oh, well.. */ + irq_desc[irq].chip = &no_irq_chip; + } + } +} + +static void enable_lapic_irq (unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); +} + +static void disable_lapic_irq (unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); +} + +static void ack_lapic_irq (unsigned int irq) +{ + ack_APIC_irq(); +} + +static void end_lapic_irq (unsigned int i) { /* nothing */ } + +static struct hw_interrupt_type lapic_irq_type __read_mostly = { + .name = "local-APIC", + .typename = "local-APIC-edge", + .startup = NULL, /* startup_irq() not used for IRQ0 */ + .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ + .enable = enable_lapic_irq, + .disable = disable_lapic_irq, + .ack = ack_lapic_irq, + .end = end_lapic_irq, +}; + +static void setup_nmi (void) +{ + /* + * Dirty trick to enable the NMI watchdog ... + * We put the 8259A master into AEOI mode and + * unmask on all local APICs LVT0 as NMI. + * + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') + * is from Maciej W. Rozycki - so we do not have to EOI from + * the NMI handler or the timer interrupt. + */ + printk(KERN_INFO "activating NMI Watchdog ..."); + + enable_NMI_through_LVT0(NULL); + + printk(" done.\n"); +} + +/* + * This looks a bit hackish but it's about the only one way of sending + * a few INTA cycles to 8259As and any associated glue logic. ICR does + * not support the ExtINT mode, unfortunately. We need to send these + * cycles as some i82489DX-based boards have glue logic that keeps the + * 8259A interrupt line asserted until INTA. --macro + */ +static inline void unlock_ExtINT_logic(void) +{ + int apic, pin, i; + struct IO_APIC_route_entry entry0, entry1; + unsigned char save_control, save_freq_select; + unsigned long flags; + + pin = find_isa_irq_pin(8, mp_INT); + apic = find_isa_irq_apic(8, mp_INT); + if (pin == -1) + return; + + spin_lock_irqsave(&ioapic_lock, flags); + *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin); + *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + clear_IO_APIC_pin(apic, pin); + + memset(&entry1, 0, sizeof(entry1)); + + entry1.dest_mode = 0; /* physical delivery */ + entry1.mask = 0; /* unmask IRQ now */ + entry1.dest = hard_smp_processor_id(); + entry1.delivery_mode = dest_ExtINT; + entry1.polarity = entry0.polarity; + entry1.trigger = 0; + entry1.vector = 0; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + + save_control = CMOS_READ(RTC_CONTROL); + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, + RTC_FREQ_SELECT); + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); + + i = 100; + while (i-- > 0) { + mdelay(10); + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) + i -= 10; + } + + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + clear_IO_APIC_pin(apic, pin); + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +/* + * This code may look a bit paranoid, but it's supposed to cooperate with + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast + * fanatically on his truly buggy board. + * + * FIXME: really need to revamp this for modern platforms only. + */ +static inline void check_timer(void) +{ + struct irq_cfg *cfg = irq_cfg + 0; + int apic1, pin1, apic2, pin2; + + /* + * get/set the timer IRQ vector: + */ + disable_8259A_irq(0); + assign_irq_vector(0, TARGET_CPUS); + + /* + * Subtle, code in do_timer_interrupt() expects an AEOI + * mode for the 8259A whenever interrupts are routed + * through I/O APICs. Also IRQ0 has to be enabled in + * the 8259A which implies the virtual wire has to be + * disabled in the local APIC. + */ + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + init_8259A(1); + if (timer_over_8254 > 0) + enable_8259A_irq(0); + + pin1 = find_isa_irq_pin(0, mp_INT); + apic1 = find_isa_irq_apic(0, mp_INT); + pin2 = ioapic_i8259.pin; + apic2 = ioapic_i8259.apic; + + apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", + cfg->vector, apic1, pin1, apic2, pin2); + + if (pin1 != -1) { + /* + * Ok, does IRQ0 through the IOAPIC work? + */ + unmask_IO_APIC_irq(0); + if (!no_timer_check && timer_irq_works()) { + nmi_watchdog_default(); + if (nmi_watchdog == NMI_IO_APIC) { + disable_8259A_irq(0); + setup_nmi(); + enable_8259A_irq(0); + } + if (disable_timer_pin_1 > 0) + clear_IO_APIC_pin(0, pin1); + return; + } + clear_IO_APIC_pin(apic1, pin1); + apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not " + "connected to IO-APIC\n"); + } + + apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) " + "through the 8259A ... "); + if (pin2 != -1) { + apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...", + apic2, pin2); + /* + * legacy devices should be connected to IO APIC #0 + */ + setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector); + if (timer_irq_works()) { + apic_printk(APIC_VERBOSE," works.\n"); + nmi_watchdog_default(); + if (nmi_watchdog == NMI_IO_APIC) { + setup_nmi(); + } + return; + } + /* + * Cleanup, just in case ... + */ + clear_IO_APIC_pin(apic2, pin2); + } + apic_printk(APIC_VERBOSE," failed.\n"); + + if (nmi_watchdog == NMI_IO_APIC) { + printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); + nmi_watchdog = 0; + } + + apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); + + disable_8259A_irq(0); + irq_desc[0].chip = &lapic_irq_type; + apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ + enable_8259A_irq(0); + + if (timer_irq_works()) { + apic_printk(APIC_VERBOSE," works.\n"); + return; + } + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); + apic_printk(APIC_VERBOSE," failed.\n"); + + apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); + + init_8259A(0); + make_8259A_irq(0); + apic_write(APIC_LVT0, APIC_DM_EXTINT); + + unlock_ExtINT_logic(); + + if (timer_irq_works()) { + apic_printk(APIC_VERBOSE," works.\n"); + return; + } + apic_printk(APIC_VERBOSE," failed :(.\n"); + panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); +} + +static int __init notimercheck(char *s) +{ + no_timer_check = 1; + return 1; +} +__setup("no_timer_check", notimercheck); + +/* + * + * IRQ's that are handled by the PIC in the MPS IOAPIC case. + * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. + * Linux doesn't really care, as it's not actually used + * for any interrupt handling anyway. + */ +#define PIC_IRQS (1<<2) + +void __init setup_IO_APIC(void) +{ + enable_IO_APIC(); + + if (acpi_ioapic) + io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ + else + io_apic_irqs = ~PIC_IRQS; + + apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); + + sync_Arb_IDs(); + setup_IO_APIC_irqs(); + init_IO_APIC_traps(); + check_timer(); + if (!acpi_ioapic) + print_IO_APIC(); +} + +struct sysfs_ioapic_data { + struct sys_device dev; + struct IO_APIC_route_entry entry[0]; +}; +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; + +static int ioapic_suspend(struct sys_device *dev, pm_message_t state) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) + *entry = ioapic_read_entry(dev->id, i); + + return 0; +} + +static int ioapic_resume(struct sys_device *dev) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + unsigned long flags; + union IO_APIC_reg_00 reg_00; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(dev->id, 0); + if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { + reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; + io_apic_write(dev->id, 0, reg_00.raw); + } + spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i++) + ioapic_write_entry(dev->id, i, entry[i]); + + return 0; +} + +static struct sysdev_class ioapic_sysdev_class = { + set_kset_name("ioapic"), + .suspend = ioapic_suspend, + .resume = ioapic_resume, +}; + +static int __init ioapic_init_sysfs(void) +{ + struct sys_device * dev; + int i, size, error = 0; + + error = sysdev_class_register(&ioapic_sysdev_class); + if (error) + return error; + + for (i = 0; i < nr_ioapics; i++ ) { + size = sizeof(struct sys_device) + nr_ioapic_registers[i] + * sizeof(struct IO_APIC_route_entry); + mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); + if (!mp_ioapic_data[i]) { + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + memset(mp_ioapic_data[i], 0, size); + dev = &mp_ioapic_data[i]->dev; + dev->id = i; + dev->cls = &ioapic_sysdev_class; + error = sysdev_register(dev); + if (error) { + kfree(mp_ioapic_data[i]); + mp_ioapic_data[i] = NULL; + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + } + + return 0; +} + +device_initcall(ioapic_init_sysfs); + +/* + * Dynamic irq allocate and deallocation + */ +int create_irq(void) +{ + /* Allocate an unused irq */ + int irq; + int new; + unsigned long flags; + + irq = -ENOSPC; + spin_lock_irqsave(&vector_lock, flags); + for (new = (NR_IRQS - 1); new >= 0; new--) { + if (platform_legacy_irq(new)) + continue; + if (irq_cfg[new].vector != 0) + continue; + if (__assign_irq_vector(new, TARGET_CPUS) == 0) + irq = new; + break; + } + spin_unlock_irqrestore(&vector_lock, flags); + + if (irq >= 0) { + dynamic_irq_init(irq); + } + return irq; +} + +void destroy_irq(unsigned int irq) +{ + unsigned long flags; + + dynamic_irq_cleanup(irq); + + spin_lock_irqsave(&vector_lock, flags); + __clear_irq_vector(irq); + spin_unlock_irqrestore(&vector_lock, flags); +} + +/* + * MSI mesage composition + */ +#ifdef CONFIG_PCI_MSI +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) +{ + struct irq_cfg *cfg = irq_cfg + irq; + int err; + unsigned dest; + cpumask_t tmp; + + tmp = TARGET_CPUS; + err = assign_irq_vector(irq, tmp); + if (!err) { + cpus_and(tmp, cfg->domain, tmp); + dest = cpu_mask_to_apicid(tmp); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = + MSI_ADDR_BASE_LO | + ((INT_DEST_MODE == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(cfg->vector); + } + return err; +} + +#ifdef CONFIG_SMP +static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg = irq_cfg + irq; + struct msi_msg msg; + unsigned int dest; + cpumask_t tmp; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + read_msi_msg(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + write_msi_msg(irq, &msg); + irq_desc[irq].affinity = mask; +} +#endif /* CONFIG_SMP */ + +/* + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, + * which implement the MSI or MSI-X Capability Structure. + */ +static struct irq_chip msi_chip = { + .name = "PCI-MSI", + .unmask = unmask_msi_irq, + .mask = mask_msi_irq, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = set_msi_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) +{ + struct msi_msg msg; + int irq, ret; + irq = create_irq(); + if (irq < 0) + return irq; + + ret = msi_compose_msg(dev, irq, &msg); + if (ret < 0) { + destroy_irq(irq); + return ret; + } + + set_irq_msi(irq, desc); + write_msi_msg(irq, &msg); + + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); + + return 0; +} + +void arch_teardown_msi_irq(unsigned int irq) +{ + destroy_irq(irq); +} + +#endif /* CONFIG_PCI_MSI */ + +/* + * Hypertransport interrupt support + */ +#ifdef CONFIG_HT_IRQ + +#ifdef CONFIG_SMP + +static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) +{ + struct ht_irq_msg msg; + fetch_ht_irq_msg(irq, &msg); + + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); + + msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); + + write_ht_irq_msg(irq, &msg); +} + +static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg = irq_cfg + irq; + unsigned int dest; + cpumask_t tmp; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + target_ht_irq(irq, dest, cfg->vector); + irq_desc[irq].affinity = mask; +} +#endif + +static struct irq_chip ht_irq_chip = { + .name = "PCI-HT", + .mask = mask_ht_irq, + .unmask = unmask_ht_irq, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = set_ht_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) +{ + struct irq_cfg *cfg = irq_cfg + irq; + int err; + cpumask_t tmp; + + tmp = TARGET_CPUS; + err = assign_irq_vector(irq, tmp); + if (!err) { + struct ht_irq_msg msg; + unsigned dest; + + cpus_and(tmp, cfg->domain, tmp); + dest = cpu_mask_to_apicid(tmp); + + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + + msg.address_lo = + HT_IRQ_LOW_BASE | + HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_VECTOR(cfg->vector) | + ((INT_DEST_MODE == 0) ? + HT_IRQ_LOW_DM_PHYSICAL : + HT_IRQ_LOW_DM_LOGICAL) | + HT_IRQ_LOW_RQEOI_EDGE | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + HT_IRQ_LOW_MT_FIXED : + HT_IRQ_LOW_MT_ARBITRATED) | + HT_IRQ_LOW_IRQ_MASKED; + + write_ht_irq_msg(irq, &msg); + + set_irq_chip_and_handler_name(irq, &ht_irq_chip, + handle_edge_irq, "edge"); + } + return err; +} +#endif /* CONFIG_HT_IRQ */ + +/* -------------------------------------------------------------------------- + ACPI-based IOAPIC Configuration + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI + +#define IO_APIC_MAX_ID 0xFE + +int __init io_apic_get_redir_entries (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.entries; +} + + +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) +{ + if (!IO_APIC_IRQ(irq)) { + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + ioapic); + return -EINVAL; + } + + /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= 16) + add_pin_to_irq(irq, ioapic, pin); + + setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); + + return 0; +} + +#endif /* CONFIG_ACPI */ + + +/* + * This function currently is only a helper for the i386 smp boot process where + * we need to reprogram the ioredtbls to cater for the cpus which have come online + * so mask in all cases should simply be TARGET_CPUS + */ +#ifdef CONFIG_SMP +void __init setup_ioapic_dest(void) +{ + int pin, ioapic, irq, irq_entry; + + if (skip_ioapic_setup == 1) + return; + + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); + + /* setup_IO_APIC_irqs could fail to get vector for some device + * when you have too many devices, because at that time only boot + * cpu is online. + */ + if (!irq_cfg[irq].vector) + setup_IO_APIC_irq(ioapic, pin, irq, + irq_trigger(irq_entry), + irq_polarity(irq_entry)); + else + set_ioapic_affinity_irq(irq, TARGET_CPUS); + } + + } +} +#endif diff --git a/arch/x86/kernel/ioport_64.c b/arch/x86/kernel/ioport_64.c new file mode 100644 index 0000000..653efa3 --- /dev/null +++ b/arch/x86/kernel/ioport_64.c @@ -0,0 +1,119 @@ +/* + * linux/arch/x86_64/kernel/ioport.c + * + * This contains the io-permission bitmap code - written by obz, with changes + * by Linus. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) +{ + int i; + if (new_value) + for (i = base; i < base + extent; i++) + __set_bit(i, bitmap); + else + for (i = base; i < base + extent; i++) + clear_bit(i, bitmap); +} + +/* + * this changes the io permissions bitmap in the current task. + */ +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + unsigned int i, max_long, bytes, bytes_updated; + struct thread_struct * t = ¤t->thread; + struct tss_struct * tss; + unsigned long *bitmap; + + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) + return -EINVAL; + if (turn_on && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + /* + * If it's the first ioperm() call in this thread's lifetime, set the + * IO bitmap up. ioperm() is much less timing critical than clone(), + * this is why we delay this operation until now: + */ + if (!t->io_bitmap_ptr) { + bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!bitmap) + return -ENOMEM; + + memset(bitmap, 0xff, IO_BITMAP_BYTES); + t->io_bitmap_ptr = bitmap; + set_thread_flag(TIF_IO_BITMAP); + } + + /* + * do it in the per-thread copy and in the TSS ... + * + * Disable preemption via get_cpu() - we must not switch away + * because the ->io_bitmap_max value must match the bitmap + * contents: + */ + tss = &per_cpu(init_tss, get_cpu()); + + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); + + /* + * Search for a (possibly new) maximum. This is simple and stupid, + * to keep it obviously correct: + */ + max_long = 0; + for (i = 0; i < IO_BITMAP_LONGS; i++) + if (t->io_bitmap_ptr[i] != ~0UL) + max_long = i; + + bytes = (max_long + 1) * sizeof(long); + bytes_updated = max(bytes, t->io_bitmap_max); + + t->io_bitmap_max = bytes; + + /* Update the TSS: */ + memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); + + put_cpu(); + + return 0; +} + +/* + * sys_iopl has to be used when you want to access the IO ports + * beyond the 0x3ff range: to get the full 65536 ports bitmapped + * you'd need 8kB of bitmaps/process, which is a bit excessive. + * + * Here we just change the eflags value on the stack: we allow + * only the super-user to do it. This depends on the stack-layout + * on system-call entry - see also fork() and the signal handling + * code. + */ + +asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) +{ + unsigned int old = (regs->eflags >> 12) & 3; + + if (level > 3) + return -EINVAL; + /* Trying to gain more privileges? */ + if (level > old) { + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + } + regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12); + return 0; +} diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c new file mode 100644 index 0000000..39cb3fa --- /dev/null +++ b/arch/x86/kernel/irq_64.c @@ -0,0 +1,213 @@ +/* + * linux/arch/x86_64/kernel/irq.c + * + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar + * + * This file contains the lowest level x86_64-specific interrupt + * entry and irq statistics code. All the remaining irq logic is + * done by the generic kernel/irq/ code and in the + * x86_64-specific irq controller code. (e.g. i8259.c and + * io_apic.c.) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +atomic_t irq_err_count; + +#ifdef CONFIG_DEBUG_STACKOVERFLOW +/* + * Probabilistic stack overflow check: + * + * Only check the stack in process context, because everything else + * runs on the big interrupt stacks. Checking reliably is too expensive, + * so we just check from interrupts. + */ +static inline void stack_overflow_check(struct pt_regs *regs) +{ + u64 curbase = (u64)task_stack_page(current); + static unsigned long warned = -60*HZ; + + if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && + regs->rsp < curbase + sizeof(struct thread_info) + 128 && + time_after(jiffies, warned + 60*HZ)) { + printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n", + current->comm, curbase, regs->rsp); + show_stack(NULL,NULL); + warned = jiffies; + } +} +#endif + +/* + * Generic, controller-independent functions: + */ + +int show_interrupts(struct seq_file *p, void *v) +{ + int i = *(loff_t *) v, j; + struct irqaction * action; + unsigned long flags; + + if (i == 0) { + seq_printf(p, " "); + for_each_online_cpu(j) + seq_printf(p, "CPU%-8d",j); + seq_putc(p, '\n'); + } + + if (i < NR_IRQS) { + spin_lock_irqsave(&irq_desc[i].lock, flags); + action = irq_desc[i].action; + if (!action) + goto skip; + seq_printf(p, "%3d: ",i); +#ifndef CONFIG_SMP + seq_printf(p, "%10u ", kstat_irqs(i)); +#else + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); +#endif + seq_printf(p, " %8s", irq_desc[i].chip->name); + seq_printf(p, "-%-8s", irq_desc[i].name); + + seq_printf(p, " %s", action->name); + for (action=action->next; action; action = action->next) + seq_printf(p, ", %s", action->name); + seq_putc(p, '\n'); +skip: + spin_unlock_irqrestore(&irq_desc[i].lock, flags); + } else if (i == NR_IRQS) { + seq_printf(p, "NMI: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); + seq_putc(p, '\n'); + seq_printf(p, "LOC: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); + seq_putc(p, '\n'); + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); + } + return 0; +} + +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). + */ +asmlinkage unsigned int do_IRQ(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* high bit used in ret_from_ code */ + unsigned vector = ~regs->orig_rax; + unsigned irq; + + exit_idle(); + irq_enter(); + irq = __get_cpu_var(vector_irq)[vector]; + +#ifdef CONFIG_DEBUG_STACKOVERFLOW + stack_overflow_check(regs); +#endif + + if (likely(irq < NR_IRQS)) + generic_handle_irq(irq); + else { + if (!disable_apic) + ack_APIC_irq(); + + if (printk_ratelimit()) + printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n", + __func__, smp_processor_id(), vector); + } + + irq_exit(); + + set_irq_regs(old_regs); + return 1; +} + +#ifdef CONFIG_HOTPLUG_CPU +void fixup_irqs(cpumask_t map) +{ + unsigned int irq; + static int warned; + + for (irq = 0; irq < NR_IRQS; irq++) { + cpumask_t mask; + int break_affinity = 0; + int set_affinity = 1; + + if (irq == 2) + continue; + + /* interrupt's are disabled at this point */ + spin_lock(&irq_desc[irq].lock); + + if (!irq_has_action(irq) || + cpus_equal(irq_desc[irq].affinity, map)) { + spin_unlock(&irq_desc[irq].lock); + continue; + } + + cpus_and(mask, irq_desc[irq].affinity, map); + if (cpus_empty(mask)) { + break_affinity = 1; + mask = map; + } + + if (irq_desc[irq].chip->mask) + irq_desc[irq].chip->mask(irq); + + if (irq_desc[irq].chip->set_affinity) + irq_desc[irq].chip->set_affinity(irq, mask); + else if (!(warned++)) + set_affinity = 0; + + if (irq_desc[irq].chip->unmask) + irq_desc[irq].chip->unmask(irq); + + spin_unlock(&irq_desc[irq].lock); + + if (break_affinity && set_affinity) + printk("Broke affinity for irq %i\n", irq); + else if (!set_affinity) + printk("Cannot set affinity for irq %i\n", irq); + } + + /* That doesn't seem sufficient. Give it 1ms. */ + local_irq_enable(); + mdelay(1); + local_irq_disable(); +} +#endif + +extern void call_softirq(void); + +asmlinkage void do_softirq(void) +{ + __u32 pending; + unsigned long flags; + + if (in_interrupt()) + return; + + local_irq_save(flags); + pending = local_softirq_pending(); + /* Switch to interrupt stack */ + if (pending) { + call_softirq(); + WARN_ON_ONCE(softirq_count()); + } + local_irq_restore(flags); +} +EXPORT_SYMBOL(do_softirq); diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c new file mode 100644 index 0000000..7377ccb --- /dev/null +++ b/arch/x86/kernel/k8.c @@ -0,0 +1,123 @@ +/* + * Shared support code for AMD K8 northbridges and derivates. + * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. + */ +#include +#include +#include +#include +#include +#include +#include + +int num_k8_northbridges; +EXPORT_SYMBOL(num_k8_northbridges); + +static u32 *flush_words; + +struct pci_device_id k8_nb_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, + {} +}; +EXPORT_SYMBOL(k8_nb_ids); + +struct pci_dev **k8_northbridges; +EXPORT_SYMBOL(k8_northbridges); + +static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) +{ + do { + dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); + if (!dev) + break; + } while (!pci_match_id(&k8_nb_ids[0], dev)); + return dev; +} + +int cache_k8_northbridges(void) +{ + int i; + struct pci_dev *dev; + + if (num_k8_northbridges) + return 0; + + dev = NULL; + while ((dev = next_k8_northbridge(dev)) != NULL) + num_k8_northbridges++; + + k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *), + GFP_KERNEL); + if (!k8_northbridges) + return -ENOMEM; + + if (!num_k8_northbridges) { + k8_northbridges[0] = NULL; + return 0; + } + + flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL); + if (!flush_words) { + kfree(k8_northbridges); + return -ENOMEM; + } + + dev = NULL; + i = 0; + while ((dev = next_k8_northbridge(dev)) != NULL) { + k8_northbridges[i] = dev; + pci_read_config_dword(dev, 0x9c, &flush_words[i++]); + } + k8_northbridges[i] = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(cache_k8_northbridges); + +/* Ignores subdevice/subvendor but as far as I can figure out + they're useless anyways */ +int __init early_is_k8_nb(u32 device) +{ + struct pci_device_id *id; + u32 vendor = device & 0xffff; + device >>= 16; + for (id = k8_nb_ids; id->vendor; id++) + if (vendor == id->vendor && device == id->device) + return 1; + return 0; +} + +void k8_flush_garts(void) +{ + int flushed, i; + unsigned long flags; + static DEFINE_SPINLOCK(gart_lock); + + /* Avoid races between AGP and IOMMU. In theory it's not needed + but I'm not sure if the hardware won't lose flush requests + when another is pending. This whole thing is so expensive anyways + that it doesn't matter to serialize more. -AK */ + spin_lock_irqsave(&gart_lock, flags); + flushed = 0; + for (i = 0; i < num_k8_northbridges; i++) { + pci_write_config_dword(k8_northbridges[i], 0x9c, + flush_words[i]|1); + flushed++; + } + for (i = 0; i < num_k8_northbridges; i++) { + u32 w; + /* Make sure the hardware actually executed the flush*/ + for (;;) { + pci_read_config_dword(k8_northbridges[i], + 0x9c, &w); + if (!(w & 1)) + break; + cpu_relax(); + } + } + spin_unlock_irqrestore(&gart_lock, flags); + if (!flushed) + printk("nothing to flush?\n"); +} +EXPORT_SYMBOL_GPL(k8_flush_garts); + diff --git a/arch/x86/kernel/kprobes_64.c b/arch/x86/kernel/kprobes_64.c new file mode 100644 index 0000000..a30e004 --- /dev/null +++ b/arch/x86/kernel/kprobes_64.c @@ -0,0 +1,749 @@ +/* + * Kernel Probes (KProbes) + * arch/x86_64/kernel/kprobes.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * + * 2002-Oct Created by Vamsi Krishna S Kernel + * Probes initial implementation ( includes contributions from + * Rusty Russell). + * 2004-July Suparna Bhattacharya added jumper probes + * interface to access function arguments. + * 2004-Oct Jim Keniston and Prasanna S Panchamukhi + * adapted for x86_64 + * 2005-Mar Roland McGrath + * Fixed to handle %rip-relative addressing mode correctly. + * 2005-May Rusty Lynch + * Added function return probes functionality + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +void jprobe_return_end(void); +static void __kprobes arch_copy_kprobe(struct kprobe *p); + +DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + +/* + * returns non-zero if opcode modifies the interrupt flag. + */ +static __always_inline int is_IF_modifier(kprobe_opcode_t *insn) +{ + switch (*insn) { + case 0xfa: /* cli */ + case 0xfb: /* sti */ + case 0xcf: /* iret/iretd */ + case 0x9d: /* popf/popfd */ + return 1; + } + + if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf) + return 1; + return 0; +} + +int __kprobes arch_prepare_kprobe(struct kprobe *p) +{ + /* insn: must be on special executable page on x86_64. */ + p->ainsn.insn = get_insn_slot(); + if (!p->ainsn.insn) { + return -ENOMEM; + } + arch_copy_kprobe(p); + return 0; +} + +/* + * Determine if the instruction uses the %rip-relative addressing mode. + * If it does, return the address of the 32-bit displacement word. + * If not, return null. + */ +static s32 __kprobes *is_riprel(u8 *insn) +{ +#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ + (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ + (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ + (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ + (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ + << (row % 64)) + static const u64 onebyte_has_modrm[256 / 64] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */ + W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */ + W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */ + W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */ + W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */ + W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */ + W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */ + W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */ + W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */ + W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */ + W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */ + W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */ + W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */ + W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */ + W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */ + W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + }; + static const u64 twobyte_has_modrm[256 / 64] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */ + W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */ + W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */ + W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */ + W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */ + W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */ + W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */ + W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */ + W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */ + W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */ + W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */ + W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */ + W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */ + W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */ + W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */ + W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + }; +#undef W + int need_modrm; + + /* Skip legacy instruction prefixes. */ + while (1) { + switch (*insn) { + case 0x66: + case 0x67: + case 0x2e: + case 0x3e: + case 0x26: + case 0x64: + case 0x65: + case 0x36: + case 0xf0: + case 0xf3: + case 0xf2: + ++insn; + continue; + } + break; + } + + /* Skip REX instruction prefix. */ + if ((*insn & 0xf0) == 0x40) + ++insn; + + if (*insn == 0x0f) { /* Two-byte opcode. */ + ++insn; + need_modrm = test_bit(*insn, twobyte_has_modrm); + } else { /* One-byte opcode. */ + need_modrm = test_bit(*insn, onebyte_has_modrm); + } + + if (need_modrm) { + u8 modrm = *++insn; + if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */ + /* Displacement follows ModRM byte. */ + return (s32 *) ++insn; + } + } + + /* No %rip-relative addressing mode here. */ + return NULL; +} + +static void __kprobes arch_copy_kprobe(struct kprobe *p) +{ + s32 *ripdisp; + memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE); + ripdisp = is_riprel(p->ainsn.insn); + if (ripdisp) { + /* + * The copied instruction uses the %rip-relative + * addressing mode. Adjust the displacement for the + * difference between the original location of this + * instruction and the location of the copy that will + * actually be run. The tricky bit here is making sure + * that the sign extension happens correctly in this + * calculation, since we need a signed 32-bit result to + * be sign-extended to 64 bits when it's added to the + * %rip value and yield the same 64-bit result that the + * sign-extension of the original signed 32-bit + * displacement would have given. + */ + s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn; + BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ + *ripdisp = disp; + } + p->opcode = *p->addr; +} + +void __kprobes arch_arm_kprobe(struct kprobe *p) +{ + text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); +} + +void __kprobes arch_disarm_kprobe(struct kprobe *p) +{ + text_poke(p->addr, &p->opcode, 1); +} + +void __kprobes arch_remove_kprobe(struct kprobe *p) +{ + mutex_lock(&kprobe_mutex); + free_insn_slot(p->ainsn.insn, 0); + mutex_unlock(&kprobe_mutex); +} + +static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + kcb->prev_kprobe.kp = kprobe_running(); + kcb->prev_kprobe.status = kcb->kprobe_status; + kcb->prev_kprobe.old_rflags = kcb->kprobe_old_rflags; + kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags; +} + +static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; + kcb->kprobe_status = kcb->prev_kprobe.status; + kcb->kprobe_old_rflags = kcb->prev_kprobe.old_rflags; + kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags; +} + +static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + __get_cpu_var(current_kprobe) = p; + kcb->kprobe_saved_rflags = kcb->kprobe_old_rflags + = (regs->eflags & (TF_MASK | IF_MASK)); + if (is_IF_modifier(p->ainsn.insn)) + kcb->kprobe_saved_rflags &= ~IF_MASK; +} + +static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) +{ + regs->eflags |= TF_MASK; + regs->eflags &= ~IF_MASK; + /*single step inline if the instruction is an int3*/ + if (p->opcode == BREAKPOINT_INSTRUCTION) + regs->rip = (unsigned long)p->addr; + else + regs->rip = (unsigned long)p->ainsn.insn; +} + +/* Called with kretprobe_lock held */ +void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + unsigned long *sara = (unsigned long *)regs->rsp; + + ri->ret_addr = (kprobe_opcode_t *) *sara; + /* Replace the return addr with trampoline addr */ + *sara = (unsigned long) &kretprobe_trampoline; +} + +int __kprobes kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *p; + int ret = 0; + kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t)); + struct kprobe_ctlblk *kcb; + + /* + * We don't want to be preempted for the entire + * duration of kprobe processing + */ + preempt_disable(); + kcb = get_kprobe_ctlblk(); + + /* Check we're not actually recursing */ + if (kprobe_running()) { + p = get_kprobe(addr); + if (p) { + if (kcb->kprobe_status == KPROBE_HIT_SS && + *p->ainsn.insn == BREAKPOINT_INSTRUCTION) { + regs->eflags &= ~TF_MASK; + regs->eflags |= kcb->kprobe_saved_rflags; + goto no_kprobe; + } else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) { + /* TODO: Provide re-entrancy from + * post_kprobes_handler() and avoid exception + * stack corruption while single-stepping on + * the instruction of the new probe. + */ + arch_disarm_kprobe(p); + regs->rip = (unsigned long)p->addr; + reset_current_kprobe(); + ret = 1; + } else { + /* We have reentered the kprobe_handler(), since + * another probe was hit while within the + * handler. We here save the original kprobe + * variables and just single step on instruction + * of the new probe without calling any user + * handlers. + */ + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kprobes_inc_nmissed_count(p); + prepare_singlestep(p, regs); + kcb->kprobe_status = KPROBE_REENTER; + return 1; + } + } else { + if (*addr != BREAKPOINT_INSTRUCTION) { + /* The breakpoint instruction was removed by + * another cpu right after we hit, no further + * handling of this interrupt is appropriate + */ + regs->rip = (unsigned long)addr; + ret = 1; + goto no_kprobe; + } + p = __get_cpu_var(current_kprobe); + if (p->break_handler && p->break_handler(p, regs)) { + goto ss_probe; + } + } + goto no_kprobe; + } + + p = get_kprobe(addr); + if (!p) { + if (*addr != BREAKPOINT_INSTRUCTION) { + /* + * The breakpoint instruction was removed right + * after we hit it. Another cpu has removed + * either a probepoint or a debugger breakpoint + * at this address. In either case, no further + * handling of this interrupt is appropriate. + * Back up over the (now missing) int3 and run + * the original instruction. + */ + regs->rip = (unsigned long)addr; + ret = 1; + } + /* Not one of ours: let kernel handle it */ + goto no_kprobe; + } + + set_current_kprobe(p, regs, kcb); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + if (p->pre_handler && p->pre_handler(p, regs)) + /* handler has already set things up, so skip ss setup */ + return 1; + +ss_probe: + prepare_singlestep(p, regs); + kcb->kprobe_status = KPROBE_HIT_SS; + return 1; + +no_kprobe: + preempt_enable_no_resched(); + return ret; +} + +/* + * For function-return probes, init_kprobes() establishes a probepoint + * here. When a retprobed function returns, this probe is hit and + * trampoline_probe_handler() runs, calling the kretprobe's handler. + */ + void kretprobe_trampoline_holder(void) + { + asm volatile ( ".global kretprobe_trampoline\n" + "kretprobe_trampoline: \n" + "nop\n"); + } + +/* + * Called when we hit the probe point at kretprobe_trampoline + */ +int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kretprobe_instance *ri = NULL; + struct hlist_head *head, empty_rp; + struct hlist_node *node, *tmp; + unsigned long flags, orig_ret_address = 0; + unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; + + INIT_HLIST_HEAD(&empty_rp); + spin_lock_irqsave(&kretprobe_lock, flags); + head = kretprobe_inst_table_head(current); + + /* + * It is possible to have multiple instances associated with a given + * task either because an multiple functions in the call path + * have a return probe installed on them, and/or more then one return + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) + ri->rp->handler(ri, regs); + + orig_ret_address = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri, &empty_rp); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + kretprobe_assert(ri, orig_ret_address, trampoline_address); + regs->rip = orig_ret_address; + + reset_current_kprobe(); + spin_unlock_irqrestore(&kretprobe_lock, flags); + preempt_enable_no_resched(); + + hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we don't want the post_handler + * to run (and have re-enabled preemption) + */ + return 1; +} + +/* + * Called after single-stepping. p->addr is the address of the + * instruction whose first byte has been replaced by the "int 3" + * instruction. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. The address of this + * copy is p->ainsn.insn. + * + * This function prepares to return from the post-single-step + * interrupt. We have to fix up the stack as follows: + * + * 0) Except in the case of absolute or indirect jump or call instructions, + * the new rip is relative to the copied instruction. We need to make + * it relative to the original instruction. + * + * 1) If the single-stepped instruction was pushfl, then the TF and IF + * flags are set in the just-pushed eflags, and may need to be cleared. + * + * 2) If the single-stepped instruction was a call, the return address + * that is atop the stack is the address following the copied instruction. + * We need to make it the address following the original instruction. + */ +static void __kprobes resume_execution(struct kprobe *p, + struct pt_regs *regs, struct kprobe_ctlblk *kcb) +{ + unsigned long *tos = (unsigned long *)regs->rsp; + unsigned long next_rip = 0; + unsigned long copy_rip = (unsigned long)p->ainsn.insn; + unsigned long orig_rip = (unsigned long)p->addr; + kprobe_opcode_t *insn = p->ainsn.insn; + + /*skip the REX prefix*/ + if (*insn >= 0x40 && *insn <= 0x4f) + insn++; + + switch (*insn) { + case 0x9c: /* pushfl */ + *tos &= ~(TF_MASK | IF_MASK); + *tos |= kcb->kprobe_old_rflags; + break; + case 0xc3: /* ret/lret */ + case 0xcb: + case 0xc2: + case 0xca: + regs->eflags &= ~TF_MASK; + /* rip is already adjusted, no more changes required*/ + return; + case 0xe8: /* call relative - Fix return addr */ + *tos = orig_rip + (*tos - copy_rip); + break; + case 0xff: + if ((insn[1] & 0x30) == 0x10) { + /* call absolute, indirect */ + /* Fix return addr; rip is correct. */ + next_rip = regs->rip; + *tos = orig_rip + (*tos - copy_rip); + } else if (((insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */ + ((insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */ + /* rip is correct. */ + next_rip = regs->rip; + } + break; + case 0xea: /* jmp absolute -- rip is correct */ + next_rip = regs->rip; + break; + default: + break; + } + + regs->eflags &= ~TF_MASK; + if (next_rip) { + regs->rip = next_rip; + } else { + regs->rip = orig_rip + (regs->rip - copy_rip); + } +} + +int __kprobes post_kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (!cur) + return 0; + + if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + cur->post_handler(cur, regs, 0); + } + + resume_execution(cur, regs, kcb); + regs->eflags |= kcb->kprobe_saved_rflags; + + /* Restore the original saved kprobes variables and continue. */ + if (kcb->kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(kcb); + goto out; + } + reset_current_kprobe(); +out: + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, eflags + * will have TF set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (regs->eflags & TF_MASK) + return 0; + + return 1; +} + +int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + const struct exception_table_entry *fixup; + + switch(kcb->kprobe_status) { + case KPROBE_HIT_SS: + case KPROBE_REENTER: + /* + * We are here because the instruction being single + * stepped caused a page fault. We reset the current + * kprobe and the rip points back to the probe address + * and allow the page fault handler to continue as a + * normal page fault. + */ + regs->rip = (unsigned long)cur->addr; + regs->eflags |= kcb->kprobe_old_rflags; + if (kcb->kprobe_status == KPROBE_REENTER) + restore_previous_kprobe(kcb); + else + reset_current_kprobe(); + preempt_enable_no_resched(); + break; + case KPROBE_HIT_ACTIVE: + case KPROBE_HIT_SSDONE: + /* + * We increment the nmissed count for accounting, + * we can also use npre/npostfault count for accouting + * these specific fault cases. + */ + kprobes_inc_nmissed_count(cur); + + /* + * We come here because instructions in the pre/post + * handler caused the page_fault, this could happen + * if handler tries to access user space by + * copy_from_user(), get_user() etc. Let the + * user-specified handler try to fix it first. + */ + if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) + return 1; + + /* + * In case the user-specified fault handler returned + * zero, try to fix up. + */ + fixup = search_exception_tables(regs->rip); + if (fixup) { + regs->rip = fixup->fixup; + return 1; + } + + /* + * fixup() could not handle it, + * Let do_page_fault() fix it. + */ + break; + default: + break; + } + return 0; +} + +/* + * Wrapper routine for handling exceptions. + */ +int __kprobes kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = (struct die_args *)data; + int ret = NOTIFY_DONE; + + if (args->regs && user_mode(args->regs)) + return ret; + + switch (val) { + case DIE_INT3: + if (kprobe_handler(args->regs)) + ret = NOTIFY_STOP; + break; + case DIE_DEBUG: + if (post_kprobe_handler(args->regs)) + ret = NOTIFY_STOP; + break; + case DIE_GPF: + case DIE_PAGE_FAULT: + /* kprobe_running() needs smp_processor_id() */ + preempt_disable(); + if (kprobe_running() && + kprobe_fault_handler(args->regs, args->trapnr)) + ret = NOTIFY_STOP; + preempt_enable(); + break; + default: + break; + } + return ret; +} + +int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct jprobe *jp = container_of(p, struct jprobe, kp); + unsigned long addr; + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + kcb->jprobe_saved_regs = *regs; + kcb->jprobe_saved_rsp = (long *) regs->rsp; + addr = (unsigned long)(kcb->jprobe_saved_rsp); + /* + * As Linus pointed out, gcc assumes that the callee + * owns the argument space and could overwrite it, e.g. + * tailcall optimization. So, to be absolutely safe + * we also save and restore enough stack bytes to cover + * the argument area. + */ + memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, + MIN_STACK_SIZE(addr)); + regs->eflags &= ~IF_MASK; + regs->rip = (unsigned long)(jp->entry); + return 1; +} + +void __kprobes jprobe_return(void) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + asm volatile (" xchg %%rbx,%%rsp \n" + " int3 \n" + " .globl jprobe_return_end \n" + " jprobe_return_end: \n" + " nop \n"::"b" + (kcb->jprobe_saved_rsp):"memory"); +} + +int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + u8 *addr = (u8 *) (regs->rip - 1); + unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp); + struct jprobe *jp = container_of(p, struct jprobe, kp); + + if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { + if ((long *)regs->rsp != kcb->jprobe_saved_rsp) { + struct pt_regs *saved_regs = + container_of(kcb->jprobe_saved_rsp, + struct pt_regs, rsp); + printk("current rsp %p does not match saved rsp %p\n", + (long *)regs->rsp, kcb->jprobe_saved_rsp); + printk("Saved registers for jprobe %p\n", jp); + show_registers(saved_regs); + printk("Current registers\n"); + show_registers(regs); + BUG(); + } + *regs = kcb->jprobe_saved_regs; + memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack, + MIN_STACK_SIZE(stack_addr)); + preempt_enable_no_resched(); + return 1; + } + return 0; +} + +static struct kprobe trampoline_p = { + .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .pre_handler = trampoline_probe_handler +}; + +int __init arch_init_kprobes(void) +{ + return register_kprobe(&trampoline_p); +} + +int __kprobes arch_trampoline_kprobe(struct kprobe *p) +{ + if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline) + return 1; + + return 0; +} diff --git a/arch/x86/kernel/ldt_64.c b/arch/x86/kernel/ldt_64.c new file mode 100644 index 0000000..bc9ffd5 --- /dev/null +++ b/arch/x86/kernel/ldt_64.c @@ -0,0 +1,252 @@ +/* + * linux/arch/x86_64/kernel/ldt.c + * + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + * Copyright (C) 1999 Ingo Molnar + * Copyright (C) 2002 Andi Kleen + * + * This handles calls from both 32bit and 64bit mode. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ +static void flush_ldt(void *null) +{ + if (current->active_mm) + load_LDT(¤t->active_mm->context); +} +#endif + +static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) +{ + void *oldldt; + void *newldt; + unsigned oldsize; + + if (mincount <= (unsigned)pc->size) + return 0; + oldsize = pc->size; + mincount = (mincount+511)&(~511); + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) + newldt = vmalloc(mincount*LDT_ENTRY_SIZE); + else + newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); + + if (!newldt) + return -ENOMEM; + + if (oldsize) + memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); + oldldt = pc->ldt; + memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); + wmb(); + pc->ldt = newldt; + wmb(); + pc->size = mincount; + wmb(); + if (reload) { +#ifdef CONFIG_SMP + cpumask_t mask; + + preempt_disable(); + mask = cpumask_of_cpu(smp_processor_id()); + load_LDT(pc); + if (!cpus_equal(current->mm->cpu_vm_mask, mask)) + smp_call_function(flush_ldt, NULL, 1, 1); + preempt_enable(); +#else + load_LDT(pc); +#endif + } + if (oldsize) { + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(oldldt); + else + kfree(oldldt); + } + return 0; +} + +static inline int copy_ldt(mm_context_t *new, mm_context_t *old) +{ + int err = alloc_ldt(new, old->size, 0); + if (err < 0) + return err; + memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); + return 0; +} + +/* + * we do not have to muck with descriptors here, that is + * done in switch_mm() as needed. + */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + struct mm_struct * old_mm; + int retval = 0; + + init_MUTEX(&mm->context.sem); + mm->context.size = 0; + old_mm = current->mm; + if (old_mm && old_mm->context.size > 0) { + down(&old_mm->context.sem); + retval = copy_ldt(&mm->context, &old_mm->context); + up(&old_mm->context.sem); + } + return retval; +} + +/* + * + * Don't touch the LDT register - we're already in the next thread. + */ +void destroy_context(struct mm_struct *mm) +{ + if (mm->context.size) { + if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(mm->context.ldt); + else + kfree(mm->context.ldt); + mm->context.size = 0; + } +} + +static int read_ldt(void __user * ptr, unsigned long bytecount) +{ + int err; + unsigned long size; + struct mm_struct * mm = current->mm; + + if (!mm->context.size) + return 0; + if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; + + down(&mm->context.sem); + size = mm->context.size*LDT_ENTRY_SIZE; + if (size > bytecount) + size = bytecount; + + err = 0; + if (copy_to_user(ptr, mm->context.ldt, size)) + err = -EFAULT; + up(&mm->context.sem); + if (err < 0) + goto error_return; + if (size != bytecount) { + /* zero-fill the rest */ + if (clear_user(ptr+size, bytecount-size) != 0) { + err = -EFAULT; + goto error_return; + } + } + return bytecount; +error_return: + return err; +} + +static int read_default_ldt(void __user * ptr, unsigned long bytecount) +{ + /* Arbitrary number */ + /* x86-64 default LDT is all zeros */ + if (bytecount > 128) + bytecount = 128; + if (clear_user(ptr, bytecount)) + return -EFAULT; + return bytecount; +} + +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) +{ + struct task_struct *me = current; + struct mm_struct * mm = me->mm; + __u32 entry_1, entry_2, *lp; + int error; + struct user_desc ldt_info; + + error = -EINVAL; + + if (bytecount != sizeof(ldt_info)) + goto out; + error = -EFAULT; + if (copy_from_user(&ldt_info, ptr, bytecount)) + goto out; + + error = -EINVAL; + if (ldt_info.entry_number >= LDT_ENTRIES) + goto out; + if (ldt_info.contents == 3) { + if (oldmode) + goto out; + if (ldt_info.seg_not_present == 0) + goto out; + } + + down(&mm->context.sem); + if (ldt_info.entry_number >= (unsigned)mm->context.size) { + error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); + if (error < 0) + goto out_unlock; + } + + lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); + + /* Allow LDTs to be cleared by the user. */ + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { + if (oldmode || LDT_empty(&ldt_info)) { + entry_1 = 0; + entry_2 = 0; + goto install; + } + } + + entry_1 = LDT_entry_a(&ldt_info); + entry_2 = LDT_entry_b(&ldt_info); + if (oldmode) + entry_2 &= ~(1 << 20); + + /* Install the new entry ... */ +install: + *lp = entry_1; + *(lp+1) = entry_2; + error = 0; + +out_unlock: + up(&mm->context.sem); +out: + return error; +} + +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) +{ + int ret = -ENOSYS; + + switch (func) { + case 0: + ret = read_ldt(ptr, bytecount); + break; + case 1: + ret = write_ldt(ptr, bytecount, 1); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + case 0x11: + ret = write_ldt(ptr, bytecount, 0); + break; + } + return ret; +} diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c new file mode 100644 index 0000000..c3a5547 --- /dev/null +++ b/arch/x86/kernel/machine_kexec_64.c @@ -0,0 +1,259 @@ +/* + * machine_kexec.c - handle transition of Linux booting another kernel + * Copyright (C) 2002-2005 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) +static u64 kexec_pgd[512] PAGE_ALIGNED; +static u64 kexec_pud0[512] PAGE_ALIGNED; +static u64 kexec_pmd0[512] PAGE_ALIGNED; +static u64 kexec_pte0[512] PAGE_ALIGNED; +static u64 kexec_pud1[512] PAGE_ALIGNED; +static u64 kexec_pmd1[512] PAGE_ALIGNED; +static u64 kexec_pte1[512] PAGE_ALIGNED; + +static void init_level2_page(pmd_t *level2p, unsigned long addr) +{ + unsigned long end_addr; + + addr &= PAGE_MASK; + end_addr = addr + PUD_SIZE; + while (addr < end_addr) { + set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + addr += PMD_SIZE; + } +} + +static int init_level3_page(struct kimage *image, pud_t *level3p, + unsigned long addr, unsigned long last_addr) +{ + unsigned long end_addr; + int result; + + result = 0; + addr &= PAGE_MASK; + end_addr = addr + PGDIR_SIZE; + while ((addr < last_addr) && (addr < end_addr)) { + struct page *page; + pmd_t *level2p; + + page = kimage_alloc_control_pages(image, 0); + if (!page) { + result = -ENOMEM; + goto out; + } + level2p = (pmd_t *)page_address(page); + init_level2_page(level2p, addr); + set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); + addr += PUD_SIZE; + } + /* clear the unused entries */ + while (addr < end_addr) { + pud_clear(level3p++); + addr += PUD_SIZE; + } +out: + return result; +} + + +static int init_level4_page(struct kimage *image, pgd_t *level4p, + unsigned long addr, unsigned long last_addr) +{ + unsigned long end_addr; + int result; + + result = 0; + addr &= PAGE_MASK; + end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE); + while ((addr < last_addr) && (addr < end_addr)) { + struct page *page; + pud_t *level3p; + + page = kimage_alloc_control_pages(image, 0); + if (!page) { + result = -ENOMEM; + goto out; + } + level3p = (pud_t *)page_address(page); + result = init_level3_page(image, level3p, addr, last_addr); + if (result) { + goto out; + } + set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); + addr += PGDIR_SIZE; + } + /* clear the unused entries */ + while (addr < end_addr) { + pgd_clear(level4p++); + addr += PGDIR_SIZE; + } +out: + return result; +} + + +static int init_pgtable(struct kimage *image, unsigned long start_pgtable) +{ + pgd_t *level4p; + level4p = (pgd_t *)__va(start_pgtable); + return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); +} + +static void set_idt(void *newidt, u16 limit) +{ + struct desc_ptr curidt; + + /* x86-64 supports unaliged loads & stores */ + curidt.size = limit; + curidt.address = (unsigned long)newidt; + + __asm__ __volatile__ ( + "lidtq %0\n" + : : "m" (curidt) + ); +}; + + +static void set_gdt(void *newgdt, u16 limit) +{ + struct desc_ptr curgdt; + + /* x86-64 supports unaligned loads & stores */ + curgdt.size = limit; + curgdt.address = (unsigned long)newgdt; + + __asm__ __volatile__ ( + "lgdtq %0\n" + : : "m" (curgdt) + ); +}; + +static void load_segments(void) +{ + __asm__ __volatile__ ( + "\tmovl %0,%%ds\n" + "\tmovl %0,%%es\n" + "\tmovl %0,%%ss\n" + "\tmovl %0,%%fs\n" + "\tmovl %0,%%gs\n" + : : "a" (__KERNEL_DS) : "memory" + ); +} + +int machine_kexec_prepare(struct kimage *image) +{ + unsigned long start_pgtable; + int result; + + /* Calculate the offsets */ + start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; + + /* Setup the identity mapped 64bit page table */ + result = init_pgtable(image, start_pgtable); + if (result) + return result; + + return 0; +} + +void machine_kexec_cleanup(struct kimage *image) +{ + return; +} + +/* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +NORET_TYPE void machine_kexec(struct kimage *image) +{ + unsigned long page_list[PAGES_NR]; + void *control_page; + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + + control_page = page_address(image->control_code_page) + PAGE_SIZE; + memcpy(control_page, relocate_kernel, PAGE_SIZE); + + page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); + page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; + page_list[PA_PGD] = virt_to_phys(&kexec_pgd); + page_list[VA_PGD] = (unsigned long)kexec_pgd; + page_list[PA_PUD_0] = virt_to_phys(&kexec_pud0); + page_list[VA_PUD_0] = (unsigned long)kexec_pud0; + page_list[PA_PMD_0] = virt_to_phys(&kexec_pmd0); + page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; + page_list[PA_PTE_0] = virt_to_phys(&kexec_pte0); + page_list[VA_PTE_0] = (unsigned long)kexec_pte0; + page_list[PA_PUD_1] = virt_to_phys(&kexec_pud1); + page_list[VA_PUD_1] = (unsigned long)kexec_pud1; + page_list[PA_PMD_1] = virt_to_phys(&kexec_pmd1); + page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; + page_list[PA_PTE_1] = virt_to_phys(&kexec_pte1); + page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + + page_list[PA_TABLE_PAGE] = + (unsigned long)__pa(page_address(image->control_code_page)); + + /* The segment registers are funny things, they have both a + * visible and an invisible part. Whenever the visible part is + * set to a specific selector, the invisible part is loaded + * with from a table in memory. At no other time is the + * descriptor table in memory accessed. + * + * I take advantage of this here by force loading the + * segments, before I zap the gdt with an invalid value. + */ + load_segments(); + /* The gdt & idt are now invalid. + * If you want to load them you must set up your own idt & gdt. + */ + set_gdt(phys_to_virt(0),0); + set_idt(phys_to_virt(0),0); + + /* now call it */ + relocate_kernel((unsigned long)image->head, (unsigned long)page_list, + image->start); +} + +/* crashkernel=size@addr specifies the location to reserve for + * a crash kernel. By reserving this memory we guarantee + * that linux never set's it up as a DMA target. + * Useful for holding code to do something appropriate + * after a kernel panic. + */ +static int __init setup_crashkernel(char *arg) +{ + unsigned long size, base; + char *p; + if (!arg) + return -EINVAL; + size = memparse(arg, &p); + if (arg == p) + return -EINVAL; + if (*p == '@') { + base = memparse(p+1, &p); + /* FIXME: Do I want a sanity check to validate the + * memory range? Yes you do, but it's too early for + * e820 -AK */ + crashk_res.start = base; + crashk_res.end = base + size - 1; + } + return 0; +} +early_param("crashkernel", setup_crashkernel); + diff --git a/arch/x86/kernel/mce_64.c b/arch/x86/kernel/mce_64.c new file mode 100644 index 0000000..a66d607 --- /dev/null +++ b/arch/x86/kernel/mce_64.c @@ -0,0 +1,875 @@ +/* + * Machine check handler. + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MISC_MCELOG_MINOR 227 +#define NR_BANKS 6 + +atomic_t mce_entry; + +static int mce_dont_init; + +/* + * Tolerant levels: + * 0: always panic on uncorrected errors, log corrected errors + * 1: panic or SIGBUS on uncorrected errors, log corrected errors + * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors + * 3: never panic or SIGBUS, log all errors (for testing only) + */ +static int tolerant = 1; +static int banks; +static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; +static unsigned long notify_user; +static int rip_msr; +static int mce_bootlog = 1; +static atomic_t mce_events; + +static char trigger[128]; +static char *trigger_argv[2] = { trigger, NULL }; + +static DECLARE_WAIT_QUEUE_HEAD(mce_wait); + +/* + * Lockless MCE logging infrastructure. + * This avoids deadlocks on printk locks without having to break locks. Also + * separate MCEs from kernel messages to avoid bogus bug reports. + */ + +struct mce_log mcelog = { + MCE_LOG_SIGNATURE, + MCE_LOG_LEN, +}; + +void mce_log(struct mce *mce) +{ + unsigned next, entry; + atomic_inc(&mce_events); + mce->finished = 0; + wmb(); + for (;;) { + entry = rcu_dereference(mcelog.next); + /* The rmb forces the compiler to reload next in each + iteration */ + rmb(); + for (;;) { + /* When the buffer fills up discard new entries. Assume + that the earlier errors are the more interesting. */ + if (entry >= MCE_LOG_LEN) { + set_bit(MCE_OVERFLOW, &mcelog.flags); + return; + } + /* Old left over entry. Skip. */ + if (mcelog.entry[entry].finished) { + entry++; + continue; + } + break; + } + smp_rmb(); + next = entry + 1; + if (cmpxchg(&mcelog.next, entry, next) == entry) + break; + } + memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); + wmb(); + mcelog.entry[entry].finished = 1; + wmb(); + + set_bit(0, ¬ify_user); +} + +static void print_mce(struct mce *m) +{ + printk(KERN_EMERG "\n" + KERN_EMERG "HARDWARE ERROR\n" + KERN_EMERG + "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", + m->cpu, m->mcgstatus, m->bank, m->status); + if (m->rip) { + printk(KERN_EMERG + "RIP%s %02x:<%016Lx> ", + !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", + m->cs, m->rip); + if (m->cs == __KERNEL_CS) + print_symbol("{%s}", m->rip); + printk("\n"); + } + printk(KERN_EMERG "TSC %Lx ", m->tsc); + if (m->addr) + printk("ADDR %Lx ", m->addr); + if (m->misc) + printk("MISC %Lx ", m->misc); + printk("\n"); + printk(KERN_EMERG "This is not a software problem!\n"); + printk(KERN_EMERG + "Run through mcelog --ascii to decode and contact your hardware vendor\n"); +} + +static void mce_panic(char *msg, struct mce *backup, unsigned long start) +{ + int i; + + oops_begin(); + for (i = 0; i < MCE_LOG_LEN; i++) { + unsigned long tsc = mcelog.entry[i].tsc; + if (time_before(tsc, start)) + continue; + print_mce(&mcelog.entry[i]); + if (backup && mcelog.entry[i].tsc == backup->tsc) + backup = NULL; + } + if (backup) + print_mce(backup); + panic(msg); +} + +static int mce_available(struct cpuinfo_x86 *c) +{ + return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); +} + +static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) +{ + if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { + m->rip = regs->rip; + m->cs = regs->cs; + } else { + m->rip = 0; + m->cs = 0; + } + if (rip_msr) { + /* Assume the RIP in the MSR is exact. Is this true? */ + m->mcgstatus |= MCG_STATUS_EIPV; + rdmsrl(rip_msr, m->rip); + m->cs = 0; + } +} + +/* + * The actual machine check handler + */ + +void do_machine_check(struct pt_regs * regs, long error_code) +{ + struct mce m, panicm; + u64 mcestart = 0; + int i; + int panicm_found = 0; + /* + * If no_way_out gets set, there is no safe way to recover from this + * MCE. If tolerant is cranked up, we'll try anyway. + */ + int no_way_out = 0; + /* + * If kill_it gets set, there might be a way to recover from this + * error. + */ + int kill_it = 0; + + atomic_inc(&mce_entry); + + if (regs) + notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL); + if (!banks) + goto out2; + + memset(&m, 0, sizeof(struct mce)); + m.cpu = smp_processor_id(); + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + /* if the restart IP is not valid, we're done for */ + if (!(m.mcgstatus & MCG_STATUS_RIPV)) + no_way_out = 1; + + rdtscll(mcestart); + barrier(); + + for (i = 0; i < banks; i++) { + if (!bank[i]) + continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; + m.tsc = 0; + + rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + if ((m.status & MCI_STATUS_VAL) == 0) + continue; + + if (m.status & MCI_STATUS_EN) { + /* if PCC was set, there's no way out */ + no_way_out |= !!(m.status & MCI_STATUS_PCC); + /* + * If this error was uncorrectable and there was + * an overflow, we're in trouble. If no overflow, + * we might get away with just killing a task. + */ + if (m.status & MCI_STATUS_UC) { + if (tolerant < 1 || m.status & MCI_STATUS_OVER) + no_way_out = 1; + kill_it = 1; + } + } + + if (m.status & MCI_STATUS_MISCV) + rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + + mce_get_rip(&m, regs); + if (error_code >= 0) + rdtscll(m.tsc); + if (error_code != -2) + mce_log(&m); + + /* Did this bank cause the exception? */ + /* Assume that the bank with uncorrectable errors did it, + and that there is only a single one. */ + if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { + panicm = m; + panicm_found = 1; + } + + add_taint(TAINT_MACHINE_CHECK); + } + + /* Never do anything final in the polling timer */ + if (!regs) + goto out; + + /* If we didn't find an uncorrectable error, pick + the last one (shouldn't happen, just being safe). */ + if (!panicm_found) + panicm = m; + + /* + * If we have decided that we just CAN'T continue, and the user + * has not set tolerant to an insane level, give up and die. + */ + if (no_way_out && tolerant < 3) + mce_panic("Machine check", &panicm, mcestart); + + /* + * If the error seems to be unrecoverable, something should be + * done. Try to kill as little as possible. If we can kill just + * one task, do that. If the user has set the tolerance very + * high, don't try to do anything at all. + */ + if (kill_it && tolerant < 3) { + int user_space = 0; + + /* + * If the EIPV bit is set, it means the saved IP is the + * instruction which caused the MCE. + */ + if (m.mcgstatus & MCG_STATUS_EIPV) + user_space = panicm.rip && (panicm.cs & 3); + + /* + * If we know that the error was in user space, send a + * SIGBUS. Otherwise, panic if tolerance is low. + * + * do_exit() takes an awful lot of locks and has a slight + * risk of deadlocking. + */ + if (user_space) { + do_exit(SIGBUS); + } else if (panic_on_oops || tolerant < 2) { + mce_panic("Uncorrected machine check", + &panicm, mcestart); + } + } + + /* notify userspace ASAP */ + set_thread_flag(TIF_MCE_NOTIFY); + + out: + /* the last thing we do is clear state */ + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + wrmsrl(MSR_IA32_MCG_STATUS, 0); + out2: + atomic_dec(&mce_entry); +} + +#ifdef CONFIG_X86_MCE_INTEL +/*** + * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog + * @cpu: The CPU on which the event occured. + * @status: Event status information + * + * This function should be called by the thermal interrupt after the + * event has been processed and the decision was made to log the event + * further. + * + * The status parameter will be saved to the 'status' field of 'struct mce' + * and historically has been the register value of the + * MSR_IA32_THERMAL_STATUS (Intel) msr. + */ +void mce_log_therm_throt_event(unsigned int cpu, __u64 status) +{ + struct mce m; + + memset(&m, 0, sizeof(m)); + m.cpu = cpu; + m.bank = MCE_THERMAL_BANK; + m.status = status; + rdtscll(m.tsc); + mce_log(&m); +} +#endif /* CONFIG_X86_MCE_INTEL */ + +/* + * Periodic polling timer for "silent" machine check errors. If the + * poller finds an MCE, poll 2x faster. When the poller finds no more + * errors, poll 2x slower (up to check_interval seconds). + */ + +static int check_interval = 5 * 60; /* 5 minutes */ +static int next_interval; /* in jiffies */ +static void mcheck_timer(struct work_struct *work); +static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); + +static void mcheck_check_cpu(void *info) +{ + if (mce_available(¤t_cpu_data)) + do_machine_check(NULL, 0); +} + +static void mcheck_timer(struct work_struct *work) +{ + on_each_cpu(mcheck_check_cpu, NULL, 1, 1); + + /* + * Alert userspace if needed. If we logged an MCE, reduce the + * polling interval, otherwise increase the polling interval. + */ + if (mce_notify_user()) { + next_interval = max(next_interval/2, HZ/100); + } else { + next_interval = min(next_interval*2, + (int)round_jiffies_relative(check_interval*HZ)); + } + + schedule_delayed_work(&mcheck_work, next_interval); +} + +/* + * This is only called from process context. This is where we do + * anything we need to alert userspace about new MCEs. This is called + * directly from the poller and also from entry.S and idle, thanks to + * TIF_MCE_NOTIFY. + */ +int mce_notify_user(void) +{ + clear_thread_flag(TIF_MCE_NOTIFY); + if (test_and_clear_bit(0, ¬ify_user)) { + static unsigned long last_print; + unsigned long now = jiffies; + + wake_up_interruptible(&mce_wait); + if (trigger[0]) + call_usermodehelper(trigger, trigger_argv, NULL, + UMH_NO_WAIT); + + if (time_after_eq(now, last_print + (check_interval*HZ))) { + last_print = now; + printk(KERN_INFO "Machine check events logged\n"); + } + + return 1; + } + return 0; +} + +/* see if the idle task needs to notify userspace */ +static int +mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk) +{ + /* IDLE_END should be safe - interrupts are back on */ + if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) + mce_notify_user(); + + return NOTIFY_OK; +} + +static struct notifier_block mce_idle_notifier = { + .notifier_call = mce_idle_callback, +}; + +static __init int periodic_mcheck_init(void) +{ + next_interval = check_interval * HZ; + if (next_interval) + schedule_delayed_work(&mcheck_work, + round_jiffies_relative(next_interval)); + idle_notifier_register(&mce_idle_notifier); + return 0; +} +__initcall(periodic_mcheck_init); + + +/* + * Initialize Machine Checks for a CPU. + */ +static void mce_init(void *dummy) +{ + u64 cap; + int i; + + rdmsrl(MSR_IA32_MCG_CAP, cap); + banks = cap & 0xff; + if (banks > NR_BANKS) { + printk(KERN_INFO "MCE: warning: using only %d banks\n", banks); + banks = NR_BANKS; + } + /* Use accurate RIP reporting if available. */ + if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) + rip_msr = MSR_IA32_MCG_EIP; + + /* Log the machine checks left over from the previous reset. + This also clears all registers */ + do_machine_check(NULL, mce_bootlog ? -1 : -2); + + set_in_cr4(X86_CR4_MCE); + + if (cap & MCG_CTL_P) + wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); + + for (i = 0; i < banks; i++) { + wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } +} + +/* Add per CPU specific workarounds here */ +static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) +{ + /* This should be disabled by the BIOS, but isn't always */ + if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) { + /* disable GART TBL walk error reporting, which trips off + incorrectly with the IOMMU & 3ware & Cerberus. */ + clear_bit(10, &bank[4]); + /* Lots of broken BIOS around that don't clear them + by default and leave crap in there. Don't log. */ + mce_bootlog = 0; + } + +} + +static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_init(c); + break; + case X86_VENDOR_AMD: + mce_amd_feature_init(c); + break; + default: + break; + } +} + +/* + * Called for each booted CPU to set up machine checks. + * Must be called with preempt off. + */ +void __cpuinit mcheck_init(struct cpuinfo_x86 *c) +{ + static cpumask_t mce_cpus = CPU_MASK_NONE; + + mce_cpu_quirks(c); + + if (mce_dont_init || + cpu_test_and_set(smp_processor_id(), mce_cpus) || + !mce_available(c)) + return; + + mce_init(NULL); + mce_cpu_features(c); +} + +/* + * Character device to read and clear the MCE log. + */ + +static DEFINE_SPINLOCK(mce_state_lock); +static int open_count; /* #times opened */ +static int open_exclu; /* already open exclusive? */ + +static int mce_open(struct inode *inode, struct file *file) +{ + spin_lock(&mce_state_lock); + + if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&mce_state_lock); + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + open_exclu = 1; + open_count++; + + spin_unlock(&mce_state_lock); + + return nonseekable_open(inode, file); +} + +static int mce_release(struct inode *inode, struct file *file) +{ + spin_lock(&mce_state_lock); + + open_count--; + open_exclu = 0; + + spin_unlock(&mce_state_lock); + + return 0; +} + +static void collect_tscs(void *data) +{ + unsigned long *cpu_tsc = (unsigned long *)data; + rdtscll(cpu_tsc[smp_processor_id()]); +} + +static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) +{ + unsigned long *cpu_tsc; + static DECLARE_MUTEX(mce_read_sem); + unsigned next; + char __user *buf = ubuf; + int i, err; + + cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); + if (!cpu_tsc) + return -ENOMEM; + + down(&mce_read_sem); + next = rcu_dereference(mcelog.next); + + /* Only supports full reads right now */ + if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { + up(&mce_read_sem); + kfree(cpu_tsc); + return -EINVAL; + } + + err = 0; + for (i = 0; i < next; i++) { + unsigned long start = jiffies; + while (!mcelog.entry[i].finished) { + if (time_after_eq(jiffies, start + 2)) { + memset(mcelog.entry + i,0, sizeof(struct mce)); + goto timeout; + } + cpu_relax(); + } + smp_rmb(); + err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); + buf += sizeof(struct mce); + timeout: + ; + } + + memset(mcelog.entry, 0, next * sizeof(struct mce)); + mcelog.next = 0; + + synchronize_sched(); + + /* Collect entries that were still getting written before the synchronize. */ + + on_each_cpu(collect_tscs, cpu_tsc, 1, 1); + for (i = next; i < MCE_LOG_LEN; i++) { + if (mcelog.entry[i].finished && + mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { + err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce)); + smp_rmb(); + buf += sizeof(struct mce); + memset(&mcelog.entry[i], 0, sizeof(struct mce)); + } + } + up(&mce_read_sem); + kfree(cpu_tsc); + return err ? -EFAULT : buf - ubuf; +} + +static unsigned int mce_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &mce_wait, wait); + if (rcu_dereference(mcelog.next)) + return POLLIN | POLLRDNORM; + return 0; +} + +static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg) +{ + int __user *p = (int __user *)arg; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct mce), p); + case MCE_GET_LOG_LEN: + return put_user(MCE_LOG_LEN, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + do { + flags = mcelog.flags; + } while (cmpxchg(&mcelog.flags, flags, 0) != flags); + return put_user(flags, p); + } + default: + return -ENOTTY; + } +} + +static const struct file_operations mce_chrdev_ops = { + .open = mce_open, + .release = mce_release, + .read = mce_read, + .poll = mce_poll, + .ioctl = mce_ioctl, +}; + +static struct miscdevice mce_log_device = { + MISC_MCELOG_MINOR, + "mcelog", + &mce_chrdev_ops, +}; + +static unsigned long old_cr4 __initdata; + +void __init stop_mce(void) +{ + old_cr4 = read_cr4(); + clear_in_cr4(X86_CR4_MCE); +} + +void __init restart_mce(void) +{ + if (old_cr4 & X86_CR4_MCE) + set_in_cr4(X86_CR4_MCE); +} + +/* + * Old style boot options parsing. Only for compatibility. + */ + +static int __init mcheck_disable(char *str) +{ + mce_dont_init = 1; + return 1; +} + +/* mce=off disables machine check. Note you can reenable it later + using sysfs. + mce=TOLERANCELEVEL (number, see above) + mce=bootlog Log MCEs from before booting. Disabled by default on AMD. + mce=nobootlog Don't log MCEs from before booting. */ +static int __init mcheck_enable(char *str) +{ + if (*str == '=') + str++; + if (!strcmp(str, "off")) + mce_dont_init = 1; + else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog")) + mce_bootlog = str[0] == 'b'; + else if (isdigit(str[0])) + get_option(&str, &tolerant); + else + printk("mce= argument %s ignored. Please use /sys", str); + return 1; +} + +__setup("nomce", mcheck_disable); +__setup("mce", mcheck_enable); + +/* + * Sysfs support + */ + +/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. + Only one CPU is active at this time, the others get readded later using + CPU hotplug. */ +static int mce_resume(struct sys_device *dev) +{ + mce_init(NULL); + return 0; +} + +/* Reinit MCEs after user configuration changes */ +static void mce_restart(void) +{ + if (next_interval) + cancel_delayed_work(&mcheck_work); + /* Timer race is harmless here */ + on_each_cpu(mce_init, NULL, 1, 1); + next_interval = check_interval * HZ; + if (next_interval) + schedule_delayed_work(&mcheck_work, + round_jiffies_relative(next_interval)); +} + +static struct sysdev_class mce_sysclass = { + .resume = mce_resume, + set_kset_name("machinecheck"), +}; + +DEFINE_PER_CPU(struct sys_device, device_mce); + +/* Why are there no generic functions for this? */ +#define ACCESSOR(name, var, start) \ + static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ + return sprintf(buf, "%lx\n", (unsigned long)var); \ + } \ + static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ + char *end; \ + unsigned long new = simple_strtoul(buf, &end, 0); \ + if (end == buf) return -EINVAL; \ + var = new; \ + start; \ + return end-buf; \ + } \ + static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); + +/* TBD should generate these dynamically based on number of available banks */ +ACCESSOR(bank0ctl,bank[0],mce_restart()) +ACCESSOR(bank1ctl,bank[1],mce_restart()) +ACCESSOR(bank2ctl,bank[2],mce_restart()) +ACCESSOR(bank3ctl,bank[3],mce_restart()) +ACCESSOR(bank4ctl,bank[4],mce_restart()) +ACCESSOR(bank5ctl,bank[5],mce_restart()) + +static ssize_t show_trigger(struct sys_device *s, char *buf) +{ + strcpy(buf, trigger); + strcat(buf, "\n"); + return strlen(trigger) + 1; +} + +static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) +{ + char *p; + int len; + strncpy(trigger, buf, sizeof(trigger)); + trigger[sizeof(trigger)-1] = 0; + len = strlen(trigger); + p = strchr(trigger, '\n'); + if (*p) *p = 0; + return len; +} + +static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); +ACCESSOR(tolerant,tolerant,) +ACCESSOR(check_interval,check_interval,mce_restart()) +static struct sysdev_attribute *mce_attributes[] = { + &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, + &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, + &attr_tolerant, &attr_check_interval, &attr_trigger, + NULL +}; + +/* Per cpu sysdev init. All of the cpus still share the same ctl bank */ +static __cpuinit int mce_create_device(unsigned int cpu) +{ + int err; + int i; + if (!mce_available(&cpu_data[cpu])) + return -EIO; + + per_cpu(device_mce,cpu).id = cpu; + per_cpu(device_mce,cpu).cls = &mce_sysclass; + + err = sysdev_register(&per_cpu(device_mce,cpu)); + + if (!err) { + for (i = 0; mce_attributes[i]; i++) + sysdev_create_file(&per_cpu(device_mce,cpu), + mce_attributes[i]); + } + return err; +} + +static void mce_remove_device(unsigned int cpu) +{ + int i; + + for (i = 0; mce_attributes[i]; i++) + sysdev_remove_file(&per_cpu(device_mce,cpu), + mce_attributes[i]); + sysdev_unregister(&per_cpu(device_mce,cpu)); + memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); +} + +/* Get notified when a cpu comes on/off. Be hotplug friendly. */ +static int +mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + mce_create_device(cpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + mce_remove_device(cpu); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block mce_cpu_notifier = { + .notifier_call = mce_cpu_callback, +}; + +static __init int mce_init_device(void) +{ + int err; + int i = 0; + + if (!mce_available(&boot_cpu_data)) + return -EIO; + err = sysdev_class_register(&mce_sysclass); + + for_each_online_cpu(i) { + mce_create_device(i); + } + + register_hotcpu_notifier(&mce_cpu_notifier); + misc_register(&mce_log_device); + return err; +} + +device_initcall(mce_init_device); diff --git a/arch/x86/kernel/mce_amd_64.c b/arch/x86/kernel/mce_amd_64.c new file mode 100644 index 0000000..2f8a7f1 --- /dev/null +++ b/arch/x86/kernel/mce_amd_64.c @@ -0,0 +1,689 @@ +/* + * (c) 2005, 2006 Advanced Micro Devices, Inc. + * Your use of this code is subject to the terms and conditions of the + * GNU general public license version 2. See "COPYING" or + * http://www.gnu.org/licenses/gpl.html + * + * Written by Jacob Shin - AMD, Inc. + * + * Support : jacob.shin@amd.com + * + * April 2006 + * - added support for AMD Family 0x10 processors + * + * All MC4_MISCi registers are shared between multi-cores + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PFX "mce_threshold: " +#define VERSION "version 1.1.1" +#define NR_BANKS 6 +#define NR_BLOCKS 9 +#define THRESHOLD_MAX 0xFFF +#define INT_TYPE_APIC 0x00020000 +#define MASK_VALID_HI 0x80000000 +#define MASK_CNTP_HI 0x40000000 +#define MASK_LOCKED_HI 0x20000000 +#define MASK_LVTOFF_HI 0x00F00000 +#define MASK_COUNT_EN_HI 0x00080000 +#define MASK_INT_TYPE_HI 0x00060000 +#define MASK_OVERFLOW_HI 0x00010000 +#define MASK_ERR_COUNT_HI 0x00000FFF +#define MASK_BLKPTR_LO 0xFF000000 +#define MCG_XBLK_ADDR 0xC0000400 + +struct threshold_block { + unsigned int block; + unsigned int bank; + unsigned int cpu; + u32 address; + u16 interrupt_enable; + u16 threshold_limit; + struct kobject kobj; + struct list_head miscj; +}; + +/* defaults used early on boot */ +static struct threshold_block threshold_defaults = { + .interrupt_enable = 0, + .threshold_limit = THRESHOLD_MAX, +}; + +struct threshold_bank { + struct kobject kobj; + struct threshold_block *blocks; + cpumask_t cpus; +}; +static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); + +#ifdef CONFIG_SMP +static unsigned char shared_bank[NR_BANKS] = { + 0, 0, 0, 0, 1 +}; +#endif + +static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ + +/* + * CPU Initialization + */ + +/* must be called with correct cpu affinity */ +static void threshold_restart_bank(struct threshold_block *b, + int reset, u16 old_limit) +{ + u32 mci_misc_hi, mci_misc_lo; + + rdmsr(b->address, mci_misc_lo, mci_misc_hi); + + if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) + reset = 1; /* limit cannot be lower than err count */ + + if (reset) { /* reset err count and overflow bit */ + mci_misc_hi = + (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | + (THRESHOLD_MAX - b->threshold_limit); + } else if (old_limit) { /* change limit w/o reset */ + int new_count = (mci_misc_hi & THRESHOLD_MAX) + + (old_limit - b->threshold_limit); + mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | + (new_count & THRESHOLD_MAX); + } + + b->interrupt_enable ? + (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : + (mci_misc_hi &= ~MASK_INT_TYPE_HI); + + mci_misc_hi |= MASK_COUNT_EN_HI; + wrmsr(b->address, mci_misc_lo, mci_misc_hi); +} + +/* cpu init entry point, called from mce.c with preempt off */ +void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) +{ + unsigned int bank, block; + unsigned int cpu = smp_processor_id(); + u32 low = 0, high = 0, address = 0; + + for (bank = 0; bank < NR_BANKS; ++bank) { + for (block = 0; block < NR_BLOCKS; ++block) { + if (block == 0) + address = MSR_IA32_MC0_MISC + bank * 4; + else if (block == 1) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + break; + address += MCG_XBLK_ADDR; + } + else + ++address; + + if (rdmsr_safe(address, &low, &high)) + break; + + if (!(high & MASK_VALID_HI)) { + if (block) + continue; + else + break; + } + + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) + continue; + + if (!block) + per_cpu(bank_map, cpu) |= (1 << bank); +#ifdef CONFIG_SMP + if (shared_bank[bank] && c->cpu_core_id) + break; +#endif + high &= ~MASK_LVTOFF_HI; + high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20; + wrmsr(address, low, high); + + setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD, + THRESHOLD_APIC_VECTOR, + K8_APIC_EXT_INT_MSG_FIX, 0); + + threshold_defaults.address = address; + threshold_restart_bank(&threshold_defaults, 0, 0); + } + } +} + +/* + * APIC Interrupt Handler + */ + +/* + * threshold interrupt handler will service THRESHOLD_APIC_VECTOR. + * the interrupt goes off when error_count reaches threshold_limit. + * the handler will simply log mcelog w/ software defined bank number. + */ +asmlinkage void mce_threshold_interrupt(void) +{ + unsigned int bank, block; + struct mce m; + u32 low = 0, high = 0, address = 0; + + ack_APIC_irq(); + exit_idle(); + irq_enter(); + + memset(&m, 0, sizeof(m)); + rdtscll(m.tsc); + m.cpu = smp_processor_id(); + + /* assume first bank caused it */ + for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) + continue; + for (block = 0; block < NR_BLOCKS; ++block) { + if (block == 0) + address = MSR_IA32_MC0_MISC + bank * 4; + else if (block == 1) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + break; + address += MCG_XBLK_ADDR; + } + else + ++address; + + if (rdmsr_safe(address, &low, &high)) + break; + + if (!(high & MASK_VALID_HI)) { + if (block) + continue; + else + break; + } + + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) + continue; + + /* Log the machine check that caused the threshold + event. */ + do_machine_check(NULL, 0); + + if (high & MASK_OVERFLOW_HI) { + rdmsrl(address, m.misc); + rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, + m.status); + m.bank = K8_MCE_THRESHOLD_BASE + + bank * NR_BLOCKS + + block; + mce_log(&m); + goto out; + } + } + } +out: + irq_exit(); +} + +/* + * Sysfs Interface + */ + +struct threshold_attr { + struct attribute attr; + ssize_t(*show) (struct threshold_block *, char *); + ssize_t(*store) (struct threshold_block *, const char *, size_t count); +}; + +static cpumask_t affinity_set(unsigned int cpu) +{ + cpumask_t oldmask = current->cpus_allowed; + cpumask_t newmask = CPU_MASK_NONE; + cpu_set(cpu, newmask); + set_cpus_allowed(current, newmask); + return oldmask; +} + +static void affinity_restore(cpumask_t oldmask) +{ + set_cpus_allowed(current, oldmask); +} + +#define SHOW_FIELDS(name) \ +static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ +{ \ + return sprintf(buf, "%lx\n", (unsigned long) b->name); \ +} +SHOW_FIELDS(interrupt_enable) +SHOW_FIELDS(threshold_limit) + +static ssize_t store_interrupt_enable(struct threshold_block *b, + const char *buf, size_t count) +{ + char *end; + cpumask_t oldmask; + unsigned long new = simple_strtoul(buf, &end, 0); + if (end == buf) + return -EINVAL; + b->interrupt_enable = !!new; + + oldmask = affinity_set(b->cpu); + threshold_restart_bank(b, 0, 0); + affinity_restore(oldmask); + + return end - buf; +} + +static ssize_t store_threshold_limit(struct threshold_block *b, + const char *buf, size_t count) +{ + char *end; + cpumask_t oldmask; + u16 old; + unsigned long new = simple_strtoul(buf, &end, 0); + if (end == buf) + return -EINVAL; + if (new > THRESHOLD_MAX) + new = THRESHOLD_MAX; + if (new < 1) + new = 1; + old = b->threshold_limit; + b->threshold_limit = new; + + oldmask = affinity_set(b->cpu); + threshold_restart_bank(b, 0, old); + affinity_restore(oldmask); + + return end - buf; +} + +static ssize_t show_error_count(struct threshold_block *b, char *buf) +{ + u32 high, low; + cpumask_t oldmask; + oldmask = affinity_set(b->cpu); + rdmsr(b->address, low, high); + affinity_restore(oldmask); + return sprintf(buf, "%x\n", + (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); +} + +static ssize_t store_error_count(struct threshold_block *b, + const char *buf, size_t count) +{ + cpumask_t oldmask; + oldmask = affinity_set(b->cpu); + threshold_restart_bank(b, 1, 0); + affinity_restore(oldmask); + return 1; +} + +#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +}; + +#define RW_ATTR(name) \ +static struct threshold_attr name = \ + THRESHOLD_ATTR(name, 0644, show_## name, store_## name) + +RW_ATTR(interrupt_enable); +RW_ATTR(threshold_limit); +RW_ATTR(error_count); + +static struct attribute *default_attrs[] = { + &interrupt_enable.attr, + &threshold_limit.attr, + &error_count.attr, + NULL +}; + +#define to_block(k) container_of(k, struct threshold_block, kobj) +#define to_attr(a) container_of(a, struct threshold_attr, attr) + +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct threshold_block *b = to_block(kobj); + struct threshold_attr *a = to_attr(attr); + ssize_t ret; + ret = a->show ? a->show(b, buf) : -EIO; + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct threshold_block *b = to_block(kobj); + struct threshold_attr *a = to_attr(attr); + ssize_t ret; + ret = a->store ? a->store(b, buf, count) : -EIO; + return ret; +} + +static struct sysfs_ops threshold_ops = { + .show = show, + .store = store, +}; + +static struct kobj_type threshold_ktype = { + .sysfs_ops = &threshold_ops, + .default_attrs = default_attrs, +}; + +static __cpuinit int allocate_threshold_blocks(unsigned int cpu, + unsigned int bank, + unsigned int block, + u32 address) +{ + int err; + u32 low, high; + struct threshold_block *b = NULL; + + if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) + return 0; + + if (rdmsr_safe(address, &low, &high)) + return 0; + + if (!(high & MASK_VALID_HI)) { + if (block) + goto recurse; + else + return 0; + } + + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) + goto recurse; + + b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); + if (!b) + return -ENOMEM; + + b->block = block; + b->bank = bank; + b->cpu = cpu; + b->address = address; + b->interrupt_enable = 0; + b->threshold_limit = THRESHOLD_MAX; + + INIT_LIST_HEAD(&b->miscj); + + if (per_cpu(threshold_banks, cpu)[bank]->blocks) + list_add(&b->miscj, + &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); + else + per_cpu(threshold_banks, cpu)[bank]->blocks = b; + + kobject_set_name(&b->kobj, "misc%i", block); + b->kobj.parent = &per_cpu(threshold_banks, cpu)[bank]->kobj; + b->kobj.ktype = &threshold_ktype; + err = kobject_register(&b->kobj); + if (err) + goto out_free; +recurse: + if (!block) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + return 0; + address += MCG_XBLK_ADDR; + } else + ++address; + + err = allocate_threshold_blocks(cpu, bank, ++block, address); + if (err) + goto out_free; + + return err; + +out_free: + if (b) { + kobject_unregister(&b->kobj); + kfree(b); + } + return err; +} + +/* symlinks sibling shared banks to first core. first core owns dir/files. */ +static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) +{ + int i, err = 0; + struct threshold_bank *b = NULL; + cpumask_t oldmask = CPU_MASK_NONE; + char name[32]; + + sprintf(name, "threshold_bank%i", bank); + +#ifdef CONFIG_SMP + if (cpu_data[cpu].cpu_core_id && shared_bank[bank]) { /* symlink */ + i = first_cpu(cpu_core_map[cpu]); + + /* first core not up yet */ + if (cpu_data[i].cpu_core_id) + goto out; + + /* already linked */ + if (per_cpu(threshold_banks, cpu)[bank]) + goto out; + + b = per_cpu(threshold_banks, i)[bank]; + + if (!b) + goto out; + + err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, + &b->kobj, name); + if (err) + goto out; + + b->cpus = cpu_core_map[cpu]; + per_cpu(threshold_banks, cpu)[bank] = b; + goto out; + } +#endif + + b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); + if (!b) { + err = -ENOMEM; + goto out; + } + + kobject_set_name(&b->kobj, "threshold_bank%i", bank); + b->kobj.parent = &per_cpu(device_mce, cpu).kobj; +#ifndef CONFIG_SMP + b->cpus = CPU_MASK_ALL; +#else + b->cpus = cpu_core_map[cpu]; +#endif + err = kobject_register(&b->kobj); + if (err) + goto out_free; + + per_cpu(threshold_banks, cpu)[bank] = b; + + oldmask = affinity_set(cpu); + err = allocate_threshold_blocks(cpu, bank, 0, + MSR_IA32_MC0_MISC + bank * 4); + affinity_restore(oldmask); + + if (err) + goto out_free; + + for_each_cpu_mask(i, b->cpus) { + if (i == cpu) + continue; + + err = sysfs_create_link(&per_cpu(device_mce, i).kobj, + &b->kobj, name); + if (err) + goto out; + + per_cpu(threshold_banks, i)[bank] = b; + } + + goto out; + +out_free: + per_cpu(threshold_banks, cpu)[bank] = NULL; + kfree(b); +out: + return err; +} + +/* create dir/files for all valid threshold banks */ +static __cpuinit int threshold_create_device(unsigned int cpu) +{ + unsigned int bank; + int err = 0; + + for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, cpu) & 1 << bank)) + continue; + err = threshold_create_bank(cpu, bank); + if (err) + goto out; + } +out: + return err; +} + +/* + * let's be hotplug friendly. + * in case of multiple core processors, the first core always takes ownership + * of shared sysfs dir/files, and rest of the cores will be symlinked to it. + */ + +static void deallocate_threshold_block(unsigned int cpu, + unsigned int bank) +{ + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank]; + + if (!head) + return; + + list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { + kobject_unregister(&pos->kobj); + list_del(&pos->miscj); + kfree(pos); + } + + kfree(per_cpu(threshold_banks, cpu)[bank]->blocks); + per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; +} + +static void threshold_remove_bank(unsigned int cpu, int bank) +{ + int i = 0; + struct threshold_bank *b; + char name[32]; + + b = per_cpu(threshold_banks, cpu)[bank]; + + if (!b) + return; + + if (!b->blocks) + goto free_out; + + sprintf(name, "threshold_bank%i", bank); + +#ifdef CONFIG_SMP + /* sibling symlink */ + if (shared_bank[bank] && b->blocks->cpu != cpu) { + sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); + per_cpu(threshold_banks, cpu)[bank] = NULL; + return; + } +#endif + + /* remove all sibling symlinks before unregistering */ + for_each_cpu_mask(i, b->cpus) { + if (i == cpu) + continue; + + sysfs_remove_link(&per_cpu(device_mce, i).kobj, name); + per_cpu(threshold_banks, i)[bank] = NULL; + } + + deallocate_threshold_block(cpu, bank); + +free_out: + kobject_unregister(&b->kobj); + kfree(b); + per_cpu(threshold_banks, cpu)[bank] = NULL; +} + +static void threshold_remove_device(unsigned int cpu) +{ + unsigned int bank; + + for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, cpu) & 1 << bank)) + continue; + threshold_remove_bank(cpu, bank); + } +} + +/* get notified when a cpu comes on/off */ +static int threshold_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + /* cpu was unsigned int to begin with */ + unsigned int cpu = (unsigned long)hcpu; + + if (cpu >= NR_CPUS) + goto out; + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + threshold_create_device(cpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + threshold_remove_device(cpu); + break; + default: + break; + } + out: + return NOTIFY_OK; +} + +static struct notifier_block threshold_cpu_notifier = { + .notifier_call = threshold_cpu_callback, +}; + +static __init int threshold_init_device(void) +{ + unsigned lcpu = 0; + + /* to hit CPUs online before the notifier is up */ + for_each_online_cpu(lcpu) { + int err = threshold_create_device(lcpu); + if (err) + return err; + } + register_hotcpu_notifier(&threshold_cpu_notifier); + return 0; +} + +device_initcall(threshold_init_device); diff --git a/arch/x86/kernel/mce_intel_64.c b/arch/x86/kernel/mce_intel_64.c new file mode 100644 index 0000000..6551505 --- /dev/null +++ b/arch/x86/kernel/mce_intel_64.c @@ -0,0 +1,89 @@ +/* + * Intel specific MCE features. + * Copyright 2004 Zwane Mwaikambo + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage void smp_thermal_interrupt(void) +{ + __u64 msr_val; + + ack_APIC_irq(); + + exit_idle(); + irq_enter(); + + rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + if (therm_throt_process(msr_val & 1)) + mce_log_therm_throt_event(smp_processor_id(), msr_val); + + irq_exit(); +} + +static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c) +{ + u32 l, h; + int tm2 = 0; + unsigned int cpu = smp_processor_id(); + + if (!cpu_has(c, X86_FEATURE_ACPI)) + return; + + if (!cpu_has(c, X86_FEATURE_ACC)) + return; + + /* first check if TM1 is already enabled by the BIOS, in which + * case there might be some SMM goo which handles it, so we can't even + * put a handler since it might be delivered via SMI already. + */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + h = apic_read(APIC_LVTTHMR); + if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { + printk(KERN_DEBUG + "CPU%d: Thermal monitoring handled by SMI\n", cpu); + return; + } + + if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) + tm2 = 1; + + if (h & APIC_VECTOR_MASK) { + printk(KERN_DEBUG + "CPU%d: Thermal LVT vector (%#x) already " + "installed\n", cpu, (h & APIC_VECTOR_MASK)); + return; + } + + h = THERMAL_APIC_VECTOR; + h |= (APIC_DM_FIXED | APIC_LVT_MASKED); + apic_write(APIC_LVTTHMR, h); + + rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); + wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); + + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); + + l = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", + cpu, tm2 ? "TM2" : "TM1"); + + /* enable thermal throttle processing */ + atomic_set(&therm_throt_en, 1); + return; +} + +void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c) +{ + intel_init_thermal(c); +} diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c new file mode 100644 index 0000000..a888e67 --- /dev/null +++ b/arch/x86/kernel/module_64.c @@ -0,0 +1,185 @@ +/* Kernel module help for x86-64 + Copyright (C) 2001 Rusty Russell. + Copyright (C) 2002,2003 Andi Kleen, SuSE Labs. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define DEBUGP(fmt...) + +#ifndef CONFIG_UML +void module_free(struct module *mod, void *module_region) +{ + vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ +} + +void *module_alloc(unsigned long size) +{ + struct vm_struct *area; + + if (!size) + return NULL; + size = PAGE_ALIGN(size); + if (size > MODULES_LEN) + return NULL; + + area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); + if (!area) + return NULL; + + return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); +} +#endif + +/* We don't need anything special. */ +int module_frob_arch_sections(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + char *secstrings, + struct module *mod) +{ + return 0; +} + +int apply_relocate_add(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + unsigned int i; + Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; + Elf64_Sym *sym; + void *loc; + u64 val; + + DEBUGP("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + /* This is where to make the change */ + loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rel[i].r_offset; + + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. */ + sym = (Elf64_Sym *)sechdrs[symindex].sh_addr + + ELF64_R_SYM(rel[i].r_info); + + DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), + sym->st_value, rel[i].r_addend, (u64)loc); + + val = sym->st_value + rel[i].r_addend; + + switch (ELF64_R_TYPE(rel[i].r_info)) { + case R_X86_64_NONE: + break; + case R_X86_64_64: + *(u64 *)loc = val; + break; + case R_X86_64_32: + *(u32 *)loc = val; + if (val != *(u32 *)loc) + goto overflow; + break; + case R_X86_64_32S: + *(s32 *)loc = val; + if ((s64)val != *(s32 *)loc) + goto overflow; + break; + case R_X86_64_PC32: + val -= (u64)loc; + *(u32 *)loc = val; +#if 0 + if ((s64)val != *(s32 *)loc) + goto overflow; +#endif + break; + default: + printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n", + me->name, ELF64_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; + +overflow: + printk(KERN_ERR "overflow in relocation type %d val %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), val); + printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", + me->name); + return -ENOEXEC; +} + +int apply_relocate(Elf_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + printk("non add relocation not supported\n"); + return -ENOSYS; +} + +int module_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; + char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + if (!strcmp(".text", secstrings + s->sh_name)) + text = s; + if (!strcmp(".altinstructions", secstrings + s->sh_name)) + alt = s; + if (!strcmp(".smp_locks", secstrings + s->sh_name)) + locks= s; + } + + if (alt) { + /* patch .altinstructions */ + void *aseg = (void *)alt->sh_addr; + apply_alternatives(aseg, aseg + alt->sh_size); + } + if (locks && text) { + void *lseg = (void *)locks->sh_addr; + void *tseg = (void *)text->sh_addr; + alternatives_smp_module_add(me, me->name, + lseg, lseg + locks->sh_size, + tseg, tseg + text->sh_size); + } + + return module_bug_finalize(hdr, sechdrs, me); +} + +void module_arch_cleanup(struct module *mod) +{ + alternatives_smp_module_del(mod); + module_bug_cleanup(mod); +} diff --git a/arch/x86/kernel/mpparse_64.c b/arch/x86/kernel/mpparse_64.c new file mode 100644 index 0000000..8bf0ca0 --- /dev/null +++ b/arch/x86/kernel/mpparse_64.c @@ -0,0 +1,852 @@ +/* + * Intel Multiprocessor Specification 1.1 and 1.4 + * compliant MP-table parsing routines. + * + * (c) 1995 Alan Cox, Building #3 + * (c) 1998, 1999, 2000 Ingo Molnar + * + * Fixes + * Erich Boleyn : MP v1.4 and additional changes. + * Alan Cox : Added EBDA scanning + * Ingo Molnar : various cleanups and rewrites + * Maciej W. Rozycki: Bits for default MP configurations + * Paul Diefenbaugh: Added full ACPI support + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +/* Have we found an MP table */ +int smp_found_config; + +/* + * Various Linux-internal data structures created from the + * MP-table. + */ +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; + +static int mp_current_pci_id = 0; +/* I/O APIC entries */ +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; + +/* # of MP IRQ source entries */ +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* MP IRQ source entries */ +int mp_irq_entries; + +int nr_ioapics; +unsigned long mp_lapic_addr = 0; + + + +/* Processor that is doing the boot up */ +unsigned int boot_cpu_id = -1U; +/* Internal processor count */ +unsigned int num_processors __cpuinitdata = 0; + +unsigned disabled_cpus __cpuinitdata; + +/* Bitmask of physically existing CPUs */ +physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; + +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; + + +/* + * Intel MP BIOS table parsing routines: + */ + +/* + * Checksum an MP configuration block. + */ + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +static void __cpuinit MP_processor_info (struct mpc_config_processor *m) +{ + int cpu; + cpumask_t tmp_map; + char *bootup_cpu = ""; + + if (!(m->mpc_cpuflag & CPU_ENABLED)) { + disabled_cpus++; + return; + } + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { + bootup_cpu = " (Bootup-CPU)"; + boot_cpu_id = m->mpc_apicid; + } + + printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu); + + if (num_processors >= NR_CPUS) { + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." + " Processor ignored.\n", NR_CPUS); + return; + } + + num_processors++; + cpus_complement(tmp_map, cpu_present_map); + cpu = first_cpu(tmp_map); + + physid_set(m->mpc_apicid, phys_cpu_present_map); + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { + /* + * bios_cpu_apicid is required to have processors listed + * in same order as logical cpu numbers. Hence the first + * entry is BSP, and so on. + */ + cpu = 0; + } + bios_cpu_apicid[cpu] = m->mpc_apicid; + x86_cpu_to_apicid[cpu] = m->mpc_apicid; + + cpu_set(cpu, cpu_possible_map); + cpu_set(cpu, cpu_present_map); +} + +static void __init MP_bus_info (struct mpc_config_bus *m) +{ + char str[7]; + + memcpy(str, m->mpc_bustype, 6); + str[6] = 0; + Dprintk("Bus #%d is %s\n", m->mpc_busid, str); + + if (strncmp(str, "ISA", 3) == 0) { + set_bit(m->mpc_busid, mp_bus_not_pci); + } else if (strncmp(str, "PCI", 3) == 0) { + clear_bit(m->mpc_busid, mp_bus_not_pci); + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; + mp_current_pci_id++; + } else { + printk(KERN_ERR "Unknown bustype %s\n", str); + } +} + +static int bad_ioapic(unsigned long address) +{ + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " + "(found %d)\n", MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!\n"); + } + if (!address) { + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" + " found in table, skipping!\n"); + return 1; + } + return 0; +} + +static void __init MP_ioapic_info (struct mpc_config_ioapic *m) +{ + if (!(m->mpc_flags & MPC_APIC_USABLE)) + return; + + printk("I/O APIC #%d at 0x%X.\n", + m->mpc_apicid, m->mpc_apicaddr); + + if (bad_ioapic(m->mpc_apicaddr)) + return; + + mp_ioapics[nr_ioapics] = *m; + nr_ioapics++; +} + +static void __init MP_intsrc_info (struct mpc_config_intsrc *m) +{ + mp_irqs [mp_irq_entries] = *m; + Dprintk("Int: type %d, pol %d, trig %d, bus %d," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + m->mpc_irqtype, m->mpc_irqflag & 3, + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); + if (++mp_irq_entries >= MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) +{ + Dprintk("Lint: type %d, pol %d, trig %d, bus %d," + " IRQ %02x, APIC ID %x, APIC LINT %02x\n", + m->mpc_irqtype, m->mpc_irqflag & 3, + (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); +} + +/* + * Read/parse the MPC + */ + +static int __init smp_read_mpc(struct mp_config_table *mpc) +{ + char str[16]; + int count=sizeof(*mpc); + unsigned char *mpt=((unsigned char *)mpc)+count; + + if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { + printk("MPTABLE: bad signature [%c%c%c%c]!\n", + mpc->mpc_signature[0], + mpc->mpc_signature[1], + mpc->mpc_signature[2], + mpc->mpc_signature[3]); + return 0; + } + if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { + printk("MPTABLE: checksum error!\n"); + return 0; + } + if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { + printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", + mpc->mpc_spec); + return 0; + } + if (!mpc->mpc_lapic) { + printk(KERN_ERR "MPTABLE: null local APIC address!\n"); + return 0; + } + memcpy(str,mpc->mpc_oem,8); + str[8] = 0; + printk(KERN_INFO "MPTABLE: OEM ID: %s ",str); + + memcpy(str,mpc->mpc_productid,12); + str[12] = 0; + printk("MPTABLE: Product ID: %s ",str); + + printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic); + + /* save the local APIC address, it might be non-default */ + if (!acpi_lapic) + mp_lapic_addr = mpc->mpc_lapic; + + /* + * Now process the configuration blocks. + */ + while (count < mpc->mpc_length) { + switch(*mpt) { + case MP_PROCESSOR: + { + struct mpc_config_processor *m= + (struct mpc_config_processor *)mpt; + if (!acpi_lapic) + MP_processor_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_BUS: + { + struct mpc_config_bus *m= + (struct mpc_config_bus *)mpt; + MP_bus_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_IOAPIC: + { + struct mpc_config_ioapic *m= + (struct mpc_config_ioapic *)mpt; + MP_ioapic_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_INTSRC: + { + struct mpc_config_intsrc *m= + (struct mpc_config_intsrc *)mpt; + + MP_intsrc_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_LINTSRC: + { + struct mpc_config_lintsrc *m= + (struct mpc_config_lintsrc *)mpt; + MP_lintsrc_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + } + } + setup_apic_routing(); + if (!num_processors) + printk(KERN_ERR "MPTABLE: no processors registered!\n"); + return num_processors; +} + +static int __init ELCR_trigger(unsigned int irq) +{ + unsigned int port; + + port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; +} + +static void __init construct_default_ioirq_mptable(int mpc_default_type) +{ + struct mpc_config_intsrc intsrc; + int i; + int ELCR_fallback = 0; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqflag = 0; /* conforming */ + intsrc.mpc_srcbus = 0; + intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; + + intsrc.mpc_irqtype = mp_INT; + + /* + * If true, we have an ISA/PCI system with no IRQ entries + * in the MP table. To prevent the PCI interrupts from being set up + * incorrectly, we try to use the ELCR. The sanity check to see if + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can + * never be level sensitive, so we simply see if the ELCR agrees. + * If it does, we assume it's valid. + */ + if (mpc_default_type == 5) { + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); + + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) + printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n"); + else { + printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); + ELCR_fallback = 1; + } + } + + for (i = 0; i < 16; i++) { + switch (mpc_default_type) { + case 2: + if (i == 0 || i == 13) + continue; /* IRQ0 & IRQ13 not connected */ + /* fall through */ + default: + if (i == 2) + continue; /* IRQ2 is never connected */ + } + + if (ELCR_fallback) { + /* + * If the ELCR indicates a level-sensitive interrupt, we + * copy that information over to the MP table in the + * irqflag field (level sensitive, active high polarity). + */ + if (ELCR_trigger(i)) + intsrc.mpc_irqflag = 13; + else + intsrc.mpc_irqflag = 0; + } + + intsrc.mpc_srcbusirq = i; + intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ + MP_intsrc_info(&intsrc); + } + + intsrc.mpc_irqtype = mp_ExtINT; + intsrc.mpc_srcbusirq = 0; + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ + MP_intsrc_info(&intsrc); +} + +static inline void __init construct_default_ISA_mptable(int mpc_default_type) +{ + struct mpc_config_processor processor; + struct mpc_config_bus bus; + struct mpc_config_ioapic ioapic; + struct mpc_config_lintsrc lintsrc; + int linttypes[2] = { mp_ExtINT, mp_NMI }; + int i; + + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* + * 2 CPUs, numbered 0 & 1. + */ + processor.mpc_type = MP_PROCESSOR; + processor.mpc_apicver = 0; + processor.mpc_cpuflag = CPU_ENABLED; + processor.mpc_cpufeature = 0; + processor.mpc_featureflag = 0; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; + for (i = 0; i < 2; i++) { + processor.mpc_apicid = i; + MP_processor_info(&processor); + } + + bus.mpc_type = MP_BUS; + bus.mpc_busid = 0; + switch (mpc_default_type) { + default: + printk(KERN_ERR "???\nUnknown standard configuration %d\n", + mpc_default_type); + /* fall through */ + case 1: + case 5: + memcpy(bus.mpc_bustype, "ISA ", 6); + break; + } + MP_bus_info(&bus); + if (mpc_default_type > 4) { + bus.mpc_busid = 1; + memcpy(bus.mpc_bustype, "PCI ", 6); + MP_bus_info(&bus); + } + + ioapic.mpc_type = MP_IOAPIC; + ioapic.mpc_apicid = 2; + ioapic.mpc_apicver = 0; + ioapic.mpc_flags = MPC_APIC_USABLE; + ioapic.mpc_apicaddr = 0xFEC00000; + MP_ioapic_info(&ioapic); + + /* + * We set up most of the low 16 IO-APIC pins according to MPS rules. + */ + construct_default_ioirq_mptable(mpc_default_type); + + lintsrc.mpc_type = MP_LINTSRC; + lintsrc.mpc_irqflag = 0; /* conforming */ + lintsrc.mpc_srcbusid = 0; + lintsrc.mpc_srcbusirq = 0; + lintsrc.mpc_destapic = MP_APIC_ALL; + for (i = 0; i < 2; i++) { + lintsrc.mpc_irqtype = linttypes[i]; + lintsrc.mpc_destapiclint = i; + MP_lintsrc_info(&lintsrc); + } +} + +static struct intel_mp_floating *mpf_found; + +/* + * Scan the memory blocks for an SMP configuration block. + */ +void __init get_smp_config (void) +{ + struct intel_mp_floating *mpf = mpf_found; + + /* + * ACPI supports both logical (e.g. Hyper-Threading) and physical + * processors, where MPS only supports physical. + */ + if (acpi_lapic && acpi_ioapic) { + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); + return; + } + else if (acpi_lapic) + printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); + + printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); + + /* + * Now see if we need to read further. + */ + if (mpf->mpf_feature1 != 0) { + + printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1); + construct_default_ISA_mptable(mpf->mpf_feature1); + + } else if (mpf->mpf_physptr) { + + /* + * Read the physical hardware table. Anything here will + * override the defaults. + */ + if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) { + smp_found_config = 0; + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); + printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); + return; + } + /* + * If there are no explicit MP IRQ entries, then we are + * broken. We set up most of the low 16 IO-APIC pins to + * ISA defaults and hope it will work. + */ + if (!mp_irq_entries) { + struct mpc_config_bus bus; + + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); + + bus.mpc_type = MP_BUS; + bus.mpc_busid = 0; + memcpy(bus.mpc_bustype, "ISA ", 6); + MP_bus_info(&bus); + + construct_default_ioirq_mptable(0); + } + + } else + BUG(); + + printk(KERN_INFO "Processors: %d\n", num_processors); + /* + * Only use the first configuration found. + */ +} + +static int __init smp_scan_config (unsigned long base, unsigned long length) +{ + extern void __bad_mpf_size(void); + unsigned int *bp = phys_to_virt(base); + struct intel_mp_floating *mpf; + + Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); + if (sizeof(*mpf) != 16) + __bad_mpf_size(); + + while (length > 0) { + mpf = (struct intel_mp_floating *)bp; + if ((*bp == SMP_MAGIC_IDENT) && + (mpf->mpf_length == 1) && + !mpf_checksum((unsigned char *)bp, 16) && + ((mpf->mpf_specification == 1) + || (mpf->mpf_specification == 4)) ) { + + smp_found_config = 1; + reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE); + if (mpf->mpf_physptr) + reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE); + mpf_found = mpf; + return 1; + } + bp += 4; + length -= 16; + } + return 0; +} + +void __init find_smp_config(void) +{ + unsigned int address; + + /* + * FIXME: Linux assumes you have 640K of base ram.. + * this continues the error... + * + * 1) Scan the bottom 1K for a signature + * 2) Scan the top 1K of base RAM + * 3) Scan the 64K of bios + */ + if (smp_scan_config(0x0,0x400) || + smp_scan_config(639*0x400,0x400) || + smp_scan_config(0xF0000,0x10000)) + return; + /* + * If it is an SMP machine we should know now. + * + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E, calculate and scan it here. + * + * NOTE! There are Linux loaders that will corrupt the EBDA + * area, and as such this kind of SMP config may be less + * trustworthy, simply because the SMP table may have been + * stomped on during early boot. These loaders are buggy and + * should be fixed. + */ + + address = *(unsigned short *)phys_to_virt(0x40E); + address <<= 4; + if (smp_scan_config(address, 0x1000)) + return; + + /* If we have come this far, we did not find an MP table */ + printk(KERN_INFO "No mptable found.\n"); +} + +/* -------------------------------------------------------------------------- + ACPI-based MP Configuration + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI + +void __init mp_register_lapic_address(u64 address) +{ + mp_lapic_addr = (unsigned long) address; + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); + if (boot_cpu_id == -1U) + boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); +} + +void __cpuinit mp_register_lapic (u8 id, u8 enabled) +{ + struct mpc_config_processor processor; + int boot_cpu = 0; + + if (id == boot_cpu_id) + boot_cpu = 1; + + processor.mpc_type = MP_PROCESSOR; + processor.mpc_apicid = id; + processor.mpc_apicver = 0; + processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); + processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); + processor.mpc_cpufeature = 0; + processor.mpc_featureflag = 0; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; + + MP_processor_info(&processor); +} + +#define MP_ISA_BUS 0 +#define MP_MAX_IOAPIC_PIN 127 + +static struct mp_ioapic_routing { + int apic_id; + int gsi_start; + int gsi_end; + u32 pin_programmed[4]; +} mp_ioapic_routing[MAX_IO_APICS]; + +static int mp_find_ioapic(int gsi) +{ + int i = 0; + + /* Find the IOAPIC that manages this GSI. */ + for (i = 0; i < nr_ioapics; i++) { + if ((gsi >= mp_ioapic_routing[i].gsi_start) + && (gsi <= mp_ioapic_routing[i].gsi_end)) + return i; + } + + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + return -1; +} + +static u8 uniq_ioapic_id(u8 id) +{ + int i; + DECLARE_BITMAP(used, 256); + bitmap_zero(used, 256); + for (i = 0; i < nr_ioapics; i++) { + struct mpc_config_ioapic *ia = &mp_ioapics[i]; + __set_bit(ia->mpc_apicid, used); + } + if (!test_bit(id, used)) + return id; + return find_first_zero_bit(used, 256); +} + +void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) +{ + int idx = 0; + + if (bad_ioapic(address)) + return; + + idx = nr_ioapics; + + mp_ioapics[idx].mpc_type = MP_IOAPIC; + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; + mp_ioapics[idx].mpc_apicaddr = address; + + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id); + mp_ioapics[idx].mpc_apicver = 0; + + /* + * Build basic IRQ lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI IRQs). + */ + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; + mp_ioapic_routing[idx].gsi_start = gsi_base; + mp_ioapic_routing[idx].gsi_end = gsi_base + + io_apic_get_redir_entries(idx); + + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, " + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, + mp_ioapics[idx].mpc_apicaddr, + mp_ioapic_routing[idx].gsi_start, + mp_ioapic_routing[idx].gsi_end); + + nr_ioapics++; +} + +void __init +mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) +{ + struct mpc_config_intsrc intsrc; + int ioapic = -1; + int pin = -1; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return; + pin = gsi - mp_ioapic_routing[ioapic].gsi_start; + + /* + * TBD: This check is for faulty timer entries, where the override + * erroneously sets the trigger to level, resulting in a HUGE + * increase of timer interrupts! + */ + if ((bus_irq == 0) && (trigger == 3)) + trigger = 1; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_irqflag = (trigger << 2) | polarity; + intsrc.mpc_srcbus = MP_ISA_BUS; + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ + intsrc.mpc_dstirq = pin; /* INTIN# */ + + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", + intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); + + mp_irqs[mp_irq_entries] = intsrc; + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!\n"); +} + +void __init mp_config_acpi_legacy_irqs(void) +{ + struct mpc_config_intsrc intsrc; + int i = 0; + int ioapic = -1; + + /* + * Fabricate the legacy ISA bus (bus #31). + */ + set_bit(MP_ISA_BUS, mp_bus_not_pci); + + /* + * Locate the IOAPIC that manages the ISA IRQs (0-15). + */ + ioapic = mp_find_ioapic(0); + if (ioapic < 0) + return; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqflag = 0; /* Conforming */ + intsrc.mpc_srcbus = MP_ISA_BUS; + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; + + /* + * Use the default configuration for the IRQs 0-15. Unless + * overridden by (MADT) interrupt source override entries. + */ + for (i = 0; i < 16; i++) { + int idx; + + for (idx = 0; idx < mp_irq_entries; idx++) { + struct mpc_config_intsrc *irq = mp_irqs + idx; + + /* Do we already have a mapping for this ISA IRQ? */ + if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) + break; + + /* Do we already have a mapping for this IOAPIC pin */ + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && + (irq->mpc_dstirq == i)) + break; + } + + if (idx != mp_irq_entries) { + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); + continue; /* IRQ already used */ + } + + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_srcbusirq = i; /* Identity mapped */ + intsrc.mpc_dstirq = i; + + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " + "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, + intsrc.mpc_dstirq); + + mp_irqs[mp_irq_entries] = intsrc; + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!\n"); + } +} + +int mp_register_gsi(u32 gsi, int triggering, int polarity) +{ + int ioapic = -1; + int ioapic_pin = 0; + int idx, bit = 0; + + if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) + return gsi; + + /* Don't set up the ACPI SCI because it's already set up */ + if (acpi_gbl_FADT.sci_interrupt == gsi) + return gsi; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) { + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); + return gsi; + } + + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start; + + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ + bit = ioapic_pin % 32; + idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); + if (idx > 3) { + printk(KERN_ERR "Invalid reference to IOAPIC pin " + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, + ioapic_pin); + return gsi; + } + if ((1< + * + * Fixes: + * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. + * Mikael Pettersson : Power Management for local APIC NMI watchdog. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. Disable/enable API. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +int unknown_nmi_panic; +int nmi_watchdog_enabled; +int panic_on_unrecovered_nmi; + +static cpumask_t backtrace_mask = CPU_MASK_NONE; + +/* nmi_active: + * >0: the lapic NMI watchdog is active, but can be disabled + * <0: the lapic NMI watchdog has not been set up, and cannot + * be enabled + * 0: the lapic NMI watchdog is disabled, but can be enabled + */ +atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ +int panic_on_timeout; + +unsigned int nmi_watchdog = NMI_DEFAULT; +static unsigned int nmi_hz = HZ; + +static DEFINE_PER_CPU(short, wd_enabled); + +/* local prototypes */ +static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); + +/* Run after command line and cpu_init init, but before all other checks */ +void nmi_watchdog_default(void) +{ + if (nmi_watchdog != NMI_DEFAULT) + return; + nmi_watchdog = NMI_NONE; +} + +static int endflag __initdata = 0; + +#ifdef CONFIG_SMP +/* The performance counters used by NMI_LOCAL_APIC don't trigger when + * the CPU is idle. To make sure the NMI watchdog really ticks on all + * CPUs during the test make them busy. + */ +static __init void nmi_cpu_busy(void *data) +{ + local_irq_enable_in_hardirq(); + /* Intentionally don't use cpu_relax here. This is + to make sure that the performance counter really ticks, + even if there is a simulator or similar that catches the + pause instruction. On a real HT machine this is fine because + all other CPUs are busy with "useless" delay loops and don't + care if they get somewhat less cycles. */ + while (endflag == 0) + mb(); +} +#endif + +int __init check_nmi_watchdog (void) +{ + int *counts; + int cpu; + + if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) + return 0; + + if (!atomic_read(&nmi_active)) + return 0; + + counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); + if (!counts) + return -1; + + printk(KERN_INFO "testing NMI watchdog ... "); + +#ifdef CONFIG_SMP + if (nmi_watchdog == NMI_LOCAL_APIC) + smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); +#endif + + for (cpu = 0; cpu < NR_CPUS; cpu++) + counts[cpu] = cpu_pda(cpu)->__nmi_count; + local_irq_enable(); + mdelay((20*1000)/nmi_hz); // wait 20 ticks + + for_each_online_cpu(cpu) { + if (!per_cpu(wd_enabled, cpu)) + continue; + if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) { + printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", + cpu, + counts[cpu], + cpu_pda(cpu)->__nmi_count); + per_cpu(wd_enabled, cpu) = 0; + atomic_dec(&nmi_active); + } + } + if (!atomic_read(&nmi_active)) { + kfree(counts); + atomic_set(&nmi_active, -1); + endflag = 1; + return -1; + } + endflag = 1; + printk("OK.\n"); + + /* now that we know it works we can reduce NMI frequency to + something more reasonable; makes a difference in some configs */ + if (nmi_watchdog == NMI_LOCAL_APIC) + nmi_hz = lapic_adjust_nmi_hz(1); + + kfree(counts); + return 0; +} + +int __init setup_nmi_watchdog(char *str) +{ + int nmi; + + if (!strncmp(str,"panic",5)) { + panic_on_timeout = 1; + str = strchr(str, ','); + if (!str) + return 1; + ++str; + } + + get_option(&str, &nmi); + + if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) + return 0; + + nmi_watchdog = nmi; + return 1; +} + +__setup("nmi_watchdog=", setup_nmi_watchdog); + + +static void __acpi_nmi_disable(void *__unused) +{ + apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); +} + +/* + * Disable timer based NMIs on all CPUs: + */ +void acpi_nmi_disable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); +} + +static void __acpi_nmi_enable(void *__unused) +{ + apic_write(APIC_LVT0, APIC_DM_NMI); +} + +/* + * Enable timer based NMIs on all CPUs: + */ +void acpi_nmi_enable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); +} +#ifdef CONFIG_PM + +static int nmi_pm_active; /* nmi_active before suspend */ + +static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) +{ + /* only CPU0 goes here, other CPUs should be offline */ + nmi_pm_active = atomic_read(&nmi_active); + stop_apic_nmi_watchdog(NULL); + BUG_ON(atomic_read(&nmi_active) != 0); + return 0; +} + +static int lapic_nmi_resume(struct sys_device *dev) +{ + /* only CPU0 goes here, other CPUs should be offline */ + if (nmi_pm_active > 0) { + setup_apic_nmi_watchdog(NULL); + touch_nmi_watchdog(); + } + return 0; +} + +static struct sysdev_class nmi_sysclass = { + set_kset_name("lapic_nmi"), + .resume = lapic_nmi_resume, + .suspend = lapic_nmi_suspend, +}; + +static struct sys_device device_lapic_nmi = { + .id = 0, + .cls = &nmi_sysclass, +}; + +static int __init init_lapic_nmi_sysfs(void) +{ + int error; + + /* should really be a BUG_ON but b/c this is an + * init call, it just doesn't work. -dcz + */ + if (nmi_watchdog != NMI_LOCAL_APIC) + return 0; + + if ( atomic_read(&nmi_active) < 0 ) + return 0; + + error = sysdev_class_register(&nmi_sysclass); + if (!error) + error = sysdev_register(&device_lapic_nmi); + return error; +} +/* must come after the local APIC's device_initcall() */ +late_initcall(init_lapic_nmi_sysfs); + +#endif /* CONFIG_PM */ + +void setup_apic_nmi_watchdog(void *unused) +{ + if (__get_cpu_var(wd_enabled) == 1) + return; + + /* cheap hack to support suspend/resume */ + /* if cpu0 is not active neither should the other cpus */ + if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) + return; + + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + __get_cpu_var(wd_enabled) = 1; + if (lapic_watchdog_init(nmi_hz) < 0) { + __get_cpu_var(wd_enabled) = 0; + return; + } + /* FALL THROUGH */ + case NMI_IO_APIC: + __get_cpu_var(wd_enabled) = 1; + atomic_inc(&nmi_active); + } +} + +void stop_apic_nmi_watchdog(void *unused) +{ + /* only support LOCAL and IO APICs for now */ + if ((nmi_watchdog != NMI_LOCAL_APIC) && + (nmi_watchdog != NMI_IO_APIC)) + return; + if (__get_cpu_var(wd_enabled) == 0) + return; + if (nmi_watchdog == NMI_LOCAL_APIC) + lapic_watchdog_stop(); + __get_cpu_var(wd_enabled) = 0; + atomic_dec(&nmi_active); +} + +/* + * the best way to detect whether a CPU has a 'hard lockup' problem + * is to check it's local APIC timer IRQ counts. If they are not + * changing then that CPU has some problem. + * + * as these watchdog NMI IRQs are generated on every CPU, we only + * have to check the current processor. + */ + +static DEFINE_PER_CPU(unsigned, last_irq_sum); +static DEFINE_PER_CPU(local_t, alert_counter); +static DEFINE_PER_CPU(int, nmi_touch); + +void touch_nmi_watchdog(void) +{ + if (nmi_watchdog > 0) { + unsigned cpu; + + /* + * Tell other CPUs to reset their alert counters. We cannot + * do it ourselves because the alert count increase is not + * atomic. + */ + for_each_present_cpu(cpu) { + if (per_cpu(nmi_touch, cpu) != 1) + per_cpu(nmi_touch, cpu) = 1; + } + } + + touch_softlockup_watchdog(); +} + +int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) +{ + int sum; + int touched = 0; + int cpu = smp_processor_id(); + int rc = 0; + + /* check for other users first */ + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) { + rc = 1; + touched = 1; + } + + sum = read_pda(apic_timer_irqs); + if (__get_cpu_var(nmi_touch)) { + __get_cpu_var(nmi_touch) = 0; + touched = 1; + } + + if (cpu_isset(cpu, backtrace_mask)) { + static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + + spin_lock(&lock); + printk("NMI backtrace for cpu %d\n", cpu); + dump_stack(); + spin_unlock(&lock); + cpu_clear(cpu, backtrace_mask); + } + +#ifdef CONFIG_X86_MCE + /* Could check oops_in_progress here too, but it's safer + not too */ + if (atomic_read(&mce_entry) > 0) + touched = 1; +#endif + /* if the apic timer isn't firing, this cpu isn't doing much */ + if (!touched && __get_cpu_var(last_irq_sum) == sum) { + /* + * Ayiee, looks like this CPU is stuck ... + * wait a few IRQs (5 seconds) before doing the oops ... + */ + local_inc(&__get_cpu_var(alert_counter)); + if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) + die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs, + panic_on_timeout); + } else { + __get_cpu_var(last_irq_sum) = sum; + local_set(&__get_cpu_var(alert_counter), 0); + } + + /* see if the nmi watchdog went off */ + if (!__get_cpu_var(wd_enabled)) + return rc; + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + rc |= lapic_wd_event(nmi_hz); + break; + case NMI_IO_APIC: + /* don't know how to accurately check for this. + * just assume it was a watchdog timer interrupt + * This matches the old behaviour. + */ + rc = 1; + break; + } + return rc; +} + +static unsigned ignore_nmis; + +asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) +{ + nmi_enter(); + add_pda(__nmi_count,1); + if (!ignore_nmis) + default_do_nmi(regs); + nmi_exit(); +} + +int do_nmi_callback(struct pt_regs * regs, int cpu) +{ +#ifdef CONFIG_SYSCTL + if (unknown_nmi_panic) + return unknown_nmi_panic_callback(regs, cpu); +#endif + return 0; +} + +void stop_nmi(void) +{ + acpi_nmi_disable(); + ignore_nmis++; +} + +void restart_nmi(void) +{ + ignore_nmis--; + acpi_nmi_enable(); +} + +#ifdef CONFIG_SYSCTL + +static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) +{ + unsigned char reason = get_nmi_reason(); + char buf[64]; + + sprintf(buf, "NMI received for unknown reason %02x\n", reason); + die_nmi(buf, regs, 1); /* Always panic here */ + return 0; +} + +/* + * proc handler for /proc/sys/kernel/nmi + */ +int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int old_state; + + nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; + old_state = nmi_watchdog_enabled; + proc_dointvec(table, write, file, buffer, length, ppos); + if (!!old_state == !!nmi_watchdog_enabled) + return 0; + + if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) { + printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); + return -EIO; + } + + /* if nmi_watchdog is not set yet, then set it */ + nmi_watchdog_default(); + + if (nmi_watchdog == NMI_LOCAL_APIC) { + if (nmi_watchdog_enabled) + enable_lapic_nmi_watchdog(); + else + disable_lapic_nmi_watchdog(); + } else { + printk( KERN_WARNING + "NMI watchdog doesn't know what hardware to touch\n"); + return -EIO; + } + return 0; +} + +#endif + +void __trigger_all_cpu_backtrace(void) +{ + int i; + + backtrace_mask = cpu_online_map; + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ + for (i = 0; i < 10 * 1000; i++) { + if (cpus_empty(backtrace_mask)) + break; + mdelay(1); + } +} + +EXPORT_SYMBOL(nmi_active); +EXPORT_SYMBOL(nmi_watchdog); +EXPORT_SYMBOL(touch_nmi_watchdog); diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c new file mode 100644 index 0000000..71da01e --- /dev/null +++ b/arch/x86/kernel/pci-calgary_64.c @@ -0,0 +1,1578 @@ +/* + * Derived from arch/powerpc/kernel/iommu.c + * + * Copyright IBM Corporation, 2006-2007 + * Copyright (C) 2006 Jon Mason + * + * Author: Jon Mason + * Author: Muli Ben-Yehuda + + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT +int use_calgary __read_mostly = 1; +#else +int use_calgary __read_mostly = 0; +#endif /* CONFIG_CALGARY_DEFAULT_ENABLED */ + +#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1 +#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308 + +/* register offsets inside the host bridge space */ +#define CALGARY_CONFIG_REG 0x0108 +#define PHB_CSR_OFFSET 0x0110 /* Channel Status */ +#define PHB_PLSSR_OFFSET 0x0120 +#define PHB_CONFIG_RW_OFFSET 0x0160 +#define PHB_IOBASE_BAR_LOW 0x0170 +#define PHB_IOBASE_BAR_HIGH 0x0180 +#define PHB_MEM_1_LOW 0x0190 +#define PHB_MEM_1_HIGH 0x01A0 +#define PHB_IO_ADDR_SIZE 0x01B0 +#define PHB_MEM_1_SIZE 0x01C0 +#define PHB_MEM_ST_OFFSET 0x01D0 +#define PHB_AER_OFFSET 0x0200 +#define PHB_CONFIG_0_HIGH 0x0220 +#define PHB_CONFIG_0_LOW 0x0230 +#define PHB_CONFIG_0_END 0x0240 +#define PHB_MEM_2_LOW 0x02B0 +#define PHB_MEM_2_HIGH 0x02C0 +#define PHB_MEM_2_SIZE_HIGH 0x02D0 +#define PHB_MEM_2_SIZE_LOW 0x02E0 +#define PHB_DOSHOLE_OFFSET 0x08E0 + +/* CalIOC2 specific */ +#define PHB_SAVIOR_L2 0x0DB0 +#define PHB_PAGE_MIG_CTRL 0x0DA8 +#define PHB_PAGE_MIG_DEBUG 0x0DA0 +#define PHB_ROOT_COMPLEX_STATUS 0x0CB0 + +/* PHB_CONFIG_RW */ +#define PHB_TCE_ENABLE 0x20000000 +#define PHB_SLOT_DISABLE 0x1C000000 +#define PHB_DAC_DISABLE 0x01000000 +#define PHB_MEM2_ENABLE 0x00400000 +#define PHB_MCSR_ENABLE 0x00100000 +/* TAR (Table Address Register) */ +#define TAR_SW_BITS 0x0000ffffffff800fUL +#define TAR_VALID 0x0000000000000008UL +/* CSR (Channel/DMA Status Register) */ +#define CSR_AGENT_MASK 0xffe0ffff +/* CCR (Calgary Configuration Register) */ +#define CCR_2SEC_TIMEOUT 0x000000000000000EUL +/* PMCR/PMDR (Page Migration Control/Debug Registers */ +#define PMR_SOFTSTOP 0x80000000 +#define PMR_SOFTSTOPFAULT 0x40000000 +#define PMR_HARDSTOP 0x20000000 + +#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ +#define MAX_NUM_CHASSIS 8 /* max number of chassis */ +/* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */ +#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) +#define PHBS_PER_CALGARY 4 + +/* register offsets in Calgary's internal register space */ +static const unsigned long tar_offsets[] = { + 0x0580 /* TAR0 */, + 0x0588 /* TAR1 */, + 0x0590 /* TAR2 */, + 0x0598 /* TAR3 */ +}; + +static const unsigned long split_queue_offsets[] = { + 0x4870 /* SPLIT QUEUE 0 */, + 0x5870 /* SPLIT QUEUE 1 */, + 0x6870 /* SPLIT QUEUE 2 */, + 0x7870 /* SPLIT QUEUE 3 */ +}; + +static const unsigned long phb_offsets[] = { + 0x8000 /* PHB0 */, + 0x9000 /* PHB1 */, + 0xA000 /* PHB2 */, + 0xB000 /* PHB3 */ +}; + +/* PHB debug registers */ + +static const unsigned long phb_debug_offsets[] = { + 0x4000 /* PHB 0 DEBUG */, + 0x5000 /* PHB 1 DEBUG */, + 0x6000 /* PHB 2 DEBUG */, + 0x7000 /* PHB 3 DEBUG */ +}; + +/* + * STUFF register for each debug PHB, + * byte 1 = start bus number, byte 2 = end bus number + */ + +#define PHB_DEBUG_STUFF_OFFSET 0x0020 + +#define EMERGENCY_PAGES 32 /* = 128KB */ + +unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; +static int translate_empty_slots __read_mostly = 0; +static int calgary_detected __read_mostly = 0; + +static struct rio_table_hdr *rio_table_hdr __initdata; +static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; +static struct rio_detail *rio_devs[MAX_NUMNODES * 4] __initdata; + +struct calgary_bus_info { + void *tce_space; + unsigned char translation_disabled; + signed char phbid; + void __iomem *bbar; +}; + +static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); +static void calgary_tce_cache_blast(struct iommu_table *tbl); +static void calgary_dump_error_regs(struct iommu_table *tbl); +static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); +static void calioc2_tce_cache_blast(struct iommu_table *tbl); +static void calioc2_dump_error_regs(struct iommu_table *tbl); + +static struct cal_chipset_ops calgary_chip_ops = { + .handle_quirks = calgary_handle_quirks, + .tce_cache_blast = calgary_tce_cache_blast, + .dump_error_regs = calgary_dump_error_regs +}; + +static struct cal_chipset_ops calioc2_chip_ops = { + .handle_quirks = calioc2_handle_quirks, + .tce_cache_blast = calioc2_tce_cache_blast, + .dump_error_regs = calioc2_dump_error_regs +}; + +static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; + +/* enable this to stress test the chip's TCE cache */ +#ifdef CONFIG_IOMMU_DEBUG +int debugging __read_mostly = 1; + +static inline unsigned long verify_bit_range(unsigned long* bitmap, + int expected, unsigned long start, unsigned long end) +{ + unsigned long idx = start; + + BUG_ON(start >= end); + + while (idx < end) { + if (!!test_bit(idx, bitmap) != expected) + return idx; + ++idx; + } + + /* all bits have the expected value */ + return ~0UL; +} +#else /* debugging is disabled */ +int debugging __read_mostly = 0; + +static inline unsigned long verify_bit_range(unsigned long* bitmap, + int expected, unsigned long start, unsigned long end) +{ + return ~0UL; +} + +#endif /* CONFIG_IOMMU_DEBUG */ + +static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) +{ + unsigned int npages; + + npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK); + npages >>= PAGE_SHIFT; + + return npages; +} + +static inline int translate_phb(struct pci_dev* dev) +{ + int disabled = bus_info[dev->bus->number].translation_disabled; + return !disabled; +} + +static void iommu_range_reserve(struct iommu_table *tbl, + unsigned long start_addr, unsigned int npages) +{ + unsigned long index; + unsigned long end; + unsigned long badbit; + unsigned long flags; + + index = start_addr >> PAGE_SHIFT; + + /* bail out if we're asked to reserve a region we don't cover */ + if (index >= tbl->it_size) + return; + + end = index + npages; + if (end > tbl->it_size) /* don't go off the table */ + end = tbl->it_size; + + spin_lock_irqsave(&tbl->it_lock, flags); + + badbit = verify_bit_range(tbl->it_map, 0, index, end); + if (badbit != ~0UL) { + if (printk_ratelimit()) + printk(KERN_ERR "Calgary: entry already allocated at " + "0x%lx tbl %p dma 0x%lx npages %u\n", + badbit, tbl, start_addr, npages); + } + + set_bit_string(tbl->it_map, index, npages); + + spin_unlock_irqrestore(&tbl->it_lock, flags); +} + +static unsigned long iommu_range_alloc(struct iommu_table *tbl, + unsigned int npages) +{ + unsigned long flags; + unsigned long offset; + + BUG_ON(npages == 0); + + spin_lock_irqsave(&tbl->it_lock, flags); + + offset = find_next_zero_string(tbl->it_map, tbl->it_hint, + tbl->it_size, npages); + if (offset == ~0UL) { + tbl->chip_ops->tce_cache_blast(tbl); + offset = find_next_zero_string(tbl->it_map, 0, + tbl->it_size, npages); + if (offset == ~0UL) { + printk(KERN_WARNING "Calgary: IOMMU full.\n"); + spin_unlock_irqrestore(&tbl->it_lock, flags); + if (panic_on_overflow) + panic("Calgary: fix the allocator.\n"); + else + return bad_dma_address; + } + } + + set_bit_string(tbl->it_map, offset, npages); + tbl->it_hint = offset + npages; + BUG_ON(tbl->it_hint > tbl->it_size); + + spin_unlock_irqrestore(&tbl->it_lock, flags); + + return offset; +} + +static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr, + unsigned int npages, int direction) +{ + unsigned long entry; + dma_addr_t ret = bad_dma_address; + + entry = iommu_range_alloc(tbl, npages); + + if (unlikely(entry == bad_dma_address)) + goto error; + + /* set the return dma address */ + ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); + + /* put the TCEs in the HW table */ + tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, + direction); + + return ret; + +error: + printk(KERN_WARNING "Calgary: failed to allocate %u pages in " + "iommu %p\n", npages, tbl); + return bad_dma_address; +} + +static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) +{ + unsigned long entry; + unsigned long badbit; + unsigned long badend; + unsigned long flags; + + /* were we called with bad_dma_address? */ + badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); + if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { + printk(KERN_ERR "Calgary: driver tried unmapping bad DMA " + "address 0x%Lx\n", dma_addr); + WARN_ON(1); + return; + } + + entry = dma_addr >> PAGE_SHIFT; + + BUG_ON(entry + npages > tbl->it_size); + + tce_free(tbl, entry, npages); + + spin_lock_irqsave(&tbl->it_lock, flags); + + badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages); + if (badbit != ~0UL) { + if (printk_ratelimit()) + printk(KERN_ERR "Calgary: bit is off at 0x%lx " + "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", + badbit, tbl, dma_addr, entry, npages); + } + + __clear_bit_string(tbl->it_map, entry, npages); + + spin_unlock_irqrestore(&tbl->it_lock, flags); +} + +static inline struct iommu_table *find_iommu_table(struct device *dev) +{ + struct pci_dev *pdev; + struct pci_bus *pbus; + struct iommu_table *tbl; + + pdev = to_pci_dev(dev); + + pbus = pdev->bus; + + /* is the device behind a bridge? Look for the root bus */ + while (pbus->parent) + pbus = pbus->parent; + + tbl = pci_iommu(pbus); + + BUG_ON(tbl && (tbl->it_busno != pbus->number)); + + return tbl; +} + +static void calgary_unmap_sg(struct device *dev, + struct scatterlist *sglist, int nelems, int direction) +{ + struct iommu_table *tbl = find_iommu_table(dev); + + if (!translate_phb(to_pci_dev(dev))) + return; + + while (nelems--) { + unsigned int npages; + dma_addr_t dma = sglist->dma_address; + unsigned int dmalen = sglist->dma_length; + + if (dmalen == 0) + break; + + npages = num_dma_pages(dma, dmalen); + iommu_free(tbl, dma, npages); + sglist++; + } +} + +static int calgary_nontranslate_map_sg(struct device* dev, + struct scatterlist *sg, int nelems, int direction) +{ + int i; + + for (i = 0; i < nelems; i++ ) { + struct scatterlist *s = &sg[i]; + BUG_ON(!s->page); + s->dma_address = virt_to_bus(page_address(s->page) +s->offset); + s->dma_length = s->length; + } + return nelems; +} + +static int calgary_map_sg(struct device *dev, struct scatterlist *sg, + int nelems, int direction) +{ + struct iommu_table *tbl = find_iommu_table(dev); + unsigned long vaddr; + unsigned int npages; + unsigned long entry; + int i; + + if (!translate_phb(to_pci_dev(dev))) + return calgary_nontranslate_map_sg(dev, sg, nelems, direction); + + for (i = 0; i < nelems; i++ ) { + struct scatterlist *s = &sg[i]; + BUG_ON(!s->page); + + vaddr = (unsigned long)page_address(s->page) + s->offset; + npages = num_dma_pages(vaddr, s->length); + + entry = iommu_range_alloc(tbl, npages); + if (entry == bad_dma_address) { + /* makes sure unmap knows to stop */ + s->dma_length = 0; + goto error; + } + + s->dma_address = (entry << PAGE_SHIFT) | s->offset; + + /* insert into HW table */ + tce_build(tbl, entry, npages, vaddr & PAGE_MASK, + direction); + + s->dma_length = s->length; + } + + return nelems; +error: + calgary_unmap_sg(dev, sg, nelems, direction); + for (i = 0; i < nelems; i++) { + sg[i].dma_address = bad_dma_address; + sg[i].dma_length = 0; + } + return 0; +} + +static dma_addr_t calgary_map_single(struct device *dev, void *vaddr, + size_t size, int direction) +{ + dma_addr_t dma_handle = bad_dma_address; + unsigned long uaddr; + unsigned int npages; + struct iommu_table *tbl = find_iommu_table(dev); + + uaddr = (unsigned long)vaddr; + npages = num_dma_pages(uaddr, size); + + if (translate_phb(to_pci_dev(dev))) + dma_handle = iommu_alloc(tbl, vaddr, npages, direction); + else + dma_handle = virt_to_bus(vaddr); + + return dma_handle; +} + +static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, + size_t size, int direction) +{ + struct iommu_table *tbl = find_iommu_table(dev); + unsigned int npages; + + if (!translate_phb(to_pci_dev(dev))) + return; + + npages = num_dma_pages(dma_handle, size); + iommu_free(tbl, dma_handle, npages); +} + +static void* calgary_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + void *ret = NULL; + dma_addr_t mapping; + unsigned int npages, order; + struct iommu_table *tbl = find_iommu_table(dev); + + size = PAGE_ALIGN(size); /* size rounded up to full pages */ + npages = size >> PAGE_SHIFT; + order = get_order(size); + + /* alloc enough pages (and possibly more) */ + ret = (void *)__get_free_pages(flag, order); + if (!ret) + goto error; + memset(ret, 0, size); + + if (translate_phb(to_pci_dev(dev))) { + /* set up tces to cover the allocated range */ + mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL); + if (mapping == bad_dma_address) + goto free; + + *dma_handle = mapping; + } else /* non translated slot */ + *dma_handle = virt_to_bus(ret); + + return ret; + +free: + free_pages((unsigned long)ret, get_order(size)); + ret = NULL; +error: + return ret; +} + +static const struct dma_mapping_ops calgary_dma_ops = { + .alloc_coherent = calgary_alloc_coherent, + .map_single = calgary_map_single, + .unmap_single = calgary_unmap_single, + .map_sg = calgary_map_sg, + .unmap_sg = calgary_unmap_sg, +}; + +static inline void __iomem * busno_to_bbar(unsigned char num) +{ + return bus_info[num].bbar; +} + +static inline int busno_to_phbid(unsigned char num) +{ + return bus_info[num].phbid; +} + +static inline unsigned long split_queue_offset(unsigned char num) +{ + size_t idx = busno_to_phbid(num); + + return split_queue_offsets[idx]; +} + +static inline unsigned long tar_offset(unsigned char num) +{ + size_t idx = busno_to_phbid(num); + + return tar_offsets[idx]; +} + +static inline unsigned long phb_offset(unsigned char num) +{ + size_t idx = busno_to_phbid(num); + + return phb_offsets[idx]; +} + +static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset) +{ + unsigned long target = ((unsigned long)bar) | offset; + return (void __iomem*)target; +} + +static inline int is_calioc2(unsigned short device) +{ + return (device == PCI_DEVICE_ID_IBM_CALIOC2); +} + +static inline int is_calgary(unsigned short device) +{ + return (device == PCI_DEVICE_ID_IBM_CALGARY); +} + +static inline int is_cal_pci_dev(unsigned short device) +{ + return (is_calgary(device) || is_calioc2(device)); +} + +static void calgary_tce_cache_blast(struct iommu_table *tbl) +{ + u64 val; + u32 aer; + int i = 0; + void __iomem *bbar = tbl->bbar; + void __iomem *target; + + /* disable arbitration on the bus */ + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); + aer = readl(target); + writel(0, target); + + /* read plssr to ensure it got there */ + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); + val = readl(target); + + /* poll split queues until all DMA activity is done */ + target = calgary_reg(bbar, split_queue_offset(tbl->it_busno)); + do { + val = readq(target); + i++; + } while ((val & 0xff) != 0xff && i < 100); + if (i == 100) + printk(KERN_WARNING "Calgary: PCI bus not quiesced, " + "continuing anyway\n"); + + /* invalidate TCE cache */ + target = calgary_reg(bbar, tar_offset(tbl->it_busno)); + writeq(tbl->tar_val, target); + + /* enable arbitration */ + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); + writel(aer, target); + (void)readl(target); /* flush */ +} + +static void calioc2_tce_cache_blast(struct iommu_table *tbl) +{ + void __iomem *bbar = tbl->bbar; + void __iomem *target; + u64 val64; + u32 val; + int i = 0; + int count = 1; + unsigned char bus = tbl->it_busno; + +begin: + printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast " + "sequence - count %d\n", bus, count); + + /* 1. using the Page Migration Control reg set SoftStop */ + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target); + val |= PMR_SOFTSTOP; + printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target); + writel(cpu_to_be32(val), target); + + /* 2. poll split queues until all DMA activity is done */ + printk(KERN_DEBUG "2a. starting to poll split queues\n"); + target = calgary_reg(bbar, split_queue_offset(bus)); + do { + val64 = readq(target); + i++; + } while ((val64 & 0xff) != 0xff && i < 100); + if (i == 100) + printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, " + "continuing anyway\n"); + + /* 3. poll Page Migration DEBUG for SoftStopFault */ + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target); + + /* 4. if SoftStopFault - goto (1) */ + if (val & PMR_SOFTSTOPFAULT) { + if (++count < 100) + goto begin; + else { + printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, " + "aborting TCE cache flush sequence!\n"); + return; /* pray for the best */ + } + } + + /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */ + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); + printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target); + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target); + + /* 6. invalidate TCE cache */ + printk(KERN_DEBUG "6. invalidating TCE cache\n"); + target = calgary_reg(bbar, tar_offset(bus)); + writeq(tbl->tar_val, target); + + /* 7. Re-read PMCR */ + printk(KERN_DEBUG "7a. Re-reading PMCR\n"); + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target); + + /* 8. Remove HardStop */ + printk(KERN_DEBUG "8a. removing HardStop from PMCR\n"); + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); + val = 0; + printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target); + writel(cpu_to_be32(val), target); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "8c. read 0x%x [LE] from %p\n", val, target); +} + +static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start, + u64 limit) +{ + unsigned int numpages; + + limit = limit | 0xfffff; + limit++; + + numpages = ((limit - start) >> PAGE_SHIFT); + iommu_range_reserve(pci_iommu(dev->bus), start, numpages); +} + +static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev) +{ + void __iomem *target; + u64 low, high, sizelow; + u64 start, limit; + struct iommu_table *tbl = pci_iommu(dev->bus); + unsigned char busnum = dev->bus->number; + void __iomem *bbar = tbl->bbar; + + /* peripheral MEM_1 region */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW); + low = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH); + high = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE); + sizelow = be32_to_cpu(readl(target)); + + start = (high << 32) | low; + limit = sizelow; + + calgary_reserve_mem_region(dev, start, limit); +} + +static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev) +{ + void __iomem *target; + u32 val32; + u64 low, high, sizelow, sizehigh; + u64 start, limit; + struct iommu_table *tbl = pci_iommu(dev->bus); + unsigned char busnum = dev->bus->number; + void __iomem *bbar = tbl->bbar; + + /* is it enabled? */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); + val32 = be32_to_cpu(readl(target)); + if (!(val32 & PHB_MEM2_ENABLE)) + return; + + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW); + low = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH); + high = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW); + sizelow = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH); + sizehigh = be32_to_cpu(readl(target)); + + start = (high << 32) | low; + limit = (sizehigh << 32) | sizelow; + + calgary_reserve_mem_region(dev, start, limit); +} + +/* + * some regions of the IO address space do not get translated, so we + * must not give devices IO addresses in those regions. The regions + * are the 640KB-1MB region and the two PCI peripheral memory holes. + * Reserve all of them in the IOMMU bitmap to avoid giving them out + * later. + */ +static void __init calgary_reserve_regions(struct pci_dev *dev) +{ + unsigned int npages; + u64 start; + struct iommu_table *tbl = pci_iommu(dev->bus); + + /* reserve EMERGENCY_PAGES from bad_dma_address and up */ + iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); + + /* avoid the BIOS/VGA first 640KB-1MB region */ + /* for CalIOC2 - avoid the entire first MB */ + if (is_calgary(dev->device)) { + start = (640 * 1024); + npages = ((1024 - 640) * 1024) >> PAGE_SHIFT; + } else { /* calioc2 */ + start = 0; + npages = (1 * 1024 * 1024) >> PAGE_SHIFT; + } + iommu_range_reserve(tbl, start, npages); + + /* reserve the two PCI peripheral memory regions in IO space */ + calgary_reserve_peripheral_mem_1(dev); + calgary_reserve_peripheral_mem_2(dev); +} + +static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) +{ + u64 val64; + u64 table_phys; + void __iomem *target; + int ret; + struct iommu_table *tbl; + + /* build TCE tables for each PHB */ + ret = build_tce_table(dev, bbar); + if (ret) + return ret; + + tbl = pci_iommu(dev->bus); + tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; + tce_free(tbl, 0, tbl->it_size); + + if (is_calgary(dev->device)) + tbl->chip_ops = &calgary_chip_ops; + else if (is_calioc2(dev->device)) + tbl->chip_ops = &calioc2_chip_ops; + else + BUG(); + + calgary_reserve_regions(dev); + + /* set TARs for each PHB */ + target = calgary_reg(bbar, tar_offset(dev->bus->number)); + val64 = be64_to_cpu(readq(target)); + + /* zero out all TAR bits under sw control */ + val64 &= ~TAR_SW_BITS; + table_phys = (u64)__pa(tbl->it_base); + + val64 |= table_phys; + + BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M); + val64 |= (u64) specified_table_size; + + tbl->tar_val = cpu_to_be64(val64); + + writeq(tbl->tar_val, target); + readq(target); /* flush */ + + return 0; +} + +static void __init calgary_free_bus(struct pci_dev *dev) +{ + u64 val64; + struct iommu_table *tbl = pci_iommu(dev->bus); + void __iomem *target; + unsigned int bitmapsz; + + target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number)); + val64 = be64_to_cpu(readq(target)); + val64 &= ~TAR_SW_BITS; + writeq(cpu_to_be64(val64), target); + readq(target); /* flush */ + + bitmapsz = tbl->it_size / BITS_PER_BYTE; + free_pages((unsigned long)tbl->it_map, get_order(bitmapsz)); + tbl->it_map = NULL; + + kfree(tbl); + + set_pci_iommu(dev->bus, NULL); + + /* Can't free bootmem allocated memory after system is up :-( */ + bus_info[dev->bus->number].tce_space = NULL; +} + +static void calgary_dump_error_regs(struct iommu_table *tbl) +{ + void __iomem *bbar = tbl->bbar; + void __iomem *target; + u32 csr, plssr; + + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); + csr = be32_to_cpu(readl(target)); + + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); + plssr = be32_to_cpu(readl(target)); + + /* If no error, the agent ID in the CSR is not valid */ + printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, " + "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr); +} + +static void calioc2_dump_error_regs(struct iommu_table *tbl) +{ + void __iomem *bbar = tbl->bbar; + u32 csr, csmr, plssr, mck, rcstat; + void __iomem *target; + unsigned long phboff = phb_offset(tbl->it_busno); + unsigned long erroff; + u32 errregs[7]; + int i; + + /* dump CSR */ + target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET); + csr = be32_to_cpu(readl(target)); + /* dump PLSSR */ + target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET); + plssr = be32_to_cpu(readl(target)); + /* dump CSMR */ + target = calgary_reg(bbar, phboff | 0x290); + csmr = be32_to_cpu(readl(target)); + /* dump mck */ + target = calgary_reg(bbar, phboff | 0x800); + mck = be32_to_cpu(readl(target)); + + printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n", + tbl->it_busno); + + printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", + csr, plssr, csmr, mck); + + /* dump rest of error regs */ + printk(KERN_EMERG "Calgary: "); + for (i = 0; i < ARRAY_SIZE(errregs); i++) { + /* err regs are at 0x810 - 0x870 */ + erroff = (0x810 + (i * 0x10)); + target = calgary_reg(bbar, phboff | erroff); + errregs[i] = be32_to_cpu(readl(target)); + printk("0x%08x@0x%lx ", errregs[i], erroff); + } + printk("\n"); + + /* root complex status */ + target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); + rcstat = be32_to_cpu(readl(target)); + printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat, + PHB_ROOT_COMPLEX_STATUS); +} + +static void calgary_watchdog(unsigned long data) +{ + struct pci_dev *dev = (struct pci_dev *)data; + struct iommu_table *tbl = pci_iommu(dev->bus); + void __iomem *bbar = tbl->bbar; + u32 val32; + void __iomem *target; + + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); + val32 = be32_to_cpu(readl(target)); + + /* If no error, the agent ID in the CSR is not valid */ + if (val32 & CSR_AGENT_MASK) { + tbl->chip_ops->dump_error_regs(tbl); + + /* reset error */ + writel(0, target); + + /* Disable bus that caused the error */ + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | + PHB_CONFIG_RW_OFFSET); + val32 = be32_to_cpu(readl(target)); + val32 |= PHB_SLOT_DISABLE; + writel(cpu_to_be32(val32), target); + readl(target); /* flush */ + } else { + /* Reset the timer */ + mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ); + } +} + +static void __init calgary_set_split_completion_timeout(void __iomem *bbar, + unsigned char busnum, unsigned long timeout) +{ + u64 val64; + void __iomem *target; + unsigned int phb_shift = ~0; /* silence gcc */ + u64 mask; + + switch (busno_to_phbid(busnum)) { + case 0: phb_shift = (63 - 19); + break; + case 1: phb_shift = (63 - 23); + break; + case 2: phb_shift = (63 - 27); + break; + case 3: phb_shift = (63 - 35); + break; + default: + BUG_ON(busno_to_phbid(busnum)); + } + + target = calgary_reg(bbar, CALGARY_CONFIG_REG); + val64 = be64_to_cpu(readq(target)); + + /* zero out this PHB's timer bits */ + mask = ~(0xFUL << phb_shift); + val64 &= mask; + val64 |= (timeout << phb_shift); + writeq(cpu_to_be64(val64), target); + readq(target); /* flush */ +} + +static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) +{ + unsigned char busnum = dev->bus->number; + void __iomem *bbar = tbl->bbar; + void __iomem *target; + u32 val; + + /* + * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1 + */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2); + val = cpu_to_be32(readl(target)); + val |= 0x00800000; + writel(cpu_to_be32(val), target); +} + +static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) +{ + unsigned char busnum = dev->bus->number; + + /* + * Give split completion a longer timeout on bus 1 for aic94xx + * http://bugzilla.kernel.org/show_bug.cgi?id=7180 + */ + if (is_calgary(dev->device) && (busnum == 1)) + calgary_set_split_completion_timeout(tbl->bbar, busnum, + CCR_2SEC_TIMEOUT); +} + +static void __init calgary_enable_translation(struct pci_dev *dev) +{ + u32 val32; + unsigned char busnum; + void __iomem *target; + void __iomem *bbar; + struct iommu_table *tbl; + + busnum = dev->bus->number; + tbl = pci_iommu(dev->bus); + bbar = tbl->bbar; + + /* enable TCE in PHB Config Register */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); + val32 = be32_to_cpu(readl(target)); + val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE; + + printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n", + (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ? + "Calgary" : "CalIOC2", busnum); + printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this " + "bus.\n"); + + writel(cpu_to_be32(val32), target); + readl(target); /* flush */ + + init_timer(&tbl->watchdog_timer); + tbl->watchdog_timer.function = &calgary_watchdog; + tbl->watchdog_timer.data = (unsigned long)dev; + mod_timer(&tbl->watchdog_timer, jiffies); +} + +static void __init calgary_disable_translation(struct pci_dev *dev) +{ + u32 val32; + unsigned char busnum; + void __iomem *target; + void __iomem *bbar; + struct iommu_table *tbl; + + busnum = dev->bus->number; + tbl = pci_iommu(dev->bus); + bbar = tbl->bbar; + + /* disable TCE in PHB Config Register */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); + val32 = be32_to_cpu(readl(target)); + val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE); + + printk(KERN_INFO "Calgary: disabling translation on PHB %#x!\n", busnum); + writel(cpu_to_be32(val32), target); + readl(target); /* flush */ + + del_timer_sync(&tbl->watchdog_timer); +} + +static void __init calgary_init_one_nontraslated(struct pci_dev *dev) +{ + pci_dev_get(dev); + set_pci_iommu(dev->bus, NULL); + + /* is the device behind a bridge? */ + if (dev->bus->parent) + dev->bus->parent->self = dev; + else + dev->bus->self = dev; +} + +static int __init calgary_init_one(struct pci_dev *dev) +{ + void __iomem *bbar; + struct iommu_table *tbl; + int ret; + + BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM); + + bbar = busno_to_bbar(dev->bus->number); + ret = calgary_setup_tar(dev, bbar); + if (ret) + goto done; + + pci_dev_get(dev); + + if (dev->bus->parent) { + if (dev->bus->parent->self) + printk(KERN_WARNING "Calgary: IEEEE, dev %p has " + "bus->parent->self!\n", dev); + dev->bus->parent->self = dev; + } else + dev->bus->self = dev; + + tbl = pci_iommu(dev->bus); + tbl->chip_ops->handle_quirks(tbl, dev); + + calgary_enable_translation(dev); + + return 0; + +done: + return ret; +} + +static int __init calgary_locate_bbars(void) +{ + int ret; + int rioidx, phb, bus; + void __iomem *bbar; + void __iomem *target; + unsigned long offset; + u8 start_bus, end_bus; + u32 val; + + ret = -ENODATA; + for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) { + struct rio_detail *rio = rio_devs[rioidx]; + + if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY)) + continue; + + /* map entire 1MB of Calgary config space */ + bbar = ioremap_nocache(rio->BBAR, 1024 * 1024); + if (!bbar) + goto error; + + for (phb = 0; phb < PHBS_PER_CALGARY; phb++) { + offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET; + target = calgary_reg(bbar, offset); + + val = be32_to_cpu(readl(target)); + + start_bus = (u8)((val & 0x00FF0000) >> 16); + end_bus = (u8)((val & 0x0000FF00) >> 8); + + if (end_bus) { + for (bus = start_bus; bus <= end_bus; bus++) { + bus_info[bus].bbar = bbar; + bus_info[bus].phbid = phb; + } + } else { + bus_info[start_bus].bbar = bbar; + bus_info[start_bus].phbid = phb; + } + } + } + + return 0; + +error: + /* scan bus_info and iounmap any bbars we previously ioremap'd */ + for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++) + if (bus_info[bus].bbar) + iounmap(bus_info[bus].bbar); + + return ret; +} + +static int __init calgary_init(void) +{ + int ret; + struct pci_dev *dev = NULL; + void *tce_space; + + ret = calgary_locate_bbars(); + if (ret) + return ret; + + do { + dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); + if (!dev) + break; + if (!is_cal_pci_dev(dev->device)) + continue; + if (!translate_phb(dev)) { + calgary_init_one_nontraslated(dev); + continue; + } + tce_space = bus_info[dev->bus->number].tce_space; + if (!tce_space && !translate_empty_slots) + continue; + + ret = calgary_init_one(dev); + if (ret) + goto error; + } while (1); + + return ret; + +error: + do { + dev = pci_get_device_reverse(PCI_VENDOR_ID_IBM, + PCI_ANY_ID, dev); + if (!dev) + break; + if (!is_cal_pci_dev(dev->device)) + continue; + if (!translate_phb(dev)) { + pci_dev_put(dev); + continue; + } + if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) + continue; + + calgary_disable_translation(dev); + calgary_free_bus(dev); + pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ + } while (1); + + return ret; +} + +static inline int __init determine_tce_table_size(u64 ram) +{ + int ret; + + if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED) + return specified_table_size; + + /* + * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to + * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each + * larger table size has twice as many entries, so shift the + * max ram address by 13 to divide by 8K and then look at the + * order of the result to choose between 0-7. + */ + ret = get_order(ram >> 13); + if (ret > TCE_TABLE_SIZE_8M) + ret = TCE_TABLE_SIZE_8M; + + return ret; +} + +static int __init build_detail_arrays(void) +{ + unsigned long ptr; + int i, scal_detail_size, rio_detail_size; + + if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){ + printk(KERN_WARNING + "Calgary: MAX_NUMNODES too low! Defined as %d, " + "but system has %d nodes.\n", + MAX_NUMNODES, rio_table_hdr->num_scal_dev); + return -ENODEV; + } + + switch (rio_table_hdr->version){ + case 2: + scal_detail_size = 11; + rio_detail_size = 13; + break; + case 3: + scal_detail_size = 12; + rio_detail_size = 15; + break; + default: + printk(KERN_WARNING + "Calgary: Invalid Rio Grande Table Version: %d\n", + rio_table_hdr->version); + return -EPROTO; + } + + ptr = ((unsigned long)rio_table_hdr) + 3; + for (i = 0; i < rio_table_hdr->num_scal_dev; + i++, ptr += scal_detail_size) + scal_devs[i] = (struct scal_detail *)ptr; + + for (i = 0; i < rio_table_hdr->num_rio_dev; + i++, ptr += rio_detail_size) + rio_devs[i] = (struct rio_detail *)ptr; + + return 0; +} + +static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) +{ + int dev; + u32 val; + + if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) { + /* + * FIXME: properly scan for devices accross the + * PCI-to-PCI bridge on every CalIOC2 port. + */ + return 1; + } + + for (dev = 1; dev < 8; dev++) { + val = read_pci_config(bus, dev, 0, 0); + if (val != 0xffffffff) + break; + } + return (val != 0xffffffff); +} + +void __init detect_calgary(void) +{ + int bus; + void *tbl; + int calgary_found = 0; + unsigned long ptr; + unsigned int offset, prev_offset; + int ret; + + /* + * if the user specified iommu=off or iommu=soft or we found + * another HW IOMMU already, bail out. + */ + if (swiotlb || no_iommu || iommu_detected) + return; + + if (!use_calgary) + return; + + if (!early_pci_allowed()) + return; + + printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n"); + + ptr = (unsigned long)phys_to_virt(get_bios_ebda()); + + rio_table_hdr = NULL; + prev_offset = 0; + offset = 0x180; + /* + * The next offset is stored in the 1st word. + * Only parse up until the offset increases: + */ + while (offset > prev_offset) { + /* The block id is stored in the 2nd word */ + if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){ + /* set the pointer past the offset & block id */ + rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4); + break; + } + prev_offset = offset; + offset = *((unsigned short *)(ptr + offset)); + } + if (!rio_table_hdr) { + printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table " + "in EBDA - bailing!\n"); + return; + } + + ret = build_detail_arrays(); + if (ret) { + printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret); + return; + } + + specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); + + for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { + struct calgary_bus_info *info = &bus_info[bus]; + unsigned short pci_device; + u32 val; + + val = read_pci_config(bus, 0, 0, 0); + pci_device = (val & 0xFFFF0000) >> 16; + + if (!is_cal_pci_dev(pci_device)) + continue; + + if (info->translation_disabled) + continue; + + if (calgary_bus_has_devices(bus, pci_device) || + translate_empty_slots) { + tbl = alloc_tce_table(); + if (!tbl) + goto cleanup; + info->tce_space = tbl; + calgary_found = 1; + } + } + + printk(KERN_DEBUG "Calgary: finished detection, Calgary %s\n", + calgary_found ? "found" : "not found"); + + if (calgary_found) { + iommu_detected = 1; + calgary_detected = 1; + printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); + printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " + "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, + debugging ? "enabled" : "disabled"); + } + return; + +cleanup: + for (--bus; bus >= 0; --bus) { + struct calgary_bus_info *info = &bus_info[bus]; + + if (info->tce_space) + free_tce_table(info->tce_space); + } +} + +int __init calgary_iommu_init(void) +{ + int ret; + + if (no_iommu || swiotlb) + return -ENODEV; + + if (!calgary_detected) + return -ENODEV; + + /* ok, we're trying to use Calgary - let's roll */ + printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); + + ret = calgary_init(); + if (ret) { + printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " + "falling back to no_iommu\n", ret); + if (end_pfn > MAX_DMA32_PFN) + printk(KERN_ERR "WARNING more than 4GB of memory, " + "32bit PCI may malfunction.\n"); + return ret; + } + + force_iommu = 1; + bad_dma_address = 0x0; + dma_ops = &calgary_dma_ops; + + return 0; +} + +static int __init calgary_parse_options(char *p) +{ + unsigned int bridge; + size_t len; + char* endp; + + while (*p) { + if (!strncmp(p, "64k", 3)) + specified_table_size = TCE_TABLE_SIZE_64K; + else if (!strncmp(p, "128k", 4)) + specified_table_size = TCE_TABLE_SIZE_128K; + else if (!strncmp(p, "256k", 4)) + specified_table_size = TCE_TABLE_SIZE_256K; + else if (!strncmp(p, "512k", 4)) + specified_table_size = TCE_TABLE_SIZE_512K; + else if (!strncmp(p, "1M", 2)) + specified_table_size = TCE_TABLE_SIZE_1M; + else if (!strncmp(p, "2M", 2)) + specified_table_size = TCE_TABLE_SIZE_2M; + else if (!strncmp(p, "4M", 2)) + specified_table_size = TCE_TABLE_SIZE_4M; + else if (!strncmp(p, "8M", 2)) + specified_table_size = TCE_TABLE_SIZE_8M; + + len = strlen("translate_empty_slots"); + if (!strncmp(p, "translate_empty_slots", len)) + translate_empty_slots = 1; + + len = strlen("disable"); + if (!strncmp(p, "disable", len)) { + p += len; + if (*p == '=') + ++p; + if (*p == '\0') + break; + bridge = simple_strtol(p, &endp, 0); + if (p == endp) + break; + + if (bridge < MAX_PHB_BUS_NUM) { + printk(KERN_INFO "Calgary: disabling " + "translation for PHB %#x\n", bridge); + bus_info[bridge].translation_disabled = 1; + } + } + + p = strpbrk(p, ","); + if (!p) + break; + + p++; /* skip ',' */ + } + return 1; +} +__setup("calgary=", calgary_parse_options); + +static void __init calgary_fixup_one_tce_space(struct pci_dev *dev) +{ + struct iommu_table *tbl; + unsigned int npages; + int i; + + tbl = pci_iommu(dev->bus); + + for (i = 0; i < 4; i++) { + struct resource *r = &dev->resource[PCI_BRIDGE_RESOURCES + i]; + + /* Don't give out TCEs that map MEM resources */ + if (!(r->flags & IORESOURCE_MEM)) + continue; + + /* 0-based? we reserve the whole 1st MB anyway */ + if (!r->start) + continue; + + /* cover the whole region */ + npages = (r->end - r->start) >> PAGE_SHIFT; + npages++; + + iommu_range_reserve(tbl, r->start, npages); + } +} + +static int __init calgary_fixup_tce_spaces(void) +{ + struct pci_dev *dev = NULL; + void *tce_space; + + if (no_iommu || swiotlb || !calgary_detected) + return -ENODEV; + + printk(KERN_DEBUG "Calgary: fixing up tce spaces\n"); + + do { + dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); + if (!dev) + break; + if (!is_cal_pci_dev(dev->device)) + continue; + if (!translate_phb(dev)) + continue; + + tce_space = bus_info[dev->bus->number].tce_space; + if (!tce_space) + continue; + + calgary_fixup_one_tce_space(dev); + + } while (1); + + return 0; +} + +/* + * We need to be call after pcibios_assign_resources (fs_initcall level) + * and before device_initcall. + */ +rootfs_initcall(calgary_fixup_tce_spaces); diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c new file mode 100644 index 0000000..2971144 --- /dev/null +++ b/arch/x86/kernel/pci-dma_64.c @@ -0,0 +1,346 @@ +/* + * Dynamic DMA mapping support. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +int iommu_merge __read_mostly = 0; +EXPORT_SYMBOL(iommu_merge); + +dma_addr_t bad_dma_address __read_mostly; +EXPORT_SYMBOL(bad_dma_address); + +/* This tells the BIO block layer to assume merging. Default to off + because we cannot guarantee merging later. */ +int iommu_bio_merge __read_mostly = 0; +EXPORT_SYMBOL(iommu_bio_merge); + +static int iommu_sac_force __read_mostly = 0; + +int no_iommu __read_mostly; +#ifdef CONFIG_IOMMU_DEBUG +int panic_on_overflow __read_mostly = 1; +int force_iommu __read_mostly = 1; +#else +int panic_on_overflow __read_mostly = 0; +int force_iommu __read_mostly= 0; +#endif + +/* Set this to 1 if there is a HW IOMMU in the system */ +int iommu_detected __read_mostly = 0; + +/* Dummy device used for NULL arguments (normally ISA). Better would + be probably a smaller DMA mask, but this is bug-to-bug compatible + to i386. */ +struct device fallback_dev = { + .bus_id = "fallback device", + .coherent_dma_mask = DMA_32BIT_MASK, + .dma_mask = &fallback_dev.coherent_dma_mask, +}; + +/* Allocate DMA memory on node near device */ +noinline static void * +dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) +{ + struct page *page; + int node; +#ifdef CONFIG_PCI + if (dev->bus == &pci_bus_type) + node = pcibus_to_node(to_pci_dev(dev)->bus); + else +#endif + node = numa_node_id(); + + if (node < first_node(node_online_map)) + node = first_node(node_online_map); + + page = alloc_pages_node(node, gfp, order); + return page ? page_address(page) : NULL; +} + +/* + * Allocate memory for a coherent mapping. + */ +void * +dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp) +{ + void *memory; + unsigned long dma_mask = 0; + u64 bus; + + if (!dev) + dev = &fallback_dev; + dma_mask = dev->coherent_dma_mask; + if (dma_mask == 0) + dma_mask = DMA_32BIT_MASK; + + /* Device not DMA able */ + if (dev->dma_mask == NULL) + return NULL; + + /* Don't invoke OOM killer */ + gfp |= __GFP_NORETRY; + + /* Kludge to make it bug-to-bug compatible with i386. i386 + uses the normal dma_mask for alloc_coherent. */ + dma_mask &= *dev->dma_mask; + + /* Why <=? Even when the mask is smaller than 4GB it is often + larger than 16MB and in this case we have a chance of + finding fitting memory in the next higher zone first. If + not retry with true GFP_DMA. -AK */ + if (dma_mask <= DMA_32BIT_MASK) + gfp |= GFP_DMA32; + + again: + memory = dma_alloc_pages(dev, gfp, get_order(size)); + if (memory == NULL) + return NULL; + + { + int high, mmu; + bus = virt_to_bus(memory); + high = (bus + size) >= dma_mask; + mmu = high; + if (force_iommu && !(gfp & GFP_DMA)) + mmu = 1; + else if (high) { + free_pages((unsigned long)memory, + get_order(size)); + + /* Don't use the 16MB ZONE_DMA unless absolutely + needed. It's better to use remapping first. */ + if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) { + gfp = (gfp & ~GFP_DMA32) | GFP_DMA; + goto again; + } + + /* Let low level make its own zone decisions */ + gfp &= ~(GFP_DMA32|GFP_DMA); + + if (dma_ops->alloc_coherent) + return dma_ops->alloc_coherent(dev, size, + dma_handle, gfp); + return NULL; + } + + memset(memory, 0, size); + if (!mmu) { + *dma_handle = virt_to_bus(memory); + return memory; + } + } + + if (dma_ops->alloc_coherent) { + free_pages((unsigned long)memory, get_order(size)); + gfp &= ~(GFP_DMA|GFP_DMA32); + return dma_ops->alloc_coherent(dev, size, dma_handle, gfp); + } + + if (dma_ops->map_simple) { + *dma_handle = dma_ops->map_simple(dev, memory, + size, + PCI_DMA_BIDIRECTIONAL); + if (*dma_handle != bad_dma_address) + return memory; + } + + if (panic_on_overflow) + panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",size); + free_pages((unsigned long)memory, get_order(size)); + return NULL; +} +EXPORT_SYMBOL(dma_alloc_coherent); + +/* + * Unmap coherent memory. + * The caller must ensure that the device has finished accessing the mapping. + */ +void dma_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t bus) +{ + if (dma_ops->unmap_single) + dma_ops->unmap_single(dev, bus, size, 0); + free_pages((unsigned long)vaddr, get_order(size)); +} +EXPORT_SYMBOL(dma_free_coherent); + +static int forbid_dac __read_mostly; + +int dma_supported(struct device *dev, u64 mask) +{ +#ifdef CONFIG_PCI + if (mask > 0xffffffff && forbid_dac > 0) { + + + + printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", dev->bus_id); + return 0; + } +#endif + + if (dma_ops->dma_supported) + return dma_ops->dma_supported(dev, mask); + + /* Copied from i386. Doesn't make much sense, because it will + only work for pci_alloc_coherent. + The caller just has to use GFP_DMA in this case. */ + if (mask < DMA_24BIT_MASK) + return 0; + + /* Tell the device to use SAC when IOMMU force is on. This + allows the driver to use cheaper accesses in some cases. + + Problem with this is that if we overflow the IOMMU area and + return DAC as fallback address the device may not handle it + correctly. + + As a special case some controllers have a 39bit address + mode that is as efficient as 32bit (aic79xx). Don't force + SAC for these. Assume all masks <= 40 bits are of this + type. Normally this doesn't make any difference, but gives + more gentle handling of IOMMU overflow. */ + if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { + printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask); + return 0; + } + + return 1; +} +EXPORT_SYMBOL(dma_supported); + +int dma_set_mask(struct device *dev, u64 mask) +{ + if (!dev->dma_mask || !dma_supported(dev, mask)) + return -EIO; + *dev->dma_mask = mask; + return 0; +} +EXPORT_SYMBOL(dma_set_mask); + +/* + * See for the iommu kernel parameter + * documentation. + */ +__init int iommu_setup(char *p) +{ + iommu_merge = 1; + + if (!p) + return -EINVAL; + + while (*p) { + if (!strncmp(p,"off",3)) + no_iommu = 1; + /* gart_parse_options has more force support */ + if (!strncmp(p,"force",5)) + force_iommu = 1; + if (!strncmp(p,"noforce",7)) { + iommu_merge = 0; + force_iommu = 0; + } + + if (!strncmp(p, "biomerge",8)) { + iommu_bio_merge = 4096; + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "panic",5)) + panic_on_overflow = 1; + if (!strncmp(p, "nopanic",7)) + panic_on_overflow = 0; + if (!strncmp(p, "merge",5)) { + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "nomerge",7)) + iommu_merge = 0; + if (!strncmp(p, "forcesac",8)) + iommu_sac_force = 1; + if (!strncmp(p, "allowdac", 8)) + forbid_dac = 0; + if (!strncmp(p, "nodac", 5)) + forbid_dac = -1; + +#ifdef CONFIG_SWIOTLB + if (!strncmp(p, "soft",4)) + swiotlb = 1; +#endif + +#ifdef CONFIG_IOMMU + gart_parse_options(p); +#endif + +#ifdef CONFIG_CALGARY_IOMMU + if (!strncmp(p, "calgary", 7)) + use_calgary = 1; +#endif /* CONFIG_CALGARY_IOMMU */ + + p += strcspn(p, ","); + if (*p == ',') + ++p; + } + return 0; +} +early_param("iommu", iommu_setup); + +void __init pci_iommu_alloc(void) +{ + /* + * The order of these functions is important for + * fall-back/fail-over reasons + */ +#ifdef CONFIG_IOMMU + iommu_hole_init(); +#endif + +#ifdef CONFIG_CALGARY_IOMMU + detect_calgary(); +#endif + +#ifdef CONFIG_SWIOTLB + pci_swiotlb_init(); +#endif +} + +static int __init pci_iommu_init(void) +{ +#ifdef CONFIG_CALGARY_IOMMU + calgary_iommu_init(); +#endif + +#ifdef CONFIG_IOMMU + gart_iommu_init(); +#endif + + no_iommu_init(); + return 0; +} + +void pci_iommu_shutdown(void) +{ + gart_iommu_shutdown(); +} + +#ifdef CONFIG_PCI +/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ + +static __devinit void via_no_dac(struct pci_dev *dev) +{ + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { + printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n"); + forbid_dac = 1; + } +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); +#endif +/* Must execute after PCI subsystem */ +fs_initcall(pci_iommu_init); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c new file mode 100644 index 0000000..4918c57 --- /dev/null +++ b/arch/x86/kernel/pci-gart_64.c @@ -0,0 +1,740 @@ +/* + * Dynamic DMA mapping support for AMD Hammer. + * + * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. + * This allows to use PCI devices that only support 32bit addresses on systems + * with more than 4GB. + * + * See Documentation/DMA-mapping.txt for the interface specification. + * + * Copyright 2002 Andi Kleen, SuSE Labs. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned long iommu_bus_base; /* GART remapping area (physical) */ +static unsigned long iommu_size; /* size of remapping area bytes */ +static unsigned long iommu_pages; /* .. and in pages */ + +u32 *iommu_gatt_base; /* Remapping table */ + +/* If this is disabled the IOMMU will use an optimized flushing strategy + of only flushing when an mapping is reused. With it true the GART is flushed + for every mapping. Problem is that doing the lazy flush seems to trigger + bugs with some popular PCI cards, in particular 3ware (but has been also + also seen with Qlogic at least). */ +int iommu_fullflush = 1; + +/* Allocation bitmap for the remapping area */ +static DEFINE_SPINLOCK(iommu_bitmap_lock); +static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */ + +static u32 gart_unmapped_entry; + +#define GPTE_VALID 1 +#define GPTE_COHERENT 2 +#define GPTE_ENCODE(x) \ + (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) +#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) + +#define to_pages(addr,size) \ + (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) + +#define EMERGENCY_PAGES 32 /* = 128KB */ + +#ifdef CONFIG_AGP +#define AGPEXTERN extern +#else +#define AGPEXTERN +#endif + +/* backdoor interface to AGP driver */ +AGPEXTERN int agp_memory_reserved; +AGPEXTERN __u32 *agp_gatt_table; + +static unsigned long next_bit; /* protected by iommu_bitmap_lock */ +static int need_flush; /* global flush state. set for each gart wrap */ + +static unsigned long alloc_iommu(int size) +{ + unsigned long offset, flags; + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size); + if (offset == -1) { + need_flush = 1; + offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size); + } + if (offset != -1) { + set_bit_string(iommu_gart_bitmap, offset, size); + next_bit = offset+size; + if (next_bit >= iommu_pages) { + next_bit = 0; + need_flush = 1; + } + } + if (iommu_fullflush) + need_flush = 1; + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); + return offset; +} + +static void free_iommu(unsigned long offset, int size) +{ + unsigned long flags; + spin_lock_irqsave(&iommu_bitmap_lock, flags); + __clear_bit_string(iommu_gart_bitmap, offset, size); + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); +} + +/* + * Use global flush state to avoid races with multiple flushers. + */ +static void flush_gart(void) +{ + unsigned long flags; + spin_lock_irqsave(&iommu_bitmap_lock, flags); + if (need_flush) { + k8_flush_garts(); + need_flush = 0; + } + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); +} + +#ifdef CONFIG_IOMMU_LEAK + +#define SET_LEAK(x) if (iommu_leak_tab) \ + iommu_leak_tab[x] = __builtin_return_address(0); +#define CLEAR_LEAK(x) if (iommu_leak_tab) \ + iommu_leak_tab[x] = NULL; + +/* Debugging aid for drivers that don't free their IOMMU tables */ +static void **iommu_leak_tab; +static int leak_trace; +int iommu_leak_pages = 20; +void dump_leak(void) +{ + int i; + static int dump; + if (dump || !iommu_leak_tab) return; + dump = 1; + show_stack(NULL,NULL); + /* Very crude. dump some from the end of the table too */ + printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); + for (i = 0; i < iommu_leak_pages; i+=2) { + printk("%lu: ", iommu_pages-i); + printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]); + printk("%c", (i+1)%2 == 0 ? '\n' : ' '); + } + printk("\n"); +} +#else +#define SET_LEAK(x) +#define CLEAR_LEAK(x) +#endif + +static void iommu_full(struct device *dev, size_t size, int dir) +{ + /* + * Ran out of IOMMU space for this operation. This is very bad. + * Unfortunately the drivers cannot handle this operation properly. + * Return some non mapped prereserved space in the aperture and + * let the Northbridge deal with it. This will result in garbage + * in the IO operation. When the size exceeds the prereserved space + * memory corruption will occur or random memory will be DMAed + * out. Hopefully no network devices use single mappings that big. + */ + + printk(KERN_ERR + "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", + size, dev->bus_id); + + if (size > PAGE_SIZE*EMERGENCY_PAGES) { + if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic("PCI-DMA: Memory would be corrupted\n"); + if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic(KERN_ERR "PCI-DMA: Random memory would be DMAed\n"); + } + +#ifdef CONFIG_IOMMU_LEAK + dump_leak(); +#endif +} + +static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) +{ + u64 mask = *dev->dma_mask; + int high = addr + size > mask; + int mmu = high; + if (force_iommu) + mmu = 1; + return mmu; +} + +static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) +{ + u64 mask = *dev->dma_mask; + int high = addr + size > mask; + int mmu = high; + return mmu; +} + +/* Map a single continuous physical area into the IOMMU. + * Caller needs to check if the iommu is needed and flush. + */ +static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, + size_t size, int dir) +{ + unsigned long npages = to_pages(phys_mem, size); + unsigned long iommu_page = alloc_iommu(npages); + int i; + if (iommu_page == -1) { + if (!nonforced_iommu(dev, phys_mem, size)) + return phys_mem; + if (panic_on_overflow) + panic("dma_map_area overflow %lu bytes\n", size); + iommu_full(dev, size, dir); + return bad_dma_address; + } + + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); + SET_LEAK(iommu_page + i); + phys_mem += PAGE_SIZE; + } + return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); +} + +static dma_addr_t gart_map_simple(struct device *dev, char *buf, + size_t size, int dir) +{ + dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir); + flush_gart(); + return map; +} + +/* Map a single area into the IOMMU */ +static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) +{ + unsigned long phys_mem, bus; + + if (!dev) + dev = &fallback_dev; + + phys_mem = virt_to_phys(addr); + if (!need_iommu(dev, phys_mem, size)) + return phys_mem; + + bus = gart_map_simple(dev, addr, size, dir); + return bus; +} + +/* + * Free a DMA mapping. + */ +static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, + size_t size, int direction) +{ + unsigned long iommu_page; + int npages; + int i; + + if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || + dma_addr >= iommu_bus_base + iommu_size) + return; + iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; + npages = to_pages(dma_addr, size); + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; + CLEAR_LEAK(iommu_page + i); + } + free_iommu(iommu_page, npages); +} + +/* + * Wrapper for pci_unmap_single working with scatterlists. + */ +static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) +{ + int i; + + for (i = 0; i < nents; i++) { + struct scatterlist *s = &sg[i]; + if (!s->dma_length || !s->length) + break; + gart_unmap_single(dev, s->dma_address, s->dma_length, dir); + } +} + +/* Fallback for dma_map_sg in case of overflow */ +static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, + int nents, int dir) +{ + int i; + +#ifdef CONFIG_IOMMU_DEBUG + printk(KERN_DEBUG "dma_map_sg overflow\n"); +#endif + + for (i = 0; i < nents; i++ ) { + struct scatterlist *s = &sg[i]; + unsigned long addr = page_to_phys(s->page) + s->offset; + if (nonforced_iommu(dev, addr, s->length)) { + addr = dma_map_area(dev, addr, s->length, dir); + if (addr == bad_dma_address) { + if (i > 0) + gart_unmap_sg(dev, sg, i, dir); + nents = 0; + sg[0].dma_length = 0; + break; + } + } + s->dma_address = addr; + s->dma_length = s->length; + } + flush_gart(); + return nents; +} + +/* Map multiple scatterlist entries continuous into the first. */ +static int __dma_map_cont(struct scatterlist *sg, int start, int stopat, + struct scatterlist *sout, unsigned long pages) +{ + unsigned long iommu_start = alloc_iommu(pages); + unsigned long iommu_page = iommu_start; + int i; + + if (iommu_start == -1) + return -1; + + for (i = start; i < stopat; i++) { + struct scatterlist *s = &sg[i]; + unsigned long pages, addr; + unsigned long phys_addr = s->dma_address; + + BUG_ON(i > start && s->offset); + if (i == start) { + *sout = *s; + sout->dma_address = iommu_bus_base; + sout->dma_address += iommu_page*PAGE_SIZE + s->offset; + sout->dma_length = s->length; + } else { + sout->dma_length += s->length; + } + + addr = phys_addr; + pages = to_pages(s->offset, s->length); + while (pages--) { + iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); + SET_LEAK(iommu_page); + addr += PAGE_SIZE; + iommu_page++; + } + } + BUG_ON(iommu_page - iommu_start != pages); + return 0; +} + +static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat, + struct scatterlist *sout, + unsigned long pages, int need) +{ + if (!need) { + BUG_ON(stopat - start != 1); + *sout = sg[start]; + sout->dma_length = sg[start].length; + return 0; + } + return __dma_map_cont(sg, start, stopat, sout, pages); +} + +/* + * DMA map all entries in a scatterlist. + * Merge chunks that have page aligned sizes into a continuous mapping. + */ +int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) +{ + int i; + int out; + int start; + unsigned long pages = 0; + int need = 0, nextneed; + + if (nents == 0) + return 0; + + if (!dev) + dev = &fallback_dev; + + out = 0; + start = 0; + for (i = 0; i < nents; i++) { + struct scatterlist *s = &sg[i]; + dma_addr_t addr = page_to_phys(s->page) + s->offset; + s->dma_address = addr; + BUG_ON(s->length == 0); + + nextneed = need_iommu(dev, addr, s->length); + + /* Handle the previous not yet processed entries */ + if (i > start) { + struct scatterlist *ps = &sg[i-1]; + /* Can only merge when the last chunk ends on a page + boundary and the new one doesn't have an offset. */ + if (!iommu_merge || !nextneed || !need || s->offset || + (ps->offset + ps->length) % PAGE_SIZE) { + if (dma_map_cont(sg, start, i, sg+out, pages, + need) < 0) + goto error; + out++; + pages = 0; + start = i; + } + } + + need = nextneed; + pages += to_pages(s->offset, s->length); + } + if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0) + goto error; + out++; + flush_gart(); + if (out < nents) + sg[out].dma_length = 0; + return out; + +error: + flush_gart(); + gart_unmap_sg(dev, sg, nents, dir); + /* When it was forced or merged try again in a dumb way */ + if (force_iommu || iommu_merge) { + out = dma_map_sg_nonforce(dev, sg, nents, dir); + if (out > 0) + return out; + } + if (panic_on_overflow) + panic("dma_map_sg: overflow on %lu pages\n", pages); + iommu_full(dev, pages << PAGE_SHIFT, dir); + for (i = 0; i < nents; i++) + sg[i].dma_address = bad_dma_address; + return 0; +} + +static int no_agp; + +static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) +{ + unsigned long a; + if (!iommu_size) { + iommu_size = aper_size; + if (!no_agp) + iommu_size /= 2; + } + + a = aper + iommu_size; + iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a; + + if (iommu_size < 64*1024*1024) + printk(KERN_WARNING + "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); + + return iommu_size; +} + +static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) +{ + unsigned aper_size = 0, aper_base_32; + u64 aper_base; + unsigned aper_order; + + pci_read_config_dword(dev, 0x94, &aper_base_32); + pci_read_config_dword(dev, 0x90, &aper_order); + aper_order = (aper_order >> 1) & 7; + + aper_base = aper_base_32 & 0x7fff; + aper_base <<= 25; + + aper_size = (32 * 1024 * 1024) << aper_order; + if (aper_base + aper_size > 0x100000000UL || !aper_size) + aper_base = 0; + + *size = aper_size; + return aper_base; +} + +/* + * Private Northbridge GATT initialization in case we cannot use the + * AGP driver for some reason. + */ +static __init int init_k8_gatt(struct agp_kern_info *info) +{ + struct pci_dev *dev; + void *gatt; + unsigned aper_base, new_aper_base; + unsigned aper_size, gatt_size, new_aper_size; + int i; + + printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); + aper_size = aper_base = info->aper_size = 0; + dev = NULL; + for (i = 0; i < num_k8_northbridges; i++) { + dev = k8_northbridges[i]; + new_aper_base = read_aperture(dev, &new_aper_size); + if (!new_aper_base) + goto nommu; + + if (!aper_base) { + aper_size = new_aper_size; + aper_base = new_aper_base; + } + if (aper_size != new_aper_size || aper_base != new_aper_base) + goto nommu; + } + if (!aper_base) + goto nommu; + info->aper_base = aper_base; + info->aper_size = aper_size>>20; + + gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); + gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); + if (!gatt) + panic("Cannot allocate GATT table"); + if (change_page_attr_addr((unsigned long)gatt, gatt_size >> PAGE_SHIFT, PAGE_KERNEL_NOCACHE)) + panic("Could not set GART PTEs to uncacheable pages"); + global_flush_tlb(); + + memset(gatt, 0, gatt_size); + agp_gatt_table = gatt; + + for (i = 0; i < num_k8_northbridges; i++) { + u32 ctl; + u32 gatt_reg; + + dev = k8_northbridges[i]; + gatt_reg = __pa(gatt) >> 12; + gatt_reg <<= 4; + pci_write_config_dword(dev, 0x98, gatt_reg); + pci_read_config_dword(dev, 0x90, &ctl); + + ctl |= 1; + ctl &= ~((1<<4) | (1<<5)); + + pci_write_config_dword(dev, 0x90, ctl); + } + flush_gart(); + + printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); + return 0; + + nommu: + /* Should not happen anymore */ + printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" + KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n"); + return -1; +} + +extern int agp_amd64_init(void); + +static const struct dma_mapping_ops gart_dma_ops = { + .mapping_error = NULL, + .map_single = gart_map_single, + .map_simple = gart_map_simple, + .unmap_single = gart_unmap_single, + .sync_single_for_cpu = NULL, + .sync_single_for_device = NULL, + .sync_single_range_for_cpu = NULL, + .sync_single_range_for_device = NULL, + .sync_sg_for_cpu = NULL, + .sync_sg_for_device = NULL, + .map_sg = gart_map_sg, + .unmap_sg = gart_unmap_sg, +}; + +void gart_iommu_shutdown(void) +{ + struct pci_dev *dev; + int i; + + if (no_agp && (dma_ops != &gart_dma_ops)) + return; + + for (i = 0; i < num_k8_northbridges; i++) { + u32 ctl; + + dev = k8_northbridges[i]; + pci_read_config_dword(dev, 0x90, &ctl); + + ctl &= ~1; + + pci_write_config_dword(dev, 0x90, ctl); + } +} + +void __init gart_iommu_init(void) +{ + struct agp_kern_info info; + unsigned long aper_size; + unsigned long iommu_start; + unsigned long scratch; + long i; + + if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) { + printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n"); + return; + } + +#ifndef CONFIG_AGP_AMD64 + no_agp = 1; +#else + /* Makefile puts PCI initialization via subsys_initcall first. */ + /* Add other K8 AGP bridge drivers here */ + no_agp = no_agp || + (agp_amd64_init() < 0) || + (agp_copy_info(agp_bridge, &info) < 0); +#endif + + if (swiotlb) + return; + + /* Did we detect a different HW IOMMU? */ + if (iommu_detected && !iommu_aperture) + return; + + if (no_iommu || + (!force_iommu && end_pfn <= MAX_DMA32_PFN) || + !iommu_aperture || + (no_agp && init_k8_gatt(&info) < 0)) { + if (end_pfn > MAX_DMA32_PFN) { + printk(KERN_ERR "WARNING more than 4GB of memory " + "but GART IOMMU not available.\n" + KERN_ERR "WARNING 32bit PCI may malfunction.\n"); + } + return; + } + + printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); + aper_size = info.aper_size * 1024 * 1024; + iommu_size = check_iommu_size(info.aper_base, aper_size); + iommu_pages = iommu_size >> PAGE_SHIFT; + + iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, + get_order(iommu_pages/8)); + if (!iommu_gart_bitmap) + panic("Cannot allocate iommu bitmap\n"); + memset(iommu_gart_bitmap, 0, iommu_pages/8); + +#ifdef CONFIG_IOMMU_LEAK + if (leak_trace) { + iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, + get_order(iommu_pages*sizeof(void *))); + if (iommu_leak_tab) + memset(iommu_leak_tab, 0, iommu_pages * 8); + else + printk("PCI-DMA: Cannot allocate leak trace area\n"); + } +#endif + + /* + * Out of IOMMU space handling. + * Reserve some invalid pages at the beginning of the GART. + */ + set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); + + agp_memory_reserved = iommu_size; + printk(KERN_INFO + "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", + iommu_size>>20); + + iommu_start = aper_size - iommu_size; + iommu_bus_base = info.aper_base + iommu_start; + bad_dma_address = iommu_bus_base; + iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); + + /* + * Unmap the IOMMU part of the GART. The alias of the page is + * always mapped with cache enabled and there is no full cache + * coherency across the GART remapping. The unmapping avoids + * automatic prefetches from the CPU allocating cache lines in + * there. All CPU accesses are done via the direct mapping to + * the backing memory. The GART address is only used by PCI + * devices. + */ + clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size); + + /* + * Try to workaround a bug (thanks to BenH) + * Set unmapped entries to a scratch page instead of 0. + * Any prefetches that hit unmapped entries won't get an bus abort + * then. + */ + scratch = get_zeroed_page(GFP_KERNEL); + if (!scratch) + panic("Cannot allocate iommu scratch page"); + gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); + for (i = EMERGENCY_PAGES; i < iommu_pages; i++) + iommu_gatt_base[i] = gart_unmapped_entry; + + flush_gart(); + dma_ops = &gart_dma_ops; +} + +void __init gart_parse_options(char *p) +{ + int arg; + +#ifdef CONFIG_IOMMU_LEAK + if (!strncmp(p,"leak",4)) { + leak_trace = 1; + p += 4; + if (*p == '=') ++p; + if (isdigit(*p) && get_option(&p, &arg)) + iommu_leak_pages = arg; + } +#endif + if (isdigit(*p) && get_option(&p, &arg)) + iommu_size = arg; + if (!strncmp(p, "fullflush",8)) + iommu_fullflush = 1; + if (!strncmp(p, "nofullflush",11)) + iommu_fullflush = 0; + if (!strncmp(p,"noagp",5)) + no_agp = 1; + if (!strncmp(p, "noaperture",10)) + fix_aperture = 0; + /* duplicated from pci-dma.c */ + if (!strncmp(p,"force",5)) + iommu_aperture_allowed = 1; + if (!strncmp(p,"allowed",7)) + iommu_aperture_allowed = 1; + if (!strncmp(p, "memaper", 7)) { + fallback_aper_force = 1; + p += 7; + if (*p == '=') { + ++p; + if (get_option(&p, &arg)) + fallback_aper_order = arg; + } + } +} diff --git a/arch/x86/kernel/pci-nommu_64.c b/arch/x86/kernel/pci-nommu_64.c new file mode 100644 index 0000000..2a34c6c --- /dev/null +++ b/arch/x86/kernel/pci-nommu_64.c @@ -0,0 +1,97 @@ +/* Fallback functions when the main IOMMU code is not compiled in. This + code is roughly equivalent to i386. */ +#include +#include +#include +#include +#include + +#include +#include +#include + +static int +check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) +{ + if (hwdev && bus + size > *hwdev->dma_mask) { + if (*hwdev->dma_mask >= DMA_32BIT_MASK) + printk(KERN_ERR + "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", + name, (long long)bus, size, + (long long)*hwdev->dma_mask); + return 0; + } + return 1; +} + +static dma_addr_t +nommu_map_single(struct device *hwdev, void *ptr, size_t size, + int direction) +{ + dma_addr_t bus = virt_to_bus(ptr); + if (!check_addr("map_single", hwdev, bus, size)) + return bad_dma_address; + return bus; +} + +static void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size, + int direction) +{ +} + +/* Map a set of buffers described by scatterlist in streaming + * mode for DMA. This is the scatter-gather version of the + * above pci_map_single interface. Here the scatter gather list + * elements are each tagged with the appropriate dma address + * and length. They are obtained via sg_dma_{address,length}(SG). + * + * NOTE: An implementation may be able to use a smaller number of + * DMA address/length pairs than there are SG table elements. + * (for example via virtual mapping capabilities) + * The routine returns the number of addr/length pairs actually + * used, at most nents. + * + * Device ownership issues as mentioned above for pci_map_single are + * the same here. + */ +static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, + int nents, int direction) +{ + int i; + + for (i = 0; i < nents; i++ ) { + struct scatterlist *s = &sg[i]; + BUG_ON(!s->page); + s->dma_address = virt_to_bus(page_address(s->page) +s->offset); + if (!check_addr("map_sg", hwdev, s->dma_address, s->length)) + return 0; + s->dma_length = s->length; + } + return nents; +} + +/* Unmap a set of streaming mode DMA translations. + * Again, cpu read rules concerning calls here are the same as for + * pci_unmap_single() above. + */ +static void nommu_unmap_sg(struct device *dev, struct scatterlist *sg, + int nents, int dir) +{ +} + +const struct dma_mapping_ops nommu_dma_ops = { + .map_single = nommu_map_single, + .unmap_single = nommu_unmap_single, + .map_sg = nommu_map_sg, + .unmap_sg = nommu_unmap_sg, + .is_phys = 1, +}; + +void __init no_iommu_init(void) +{ + if (dma_ops) + return; + + force_iommu = 0; /* no HW IOMMU */ + dma_ops = &nommu_dma_ops; +} diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c new file mode 100644 index 0000000..b2f405e --- /dev/null +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -0,0 +1,44 @@ +/* Glue code to lib/swiotlb.c */ + +#include +#include +#include +#include + +#include +#include +#include + +int swiotlb __read_mostly; +EXPORT_SYMBOL(swiotlb); + +const struct dma_mapping_ops swiotlb_dma_ops = { + .mapping_error = swiotlb_dma_mapping_error, + .alloc_coherent = swiotlb_alloc_coherent, + .free_coherent = swiotlb_free_coherent, + .map_single = swiotlb_map_single, + .unmap_single = swiotlb_unmap_single, + .sync_single_for_cpu = swiotlb_sync_single_for_cpu, + .sync_single_for_device = swiotlb_sync_single_for_device, + .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, + .sync_single_range_for_device = swiotlb_sync_single_range_for_device, + .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = swiotlb_sync_sg_for_device, + .map_sg = swiotlb_map_sg, + .unmap_sg = swiotlb_unmap_sg, + .dma_supported = NULL, +}; + +void __init pci_swiotlb_init(void) +{ + /* don't initialize swiotlb if iommu=off (no_iommu=1) */ + if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN) + swiotlb = 1; + if (swiotlb_force) + swiotlb = 1; + if (swiotlb) { + printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); + swiotlb_init(); + dma_ops = &swiotlb_dma_ops; + } +} diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c new file mode 100644 index 0000000..ae8f912 --- /dev/null +++ b/arch/x86/kernel/pmtimer_64.c @@ -0,0 +1,69 @@ +/* Ported over from i386 by AK, original copyright was: + * + * (C) Dominik Brodowski 2003 + * + * Driver to use the Power Management Timer (PMTMR) available in some + * southbridges as primary timing source for the Linux kernel. + * + * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, + * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. + * + * This file is licensed under the GPL v2. + * + * Dropped all the hardware bug workarounds for now. Hopefully they + * are not needed on 64bit chipsets. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ + +static inline u32 cyc2us(u32 cycles) +{ + /* The Power Management Timer ticks at 3.579545 ticks per microsecond. + * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] + * + * Even with HZ = 100, delta is at maximum 35796 ticks, so it can + * easily be multiplied with 286 (=0x11E) without having to fear + * u32 overflows. + */ + cycles *= 286; + return (cycles >> 10); +} + +static unsigned pmtimer_wait_tick(void) +{ + u32 a, b; + for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK; + a == b; + b = inl(pmtmr_ioport) & ACPI_PM_MASK) + cpu_relax(); + return b; +} + +/* note: wait time is rounded up to one tick */ +void pmtimer_wait(unsigned us) +{ + u32 a, b; + a = pmtimer_wait_tick(); + do { + b = inl(pmtmr_ioport); + cpu_relax(); + } while (cyc2us(b - a) < us); +} + +static int __init nopmtimer_setup(char *s) +{ + pmtmr_ioport = 0; + return 1; +} + +__setup("nopmtimer", nopmtimer_setup); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c new file mode 100644 index 0000000..9895655 --- /dev/null +++ b/arch/x86/kernel/process_64.c @@ -0,0 +1,903 @@ +/* + * linux/arch/x86-64/kernel/process.c + * + * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + * + * X86-64 port + * Andi Kleen. + * + * CPU hotplug support - ashok.raj@intel.com + */ + +/* + * This file handles the architecture-dependent parts of process handling.. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage extern void ret_from_fork(void); + +unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; + +unsigned long boot_option_idle_override = 0; +EXPORT_SYMBOL(boot_option_idle_override); + +/* + * Powermanagement idle function, if any.. + */ +void (*pm_idle)(void); +EXPORT_SYMBOL(pm_idle); +static DEFINE_PER_CPU(unsigned int, cpu_idle_state); + +static ATOMIC_NOTIFIER_HEAD(idle_notifier); + +void idle_notifier_register(struct notifier_block *n) +{ + atomic_notifier_chain_register(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_register); + +void idle_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} +EXPORT_SYMBOL(idle_notifier_unregister); + +void enter_idle(void) +{ + write_pda(isidle, 1); + atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); +} + +static void __exit_idle(void) +{ + if (test_and_clear_bit_pda(0, isidle) == 0) + return; + atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); +} + +/* Called from interrupts to signify idle end */ +void exit_idle(void) +{ + /* idle loop has pid 0 */ + if (current->pid) + return; + __exit_idle(); +} + +/* + * We use this if we don't have any better + * idle routine.. + */ +static void default_idle(void) +{ + current_thread_info()->status &= ~TS_POLLING; + /* + * TS_POLLING-cleared state must be visible before we + * test NEED_RESCHED: + */ + smp_mb(); + local_irq_disable(); + if (!need_resched()) { + /* Enables interrupts one instruction before HLT. + x86 special cases this so there is no race. */ + safe_halt(); + } else + local_irq_enable(); + current_thread_info()->status |= TS_POLLING; +} + +/* + * On SMP it's slightly faster (but much more power-consuming!) + * to poll the ->need_resched flag instead of waiting for the + * cross-CPU IPI to arrive. Use this option with caution. + */ +static void poll_idle (void) +{ + local_irq_enable(); + cpu_relax(); +} + +void cpu_idle_wait(void) +{ + unsigned int cpu, this_cpu = get_cpu(); + cpumask_t map, tmp = current->cpus_allowed; + + set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); + put_cpu(); + + cpus_clear(map); + for_each_online_cpu(cpu) { + per_cpu(cpu_idle_state, cpu) = 1; + cpu_set(cpu, map); + } + + __get_cpu_var(cpu_idle_state) = 0; + + wmb(); + do { + ssleep(1); + for_each_online_cpu(cpu) { + if (cpu_isset(cpu, map) && + !per_cpu(cpu_idle_state, cpu)) + cpu_clear(cpu, map); + } + cpus_and(map, map, cpu_online_map); + } while (!cpus_empty(map)); + + set_cpus_allowed(current, tmp); +} +EXPORT_SYMBOL_GPL(cpu_idle_wait); + +#ifdef CONFIG_HOTPLUG_CPU +DECLARE_PER_CPU(int, cpu_state); + +#include +/* We halt the CPU with physical CPU hotplug */ +static inline void play_dead(void) +{ + idle_task_exit(); + wbinvd(); + mb(); + /* Ack it */ + __get_cpu_var(cpu_state) = CPU_DEAD; + + local_irq_disable(); + while (1) + halt(); +} +#else +static inline void play_dead(void) +{ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +/* + * The idle thread. There's no useful work to be + * done, so just try to conserve power and have a + * low exit latency (ie sit in a loop waiting for + * somebody to say that they'd like to reschedule) + */ +void cpu_idle (void) +{ + current_thread_info()->status |= TS_POLLING; + /* endless idle loop with no priority at all */ + while (1) { + while (!need_resched()) { + void (*idle)(void); + + if (__get_cpu_var(cpu_idle_state)) + __get_cpu_var(cpu_idle_state) = 0; + + rmb(); + idle = pm_idle; + if (!idle) + idle = default_idle; + if (cpu_is_offline(smp_processor_id())) + play_dead(); + /* + * Idle routines should keep interrupts disabled + * from here on, until they go to idle. + * Otherwise, idle callbacks can misfire. + */ + local_irq_disable(); + enter_idle(); + idle(); + /* In many cases the interrupt that ended idle + has already called exit_idle. But some idle + loops can be woken up without interrupt. */ + __exit_idle(); + } + + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + } +} + +/* + * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, + * which can obviate IPI to trigger checking of need_resched. + * We execute MONITOR against need_resched and enter optimized wait state + * through MWAIT. Whenever someone changes need_resched, we would be woken + * up from MWAIT (without an IPI). + * + * New with Core Duo processors, MWAIT can take some hints based on CPU + * capability. + */ +void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) +{ + if (!need_resched()) { + __monitor((void *)¤t_thread_info()->flags, 0, 0); + smp_mb(); + if (!need_resched()) + __mwait(eax, ecx); + } +} + +/* Default MONITOR/MWAIT with no hints, used for default C1 state */ +static void mwait_idle(void) +{ + if (!need_resched()) { + __monitor((void *)¤t_thread_info()->flags, 0, 0); + smp_mb(); + if (!need_resched()) + __sti_mwait(0, 0); + else + local_irq_enable(); + } else { + local_irq_enable(); + } +} + +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) +{ + static int printed; + if (cpu_has(c, X86_FEATURE_MWAIT)) { + /* + * Skip, if setup has overridden idle. + * One CPU supports mwait => All CPUs supports mwait + */ + if (!pm_idle) { + if (!printed) { + printk(KERN_INFO "using mwait in idle threads.\n"); + printed = 1; + } + pm_idle = mwait_idle; + } + } +} + +static int __init idle_setup (char *str) +{ + if (!strcmp(str, "poll")) { + printk("using polling idle threads.\n"); + pm_idle = poll_idle; + } else if (!strcmp(str, "mwait")) + force_mwait = 1; + else + return -1; + + boot_option_idle_override = 1; + return 0; +} +early_param("idle", idle_setup); + +/* Prints also some state that isn't saved in the pt_regs */ +void __show_regs(struct pt_regs * regs) +{ + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; + unsigned long d0, d1, d2, d3, d6, d7; + unsigned int fsindex,gsindex; + unsigned int ds,cs,es; + + printk("\n"); + print_modules(); + printk("Pid: %d, comm: %.20s %s %s %.*s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); + printk_address(regs->rip); + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, + regs->eflags); + printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", + regs->rax, regs->rbx, regs->rcx); + printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", + regs->rdx, regs->rsi, regs->rdi); + printk("RBP: %016lx R08: %016lx R09: %016lx\n", + regs->rbp, regs->r8, regs->r9); + printk("R10: %016lx R11: %016lx R12: %016lx\n", + regs->r10, regs->r11, regs->r12); + printk("R13: %016lx R14: %016lx R15: %016lx\n", + regs->r13, regs->r14, regs->r15); + + asm("movl %%ds,%0" : "=r" (ds)); + asm("movl %%cs,%0" : "=r" (cs)); + asm("movl %%es,%0" : "=r" (es)); + asm("movl %%fs,%0" : "=r" (fsindex)); + asm("movl %%gs,%0" : "=r" (gsindex)); + + rdmsrl(MSR_FS_BASE, fs); + rdmsrl(MSR_GS_BASE, gs); + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = read_cr3(); + cr4 = read_cr4(); + + printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + fs,fsindex,gs,gsindex,shadowgs); + printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); + printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); + + get_debugreg(d0, 0); + get_debugreg(d1, 1); + get_debugreg(d2, 2); + printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); + get_debugreg(d3, 3); + get_debugreg(d6, 6); + get_debugreg(d7, 7); + printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); +} + +void show_regs(struct pt_regs *regs) +{ + printk("CPU %d:", smp_processor_id()); + __show_regs(regs); + show_trace(NULL, regs, (void *)(regs + 1)); +} + +/* + * Free current thread data structures etc.. + */ +void exit_thread(void) +{ + struct task_struct *me = current; + struct thread_struct *t = &me->thread; + + if (me->thread.io_bitmap_ptr) { + struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + + kfree(t->io_bitmap_ptr); + t->io_bitmap_ptr = NULL; + clear_thread_flag(TIF_IO_BITMAP); + /* + * Careful, clear this in the TSS too: + */ + memset(tss->io_bitmap, 0xff, t->io_bitmap_max); + t->io_bitmap_max = 0; + put_cpu(); + } +} + +void flush_thread(void) +{ + struct task_struct *tsk = current; + + if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { + clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); + if (test_tsk_thread_flag(tsk, TIF_IA32)) { + clear_tsk_thread_flag(tsk, TIF_IA32); + } else { + set_tsk_thread_flag(tsk, TIF_IA32); + current_thread_info()->status |= TS_COMPAT; + } + } + clear_tsk_thread_flag(tsk, TIF_DEBUG); + + tsk->thread.debugreg0 = 0; + tsk->thread.debugreg1 = 0; + tsk->thread.debugreg2 = 0; + tsk->thread.debugreg3 = 0; + tsk->thread.debugreg6 = 0; + tsk->thread.debugreg7 = 0; + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + /* + * Forget coprocessor state.. + */ + clear_fpu(tsk); + clear_used_math(); +} + +void release_thread(struct task_struct *dead_task) +{ + if (dead_task->mm) { + if (dead_task->mm->context.size) { + printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", + dead_task->comm, + dead_task->mm->context.ldt, + dead_task->mm->context.size); + BUG(); + } + } +} + +static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) +{ + struct user_desc ud = { + .base_addr = addr, + .limit = 0xfffff, + .seg_32bit = 1, + .limit_in_pages = 1, + .useable = 1, + }; + struct n_desc_struct *desc = (void *)t->thread.tls_array; + desc += tls; + desc->a = LDT_entry_a(&ud); + desc->b = LDT_entry_b(&ud); +} + +static inline u32 read_32bit_tls(struct task_struct *t, int tls) +{ + struct desc_struct *desc = (void *)t->thread.tls_array; + desc += tls; + return desc->base0 | + (((u32)desc->base1) << 16) | + (((u32)desc->base2) << 24); +} + +/* + * This gets called before we allocate a new thread and copy + * the current task into it. + */ +void prepare_to_copy(struct task_struct *tsk) +{ + unlazy_fpu(tsk); +} + +int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, + unsigned long unused, + struct task_struct * p, struct pt_regs * regs) +{ + int err; + struct pt_regs * childregs; + struct task_struct *me = current; + + childregs = ((struct pt_regs *) + (THREAD_SIZE + task_stack_page(p))) - 1; + *childregs = *regs; + + childregs->rax = 0; + childregs->rsp = rsp; + if (rsp == ~0UL) + childregs->rsp = (unsigned long)childregs; + + p->thread.rsp = (unsigned long) childregs; + p->thread.rsp0 = (unsigned long) (childregs+1); + p->thread.userrsp = me->thread.userrsp; + + set_tsk_thread_flag(p, TIF_FORK); + + p->thread.fs = me->thread.fs; + p->thread.gs = me->thread.gs; + + asm("mov %%gs,%0" : "=m" (p->thread.gsindex)); + asm("mov %%fs,%0" : "=m" (p->thread.fsindex)); + asm("mov %%es,%0" : "=m" (p->thread.es)); + asm("mov %%ds,%0" : "=m" (p->thread.ds)); + + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { + p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!p->thread.io_bitmap_ptr) { + p->thread.io_bitmap_max = 0; + return -ENOMEM; + } + memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, + IO_BITMAP_BYTES); + set_tsk_thread_flag(p, TIF_IO_BITMAP); + } + + /* + * Set a new TLS for the child thread? + */ + if (clone_flags & CLONE_SETTLS) { +#ifdef CONFIG_IA32_EMULATION + if (test_thread_flag(TIF_IA32)) + err = ia32_child_tls(p, childregs); + else +#endif + err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); + if (err) + goto out; + } + err = 0; +out: + if (err && p->thread.io_bitmap_ptr) { + kfree(p->thread.io_bitmap_ptr); + p->thread.io_bitmap_max = 0; + } + return err; +} + +/* + * This special macro can be used to load a debugging register + */ +#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) + +static inline void __switch_to_xtra(struct task_struct *prev_p, + struct task_struct *next_p, + struct tss_struct *tss) +{ + struct thread_struct *prev, *next; + + prev = &prev_p->thread, + next = &next_p->thread; + + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { + loaddebug(next, 0); + loaddebug(next, 1); + loaddebug(next, 2); + loaddebug(next, 3); + /* no 4 and 5 */ + loaddebug(next, 6); + loaddebug(next, 7); + } + + if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { + /* + * Copy the relevant range of the IO bitmap. + * Normally this is 128 bytes or less: + */ + memcpy(tss->io_bitmap, next->io_bitmap_ptr, + max(prev->io_bitmap_max, next->io_bitmap_max)); + } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { + /* + * Clear any possible leftover bits: + */ + memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); + } +} + +/* + * switch_to(x,y) should switch tasks from x to y. + * + * This could still be optimized: + * - fold all the options into a flag word and test it with a single test. + * - could test fs/gs bitsliced + * + * Kprobes not supported here. Set the probe on schedule instead. + */ +__kprobes struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev = &prev_p->thread, + *next = &next_p->thread; + int cpu = smp_processor_id(); + struct tss_struct *tss = &per_cpu(init_tss, cpu); + + /* we're going to use this soon, after a few expensive things */ + if (next_p->fpu_counter>5) + prefetch(&next->i387.fxsave); + + /* + * Reload esp0, LDT and the page table pointer: + */ + tss->rsp0 = next->rsp0; + + /* + * Switch DS and ES. + * This won't pick up thread selector changes, but I guess that is ok. + */ + asm volatile("mov %%es,%0" : "=m" (prev->es)); + if (unlikely(next->es | prev->es)) + loadsegment(es, next->es); + + asm volatile ("mov %%ds,%0" : "=m" (prev->ds)); + if (unlikely(next->ds | prev->ds)) + loadsegment(ds, next->ds); + + load_TLS(next, cpu); + + /* + * Switch FS and GS. + */ + { + unsigned fsindex; + asm volatile("movl %%fs,%0" : "=r" (fsindex)); + /* segment register != 0 always requires a reload. + also reload when it has changed. + when prev process used 64bit base always reload + to avoid an information leak. */ + if (unlikely(fsindex | next->fsindex | prev->fs)) { + loadsegment(fs, next->fsindex); + /* check if the user used a selector != 0 + * if yes clear 64bit base, since overloaded base + * is always mapped to the Null selector + */ + if (fsindex) + prev->fs = 0; + } + /* when next process has a 64bit base use it */ + if (next->fs) + wrmsrl(MSR_FS_BASE, next->fs); + prev->fsindex = fsindex; + } + { + unsigned gsindex; + asm volatile("movl %%gs,%0" : "=r" (gsindex)); + if (unlikely(gsindex | next->gsindex | prev->gs)) { + load_gs_index(next->gsindex); + if (gsindex) + prev->gs = 0; + } + if (next->gs) + wrmsrl(MSR_KERNEL_GS_BASE, next->gs); + prev->gsindex = gsindex; + } + + /* Must be after DS reload */ + unlazy_fpu(prev_p); + + /* + * Switch the PDA and FPU contexts. + */ + prev->userrsp = read_pda(oldrsp); + write_pda(oldrsp, next->userrsp); + write_pda(pcurrent, next_p); + + write_pda(kernelstack, + (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); +#ifdef CONFIG_CC_STACKPROTECTOR + write_pda(stack_canary, next_p->stack_canary); + /* + * Build time only check to make sure the stack_canary is at + * offset 40 in the pda; this is a gcc ABI requirement + */ + BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40); +#endif + + /* + * Now maybe reload the debug registers and handle I/O bitmaps + */ + if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) + || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) + __switch_to_xtra(prev_p, next_p, tss); + + /* If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + if (next_p->fpu_counter>5) + math_state_restore(); + return prev_p; +} + +/* + * sys_execve() executes a new program. + */ +asmlinkage +long sys_execve(char __user *name, char __user * __user *argv, + char __user * __user *envp, struct pt_regs regs) +{ + long error; + char * filename; + + filename = getname(name); + error = PTR_ERR(filename); + if (IS_ERR(filename)) + return error; + error = do_execve(filename, argv, envp, ®s); + if (error == 0) { + task_lock(current); + current->ptrace &= ~PT_DTRACE; + task_unlock(current); + } + putname(filename); + return error; +} + +void set_personality_64bit(void) +{ + /* inherit personality from parent */ + + /* Make sure to be in 64bit mode */ + clear_thread_flag(TIF_IA32); + + /* TBD: overwrites user setup. Should have two bits. + But 64bit processes have always behaved this way, + so it's not too bad. The main problem is just that + 32bit childs are affected again. */ + current->personality &= ~READ_IMPLIES_EXEC; +} + +asmlinkage long sys_fork(struct pt_regs *regs) +{ + return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); +} + +asmlinkage long +sys_clone(unsigned long clone_flags, unsigned long newsp, + void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) +{ + if (!newsp) + newsp = regs->rsp; + return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); +} + +/* + * This is trivial, and on the face of it looks like it + * could equally well be done in user mode. + * + * Not so, for quite unobvious reasons - register pressure. + * In user mode vfork() cannot have a stack frame, and if + * done by calling the "clone()" system call directly, you + * do not have enough call-clobbered registers to hold all + * the information you need. + */ +asmlinkage long sys_vfork(struct pt_regs *regs) +{ + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, + NULL, NULL); +} + +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long stack; + u64 fp,rip; + int count = 0; + + if (!p || p == current || p->state==TASK_RUNNING) + return 0; + stack = (unsigned long)task_stack_page(p); + if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) + return 0; + fp = *(u64 *)(p->thread.rsp); + do { + if (fp < (unsigned long)stack || + fp > (unsigned long)stack+THREAD_SIZE) + return 0; + rip = *(u64 *)(fp+8); + if (!in_sched_functions(rip)) + return rip; + fp = *(u64 *)fp; + } while (count++ < 16); + return 0; +} + +long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) +{ + int ret = 0; + int doit = task == current; + int cpu; + + switch (code) { + case ARCH_SET_GS: + if (addr >= TASK_SIZE_OF(task)) + return -EPERM; + cpu = get_cpu(); + /* handle small bases via the GDT because that's faster to + switch. */ + if (addr <= 0xffffffff) { + set_32bit_tls(task, GS_TLS, addr); + if (doit) { + load_TLS(&task->thread, cpu); + load_gs_index(GS_TLS_SEL); + } + task->thread.gsindex = GS_TLS_SEL; + task->thread.gs = 0; + } else { + task->thread.gsindex = 0; + task->thread.gs = addr; + if (doit) { + load_gs_index(0); + ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); + } + } + put_cpu(); + break; + case ARCH_SET_FS: + /* Not strictly needed for fs, but do it for symmetry + with gs */ + if (addr >= TASK_SIZE_OF(task)) + return -EPERM; + cpu = get_cpu(); + /* handle small bases via the GDT because that's faster to + switch. */ + if (addr <= 0xffffffff) { + set_32bit_tls(task, FS_TLS, addr); + if (doit) { + load_TLS(&task->thread, cpu); + asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); + } + task->thread.fsindex = FS_TLS_SEL; + task->thread.fs = 0; + } else { + task->thread.fsindex = 0; + task->thread.fs = addr; + if (doit) { + /* set the selector to 0 to not confuse + __switch_to */ + asm volatile("movl %0,%%fs" :: "r" (0)); + ret = checking_wrmsrl(MSR_FS_BASE, addr); + } + } + put_cpu(); + break; + case ARCH_GET_FS: { + unsigned long base; + if (task->thread.fsindex == FS_TLS_SEL) + base = read_32bit_tls(task, FS_TLS); + else if (doit) + rdmsrl(MSR_FS_BASE, base); + else + base = task->thread.fs; + ret = put_user(base, (unsigned long __user *)addr); + break; + } + case ARCH_GET_GS: { + unsigned long base; + unsigned gsindex; + if (task->thread.gsindex == GS_TLS_SEL) + base = read_32bit_tls(task, GS_TLS); + else if (doit) { + asm("movl %%gs,%0" : "=r" (gsindex)); + if (gsindex) + rdmsrl(MSR_KERNEL_GS_BASE, base); + else + base = task->thread.gs; + } + else + base = task->thread.gs; + ret = put_user(base, (unsigned long __user *)addr); + break; + } + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +long sys_arch_prctl(int code, unsigned long addr) +{ + return do_arch_prctl(current, code, addr); +} + +/* + * Capture the user space registers if the task is not running (in user space) + */ +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) +{ + struct pt_regs *pp, ptregs; + + pp = task_pt_regs(tsk); + + ptregs = *pp; + ptregs.cs &= 0xffff; + ptregs.ss &= 0xffff; + + elf_core_copy_regs(regs, &ptregs); + + return 1; +} + +unsigned long arch_align_stack(unsigned long sp) +{ + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} diff --git a/arch/x86/kernel/ptrace_64.c b/arch/x86/kernel/ptrace_64.c new file mode 100644 index 0000000..eea3702 --- /dev/null +++ b/arch/x86/kernel/ptrace_64.c @@ -0,0 +1,627 @@ +/* ptrace.c */ +/* By Ross Biro 1/23/92 */ +/* + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + * + * x86-64 port 2000-2002 Andi Kleen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * does not yet catch signals sent when the child dies. + * in exit.c or in signal.c. + */ + +/* + * Determines which flags the user has access to [1 = access, 0 = no access]. + * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9). + * Also masks reserved bits (63-22, 15, 5, 3, 1). + */ +#define FLAG_MASK 0x54dd5UL + +/* set's the trap flag. */ +#define TRAP_FLAG 0x100UL + +/* + * eflags and offset of eflags on child stack.. + */ +#define EFLAGS offsetof(struct pt_regs, eflags) +#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs))) + +/* + * this routine will get a word off of the processes privileged stack. + * the offset is how far from the base addr as stored in the TSS. + * this routine assumes that all the privileged stacks are in our + * data space. + */ +static inline unsigned long get_stack_long(struct task_struct *task, int offset) +{ + unsigned char *stack; + + stack = (unsigned char *)task->thread.rsp0; + stack += offset; + return (*((unsigned long *)stack)); +} + +/* + * this routine will put a word on the processes privileged stack. + * the offset is how far from the base addr as stored in the TSS. + * this routine assumes that all the privileged stacks are in our + * data space. + */ +static inline long put_stack_long(struct task_struct *task, int offset, + unsigned long data) +{ + unsigned char * stack; + + stack = (unsigned char *) task->thread.rsp0; + stack += offset; + *(unsigned long *) stack = data; + return 0; +} + +#define LDT_SEGMENT 4 + +unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs) +{ + unsigned long addr, seg; + + addr = regs->rip; + seg = regs->cs & 0xffff; + + /* + * We'll assume that the code segments in the GDT + * are all zero-based. That is largely true: the + * TLS segments are used for data, and the PNPBIOS + * and APM bios ones we just ignore here. + */ + if (seg & LDT_SEGMENT) { + u32 *desc; + unsigned long base; + + seg &= ~7UL; + + down(&child->mm->context.sem); + if (unlikely((seg >> 3) >= child->mm->context.size)) + addr = -1L; /* bogus selector, access would fault */ + else { + desc = child->mm->context.ldt + seg; + base = ((desc[0] >> 16) | + ((desc[1] & 0xff) << 16) | + (desc[1] & 0xff000000)); + + /* 16-bit code segment? */ + if (!((desc[1] >> 22) & 1)) + addr &= 0xffff; + addr += base; + } + up(&child->mm->context.sem); + } + + return addr; +} + +static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) +{ + int i, copied; + unsigned char opcode[15]; + unsigned long addr = convert_rip_to_linear(child, regs); + + copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); + for (i = 0; i < copied; i++) { + switch (opcode[i]) { + /* popf and iret */ + case 0x9d: case 0xcf: + return 1; + + /* CHECKME: 64 65 */ + + /* opcode and address size prefixes */ + case 0x66: case 0x67: + continue; + /* irrelevant prefixes (segment overrides and repeats) */ + case 0x26: case 0x2e: + case 0x36: case 0x3e: + case 0x64: case 0x65: + case 0xf2: case 0xf3: + continue; + + case 0x40 ... 0x4f: + if (regs->cs != __USER_CS) + /* 32-bit mode: register increment */ + return 0; + /* 64-bit mode: REX prefix */ + continue; + + /* CHECKME: f2, f3 */ + + /* + * pushf: NOTE! We should probably not let + * the user see the TF bit being set. But + * it's more pain than it's worth to avoid + * it, and a debugger could emulate this + * all in user space if it _really_ cares. + */ + case 0x9c: + default: + return 0; + } + } + return 0; +} + +static void set_singlestep(struct task_struct *child) +{ + struct pt_regs *regs = task_pt_regs(child); + + /* + * Always set TIF_SINGLESTEP - this guarantees that + * we single-step system calls etc.. This will also + * cause us to set TF when returning to user mode. + */ + set_tsk_thread_flag(child, TIF_SINGLESTEP); + + /* + * If TF was already set, don't do anything else + */ + if (regs->eflags & TRAP_FLAG) + return; + + /* Set TF on the kernel stack.. */ + regs->eflags |= TRAP_FLAG; + + /* + * ..but if TF is changed by the instruction we will trace, + * don't mark it as being "us" that set it, so that we + * won't clear it by hand later. + */ + if (is_setting_trap_flag(child, regs)) + return; + + child->ptrace |= PT_DTRACE; +} + +static void clear_singlestep(struct task_struct *child) +{ + /* Always clear TIF_SINGLESTEP... */ + clear_tsk_thread_flag(child, TIF_SINGLESTEP); + + /* But touch TF only if it was set by us.. */ + if (child->ptrace & PT_DTRACE) { + struct pt_regs *regs = task_pt_regs(child); + regs->eflags &= ~TRAP_FLAG; + child->ptrace &= ~PT_DTRACE; + } +} + +/* + * Called by kernel/ptrace.c when detaching.. + * + * Make sure the single step bit is not set. + */ +void ptrace_disable(struct task_struct *child) +{ + clear_singlestep(child); +} + +static int putreg(struct task_struct *child, + unsigned long regno, unsigned long value) +{ + unsigned long tmp; + + switch (regno) { + case offsetof(struct user_regs_struct,fs): + if (value && (value & 3) != 3) + return -EIO; + child->thread.fsindex = value & 0xffff; + return 0; + case offsetof(struct user_regs_struct,gs): + if (value && (value & 3) != 3) + return -EIO; + child->thread.gsindex = value & 0xffff; + return 0; + case offsetof(struct user_regs_struct,ds): + if (value && (value & 3) != 3) + return -EIO; + child->thread.ds = value & 0xffff; + return 0; + case offsetof(struct user_regs_struct,es): + if (value && (value & 3) != 3) + return -EIO; + child->thread.es = value & 0xffff; + return 0; + case offsetof(struct user_regs_struct,ss): + if ((value & 3) != 3) + return -EIO; + value &= 0xffff; + return 0; + case offsetof(struct user_regs_struct,fs_base): + if (value >= TASK_SIZE_OF(child)) + return -EIO; + child->thread.fs = value; + return 0; + case offsetof(struct user_regs_struct,gs_base): + if (value >= TASK_SIZE_OF(child)) + return -EIO; + child->thread.gs = value; + return 0; + case offsetof(struct user_regs_struct, eflags): + value &= FLAG_MASK; + tmp = get_stack_long(child, EFL_OFFSET); + tmp &= ~FLAG_MASK; + value |= tmp; + break; + case offsetof(struct user_regs_struct,cs): + if ((value & 3) != 3) + return -EIO; + value &= 0xffff; + break; + } + put_stack_long(child, regno - sizeof(struct pt_regs), value); + return 0; +} + +static unsigned long getreg(struct task_struct *child, unsigned long regno) +{ + unsigned long val; + switch (regno) { + case offsetof(struct user_regs_struct, fs): + return child->thread.fsindex; + case offsetof(struct user_regs_struct, gs): + return child->thread.gsindex; + case offsetof(struct user_regs_struct, ds): + return child->thread.ds; + case offsetof(struct user_regs_struct, es): + return child->thread.es; + case offsetof(struct user_regs_struct, fs_base): + return child->thread.fs; + case offsetof(struct user_regs_struct, gs_base): + return child->thread.gs; + default: + regno = regno - sizeof(struct pt_regs); + val = get_stack_long(child, regno); + if (test_tsk_thread_flag(child, TIF_IA32)) + val &= 0xffffffff; + return val; + } + +} + +long arch_ptrace(struct task_struct *child, long request, long addr, long data) +{ + long i, ret; + unsigned ui; + + switch (request) { + /* when I and D space are separate, these will need to be fixed. */ + case PTRACE_PEEKTEXT: /* read word at location addr. */ + case PTRACE_PEEKDATA: + ret = generic_ptrace_peekdata(child, addr, data); + break; + + /* read the word at location addr in the USER area. */ + case PTRACE_PEEKUSR: { + unsigned long tmp; + + ret = -EIO; + if ((addr & 7) || + addr > sizeof(struct user) - 7) + break; + + switch (addr) { + case 0 ... sizeof(struct user_regs_struct) - sizeof(long): + tmp = getreg(child, addr); + break; + case offsetof(struct user, u_debugreg[0]): + tmp = child->thread.debugreg0; + break; + case offsetof(struct user, u_debugreg[1]): + tmp = child->thread.debugreg1; + break; + case offsetof(struct user, u_debugreg[2]): + tmp = child->thread.debugreg2; + break; + case offsetof(struct user, u_debugreg[3]): + tmp = child->thread.debugreg3; + break; + case offsetof(struct user, u_debugreg[6]): + tmp = child->thread.debugreg6; + break; + case offsetof(struct user, u_debugreg[7]): + tmp = child->thread.debugreg7; + break; + default: + tmp = 0; + break; + } + ret = put_user(tmp,(unsigned long __user *) data); + break; + } + + /* when I and D space are separate, this will have to be fixed. */ + case PTRACE_POKETEXT: /* write the word at location addr. */ + case PTRACE_POKEDATA: + ret = generic_ptrace_pokedata(child, addr, data); + break; + + case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ + { + int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 3 : 7; + ret = -EIO; + if ((addr & 7) || + addr > sizeof(struct user) - 7) + break; + + switch (addr) { + case 0 ... sizeof(struct user_regs_struct) - sizeof(long): + ret = putreg(child, addr, data); + break; + /* Disallows to set a breakpoint into the vsyscall */ + case offsetof(struct user, u_debugreg[0]): + if (data >= TASK_SIZE_OF(child) - dsize) break; + child->thread.debugreg0 = data; + ret = 0; + break; + case offsetof(struct user, u_debugreg[1]): + if (data >= TASK_SIZE_OF(child) - dsize) break; + child->thread.debugreg1 = data; + ret = 0; + break; + case offsetof(struct user, u_debugreg[2]): + if (data >= TASK_SIZE_OF(child) - dsize) break; + child->thread.debugreg2 = data; + ret = 0; + break; + case offsetof(struct user, u_debugreg[3]): + if (data >= TASK_SIZE_OF(child) - dsize) break; + child->thread.debugreg3 = data; + ret = 0; + break; + case offsetof(struct user, u_debugreg[6]): + if (data >> 32) + break; + child->thread.debugreg6 = data; + ret = 0; + break; + case offsetof(struct user, u_debugreg[7]): + /* See arch/i386/kernel/ptrace.c for an explanation of + * this awkward check.*/ + data &= ~DR_CONTROL_RESERVED; + for(i=0; i<4; i++) + if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1) + break; + if (i == 4) { + child->thread.debugreg7 = data; + if (data) + set_tsk_thread_flag(child, TIF_DEBUG); + else + clear_tsk_thread_flag(child, TIF_DEBUG); + ret = 0; + } + break; + } + break; + } + case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ + case PTRACE_CONT: /* restart after signal. */ + + ret = -EIO; + if (!valid_signal(data)) + break; + if (request == PTRACE_SYSCALL) + set_tsk_thread_flag(child,TIF_SYSCALL_TRACE); + else + clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); + clear_tsk_thread_flag(child, TIF_SINGLESTEP); + child->exit_code = data; + /* make sure the single step bit is not set. */ + clear_singlestep(child); + wake_up_process(child); + ret = 0; + break; + +#ifdef CONFIG_IA32_EMULATION + /* This makes only sense with 32bit programs. Allow a + 64bit debugger to fully examine them too. Better + don't use it against 64bit processes, use + PTRACE_ARCH_PRCTL instead. */ + case PTRACE_SET_THREAD_AREA: { + struct user_desc __user *p; + int old; + p = (struct user_desc __user *)data; + get_user(old, &p->entry_number); + put_user(addr, &p->entry_number); + ret = do_set_thread_area(&child->thread, p); + put_user(old, &p->entry_number); + break; + case PTRACE_GET_THREAD_AREA: + p = (struct user_desc __user *)data; + get_user(old, &p->entry_number); + put_user(addr, &p->entry_number); + ret = do_get_thread_area(&child->thread, p); + put_user(old, &p->entry_number); + break; + } +#endif + /* normal 64bit interface to access TLS data. + Works just like arch_prctl, except that the arguments + are reversed. */ + case PTRACE_ARCH_PRCTL: + ret = do_arch_prctl(child, data, addr); + break; + +/* + * make the child exit. Best I can do is send it a sigkill. + * perhaps it should be put in the status that it wants to + * exit. + */ + case PTRACE_KILL: + ret = 0; + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ + break; + clear_tsk_thread_flag(child, TIF_SINGLESTEP); + child->exit_code = SIGKILL; + /* make sure the single step bit is not set. */ + clear_singlestep(child); + wake_up_process(child); + break; + + case PTRACE_SINGLESTEP: /* set the trap flag. */ + ret = -EIO; + if (!valid_signal(data)) + break; + clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); + set_singlestep(child); + child->exit_code = data; + /* give it a chance to run. */ + wake_up_process(child); + ret = 0; + break; + + case PTRACE_DETACH: + /* detach a process that was attached. */ + ret = ptrace_detach(child, data); + break; + + case PTRACE_GETREGS: { /* Get all gp regs from the child. */ + if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, + sizeof(struct user_regs_struct))) { + ret = -EIO; + break; + } + ret = 0; + for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { + ret |= __put_user(getreg(child, ui),(unsigned long __user *) data); + data += sizeof(long); + } + break; + } + + case PTRACE_SETREGS: { /* Set all gp regs in the child. */ + unsigned long tmp; + if (!access_ok(VERIFY_READ, (unsigned __user *)data, + sizeof(struct user_regs_struct))) { + ret = -EIO; + break; + } + ret = 0; + for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { + ret = __get_user(tmp, (unsigned long __user *) data); + if (ret) + break; + ret = putreg(child, ui, tmp); + if (ret) + break; + data += sizeof(long); + } + break; + } + + case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */ + if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, + sizeof(struct user_i387_struct))) { + ret = -EIO; + break; + } + ret = get_fpregs((struct user_i387_struct __user *)data, child); + break; + } + + case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */ + if (!access_ok(VERIFY_READ, (unsigned __user *)data, + sizeof(struct user_i387_struct))) { + ret = -EIO; + break; + } + set_stopped_child_used_math(child); + ret = set_fpregs(child, (struct user_i387_struct __user *)data); + break; + } + + default: + ret = ptrace_request(child, request, addr, data); + break; + } + return ret; +} + +static void syscall_trace(struct pt_regs *regs) +{ + +#if 0 + printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n", + current->comm, + regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0), + current_thread_info()->flags, current->ptrace); +#endif + + ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) + ? 0x80 : 0)); + /* + * this isn't the same as continuing with a signal, but it will do + * for normal use. strace only continues with a signal if the + * stopping signal is not SIGTRAP. -brl + */ + if (current->exit_code) { + send_sig(current->exit_code, current, 1); + current->exit_code = 0; + } +} + +asmlinkage void syscall_trace_enter(struct pt_regs *regs) +{ + /* do the secure computing check first */ + secure_computing(regs->orig_rax); + + if (test_thread_flag(TIF_SYSCALL_TRACE) + && (current->ptrace & PT_PTRACED)) + syscall_trace(regs); + + if (unlikely(current->audit_context)) { + if (test_thread_flag(TIF_IA32)) { + audit_syscall_entry(AUDIT_ARCH_I386, + regs->orig_rax, + regs->rbx, regs->rcx, + regs->rdx, regs->rsi); + } else { + audit_syscall_entry(AUDIT_ARCH_X86_64, + regs->orig_rax, + regs->rdi, regs->rsi, + regs->rdx, regs->r10); + } + } +} + +asmlinkage void syscall_trace_leave(struct pt_regs *regs) +{ + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax); + + if ((test_thread_flag(TIF_SYSCALL_TRACE) + || test_thread_flag(TIF_SINGLESTEP)) + && (current->ptrace & PT_PTRACED)) + syscall_trace(regs); +} diff --git a/arch/x86/kernel/reboot_64.c b/arch/x86/kernel/reboot_64.c new file mode 100644 index 0000000..368db2b --- /dev/null +++ b/arch/x86/kernel/reboot_64.c @@ -0,0 +1,171 @@ +/* Various gunk just to reboot the machine. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Power off function, if any + */ +void (*pm_power_off)(void); +EXPORT_SYMBOL(pm_power_off); + +static long no_idt[3]; +static enum { + BOOT_TRIPLE = 't', + BOOT_KBD = 'k' +} reboot_type = BOOT_KBD; +static int reboot_mode = 0; +int reboot_force; + +/* reboot=t[riple] | k[bd] [, [w]arm | [c]old] + warm Don't set the cold reboot flag + cold Set the cold reboot flag + triple Force a triple fault (init) + kbd Use the keyboard controller. cold reset (default) + force Avoid anything that could hang. + */ +static int __init reboot_setup(char *str) +{ + for (;;) { + switch (*str) { + case 'w': + reboot_mode = 0x1234; + break; + + case 'c': + reboot_mode = 0; + break; + + case 't': + case 'b': + case 'k': + reboot_type = *str; + break; + case 'f': + reboot_force = 1; + break; + } + if((str = strchr(str,',')) != NULL) + str++; + else + break; + } + return 1; +} + +__setup("reboot=", reboot_setup); + +static inline void kb_wait(void) +{ + int i; + + for (i=0; i<0x10000; i++) + if ((inb_p(0x64) & 0x02) == 0) + break; +} + +void machine_shutdown(void) +{ + unsigned long flags; + + /* Stop the cpus and apics */ +#ifdef CONFIG_SMP + int reboot_cpu_id; + + /* The boot cpu is always logical cpu 0 */ + reboot_cpu_id = 0; + + /* Make certain the cpu I'm about to reboot on is online */ + if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { + reboot_cpu_id = smp_processor_id(); + } + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); + + /* O.K Now that I'm on the appropriate processor, + * stop all of the others. + */ + smp_send_stop(); +#endif + + local_irq_save(flags); + +#ifndef CONFIG_SMP + disable_local_APIC(); +#endif + + disable_IO_APIC(); + + local_irq_restore(flags); + + pci_iommu_shutdown(); +} + +void machine_emergency_restart(void) +{ + int i; + + /* Tell the BIOS if we want cold or warm reboot */ + *((unsigned short *)__va(0x472)) = reboot_mode; + + for (;;) { + /* Could also try the reset bit in the Hammer NB */ + switch (reboot_type) { + case BOOT_KBD: + for (i=0; i<10; i++) { + kb_wait(); + udelay(50); + outb(0xfe,0x64); /* pulse reset low */ + udelay(50); + } + + case BOOT_TRIPLE: + __asm__ __volatile__("lidt (%0)": :"r" (&no_idt)); + __asm__ __volatile__("int3"); + + reboot_type = BOOT_KBD; + break; + } + } +} + +void machine_restart(char * __unused) +{ + printk("machine restart\n"); + + if (!reboot_force) { + machine_shutdown(); + } + machine_emergency_restart(); +} + +void machine_halt(void) +{ +} + +void machine_power_off(void) +{ + if (pm_power_off) { + if (!reboot_force) { + machine_shutdown(); + } + pm_power_off(); + } +} + diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S new file mode 100644 index 0000000..14e9587 --- /dev/null +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -0,0 +1,276 @@ +/* + * relocate_kernel.S - put the kernel image in place to boot + * Copyright (C) 2002-2005 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include +#include +#include + +/* + * Must be relocatable PIC code callable as a C function + */ + +#define PTR(x) (x << 3) +#define PAGE_ALIGNED (1 << PAGE_SHIFT) +#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */ + + .text + .align PAGE_ALIGNED + .code64 + .globl relocate_kernel +relocate_kernel: + /* %rdi indirection_page + * %rsi page_list + * %rdx start address + */ + + /* map the control page at its virtual address */ + + movq $0x0000ff8000000000, %r10 /* mask */ + mov $(39 - 3), %cl /* bits to shift */ + movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PGD)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PUD_0)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PUD_0)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PMD_0)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PMD_0)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PTE_0)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PTE_0)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + /* identity map the control page at its physical address */ + + movq $0x0000ff8000000000, %r10 /* mask */ + mov $(39 - 3), %cl /* bits to shift */ + movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PGD)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PUD_1)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PUD_1)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PMD_1)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PMD_1)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PTE_1)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PTE_1)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + +relocate_new_kernel: + /* %rdi indirection_page + * %rsi page_list + * %rdx start address + */ + + /* zero out flags, and disable interrupts */ + pushq $0 + popfq + + /* get physical address of control page now */ + /* this is impossible after page table switch */ + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + + /* get physical address of page table now too */ + movq PTR(PA_TABLE_PAGE)(%rsi), %rcx + + /* switch to new set of page tables */ + movq PTR(PA_PGD)(%rsi), %r9 + movq %r9, %cr3 + + /* setup a new stack at the end of the physical control page */ + lea 4096(%r8), %rsp + + /* jump to identity mapped page */ + addq $(identity_mapped - relocate_kernel), %r8 + pushq %r8 + ret + +identity_mapped: + /* store the start address on the stack */ + pushq %rdx + + /* Set cr0 to a known state: + * 31 1 == Paging enabled + * 18 0 == Alignment check disabled + * 16 0 == Write protect disabled + * 3 0 == No task switch + * 2 0 == Don't do FP software emulation. + * 0 1 == Proctected mode enabled + */ + movq %cr0, %rax + andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax + orl $((1<<31)|(1<<0)), %eax + movq %rax, %cr0 + + /* Set cr4 to a known state: + * 10 0 == xmm exceptions disabled + * 9 0 == xmm registers instructions disabled + * 8 0 == performance monitoring counter disabled + * 7 0 == page global disabled + * 6 0 == machine check exceptions disabled + * 5 1 == physical address extension enabled + * 4 0 == page size extensions disabled + * 3 0 == Debug extensions disabled + * 2 0 == Time stamp disable (disabled) + * 1 0 == Protected mode virtual interrupts disabled + * 0 0 == VME disabled + */ + + movq $((1<<5)), %rax + movq %rax, %cr4 + + jmp 1f +1: + + /* Switch to the identity mapped page tables, + * and flush the TLB. + */ + movq %rcx, %cr3 + + /* Do the copies */ + movq %rdi, %rcx /* Put the page_list in %rcx */ + xorq %rdi, %rdi + xorq %rsi, %rsi + jmp 1f + +0: /* top, read another word for the indirection page */ + + movq (%rbx), %rcx + addq $8, %rbx +1: + testq $0x1, %rcx /* is it a destination page? */ + jz 2f + movq %rcx, %rdi + andq $0xfffffffffffff000, %rdi + jmp 0b +2: + testq $0x2, %rcx /* is it an indirection page? */ + jz 2f + movq %rcx, %rbx + andq $0xfffffffffffff000, %rbx + jmp 0b +2: + testq $0x4, %rcx /* is it the done indicator? */ + jz 2f + jmp 3f +2: + testq $0x8, %rcx /* is it the source indicator? */ + jz 0b /* Ignore it otherwise */ + movq %rcx, %rsi /* For ever source page do a copy */ + andq $0xfffffffffffff000, %rsi + + movq $512, %rcx + rep ; movsq + jmp 0b +3: + + /* To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB by reloading %cr3 here, it's handy, + * and not processor dependent. + */ + movq %cr3, %rax + movq %rax, %cr3 + + /* set all of the registers to known values */ + /* leave %rsp alone */ + + xorq %rax, %rax + xorq %rbx, %rbx + xorq %rcx, %rcx + xorq %rdx, %rdx + xorq %rsi, %rsi + xorq %rdi, %rdi + xorq %rbp, %rbp + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r9 + xorq %r11, %r11 + xorq %r12, %r12 + xorq %r13, %r13 + xorq %r14, %r14 + xorq %r15, %r15 + + ret diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c new file mode 100644 index 0000000..1200aaa --- /dev/null +++ b/arch/x86/kernel/setup64.c @@ -0,0 +1,289 @@ +/* + * X86-64 specific CPU setup. + * Copyright (C) 1995 Linus Torvalds + * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen. + * See setup.c for older changelog. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +char x86_boot_params[BOOT_PARAM_SIZE] __initdata; + +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; + +struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(_cpu_pda); +struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned; + +struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; + +char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); + +unsigned long __supported_pte_mask __read_mostly = ~0UL; +static int do_not_nx __cpuinitdata = 0; + +/* noexec=on|off +Control non executable mappings for 64bit processes. + +on Enable(default) +off Disable +*/ +static int __init nonx_setup(char *str) +{ + if (!str) + return -EINVAL; + if (!strncmp(str, "on", 2)) { + __supported_pte_mask |= _PAGE_NX; + do_not_nx = 0; + } else if (!strncmp(str, "off", 3)) { + do_not_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } + return 0; +} +early_param("noexec", nonx_setup); + +int force_personality32 = 0; + +/* noexec32=on|off +Control non executable heap for 32bit processes. +To control the stack too use noexec=off + +on PROT_READ does not imply PROT_EXEC for 32bit processes +off PROT_READ implies PROT_EXEC (default) +*/ +static int __init nonx32_setup(char *str) +{ + if (!strcmp(str, "on")) + force_personality32 &= ~READ_IMPLIES_EXEC; + else if (!strcmp(str, "off")) + force_personality32 |= READ_IMPLIES_EXEC; + return 1; +} +__setup("noexec32=", nonx32_setup); + +/* + * Great future plan: + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. + * Always point %gs to its beginning + */ +void __init setup_per_cpu_areas(void) +{ + int i; + unsigned long size; + +#ifdef CONFIG_HOTPLUG_CPU + prefill_possible_map(); +#endif + + /* Copy section for each CPU (we discard the original) */ + size = PERCPU_ENOUGH_ROOM; + + printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size); + for_each_cpu_mask (i, cpu_possible_map) { + char *ptr; + + if (!NODE_DATA(cpu_to_node(i))) { + printk("cpu with no node %d, num_online_nodes %d\n", + i, num_online_nodes()); + ptr = alloc_bootmem_pages(size); + } else { + ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); + } + if (!ptr) + panic("Cannot allocate cpu data for CPU %d\n", i); + cpu_pda(i)->data_offset = ptr - __per_cpu_start; + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + } +} + +void pda_init(int cpu) +{ + struct x8664_pda *pda = cpu_pda(cpu); + + /* Setup up data that may be needed in __get_free_pages early */ + asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); + /* Memory clobbers used to order PDA accessed */ + mb(); + wrmsrl(MSR_GS_BASE, pda); + mb(); + + pda->cpunumber = cpu; + pda->irqcount = -1; + pda->kernelstack = + (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; + pda->active_mm = &init_mm; + pda->mmu_state = 0; + + if (cpu == 0) { + /* others are initialized in smpboot.c */ + pda->pcurrent = &init_task; + pda->irqstackptr = boot_cpu_stack; + } else { + pda->irqstackptr = (char *) + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); + if (!pda->irqstackptr) + panic("cannot allocate irqstack for cpu %d", cpu); + } + + + pda->irqstackptr += IRQSTACKSIZE-64; +} + +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ] +__attribute__((section(".bss.page_aligned"))); + +extern asmlinkage void ignore_sysret(void); + +/* May not be marked __init: used by software suspend */ +void syscall_init(void) +{ + /* + * LSTAR and STAR live in a bit strange symbiosis. + * They both write to the same internal register. STAR allows to set CS/DS + * but only a 32bit target. LSTAR sets the 64bit rip. + */ + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); + wrmsrl(MSR_LSTAR, system_call); + wrmsrl(MSR_CSTAR, ignore_sysret); + +#ifdef CONFIG_IA32_EMULATION + syscall32_cpu_init (); +#endif + + /* Flags to clear on syscall */ + wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); +} + +void __cpuinit check_efer(void) +{ + unsigned long efer; + + rdmsrl(MSR_EFER, efer); + if (!(efer & EFER_NX) || do_not_nx) { + __supported_pte_mask &= ~_PAGE_NX; + } +} + +unsigned long kernel_eflags; + +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. + * A lot of state is already set up in PDA init. + */ +void __cpuinit cpu_init (void) +{ + int cpu = stack_smp_processor_id(); + struct tss_struct *t = &per_cpu(init_tss, cpu); + struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); + unsigned long v; + char *estacks = NULL; + struct task_struct *me; + int i; + + /* CPU 0 is initialised in head64.c */ + if (cpu != 0) { + pda_init(cpu); + } else + estacks = boot_exception_stacks; + + me = current; + + if (cpu_test_and_set(cpu, cpu_initialized)) + panic("CPU#%d already initialized!\n", cpu); + + printk("Initializing CPU#%d\n", cpu); + + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + /* + * Initialize the per-CPU GDT with the boot GDT, + * and set up the GDT descriptor: + */ + if (cpu) + memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE); + + cpu_gdt_descr[cpu].size = GDT_SIZE; + asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu])); + asm volatile("lidt %0" :: "m" (idt_descr)); + + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); + syscall_init(); + + wrmsrl(MSR_FS_BASE, 0); + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); + + check_efer(); + + /* + * set up and load the per-CPU TSS + */ + for (v = 0; v < N_EXCEPTION_STACKS; v++) { + static const unsigned int order[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER + }; + if (cpu) { + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); + if (!estacks) + panic("Cannot allocate exception stack %ld %d\n", + v, cpu); + } + estacks += PAGE_SIZE << order[v]; + orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks; + } + + t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + /* + * <= is required because the CPU will access up to + * 8 bits beyond the end of the IO permission bitmap. + */ + for (i = 0; i <= IO_BITMAP_LONGS; i++) + t->io_bitmap[i] = ~0UL; + + atomic_inc(&init_mm.mm_count); + me->active_mm = &init_mm; + if (me->mm) + BUG(); + enter_lazy_tlb(&init_mm, me); + + set_tss_desc(cpu, t); + load_TR_desc(); + load_LDT(&init_mm.context); + + /* + * Clear all 6 debug registers: + */ + + set_debugreg(0UL, 0); + set_debugreg(0UL, 1); + set_debugreg(0UL, 2); + set_debugreg(0UL, 3); + set_debugreg(0UL, 6); + set_debugreg(0UL, 7); + + fpu_init(); + + raw_local_save_flags(kernel_eflags); +} diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c new file mode 100644 index 0000000..af838f6 --- /dev/null +++ b/arch/x86/kernel/setup_64.c @@ -0,0 +1,1117 @@ +/* + * linux/arch/x86-64/kernel/setup.c + * + * Copyright (C) 1995 Linus Torvalds + * + * Nov 2001 Dave Jones + * Forked from i386 setup code. + */ + +/* + * This file handles the architecture-dependent parts of initialization + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include