summaryrefslogtreecommitdiff
path: root/arch/x86_64
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64')
-rw-r--r--arch/x86_64/Kconfig22
-rw-r--r--arch/x86_64/defconfig45
-rw-r--r--arch/x86_64/ia32/ia32_signal.c11
-rw-r--r--arch/x86_64/ia32/ia32entry.S1
-rw-r--r--arch/x86_64/kernel/Makefile2
-rw-r--r--arch/x86_64/kernel/acpi/sleep.c2
-rw-r--r--arch/x86_64/kernel/e820.c38
-rw-r--r--arch/x86_64/kernel/head.S20
-rw-r--r--arch/x86_64/kernel/io_apic.c24
-rw-r--r--arch/x86_64/kernel/ioport.c2
-rw-r--r--arch/x86_64/kernel/irq.c12
-rw-r--r--arch/x86_64/kernel/mce.c66
-rw-r--r--arch/x86_64/kernel/mce_amd.c44
-rw-r--r--arch/x86_64/kernel/nmi.c75
-rw-r--r--arch/x86_64/kernel/pci-calgary.c17
-rw-r--r--arch/x86_64/kernel/pci-dma.c28
-rw-r--r--arch/x86_64/kernel/pci-gart.c4
-rw-r--r--arch/x86_64/kernel/ptrace.c8
-rw-r--r--arch/x86_64/kernel/setup.c169
-rw-r--r--arch/x86_64/kernel/setup64.c1
-rw-r--r--arch/x86_64/kernel/stacktrace.c5
-rw-r--r--arch/x86_64/kernel/time.c14
-rw-r--r--arch/x86_64/kernel/x8664_ksyms.c5
-rw-r--r--arch/x86_64/lib/Makefile2
-rw-r--r--arch/x86_64/lib/copy_user_nocache.S217
-rw-r--r--arch/x86_64/mm/fault.c18
-rw-r--r--arch/x86_64/mm/numa.c202
-rw-r--r--arch/x86_64/mm/pageattr.c4
-rw-r--r--arch/x86_64/pci/Makefile3
-rw-r--r--arch/x86_64/pci/mmconfig.c116
30 files changed, 761 insertions, 416 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 02dd394..7982cbc 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -152,18 +152,18 @@ config MPSC
Optimize for Intel Pentium 4 and older Nocona/Dempsey Xeon CPUs
with Intel Extended Memory 64 Technology(EM64T). For details see
<http://www.intel.com/technology/64bitextensions/>.
- Note the the latest Xeons (Xeon 51xx and 53xx) are not based on the
- Netburst core and shouldn't use this option. You can distingush them
+ Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the
+ Netburst core and shouldn't use this option. You can distinguish them
using the cpu family field
- in /proc/cpuinfo. Family 15 is a older Xeon, Family 6 a newer one
- (this rule only applies to system that support EM64T)
+ in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one
+ (this rule only applies to systems that support EM64T)
config MCORE2
bool "Intel Core2 / newer Xeon"
help
Optimize for Intel Core2 and newer Xeons (51xx)
- You can distingush the newer Xeons from the older ones using
- the cpu family field in /proc/cpuinfo. 15 is a older Xeon
+ You can distinguish the newer Xeons from the older ones using
+ the cpu family field in /proc/cpuinfo. 15 is an older Xeon
(use CONFIG_MPSC then), 6 is a newer one. This rule only
applies to CPUs that support EM64T.
@@ -458,8 +458,8 @@ config IOMMU
on systems with more than 3GB. This is usually needed for USB,
sound, many IDE/SATA chipsets and some other devices.
Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART
- based IOMMU and a software bounce buffer based IOMMU used on Intel
- systems and as fallback.
+ based hardware IOMMU and a software bounce buffer based IOMMU used
+ on Intel systems and as fallback.
The code is only active when needed (enough memory and limited
device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified
too.
@@ -496,6 +496,12 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
# need this always selected by IOMMU for the VIA workaround
config SWIOTLB
bool
+ help
+ Support for software bounce buffers used on x86-64 systems
+ which don't have a hardware IOMMU (e.g. the current generation
+ of Intel's x86-64 CPUs). With this option, PCI devices which can
+ only access 32 bits of memory can be used on systems with more
+ than 3 GB of memory. If unsure, say Y.
config X86_MCE
bool "Machine check support" if EMBEDDED
diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
index 69584c2..293a4a4 100644
--- a/arch/x86_64/defconfig
+++ b/arch/x86_64/defconfig
@@ -1,7 +1,7 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.20-rc3
-# Fri Jan 5 11:54:41 2007
+# Linux kernel version: 2.6.20-git8
+# Tue Feb 13 11:25:16 2007
#
CONFIG_X86_64=y
CONFIG_64BIT=y
@@ -11,6 +11,7 @@ CONFIG_LOCKDEP_SUPPORT=y
CONFIG_STACKTRACE_SUPPORT=y
CONFIG_SEMAPHORE_SLEEPERS=y
CONFIG_MMU=y
+CONFIG_ZONE_DMA=y
CONFIG_RWSEM_GENERIC_SPINLOCK=y
CONFIG_GENERIC_HWEIGHT=y
CONFIG_GENERIC_CALIBRATE_DELAY=y
@@ -153,6 +154,7 @@ CONFIG_NEED_MULTIPLE_NODES=y
CONFIG_SPLIT_PTLOCK_CPUS=4
CONFIG_MIGRATION=y
CONFIG_RESOURCES_64BIT=y
+CONFIG_ZONE_DMA_FLAG=1
CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y
CONFIG_NR_CPUS=32
@@ -201,13 +203,14 @@ CONFIG_ACPI=y
CONFIG_ACPI_SLEEP=y
CONFIG_ACPI_SLEEP_PROC_FS=y
CONFIG_ACPI_SLEEP_PROC_SLEEP=y
+CONFIG_ACPI_PROCFS=y
CONFIG_ACPI_AC=y
CONFIG_ACPI_BATTERY=y
CONFIG_ACPI_BUTTON=y
-# CONFIG_ACPI_VIDEO is not set
# CONFIG_ACPI_HOTKEY is not set
CONFIG_ACPI_FAN=y
# CONFIG_ACPI_DOCK is not set
+# CONFIG_ACPI_BAY is not set
CONFIG_ACPI_PROCESSOR=y
CONFIG_ACPI_HOTPLUG_CPU=y
CONFIG_ACPI_THERMAL=y
@@ -263,7 +266,6 @@ CONFIG_PCI_MMCONFIG=y
CONFIG_PCIEPORTBUS=y
CONFIG_PCIEAER=y
CONFIG_PCI_MSI=y
-# CONFIG_PCI_MULTITHREAD_PROBE is not set
# CONFIG_PCI_DEBUG is not set
# CONFIG_HT_IRQ is not set
@@ -398,6 +400,7 @@ CONFIG_STANDALONE=y
CONFIG_PREVENT_FIRMWARE_BUILD=y
CONFIG_FW_LOADER=y
# CONFIG_DEBUG_DRIVER is not set
+# CONFIG_DEBUG_DEVRES is not set
# CONFIG_SYS_HYPERVISOR is not set
#
@@ -466,6 +469,7 @@ CONFIG_BLK_DEV_IDECD=y
# CONFIG_BLK_DEV_IDETAPE is not set
# CONFIG_BLK_DEV_IDEFLOPPY is not set
# CONFIG_BLK_DEV_IDESCSI is not set
+CONFIG_BLK_DEV_IDEACPI=y
# CONFIG_IDE_TASK_IOCTL is not set
#
@@ -497,6 +501,7 @@ CONFIG_BLK_DEV_ATIIXP=y
# CONFIG_BLK_DEV_JMICRON is not set
# CONFIG_BLK_DEV_SC1200 is not set
CONFIG_BLK_DEV_PIIX=y
+# CONFIG_BLK_DEV_IT8213 is not set
# CONFIG_BLK_DEV_IT821X is not set
# CONFIG_BLK_DEV_NS87415 is not set
# CONFIG_BLK_DEV_PDC202XX_OLD is not set
@@ -507,6 +512,7 @@ CONFIG_BLK_DEV_PDC202XX_NEW=y
# CONFIG_BLK_DEV_SLC90E66 is not set
# CONFIG_BLK_DEV_TRM290 is not set
# CONFIG_BLK_DEV_VIA82CXXX is not set
+# CONFIG_BLK_DEV_TC86C001 is not set
# CONFIG_IDE_ARM is not set
CONFIG_BLK_DEV_IDEDMA=y
# CONFIG_IDEDMA_IVB is not set
@@ -599,6 +605,7 @@ CONFIG_MEGARAID_SAS=y
# Serial ATA (prod) and Parallel ATA (experimental) drivers
#
CONFIG_ATA=y
+# CONFIG_ATA_NONSTANDARD is not set
CONFIG_SATA_AHCI=y
CONFIG_SATA_SVW=y
CONFIG_ATA_PIIX=y
@@ -614,6 +621,7 @@ CONFIG_SATA_SIL=y
# CONFIG_SATA_ULI is not set
CONFIG_SATA_VIA=y
# CONFIG_SATA_VITESSE is not set
+# CONFIG_SATA_INIC162X is not set
CONFIG_SATA_INTEL_COMBINED=y
# CONFIG_PATA_ALI is not set
# CONFIG_PATA_AMD is not set
@@ -630,6 +638,7 @@ CONFIG_SATA_INTEL_COMBINED=y
# CONFIG_PATA_HPT3X2N is not set
# CONFIG_PATA_HPT3X3 is not set
# CONFIG_PATA_IT821X is not set
+# CONFIG_PATA_IT8213 is not set
# CONFIG_PATA_JMICRON is not set
# CONFIG_PATA_TRIFLEX is not set
# CONFIG_PATA_MARVELL is not set
@@ -682,9 +691,7 @@ CONFIG_IEEE1394=y
# Subsystem Options
#
# CONFIG_IEEE1394_VERBOSEDEBUG is not set
-# CONFIG_IEEE1394_OUI_DB is not set
# CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set
-# CONFIG_IEEE1394_EXPORT_FULL_API is not set
#
# Device Drivers
@@ -707,6 +714,11 @@ CONFIG_IEEE1394_RAWIO=y
# CONFIG_I2O is not set
#
+# Macintosh device drivers
+#
+# CONFIG_MAC_EMUMOUSEBTN is not set
+
+#
# Network device support
#
CONFIG_NETDEVICES=y
@@ -774,6 +786,7 @@ CONFIG_8139TOO=y
# CONFIG_EPIC100 is not set
# CONFIG_SUNDANCE is not set
# CONFIG_VIA_RHINE is not set
+# CONFIG_SC92031 is not set
#
# Ethernet (1000 Mbit)
@@ -795,11 +808,13 @@ CONFIG_E1000=y
CONFIG_TIGON3=y
CONFIG_BNX2=y
# CONFIG_QLA3XXX is not set
+# CONFIG_ATL1 is not set
#
# Ethernet (10000 Mbit)
#
# CONFIG_CHELSIO_T1 is not set
+# CONFIG_CHELSIO_T3 is not set
# CONFIG_IXGB is not set
CONFIG_S2IO=m
# CONFIG_S2IO_NAPI is not set
@@ -1115,6 +1130,7 @@ CONFIG_SOUND=y
# Open Sound System
#
CONFIG_SOUND_PRIME=y
+CONFIG_OBSOLETE_OSS=y
# CONFIG_SOUND_BT878 is not set
# CONFIG_SOUND_ES1371 is not set
CONFIG_SOUND_ICH=y
@@ -1128,6 +1144,7 @@ CONFIG_SOUND_ICH=y
# HID Devices
#
CONFIG_HID=y
+# CONFIG_HID_DEBUG is not set
#
# USB support
@@ -1142,10 +1159,8 @@ CONFIG_USB=y
# Miscellaneous USB options
#
CONFIG_USB_DEVICEFS=y
-# CONFIG_USB_BANDWIDTH is not set
# CONFIG_USB_DYNAMIC_MINORS is not set
# CONFIG_USB_SUSPEND is not set
-# CONFIG_USB_MULTITHREAD_PROBE is not set
# CONFIG_USB_OTG is not set
#
@@ -1155,9 +1170,11 @@ CONFIG_USB_EHCI_HCD=y
# CONFIG_USB_EHCI_SPLIT_ISO is not set
# CONFIG_USB_EHCI_ROOT_HUB_TT is not set
# CONFIG_USB_EHCI_TT_NEWSCHED is not set
+# CONFIG_USB_EHCI_BIG_ENDIAN_MMIO is not set
# CONFIG_USB_ISP116X_HCD is not set
CONFIG_USB_OHCI_HCD=y
-# CONFIG_USB_OHCI_BIG_ENDIAN is not set
+# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set
+# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set
CONFIG_USB_OHCI_LITTLE_ENDIAN=y
CONFIG_USB_UHCI_HCD=y
# CONFIG_USB_SL811_HCD is not set
@@ -1208,6 +1225,7 @@ CONFIG_USB_HID=y
# CONFIG_USB_ATI_REMOTE2 is not set
# CONFIG_USB_KEYSPAN_REMOTE is not set
# CONFIG_USB_APPLETOUCH is not set
+# CONFIG_USB_GTCO is not set
#
# USB Imaging devices
@@ -1313,6 +1331,10 @@ CONFIG_USB_MON=y
#
#
+# Auxiliary Display support
+#
+
+#
# Virtualization
#
# CONFIG_KVM is not set
@@ -1512,6 +1534,7 @@ CONFIG_UNUSED_SYMBOLS=y
CONFIG_DEBUG_FS=y
# CONFIG_HEADERS_CHECK is not set
CONFIG_DEBUG_KERNEL=y
+# CONFIG_DEBUG_SHIRQ is not set
CONFIG_LOG_BUF_SHIFT=18
CONFIG_DETECT_SOFTLOCKUP=y
# CONFIG_SCHEDSTATS is not set
@@ -1520,7 +1543,6 @@ CONFIG_DETECT_SOFTLOCKUP=y
# CONFIG_RT_MUTEX_TESTER is not set
# CONFIG_DEBUG_SPINLOCK is not set
# CONFIG_DEBUG_MUTEXES is not set
-# CONFIG_DEBUG_RWSEMS is not set
# CONFIG_DEBUG_LOCK_ALLOC is not set
# CONFIG_PROVE_LOCKING is not set
# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
@@ -1560,4 +1582,5 @@ CONFIG_CRC32=y
# CONFIG_LIBCRC32C is not set
CONFIG_ZLIB_INFLATE=y
CONFIG_PLIST=y
-CONFIG_IOMAP_COPY=y
+CONFIG_HAS_IOMEM=y
+CONFIG_HAS_IOPORT=y
diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
index ff499ef..359eacc 100644
--- a/arch/x86_64/ia32/ia32_signal.c
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -21,6 +21,7 @@
#include <linux/stddef.h>
#include <linux/personality.h>
#include <linux/compat.h>
+#include <linux/binfmts.h>
#include <asm/ucontext.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
@@ -449,7 +450,11 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
/* Return stub is in 32bit vsyscall page */
{
- void __user *restorer = VSYSCALL32_SIGRETURN;
+ void __user *restorer;
+ if (current->binfmt->hasvdso)
+ restorer = VSYSCALL32_SIGRETURN;
+ else
+ restorer = (void *)&frame->retcode;
if (ka->sa.sa_flags & SA_RESTORER)
restorer = ka->sa.sa_restorer;
err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
@@ -495,7 +500,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
ptrace_notify(SIGTRAP);
#if DEBUG_SIG
- printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
+ printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
current->comm, current->pid, frame, regs->rip, frame->pretcode);
#endif
@@ -601,7 +606,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
ptrace_notify(SIGTRAP);
#if DEBUG_SIG
- printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
+ printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
current->comm, current->pid, frame, regs->rip, frame->pretcode);
#endif
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5f32cf4..eda7a0d 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -718,4 +718,5 @@ ia32_sys_call_table:
.quad compat_sys_vmsplice
.quad compat_sys_move_pages
.quad sys_getcpu
+ .quad sys_epoll_pwait
ia32_syscall_end:
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 3c7cbff..ae39945 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_PCI) += early-quirks.o
obj-y += topology.o
obj-y += intel_cacheinfo.o
+obj-y += pcspeaker.o
CFLAGS_vsyscall.o := $(PROFILING) -g0
@@ -56,3 +57,4 @@ quirks-y += ../../i386/kernel/quirks.o
i8237-y += ../../i386/kernel/i8237.o
msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o
alternative-y += ../../i386/kernel/alternative.o
+pcspeaker-y += ../../i386/kernel/pcspeaker.o
diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c
index 5ebf62c..23178ce 100644
--- a/arch/x86_64/kernel/acpi/sleep.c
+++ b/arch/x86_64/kernel/acpi/sleep.c
@@ -58,7 +58,7 @@ unsigned long acpi_wakeup_address = 0;
unsigned long acpi_video_flags;
extern char wakeup_start, wakeup_end;
-extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
+extern unsigned long acpi_copy_wakeup_routine(unsigned long);
static pgd_t low_ptr;
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 6fe191c..4651fd2 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -83,6 +83,13 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
return 1;
}
+#ifdef CONFIG_NUMA
+ /* NUMA memory to node map */
+ if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
+ *addrp = nodemap_addr + nodemap_size;
+ return 1;
+ }
+#endif
/* XXX ramdisk image here? */
return 0;
}
@@ -184,6 +191,37 @@ unsigned long __init e820_end_of_ram(void)
}
/*
+ * Find the hole size in the range.
+ */
+unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
+{
+	unsigned long covered = 0;
+	int idx;
+
+	/* Add up the page-aligned E820_RAM that falls inside [start, end). */
+	for (idx = 0; idx < e820.nr_map; idx++) {
+		struct e820entry *ent = &e820.map[idx];
+		unsigned long lo, hi;
+
+		if (ent->type != E820_RAM)
+			continue;
+		if (ent->addr + ent->size <= start || ent->addr >= end)
+			continue;
+
+		/* Shrink the entry to whole pages, then clamp it to the range. */
+		lo = round_up(ent->addr, PAGE_SIZE);
+		if (lo < start)
+			lo = start;
+
+		hi = round_down(ent->addr + ent->size, PAGE_SIZE);
+		if (hi >= end)
+			hi = end;
+
+		if (hi > lo)
+			covered += hi - lo;
+	}
+
+	/* Whatever part of the range is not covered by RAM is hole. */
+	return (end - start) - covered;
+}
+
+/*
* Mark e820 reserved areas as busy for the resource manager.
*/
void __init e820_reserve_resources(void)
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 1e6f808..598a4d0 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -163,6 +163,20 @@ startup_64:
*/
lgdt cpu_gdt_descr
+ /* set up data segments. actually 0 would do too */
+ movl $__KERNEL_DS,%eax
+ movl %eax,%ds
+ movl %eax,%ss
+ movl %eax,%es
+
+ /*
+ * We don't really need to load %fs or %gs, but load them anyway
+ * to kill any stale realmode selectors. This allows execution
+ * under VT hardware.
+ */
+ movl %eax,%fs
+ movl %eax,%gs
+
/*
* Setup up a dummy PDA. this is just for some early bootup code
* that does in_interrupt()
@@ -173,12 +187,6 @@ startup_64:
shrq $32,%rdx
wrmsr
- /* set up data segments. actually 0 would do too */
- movl $__KERNEL_DS,%eax
- movl %eax,%ds
- movl %eax,%ss
- movl %eax,%es
-
/* esi is pointer to real mode structure with interesting info.
pass it to C */
movl %esi, %edi
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 6be6730..566e64d 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -831,7 +831,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
entry.delivery_mode = INT_DELIVERY_MODE;
entry.dest_mode = INT_DEST_MODE;
entry.mask = 0; /* enable IRQ */
- entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+ entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
entry.trigger = irq_trigger(idx);
entry.polarity = irq_polarity(idx);
@@ -839,7 +839,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
if (irq_trigger(idx)) {
entry.trigger = 1;
entry.mask = 1;
- entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+ entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
}
if (!apic && !IO_APIC_IRQ(irq))
@@ -851,7 +851,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
if (vector < 0)
return;
- entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
+ entry.dest = cpu_mask_to_apicid(mask);
entry.vector = vector;
ioapic_register_intr(irq, vector, IOAPIC_AUTO);
@@ -920,7 +920,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
*/
entry.dest_mode = INT_DEST_MODE;
entry.mask = 0; /* unmask IRQ now */
- entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+ entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
entry.delivery_mode = INT_DELIVERY_MODE;
entry.polarity = 0;
entry.trigger = 0;
@@ -1020,18 +1020,17 @@ void __apicdebuginit print_IO_APIC(void)
printk(KERN_DEBUG ".... IRQ redirection table:\n");
- printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
- " Stat Dest Deli Vect: \n");
+ printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
+ " Stat Dmod Deli Vect: \n");
for (i = 0; i <= reg_01.bits.entries; i++) {
struct IO_APIC_route_entry entry;
entry = ioapic_read_entry(apic, i);
- printk(KERN_DEBUG " %02x %03X %02X ",
+ printk(KERN_DEBUG " %02x %03X ",
i,
- entry.dest.logical.logical_dest,
- entry.dest.physical.physical_dest
+ entry.dest
);
printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
@@ -1293,8 +1292,7 @@ void disable_IO_APIC(void)
entry.dest_mode = 0; /* Physical */
entry.delivery_mode = dest_ExtINT; /* ExtInt */
entry.vector = 0;
- entry.dest.physical.physical_dest =
- GET_APIC_ID(apic_read(APIC_ID));
+ entry.dest = GET_APIC_ID(apic_read(APIC_ID));
/*
* Add it to the IO-APIC irq-routing table:
@@ -1556,7 +1554,7 @@ static inline void unlock_ExtINT_logic(void)
entry1.dest_mode = 0; /* physical delivery */
entry1.mask = 0; /* unmask IRQ now */
- entry1.dest.physical.physical_dest = hard_smp_processor_id();
+ entry1.dest = hard_smp_processor_id();
entry1.delivery_mode = dest_ExtINT;
entry1.polarity = entry0.polarity;
entry1.trigger = 0;
@@ -2131,7 +2129,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p
entry.delivery_mode = INT_DELIVERY_MODE;
entry.dest_mode = INT_DEST_MODE;
- entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
+ entry.dest = cpu_mask_to_apicid(mask);
entry.trigger = triggering;
entry.polarity = polarity;
entry.mask = 1; /* Disabled (masked) */
diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c
index fe063d3..745b1f0 100644
--- a/arch/x86_64/kernel/ioport.c
+++ b/arch/x86_64/kernel/ioport.c
@@ -114,6 +114,6 @@ asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
if (!capable(CAP_SYS_RAWIO))
return -EPERM;
}
- regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12);
+ regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12);
return 0;
}
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 0c06af6..3bc30d2 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -18,6 +18,7 @@
#include <asm/uaccess.h>
#include <asm/io_apic.h>
#include <asm/idle.h>
+#include <asm/smp.h>
atomic_t irq_err_count;
@@ -120,9 +121,14 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
if (likely(irq < NR_IRQS))
generic_handle_irq(irq);
- else if (printk_ratelimit())
- printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
- __func__, smp_processor_id(), vector);
+ else {
+ if (!disable_apic)
+ ack_APIC_irq();
+
+ if (printk_ratelimit())
+ printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
+ __func__, smp_processor_id(), vector);
+ }
irq_exit();
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index bdb54a2..8011a8e 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -19,6 +19,7 @@
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/ctype.h>
+#include <linux/kmod.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
@@ -42,6 +43,10 @@ static unsigned long console_logged;
static int notify_user;
static int rip_msr;
static int mce_bootlog = 1;
+static atomic_t mce_events;
+
+static char trigger[128];
+static char *trigger_argv[2] = { trigger, NULL };
/*
* Lockless MCE logging infrastructure.
@@ -57,6 +62,7 @@ struct mce_log mcelog = {
void mce_log(struct mce *mce)
{
unsigned next, entry;
+ atomic_inc(&mce_events);
mce->finished = 0;
wmb();
for (;;) {
@@ -161,6 +167,17 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
}
}
+/*
+ * Run the configured trigger program when mce_events differs from the
+ * count remembered by the previous run and a trigger string is set.
+ */
+static void do_mce_trigger(void)
+{
+	static atomic_t mce_logged;
+	int seen = atomic_read(&mce_events);
+
+	if (seen == atomic_read(&mce_logged) || !trigger[0])
+		return;
+
+	/* Small race window, but should be harmless. */
+	atomic_set(&mce_logged, seen);
+	call_usermodehelper(trigger, trigger_argv, NULL, -1);
+}
+
/*
* The actual machine check handler
*/
@@ -234,8 +251,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
}
/* Never do anything final in the polling timer */
- if (!regs)
+ if (!regs) {
+ /* Normal interrupt context here. Call trigger for any new
+ events. */
+ do_mce_trigger();
goto out;
+ }
/* If we didn't find an uncorrectable error, pick
the last one (shouldn't happen, just being safe). */
@@ -606,17 +627,42 @@ DEFINE_PER_CPU(struct sys_device, device_mce);
} \
static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
+/* TBD should generate these dynamically based on number of available banks */
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(bank5ctl,bank[5],mce_restart())
-static struct sysdev_attribute * bank_attributes[NR_BANKS] = {
- &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
- &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl};
+
+/*
+ * sysfs show handler for "trigger": places the current trigger string
+ * followed by a newline into buf and returns the number of characters
+ * written, not counting the terminating NUL.
+ */
+static ssize_t show_trigger(struct sys_device *s, char *buf)
+{
+	size_t n = strlen(trigger);
+
+	strcpy(buf, trigger);
+	buf[n] = '\n';
+	buf[n + 1] = '\0';
+	return n + 1;
+}
+
+/*
+ * sysfs store handler for "trigger": copies the written string into
+ * trigger[], always NUL terminated and truncated to sizeof(trigger) - 1
+ * characters, and cuts it at the first newline if there is one.
+ * Returns the length of the stored string as measured before the
+ * newline is removed.
+ */
+static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
+{
+	char *p;
+	int len;
+	strncpy(trigger, buf, sizeof(trigger));
+	trigger[sizeof(trigger)-1] = 0;
+	len = strlen(trigger);
+	p = strchr(trigger, '\n');
+	/* strchr() returns NULL when there is no newline; never dereference it */
+	if (p)
+		*p = 0;
+	return len;
+}
+
+static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())
+/*
+ * All per-cpu sysdev attributes of the machine check device: the six bank
+ * control files, tolerant, check_interval and trigger. The list is NULL
+ * terminated; mce_create_device() and mce_remove_device() walk it until
+ * they reach the NULL entry.
+ */
+static struct sysdev_attribute *mce_attributes[] = {
+ &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
+ &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
+ &attr_tolerant, &attr_check_interval, &attr_trigger,
+ NULL
+};
/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
@@ -632,11 +678,9 @@ static __cpuinit int mce_create_device(unsigned int cpu)
err = sysdev_register(&per_cpu(device_mce,cpu));
if (!err) {
- for (i = 0; i < banks; i++)
+ for (i = 0; mce_attributes[i]; i++)
sysdev_create_file(&per_cpu(device_mce,cpu),
- bank_attributes[i]);
- sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
- sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
+ mce_attributes[i]);
}
return err;
}
@@ -645,11 +689,9 @@ static void mce_remove_device(unsigned int cpu)
{
int i;
- for (i = 0; i < banks; i++)
+ for (i = 0; mce_attributes[i]; i++)
sysdev_remove_file(&per_cpu(device_mce,cpu),
- bank_attributes[i]);
- sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
- sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
+ mce_attributes[i]);
sysdev_unregister(&per_cpu(device_mce,cpu));
memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
}
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
index 93c7072..d0bd5d6 100644
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -37,6 +37,8 @@
#define THRESHOLD_MAX 0xFFF
#define INT_TYPE_APIC 0x00020000
#define MASK_VALID_HI 0x80000000
+#define MASK_CNTP_HI 0x40000000
+#define MASK_LOCKED_HI 0x20000000
#define MASK_LVTOFF_HI 0x00F00000
#define MASK_COUNT_EN_HI 0x00080000
#define MASK_INT_TYPE_HI 0x00060000
@@ -122,14 +124,17 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
for (block = 0; block < NR_BLOCKS; ++block) {
if (block == 0)
address = MSR_IA32_MC0_MISC + bank * 4;
- else if (block == 1)
- address = MCG_XBLK_ADDR
- + ((low & MASK_BLKPTR_LO) >> 21);
+ else if (block == 1) {
+ address = (low & MASK_BLKPTR_LO) >> 21;
+ if (!address)
+ break;
+ address += MCG_XBLK_ADDR;
+ }
else
++address;
if (rdmsr_safe(address, &low, &high))
- continue;
+ break;
if (!(high & MASK_VALID_HI)) {
if (block)
@@ -138,8 +143,8 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
break;
}
- if (!(high & MASK_VALID_HI >> 1) ||
- (high & MASK_VALID_HI >> 2))
+ if (!(high & MASK_CNTP_HI) ||
+ (high & MASK_LOCKED_HI))
continue;
if (!block)
@@ -187,17 +192,22 @@ asmlinkage void mce_threshold_interrupt(void)
/* assume first bank caused it */
for (bank = 0; bank < NR_BANKS; ++bank) {
+ if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
+ continue;
for (block = 0; block < NR_BLOCKS; ++block) {
if (block == 0)
address = MSR_IA32_MC0_MISC + bank * 4;
- else if (block == 1)
- address = MCG_XBLK_ADDR
- + ((low & MASK_BLKPTR_LO) >> 21);
+ else if (block == 1) {
+ address = (low & MASK_BLKPTR_LO) >> 21;
+ if (!address)
+ break;
+ address += MCG_XBLK_ADDR;
+ }
else
++address;
if (rdmsr_safe(address, &low, &high))
- continue;
+ break;
if (!(high & MASK_VALID_HI)) {
if (block)
@@ -206,10 +216,14 @@ asmlinkage void mce_threshold_interrupt(void)
break;
}
- if (!(high & MASK_VALID_HI >> 1) ||
- (high & MASK_VALID_HI >> 2))
+ if (!(high & MASK_CNTP_HI) ||
+ (high & MASK_LOCKED_HI))
continue;
+ /* Log the machine check that caused the threshold
+ event. */
+ do_machine_check(NULL, 0);
+
if (high & MASK_OVERFLOW_HI) {
rdmsrl(address, m.misc);
rdmsrl(MSR_IA32_MC0_STATUS + bank * 4,
@@ -385,7 +399,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
return 0;
if (rdmsr_safe(address, &low, &high))
- goto recurse;
+ return 0;
if (!(high & MASK_VALID_HI)) {
if (block)
@@ -394,8 +408,8 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
return 0;
}
- if (!(high & MASK_VALID_HI >> 1) ||
- (high & MASK_VALID_HI >> 2))
+ if (!(high & MASK_CNTP_HI) ||
+ (high & MASK_LOCKED_HI))
goto recurse;
b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 9cb42ec..486f4c6 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -172,7 +172,7 @@ static __cpuinit inline int nmi_known_cpu(void)
{
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
- return boot_cpu_data.x86 == 15;
+ return boot_cpu_data.x86 == 15 || boot_cpu_data.x86 == 16;
case X86_VENDOR_INTEL:
if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
return 1;
@@ -214,6 +214,23 @@ static __init void nmi_cpu_busy(void *data)
}
#endif
+/*
+ * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
+ * are writable, with higher bits sign extending from bit 31.
+ * So, we can only program the counter with 31 bit values and
+ * 32nd bit should be 1, for 33.. to be 1.
+ *
+ * Returns hz unchanged when the per-period cycle count already fits in
+ * 31 bits, otherwise the nmi_hz value computed from the 31 bit limit.
+ */
+static unsigned int adjust_for_32bit_ctr(unsigned int hz)
+{
+	u64 cycles_per_sec = (u64)cpu_khz * 1000;
+
+	if (cycles_per_sec / hz <= 0x7fffffffULL)
+		return hz;
+
+	return cycles_per_sec / 0x7fffffffUL + 1;
+}
+
int __init check_nmi_watchdog (void)
{
int *counts;
@@ -268,17 +285,8 @@ int __init check_nmi_watchdog (void)
struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
nmi_hz = 1;
- /*
- * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
- * are writable, with higher bits sign extending from bit 31.
- * So, we can only program the counter with 31 bit values and
- * 32nd bit should be 1, for 33.. to be 1.
- * Find the appropriate nmi_hz
- */
- if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
- ((u64)cpu_khz * 1000) > 0x7fffffffULL) {
- nmi_hz = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1;
- }
+ if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0)
+ nmi_hz = adjust_for_32bit_ctr(nmi_hz);
}
kfree(counts);
@@ -360,6 +368,33 @@ void enable_timer_nmi_watchdog(void)
}
}
+/*
+ * Callback handed to on_each_cpu() by acpi_nmi_disable(): writes
+ * APIC_DM_NMI | APIC_LVT_MASKED to APIC_LVT0. The argument is unused.
+ */
+static void __acpi_nmi_disable(void *__unused)
+{
+ apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+}
+
+/*
+ * Disable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_disable(void)
+{
+	/* Nothing to do unless the watchdog is active and in NMI_IO_APIC mode. */
+	if (!atomic_read(&nmi_active) || nmi_watchdog != NMI_IO_APIC)
+		return;
+
+	on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
+}
+
+/*
+ * Callback handed to on_each_cpu() by acpi_nmi_enable(): writes
+ * APIC_DM_NMI (without APIC_LVT_MASKED) to APIC_LVT0. The argument is
+ * unused.
+ */
+static void __acpi_nmi_enable(void *__unused)
+{
+ apic_write(APIC_LVT0, APIC_DM_NMI);
+}
+
+/*
+ * Enable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_enable(void)
+{
+	/* Nothing to do unless the watchdog is active and in NMI_IO_APIC mode. */
+	if (!atomic_read(&nmi_active) || nmi_watchdog != NMI_IO_APIC)
+		return;
+
+	on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
+}
#ifdef CONFIG_PM
static int nmi_pm_active; /* nmi_active before suspend */
@@ -634,7 +669,9 @@ static int setup_intel_arch_watchdog(void)
/* setup the timer */
wrmsr(evntsel_msr, evntsel, 0);
- wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
+
+ nmi_hz = adjust_for_32bit_ctr(nmi_hz);
+ wrmsr(perfctr_msr, (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0);
apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
@@ -855,15 +892,23 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
dummy &= ~P4_CCCR_OVF;
wrmsrl(wd->cccr_msr, dummy);
apic_write(APIC_LVTPC, APIC_DM_NMI);
+ /* start the cycle over again */
+ wrmsrl(wd->perfctr_msr,
+ -((u64)cpu_khz * 1000 / nmi_hz));
} else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
/*
* ArchPerfom/Core Duo needs to re-unmask
* the apic vector
*/
apic_write(APIC_LVTPC, APIC_DM_NMI);
+ /* ARCH_PERFMON has 32 bit counter writes */
+ wrmsr(wd->perfctr_msr,
+ (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0);
+ } else {
+ /* start the cycle over again */
+ wrmsrl(wd->perfctr_msr,
+ -((u64)cpu_khz * 1000 / nmi_hz));
}
- /* start the cycle over again */
- wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
rc = 1;
} else if (nmi_watchdog == NMI_IO_APIC) {
/* don't know how to accurately check for this.
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 3d65b1d..04480c3 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -138,6 +138,8 @@ static const unsigned long phb_debug_offsets[] = {
#define PHB_DEBUG_STUFF_OFFSET 0x0020
+#define EMERGENCY_PAGES 32 /* = 128KB */
+
unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
static int translate_empty_slots __read_mostly = 0;
static int calgary_detected __read_mostly = 0;
@@ -296,6 +298,16 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
{
unsigned long entry;
unsigned long badbit;
+ unsigned long badend;
+
+ /* were we called with bad_dma_address? */
+ badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
+ if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
+ printk(KERN_ERR "Calgary: driver tried unmapping bad DMA "
+ "address 0x%Lx\n", dma_addr);
+ WARN_ON(1);
+ return;
+ }
entry = dma_addr >> PAGE_SHIFT;
@@ -656,8 +668,8 @@ static void __init calgary_reserve_regions(struct pci_dev *dev)
u64 start;
struct iommu_table *tbl = dev->sysdata;
- /* reserve bad_dma_address in case it's a legal address */
- iommu_range_reserve(tbl, bad_dma_address, 1);
+ /* reserve EMERGENCY_PAGES from bad_dma_address and up */
+ iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES);
/* avoid the BIOS/VGA first 640KB-1MB region */
start = (640 * 1024);
@@ -1176,6 +1188,7 @@ int __init calgary_iommu_init(void)
}
force_iommu = 1;
+ bad_dma_address = 0x0;
dma_ops = &calgary_dma_ops;
return 0;
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
index 683b7a5..651ccfb 100644
--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -223,30 +223,10 @@ int dma_set_mask(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_set_mask);
-/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge]
- [,forcesac][,fullflush][,nomerge][,biomerge]
- size set size of iommu (in bytes)
- noagp don't initialize the AGP driver and use full aperture.
- off don't use the IOMMU
- leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on)
- memaper[=order] allocate an own aperture over RAM with size 32MB^order.
- noforce don't force IOMMU usage. Default.
- force Force IOMMU.
- merge Do lazy merging. This may improve performance on some block devices.
- Implies force (experimental)
- biomerge Do merging at the BIO layer. This is more efficient than merge,
- but should be only done with very big IOMMUs. Implies merge,force.
- nomerge Don't do SG merging.
- forcesac For SAC mode for masks <40bits (experimental)
- fullflush Flush IOMMU on each allocation (default)
- nofullflush Don't use IOMMU fullflush
- allowed overwrite iommu off workarounds for specific chipsets.
- soft Use software bounce buffering (default for Intel machines)
- noaperture Don't touch the aperture for AGP.
- allowdac Allow DMA >4GB
- nodac Forbid DMA >4GB
- panic Force panic when IOMMU overflows
-*/
+/*
+ * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
+ * documentation.
+ */
__init int iommu_setup(char *p)
{
iommu_merge = 1;
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index fc1960f..030eb37 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -185,7 +185,7 @@ static void iommu_full(struct device *dev, size_t size, int dir)
static inline int need_iommu(struct device *dev, unsigned long addr, size_t size)
{
u64 mask = *dev->dma_mask;
- int high = addr + size >= mask;
+ int high = addr + size > mask;
int mmu = high;
if (force_iommu)
mmu = 1;
@@ -195,7 +195,7 @@ static inline int need_iommu(struct device *dev, unsigned long addr, size_t size
static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
{
u64 mask = *dev->dma_mask;
- int high = addr + size >= mask;
+ int high = addr + size > mask;
int mmu = high;
return mmu;
}
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index addc14a..4326a69 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -536,8 +536,12 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
}
ret = 0;
for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
- ret |= __get_user(tmp, (unsigned long __user *) data);
- putreg(child, ui, tmp);
+ ret = __get_user(tmp, (unsigned long __user *) data);
+ if (ret)
+ break;
+ ret = putreg(child, ui, tmp);
+ if (ret)
+ break;
data += sizeof(long);
}
break;
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 6047724..3d98b69 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -138,128 +138,6 @@ struct resource code_resource = {
.flags = IORESOURCE_RAM,
};
-#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM)
-
-static struct resource system_rom_resource = {
- .name = "System ROM",
- .start = 0xf0000,
- .end = 0xfffff,
- .flags = IORESOURCE_ROM,
-};
-
-static struct resource extension_rom_resource = {
- .name = "Extension ROM",
- .start = 0xe0000,
- .end = 0xeffff,
- .flags = IORESOURCE_ROM,
-};
-
-static struct resource adapter_rom_resources[] = {
- { .name = "Adapter ROM", .start = 0xc8000, .end = 0,
- .flags = IORESOURCE_ROM },
- { .name = "Adapter ROM", .start = 0, .end = 0,
- .flags = IORESOURCE_ROM },
- { .name = "Adapter ROM", .start = 0, .end = 0,
- .flags = IORESOURCE_ROM },
- { .name = "Adapter ROM", .start = 0, .end = 0,
- .flags = IORESOURCE_ROM },
- { .name = "Adapter ROM", .start = 0, .end = 0,
- .flags = IORESOURCE_ROM },
- { .name = "Adapter ROM", .start = 0, .end = 0,
- .flags = IORESOURCE_ROM }
-};
-
-static struct resource video_rom_resource = {
- .name = "Video ROM",
- .start = 0xc0000,
- .end = 0xc7fff,
- .flags = IORESOURCE_ROM,
-};
-
-static struct resource video_ram_resource = {
- .name = "Video RAM area",
- .start = 0xa0000,
- .end = 0xbffff,
- .flags = IORESOURCE_RAM,
-};
-
-#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
-
-static int __init romchecksum(unsigned char *rom, unsigned long length)
-{
- unsigned char *p, sum = 0;
-
- for (p = rom; p < rom + length; p++)
- sum += *p;
- return sum == 0;
-}
-
-static void __init probe_roms(void)
-{
- unsigned long start, length, upper;
- unsigned char *rom;
- int i;
-
- /* video rom */
- upper = adapter_rom_resources[0].start;
- for (start = video_rom_resource.start; start < upper; start += 2048) {
- rom = isa_bus_to_virt(start);
- if (!romsignature(rom))
- continue;
-
- video_rom_resource.start = start;
-
- /* 0 < length <= 0x7f * 512, historically */
- length = rom[2] * 512;
-
- /* if checksum okay, trust length byte */
- if (length && romchecksum(rom, length))
- video_rom_resource.end = start + length - 1;
-
- request_resource(&iomem_resource, &video_rom_resource);
- break;
- }
-
- start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
- if (start < upper)
- start = upper;
-
- /* system rom */
- request_resource(&iomem_resource, &system_rom_resource);
- upper = system_rom_resource.start;
-
- /* check for extension rom (ignore length byte!) */
- rom = isa_bus_to_virt(extension_rom_resource.start);
- if (romsignature(rom)) {
- length = extension_rom_resource.end - extension_rom_resource.start + 1;
- if (romchecksum(rom, length)) {
- request_resource(&iomem_resource, &extension_rom_resource);
- upper = extension_rom_resource.start;
- }
- }
-
- /* check for adapter roms on 2k boundaries */
- for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper;
- start += 2048) {
- rom = isa_bus_to_virt(start);
- if (!romsignature(rom))
- continue;
-
- /* 0 < length <= 0x7f * 512, historically */
- length = rom[2] * 512;
-
- /* but accept any length that fits if checksum okay */
- if (!length || start + length > upper || !romchecksum(rom, length))
- continue;
-
- adapter_rom_resources[i].start = start;
- adapter_rom_resources[i].end = start + length - 1;
- request_resource(&iomem_resource, &adapter_rom_resources[i]);
-
- start = adapter_rom_resources[i++].end & ~2047UL;
- }
-}
-
#ifdef CONFIG_PROC_VMCORE
/* elfcorehdr= specifies the location of elf core header
* stored by the crashed kernel. This option will be passed
@@ -444,6 +322,11 @@ void __init setup_arch(char **cmdline_p)
/* reserve ebda region */
if (ebda_addr)
reserve_bootmem_generic(ebda_addr, ebda_size);
+#ifdef CONFIG_NUMA
+ /* reserve nodemap region */
+ if (nodemap_addr)
+ reserve_bootmem_generic(nodemap_addr, nodemap_size);
+#endif
#ifdef CONFIG_SMP
/*
@@ -519,15 +402,11 @@ void __init setup_arch(char **cmdline_p)
init_apic_mappings();
/*
- * Request address space for all standard RAM and ROM resources
- * and also for regions reported as reserved by the e820.
- */
- probe_roms();
+ * We trust e820 completely. No explicit ROM probing in memory.
+ */
e820_reserve_resources();
e820_mark_nosave_regions();
- request_resource(&iomem_resource, &video_ram_resource);
-
{
unsigned i;
/* request I/O space for devices used on all i[345]86 PCs */
@@ -1063,7 +942,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
- NULL, "fxsr_opt", NULL, "rdtscp", NULL, "lm", "3dnowext", "3dnow",
+ NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
+ "3dnowext", "3dnow",
/* Transmeta-defined */
"recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
@@ -1081,7 +961,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
/* Intel-defined (#2) */
"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
"tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
- NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt",
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
/* VIA/Cyrix/Centaur-defined */
@@ -1091,8 +971,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
/* AMD-defined (#2) */
- "lahf_lm", "cmp_legacy", "svm", NULL, "cr8_legacy", NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8_legacy",
+ "altmovcr8", "abm", "sse4a",
+ "misalignsse", "3dnowprefetch",
+ "osvw", "ibs", NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
};
@@ -1103,6 +985,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
"ttp", /* thermal trip */
"tm",
"stc",
+ "100mhzsteps",
+ "hwpstate",
+ NULL, /* tsc invariant mapped to constant_tsc */
NULL,
/* nothing */ /* constant_tsc - moved to flags */
};
@@ -1219,23 +1104,3 @@ struct seq_operations cpuinfo_op = {
.stop = c_stop,
.show = show_cpuinfo,
};
-
-#if defined(CONFIG_INPUT_PCSPKR) || defined(CONFIG_INPUT_PCSPKR_MODULE)
-#include <linux/platform_device.h>
-static __init int add_pcspkr(void)
-{
- struct platform_device *pd;
- int ret;
-
- pd = platform_device_alloc("pcspkr", -1);
- if (!pd)
- return -ENOMEM;
-
- ret = platform_device_add(pd);
- if (ret)
- platform_device_put(pd);
-
- return ret;
-}
-device_initcall(add_pcspkr);
-#endif
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 8c4b80f..6a70b55 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -37,7 +37,6 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
unsigned long __supported_pte_mask __read_mostly = ~0UL;
-EXPORT_SYMBOL(__supported_pte_mask);
static int do_not_nx __cpuinitdata = 0;
/* noexec=on|off
diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c
index 6026b31..65ac2c6 100644
--- a/arch/x86_64/kernel/stacktrace.c
+++ b/arch/x86_64/kernel/stacktrace.c
@@ -32,7 +32,7 @@ static void save_stack_address(void *data, unsigned long addr)
trace->skip--;
return;
}
- if (trace->nr_entries < trace->max_entries - 1)
+ if (trace->nr_entries < trace->max_entries)
trace->entries[trace->nr_entries++] = addr;
}
@@ -49,7 +49,8 @@ static struct stacktrace_ops save_stack_ops = {
void save_stack_trace(struct stack_trace *trace, struct task_struct *task)
{
dump_trace(task, NULL, NULL, &save_stack_ops, trace);
- trace->entries[trace->nr_entries++] = ULONG_MAX;
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
}
EXPORT_SYMBOL(save_stack_trace);
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 335cc91..3cc6886 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -657,6 +657,7 @@ core_initcall(cpufreq_tsc);
#define TICK_COUNT 100000000
#define TICK_MIN 5000
+#define MAX_READ_RETRIES 5
/*
* Some platforms take periodic SMI interrupts with 5ms duration. Make sure none
@@ -664,13 +665,17 @@ core_initcall(cpufreq_tsc);
*/
static void __init read_hpet_tsc(int *hpet, int *tsc)
{
- int tsc1, tsc2, hpet1;
+ int tsc1, tsc2, hpet1, retries = 0;
+ static int msg;
do {
tsc1 = get_cycles_sync();
hpet1 = hpet_readl(HPET_COUNTER);
tsc2 = get_cycles_sync();
- } while (tsc2 - tsc1 > TICK_MIN);
+ } while (tsc2 - tsc1 > TICK_MIN && retries++ < MAX_READ_RETRIES);
+ if (retries >= MAX_READ_RETRIES && !msg++)
+ printk(KERN_WARNING
+ "hpet.c: exceeded max retries to read HPET & TSC\n");
*hpet = hpet1;
*tsc = tsc2;
}
@@ -1221,8 +1226,9 @@ static void hpet_rtc_timer_reinit(void)
if (PIE_on)
PIE_count += lost_ints;
- printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
- hpet_rtc_int_freq);
+ if (printk_ratelimit())
+ printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
+ hpet_rtc_int_freq);
}
}
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index 6d77e47..0dffae6 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -26,6 +26,7 @@ EXPORT_SYMBOL(__put_user_4);
EXPORT_SYMBOL(__put_user_8);
EXPORT_SYMBOL(copy_user_generic);
+EXPORT_SYMBOL(__copy_user_nocache);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(copy_to_user);
EXPORT_SYMBOL(__copy_from_user_inatomic);
@@ -34,8 +35,8 @@ EXPORT_SYMBOL(copy_page);
EXPORT_SYMBOL(clear_page);
#ifdef CONFIG_SMP
-extern void FASTCALL( __write_lock_failed(rwlock_t *rw));
-extern void FASTCALL( __read_lock_failed(rwlock_t *rw));
+extern void __write_lock_failed(rwlock_t *rw);
+extern void __read_lock_failed(rwlock_t *rw);
EXPORT_SYMBOL(__write_lock_failed);
EXPORT_SYMBOL(__read_lock_failed);
#endif
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile
index b78d417..8d5f835 100644
--- a/arch/x86_64/lib/Makefile
+++ b/arch/x86_64/lib/Makefile
@@ -9,4 +9,4 @@ obj-y := io.o iomap_copy.o
lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
usercopy.o getuser.o putuser.o \
thunk.o clear_page.o copy_page.o bitstr.o bitops.o
-lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o
+lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o
diff --git a/arch/x86_64/lib/copy_user_nocache.S b/arch/x86_64/lib/copy_user_nocache.S
new file mode 100644
index 0000000..4620efb
--- /dev/null
+++ b/arch/x86_64/lib/copy_user_nocache.S
@@ -0,0 +1,217 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v2.
+ *
+ * Functions to copy from and to user space.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+#define FIX_ALIGNMENT 1
+
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeature.h>
+
+/*
+ * copy_user_nocache - Uncached memory copy with exception handling
+ * This will force destination/source out of cache for more performance.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ * rcx zero flag; when 1, zero the rest of the destination on an exception
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(__copy_user_nocache)
+ CFI_STARTPROC
+ pushq %rbx
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rbx, 0
+ pushq %rcx /* save zero flag */
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rcx, 0
+
+ xorl %eax,%eax /* zero for the exception handler */
+
+#ifdef FIX_ALIGNMENT
+ /* check for bad alignment of destination */
+ movl %edi,%ecx
+ andl $7,%ecx
+ jnz .Lbad_alignment
+.Lafter_bad_alignment:
+#endif
+
+ movq %rdx,%rcx
+
+ movl $64,%ebx
+ shrq $6,%rdx
+ decq %rdx
+ js .Lhandle_tail
+
+ .p2align 4
+.Lloop:
+.Ls1: movq (%rsi),%r11
+.Ls2: movq 1*8(%rsi),%r8
+.Ls3: movq 2*8(%rsi),%r9
+.Ls4: movq 3*8(%rsi),%r10
+.Ld1: movnti %r11,(%rdi)
+.Ld2: movnti %r8,1*8(%rdi)
+.Ld3: movnti %r9,2*8(%rdi)
+.Ld4: movnti %r10,3*8(%rdi)
+
+.Ls5: movq 4*8(%rsi),%r11
+.Ls6: movq 5*8(%rsi),%r8
+.Ls7: movq 6*8(%rsi),%r9
+.Ls8: movq 7*8(%rsi),%r10
+.Ld5: movnti %r11,4*8(%rdi)
+.Ld6: movnti %r8,5*8(%rdi)
+.Ld7: movnti %r9,6*8(%rdi)
+.Ld8: movnti %r10,7*8(%rdi)
+
+ dec %rdx
+
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+
+ jns .Lloop
+
+ .p2align 4
+.Lhandle_tail:
+ movl %ecx,%edx
+ andl $63,%ecx
+ shrl $3,%ecx
+ jz .Lhandle_7
+ movl $8,%ebx
+ .p2align 4
+.Lloop_8:
+.Ls9: movq (%rsi),%r8
+.Ld9: movnti %r8,(%rdi)
+ decl %ecx
+ leaq 8(%rdi),%rdi
+ leaq 8(%rsi),%rsi
+ jnz .Lloop_8
+
+.Lhandle_7:
+ movl %edx,%ecx
+ andl $7,%ecx
+ jz .Lende
+ .p2align 4
+.Lloop_1:
+.Ls10: movb (%rsi),%bl
+.Ld10: movb %bl,(%rdi)
+ incq %rdi
+ incq %rsi
+ decl %ecx
+ jnz .Lloop_1
+
+ CFI_REMEMBER_STATE
+.Lende:
+ popq %rcx
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE %rcx
+ popq %rbx
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE rbx
+ ret
+ CFI_RESTORE_STATE
+
+#ifdef FIX_ALIGNMENT
+ /* align destination */
+ .p2align 4
+.Lbad_alignment:
+ movl $8,%r9d
+ subl %ecx,%r9d
+ movl %r9d,%ecx
+ cmpq %r9,%rdx
+ jz .Lhandle_7
+ js .Lhandle_7
+.Lalign_1:
+.Ls11: movb (%rsi),%bl
+.Ld11: movb %bl,(%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz .Lalign_1
+ subq %r9,%rdx
+ jmp .Lafter_bad_alignment
+#endif
+
+ /* table sorted by exception address */
+ .section __ex_table,"a"
+ .align 8
+ .quad .Ls1,.Ls1e
+ .quad .Ls2,.Ls2e
+ .quad .Ls3,.Ls3e
+ .quad .Ls4,.Ls4e
+ .quad .Ld1,.Ls1e
+ .quad .Ld2,.Ls2e
+ .quad .Ld3,.Ls3e
+ .quad .Ld4,.Ls4e
+ .quad .Ls5,.Ls5e
+ .quad .Ls6,.Ls6e
+ .quad .Ls7,.Ls7e
+ .quad .Ls8,.Ls8e
+ .quad .Ld5,.Ls5e
+ .quad .Ld6,.Ls6e
+ .quad .Ld7,.Ls7e
+ .quad .Ld8,.Ls8e
+ .quad .Ls9,.Le_quad
+ .quad .Ld9,.Le_quad
+ .quad .Ls10,.Le_byte
+ .quad .Ld10,.Le_byte
+#ifdef FIX_ALIGNMENT
+ .quad .Ls11,.Lzero_rest
+ .quad .Ld11,.Lzero_rest
+#endif
+ .quad .Le5,.Le_zero
+ .previous
+
+ /* compute 64-offset for main loop. 8 bytes accuracy with error on the
+ pessimistic side. this is gross. it would be better to fix the
+ interface. */
+ /* eax: zero, ebx: 64 */
+.Ls1e: addl $8,%eax
+.Ls2e: addl $8,%eax
+.Ls3e: addl $8,%eax
+.Ls4e: addl $8,%eax
+.Ls5e: addl $8,%eax
+.Ls6e: addl $8,%eax
+.Ls7e: addl $8,%eax
+.Ls8e: addl $8,%eax
+ addq %rbx,%rdi /* +64 */
+ subq %rax,%rdi /* correct destination with computed offset */
+
+ shlq $6,%rdx /* loop counter * 64 (stride length) */
+ addq %rax,%rdx /* add offset to loopcnt */
+ andl $63,%ecx /* remaining bytes */
+ addq %rcx,%rdx /* add them */
+ jmp .Lzero_rest
+
+ /* exception on quad word loop in tail handling */
+ /* ecx: loopcnt/8, %edx: length, rdi: correct */
+.Le_quad:
+ shll $3,%ecx
+ andl $7,%edx
+ addl %ecx,%edx
+ /* edx: bytes to zero, rdi: dest, eax:zero */
+.Lzero_rest:
+ cmpl $0,(%rsp) /* zero flag set? */
+ jz .Le_zero
+ movq %rdx,%rcx
+.Le_byte:
+ xorl %eax,%eax
+.Le5: rep
+ stosb
+ /* when there is another exception while zeroing the rest just return */
+.Le_zero:
+ movq %rdx,%rax
+ jmp .Lende
+ CFI_ENDPROC
+ENDPROC(__copy_user_nocache)
+
+
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 49e8cf2..6ada723 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -56,17 +56,17 @@ int unregister_page_fault_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
-static inline int notify_page_fault(enum die_val val, const char *str,
- struct pt_regs *regs, long err, int trap, int sig)
+static inline int notify_page_fault(struct pt_regs *regs, long err)
{
struct die_args args = {
.regs = regs,
- .str = str,
+ .str = "page fault",
.err = err,
- .trapnr = trap,
- .signr = sig
+ .trapnr = 14,
+ .signr = SIGSEGV
};
- return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
+ return atomic_notifier_call_chain(&notify_page_fault_chain,
+ DIE_PAGE_FAULT, &args);
}
/* Sometimes the CPU reports invalid exceptions on prefetch.
@@ -355,8 +355,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
if (vmalloc_fault(address) >= 0)
return;
}
- if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
- SIGSEGV) == NOTIFY_STOP)
+ if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
@@ -365,8 +364,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
goto bad_area_nosemaphore;
}
- if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
- SIGSEGV) == NOTIFY_STOP)
+ if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
return;
if (likely(regs->eflags & X86_EFLAGS_IF))
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 2ee2e00..41b8fb0 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -36,6 +36,8 @@ unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
int numa_off __initdata;
+unsigned long __initdata nodemap_addr;
+unsigned long __initdata nodemap_size;
/*
@@ -52,34 +54,88 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
int res = -1;
unsigned long addr, end;
- if (shift >= 64)
- return -1;
- memset(memnodemap, 0xff, sizeof(memnodemap));
+ memset(memnodemap, 0xff, memnodemapsize);
for (i = 0; i < numnodes; i++) {
addr = nodes[i].start;
end = nodes[i].end;
if (addr >= end)
continue;
- if ((end >> shift) >= NODEMAPSIZE)
+ if ((end >> shift) >= memnodemapsize)
return 0;
do {
if (memnodemap[addr >> shift] != 0xff)
return -1;
memnodemap[addr >> shift] = i;
- addr += (1UL << shift);
+ addr += (1UL << shift);
} while (addr < end);
res = 1;
}
return res;
}
-int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
+static int __init allocate_cachealigned_memnodemap(void)
{
- int shift = 20;
+ unsigned long pad, pad_addr;
+
+ memnodemap = memnode.embedded_map;
+ if (memnodemapsize <= 48)
+ return 0;
+
+ pad = L1_CACHE_BYTES - 1;
+ pad_addr = 0x8000;
+ nodemap_size = pad + memnodemapsize;
+ nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
+ nodemap_size);
+ if (nodemap_addr == -1UL) {
+ printk(KERN_ERR
+ "NUMA: Unable to allocate Memory to Node hash map\n");
+ nodemap_addr = nodemap_size = 0;
+ return -1;
+ }
+ pad_addr = (nodemap_addr + pad) & ~pad;
+ memnodemap = phys_to_virt(pad_addr);
+
+ printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
+ nodemap_addr, nodemap_addr + nodemap_size);
+ return 0;
+}
- while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
- shift++;
+/*
+ * The LSB of all start and end addresses in the node map is the value of the
+ * maximum possible shift.
+ */
+static int __init
+extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
+{
+ int i, nodes_used = 0;
+ unsigned long start, end;
+ unsigned long bitfield = 0, memtop = 0;
+
+ for (i = 0; i < numnodes; i++) {
+ start = nodes[i].start;
+ end = nodes[i].end;
+ if (start >= end)
+ continue;
+ bitfield |= start;
+ nodes_used++;
+ if (end > memtop)
+ memtop = end;
+ }
+ if (nodes_used <= 1)
+ i = 63;
+ else
+ i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
+ memnodemapsize = (memtop >> i)+1;
+ return i;
+}
+
+int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
+{
+ int shift;
+ shift = extract_lsb_from_nodes(nodes, numnodes);
+ if (allocate_cachealigned_memnodemap())
+ return -1;
printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
shift);
@@ -216,31 +272,113 @@ void __init numa_init_array(void)
}
#ifdef CONFIG_NUMA_EMU
+/* Numa emulation */
int numa_fake __initdata = 0;
-/* Numa emulation */
+/*
+ * This function is used to find out if the start and end correspond to
+ * different zones.
+ */
+int zone_cross_over(unsigned long start, unsigned long end)
+{
+ if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) &&
+ (end >= (MAX_DMA32_PFN << PAGE_SHIFT)))
+ return 1;
+ return 0;
+}
+
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
- int i;
+ int i, big;
struct bootnode nodes[MAX_NUMNODES];
- unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
+ unsigned long sz, old_sz;
+ unsigned long hole_size;
+ unsigned long start, end;
+ unsigned long max_addr = (end_pfn << PAGE_SHIFT);
+
+ start = (start_pfn << PAGE_SHIFT);
+ hole_size = e820_hole_size(start, max_addr);
+ sz = (max_addr - start - hole_size) / numa_fake;
/* Kludge needed for the hash function */
- if (hweight64(sz) > 1) {
- unsigned long x = 1;
- while ((x << 1) < sz)
- x <<= 1;
- if (x < sz/2)
- printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
- sz = x;
- }
+ old_sz = sz;
+ /*
+ * Round down to the nearest FAKE_NODE_MIN_SIZE.
+ */
+ sz &= FAKE_NODE_MIN_HASH_MASK;
+
+ /*
+ * We ensure that each node is at least 64MB big. Smaller than this
+ * size can cause VM hiccups.
+ */
+ if (sz == 0) {
+ printk(KERN_INFO "Not enough memory for %d nodes. Reducing "
+ "the number of nodes\n", numa_fake);
+ numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE;
+ printk(KERN_INFO "Number of fake nodes will be = %d\n",
+ numa_fake);
+ sz = FAKE_NODE_MIN_SIZE;
+ }
+ /*
+ * Find out how many nodes can get an extra NODE_MIN_SIZE granule.
+ * This logic ensures the extra memory gets distributed among as many
+ * nodes as possible (as compared to one single node getting all that
+ * extra memory).
+ */
+ big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE;
+ printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: "
+ "%d\n",
+ (sz >> 20), (hole_size >> 20), big);
memset(&nodes,0,sizeof(nodes));
+ end = start;
for (i = 0; i < numa_fake; i++) {
- nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
+ /*
+ * In case we are not able to allocate enough memory for all
+ * the nodes, we reduce the number of fake nodes.
+ */
+ if (end >= max_addr) {
+ numa_fake = i - 1;
+ break;
+ }
+ start = nodes[i].start = end;
+ /*
+ * Final node can have all the remaining memory.
+ */
if (i == numa_fake-1)
- sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
- nodes[i].end = nodes[i].start + sz;
+ sz = max_addr - start;
+ end = nodes[i].start + sz;
+ /*
+ * The first "big" nodes each get an extra granule.
+ */
+ if (i < big)
+ end += FAKE_NODE_MIN_SIZE;
+ /*
+ * Iterate over the range to ensure that this node gets at
+ * least sz amount of RAM (excluding holes)
+ */
+ while ((end - start - e820_hole_size(start, end)) < sz) {
+ end += FAKE_NODE_MIN_SIZE;
+ if (end >= max_addr)
+ break;
+ }
+ /*
+ * Look at the next node to make sure there is some real memory
+ * to map. Bad things happen when the only memory present
+ * in a zone on a fake node is IO hole.
+ */
+ while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) {
+ if (zone_cross_over(start, end + sz)) {
+ end = (MAX_DMA32_PFN << PAGE_SHIFT);
+ break;
+ }
+ if (end >= max_addr)
+ break;
+ end += FAKE_NODE_MIN_SIZE;
+ }
+ if (end > max_addr)
+ end = max_addr;
+ nodes[i].end = end;
printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
i,
nodes[i].start, nodes[i].end,
@@ -290,6 +428,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
end_pfn << PAGE_SHIFT);
/* setup dummy node covering all memory */
memnode_shift = 63;
+ memnodemap = memnode.embedded_map;
memnodemap[0] = 0;
nodes_clear(node_online_map);
node_set_online(0);
@@ -321,20 +460,6 @@ unsigned long __init numa_free_all_bootmem(void)
return pages;
}
-#ifdef CONFIG_SPARSEMEM
-static void __init arch_sparse_init(void)
-{
- int i;
-
- for_each_online_node(i)
- memory_present(i, node_start_pfn(i), node_end_pfn(i));
-
- sparse_init();
-}
-#else
-#define arch_sparse_init() do {} while (0)
-#endif
-
void __init paging_init(void)
{
int i;
@@ -344,7 +469,8 @@ void __init paging_init(void)
max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
max_zone_pfns[ZONE_NORMAL] = end_pfn;
- arch_sparse_init();
+ sparse_memory_present_with_active_regions(MAX_NUMNODES);
+ sparse_init();
for_each_online_node(i) {
setup_node_zones(i);
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index ccb91dd..65c5eaa 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -107,6 +107,7 @@ static void revert_page(unsigned long address, pgprot_t ref_prot)
pud_t *pud;
pmd_t *pmd;
pte_t large_pte;
+ unsigned long pfn;
pgd = pgd_offset_k(address);
BUG_ON(pgd_none(*pgd));
@@ -114,7 +115,8 @@ static void revert_page(unsigned long address, pgprot_t ref_prot)
BUG_ON(pud_none(*pud));
pmd = pmd_offset(pud, address);
BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
- large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
+ pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
+ large_pte = pfn_pte(pfn, ref_prot);
large_pte = pte_mkhuge(large_pte);
set_pte((pte_t *)pmd, large_pte);
}
diff --git a/arch/x86_64/pci/Makefile b/arch/x86_64/pci/Makefile
index 149aba0..c9eddc8 100644
--- a/arch/x86_64/pci/Makefile
+++ b/arch/x86_64/pci/Makefile
@@ -11,7 +11,7 @@ obj-y += fixup.o init.o
obj-$(CONFIG_ACPI) += acpi.o
obj-y += legacy.o irq.o common.o early.o
# mmconfig has a 64bit special
-obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o
+obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o mmconfig-shared.o
obj-$(CONFIG_NUMA) += k8-bus.o
@@ -24,3 +24,4 @@ fixup-y += ../../i386/pci/fixup.o
i386-y += ../../i386/pci/i386.o
init-y += ../../i386/pci/init.o
early-y += ../../i386/pci/early.o
+mmconfig-shared-y += ../../i386/pci/mmconfig-shared.o
diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c
index faabb6e..65d8273 100644
--- a/arch/x86_64/pci/mmconfig.c
+++ b/arch/x86_64/pci/mmconfig.c
@@ -13,16 +13,6 @@
#include "pci.h"
-/* aperture is up to 256MB but BIOS may reserve less */
-#define MMCONFIG_APER_MIN (2 * 1024*1024)
-#define MMCONFIG_APER_MAX (256 * 1024*1024)
-
-/* Verify the first 16 busses. We assume that systems with more busses
- get MCFG right. */
-#define MAX_CHECK_BUS 16
-
-static DECLARE_BITMAP(fallback_slots, 32*MAX_CHECK_BUS);
-
/* Static virtual mapping of the MMCONFIG aperture */
struct mmcfg_virt {
struct acpi_mcfg_allocation *cfg;
@@ -32,30 +22,17 @@ static struct mmcfg_virt *pci_mmcfg_virt;
static char __iomem *get_virt(unsigned int seg, unsigned bus)
{
- int cfg_num = -1;
struct acpi_mcfg_allocation *cfg;
+ int cfg_num;
- while (1) {
- ++cfg_num;
- if (cfg_num >= pci_mmcfg_config_num)
- break;
+ for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) {
cfg = pci_mmcfg_virt[cfg_num].cfg;
- if (cfg->pci_segment != seg)
- continue;
- if ((cfg->start_bus_number <= bus) &&
+ if (cfg->pci_segment == seg &&
+ (cfg->start_bus_number <= bus) &&
(cfg->end_bus_number >= bus))
return pci_mmcfg_virt[cfg_num].virt;
}
- /* Handle more broken MCFG tables on Asus etc.
- They only contain a single entry for bus 0-0. Assume
- this applies to all busses. */
- cfg = &pci_mmcfg_config[0];
- if (pci_mmcfg_config_num == 1 &&
- cfg->pci_segment == 0 &&
- (cfg->start_bus_number | cfg->end_bus_number) == 0)
- return pci_mmcfg_virt[0].virt;
-
/* Fall back to type 0 */
return NULL;
}
@@ -63,8 +40,8 @@ static char __iomem *get_virt(unsigned int seg, unsigned bus)
static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
{
char __iomem *addr;
- if (seg == 0 && bus < MAX_CHECK_BUS &&
- test_bit(32*bus + PCI_SLOT(devfn), fallback_slots))
+ if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS &&
+ test_bit(32*bus + PCI_SLOT(devfn), pci_mmcfg_fallback_slots))
return NULL;
addr = get_virt(seg, bus);
if (!addr)
@@ -135,79 +112,46 @@ static struct pci_raw_ops pci_mmcfg = {
.write = pci_mmcfg_write,
};
-/* K8 systems have some devices (typically in the builtin northbridge)
- that are only accessible using type1
- Normally this can be expressed in the MCFG by not listing them
- and assigning suitable _SEGs, but this isn't implemented in some BIOS.
- Instead try to discover all devices on bus 0 that are unreachable using MM
- and fallback for them. */
-static __init void unreachable_devices(void)
+static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg)
{
- int i, k;
- /* Use the max bus number from ACPI here? */
- for (k = 0; k < MAX_CHECK_BUS; k++) {
- for (i = 0; i < 32; i++) {
- u32 val1;
- char __iomem *addr;
-
- pci_conf1_read(0, k, PCI_DEVFN(i,0), 0, 4, &val1);
- if (val1 == 0xffffffff)
- continue;
- addr = pci_dev_base(0, k, PCI_DEVFN(i, 0));
- if (addr == NULL|| readl(addr) != val1) {
- set_bit(i + 32*k, fallback_slots);
- printk(KERN_NOTICE "PCI: No mmconfig possible"
- " on device %02x:%02x\n", k, i);
- }
- }
+ void __iomem *addr;
+ u32 size;
+
+ size = (cfg->end_bus_number + 1) << 20;
+ addr = ioremap_nocache(cfg->address, size);
+ if (addr) {
+ printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n",
+ cfg->address, cfg->address + size - 1);
}
+ return addr;
}
-void __init pci_mmcfg_init(int type)
+int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus,
+ unsigned int devfn)
{
- int i;
-
- if ((pci_probe & PCI_PROBE_MMCONF) == 0)
- return;
-
- acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
- if ((pci_mmcfg_config_num == 0) ||
- (pci_mmcfg_config == NULL) ||
- (pci_mmcfg_config[0].address == 0))
- return;
-
- /* Only do this check when type 1 works. If it doesn't work
- assume we run on a Mac and always use MCFG */
- if (type == 1 && !e820_all_mapped(pci_mmcfg_config[0].address,
- pci_mmcfg_config[0].address + MMCONFIG_APER_MIN,
- E820_RESERVED)) {
- printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %lx is not E820-reserved\n",
- (unsigned long)pci_mmcfg_config[0].address);
- printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
- return;
- }
+ return pci_dev_base(seg, bus, devfn) != NULL;
+}
- pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL);
+int __init pci_mmcfg_arch_init(void)
+{
+ int i;
+ pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) *
+ pci_mmcfg_config_num, GFP_KERNEL);
if (pci_mmcfg_virt == NULL) {
printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n");
- return;
+ return 0;
}
+
for (i = 0; i < pci_mmcfg_config_num; ++i) {
pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i];
- pci_mmcfg_virt[i].virt = ioremap_nocache(pci_mmcfg_config[i].address,
- MMCONFIG_APER_MAX);
+ pci_mmcfg_virt[i].virt = mcfg_ioremap(&pci_mmcfg_config[i]);
if (!pci_mmcfg_virt[i].virt) {
printk(KERN_ERR "PCI: Cannot map mmconfig aperture for "
"segment %d\n",
pci_mmcfg_config[i].pci_segment);
- return;
+ return 0;
}
- printk(KERN_INFO "PCI: Using MMCONFIG at %lx\n",
- (unsigned long)pci_mmcfg_config[i].address);
}
-
- unreachable_devices();
-
raw_pci_ops = &pci_mmcfg;
- pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
+ return 1;
}