From f89112502805c1f6a6955f90ad158e538edb319d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 4 Mar 2011 10:26:36 +0100 Subject: x86-64, NUMA: Revert NUMA affine page table allocation This patch reverts NUMA affine page table allocation added by commit 1411e0ec31 (x86-64, numa: Put pgtable to local node memory). The commit made an undocumented change where the kernel linear mapping strictly follows intersection of e820 memory map and NUMA configuration. If the physical memory configuration has holes or NUMA nodes are not properly aligned, this leads to using unnecessarily smaller mapping size which leads to increased TLB pressure. For details, http://thread.gmane.org/gmane.linux.kernel/1104672 Patches to fix the problem have been proposed but the underlying code needs more cleanup and the approach itself seems a bit heavy handed and it has been determined to revert the feature for now and come back to it in the next developement cycle. http://thread.gmane.org/gmane.linux.kernel/1105959 As init_memory_mapping_high() callsites have been consolidated since the commit, reverting is done manually. Also, the RED-PEN comment in arch/x86/mm/init.c is not restored as the problem no longer exists with memblock based top-down early memory allocation. Signed-off-by: Tejun Heo Cc: Yinghai Lu Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 97e6007..bce688d 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -54,8 +54,6 @@ static inline phys_addr_t get_max_mapped(void) extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); -void init_memory_mapping_high(void); - extern void initmem_init(void); extern void free_initmem(void); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 46e684f..c3a606c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -963,6 +963,14 @@ void __init setup_arch(char **cmdline_p) max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< max_low_pfn) { + max_pfn_mapped = init_memory_mapping(1UL<<32, + max_pfn<start); - final_end = min_t(unsigned long, end_pfn<end); - - if (final_end <= final_start) - return 0; - - pfn_mapped = init_memory_mapping(final_start, final_end); - - if (pfn_mapped > data->pfn_mapped) - data->pfn_mapped = pfn_mapped; - - return 0; -} - -static unsigned long __init_refok -init_memory_mapping_active_regions(unsigned long start, unsigned long end) -{ - struct mapping_work_data data; - - data.start = start; - data.end = end; - data.pfn_mapped = 0; - - work_with_active_regions(MAX_NUMNODES, mapping_work_fn, &data); - - return data.pfn_mapped; -} - -void __init_refok init_memory_mapping_high(void) -{ - if (max_pfn > max_low_pfn) { - max_pfn_mapped = init_memory_mapping_active_regions(1UL<<32, - max_pfn< Date: Fri, 4 Mar 2011 14:49:28 +0100 Subject: x86-64, NUMA: Fix numa_emulation code with node0 without RAM On one system that does not have RAM on node0. When numa_emulation is compiled in, and 1. boot system without numa=fake... 2. or boot system with numa=fake=128 to make emulation fail will get: [ 0.092026] ------------[ cut here ]------------ [ 0.096005] kernel BUG at arch/x86/mm/numa_emulation.c:439! [ 0.096005] invalid opcode: 0000 [#1] SMP [ 0.096005] last sysfs file: [ 0.096005] CPU 0 [ 0.096005] Modules linked in: [ 0.096005] [ 0.096005] Pid: 0, comm: swapper Not tainted 2.6.38-rc6-tip-yh-03869-gcb0491d-dirty #684 Sun Microsystems Sun Fire X4240/Sun Fire X4240 [ 0.096005] RIP: 0010:[] [] numa_add_cpu+0x56/0xcf [ 0.096005] RSP: 0000:ffffffff82437ed8 EFLAGS: 00010246 ... [ 0.096005] Call Trace: [ 0.096005] [] identify_cpu+0x2d7/0x2df [ 0.096005] [] identify_boot_cpu+0x10/0x30 [ 0.096005] [] check_bugs+0x9/0x2d [ 0.096005] [] start_kernel+0x3d7/0x3f1 [ 0.096005] [] x86_64_start_reservations+0x9c/0xa0 [ 0.096005] [] x86_64_start_kernel+0x1dd/0x1e8 [ 0.096005] Code: 74 06 48 8d 04 90 eb 0f 48 c7 c0 30 d9 00 00 48 03 04 d5 90 0f 60 82 8b 00 83 f8 ff 74 0d 0f a3 05 8b 7e 92 00 19 d2 85 d2 75 02 <0f> 0b 48 98 be 00 01 00 00 48 c7 c7 e0 44 60 82 44 8b 2c 85 e0 [ 0.096005] RIP [] numa_add_cpu+0x56/0xcf [ 0.096005] RSP [ 0.096026] ---[ end trace a7919e7f17c0a725 ]--- We need to use early_cpu_to_node() directly, because numa_cpu_node() will return node0 that is not onlined. Signed-off-by: Yinghai Lu Signed-off-by: Tejun Heo diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index aeecea9..75b31dc 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -417,9 +417,7 @@ void __cpuinit numa_add_cpu(int cpu) { int physnid, nid; - nid = numa_cpu_node(cpu); - if (nid == NUMA_NO_NODE) - nid = early_cpu_to_node(cpu); + nid = early_cpu_to_node(cpu); BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); physnid = emu_nid_to_phys[nid]; -- cgit v0.10.2 From c09cedf4f75f1e47ea17f55e18e9cfb81bec8575 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 4 Mar 2011 15:17:21 +0100 Subject: x86-64, NUMA: Clean up initmem_init() This patch cleans initmem_init() so that it is more readable and doesn't use an unnecessary array of function pointers to convolute the flow of the code. It also makes it obvious that dummy_numa_init() will always succeed (and documents that requirement) so that the existing BUG() is never actually reached. No functional change. -tj: Updated comment for dummy_numa_init() slightly. Signed-off-by: David Rientjes Signed-off-by: Tejun Heo diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 86491ba..9ec0f20 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -562,6 +562,15 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) return 0; } +/** + * dummy_numma_init - Fallback dummy NUMA init + * + * Used if there's no underlying NUMA architecture, NUMA initialization + * fails, or NUMA is disabled on the command line. + * + * Must online at least one node and add memory blocks that cover all + * allowed memory. This function must not fail. + */ static int __init dummy_numa_init(void) { printk(KERN_INFO "%s\n", @@ -575,57 +584,64 @@ static int __init dummy_numa_init(void) return 0; } -void __init initmem_init(void) +static int __init numa_init(int (*init_func)(void)) { - int (*numa_init[])(void) = { [2] = dummy_numa_init }; - int i, j; - - if (!numa_off) { -#ifdef CONFIG_ACPI_NUMA - numa_init[0] = x86_acpi_numa_init; -#endif -#ifdef CONFIG_AMD_NUMA - numa_init[1] = amd_numa_init; -#endif - } + int i; + int ret; - for (i = 0; i < ARRAY_SIZE(numa_init); i++) { - if (!numa_init[i]) - continue; + for (i = 0; i < MAX_LOCAL_APIC; i++) + set_apicid_to_node(i, NUMA_NO_NODE); - for (j = 0; j < MAX_LOCAL_APIC; j++) - set_apicid_to_node(j, NUMA_NO_NODE); + nodes_clear(numa_nodes_parsed); + nodes_clear(node_possible_map); + nodes_clear(node_online_map); + memset(&numa_meminfo, 0, sizeof(numa_meminfo)); + remove_all_active_ranges(); + numa_reset_distance(); - nodes_clear(numa_nodes_parsed); - nodes_clear(node_possible_map); - nodes_clear(node_online_map); - memset(&numa_meminfo, 0, sizeof(numa_meminfo)); - remove_all_active_ranges(); - numa_reset_distance(); + ret = init_func(); + if (ret < 0) + return ret; + ret = numa_cleanup_meminfo(&numa_meminfo); + if (ret < 0) + return ret; - if (numa_init[i]() < 0) - continue; + numa_emulation(&numa_meminfo, numa_distance_cnt); - if (numa_cleanup_meminfo(&numa_meminfo) < 0) - continue; + ret = numa_register_memblks(&numa_meminfo); + if (ret < 0) + return ret; - numa_emulation(&numa_meminfo, numa_distance_cnt); + for (i = 0; i < nr_cpu_ids; i++) { + int nid = early_cpu_to_node(i); - if (numa_register_memblks(&numa_meminfo) < 0) + if (nid == NUMA_NO_NODE) continue; + if (!node_online(nid)) + numa_clear_node(i); + } + numa_init_array(); + return 0; +} - for (j = 0; j < nr_cpu_ids; j++) { - int nid = early_cpu_to_node(j); +void __init initmem_init(void) +{ + int ret; - if (nid == NUMA_NO_NODE) - continue; - if (!node_online(nid)) - numa_clear_node(j); - } - numa_init_array(); - return; + if (!numa_off) { +#ifdef CONFIG_ACPI_NUMA + ret = numa_init(x86_acpi_numa_init); + if (!ret) + return; +#endif +#ifdef CONFIG_AMD_NUMA + ret = numa_init(amd_numa_init); + if (!ret) + return; +#endif } - BUG(); + + numa_init(dummy_numa_init); } unsigned long __init numa_free_all_bootmem(void) -- cgit v0.10.2 From 078a198906c796981f93ff100c210506e91aade5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 4 Mar 2011 16:32:02 +0100 Subject: x86-64, NUMA: Don't assume phys node 0 is always online in numa_emulation() Undetermined entries in emu_nid_to_phys[] are filled with zero assuming that physical node 0 is always online; however, this might not be true depending on hardware configuration. Find a physical node which is actually online and use it instead. Signed-off-by: Tejun Heo Reported-by: David Rientjes LKML-Reference: diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 75b31dc..3696be0 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -301,6 +301,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) const u64 max_addr = max_pfn << PAGE_SHIFT; u8 *phys_dist = NULL; size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); + int dfl_phys_nid; int i, j, ret; if (!emu_cmdline) @@ -357,6 +358,19 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) node_distance(i, j); } + /* determine the default phys nid to use for unmapped nodes */ + dfl_phys_nid = NUMA_NO_NODE; + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { + if (emu_nid_to_phys[i] != NUMA_NO_NODE) { + dfl_phys_nid = emu_nid_to_phys[i]; + break; + } + } + if (dfl_phys_nid == NUMA_NO_NODE) { + pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n"); + goto no_emu; + } + /* commit */ *numa_meminfo = ei; @@ -377,7 +391,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) /* make sure all emulated nodes are mapped to a physical node */ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) if (emu_nid_to_phys[i] == NUMA_NO_NODE) - emu_nid_to_phys[i] = 0; + emu_nid_to_phys[i] = dfl_phys_nid; /* * Transform distance table. numa_set_distance() ignores all -- cgit v0.10.2