From c2d0879112825cddddd6c4f9b2645ff32acd6dc5 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 22 Nov 2010 16:31:35 -0800 Subject: xen: clean up "extra" memory handling some more Make sure that extra_pages is added for all E820_RAM regions beyond mem_end - completely excluded regions as well as the remains of partially included regions. Also makes sure the extra region is not unnecessarily high, and simplifies the logic to decide which regions should be added. Signed-off-by: Jeremy Fitzhardinge diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 38fdffa..b85dcee 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -182,24 +182,21 @@ char * __init xen_memory_setup(void) for (i = 0; i < memmap.nr_entries; i++) { unsigned long long end = map[i].addr + map[i].size; - if (map[i].type == E820_RAM) { - if (map[i].addr < mem_end && end > mem_end) { - /* Truncate region to max_mem. */ - u64 delta = end - mem_end; + if (map[i].type == E820_RAM && end > mem_end) { + /* RAM off the end - may be partially included */ + u64 delta = min(map[i].size, end - mem_end); - map[i].size -= delta; - extra_pages += PFN_DOWN(delta); + map[i].size -= delta; + end -= delta; - end = mem_end; - } + extra_pages += PFN_DOWN(delta); } - if (end > xen_extra_mem_start) + if (map[i].size > 0 && end > xen_extra_mem_start) xen_extra_mem_start = end; - /* If region is non-RAM or below mem_end, add what remains */ - if ((map[i].type != E820_RAM || map[i].addr < mem_end) && - map[i].size > 0) + /* Add region if any remains */ + if (map[i].size > 0) e820_add_region(map[i].addr, map[i].size, map[i].type); } -- cgit v0.10.2 From bc15fde77fc5d9ec2eec6066a5ab554ea1266a0a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 22 Nov 2010 17:17:50 -0800 Subject: xen: use default_idle We just need the idle loop to drop into safe_halt, which default_idle() is perfectly capable of doing. There's no need to duplicate it. Signed-off-by: Jeremy Fitzhardinge diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index b85dcee..95fb68a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -250,20 +250,6 @@ char * __init xen_memory_setup(void) return "Xen"; } -static void xen_idle(void) -{ - local_irq_disable(); - - if (need_resched()) - local_irq_enable(); - else { - current_thread_info()->status &= ~TS_POLLING; - smp_mb__after_clear_bit(); - safe_halt(); - current_thread_info()->status |= TS_POLLING; - } -} - /* * Set the bit indicating "nosegneg" library variants should be used. * We only need to bother in pure 32-bit mode; compat 32-bit processes @@ -360,7 +346,11 @@ void __init xen_arch_setup(void) MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); - pm_idle = xen_idle; + /* Set up idle, making sure it calls safe_halt() pvop */ +#ifdef CONFIG_X86_32 + boot_cpu_data.hlt_works_ok = 1; +#endif + pm_idle = default_idle; fiddle_vdso(); } -- cgit v0.10.2 From 31e323cca9d5c8afd372976c35a5d46192f540d1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 29 Nov 2010 14:16:53 -0800 Subject: xen: don't bother to stop other cpus on shutdown/reboot Xen will shoot all the VCPUs when we do a shutdown hypercall, so there's no need to do it manually. In any case it will fail because all the IPI irqs have been pulled down by this point, so the cross-CPU calls will simply hang forever. Until change 76fac077db6b34e2c6383a7b4f3f4f7b7d06d8ce the function calls were not synchronously waited for, so this wasn't apparent. However after that change the calls became synchronous leading to a hang on shutdown on multi-VCPU guests. Signed-off-by: Jeremy Fitzhardinge Cc: Stable Kernel Cc: Alok Kataria diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 235c0f4..4a5973a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1016,10 +1016,6 @@ static void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; -#ifdef CONFIG_SMP - stop_other_cpus(); -#endif - if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) BUG(); } -- cgit v0.10.2 From 805e3f495057aa5307ad4e3d6dc7073d4733c691 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Wed, 3 Nov 2010 15:32:21 +0000 Subject: xen: x86/32: perform initial startup on initial_page_table Only make swapper_pg_dir readonly and pinned when generic x86 architecture code (which also starts on initial_page_table) switches to it. This helps ensure that the generic setup paths work on Xen unmodified. In particular clone_pgd_range writes directly to the destination pgd entries and is used to initialise swapper_pg_dir so we need to ensure that it remains writeable until the last possible moment during bring up. This is complicated slightly by the need to avoid sharing kernel PMD entries when running under Xen, therefore the Xen implementation must make a copy of the kernel PMD (which is otherwise referred to by both intial_page_table and swapper_pg_dir) before switching to swapper_pg_dir. Signed-off-by: Ian Campbell Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jeremy Fitzhardinge Signed-off-by: Jeremy Fitzhardinge diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 4a5973a..0db7303 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1187,8 +1187,6 @@ asmlinkage void __init xen_start_kernel(void) /* Allocate and initialize top and mid mfn levels for p2m structure */ xen_build_mfn_list_list(); - init_mm.pgd = pgd; - /* keep using Xen gdt for now; no urgent need to change it */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 21ed8d7..c9cf23e 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2119,44 +2119,83 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, return pgd; } #else /* !CONFIG_X86_64 */ -static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD); +static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); +static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); + +static __init void xen_write_cr3_init(unsigned long cr3) +{ + unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); + + BUG_ON(read_cr3() != __pa(initial_page_table)); + BUG_ON(cr3 != __pa(swapper_pg_dir)); + + /* + * We are switching to swapper_pg_dir for the first time (from + * initial_page_table) and therefore need to mark that page + * read-only and then pin it. + * + * Xen disallows sharing of kernel PMDs for PAE + * guests. Therefore we must copy the kernel PMD from + * initial_page_table into a new kernel PMD to be used in + * swapper_pg_dir. + */ + swapper_kernel_pmd = + extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); + memcpy(swapper_kernel_pmd, initial_kernel_pmd, + sizeof(pmd_t) * PTRS_PER_PMD); + swapper_pg_dir[KERNEL_PGD_BOUNDARY] = + __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); + set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); + + set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); + xen_write_cr3(cr3); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn); + + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, + PFN_DOWN(__pa(initial_page_table))); + set_page_prot(initial_page_table, PAGE_KERNEL); + set_page_prot(initial_kernel_pmd, PAGE_KERNEL); + + pv_mmu_ops.write_cr3 = &xen_write_cr3; +} __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pmd_t *kernel_pmd; - level2_kernel_pgt = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); + initial_kernel_pmd = + extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + xen_start_info->nr_pt_frames * PAGE_SIZE + 512*1024); kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); - memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); + memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); - xen_map_identity_early(level2_kernel_pgt, max_pfn); + xen_map_identity_early(initial_kernel_pmd, max_pfn); - memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); - set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], - __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); + memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD); + initial_page_table[KERNEL_PGD_BOUNDARY] = + __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); - set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); + set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO); + set_page_prot(initial_page_table, PAGE_KERNEL_RO); set_page_prot(empty_zero_page, PAGE_KERNEL_RO); pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - xen_write_cr3(__pa(swapper_pg_dir)); - - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, + PFN_DOWN(__pa(initial_page_table))); + xen_write_cr3(__pa(initial_page_table)); memblock_x86_reserve_range(__pa(xen_start_info->pt_base), __pa(xen_start_info->pt_base + xen_start_info->nr_pt_frames * PAGE_SIZE), "XEN PAGETABLES"); - return swapper_pg_dir; + return initial_page_table; } #endif /* CONFIG_X86_64 */ @@ -2290,7 +2329,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .write_cr2 = xen_write_cr2, .read_cr3 = xen_read_cr3, +#ifdef CONFIG_X86_32 + .write_cr3 = xen_write_cr3_init, +#else .write_cr3 = xen_write_cr3, +#endif .flush_tlb_user = xen_flush_tlb, .flush_tlb_kernel = xen_flush_tlb, -- cgit v0.10.2 From 2a4c92fa24e1853d0e21f9e6e45859b832240f94 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 2 Dec 2010 15:30:06 -0800 Subject: xen: prevent crashes with non-HIGHMEM 32-bit kernels with largeish memory If this is a non-HIGHMEM 32-bit kernel, then the page structures only go up to the limit of addressable memory, even if more memory is physically present. Don't try to add that extra memory to the balloon. Signed-off-by: Jeremy Fitzhardinge diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 2b17ad5..43f9f02 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -412,8 +412,16 @@ static int __init balloon_init(void) register_balloon(&balloon_sysdev); - /* Initialise the balloon with excess memory space. */ - extra_pfn_end = min(e820_end_of_ram_pfn(), + /* + * Initialise the balloon with excess memory space. We need + * to make sure we don't add memory which doesn't exist or + * logically exist. The E820 map can be trimmed to be smaller + * than the amount of physical memory due to the mem= command + * line parameter. And if this is a 32-bit non-HIGHMEM kernel + * on a system with memory which requires highmem to access, + * don't try to use it. + */ + extra_pfn_end = min(min(max_pfn, e820_end_of_ram_pfn()), (unsigned long)PFN_DOWN(xen_extra_mem_start + xen_extra_mem_size)); for (pfn = PFN_UP(xen_extra_mem_start); pfn < extra_pfn_end; -- cgit v0.10.2 From 29dcbc5c25d6d8140337e96bf503c8475092c586 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 2 Dec 2010 16:14:27 -0800 Subject: xen: allocate irq descs on any NUMA node Allocate irq descs on any NUMA node (we don't care) rather than specifically node 0, which may not exist. (At the moment NUMA is meaningless within a domain, so any info the kernel has is just from an SRAT table we haven't suppressed/disabled.) Signed-off-by: Jeremy Fitzhardinge diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 2811bb9..0009e48 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -423,7 +423,7 @@ static int find_unbound_irq(void) if (irq == start) goto no_irqs; - res = irq_alloc_desc_at(irq, 0); + res = irq_alloc_desc_at(irq, -1); if (WARN_ON(res != irq)) return -1; @@ -630,7 +630,7 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name) if (identity_mapped_irq(gsi) || (!xen_initial_domain() && xen_pv_domain())) { irq = gsi; - irq_alloc_desc_at(irq, 0); + irq_alloc_desc_at(irq, -1); } else irq = find_unbound_irq(); -- cgit v0.10.2