From c2d0879112825cddddd6c4f9b2645ff32acd6dc5 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Mon, 22 Nov 2010 16:31:35 -0800
Subject: xen: clean up "extra" memory handling some more

Make sure that extra_pages is added for all E820_RAM regions beyond
mem_end - completely excluded regions as well as the remains of partially
included regions.

Also makes sure the extra region is not unnecessarily high, and simplifies
the logic to decide which regions should be added.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 38fdffa..b85dcee 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -182,24 +182,21 @@ char * __init xen_memory_setup(void)
 	for (i = 0; i < memmap.nr_entries; i++) {
 		unsigned long long end = map[i].addr + map[i].size;
 
-		if (map[i].type == E820_RAM) {
-			if (map[i].addr < mem_end && end > mem_end) {
-				/* Truncate region to max_mem. */
-				u64 delta = end - mem_end;
+		if (map[i].type == E820_RAM && end > mem_end) {
+			/* RAM off the end - may be partially included */
+			u64 delta = min(map[i].size, end - mem_end);
 
-				map[i].size -= delta;
-				extra_pages += PFN_DOWN(delta);
+			map[i].size -= delta;
+			end -= delta;
 
-				end = mem_end;
-			}
+			extra_pages += PFN_DOWN(delta);
 		}
 
-		if (end > xen_extra_mem_start)
+		if (map[i].size > 0 && end > xen_extra_mem_start)
 			xen_extra_mem_start = end;
 
-		/* If region is non-RAM or below mem_end, add what remains */
-		if ((map[i].type != E820_RAM || map[i].addr < mem_end) &&
-		    map[i].size > 0)
+		/* Add region if any remains */
+		if (map[i].size > 0)
 			e820_add_region(map[i].addr, map[i].size, map[i].type);
 	}
 
-- 
cgit v0.10.2


From bc15fde77fc5d9ec2eec6066a5ab554ea1266a0a Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Mon, 22 Nov 2010 17:17:50 -0800
Subject: xen: use default_idle

We just need the idle loop to drop into safe_halt, which default_idle()
is perfectly capable of doing.  There's no need to duplicate it.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index b85dcee..95fb68a 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -250,20 +250,6 @@ char * __init xen_memory_setup(void)
 	return "Xen";
 }
 
-static void xen_idle(void)
-{
-	local_irq_disable();
-
-	if (need_resched())
-		local_irq_enable();
-	else {
-		current_thread_info()->status &= ~TS_POLLING;
-		smp_mb__after_clear_bit();
-		safe_halt();
-		current_thread_info()->status |= TS_POLLING;
-	}
-}
-
 /*
  * Set the bit indicating "nosegneg" library variants should be used.
  * We only need to bother in pure 32-bit mode; compat 32-bit processes
@@ -360,7 +346,11 @@ void __init xen_arch_setup(void)
 	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
 	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
 
-	pm_idle = xen_idle;
+	/* Set up idle, making sure it calls safe_halt() pvop */
+#ifdef CONFIG_X86_32
+	boot_cpu_data.hlt_works_ok = 1;
+#endif
+	pm_idle = default_idle;
 
 	fiddle_vdso();
 }
-- 
cgit v0.10.2


From 31e323cca9d5c8afd372976c35a5d46192f540d1 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Mon, 29 Nov 2010 14:16:53 -0800
Subject: xen: don't bother to stop other cpus on shutdown/reboot

Xen will shoot all the VCPUs when we do a shutdown hypercall, so there's
no need to do it manually.

In any case it will fail because all the IPI irqs have been pulled
down by this point, so the cross-CPU calls will simply hang forever.

Until change 76fac077db6b34e2c6383a7b4f3f4f7b7d06d8ce the function calls
were not synchronously waited for, so this wasn't apparent.  However after
that change the calls became synchronous leading to a hang on shutdown
on multi-VCPU guests.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Stable Kernel <stable@kernel.org>
Cc: Alok Kataria <akataria@vmware.com>

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 235c0f4..4a5973a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1016,10 +1016,6 @@ static void xen_reboot(int reason)
 {
 	struct sched_shutdown r = { .reason = reason };
 
-#ifdef CONFIG_SMP
-	stop_other_cpus();
-#endif
-
 	if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
 		BUG();
 }
-- 
cgit v0.10.2


From 805e3f495057aa5307ad4e3d6dc7073d4733c691 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Wed, 3 Nov 2010 15:32:21 +0000
Subject: xen: x86/32: perform initial startup on initial_page_table

Only make swapper_pg_dir readonly and pinned when generic x86 architecture code
(which also starts on initial_page_table) switches to it.  This helps ensure
that the generic setup paths work on Xen unmodified. In particular
clone_pgd_range writes directly to the destination pgd entries and is used to
initialise swapper_pg_dir so we need to ensure that it remains writeable until
the last possible moment during bring up.

This is complicated slightly by the need to avoid sharing kernel PMD entries
when running under Xen, therefore the Xen implementation must make a copy of
the kernel PMD (which is otherwise referred to by both intial_page_table and
swapper_pg_dir) before switching to swapper_pg_dir.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 4a5973a..0db7303 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1187,8 +1187,6 @@ asmlinkage void __init xen_start_kernel(void)
 	/* Allocate and initialize top and mid mfn levels for p2m structure */
 	xen_build_mfn_list_list();
 
-	init_mm.pgd = pgd;
-
 	/* keep using Xen gdt for now; no urgent need to change it */
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 21ed8d7..c9cf23e 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2119,44 +2119,83 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
 	return pgd;
 }
 #else	/* !CONFIG_X86_64 */
-static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD);
+static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
+static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
+
+static __init void xen_write_cr3_init(unsigned long cr3)
+{
+	unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
+
+	BUG_ON(read_cr3() != __pa(initial_page_table));
+	BUG_ON(cr3 != __pa(swapper_pg_dir));
+
+	/*
+	 * We are switching to swapper_pg_dir for the first time (from
+	 * initial_page_table) and therefore need to mark that page
+	 * read-only and then pin it.
+	 *
+	 * Xen disallows sharing of kernel PMDs for PAE
+	 * guests. Therefore we must copy the kernel PMD from
+	 * initial_page_table into a new kernel PMD to be used in
+	 * swapper_pg_dir.
+	 */
+	swapper_kernel_pmd =
+		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
+	memcpy(swapper_kernel_pmd, initial_kernel_pmd,
+	       sizeof(pmd_t) * PTRS_PER_PMD);
+	swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
+		__pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
+	set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
+
+	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
+	xen_write_cr3(cr3);
+	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
+
+	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
+			  PFN_DOWN(__pa(initial_page_table)));
+	set_page_prot(initial_page_table, PAGE_KERNEL);
+	set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
+
+	pv_mmu_ops.write_cr3 = &xen_write_cr3;
+}
 
 __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
 					 unsigned long max_pfn)
 {
 	pmd_t *kernel_pmd;
 
-	level2_kernel_pgt = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
+	initial_kernel_pmd =
+		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
 
 	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
 				  xen_start_info->nr_pt_frames * PAGE_SIZE +
 				  512*1024);
 
 	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
-	memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
+	memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
 
-	xen_map_identity_early(level2_kernel_pgt, max_pfn);
+	xen_map_identity_early(initial_kernel_pmd, max_pfn);
 
-	memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
-	set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
-			__pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
+	memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
+	initial_page_table[KERNEL_PGD_BOUNDARY] =
+		__pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
 
-	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
-	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
+	set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
+	set_page_prot(initial_page_table, PAGE_KERNEL_RO);
 	set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
 
 	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 
-	xen_write_cr3(__pa(swapper_pg_dir));
-
-	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
+	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
+			  PFN_DOWN(__pa(initial_page_table)));
+	xen_write_cr3(__pa(initial_page_table));
 
 	memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
 		      __pa(xen_start_info->pt_base +
 			   xen_start_info->nr_pt_frames * PAGE_SIZE),
 		      "XEN PAGETABLES");
 
-	return swapper_pg_dir;
+	return initial_page_table;
 }
 #endif	/* CONFIG_X86_64 */
 
@@ -2290,7 +2329,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
 	.write_cr2 = xen_write_cr2,
 
 	.read_cr3 = xen_read_cr3,
+#ifdef CONFIG_X86_32
+	.write_cr3 = xen_write_cr3_init,
+#else
 	.write_cr3 = xen_write_cr3,
+#endif
 
 	.flush_tlb_user = xen_flush_tlb,
 	.flush_tlb_kernel = xen_flush_tlb,
-- 
cgit v0.10.2


From 2a4c92fa24e1853d0e21f9e6e45859b832240f94 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Thu, 2 Dec 2010 15:30:06 -0800
Subject: xen: prevent crashes with non-HIGHMEM 32-bit kernels with largeish
 memory

If this is a non-HIGHMEM 32-bit kernel, then the page structures only go
up to the limit of addressable memory, even if more memory is physically
present.  Don't try to add that extra memory to the balloon.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 2b17ad5..43f9f02 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -412,8 +412,16 @@ static int __init balloon_init(void)
 
 	register_balloon(&balloon_sysdev);
 
-	/* Initialise the balloon with excess memory space. */
-	extra_pfn_end = min(e820_end_of_ram_pfn(),
+	/*
+	 * Initialise the balloon with excess memory space.  We need
+	 * to make sure we don't add memory which doesn't exist or
+	 * logically exist.  The E820 map can be trimmed to be smaller
+	 * than the amount of physical memory due to the mem= command
+	 * line parameter.  And if this is a 32-bit non-HIGHMEM kernel
+	 * on a system with memory which requires highmem to access,
+	 * don't try to use it.
+	 */
+	extra_pfn_end = min(min(max_pfn, e820_end_of_ram_pfn()),
 			    (unsigned long)PFN_DOWN(xen_extra_mem_start + xen_extra_mem_size));
 	for (pfn = PFN_UP(xen_extra_mem_start);
 	     pfn < extra_pfn_end;
-- 
cgit v0.10.2


From 29dcbc5c25d6d8140337e96bf503c8475092c586 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Thu, 2 Dec 2010 16:14:27 -0800
Subject: xen: allocate irq descs on any NUMA node

Allocate irq descs on any NUMA node (we don't care) rather than
specifically node 0, which may not exist.

(At the moment NUMA is meaningless within a domain, so any info
the kernel has is just from an SRAT table we haven't suppressed/disabled.)

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 2811bb9..0009e48 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -423,7 +423,7 @@ static int find_unbound_irq(void)
 	if (irq == start)
 		goto no_irqs;
 
-	res = irq_alloc_desc_at(irq, 0);
+	res = irq_alloc_desc_at(irq, -1);
 
 	if (WARN_ON(res != irq))
 		return -1;
@@ -630,7 +630,7 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name)
 	if (identity_mapped_irq(gsi) || (!xen_initial_domain() &&
 				xen_pv_domain())) {
 		irq = gsi;
-		irq_alloc_desc_at(irq, 0);
+		irq_alloc_desc_at(irq, -1);
 	} else
 		irq = find_unbound_irq();
 
-- 
cgit v0.10.2