From 6a242909b01120f6f3d571c0b75e20ec61f0d8d3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2009 14:33:58 +0900
Subject: percpu: clean up percpu constants

Impact: cleaup

Make the following cleanups.

* There isn't much arch-specific about PERCPU_MODULE_RESERVE.  Always
  define it whether arch overrides PERCPU_ENOUGH_ROOM or not.

* blackfin overrides PERCPU_ENOUGH_ROOM to align static area size.  Do
  it by default.

* percpu allocation sizes doesn't have much to do with the page size.
  Don't use PAGE_SHIFT in their definition.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Bryan Wu <cooloney@kernel.org>

diff --git a/arch/blackfin/include/asm/percpu.h b/arch/blackfin/include/asm/percpu.h
index 797c0c1..c94c7bc 100644
--- a/arch/blackfin/include/asm/percpu.h
+++ b/arch/blackfin/include/asm/percpu.h
@@ -3,14 +3,4 @@
 
 #include <asm-generic/percpu.h>
 
-#ifdef CONFIG_MODULES
-#define PERCPU_MODULE_RESERVE 8192
-#else
-#define PERCPU_MODULE_RESERVE 0
-#endif
-
-#define PERCPU_ENOUGH_ROOM \
-	(ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \
-	 PERCPU_MODULE_RESERVE)
-
 #endif	/* __ARCH_BLACKFIN_PERCPU__ */
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 545b068..2d34b03 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -5,6 +5,7 @@
 #include <linux/slab.h> /* For kmalloc() */
 #include <linux/smp.h>
 #include <linux/cpumask.h>
+#include <linux/pfn.h>
 
 #include <asm/percpu.h>
 
@@ -52,17 +53,18 @@
 #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
 #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
 
-/* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */
-#ifndef PERCPU_ENOUGH_ROOM
+/* enough to cover all DEFINE_PER_CPUs in modules */
 #ifdef CONFIG_MODULES
-#define PERCPU_MODULE_RESERVE	8192
+#define PERCPU_MODULE_RESERVE		(8 << 10)
 #else
-#define PERCPU_MODULE_RESERVE	0
+#define PERCPU_MODULE_RESERVE		0
 #endif
 
+#ifndef PERCPU_ENOUGH_ROOM
 #define PERCPU_ENOUGH_ROOM						\
-	(__per_cpu_end - __per_cpu_start + PERCPU_MODULE_RESERVE)
-#endif	/* PERCPU_ENOUGH_ROOM */
+	(ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) +	\
+	 PERCPU_MODULE_RESERVE)
+#endif
 
 /*
  * Must be an lvalue. Since @var must be a simple identifier,
@@ -79,7 +81,7 @@
 #ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
 
 /* minimum unit size, also is the maximum supported allocation size */
-#define PCPU_MIN_UNIT_SIZE		(16UL << PAGE_SHIFT)
+#define PCPU_MIN_UNIT_SIZE		PFN_ALIGN(64 << 10)
 
 /*
  * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy
@@ -96,15 +98,15 @@
 #ifndef PERCPU_DYNAMIC_RESERVE
 #  if BITS_PER_LONG > 32
 #    ifdef CONFIG_MODULES
-#      define PERCPU_DYNAMIC_RESERVE	(6 << PAGE_SHIFT)
+#      define PERCPU_DYNAMIC_RESERVE	(24 << 10)
 #    else
-#      define PERCPU_DYNAMIC_RESERVE	(4 << PAGE_SHIFT)
+#      define PERCPU_DYNAMIC_RESERVE	(16 << 10)
 #    endif
 #  else
 #    ifdef CONFIG_MODULES
-#      define PERCPU_DYNAMIC_RESERVE	(4 << PAGE_SHIFT)
+#      define PERCPU_DYNAMIC_RESERVE	(16 << 10)
 #    else
-#      define PERCPU_DYNAMIC_RESERVE	(2 << PAGE_SHIFT)
+#      define PERCPU_DYNAMIC_RESERVE	(8 << 10)
 #    endif
 #  endif
 #endif	/* PERCPU_DYNAMIC_RESERVE */
-- 
cgit v0.10.2


From 2441d15c97d498b18f03ae9fba262ffeae42a08b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2009 14:33:59 +0900
Subject: percpu: cosmetic renames in pcpu_setup_first_chunk()

Impact: cosmetic, preparation for future changes

Make the following renames in pcpur_setup_first_chunk() in preparation
for future changes.

* s/free_size/dyn_size/
* s/static_vm/first_vm/
* s/static_chunk/schunk/

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 2d34b03..a0b4ea2 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -118,7 +118,7 @@ typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
 
 extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 					size_t static_size, size_t unit_size,
-					size_t free_size, void *base_addr,
+					size_t dyn_size, void *base_addr,
 					pcpu_populate_pte_fn_t populate_pte_fn);
 
 /*
diff --git a/mm/percpu.c b/mm/percpu.c
index 3d0f545..9531590 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -831,7 +831,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * @get_page_fn: callback to fetch page pointer
  * @static_size: the size of static percpu area in bytes
  * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto
- * @free_size: free size in bytes, 0 for auto
+ * @dyn_size: free size for dynamic allocation in bytes, 0 for auto
  * @base_addr: mapped address, NULL for auto
  * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
  *
@@ -849,12 +849,12 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * return the same number of pages for all cpus.
  *
  * @unit_size, if non-zero, determines unit size and must be aligned
- * to PAGE_SIZE and equal to or larger than @static_size + @free_size.
+ * to PAGE_SIZE and equal to or larger than @static_size + @dyn_size.
  *
- * @free_size determines the number of free bytes after the static
+ * @dyn_size determines the number of free bytes after the static
  * area in the first chunk.  If zero, whatever left is available.
  * Specifying non-zero value make percpu leave the area after
- * @static_size + @free_size alone.
+ * @static_size + @dyn_size alone.
  *
  * Non-null @base_addr means that the caller already allocated virtual
  * region for the first chunk and mapped it.  percpu must not mess
@@ -870,19 +870,19 @@ EXPORT_SYMBOL_GPL(free_percpu);
  */
 size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 				     size_t static_size, size_t unit_size,
-				     size_t free_size, void *base_addr,
+				     size_t dyn_size, void *base_addr,
 				     pcpu_populate_pte_fn_t populate_pte_fn)
 {
-	static struct vm_struct static_vm;
-	struct pcpu_chunk *static_chunk;
+	static struct vm_struct first_vm;
+	struct pcpu_chunk *schunk;
 	unsigned int cpu;
 	int nr_pages;
 	int err, i;
 
 	/* santiy checks */
 	BUG_ON(!static_size);
-	BUG_ON(!unit_size && free_size);
-	BUG_ON(unit_size && unit_size < static_size + free_size);
+	BUG_ON(!unit_size && dyn_size);
+	BUG_ON(unit_size && unit_size < static_size + dyn_size);
 	BUG_ON(unit_size & ~PAGE_MASK);
 	BUG_ON(base_addr && !unit_size);
 	BUG_ON(base_addr && populate_pte_fn);
@@ -908,24 +908,24 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	for (i = 0; i < pcpu_nr_slots; i++)
 		INIT_LIST_HEAD(&pcpu_slot[i]);
 
-	/* init static_chunk */
-	static_chunk = alloc_bootmem(pcpu_chunk_struct_size);
-	INIT_LIST_HEAD(&static_chunk->list);
-	static_chunk->vm = &static_vm;
+	/* init static chunk */
+	schunk = alloc_bootmem(pcpu_chunk_struct_size);
+	INIT_LIST_HEAD(&schunk->list);
+	schunk->vm = &first_vm;
 
-	if (free_size)
-		static_chunk->free_size = free_size;
+	if (dyn_size)
+		schunk->free_size = dyn_size;
 	else
-		static_chunk->free_size = pcpu_unit_size - pcpu_static_size;
+		schunk->free_size = pcpu_unit_size - pcpu_static_size;
 
-	static_chunk->contig_hint = static_chunk->free_size;
+	schunk->contig_hint = schunk->free_size;
 
 	/* allocate vm address */
-	static_vm.flags = VM_ALLOC;
-	static_vm.size = pcpu_chunk_size;
+	first_vm.flags = VM_ALLOC;
+	first_vm.size = pcpu_chunk_size;
 
 	if (!base_addr)
-		vm_area_register_early(&static_vm, PAGE_SIZE);
+		vm_area_register_early(&first_vm, PAGE_SIZE);
 	else {
 		/*
 		 * Pages already mapped.  No need to remap into
@@ -933,8 +933,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 		 * be mapped or unmapped by percpu and is marked
 		 * immutable.
 		 */
-		static_vm.addr = base_addr;
-		static_chunk->immutable = true;
+		first_vm.addr = base_addr;
+		schunk->immutable = true;
 	}
 
 	/* assign pages */
@@ -945,7 +945,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 
 			if (!page)
 				break;
-			*pcpu_chunk_pagep(static_chunk, cpu, i) = page;
+			*pcpu_chunk_pagep(schunk, cpu, i) = page;
 		}
 
 		BUG_ON(i < PFN_UP(pcpu_static_size));
@@ -960,20 +960,20 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	if (populate_pte_fn) {
 		for_each_possible_cpu(cpu)
 			for (i = 0; i < nr_pages; i++)
-				populate_pte_fn(pcpu_chunk_addr(static_chunk,
+				populate_pte_fn(pcpu_chunk_addr(schunk,
 								cpu, i));
 
-		err = pcpu_map(static_chunk, 0, nr_pages);
+		err = pcpu_map(schunk, 0, nr_pages);
 		if (err)
 			panic("failed to setup static percpu area, err=%d\n",
 			      err);
 	}
 
-	/* link static_chunk in */
-	pcpu_chunk_relocate(static_chunk, -1);
-	pcpu_chunk_addr_insert(static_chunk);
+	/* link the first chunk in */
+	pcpu_chunk_relocate(schunk, -1);
+	pcpu_chunk_addr_insert(schunk);
 
 	/* we're done */
-	pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0);
+	pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
 	return pcpu_unit_size;
 }
-- 
cgit v0.10.2


From 61ace7fa2fff9c4b6641c506b6b3f1a9394a1b11 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2009 14:33:59 +0900
Subject: percpu: improve first chunk initial area map handling

Impact: no functional change

When the first chunk is created, its initial area map is not allocated
because kmalloc isn't online yet.  The map is allocated and
initialized on the first allocation request on the chunk.  This works
fine but the scattering of initialization logic between the init
function and allocation path is a bit confusing.

This patch makes the first chunk initialize and use minimal statically
allocated map from pcpu_setpu_first_chunk().  The map resizing path
still needs to handle this specially but it's more straight-forward
and gives more latitude to the init path.  This will ease future
changes.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/mm/percpu.c b/mm/percpu.c
index 9531590..503ccad 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -93,9 +93,6 @@ static size_t pcpu_chunk_struct_size __read_mostly;
 void *pcpu_base_addr __read_mostly;
 EXPORT_SYMBOL_GPL(pcpu_base_addr);
 
-/* the size of kernel static area */
-static int pcpu_static_size __read_mostly;
-
 /*
  * One mutex to rule them all.
  *
@@ -316,15 +313,28 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
 
 	/* reallocation required? */
 	if (chunk->map_alloc < target) {
-		int new_alloc = chunk->map_alloc;
+		int new_alloc;
 		int *new;
 
+		new_alloc = PCPU_DFL_MAP_ALLOC;
 		while (new_alloc < target)
 			new_alloc *= 2;
 
-		new = pcpu_realloc(chunk->map,
-				   chunk->map_alloc * sizeof(new[0]),
-				   new_alloc * sizeof(new[0]));
+		if (chunk->map_alloc < PCPU_DFL_MAP_ALLOC) {
+			/*
+			 * map_alloc smaller than the default size
+			 * indicates that the chunk is one of the
+			 * first chunks and still using static map.
+			 * Allocate a dynamic one and copy.
+			 */
+			new = pcpu_realloc(NULL, 0, new_alloc * sizeof(new[0]));
+			if (new)
+				memcpy(new, chunk->map,
+				       chunk->map_alloc * sizeof(new[0]));
+		} else
+			new = pcpu_realloc(chunk->map,
+					   chunk->map_alloc * sizeof(new[0]),
+					   new_alloc * sizeof(new[0]));
 		if (!new)
 			return -ENOMEM;
 
@@ -367,22 +377,6 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
 	int max_contig = 0;
 	int i, off;
 
-	/*
-	 * The static chunk initially doesn't have map attached
-	 * because kmalloc wasn't available during init.  Give it one.
-	 */
-	if (unlikely(!chunk->map)) {
-		chunk->map = pcpu_realloc(NULL, 0,
-				PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
-		if (!chunk->map)
-			return -ENOMEM;
-
-		chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
-		chunk->map[chunk->map_used++] = -pcpu_static_size;
-		if (chunk->free_size)
-			chunk->map[chunk->map_used++] = chunk->free_size;
-	}
-
 	for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
 		bool is_last = i + 1 == chunk->map_used;
 		int head, tail;
@@ -874,12 +868,14 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 				     pcpu_populate_pte_fn_t populate_pte_fn)
 {
 	static struct vm_struct first_vm;
+	static int smap[2];
 	struct pcpu_chunk *schunk;
 	unsigned int cpu;
 	int nr_pages;
 	int err, i;
 
 	/* santiy checks */
+	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC);
 	BUG_ON(!static_size);
 	BUG_ON(!unit_size && dyn_size);
 	BUG_ON(unit_size && unit_size < static_size + dyn_size);
@@ -893,7 +889,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 		pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
 					PFN_UP(static_size));
 
-	pcpu_static_size = static_size;
 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
@@ -912,14 +907,20 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	schunk = alloc_bootmem(pcpu_chunk_struct_size);
 	INIT_LIST_HEAD(&schunk->list);
 	schunk->vm = &first_vm;
+	schunk->map = smap;
+	schunk->map_alloc = ARRAY_SIZE(smap);
 
 	if (dyn_size)
 		schunk->free_size = dyn_size;
 	else
-		schunk->free_size = pcpu_unit_size - pcpu_static_size;
+		schunk->free_size = pcpu_unit_size - static_size;
 
 	schunk->contig_hint = schunk->free_size;
 
+	schunk->map[schunk->map_used++] = -static_size;
+	if (schunk->free_size)
+		schunk->map[schunk->map_used++] = schunk->free_size;
+
 	/* allocate vm address */
 	first_vm.flags = VM_ALLOC;
 	first_vm.size = pcpu_chunk_size;
@@ -948,7 +949,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 			*pcpu_chunk_pagep(schunk, cpu, i) = page;
 		}
 
-		BUG_ON(i < PFN_UP(pcpu_static_size));
+		BUG_ON(i < PFN_UP(static_size));
 
 		if (nr_pages < 0)
 			nr_pages = i;
-- 
cgit v0.10.2


From cafe8816b217b98dc3f268d3b77445da498beb4f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2009 14:33:59 +0900
Subject: percpu: use negative for auto for pcpu_setup_first_chunk() arguments

Impact: argument semantic cleanup

In pcpu_setup_first_chunk(), zero @unit_size and @dyn_size meant
auto-sizing.  It's okay for @unit_size as 0 doesn't make sense but 0
dynamic reserve size is valid.  Alos, if arch @dyn_size is calculated
from other parameters, it might end up passing in 0 @dyn_size and
malfunction when the size is automatically adjusted.

This patch makes both @unit_size and @dyn_size ssize_t and use -1 for
auto sizing.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index c29f301..ef3a2cd3f 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -344,7 +344,7 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
 	pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
 		pcpu4k_nr_static_pages, static_size);
 
-	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL,
+	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, -1, -1, NULL,
 				     pcpu4k_populate_pte);
 	goto out_free_ar;
 
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index a0b4ea2..a96fc53 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -117,8 +117,9 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
 typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
 
 extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
-					size_t static_size, size_t unit_size,
-					size_t dyn_size, void *base_addr,
+					size_t static_size,
+					ssize_t unit_size, ssize_t dyn_size,
+					void *base_addr,
 					pcpu_populate_pte_fn_t populate_pte_fn);
 
 /*
diff --git a/mm/percpu.c b/mm/percpu.c
index 503ccad..a84cf99 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -824,8 +824,8 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * pcpu_setup_first_chunk - initialize the first percpu chunk
  * @get_page_fn: callback to fetch page pointer
  * @static_size: the size of static percpu area in bytes
- * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto
- * @dyn_size: free size for dynamic allocation in bytes, 0 for auto
+ * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
+ * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
  * @base_addr: mapped address, NULL for auto
  * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
  *
@@ -842,13 +842,14 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * indicates end of pages for the cpu.  Note that @get_page_fn() must
  * return the same number of pages for all cpus.
  *
- * @unit_size, if non-zero, determines unit size and must be aligned
- * to PAGE_SIZE and equal to or larger than @static_size + @dyn_size.
+ * @unit_size, if non-negative, specifies unit size and must be
+ * aligned to PAGE_SIZE and equal to or larger than @static_size +
+ * @dyn_size.
  *
- * @dyn_size determines the number of free bytes after the static
- * area in the first chunk.  If zero, whatever left is available.
- * Specifying non-zero value make percpu leave the area after
- * @static_size + @dyn_size alone.
+ * @dyn_size, if non-negative, limits the number of bytes available
+ * for dynamic allocation in the first chunk.  Specifying non-negative
+ * value make percpu leave alone the area beyond @static_size +
+ * @dyn_size.
  *
  * Non-null @base_addr means that the caller already allocated virtual
  * region for the first chunk and mapped it.  percpu must not mess
@@ -863,8 +864,9 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * percpu access.
  */
 size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
-				     size_t static_size, size_t unit_size,
-				     size_t dyn_size, void *base_addr,
+				     size_t static_size,
+				     ssize_t unit_size, ssize_t dyn_size,
+				     void *base_addr,
 				     pcpu_populate_pte_fn_t populate_pte_fn)
 {
 	static struct vm_struct first_vm;
@@ -877,13 +879,17 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	/* santiy checks */
 	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC);
 	BUG_ON(!static_size);
-	BUG_ON(!unit_size && dyn_size);
-	BUG_ON(unit_size && unit_size < static_size + dyn_size);
-	BUG_ON(unit_size & ~PAGE_MASK);
-	BUG_ON(base_addr && !unit_size);
+	if (unit_size >= 0) {
+		BUG_ON(unit_size < static_size +
+				   (dyn_size >= 0 ? dyn_size : 0));
+		BUG_ON(unit_size & ~PAGE_MASK);
+	} else {
+		BUG_ON(dyn_size >= 0);
+		BUG_ON(base_addr);
+	}
 	BUG_ON(base_addr && populate_pte_fn);
 
-	if (unit_size)
+	if (unit_size >= 0)
 		pcpu_unit_pages = unit_size >> PAGE_SHIFT;
 	else
 		pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
@@ -894,6 +900,9 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
 		+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
 
+	if (dyn_size < 0)
+		dyn_size = pcpu_unit_size - static_size;
+
 	/*
 	 * Allocate chunk slots.  The additional last slot is for
 	 * empty chunks.
@@ -909,12 +918,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	schunk->vm = &first_vm;
 	schunk->map = smap;
 	schunk->map_alloc = ARRAY_SIZE(smap);
-
-	if (dyn_size)
-		schunk->free_size = dyn_size;
-	else
-		schunk->free_size = pcpu_unit_size - static_size;
-
+	schunk->free_size = dyn_size;
 	schunk->contig_hint = schunk->free_size;
 
 	schunk->map[schunk->map_used++] = -static_size;
-- 
cgit v0.10.2


From 9a4f8a878b68d5a5d9ee60908a52cf6a55e1b823 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2009 14:33:59 +0900
Subject: x86: make embedding percpu allocator return excessive free space

Impact: reduce unnecessary memory usage on certain configurations

Embedding percpu allocator allocates unit_size *
smp_num_possible_cpus() bytes consecutively and use it for the first
chunk.  However, if the static area is small, this can result in
excessive prellocated free space in the first chunk due to
PCPU_MIN_UNIT_SIZE restriction.

This patch makes embedding percpu allocator preallocate only what's
necessary as described by PERPCU_DYNAMIC_RESERVE and return the
leftover to the bootmem allocator.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ef3a2cd3f..38e2b2a 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -241,24 +241,31 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
  * Embedding allocator
  *
  * The first chunk is sized to just contain the static area plus
- * PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using
- * bootmem allocator and used as-is without being mapped into vmalloc
- * area.  This enables the first chunk to piggy back on the linear
- * physical PMD mapping and doesn't add any additional pressure to
- * TLB.
+ * module and dynamic reserves, and allocated as a contiguous area
+ * using bootmem allocator and used as-is without being mapped into
+ * vmalloc area.  This enables the first chunk to piggy back on the
+ * linear physical PMD mapping and doesn't add any additional pressure
+ * to TLB.  Note that if the needed size is smaller than the minimum
+ * unit size, the leftover is returned to the bootmem allocator.
  */
 static void *pcpue_ptr __initdata;
+static size_t pcpue_size __initdata;
 static size_t pcpue_unit_size __initdata;
 
 static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
 {
-	return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size
-			    + ((size_t)pageno << PAGE_SHIFT));
+	size_t off = (size_t)pageno << PAGE_SHIFT;
+
+	if (off >= pcpue_size)
+		return NULL;
+
+	return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
 }
 
 static ssize_t __init setup_pcpu_embed(size_t static_size)
 {
 	unsigned int cpu;
+	size_t dyn_size;
 
 	/*
 	 * If large page isn't supported, there's no benefit in doing
@@ -269,25 +276,30 @@ static ssize_t __init setup_pcpu_embed(size_t static_size)
 		return -EINVAL;
 
 	/* allocate and copy */
-	pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
-	pcpue_unit_size = max_t(size_t, pcpue_unit_size, PCPU_MIN_UNIT_SIZE);
+	pcpue_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
+	pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
+	dyn_size = pcpue_size - static_size;
+
 	pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
 				       PAGE_SIZE);
 	if (!pcpue_ptr)
 		return -ENOMEM;
 
-	for_each_possible_cpu(cpu)
-		memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load,
-		       static_size);
+	for_each_possible_cpu(cpu) {
+		void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
+
+		free_bootmem(__pa(ptr + pcpue_size),
+			     pcpue_unit_size - pcpue_size);
+		memcpy(ptr, __per_cpu_load, static_size);
+	}
 
 	/* we're ready, commit */
 	pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
-		pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size);
+		pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
 
 	return pcpu_setup_first_chunk(pcpue_get_page, static_size,
-				      pcpue_unit_size,
-				      pcpue_unit_size - static_size, pcpue_ptr,
-				      NULL);
+				      pcpue_unit_size, dyn_size,
+				      pcpue_ptr, NULL);
 }
 
 /*
-- 
cgit v0.10.2


From 3e24aa58907c62bc79d1094e941a374568f62522 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2009 14:33:59 +0900
Subject: percpu: add an indirection ptr for chunk page map access

Impact: allow sharing page map, no functional difference yet

Make chunk->page access indirect by adding a pointer and renaming the
actual array to page_ar.  This will be used by future changes.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/mm/percpu.c b/mm/percpu.c
index a84cf99..5b47d9f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -80,7 +80,8 @@ struct pcpu_chunk {
 	int			map_alloc;	/* # of map entries allocated */
 	int			*map;		/* allocation map */
 	bool			immutable;	/* no [de]population allowed */
-	struct page		*page[];	/* #cpus * UNIT_PAGES */
+	struct page		**page;		/* points to page array */
+	struct page		*page_ar[];	/* #cpus * UNIT_PAGES */
 };
 
 static int pcpu_unit_pages __read_mostly;
@@ -696,6 +697,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
 				  PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
 	chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
 	chunk->map[chunk->map_used++] = pcpu_unit_size;
+	chunk->page = chunk->page_ar;
 
 	chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
 	if (!chunk->vm) {
@@ -918,6 +920,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	schunk->vm = &first_vm;
 	schunk->map = smap;
 	schunk->map_alloc = ARRAY_SIZE(smap);
+	schunk->page = schunk->page_ar;
 	schunk->free_size = dyn_size;
 	schunk->contig_hint = schunk->free_size;
 
-- 
cgit v0.10.2


From edcb463997ed7b2ffa3bac76e3e75957318f2e01 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2009 14:33:59 +0900
Subject: percpu, module: implement reserved allocation and use it for module
 percpu variables

Impact: add reserved allocation functionality and use it for module
	percpu variables

This patch implements reserved allocation from the first chunk.  When
setting up the first chunk, arch can ask to set aside certain number
of bytes right after the core static area which is available only
through a separate reserved allocator.  This will be used primarily
for module static percpu variables on architectures with limited
relocation range to ensure that the module perpcu symbols are inside
the relocatable range.

If reserved area is requested, the first chunk becomes reserved and
isn't available for regular allocation.  If the first chunk also
includes piggy-back dynamic allocation area, a separate chunk mapping
the same region is created to serve dynamic allocation.  The first one
is called static first chunk and the second dynamic first chunk.
Although they share the page map, their different area map
initializations guarantee they serve disjoint areas according to their
purposes.

If arch doesn't setup reserved area, reserved allocation is handled
like any other allocation.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 38e2b2a..dd4eabc 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -217,7 +217,7 @@ proceed:
 	pr_info("PERCPU: Remapped at %p with large pages, static data "
 		"%zu bytes\n", vm.addr, static_size);
 
-	ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE,
+	ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, 0, PMD_SIZE,
 				     pcpur_size - static_size, vm.addr, NULL);
 	goto out_free_ar;
 
@@ -297,7 +297,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size)
 	pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
 		pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
 
-	return pcpu_setup_first_chunk(pcpue_get_page, static_size,
+	return pcpu_setup_first_chunk(pcpue_get_page, static_size, 0,
 				      pcpue_unit_size, dyn_size,
 				      pcpue_ptr, NULL);
 }
@@ -356,8 +356,8 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
 	pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
 		pcpu4k_nr_static_pages, static_size);
 
-	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, -1, -1, NULL,
-				     pcpu4k_populate_pte);
+	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, -1, -1,
+				     NULL, pcpu4k_populate_pte);
 	goto out_free_ar;
 
 enomem:
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index a96fc53..8ff1515 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -117,10 +117,10 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
 typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
 
 extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
-					size_t static_size,
-					ssize_t unit_size, ssize_t dyn_size,
-					void *base_addr,
-					pcpu_populate_pte_fn_t populate_pte_fn);
+				size_t static_size, size_t reserved_size,
+				ssize_t unit_size, ssize_t dyn_size,
+				void *base_addr,
+				pcpu_populate_pte_fn_t populate_pte_fn);
 
 /*
  * Use this to get to a cpu's version of the per-cpu object
@@ -129,6 +129,8 @@ extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
  */
 #define per_cpu_ptr(ptr, cpu)	SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
 
+extern void *__alloc_reserved_percpu(size_t size, size_t align);
+
 #else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
 
 struct percpu_data {
diff --git a/kernel/module.c b/kernel/module.c
index 1f0657a..f0e04d6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -381,7 +381,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
 		align = PAGE_SIZE;
 	}
 
-	ptr = __alloc_percpu(size, align);
+	ptr = __alloc_reserved_percpu(size, align);
 	if (!ptr)
 		printk(KERN_WARNING
 		       "Could not allocate %lu bytes percpu data\n", size);
diff --git a/mm/percpu.c b/mm/percpu.c
index 5b47d9f..ef8e169 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -94,6 +94,11 @@ static size_t pcpu_chunk_struct_size __read_mostly;
 void *pcpu_base_addr __read_mostly;
 EXPORT_SYMBOL_GPL(pcpu_base_addr);
 
+/* optional reserved chunk, only accessible for reserved allocations */
+static struct pcpu_chunk *pcpu_reserved_chunk;
+/* offset limit of the reserved chunk */
+static int pcpu_reserved_chunk_limit;
+
 /*
  * One mutex to rule them all.
  *
@@ -201,13 +206,14 @@ static void *pcpu_realloc(void *p, size_t size, size_t new_size)
  *
  * This function is called after an allocation or free changed @chunk.
  * New slot according to the changed state is determined and @chunk is
- * moved to the slot.
+ * moved to the slot.  Note that the reserved chunk is never put on
+ * chunk slots.
  */
 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 {
 	int nslot = pcpu_chunk_slot(chunk);
 
-	if (oslot != nslot) {
+	if (chunk != pcpu_reserved_chunk && oslot != nslot) {
 		if (oslot < nslot)
 			list_move(&chunk->list, &pcpu_slot[nslot]);
 		else
@@ -255,6 +261,15 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 	struct rb_node *n, *parent;
 	struct pcpu_chunk *chunk;
 
+	/* is it in the reserved chunk? */
+	if (pcpu_reserved_chunk) {
+		void *start = pcpu_reserved_chunk->vm->addr;
+
+		if (addr >= start && addr < start + pcpu_reserved_chunk_limit)
+			return pcpu_reserved_chunk;
+	}
+
+	/* nah... search the regular ones */
 	n = *pcpu_chunk_rb_search(addr, &parent);
 	if (!n) {
 		/* no exactly matching chunk, the parent is the closest */
@@ -713,9 +728,10 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
 }
 
 /**
- * __alloc_percpu - allocate percpu area
+ * pcpu_alloc - the percpu allocator
  * @size: size of area to allocate in bytes
  * @align: alignment of area (max PAGE_SIZE)
+ * @reserved: allocate from the reserved chunk if available
  *
  * Allocate percpu area of @size bytes aligned at @align.  Might
  * sleep.  Might trigger writeouts.
@@ -723,7 +739,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
-void *__alloc_percpu(size_t size, size_t align)
+static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 {
 	void *ptr = NULL;
 	struct pcpu_chunk *chunk;
@@ -737,7 +753,18 @@ void *__alloc_percpu(size_t size, size_t align)
 
 	mutex_lock(&pcpu_mutex);
 
-	/* allocate area */
+	/* serve reserved allocations from the reserved chunk if available */
+	if (reserved && pcpu_reserved_chunk) {
+		chunk = pcpu_reserved_chunk;
+		if (size > chunk->contig_hint)
+			goto out_unlock;
+		off = pcpu_alloc_area(chunk, size, align);
+		if (off >= 0)
+			goto area_found;
+		goto out_unlock;
+	}
+
+	/* search through normal chunks */
 	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
 			if (size > chunk->contig_hint)
@@ -773,8 +800,41 @@ out_unlock:
 	mutex_unlock(&pcpu_mutex);
 	return ptr;
 }
+
+/**
+ * __alloc_percpu - allocate dynamic percpu area
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Allocate percpu area of @size bytes aligned at @align.  Might
+ * sleep.  Might trigger writeouts.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+void *__alloc_percpu(size_t size, size_t align)
+{
+	return pcpu_alloc(size, align, false);
+}
 EXPORT_SYMBOL_GPL(__alloc_percpu);
 
+/**
+ * __alloc_reserved_percpu - allocate reserved percpu area
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Allocate percpu area of @size bytes aligned at @align from reserved
+ * percpu area if arch has set it up; otherwise, allocation is served
+ * from the same dynamic area.  Might sleep.  Might trigger writeouts.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+void *__alloc_reserved_percpu(size_t size, size_t align)
+{
+	return pcpu_alloc(size, align, true);
+}
+
 static void pcpu_kill_chunk(struct pcpu_chunk *chunk)
 {
 	WARN_ON(chunk->immutable);
@@ -826,6 +886,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * pcpu_setup_first_chunk - initialize the first percpu chunk
  * @get_page_fn: callback to fetch page pointer
  * @static_size: the size of static percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes
  * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
  * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
  * @base_addr: mapped address, NULL for auto
@@ -844,14 +905,22 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * indicates end of pages for the cpu.  Note that @get_page_fn() must
  * return the same number of pages for all cpus.
  *
+ * @reserved_size, if non-zero, specifies the amount of bytes to
+ * reserve after the static area in the first chunk.  This reserves
+ * the first chunk such that it's available only through reserved
+ * percpu allocation.  This is primarily used to serve module percpu
+ * static areas on architectures where the addressing model has
+ * limited offset range for symbol relocations to guarantee module
+ * percpu symbols fall inside the relocatable range.
+ *
  * @unit_size, if non-negative, specifies unit size and must be
  * aligned to PAGE_SIZE and equal to or larger than @static_size +
- * @dyn_size.
+ * @reserved_size + @dyn_size.
  *
  * @dyn_size, if non-negative, limits the number of bytes available
  * for dynamic allocation in the first chunk.  Specifying non-negative
  * value make percpu leave alone the area beyond @static_size +
- * @dyn_size.
+ * @reserved_size + @dyn_size.
  *
  * Non-null @base_addr means that the caller already allocated virtual
  * region for the first chunk and mapped it.  percpu must not mess
@@ -861,28 +930,36 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * @populate_pte_fn is used to populate the pagetable.  NULL means the
  * caller already populated the pagetable.
  *
+ * If the first chunk ends up with both reserved and dynamic areas, it
+ * is served by two chunks - one to serve the core static and reserved
+ * areas and the other for the dynamic area.  They share the same vm
+ * and page map but uses different area allocation map to stay away
+ * from each other.  The latter chunk is circulated in the chunk slots
+ * and available for dynamic allocation like any other chunks.
+ *
  * RETURNS:
  * The determined pcpu_unit_size which can be used to initialize
  * percpu access.
  */
 size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
-				     size_t static_size,
+				     size_t static_size, size_t reserved_size,
 				     ssize_t unit_size, ssize_t dyn_size,
 				     void *base_addr,
 				     pcpu_populate_pte_fn_t populate_pte_fn)
 {
 	static struct vm_struct first_vm;
-	static int smap[2];
-	struct pcpu_chunk *schunk;
+	static int smap[2], dmap[2];
+	struct pcpu_chunk *schunk, *dchunk = NULL;
 	unsigned int cpu;
 	int nr_pages;
 	int err, i;
 
 	/* santiy checks */
-	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC);
+	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
+		     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
 	BUG_ON(!static_size);
 	if (unit_size >= 0) {
-		BUG_ON(unit_size < static_size +
+		BUG_ON(unit_size < static_size + reserved_size +
 				   (dyn_size >= 0 ? dyn_size : 0));
 		BUG_ON(unit_size & ~PAGE_MASK);
 	} else {
@@ -895,7 +972,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 		pcpu_unit_pages = unit_size >> PAGE_SHIFT;
 	else
 		pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
-					PFN_UP(static_size));
+					PFN_UP(static_size + reserved_size));
 
 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
@@ -903,7 +980,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 		+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
 
 	if (dyn_size < 0)
-		dyn_size = pcpu_unit_size - static_size;
+		dyn_size = pcpu_unit_size - static_size - reserved_size;
 
 	/*
 	 * Allocate chunk slots.  The additional last slot is for
@@ -914,20 +991,49 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	for (i = 0; i < pcpu_nr_slots; i++)
 		INIT_LIST_HEAD(&pcpu_slot[i]);
 
-	/* init static chunk */
+	/*
+	 * Initialize static chunk.  If reserved_size is zero, the
+	 * static chunk covers static area + dynamic allocation area
+	 * in the first chunk.  If reserved_size is not zero, it
+	 * covers static area + reserved area (mostly used for module
+	 * static percpu allocation).
+	 */
 	schunk = alloc_bootmem(pcpu_chunk_struct_size);
 	INIT_LIST_HEAD(&schunk->list);
 	schunk->vm = &first_vm;
 	schunk->map = smap;
 	schunk->map_alloc = ARRAY_SIZE(smap);
 	schunk->page = schunk->page_ar;
-	schunk->free_size = dyn_size;
+
+	if (reserved_size) {
+		schunk->free_size = reserved_size;
+		pcpu_reserved_chunk = schunk;	/* not for dynamic alloc */
+	} else {
+		schunk->free_size = dyn_size;
+		dyn_size = 0;			/* dynamic area covered */
+	}
 	schunk->contig_hint = schunk->free_size;
 
 	schunk->map[schunk->map_used++] = -static_size;
 	if (schunk->free_size)
 		schunk->map[schunk->map_used++] = schunk->free_size;
 
+	pcpu_reserved_chunk_limit = static_size + schunk->free_size;
+
+	/* init dynamic chunk if necessary */
+	if (dyn_size) {
+		dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
+		INIT_LIST_HEAD(&dchunk->list);
+		dchunk->vm = &first_vm;
+		dchunk->map = dmap;
+		dchunk->map_alloc = ARRAY_SIZE(dmap);
+		dchunk->page = schunk->page_ar;	/* share page map with schunk */
+
+		dchunk->contig_hint = dchunk->free_size = dyn_size;
+		dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
+		dchunk->map[dchunk->map_used++] = dchunk->free_size;
+	}
+
 	/* allocate vm address */
 	first_vm.flags = VM_ALLOC;
 	first_vm.size = pcpu_chunk_size;
@@ -937,12 +1043,14 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	else {
 		/*
 		 * Pages already mapped.  No need to remap into
-		 * vmalloc area.  In this case the static chunk can't
-		 * be mapped or unmapped by percpu and is marked
+		 * vmalloc area.  In this case the first chunks can't
+		 * be mapped or unmapped by percpu and are marked
 		 * immutable.
 		 */
 		first_vm.addr = base_addr;
 		schunk->immutable = true;
+		if (dchunk)
+			dchunk->immutable = true;
 	}
 
 	/* assign pages */
@@ -978,8 +1086,13 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	}
 
 	/* link the first chunk in */
-	pcpu_chunk_relocate(schunk, -1);
-	pcpu_chunk_addr_insert(schunk);
+	if (!dchunk) {
+		pcpu_chunk_relocate(schunk, -1);
+		pcpu_chunk_addr_insert(schunk);
+	} else {
+		pcpu_chunk_relocate(dchunk, -1);
+		pcpu_chunk_addr_insert(dchunk);
+	}
 
 	/* we're done */
 	pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
-- 
cgit v0.10.2


From 6b19b0c2400437a3c10059ede0e59b517092e1bd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2009 14:33:59 +0900
Subject: x86, percpu: setup reserved percpu area for x86_64

Impact: fix relocation overflow during module load

x86_64 uses 32bit relocations for symbol access and static percpu
symbols whether in core or modules must be inside 2GB of the percpu
segement base which the dynamic percpu allocator doesn't guarantee.
This patch makes x86_64 reserve PERCPU_MODULE_RESERVE bytes in the
first chunk so that module percpu areas are always allocated from the
first chunk which is always inside the relocatable range.

This problem exists for any percpu allocator but is easily triggered
when using the embedding allocator because the second chunk is located
beyond 2GB on it.

This patch also changes the meaning of PERCPU_DYNAMIC_RESERVE such
that it only indicates the size of the area to reserve for dynamic
allocation as static and dynamic areas can be separate.  New
PERCPU_DYNAMIC_RESERVED is increased by 4k for both 32 and 64bits as
the reserved area separation eats away some allocatable space and
having slightly more headroom (currently between 4 and 8k after
minimal boot sans module area) makes sense for common case
performance.

x86_32 can address anywhere from anywhere and doesn't need reserving.

Mike Galbraith first reported the problem first and bisected it to the
embedding percpu allocator commit.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Mike Galbraith <efault@gmx.de>
Reported-by: Jaswinder Singh Rajput <jaswinder@kernel.org>

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index dd4eabc..efa615f 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -42,6 +42,19 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
 };
 EXPORT_SYMBOL(__per_cpu_offset);
 
+/*
+ * On x86_64 symbols referenced from code should be reachable using
+ * 32bit relocations.  Reserve space for static percpu variables in
+ * modules so that they are always served from the first chunk which
+ * is located at the percpu segment base.  On x86_32, anything can
+ * address anywhere.  No need to reserve space in the first chunk.
+ */
+#ifdef CONFIG_X86_64
+#define PERCPU_FIRST_CHUNK_RESERVE	PERCPU_MODULE_RESERVE
+#else
+#define PERCPU_FIRST_CHUNK_RESERVE	0
+#endif
+
 /**
  * pcpu_need_numa - determine percpu allocation needs to consider NUMA
  *
@@ -141,7 +154,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
 {
 	static struct vm_struct vm;
 	pg_data_t *last;
-	size_t ptrs_size;
+	size_t ptrs_size, dyn_size;
 	unsigned int cpu;
 	ssize_t ret;
 
@@ -169,12 +182,14 @@ proceed:
 	 * Currently supports only single page.  Supporting multiple
 	 * pages won't be too difficult if it ever becomes necessary.
 	 */
-	pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
+	pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
+			       PERCPU_DYNAMIC_RESERVE);
 	if (pcpur_size > PMD_SIZE) {
 		pr_warning("PERCPU: static data is larger than large page, "
 			   "can't use large page\n");
 		return -EINVAL;
 	}
+	dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
 
 	/* allocate pointer array and alloc large pages */
 	ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
@@ -217,8 +232,9 @@ proceed:
 	pr_info("PERCPU: Remapped at %p with large pages, static data "
 		"%zu bytes\n", vm.addr, static_size);
 
-	ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, 0, PMD_SIZE,
-				     pcpur_size - static_size, vm.addr, NULL);
+	ret = pcpu_setup_first_chunk(pcpur_get_page, static_size,
+				     PERCPU_FIRST_CHUNK_RESERVE,
+				     PMD_SIZE, dyn_size, vm.addr, NULL);
 	goto out_free_ar;
 
 enomem:
@@ -276,9 +292,10 @@ static ssize_t __init setup_pcpu_embed(size_t static_size)
 		return -EINVAL;
 
 	/* allocate and copy */
-	pcpue_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
+	pcpue_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
+			       PERCPU_DYNAMIC_RESERVE);
 	pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
-	dyn_size = pcpue_size - static_size;
+	dyn_size = pcpue_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
 
 	pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
 				       PAGE_SIZE);
@@ -297,7 +314,8 @@ static ssize_t __init setup_pcpu_embed(size_t static_size)
 	pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
 		pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
 
-	return pcpu_setup_first_chunk(pcpue_get_page, static_size, 0,
+	return pcpu_setup_first_chunk(pcpue_get_page, static_size,
+				      PERCPU_FIRST_CHUNK_RESERVE,
 				      pcpue_unit_size, dyn_size,
 				      pcpue_ptr, NULL);
 }
@@ -356,8 +374,9 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
 	pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
 		pcpu4k_nr_static_pages, static_size);
 
-	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, -1, -1,
-				     NULL, pcpu4k_populate_pte);
+	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
+				     PERCPU_FIRST_CHUNK_RESERVE, -1, -1, NULL,
+				     pcpu4k_populate_pte);
 	goto out_free_ar;
 
 enomem:
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 8ff1515..54a968b 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -85,31 +85,20 @@
 
 /*
  * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy
- * back on the first chunk if arch is manually allocating and mapping
- * it for faster access (as a part of large page mapping for example).
- * Note that dynamic percpu allocator covers both static and dynamic
- * areas, so these values are bigger than PERCPU_MODULE_RESERVE.
+ * back on the first chunk for dynamic percpu allocation if arch is
+ * manually allocating and mapping it for faster access (as a part of
+ * large page mapping for example).
  *
- * On typical configuration with modules, the following values leave
- * about 8k of free space on the first chunk after boot on both x86_32
- * and 64 when module support is enabled.  When module support is
- * disabled, it's much tighter.
+ * The following values give between one and two pages of free space
+ * after typical minimal boot (2-way SMP, single disk and NIC) with
+ * both defconfig and a distro config on x86_64 and 32.  More
+ * intelligent way to determine this would be nice.
  */
-#ifndef PERCPU_DYNAMIC_RESERVE
-#  if BITS_PER_LONG > 32
-#    ifdef CONFIG_MODULES
-#      define PERCPU_DYNAMIC_RESERVE	(24 << 10)
-#    else
-#      define PERCPU_DYNAMIC_RESERVE	(16 << 10)
-#    endif
-#  else
-#    ifdef CONFIG_MODULES
-#      define PERCPU_DYNAMIC_RESERVE	(16 << 10)
-#    else
-#      define PERCPU_DYNAMIC_RESERVE	(8 << 10)
-#    endif
-#  endif
-#endif	/* PERCPU_DYNAMIC_RESERVE */
+#if BITS_PER_LONG > 32
+#define PERCPU_DYNAMIC_RESERVE		(20 << 10)
+#else
+#define PERCPU_DYNAMIC_RESERVE		(12 << 10)
+#endif
 
 extern void *pcpu_base_addr;
 
-- 
cgit v0.10.2


From d1a8e7792047f7dca7eb5759250e2c12800bf262 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 6 Mar 2009 03:12:50 -0800
Subject: x86: make "memtest" like "memtest=17"

Impact: make boot command line "memtest" do one loop by default

So don't need to guess many patterns in one loop.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49B10532.3020105@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 0bcd788..605c8be 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -100,6 +100,9 @@ static int __init parse_memtest(char *arg)
 {
 	if (arg)
 		memtest_pattern = simple_strtoul(arg, NULL, 0);
+	else
+		memtest_pattern = ARRAY_SIZE(patterns);
+
 	return 0;
 }
 
-- 
cgit v0.10.2


From c77a3b59c624454c501cbfa1a3611d5a00bf9532 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 5 Mar 2009 17:04:26 +0200
Subject: x86: fix uninitialized variable in init_memory_mapping()

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <1236265466.31324.9.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 6d63e3d..15219e0 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -134,8 +134,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 {
 	unsigned long page_size_mask = 0;
 	unsigned long start_pfn, end_pfn;
+	unsigned long ret = 0;
 	unsigned long pos;
-	unsigned long ret;
 
 	struct map_range mr[NR_RANGE_MR];
 	int nr_range, i;
-- 
cgit v0.10.2


From 5dd61dfabcaa5bfb67afb8a2d255bd1e156562e3 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 5 Mar 2009 17:04:57 +0200
Subject: x86: rename do_not_nx to disable_nx in mm/init_64.c

As a preparational step for unifying noexec handling on 32-bit and 64-bit,
rename the do_not_nx variable to disable_nx on 64-bit.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <1236265497.31324.11.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 8a853bc..54efa57d 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -85,7 +85,7 @@ early_param("gbpages", parse_direct_gbpages_on);
 pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
-static int do_not_nx __cpuinitdata;
+static int disable_nx __cpuinitdata;
 
 /*
  * noexec=on|off
@@ -100,9 +100,9 @@ static int __init nonx_setup(char *str)
 		return -EINVAL;
 	if (!strncmp(str, "on", 2)) {
 		__supported_pte_mask |= _PAGE_NX;
-		do_not_nx = 0;
+		disable_nx = 0;
 	} else if (!strncmp(str, "off", 3)) {
-		do_not_nx = 1;
+		disable_nx = 1;
 		__supported_pte_mask &= ~_PAGE_NX;
 	}
 	return 0;
@@ -114,7 +114,7 @@ void __cpuinit check_efer(void)
 	unsigned long efer;
 
 	rdmsrl(MSR_EFER, efer);
-	if (!(efer & EFER_NX) || do_not_nx)
+	if (!(efer & EFER_NX) || disable_nx)
 		__supported_pte_mask &= ~_PAGE_NX;
 }
 
-- 
cgit v0.10.2


From 1880d93b80acc3171850e9df5048bcb26b75c2f5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2009 00:44:09 +0900
Subject: percpu: replace pcpu_realloc() with pcpu_mem_alloc() and
 pcpu_mem_free()

Impact: code reorganization for later changes

With static map handling moved to pcpu_split_block(), pcpu_realloc()
only clutters the code and it's also unsuitable for scheduled locking
changes.  Implement and use pcpu_mem_alloc/free() instead.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/mm/percpu.c b/mm/percpu.c
index ef8e169..f1d0e90 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -164,39 +164,41 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
 }
 
 /**
- * pcpu_realloc - versatile realloc
- * @p: the current pointer (can be NULL for new allocations)
- * @size: the current size in bytes (can be 0 for new allocations)
- * @new_size: the wanted new size in bytes (can be 0 for free)
+ * pcpu_mem_alloc - allocate memory
+ * @size: bytes to allocate
  *
- * More robust realloc which can be used to allocate, resize or free a
- * memory area of arbitrary size.  If the needed size goes over
- * PAGE_SIZE, kernel VM is used.
+ * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
+ * kzalloc() is used; otherwise, vmalloc() is used.  The returned
+ * memory is always zeroed.
  *
  * RETURNS:
- * The new pointer on success, NULL on failure.
+ * Pointer to the allocated area on success, NULL on failure.
  */
-static void *pcpu_realloc(void *p, size_t size, size_t new_size)
+static void *pcpu_mem_alloc(size_t size)
 {
-	void *new;
-
-	if (new_size <= PAGE_SIZE)
-		new = kmalloc(new_size, GFP_KERNEL);
-	else
-		new = vmalloc(new_size);
-	if (new_size && !new)
-		return NULL;
-
-	memcpy(new, p, min(size, new_size));
-	if (new_size > size)
-		memset(new + size, 0, new_size - size);
+	if (size <= PAGE_SIZE)
+		return kzalloc(size, GFP_KERNEL);
+	else {
+		void *ptr = vmalloc(size);
+		if (ptr)
+			memset(ptr, 0, size);
+		return ptr;
+	}
+}
 
+/**
+ * pcpu_mem_free - free memory
+ * @ptr: memory to free
+ * @size: size of the area
+ *
+ * Free @ptr.  @ptr should have been allocated using pcpu_mem_alloc().
+ */
+static void pcpu_mem_free(void *ptr, size_t size)
+{
 	if (size <= PAGE_SIZE)
-		kfree(p);
+		kfree(ptr);
 	else
-		vfree(p);
-
-	return new;
+		vfree(ptr);
 }
 
 /**
@@ -331,29 +333,27 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
 	if (chunk->map_alloc < target) {
 		int new_alloc;
 		int *new;
+		size_t size;
 
 		new_alloc = PCPU_DFL_MAP_ALLOC;
 		while (new_alloc < target)
 			new_alloc *= 2;
 
-		if (chunk->map_alloc < PCPU_DFL_MAP_ALLOC) {
-			/*
-			 * map_alloc smaller than the default size
-			 * indicates that the chunk is one of the
-			 * first chunks and still using static map.
-			 * Allocate a dynamic one and copy.
-			 */
-			new = pcpu_realloc(NULL, 0, new_alloc * sizeof(new[0]));
-			if (new)
-				memcpy(new, chunk->map,
-				       chunk->map_alloc * sizeof(new[0]));
-		} else
-			new = pcpu_realloc(chunk->map,
-					   chunk->map_alloc * sizeof(new[0]),
-					   new_alloc * sizeof(new[0]));
+		new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
 		if (!new)
 			return -ENOMEM;
 
+		size = chunk->map_alloc * sizeof(chunk->map[0]);
+		memcpy(new, chunk->map, size);
+
+		/*
+		 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the
+		 * chunk is one of the first chunks and still using
+		 * static map.
+		 */
+		if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
+			pcpu_mem_free(chunk->map, size);
+
 		chunk->map_alloc = new_alloc;
 		chunk->map = new;
 	}
@@ -696,7 +696,7 @@ static void free_pcpu_chunk(struct pcpu_chunk *chunk)
 		return;
 	if (chunk->vm)
 		free_vm_area(chunk->vm);
-	pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0);
+	pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
 	kfree(chunk);
 }
 
@@ -708,8 +708,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
 	if (!chunk)
 		return NULL;
 
-	chunk->map = pcpu_realloc(NULL, 0,
-				  PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
+	chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
 	chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
 	chunk->map[chunk->map_used++] = pcpu_unit_size;
 	chunk->page = chunk->page_ar;
-- 
cgit v0.10.2


From 9f7dcf224bd09ec9ebcbfb383bf2c465e0e0b03d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2009 00:44:09 +0900
Subject: percpu: move chunk area map extension out of area allocation

Impact: code reorganization for later changes

Separate out chunk area map extension into a separate function -
pcpu_extend_area_map() - and call it directly from pcpu_alloc() such
that pcpu_alloc_area() is guaranteed to have enough area map slots on
invocation.

With this change, pcpu_alloc_area() does only area allocation and the
only failure mode is when the chunk doens't have enough room, so
there's no need to distinguish it from memory allocation failures.
Make it return -1 on such cases instead of hacky -ENOSPC.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/mm/percpu.c b/mm/percpu.c
index f1d0e90..7d9bc35 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -307,6 +307,50 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
 }
 
 /**
+ * pcpu_extend_area_map - extend area map for allocation
+ * @chunk: target chunk
+ *
+ * Extend area map of @chunk so that it can accomodate an allocation.
+ * A single allocation can split an area into three areas, so this
+ * function makes sure that @chunk->map has at least two extra slots.
+ *
+ * RETURNS:
+ * 0 if noop, 1 if successfully extended, -errno on failure.
+ */
+static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
+{
+	int new_alloc;
+	int *new;
+	size_t size;
+
+	/* has enough? */
+	if (chunk->map_alloc >= chunk->map_used + 2)
+		return 0;
+
+	new_alloc = PCPU_DFL_MAP_ALLOC;
+	while (new_alloc < chunk->map_used + 2)
+		new_alloc *= 2;
+
+	new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
+	if (!new)
+		return -ENOMEM;
+
+	size = chunk->map_alloc * sizeof(chunk->map[0]);
+	memcpy(new, chunk->map, size);
+
+	/*
+	 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
+	 * one of the first chunks and still using static map.
+	 */
+	if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
+		pcpu_mem_free(chunk->map, size);
+
+	chunk->map_alloc = new_alloc;
+	chunk->map = new;
+	return 0;
+}
+
+/**
  * pcpu_split_block - split a map block
  * @chunk: chunk of interest
  * @i: index of map block to split
@@ -321,44 +365,16 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
  * depending on @head, is reduced by @tail bytes and @tail byte block
  * is inserted after the target block.
  *
- * RETURNS:
- * 0 on success, -errno on failure.
+ * @chunk->map must have enough free slots to accomodate the split.
  */
-static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
+static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
+			     int head, int tail)
 {
 	int nr_extra = !!head + !!tail;
-	int target = chunk->map_used + nr_extra;
-
-	/* reallocation required? */
-	if (chunk->map_alloc < target) {
-		int new_alloc;
-		int *new;
-		size_t size;
-
-		new_alloc = PCPU_DFL_MAP_ALLOC;
-		while (new_alloc < target)
-			new_alloc *= 2;
-
-		new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
-		if (!new)
-			return -ENOMEM;
-
-		size = chunk->map_alloc * sizeof(chunk->map[0]);
-		memcpy(new, chunk->map, size);
-
-		/*
-		 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the
-		 * chunk is one of the first chunks and still using
-		 * static map.
-		 */
-		if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
-			pcpu_mem_free(chunk->map, size);
 
-		chunk->map_alloc = new_alloc;
-		chunk->map = new;
-	}
+	BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra);
 
-	/* insert a new subblock */
+	/* insert new subblocks */
 	memmove(&chunk->map[i + nr_extra], &chunk->map[i],
 		sizeof(chunk->map[0]) * (chunk->map_used - i));
 	chunk->map_used += nr_extra;
@@ -371,7 +387,6 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
 		chunk->map[i++] -= tail;
 		chunk->map[i] = tail;
 	}
-	return 0;
 }
 
 /**
@@ -384,8 +399,11 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
  * Note that this function only allocates the offset.  It doesn't
  * populate or map the area.
  *
+ * @chunk->map must have at least two free slots.
+ *
  * RETURNS:
- * Allocated offset in @chunk on success, -errno on failure.
+ * Allocated offset in @chunk on success, -1 if no matching area is
+ * found.
  */
 static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
 {
@@ -433,8 +451,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
 
 		/* split if warranted */
 		if (head || tail) {
-			if (pcpu_split_block(chunk, i, head, tail))
-				return -ENOMEM;
+			pcpu_split_block(chunk, i, head, tail);
 			if (head) {
 				i++;
 				off += head;
@@ -461,14 +478,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
 	chunk->contig_hint = max_contig;	/* fully scanned */
 	pcpu_chunk_relocate(chunk, oslot);
 
-	/*
-	 * Tell the upper layer that this chunk has no area left.
-	 * Note that this is not an error condition but a notification
-	 * to upper layer that it needs to look at other chunks.
-	 * -ENOSPC is chosen as it isn't used in memory subsystem and
-	 * matches the meaning in a way.
-	 */
-	return -ENOSPC;
+	/* tell the upper layer that this chunk has no matching area */
+	return -1;
 }
 
 /**
@@ -755,7 +766,8 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 	/* serve reserved allocations from the reserved chunk if available */
 	if (reserved && pcpu_reserved_chunk) {
 		chunk = pcpu_reserved_chunk;
-		if (size > chunk->contig_hint)
+		if (size > chunk->contig_hint ||
+		    pcpu_extend_area_map(chunk) < 0)
 			goto out_unlock;
 		off = pcpu_alloc_area(chunk, size, align);
 		if (off >= 0)
@@ -768,11 +780,11 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
 			if (size > chunk->contig_hint)
 				continue;
+			if (pcpu_extend_area_map(chunk) < 0)
+				goto out_unlock;
 			off = pcpu_alloc_area(chunk, size, align);
 			if (off >= 0)
 				goto area_found;
-			if (off != -ENOSPC)
-				goto out_unlock;
 		}
 	}
 
-- 
cgit v0.10.2


From a56dbddf06b653ef9c04ca3767f260fd31ccebab Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2009 00:44:11 +0900
Subject: percpu: move fully free chunk reclamation into a work

Impact: code reorganization for later changes

Do fully free chunk reclamation using a work.  This change is to
prepare for locking changes.

Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/mm/percpu.c b/mm/percpu.c
index 7d9bc35..4c8a419 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -63,6 +63,7 @@
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
+#include <linux/workqueue.h>
 
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
@@ -118,6 +119,10 @@ static DEFINE_MUTEX(pcpu_mutex);
 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 static struct rb_root pcpu_addr_root = RB_ROOT;	/* chunks by address */
 
+/* reclaim work to release fully free chunks, scheduled from free path */
+static void pcpu_reclaim(struct work_struct *work);
+static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);
+
 static int __pcpu_size_to_slot(int size)
 {
 	int highbit = fls(size);	/* size is in bytes */
@@ -846,13 +851,37 @@ void *__alloc_reserved_percpu(size_t size, size_t align)
 	return pcpu_alloc(size, align, true);
 }
 
-static void pcpu_kill_chunk(struct pcpu_chunk *chunk)
+/**
+ * pcpu_reclaim - reclaim fully free chunks, workqueue function
+ * @work: unused
+ *
+ * Reclaim all fully free chunks except for the first one.
+ */
+static void pcpu_reclaim(struct work_struct *work)
 {
-	WARN_ON(chunk->immutable);
-	pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
-	list_del(&chunk->list);
-	rb_erase(&chunk->rb_node, &pcpu_addr_root);
-	free_pcpu_chunk(chunk);
+	LIST_HEAD(todo);
+	struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
+	struct pcpu_chunk *chunk, *next;
+
+	mutex_lock(&pcpu_mutex);
+
+	list_for_each_entry_safe(chunk, next, head, list) {
+		WARN_ON(chunk->immutable);
+
+		/* spare the first one */
+		if (chunk == list_first_entry(head, struct pcpu_chunk, list))
+			continue;
+
+		rb_erase(&chunk->rb_node, &pcpu_addr_root);
+		list_move(&chunk->list, &todo);
+	}
+
+	mutex_unlock(&pcpu_mutex);
+
+	list_for_each_entry_safe(chunk, next, &todo, list) {
+		pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
+		free_pcpu_chunk(chunk);
+	}
 }
 
 /**
@@ -877,14 +906,13 @@ void free_percpu(void *ptr)
 
 	pcpu_free_area(chunk, off);
 
-	/* the chunk became fully free, kill one if there are other free ones */
+	/* if there are more than one fully free chunks, wake up grim reaper */
 	if (chunk->free_size == pcpu_unit_size) {
 		struct pcpu_chunk *pos;
 
-		list_for_each_entry(pos,
-				    &pcpu_slot[pcpu_chunk_slot(chunk)], list)
+		list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
 			if (pos != chunk) {
-				pcpu_kill_chunk(pos);
+				schedule_work(&pcpu_reclaim_work);
 				break;
 			}
 	}
-- 
cgit v0.10.2


From 7ab152470e8416ef2a44c800fdc157e2192f2974 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Fri, 6 Mar 2009 19:08:34 +0300
Subject: x86: linkage.h - guard assembler specifics by __ASSEMBLY__

Stephen Rothwell reported:

|Today's linux-next build (x86_64 allmodconfig) produced this warning:
|
|In file included from drivers/char/epca.c:49:
|drivers/char/digiFep1.h:7:1: warning: "GLOBAL" redefined
|In file included from include/linux/linkage.h:5,
|                 from include/linux/kernel.h:11,
|                 from arch/x86/include/asm/system.h:10,
|                 from arch/x86/include/asm/processor.h:17,
|                 from include/linux/prefetch.h:14,
|                 from include/linux/list.h:6,
|                 from include/linux/module.h:9,
|                 from drivers/char/epca.c:29:
|arch/x86/include/asm/linkage.h:55:1: warning: this is the location of the previous definition
|
|Probably introduced by commit 95695547a7db44b88a7ee36cf5df188de267e99e
|("x86: asm linkage - introduce GLOBAL macro") from the x86 tree.

Any assembler specific snippets being placed in headers
are to be protected by __ASSEMBLY__. Fixed.

Also move __ALIGN definition under the same protection as well.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <20090306160833.GB7420@localhost>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 9320e2a..a0d70b4 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -4,11 +4,6 @@
 #undef notrace
 #define notrace __attribute__((no_instrument_function))
 
-#ifdef CONFIG_X86_64
-#define __ALIGN .p2align 4,,15
-#define __ALIGN_STR ".p2align 4,,15"
-#endif
-
 #ifdef CONFIG_X86_32
 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
 /*
@@ -50,16 +45,25 @@
 	__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
 			      "g" (arg4), "g" (arg5), "g" (arg6))
 
-#endif
+#endif /* CONFIG_X86_32 */
+
+#ifdef __ASSEMBLY__
 
 #define GLOBAL(name)	\
 	.globl name;	\
 	name:
 
+#ifdef CONFIG_X86_64
+#define __ALIGN .p2align 4,,15
+#define __ALIGN_STR ".p2align 4,,15"
+#endif
+
 #ifdef CONFIG_X86_ALIGNMENT_16
 #define __ALIGN .align 16,0x90
 #define __ALIGN_STR ".align 16,0x90"
 #endif
 
+#endif /* __ASSEMBLY__ */
+
 #endif /* _ASM_X86_LINKAGE_H */
 
-- 
cgit v0.10.2


From ccea34b5d0fbab081496d1860f31acee99fa8a6d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2009 00:44:13 +0900
Subject: percpu: finer grained locking to break deadlock and allow atomic free

Impact: fix deadlock and allow atomic free

Percpu allocation always uses GFP_KERNEL and whole alloc/free paths
were protected by single mutex.  All percpu allocations have been from
GFP_KERNEL-safe context and the original allocator had this assumption
too.  However, by protecting both alloc and free paths with the same
mutex, the new allocator creates free -> alloc -> GFP_KERNEL
dependency which the original allocator didn't have.  This can lead to
deadlock if free is called from FS or IO paths.  Also, in general,
allocators are expected to allow free to be called from atomic
context.

This patch implements finer grained locking to break the deadlock and
allow atomic free.  For details, please read the "Synchronization
rules" comment.

While at it, also add CONTEXT: to function comments to describe which
context they expect to be called from and what they do to it.

This problem was reported by Thomas Gleixner and Peter Zijlstra.

  http://thread.gmane.org/gmane.linux.kernel/802384

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Thomas Gleixner <tglx@linutronix.de>
Reported-by: Peter Zijlstra <peterz@infradead.org>

diff --git a/mm/percpu.c b/mm/percpu.c
index 4c8a419..bfe6a3a 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -62,6 +62,7 @@
 #include <linux/pfn.h>
 #include <linux/rbtree.h>
 #include <linux/slab.h>
+#include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 
@@ -101,20 +102,28 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
 static int pcpu_reserved_chunk_limit;
 
 /*
- * One mutex to rule them all.
- *
- * The following mutex is grabbed in the outermost public alloc/free
- * interface functions and released only when the operation is
- * complete.  As such, every function in this file other than the
- * outermost functions are called under pcpu_mutex.
- *
- * It can easily be switched to use spinlock such that only the area
- * allocation and page population commit are protected with it doing
- * actual [de]allocation without holding any lock.  However, given
- * what this allocator does, I think it's better to let them run
- * sequentially.
+ * Synchronization rules.
+ *
+ * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
+ * protects allocation/reclaim paths, chunks and chunk->page arrays.
+ * The latter is a spinlock and protects the index data structures -
+ * chunk slots, rbtree, chunks and area maps in chunks.
+ *
+ * During allocation, pcpu_alloc_mutex is kept locked all the time and
+ * pcpu_lock is grabbed and released as necessary.  All actual memory
+ * allocations are done using GFP_KERNEL with pcpu_lock released.
+ *
+ * Free path accesses and alters only the index data structures, so it
+ * can be safely called from atomic context.  When memory needs to be
+ * returned to the system, free path schedules reclaim_work which
+ * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
+ * reclaimed, release both locks and frees the chunks.  Note that it's
+ * necessary to grab both locks to remove a chunk from circulation as
+ * allocation path might be referencing the chunk with only
+ * pcpu_alloc_mutex locked.
  */
-static DEFINE_MUTEX(pcpu_mutex);
+static DEFINE_MUTEX(pcpu_alloc_mutex);	/* protects whole alloc and reclaim */
+static DEFINE_SPINLOCK(pcpu_lock);	/* protects index data structures */
 
 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 static struct rb_root pcpu_addr_root = RB_ROOT;	/* chunks by address */
@@ -176,6 +185,9 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
  * kzalloc() is used; otherwise, vmalloc() is used.  The returned
  * memory is always zeroed.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Pointer to the allocated area on success, NULL on failure.
  */
@@ -215,6 +227,9 @@ static void pcpu_mem_free(void *ptr, size_t size)
  * New slot according to the changed state is determined and @chunk is
  * moved to the slot.  Note that the reserved chunk is never put on
  * chunk slots.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 {
@@ -260,6 +275,9 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr,
  * searchs for the chunk with the highest start address which isn't
  * beyond @addr.
  *
+ * CONTEXT:
+ * pcpu_lock.
+ *
  * RETURNS:
  * The address of the found chunk.
  */
@@ -300,6 +318,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
  * @new: chunk to insert
  *
  * Insert @new into address rb tree.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
 {
@@ -319,6 +340,10 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
  * A single allocation can split an area into three areas, so this
  * function makes sure that @chunk->map has at least two extra slots.
  *
+ * CONTEXT:
+ * pcpu_alloc_mutex, pcpu_lock.  pcpu_lock is released and reacquired
+ * if area map is extended.
+ *
  * RETURNS:
  * 0 if noop, 1 if successfully extended, -errno on failure.
  */
@@ -332,13 +357,25 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
 	if (chunk->map_alloc >= chunk->map_used + 2)
 		return 0;
 
+	spin_unlock_irq(&pcpu_lock);
+
 	new_alloc = PCPU_DFL_MAP_ALLOC;
 	while (new_alloc < chunk->map_used + 2)
 		new_alloc *= 2;
 
 	new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
-	if (!new)
+	if (!new) {
+		spin_lock_irq(&pcpu_lock);
 		return -ENOMEM;
+	}
+
+	/*
+	 * Acquire pcpu_lock and switch to new area map.  Only free
+	 * could have happened inbetween, so map_used couldn't have
+	 * grown.
+	 */
+	spin_lock_irq(&pcpu_lock);
+	BUG_ON(new_alloc < chunk->map_used + 2);
 
 	size = chunk->map_alloc * sizeof(chunk->map[0]);
 	memcpy(new, chunk->map, size);
@@ -371,6 +408,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
  * is inserted after the target block.
  *
  * @chunk->map must have enough free slots to accomodate the split.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
 			     int head, int tail)
@@ -406,6 +446,9 @@ static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
  *
  * @chunk->map must have at least two free slots.
  *
+ * CONTEXT:
+ * pcpu_lock.
+ *
  * RETURNS:
  * Allocated offset in @chunk on success, -1 if no matching area is
  * found.
@@ -495,6 +538,9 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
  * Free area starting from @freeme to @chunk.  Note that this function
  * only modifies the allocation map.  It doesn't depopulate or unmap
  * the area.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
 {
@@ -580,6 +626,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
  * For each cpu, depopulate and unmap pages [@page_start,@page_end)
  * from @chunk.  If @flush is true, vcache is flushed before unmapping
  * and tlb after.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
  */
 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
 				  bool flush)
@@ -658,6 +707,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
  *
  * For each cpu, populate and map pages [@page_start,@page_end) into
  * @chunk.  The area is cleared on return.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex, does GFP_KERNEL allocation.
  */
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 {
@@ -748,15 +800,16 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
  * @align: alignment of area (max PAGE_SIZE)
  * @reserved: allocate from the reserved chunk if available
  *
- * Allocate percpu area of @size bytes aligned at @align.  Might
- * sleep.  Might trigger writeouts.
+ * Allocate percpu area of @size bytes aligned at @align.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
  *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
 static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 {
-	void *ptr = NULL;
 	struct pcpu_chunk *chunk;
 	int slot, off;
 
@@ -766,27 +819,37 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 		return NULL;
 	}
 
-	mutex_lock(&pcpu_mutex);
+	mutex_lock(&pcpu_alloc_mutex);
+	spin_lock_irq(&pcpu_lock);
 
 	/* serve reserved allocations from the reserved chunk if available */
 	if (reserved && pcpu_reserved_chunk) {
 		chunk = pcpu_reserved_chunk;
 		if (size > chunk->contig_hint ||
 		    pcpu_extend_area_map(chunk) < 0)
-			goto out_unlock;
+			goto fail_unlock;
 		off = pcpu_alloc_area(chunk, size, align);
 		if (off >= 0)
 			goto area_found;
-		goto out_unlock;
+		goto fail_unlock;
 	}
 
+restart:
 	/* search through normal chunks */
 	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
 			if (size > chunk->contig_hint)
 				continue;
-			if (pcpu_extend_area_map(chunk) < 0)
-				goto out_unlock;
+
+			switch (pcpu_extend_area_map(chunk)) {
+			case 0:
+				break;
+			case 1:
+				goto restart;	/* pcpu_lock dropped, restart */
+			default:
+				goto fail_unlock;
+			}
+
 			off = pcpu_alloc_area(chunk, size, align);
 			if (off >= 0)
 				goto area_found;
@@ -794,27 +857,36 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 	}
 
 	/* hmmm... no space left, create a new chunk */
+	spin_unlock_irq(&pcpu_lock);
+
 	chunk = alloc_pcpu_chunk();
 	if (!chunk)
-		goto out_unlock;
+		goto fail_unlock_mutex;
+
+	spin_lock_irq(&pcpu_lock);
 	pcpu_chunk_relocate(chunk, -1);
 	pcpu_chunk_addr_insert(chunk);
-
-	off = pcpu_alloc_area(chunk, size, align);
-	if (off < 0)
-		goto out_unlock;
+	goto restart;
 
 area_found:
+	spin_unlock_irq(&pcpu_lock);
+
 	/* populate, map and clear the area */
 	if (pcpu_populate_chunk(chunk, off, size)) {
+		spin_lock_irq(&pcpu_lock);
 		pcpu_free_area(chunk, off);
-		goto out_unlock;
+		goto fail_unlock;
 	}
 
-	ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
-out_unlock:
-	mutex_unlock(&pcpu_mutex);
-	return ptr;
+	mutex_unlock(&pcpu_alloc_mutex);
+
+	return __addr_to_pcpu_ptr(chunk->vm->addr + off);
+
+fail_unlock:
+	spin_unlock_irq(&pcpu_lock);
+fail_unlock_mutex:
+	mutex_unlock(&pcpu_alloc_mutex);
+	return NULL;
 }
 
 /**
@@ -825,6 +897,9 @@ out_unlock:
  * Allocate percpu area of @size bytes aligned at @align.  Might
  * sleep.  Might trigger writeouts.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
@@ -843,6 +918,9 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
  * percpu area if arch has set it up; otherwise, allocation is served
  * from the same dynamic area.  Might sleep.  Might trigger writeouts.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
@@ -856,6 +934,9 @@ void *__alloc_reserved_percpu(size_t size, size_t align)
  * @work: unused
  *
  * Reclaim all fully free chunks except for the first one.
+ *
+ * CONTEXT:
+ * workqueue context.
  */
 static void pcpu_reclaim(struct work_struct *work)
 {
@@ -863,7 +944,8 @@ static void pcpu_reclaim(struct work_struct *work)
 	struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
 	struct pcpu_chunk *chunk, *next;
 
-	mutex_lock(&pcpu_mutex);
+	mutex_lock(&pcpu_alloc_mutex);
+	spin_lock_irq(&pcpu_lock);
 
 	list_for_each_entry_safe(chunk, next, head, list) {
 		WARN_ON(chunk->immutable);
@@ -876,7 +958,8 @@ static void pcpu_reclaim(struct work_struct *work)
 		list_move(&chunk->list, &todo);
 	}
 
-	mutex_unlock(&pcpu_mutex);
+	spin_unlock_irq(&pcpu_lock);
+	mutex_unlock(&pcpu_alloc_mutex);
 
 	list_for_each_entry_safe(chunk, next, &todo, list) {
 		pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
@@ -888,18 +971,22 @@ static void pcpu_reclaim(struct work_struct *work)
  * free_percpu - free percpu area
  * @ptr: pointer to area to free
  *
- * Free percpu area @ptr.  Might sleep.
+ * Free percpu area @ptr.
+ *
+ * CONTEXT:
+ * Can be called from atomic context.
  */
 void free_percpu(void *ptr)
 {
 	void *addr = __pcpu_ptr_to_addr(ptr);
 	struct pcpu_chunk *chunk;
+	unsigned long flags;
 	int off;
 
 	if (!ptr)
 		return;
 
-	mutex_lock(&pcpu_mutex);
+	spin_lock_irqsave(&pcpu_lock, flags);
 
 	chunk = pcpu_chunk_addr_search(addr);
 	off = addr - chunk->vm->addr;
@@ -917,7 +1004,7 @@ void free_percpu(void *ptr)
 			}
 	}
 
-	mutex_unlock(&pcpu_mutex);
+	spin_unlock_irqrestore(&pcpu_lock, flags);
 }
 EXPORT_SYMBOL_GPL(free_percpu);
 
-- 
cgit v0.10.2


From 3a450de1365d20afde406f0d9b2931a5e4a4fd6a Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Fri, 6 Mar 2009 17:30:56 -0600
Subject: x86: UV: remove uv_flush_tlb_others() WARN_ON

In uv_flush_tlb_others() (arch/x86/kernel/tlb_uv.c),
the "WARN_ON(!in_atomic())" fails if CONFIG_PREEMPT is not enabled.

And CONFIG_PREEMPT is not enabled by default in the distribution that
most UV owners will use.

We could #ifdef CONFIG_PREEMPT the warning, but that is not good form.
And there seems to be no suitable fix to in_atomic() when CONFIG_PREMPT
is not on.

As Ingo commented:

  > and we have no proper primitive to test for atomicity. (mainly
  > because we dont know about atomicity on a non-preempt kernel)

So we drop the WARN_ON.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index f04549a..d038b9c 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -314,8 +314,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 	int locals = 0;
 	struct bau_desc *bau_desc;
 
-	WARN_ON(!in_atomic());
-
 	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
 
 	uv_cpu = uv_blade_processor_id();
-- 
cgit v0.10.2


From 1f442d70c84aa798e243e721eba728a98434cd86 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 7 Mar 2009 23:46:26 -0800
Subject: x86: remove smp_apply_quirks()/smp_checks()

Impact: cleanup and code size reduction on 64-bit

This code is only applied to Intel Pentium and AMD K7 32-bit cpus.

Move those checks to intel_init()/amd_init() for 32-bit
so 64-bit will not build this code.

Also change to use cpu_index check to see if we need to emit warning.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49B377D2.8030108@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 25423a5..f47df59 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
+#include <asm/cpu.h>
 
 #ifdef CONFIG_X86_64
 # include <asm/numa_64.h>
@@ -141,6 +142,55 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
 	}
 }
 
+static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	/* calling is from identify_secondary_cpu() ? */
+	if (c->cpu_index == boot_cpu_id)
+		return;
+
+	/*
+	 * Certain Athlons might work (for various values of 'work') in SMP
+	 * but they are not certified as MP capable.
+	 */
+	/* Athlon 660/661 is valid. */
+	if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
+	    (c->x86_mask == 1)))
+		goto valid_k7;
+
+	/* Duron 670 is valid */
+	if ((c->x86_model == 7) && (c->x86_mask == 0))
+		goto valid_k7;
+
+	/*
+	 * Athlon 662, Duron 671, and Athlon >model 7 have capability
+	 * bit. It's worth noting that the A5 stepping (662) of some
+	 * Athlon XP's have the MP bit set.
+	 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
+	 * more.
+	 */
+	if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
+	    ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
+	     (c->x86_model > 7))
+		if (cpu_has_mp)
+			goto valid_k7;
+
+	/* If we get here, not a certified SMP capable AMD system. */
+
+	/*
+	 * Don't taint if we are running SMP kernel on a single non-MP
+	 * approved Athlon
+	 */
+	WARN_ONCE(1, "WARNING: This combination of AMD"
+		"processors is not suitable for SMP.\n");
+	if (!test_taint(TAINT_UNSAFE_SMP))
+		add_taint(TAINT_UNSAFE_SMP);
+
+valid_k7:
+	;
+#endif
+}
+
 static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
@@ -175,6 +225,8 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
 	}
 
 	set_cpu_cap(c, X86_FEATURE_K7);
+
+	amd_k7_smp_check(c);
 }
 #endif
 
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 25c559b..191117f 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -13,6 +13,7 @@
 #include <asm/uaccess.h>
 #include <asm/ds.h>
 #include <asm/bugs.h>
+#include <asm/cpu.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/topology.h>
@@ -110,6 +111,28 @@ static void __cpuinit trap_init_f00f_bug(void)
 }
 #endif
 
+static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	/* calling is from identify_secondary_cpu() ? */
+	if (c->cpu_index == boot_cpu_id)
+		return;
+
+	/*
+	 * Mask B, Pentium, but not Pentium MMX
+	 */
+	if (c->x86 == 5 &&
+	    c->x86_mask >= 1 && c->x86_mask <= 4 &&
+	    c->x86_model <= 3) {
+		/*
+		 * Remember we have B step Pentia with bugs
+		 */
+		WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
+				    "with B stepping processors.\n");
+	}
+#endif
+}
+
 static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
 {
 	unsigned long lo, hi;
@@ -186,6 +209,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
 #ifdef CONFIG_X86_NUMAQ
 	numaq_tsc_disable();
 #endif
+
+	intel_smp_check(c);
 }
 #else
 static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 249334f..ef7d101 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -114,10 +114,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
 
 atomic_t init_deasserted;
 
-
-/* Set if we find a B stepping CPU */
-static int __cpuinitdata smp_b_stepping;
-
 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
 
 /* which logical CPUs are on which nodes */
@@ -271,8 +267,6 @@ static void __cpuinit smp_callin(void)
 	cpumask_set_cpu(cpuid, cpu_callin_mask);
 }
 
-static int __cpuinitdata unsafe_smp;
-
 /*
  * Activate a secondary processor.
  */
@@ -340,76 +334,6 @@ notrace static void __cpuinit start_secondary(void *unused)
 	cpu_idle();
 }
 
-static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
-{
-	/*
-	 * Mask B, Pentium, but not Pentium MMX
-	 */
-	if (c->x86_vendor == X86_VENDOR_INTEL &&
-	    c->x86 == 5 &&
-	    c->x86_mask >= 1 && c->x86_mask <= 4 &&
-	    c->x86_model <= 3)
-		/*
-		 * Remember we have B step Pentia with bugs
-		 */
-		smp_b_stepping = 1;
-
-	/*
-	 * Certain Athlons might work (for various values of 'work') in SMP
-	 * but they are not certified as MP capable.
-	 */
-	if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
-
-		if (num_possible_cpus() == 1)
-			goto valid_k7;
-
-		/* Athlon 660/661 is valid. */
-		if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
-		    (c->x86_mask == 1)))
-			goto valid_k7;
-
-		/* Duron 670 is valid */
-		if ((c->x86_model == 7) && (c->x86_mask == 0))
-			goto valid_k7;
-
-		/*
-		 * Athlon 662, Duron 671, and Athlon >model 7 have capability
-		 * bit. It's worth noting that the A5 stepping (662) of some
-		 * Athlon XP's have the MP bit set.
-		 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
-		 * more.
-		 */
-		if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
-		    ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
-		     (c->x86_model > 7))
-			if (cpu_has_mp)
-				goto valid_k7;
-
-		/* If we get here, not a certified SMP capable AMD system. */
-		unsafe_smp = 1;
-	}
-
-valid_k7:
-	;
-}
-
-static void __cpuinit smp_checks(void)
-{
-	if (smp_b_stepping)
-		printk(KERN_WARNING "WARNING: SMP operation may be unreliable"
-				    "with B stepping processors.\n");
-
-	/*
-	 * Don't taint if we are running SMP kernel on a single non-MP
-	 * approved Athlon
-	 */
-	if (unsafe_smp && num_online_cpus() > 1) {
-		printk(KERN_INFO "WARNING: This combination of AMD"
-			"processors is not suitable for SMP.\n");
-		add_taint(TAINT_UNSAFE_SMP);
-	}
-}
-
 /*
  * The bootstrap kernel entry code has set these up. Save them for
  * a given CPU
@@ -423,7 +347,6 @@ void __cpuinit smp_store_cpu_info(int id)
 	c->cpu_index = id;
 	if (id != 0)
 		identify_secondary_cpu(c);
-	smp_apply_quirks(c);
 }
 
 
@@ -1193,7 +1116,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
 	pr_debug("Boot done.\n");
 
 	impress_friends();
-	smp_checks();
 #ifdef CONFIG_X86_IO_APIC
 	setup_ioapic_dest();
 #endif
-- 
cgit v0.10.2


From 8827247ffcc9e880cbe4705655065cf011265157 Mon Sep 17 00:00:00 2001
From: Wang Chen <wangchen@cn.fujitsu.com>
Date: Sat, 7 Mar 2009 13:34:19 +0800
Subject: x86: don't define __this_fixmap_does_not_exist()

Impact: improve out-of-range fixmap index debugging

Commit "1b42f51630c7eebce6fb780b480731eb81afd325"
defined the __this_fixmap_does_not_exist() function
with a WARN_ON(1) in it.

This causes the linker to not report an error when
__this_fixmap_does_not_exist() is called with a
non-constant parameter.

Ingo defined __this_fixmap_does_not_exist() because he
wanted to get virt addresses of fix memory of nest level
by non-constant index.

But we can fix this and still keep the link-time check:

We can get the four slot virt addresses on link time and
store them to array slot_virt[].

Then we can then refer the slot_virt with non-constant index,
in the ioremap-leak detection code.

Signed-off-by: Wang Chen <wangchen@cn.fujitsu.com>
LKML-Reference: <49B2075B.4070509@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 62773ab..96786ef 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -504,13 +504,19 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr)
 	return &bm_pte[pte_index(addr)];
 }
 
+static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
+
 void __init early_ioremap_init(void)
 {
 	pmd_t *pmd;
+	int i;
 
 	if (early_ioremap_debug)
 		printk(KERN_INFO "early_ioremap_init()\n");
 
+	for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+		slot_virt[i] = fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+
 	pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
 	memset(bm_pte, 0, sizeof(bm_pte));
 	pmd_populate_kernel(&init_mm, pmd, bm_pte);
@@ -577,6 +583,7 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
 
 static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
 static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
+
 static int __init check_early_ioremap_leak(void)
 {
 	int count = 0;
@@ -598,7 +605,8 @@ static int __init check_early_ioremap_leak(void)
 }
 late_initcall(check_early_ioremap_leak);
 
-static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
+static void __init __iomem *
+__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
 {
 	unsigned long offset, last_addr;
 	unsigned int nrpages;
@@ -664,9 +672,9 @@ static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned lo
 		--nrpages;
 	}
 	if (early_ioremap_debug)
-		printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
+		printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]);
 
-	prev_map[slot] = (void __iomem *)(offset + fix_to_virt(idx0));
+	prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
 	return prev_map[slot];
 }
 
@@ -734,8 +742,3 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
 	}
 	prev_map[slot] = NULL;
 }
-
-void __this_fixmap_does_not_exist(void)
-{
-	WARN_ON(1);
-}
-- 
cgit v0.10.2


From e954ef20c29b7af07a8cb5452f14fb69e3d9d2b2 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 5 Mar 2009 12:04:57 -0800
Subject: x86: fix warning about nodeid

Impact: cleanup

Ingo found there warning about nodeid with some configs.

try to use for_each_online_node for non numa too. in that case
nodeid will be 0.

also move out boundary checking from setup_node_bootmem(), so
non-numa config will not check it.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <49B03069.80001@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 2966c6b..db81e9a 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -806,11 +806,6 @@ static unsigned long __init setup_node_bootmem(int nodeid,
 {
 	unsigned long bootmap_size;
 
-	if (start_pfn > max_low_pfn)
-		return bootmap;
-	if (end_pfn > max_low_pfn)
-		end_pfn = max_low_pfn;
-
 	/* don't touch min_low_pfn */
 	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
 					 bootmap >> PAGE_SHIFT,
@@ -843,13 +838,23 @@ void __init setup_bootmem_allocator(void)
 		 max_pfn_mapped<<PAGE_SHIFT);
 	printk(KERN_INFO "  low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
 
+	for_each_online_node(nodeid) {
+		 unsigned long start_pfn, end_pfn;
+
 #ifdef CONFIG_NEED_MULTIPLE_NODES
-	for_each_online_node(nodeid)
-		bootmap = setup_node_bootmem(nodeid, node_start_pfn[nodeid],
-					node_end_pfn[nodeid], bootmap);
+		start_pfn = node_start_pfn[nodeid];
+		end_pfn = node_end_pfn[nodeid];
+		if (start_pfn > max_low_pfn)
+			continue;
+		if (end_pfn > max_low_pfn)
+			end_pfn = max_low_pfn;
 #else
-	bootmap = setup_node_bootmem(0, 0, max_low_pfn, bootmap);
+		start_pfn = 0;
+		end_pfn = max_low_pfn;
 #endif
+		bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
+						 bootmap);
+	}
 
 	after_bootmem = 1;
 }
-- 
cgit v0.10.2


From d0fc63f7bd07cb779a06dc1cdd0c5a14e7f5d562 Mon Sep 17 00:00:00 2001
From: Stuart Bennett <stuart@freedesktop.org>
Date: Sun, 8 Mar 2009 20:21:35 +0200
Subject: x86 mmiotrace: fix remove_kmmio_fault_pages()

Impact: fix race+crash in mmiotrace

The list manipulation in remove_kmmio_fault_pages() was broken. If more
than one consecutive kmmio_fault_page was re-added during the grace
period between unregister_kmmio_probe() and remove_kmmio_fault_pages(),
the list manipulation failed to remove pages from the release list.

After a second grace period the pages get into rcu_free_kmmio_fault_pages()
and raise a BUG_ON() kernel crash.

The list manipulation is fixed to properly remove pages from the release
list.

This bug has been present from the very beginning of mmiotrace in the
mainline kernel. It was introduced in 0fd0e3da ("x86: mmiotrace full
patch, preview 1");

An urgent fix for Linus. Tested by Stuart (on 32-bit) and Pekka
(on amd and intel 64-bit systems, nouveau and nvidia proprietary).

Signed-off-by: Stuart Bennett <stuart@freedesktop.org>
Signed-off-by: Pekka Paalanen <pq@iki.fi>
LKML-Reference: <20090308202135.34933feb@daedalus.pq.iki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 9f20503..6a518dd 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -451,23 +451,24 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
 
 static void remove_kmmio_fault_pages(struct rcu_head *head)
 {
-	struct kmmio_delayed_release *dr = container_of(
-						head,
-						struct kmmio_delayed_release,
-						rcu);
+	struct kmmio_delayed_release *dr =
+		container_of(head, struct kmmio_delayed_release, rcu);
 	struct kmmio_fault_page *p = dr->release_list;
 	struct kmmio_fault_page **prevp = &dr->release_list;
 	unsigned long flags;
+
 	spin_lock_irqsave(&kmmio_lock, flags);
 	while (p) {
-		if (!p->count)
+		if (!p->count) {
 			list_del_rcu(&p->list);
-		else
+			prevp = &p->release_next;
+		} else {
 			*prevp = p->release_next;
-		prevp = &p->release_next;
+		}
 		p = p->release_next;
 	}
 	spin_unlock_irqrestore(&kmmio_lock, flags);
+
 	/* This is the real RCU destroy call. */
 	call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
 }
-- 
cgit v0.10.2


From 0feca851c1b3cb4ebfa3149144b3d5de0879ebaa Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Fri, 6 Mar 2009 10:09:26 -0800
Subject: x86-32: make sure virt_addr_valid() returns false for fixmap
 addresses

I found that virt_addr_valid() was returning true for fixmap addresses.

I'm not sure whether pfn_valid() is supposed to include this test,
but there's no harm in being explicit.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Jiri Slaby <jirislaby@gmail.com>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49B166D6.2080505@goop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 62773ab..62def57 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -87,6 +87,8 @@ bool __virt_addr_valid(unsigned long x)
 		return false;
 	if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
 		return false;
+	if (x >= FIXADDR_START)
+		return false;
 	return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
 }
 EXPORT_SYMBOL(__virt_addr_valid);
-- 
cgit v0.10.2