3 files changed, 80 insertions, 66 deletions
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 3d542a9..7230d7a 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -43,26 +43,14 @@ static unsigned nr_gpages;
 unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
 
 #define hugepte_shift			mmu_huge_psizes
-#define PTRS_PER_HUGEPTE(psize)		(1 << hugepte_shift[psize])
-#define HUGEPTE_TABLE_SIZE(psize)	(sizeof(pte_t) << hugepte_shift[psize])
+#define HUGEPTE_INDEX_SIZE(psize)	(mmu_huge_psizes[(psize)])
+#define PTRS_PER_HUGEPTE(psize)		(1 << mmu_huge_psizes[psize])
 
 #define HUGEPD_SHIFT(psize)		(mmu_psize_to_shift(psize) \
-						+ hugepte_shift[psize])
+					 + HUGEPTE_INDEX_SIZE(psize))
 #define HUGEPD_SIZE(psize)		(1UL << HUGEPD_SHIFT(psize))
 #define HUGEPD_MASK(psize)		(~(HUGEPD_SIZE(psize)-1))
 
-/* Subtract one from array size because we don't need a cache for 4K since
- * is not a huge page size */
-#define HUGE_PGTABLE_INDEX(psize)	(HUGEPTE_CACHE_NUM + psize - 1)
-#define HUGEPTE_CACHE_NAME(psize)	(huge_pgtable_cache_name[psize])
-
-static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
-	[MMU_PAGE_64K]	= "hugepte_cache_64K",
-	[MMU_PAGE_1M]	= "hugepte_cache_1M",
-	[MMU_PAGE_16M]	= "hugepte_cache_16M",
-	[MMU_PAGE_16G]	= "hugepte_cache_16G",
-};
-
 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
  * will choke on pointers to hugepte tables, which is handy for
  * catching screwups early. */
@@ -114,15 +102,15 @@ static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 			   unsigned long address, unsigned int psize)
 {
-	pte_t *new = kmem_cache_zalloc(pgtable_cache[HUGE_PGTABLE_INDEX(psize)],
-				      GFP_KERNEL|__GFP_REPEAT);
+	pte_t *new = kmem_cache_zalloc(PGT_CACHE(hugepte_shift[psize]),
+				       GFP_KERNEL|__GFP_REPEAT);
 
 	if (! new)
 		return -ENOMEM;
 
 	spin_lock(&mm->page_table_lock);
 	if (!hugepd_none(*hpdp))
-		kmem_cache_free(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], new);
+		kmem_cache_free(PGT_CACHE(hugepte_shift[psize]), new);
 	else
 		hpdp->pd = (unsigned long)new | HUGEPD_OK;
 	spin_unlock(&mm->page_table_lock);
@@ -271,9 +259,7 @@ static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
 
 	hpdp->pd = 0;
 	tlb->need_flush = 1;
-	pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
-						 HUGEPTE_CACHE_NUM+psize-1,
-						 PGF_CACHENUM_MASK));
+	pgtable_free_tlb(tlb, hugepte, hugepte_shift[psize]);
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -698,8 +684,6 @@ static void __init set_huge_psize(int psize)
 		if (mmu_huge_psizes[psize] ||
 		   mmu_psize_defs[psize].shift == PAGE_SHIFT)
 			return;
-		if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL))
-			return;
 		hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
 
 		switch (mmu_psize_defs[psize].shift) {
@@ -753,9 +737,9 @@ static int __init hugetlbpage_init(void)
 	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
 		return -ENODEV;
 
-	/* Add supported huge page sizes.  Need to change HUGE_MAX_HSTATE
-	 * and adjust PTE_NONCACHE_NUM if the number of supported huge page
-	 * sizes changes.
+	/* Add supported huge page sizes.  Need to change
+	 *  HUGE_MAX_HSTATE if the number of supported huge page sizes
+	 *  changes.
 	 */
 	set_huge_psize(MMU_PAGE_16M);
 	set_huge_psize(MMU_PAGE_16G);
@@ -769,16 +753,11 @@ static int __init hugetlbpage_init(void)
 
 	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
 		if (mmu_huge_psizes[psize]) {
-			pgtable_cache[HUGE_PGTABLE_INDEX(psize)] =
-				kmem_cache_create(
-					HUGEPTE_CACHE_NAME(psize),
-					HUGEPTE_TABLE_SIZE(psize),
-					HUGEPTE_TABLE_SIZE(psize),
-					0,
-					NULL);
-			if (!pgtable_cache[HUGE_PGTABLE_INDEX(psize)])
-				panic("hugetlbpage_init(): could not create %s"\
-				      "\n", HUGEPTE_CACHE_NAME(psize));
+			pgtable_cache_add(hugepte_shift[psize], NULL);
+			if (!PGT_CACHE(hugepte_shift[psize]))
+				panic("hugetlbpage_init(): could not create "
+				      "pgtable cache for %d bit pagesize\n",
+				      mmu_psize_to_shift(psize));
 		}
 	}
 
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 335c578..82ac61d 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -119,30 +119,58 @@ static void pmd_ctor(void *addr)
 	memset(addr, 0, PMD_TABLE_SIZE);
 }
 
-static const unsigned int pgtable_cache_size[2] = {
-	PGD_TABLE_SIZE, PMD_TABLE_SIZE
-};
-static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
-#ifdef CONFIG_PPC_64K_PAGES
-	"pgd_cache", "pmd_cache",
-#else
-	"pgd_cache", "pud_pmd_cache",
-#endif /* CONFIG_PPC_64K_PAGES */
-};
-
-#ifdef CONFIG_HUGETLB_PAGE
-/* Hugepages need an extra cache per hugepagesize, initialized in
- * hugetlbpage.c.  We can't put into the tables above, because HPAGE_SHIFT
- * is not compile time constant. */
-struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT];
-#else
-struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
-#endif
+struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
+
+/*
+ * Create a kmem_cache for pagetables holding 2^shift pointer-sized
+ * entries and publish it in PGT_CACHE(shift).  Not used for PTE pages,
+ * which are linked to struct page, come from the normal free pages
+ * pool and have a different entry size (see real_pte_t).  Used for
+ * the higher level pagetables and for hugepage pagetables.  On
+ * failure PGT_CACHE(shift) stays NULL; callers check it and panic.
+ */
+void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
+{
+	char *name;
+	unsigned long table_size = sizeof(void *) << shift;
+	/* When batching pgtable pointers for RCU freeing, we store
+	 * the index size in the low bits.  Table alignment must be
+	 * big enough to fit it */
+	unsigned long minalign = MAX_PGTABLE_INDEX_SIZE + 1;
+	unsigned long align = max_t(unsigned long, table_size, minalign);
+
+	/* It would be nice if this was a BUILD_BUG_ON(), but at the
+	 * moment, gcc doesn't seem to recognize is_power_of_2 as a
+	 * constant expression, so so much for that. */
+	BUG_ON(!is_power_of_2(minalign));
+	BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE));
+
+	if (PGT_CACHE(shift))
+		return; /* Already have a cache of this size */
+
+	name = kasprintf(GFP_KERNEL, "pgtable-2^%u", shift);
+	if (!name)
+		return; /* Out of memory: leave the slot empty */
+
+	PGT_CACHE(shift) = kmem_cache_create(name, table_size, align, 0, ctor);
+	if (PGT_CACHE(shift))
+		pr_debug("Allocated pgtable cache for order %u\n", shift);
+}
+
 
 void pgtable_cache_init(void)
 {
-	pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor);
-	pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor);
+	pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
+	pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
+	if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE))
+		panic("Couldn't allocate pgtable caches");
+
+	/* In all current configs, when the PUD index exists it's the
+	 * same size as either the pgd or pmd index.  Verify that the
+	 * initialization above has also created a PUD cache.  This
+	 * will need re-examination if we add new possibilities for
+	 * the pagetable layout. */
+	BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE));
 }
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 5304093..99df697 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -49,12 +49,12 @@ struct pte_freelist_batch
 {
 	struct rcu_head	rcu;
 	unsigned int	index;
-	pgtable_free_t	tables[0];
+	unsigned long	tables[0];
 };
 
 #define PTE_FREELIST_SIZE \
 	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
-	  / sizeof(pgtable_free_t))
+	  / sizeof(unsigned long))
 
 static void pte_free_smp_sync(void *arg)
 {
@@ -64,13 +64,13 @@ static void pte_free_smp_sync(void *arg)
 /* This is only called when we are critically out of memory
  * (and fail to get a page in pte_free_tlb).
  */
-static void pgtable_free_now(pgtable_free_t pgf)
+static void pgtable_free_now(void *table, unsigned shift)
 {
 	pte_freelist_forced_free++;
 
 	smp_call_function(pte_free_smp_sync, NULL, 1);
 
-	pgtable_free(pgf);
+	pgtable_free(table, shift);
 }
 
 static void pte_free_rcu_callback(struct rcu_head *head)
@@ -79,8 +79,12 @@ static void pte_free_rcu_callback(struct rcu_head *head)
 		container_of(head, struct pte_freelist_batch, rcu);
 	unsigned int i;
 
-	for (i = 0; i < batch->index; i++)
-		pgtable_free(batch->tables[i]);
+	for (i = 0; i < batch->index; i++) {
+		void *table = (void *)(batch->tables[i] & ~MAX_PGTABLE_INDEX_SIZE);
+		unsigned shift = batch->tables[i] & MAX_PGTABLE_INDEX_SIZE;
+
+		pgtable_free(table, shift);
+	}
 
 	free_page((unsigned long)batch);
 }
@@ -91,25 +95,28 @@ static void pte_free_submit(struct pte_freelist_batch *batch)
 	call_rcu(&batch->rcu, pte_free_rcu_callback);
 }
 
-void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift)
 {
 	/* This is safe since tlb_gather_mmu has disabled preemption */
 	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+	unsigned long pgf;
 
 	if (atomic_read(&tlb->mm->mm_users) < 2 ||
 	    cpumask_equal(mm_cpumask(tlb->mm), cpumask_of(smp_processor_id()))){
-		pgtable_free(pgf);
+		pgtable_free(table, shift);
 		return;
 	}
 
 	if (*batchp == NULL) {
 		*batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
 		if (*batchp == NULL) {
-			pgtable_free_now(pgf);
+			pgtable_free_now(table, shift);
 			return;
 		}
 		(*batchp)->index = 0;
 	}
+	BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+	pgf = (unsigned long)table | shift;
 	(*batchp)->tables[(*batchp)->index++] = pgf;
 	if ((*batchp)->index == PTE_FREELIST_SIZE) {
 		pte_free_submit(*batchp);