6 files changed, 192 insertions, 76 deletions
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index dcebfc8..51f8663 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -290,7 +290,7 @@ static int handle_page_fault(struct pt_regs *regs,
 	/*
 	 * Early on, we need to check for migrating PTE entries;
 	 * see homecache.c.  If we find a migrating PTE, we wait until
-	 * the backing page claims to be done migrating, then we procede.
+	 * the backing page claims to be done migrating, then we proceed.
 	 * For kernel PTEs, we rewrite the PTE and return and retry.
 	 * Otherwise, we treat the fault like a normal "no PTE" fault,
 	 * rather than trying to patch up the existing PTE.
@@ -655,14 +655,6 @@ struct intvec_state do_page_fault_ics(struct pt_regs *regs, int fault_num,
 	}
 
 	/*
-	 * NOTE: the one other type of access that might bring us here
-	 * are the memory ops in __tns_atomic_acquire/__tns_atomic_release,
-	 * but we don't have to check specially for them since we can
-	 * always safely return to the address of the fault and retry,
-	 * since no separate atomic locks are involved.
-	 */
-
-	/*
 	 * Now that we have released the atomic lock (if necessary),
 	 * it's safe to spin if the PTE that caused the fault was migrating.
 	 */
diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c
index d78df3a..cbe6f4f 100644
--- a/arch/tile/mm/homecache.c
+++ b/arch/tile/mm/homecache.c
@@ -179,23 +179,46 @@ void flush_remote(unsigned long cache_pfn, unsigned long cache_control,
 	panic("Unsafe to continue.");
 }
 
+void flush_remote_page(struct page *page, int order)
+{
+	int i, pages = (1 << order);  /* operate on 2^order consecutive pages */
+	for (i = 0; i < pages; ++i, ++page) {
+		void *p = kmap_atomic(page);  /* kernel VA for this one page */
+		int hfh = 0;  /* becomes 1 only for PAGE_HOME_HASH pages */
+		int home = page_home(page);
+#if CHIP_HAS_CBOX_HOME_MAP()
+		if (home == PAGE_HOME_HASH)
+			hfh = 1;
+		else  /* any other home must be a valid cpu number */
+#endif
+			BUG_ON(home < 0 || home >= NR_CPUS);
+		finv_buffer_remote(p, PAGE_SIZE, hfh);
+		kunmap_atomic(p);
+	}
+}
+
 void homecache_evict(const struct cpumask *mask)
 {
 	flush_remote(0, HV_FLUSH_EVICT_L2, mask, 0, 0, 0, NULL, NULL, 0);
 }
 
-/* Return a mask of the cpus whose caches currently own these pages. */
-static void homecache_mask(struct page *page, int pages,
-			   struct cpumask *home_mask)
+/*
+ * Set *home_mask to the cpus whose caches currently own these pages.
+ * The return value is nonzero if the pages are all coherently cached
+ * (i.e. none are immutable, incoherent, or uncached), and zero otherwise.
+ */
+static int homecache_mask(struct page *page, int pages,
+			  struct cpumask *home_mask)
 {
 	int i;
+	int cached_coherently = 1;
 	cpumask_clear(home_mask);
 	for (i = 0; i < pages; ++i) {
 		int home = page_home(&page[i]);
 		if (home == PAGE_HOME_IMMUTABLE ||
 		    home == PAGE_HOME_INCOHERENT) {
 			cpumask_copy(home_mask, cpu_possible_mask);
-			return;
+			return 0;
 		}
 #if CHIP_HAS_CBOX_HOME_MAP()
 		if (home == PAGE_HOME_HASH) {
@@ -203,11 +226,14 @@ static void homecache_mask(struct page *page, int pages,
 			continue;
 		}
 #endif
-		if (home == PAGE_HOME_UNCACHED)
+		if (home == PAGE_HOME_UNCACHED) {
+			cached_coherently = 0;
 			continue;
+		}
 		BUG_ON(home < 0 || home >= NR_CPUS);
 		cpumask_set_cpu(home, home_mask);
 	}
+	return cached_coherently;
 }
 
 /*
@@ -386,7 +412,7 @@ void homecache_change_page_home(struct page *page, int order, int home)
 		pte_t *ptep = virt_to_pte(NULL, kva);
 		pte_t pteval = *ptep;
 		BUG_ON(!pte_present(pteval) || pte_huge(pteval));
-		*ptep = pte_set_home(pteval, home);
+		__set_pte(ptep, pte_set_home(pteval, home));
 	}
 }
 
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index 201a582..42cfcba 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -219,7 +219,7 @@ try_again:
 	if (mm->free_area_cache < len)
 		goto fail;
 
-	/* either no address requested or cant fit in requested address hole */
+	/* either no address requested or can't fit in requested address hole */
 	addr = (mm->free_area_cache - len) & huge_page_mask(h);
 	do {
 		/*
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index 0b9ce69..d6e87fd 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -53,22 +53,11 @@
 
 #include "migrate.h"
 
-/*
- * We could set FORCE_MAX_ZONEORDER to "(HPAGE_SHIFT - PAGE_SHIFT + 1)"
- * in the Tile Kconfig, but this generates configure warnings.
- * Do it here and force people to get it right to compile this file.
- * The problem is that with 4KB small pages and 16MB huge pages,
- * the default value doesn't allow us to group enough small pages
- * together to make up a huge page.
- */
-#if CONFIG_FORCE_MAX_ZONEORDER < HPAGE_SHIFT - PAGE_SHIFT + 1
-# error "Change FORCE_MAX_ZONEORDER in arch/tile/Kconfig to match page size"
-#endif
-
 #define clear_pgd(pmdptr) (*(pmdptr) = hv_pte(0))
 
 #ifndef __tilegx__
 unsigned long VMALLOC_RESERVE = CONFIG_VMALLOC_RESERVE;
+EXPORT_SYMBOL(VMALLOC_RESERVE);
 #endif
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -445,7 +434,7 @@ static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va)
 
 /* Temporary page table we use for staging. */
 static pgd_t pgtables[PTRS_PER_PGD]
- __attribute__((section(".init.page")));
+ __attribute__((aligned(HV_PAGE_TABLE_ALIGN)));
 
 /*
  * This maps the physical memory to kernel virtual address space, a total
@@ -653,6 +642,17 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 	memcpy(pgd_base, pgtables, sizeof(pgtables));
 	__install_page_table(pgd_base, __get_cpu_var(current_asid),
 			     swapper_pgprot);
+
+	/*
+	 * We just read swapper_pgprot and thus brought it into the cache,
+	 * with its new home & caching mode.  When we start the other CPUs,
+	 * they're going to reference swapper_pgprot via their initial fake
+	 * VA-is-PA mappings, which cache everything locally.  At that
+	 * time, if it's in our cache with a conflicting home, the
+	 * simulator's coherence checker will complain.  So, flush it out
+	 * of our cache; we're not going to ever use it again anyway.
+	 */
+	__insn_finv(&swapper_pgprot);
 }
 
 /*
@@ -950,11 +950,7 @@ struct kmem_cache *pgd_cache;
 
 void __init pgtable_cache_init(void)
 {
-	pgd_cache = kmem_cache_create("pgd",
-				PTRS_PER_PGD*sizeof(pgd_t),
-				PTRS_PER_PGD*sizeof(pgd_t),
-				0,
-				NULL);
+	pgd_cache = kmem_cache_create("pgd", SIZEOF_PGD, SIZEOF_PGD, 0, NULL);
 	if (!pgd_cache)
 		panic("pgtable_cache_init(): Cannot create pgd cache");
 }
@@ -989,7 +985,7 @@ static long __write_once initfree = 1;
 static int __init set_initfree(char *str)
 {
 	long val;
-	if (strict_strtol(str, 0, &val)) {
+	if (strict_strtol(str, 0, &val) == 0) {
 		initfree = val;
 		pr_info("initfree: %s free init pages\n",
 			initfree ? "will" : "won't");
diff --git a/arch/tile/mm/migrate_32.S b/arch/tile/mm/migrate_32.S
index f738765..ac01a7c 100644
--- a/arch/tile/mm/migrate_32.S
+++ b/arch/tile/mm/migrate_32.S
@@ -18,6 +18,7 @@
 #include <linux/linkage.h>
 #include <linux/threads.h>
 #include <asm/page.h>
+#include <asm/thread_info.h>
 #include <asm/types.h>
 #include <asm/asm-offsets.h>
 #include <hv/hypervisor.h>
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index 1f5430c..de7d8e2 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -41,7 +41,7 @@
  * The normal show_free_areas() is too verbose on Tile, with dozens
  * of processors and often four NUMA zones each with high and lowmem.
  */
-void show_mem(void)
+void show_mem(unsigned int filter)
 {
 	struct zone *zone;
 
@@ -142,6 +142,76 @@ pte_t *_pte_offset_map(pmd_t *dir, unsigned long address)
 }
 #endif
 
+/**
+ * shatter_huge_page() - ensure a given address is mapped by a small page.
+ * @addr: Address at which to shatter any existing huge page.
+ *
+ * This function converts a huge PTE mapping kernel LOWMEM into a bunch
+ * of small PTEs with the same caching.  No cache flush required, but we
+ * must do a global TLB flush.  If no huge page maps @addr (any more),
+ * the function returns without changing anything.
+ *
+ * Any caller that wishes to modify a kernel mapping that might
+ * have been made with a huge page should call this function,
+ * since doing so properly avoids race conditions with installing the
+ * newly-shattered page and then flushing all the TLB entries.
+ */
+void shatter_huge_page(unsigned long addr)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	unsigned long flags = 0;  /* happy compiler */
+#ifdef __PAGETABLE_PMD_FOLDED
+	struct list_head *pos;
+#endif
+
+	/* Get a pointer to the pmd entry that we need to change. */
+	addr &= HPAGE_MASK;
+	BUG_ON(pgd_addr_invalid(addr));
+	BUG_ON(addr < PAGE_OFFSET);  /* only for kernel LOWMEM */
+	pgd = swapper_pg_dir + pgd_index(addr);
+	pud = pud_offset(pgd, addr);
+	BUG_ON(!pud_present(*pud));
+	pmd = pmd_offset(pud, addr);
+	BUG_ON(!pmd_present(*pmd));
+	if (!pmd_huge_page(*pmd))  /* unlocked check; repeated under lock */
+		return;
+
+	/*
+	 * Grab the pgd_lock, since we may need it to walk the pgd_list,
+	 * and since we need some kind of lock here to avoid races.
+	 */
+	spin_lock_irqsave(&pgd_lock, flags);
+	if (!pmd_huge_page(*pmd)) {
+		/* Lost the race to convert the huge page. */
+		spin_unlock_irqrestore(&pgd_lock, flags);
+		return;
+	}
+
+	/* Shatter the huge page into the preallocated L2 page table. */
+	pmd_populate_kernel(&init_mm, pmd,
+			    get_prealloc_pte(pte_pfn(*(pte_t *)pmd)));
+
+#ifdef __PAGETABLE_PMD_FOLDED
+	/* Walk every pgd on the system and update the pmd there. */
+	list_for_each(pos, &pgd_list) {
+		pmd_t *copy_pmd;
+		pgd = list_to_pgd(pos) + pgd_index(addr);
+		pud = pud_offset(pgd, addr);
+		copy_pmd = pmd_offset(pud, addr);
+		__set_pmd(copy_pmd, *pmd);
+	}
+#endif
+
+	/* Tell every cpu to notice the change. */
+	flush_remote(0, 0, NULL, addr, HPAGE_SIZE, HPAGE_SIZE,
+		     cpu_possible_mask, NULL, 0);
+
+	/* Hold the lock until the TLB flush is finished to avoid races. */
+	spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
 /*
  * List of all pgd's needed so it can invalidate entries in both cached
  * and uncached pgd's. This is essentially codepath-based locking
@@ -184,9 +254,9 @@ static void pgd_ctor(pgd_t *pgd)
 	BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);
 #endif
 
-	clone_pgd_range(pgd + KERNEL_PGD_INDEX_START,
-			swapper_pg_dir + KERNEL_PGD_INDEX_START,
-			KERNEL_PGD_PTRS);
+	memcpy(pgd + KERNEL_PGD_INDEX_START,
+	       swapper_pg_dir + KERNEL_PGD_INDEX_START,
+	       KERNEL_PGD_PTRS * sizeof(pgd_t));
 
 	pgd_list_add(pgd);
 	spin_unlock_irqrestore(&pgd_lock, flags);
@@ -220,8 +290,11 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 
 struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
-	gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO|__GFP_COMP;
+	gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO;
 	struct page *p;
+#if L2_USER_PGTABLE_ORDER > 0
+	int i;
+#endif
 
 #ifdef CONFIG_HIGHPTE
 	flags |= __GFP_HIGHMEM;
@@ -231,6 +304,18 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 	if (p == NULL)
 		return NULL;
 
+#if L2_USER_PGTABLE_ORDER > 0
+	/*
+	 * Make every page have a page_count() of one, not just the first.
+	 * We don't use __GFP_COMP since it doesn't look like it works
+	 * correctly with tlb_remove_page().
+	 */
+	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
+		init_page_count(p+i);
+		inc_zone_page_state(p+i, NR_PAGETABLE);
+	}
+#endif
+
 	pgtable_page_ctor(p);
 	return p;
 }
@@ -242,8 +327,15 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
  */
 void pte_free(struct mm_struct *mm, struct page *p)
 {
+	int i;
+
 	pgtable_page_dtor(p);
-	__free_pages(p, L2_USER_PGTABLE_ORDER);
+	__free_page(p);
+	/* Tail pages: drop the NR_PAGETABLE count before freeing each one. */
+	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
+		dec_zone_page_state(p+i, NR_PAGETABLE);
+		__free_page(p+i);
+	}
 }
 
 void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
@@ -252,18 +344,11 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
 	int i;
 
 	pgtable_page_dtor(pte);
-	tlb->need_flush = 1;
-	if (tlb_fast_mode(tlb)) {
-		struct page *pte_pages[L2_USER_PGTABLE_PAGES];
-		for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i)
-			pte_pages[i] = pte + i;
-		free_pages_and_swap_cache(pte_pages, L2_USER_PGTABLE_PAGES);
-		return;
-	}
-	for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i) {
-		tlb->pages[tlb->nr++] = pte + i;
-		if (tlb->nr >= FREE_PTE_NR)
-			tlb_flush_mmu(tlb, 0, 0);
+	tlb_remove_page(tlb, pte);
+
+	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
+		tlb_remove_page(tlb, pte + i);
+		dec_zone_page_state(pte + i, NR_PAGETABLE);
 	}
 }
 
@@ -346,35 +431,51 @@ int get_remote_cache_cpu(pgprot_t prot)
 	return x + y * smp_width;
 }
 
-void set_pte_order(pte_t *ptep, pte_t pte, int order)
+/*
+ * Return a kernel VA's PA in *cpa and a null PTE carrying its home in *pte.
+ */
+int va_to_cpa_and_pte(void *va, unsigned long long *cpa, pte_t *pte)
 {
-	unsigned long pfn = pte_pfn(pte);
-	struct page *page = pfn_to_page(pfn);
+	struct page *page = virt_to_page(va);
+	pte_t null_pte = { 0 };
 
-	/* Update the home of a PTE if necessary */
-	pte = pte_set_home(pte, page_home(page));
+	*cpa = __pa(va);
 
+	/* Note that this is not writing a page table, just returning a pte. */
+	*pte = pte_set_home(null_pte, page_home(page));
+
+	return 0; /* always 0; TODO: non-zero if not hfh (hash-for-home)? */
+}
+EXPORT_SYMBOL(va_to_cpa_and_pte);
+
+void __set_pte(pte_t *ptep, pte_t pte)
+{
 #ifdef __tilegx__
 	*ptep = pte;
 #else
-	/*
-	 * When setting a PTE, write the high bits first, then write
-	 * the low bits.  This sets the "present" bit only after the
-	 * other bits are in place.  If a particular PTE update
-	 * involves transitioning from one valid PTE to another, it
-	 * may be necessary to call set_pte_order() more than once,
-	 * transitioning via a suitable intermediate state.
-	 * Note that this sequence also means that if we are transitioning
-	 * from any migrating PTE to a non-migrating one, we will not
-	 * see a half-updated PTE with the migrating bit off.
-	 */
-#if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
-# error Must write the present and migrating bits last
-#endif
-	((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
-	barrier();
-	((u32 *)ptep)[0] = (u32)(pte_val(pte));
-#endif
+# if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
+#  error Must write the present and migrating bits last
+# endif
+	if (pte_present(pte)) {  /* low word (present bit) is stored last */
+		((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
+		barrier();  /* compiler must not reorder the two stores */
+		((u32 *)ptep)[0] = (u32)(pte_val(pte));
+	} else {  /* not present: low word is stored first */
+		((u32 *)ptep)[0] = (u32)(pte_val(pte));
+		barrier();  /* compiler must not reorder the two stores */
+		((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
+	}
+#endif /* __tilegx__ */
+}
+
+void set_pte(pte_t *ptep, pte_t pte)
+{
+	struct page *page = pfn_to_page(pte_pfn(pte));  /* pfn not validated */
+
+	/* Stamp the PTE with the home recorded for the page it maps. */
+	pte = pte_set_home(pte, page_home(page));
+
+	__set_pte(ptep, pte);  /* store it using the ordered word writes */
 }
 
 /* Can this mm load a PTE with cached_priority set? */