From cadbb58039f7cab1def9c931012ab04c953a6997 Mon Sep 17 00:00:00 2001
From: Allen Pais <allen.pais@oracle.com>
Date: Mon, 8 Sep 2014 11:48:53 +0530
Subject: sparc64: correctly recognise M6 and M7 cpu type

The following patch adds support for correctly
recognising M6 and M7 cpu type.

Signed-off-by: Allen Pais <allen.pais@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/include/asm/spitfire.h b/arch/sparc/include/asm/spitfire.h
index 3fc58691..56f9338 100644
--- a/arch/sparc/include/asm/spitfire.h
+++ b/arch/sparc/include/asm/spitfire.h
@@ -45,6 +45,8 @@
 #define SUN4V_CHIP_NIAGARA3	0x03
 #define SUN4V_CHIP_NIAGARA4	0x04
 #define SUN4V_CHIP_NIAGARA5	0x05
+#define SUN4V_CHIP_SPARC_M6	0x06
+#define SUN4V_CHIP_SPARC_M7	0x07
 #define SUN4V_CHIP_SPARC64X	0x8a
 #define SUN4V_CHIP_UNKNOWN	0xff
 
diff --git a/arch/sparc/kernel/cpu.c b/arch/sparc/kernel/cpu.c
index 82a3a71..dfad8b1 100644
--- a/arch/sparc/kernel/cpu.c
+++ b/arch/sparc/kernel/cpu.c
@@ -494,6 +494,18 @@ static void __init sun4v_cpu_probe(void)
 		sparc_pmu_type = "niagara5";
 		break;
 
+	case SUN4V_CHIP_SPARC_M6:
+		sparc_cpu_type = "SPARC-M6";
+		sparc_fpu_type = "SPARC-M6 integrated FPU";
+		sparc_pmu_type = "sparc-m6";
+		break;
+
+	case SUN4V_CHIP_SPARC_M7:
+		sparc_cpu_type = "SPARC-M7";
+		sparc_fpu_type = "SPARC-M7 integrated FPU";
+		sparc_pmu_type = "sparc-m7";
+		break;
+
 	case SUN4V_CHIP_SPARC64X:
 		sparc_cpu_type = "SPARC64-X";
 		sparc_fpu_type = "SPARC64-X integrated FPU";
diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S
index 452f04f..4fdeb80 100644
--- a/arch/sparc/kernel/head_64.S
+++ b/arch/sparc/kernel/head_64.S
@@ -427,6 +427,12 @@ sun4v_chip_type:
 	cmp	%g2, '5'
 	be,pt	%xcc, 5f
 	 mov	SUN4V_CHIP_NIAGARA5, %g4
+	cmp	%g2, '6'
+	be,pt	%xcc, 5f
+	 mov	SUN4V_CHIP_SPARC_M6, %g4
+	cmp	%g2, '7'
+	be,pt	%xcc, 5f
+	 mov	SUN4V_CHIP_SPARC_M7, %g4
 	ba,pt	%xcc, 49f
 	 nop
 
@@ -585,6 +591,12 @@ niagara_tlb_fixup:
 	cmp	%g1, SUN4V_CHIP_NIAGARA5
 	be,pt	%xcc, niagara4_patch
 	 nop
+	cmp	%g1, SUN4V_CHIP_SPARC_M6
+	be,pt	%xcc, niagara4_patch
+	 nop
+	cmp	%g1, SUN4V_CHIP_SPARC_M7
+	be,pt	%xcc, niagara4_patch
+	 nop
 
 	call	generic_patch_copyops
 	 nop
-- 
cgit v0.10.2


From 9bd3ee33f6b97de092610d8dcabc4cb98d99505c Mon Sep 17 00:00:00 2001
From: Allen Pais <allen.pais@oracle.com>
Date: Mon, 8 Sep 2014 11:48:54 +0530
Subject: sparc64: support M6 and M7 for building CPU distribution map

Add M6 and M7 chip type in cpumap.c to correctly build CPU distribution map that spans all online CPUs.

Signed-off-by: Allen Pais <allen.pais@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/kernel/cpumap.c b/arch/sparc/kernel/cpumap.c
index de1c844..e69ec0e 100644
--- a/arch/sparc/kernel/cpumap.c
+++ b/arch/sparc/kernel/cpumap.c
@@ -326,6 +326,8 @@ static int iterate_cpu(struct cpuinfo_tree *t, unsigned int root_index)
 	case SUN4V_CHIP_NIAGARA3:
 	case SUN4V_CHIP_NIAGARA4:
 	case SUN4V_CHIP_NIAGARA5:
+	case SUN4V_CHIP_SPARC_M6:
+	case SUN4V_CHIP_SPARC_M7:
 	case SUN4V_CHIP_SPARC64X:
 		rover_inc_table = niagara_iterate_method;
 		break;
-- 
cgit v0.10.2


From 408316258521168614bfb4da0e070490d3e65a17 Mon Sep 17 00:00:00 2001
From: Allen Pais <allen.pais@oracle.com>
Date: Mon, 8 Sep 2014 11:48:55 +0530
Subject: sparc64: cpu hardware caps support for sparc M6 and M7

Signed-off-by: Allen Pais <allen.pais@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/kernel/setup_64.c b/arch/sparc/kernel/setup_64.c
index 3fdb455..1c7bfdf 100644
--- a/arch/sparc/kernel/setup_64.c
+++ b/arch/sparc/kernel/setup_64.c
@@ -500,12 +500,16 @@ static void __init init_sparc64_elf_hwcap(void)
 		    sun4v_chip_type == SUN4V_CHIP_NIAGARA3 ||
 		    sun4v_chip_type == SUN4V_CHIP_NIAGARA4 ||
 		    sun4v_chip_type == SUN4V_CHIP_NIAGARA5 ||
+		    sun4v_chip_type == SUN4V_CHIP_SPARC_M6 ||
+		    sun4v_chip_type == SUN4V_CHIP_SPARC_M7 ||
 		    sun4v_chip_type == SUN4V_CHIP_SPARC64X)
 			cap |= HWCAP_SPARC_BLKINIT;
 		if (sun4v_chip_type == SUN4V_CHIP_NIAGARA2 ||
 		    sun4v_chip_type == SUN4V_CHIP_NIAGARA3 ||
 		    sun4v_chip_type == SUN4V_CHIP_NIAGARA4 ||
 		    sun4v_chip_type == SUN4V_CHIP_NIAGARA5 ||
+		    sun4v_chip_type == SUN4V_CHIP_SPARC_M6 ||
+		    sun4v_chip_type == SUN4V_CHIP_SPARC_M7 ||
 		    sun4v_chip_type == SUN4V_CHIP_SPARC64X)
 			cap |= HWCAP_SPARC_N2;
 	}
@@ -533,6 +537,8 @@ static void __init init_sparc64_elf_hwcap(void)
 			    sun4v_chip_type == SUN4V_CHIP_NIAGARA3 ||
 			    sun4v_chip_type == SUN4V_CHIP_NIAGARA4 ||
 			    sun4v_chip_type == SUN4V_CHIP_NIAGARA5 ||
+			    sun4v_chip_type == SUN4V_CHIP_SPARC_M6 ||
+			    sun4v_chip_type == SUN4V_CHIP_SPARC_M7 ||
 			    sun4v_chip_type == SUN4V_CHIP_SPARC64X)
 				cap |= (AV_SPARC_VIS | AV_SPARC_VIS2 |
 					AV_SPARC_ASI_BLK_INIT |
@@ -540,6 +546,8 @@ static void __init init_sparc64_elf_hwcap(void)
 			if (sun4v_chip_type == SUN4V_CHIP_NIAGARA3 ||
 			    sun4v_chip_type == SUN4V_CHIP_NIAGARA4 ||
 			    sun4v_chip_type == SUN4V_CHIP_NIAGARA5 ||
+			    sun4v_chip_type == SUN4V_CHIP_SPARC_M6 ||
+			    sun4v_chip_type == SUN4V_CHIP_SPARC_M7 ||
 			    sun4v_chip_type == SUN4V_CHIP_SPARC64X)
 				cap |= (AV_SPARC_VIS3 | AV_SPARC_HPC |
 					AV_SPARC_FMAF);
-- 
cgit v0.10.2


From 74cad25c076a2f5253312c2fe82d1a4daecc1323 Mon Sep 17 00:00:00 2001
From: Andreas Larsson <andreas@gaisler.com>
Date: Fri, 29 Aug 2014 17:08:21 +0200
Subject: sparc: Let memset return the address argument

This makes memset follow the standard (instead of returning 0 on success). This
is needed when certain versions of gcc optimizes around memset calls and assume
that the address argument is preserved in %o0.

Signed-off-by: Andreas Larsson <andreas@gaisler.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/lib/memset.S b/arch/sparc/lib/memset.S
index 99c017b..f75e690 100644
--- a/arch/sparc/lib/memset.S
+++ b/arch/sparc/lib/memset.S
@@ -3,8 +3,9 @@
  * Copyright (C) 1996,1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
  * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
  *
- * Returns 0, if ok, and number of bytes not yet set if exception
- * occurs and we were called as clear_user.
+ * Calls to memset returns initial %o0. Calls to bzero returns 0, if ok, and
+ * number of bytes not yet set if exception occurs and we were called as
+ * clear_user.
  */
 
 #include <asm/ptrace.h>
@@ -65,6 +66,8 @@ __bzero_begin:
 	.globl	__memset_start, __memset_end
 __memset_start:
 memset:
+	mov	%o0, %g1
+	mov	1, %g4
 	and	%o1, 0xff, %g3
 	sll	%g3, 8, %g2
 	or	%g3, %g2, %g3
@@ -89,6 +92,7 @@ memset:
 	 sub	%o0, %o2, %o0
 
 __bzero:
+	clr	%g4
 	mov	%g0, %g3
 1:
 	cmp	%o1, 7
@@ -151,8 +155,8 @@ __bzero:
 	bne,a	8f
 	 EX(stb	%g3, [%o0], and %o1, 1)
 8:
-	retl
-	 clr	%o0
+	b	0f
+	 nop
 7:
 	be	13b
 	 orcc	%o1, 0, %g0
@@ -164,6 +168,12 @@ __bzero:
 	bne	8b
 	 EX(stb	%g3, [%o0 - 1], add %o1, 1)
 0:
+	andcc	%g4, 1, %g0
+	be	5f
+	 nop
+	retl
+	 mov	%g1, %o0
+5:
 	retl
 	 clr	%o0
 __memset_end:
-- 
cgit v0.10.2


From 384859d2af8ead22c9e5a570a4ab89f1b563c8e5 Mon Sep 17 00:00:00 2001
From: Andreas Larsson <andreas@gaisler.com>
Date: Fri, 29 Aug 2014 17:09:18 +0200
Subject: sparc: leon: Fix race condition between leon_cycles_offset and
 timer_interrupt

This makes sure that leon_cycles_offset takes the pending bit into
account and that leon_clear_clock_irq clears the pending bit. Otherwise,
if leon_cycles_offset is executed after the timer has wrapped but before
timer_interrupt has increased timer_cs_internal_counter, time can be
perceived to go backwards.

Signed-off-by: Andreas Larsson <andreas@gaisler.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/kernel/leon_kernel.c b/arch/sparc/kernel/leon_kernel.c
index 683c4af..9bbb8f2 100644
--- a/arch/sparc/kernel/leon_kernel.c
+++ b/arch/sparc/kernel/leon_kernel.c
@@ -37,6 +37,7 @@ unsigned long amba_system_id;
 static DEFINE_SPINLOCK(leon_irq_lock);
 
 static unsigned long leon3_gptimer_idx; /* Timer Index (0..6) within Timer Core */
+static unsigned long leon3_gptimer_ackmask; /* For clearing pending bit */
 unsigned long leon3_gptimer_irq; /* interrupt controller irq number */
 unsigned int sparc_leon_eirq;
 #define LEON_IMASK(cpu) (&leon3_irqctrl_regs->mask[cpu])
@@ -260,11 +261,19 @@ void leon_update_virq_handling(unsigned int virq,
 
 static u32 leon_cycles_offset(void)
 {
-	u32 rld, val, off;
+	u32 rld, val, ctrl, off;
+
 	rld = LEON3_BYPASS_LOAD_PA(&leon3_gptimer_regs->e[leon3_gptimer_idx].rld);
 	val = LEON3_BYPASS_LOAD_PA(&leon3_gptimer_regs->e[leon3_gptimer_idx].val);
-	off = rld - val;
-	return rld - val;
+	ctrl = LEON3_BYPASS_LOAD_PA(&leon3_gptimer_regs->e[leon3_gptimer_idx].ctrl);
+	if (LEON3_GPTIMER_CTRL_ISPENDING(ctrl)) {
+		val = LEON3_BYPASS_LOAD_PA(&leon3_gptimer_regs->e[leon3_gptimer_idx].val);
+		off = 2 * rld - val;
+	} else {
+		off = rld - val;
+	}
+
+	return off;
 }
 
 #ifdef CONFIG_SMP
@@ -302,6 +311,7 @@ void __init leon_init_timers(void)
 	int ampopts;
 	int err;
 	u32 config;
+	u32 ctrl;
 
 	sparc_config.get_cycles_offset = leon_cycles_offset;
 	sparc_config.cs_period = 1000000 / HZ;
@@ -374,6 +384,16 @@ void __init leon_init_timers(void)
 	if (!(leon3_gptimer_regs && leon3_irqctrl_regs && leon3_gptimer_irq))
 		goto bad;
 
+	ctrl = LEON3_BYPASS_LOAD_PA(&leon3_gptimer_regs->e[leon3_gptimer_idx].ctrl);
+	LEON3_BYPASS_STORE_PA(&leon3_gptimer_regs->e[leon3_gptimer_idx].ctrl,
+			      ctrl | LEON3_GPTIMER_CTRL_PENDING);
+	ctrl = LEON3_BYPASS_LOAD_PA(&leon3_gptimer_regs->e[leon3_gptimer_idx].ctrl);
+
+	if ((ctrl & LEON3_GPTIMER_CTRL_PENDING) != 0)
+		leon3_gptimer_ackmask = ~LEON3_GPTIMER_CTRL_PENDING;
+	else
+		leon3_gptimer_ackmask = ~0;
+
 	LEON3_BYPASS_STORE_PA(&leon3_gptimer_regs->e[leon3_gptimer_idx].val, 0);
 	LEON3_BYPASS_STORE_PA(&leon3_gptimer_regs->e[leon3_gptimer_idx].rld,
 				(((1000000 / HZ) - 1)));
@@ -452,6 +472,11 @@ bad:
 
 static void leon_clear_clock_irq(void)
 {
+	u32 ctrl;
+
+	ctrl = LEON3_BYPASS_LOAD_PA(&leon3_gptimer_regs->e[leon3_gptimer_idx].ctrl);
+	LEON3_BYPASS_STORE_PA(&leon3_gptimer_regs->e[leon3_gptimer_idx].ctrl,
+			      ctrl & leon3_gptimer_ackmask);
 }
 
 static void leon_load_profile_irq(int cpu, unsigned int limit)
-- 
cgit v0.10.2


From b84ca92e1672432aac43cccaaa572881e7cc9fdd Mon Sep 17 00:00:00 2001
From: Andreas Larsson <andreas@gaisler.com>
Date: Mon, 8 Sep 2014 09:48:52 +0200
Subject: sparc32, leon: Make leon_dma_ops avaiable when !CONFIG_PCI

The leon_dma_ops struct is needed for leon regardless of PCI configuration.

Signed-off-by: Andreas Larsson <andreas@gaisler.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h
index 1ee0271..5b1b52a 100644
--- a/arch/sparc/include/asm/dma-mapping.h
+++ b/arch/sparc/include/asm/dma-mapping.h
@@ -20,10 +20,12 @@ extern struct bus_type pci_bus_type;
 
 static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 {
-#if defined(CONFIG_SPARC32) && defined(CONFIG_PCI)
+#ifdef CONFIG_SPARC_LEON
 	if (sparc_cpu_model == sparc_leon)
 		return leon_dma_ops;
-	else if (dev->bus == &pci_bus_type)
+#endif
+#if defined(CONFIG_SPARC32) && defined(CONFIG_PCI)
+	if (dev->bus == &pci_bus_type)
 		return &pci32_dma_ops;
 #endif
 	return dma_ops;
-- 
cgit v0.10.2


From d1105287aabe88dbb3af825140badaa05cf0442c Mon Sep 17 00:00:00 2001
From: Daniel Hellstrom <daniel@gaisler.com>
Date: Wed, 10 Sep 2014 14:17:52 +0200
Subject: sparc32: dma_alloc_coherent must honour gfp flags

dma_zalloc_coherent() calls dma_alloc_coherent(__GFP_ZERO)
but the sparc32 implementations sbus_alloc_coherent() and
pci32_alloc_coherent() doesn't take the gfp flags into
account.

Tested on the SPARC32/LEON GRETH Ethernet driver which fails
due to dma_alloc_coherent(__GFP_ZERO) returns non zeroed
pages.

Signed-off-by: Daniel Hellstrom <daniel@gaisler.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c
index 7f08ec8..28fed53 100644
--- a/arch/sparc/kernel/ioport.c
+++ b/arch/sparc/kernel/ioport.c
@@ -278,7 +278,8 @@ static void *sbus_alloc_coherent(struct device *dev, size_t len,
 	}
 
 	order = get_order(len_total);
-	if ((va = __get_free_pages(GFP_KERNEL|__GFP_COMP, order)) == 0)
+	va = __get_free_pages(gfp, order);
+	if (va == 0)
 		goto err_nopages;
 
 	if ((res = kzalloc(sizeof(struct resource), GFP_KERNEL)) == NULL)
@@ -443,7 +444,7 @@ static void *pci32_alloc_coherent(struct device *dev, size_t len,
 	}
 
 	order = get_order(len_total);
-	va = (void *) __get_free_pages(GFP_KERNEL, order);
+	va = (void *) __get_free_pages(gfp, order);
 	if (va == NULL) {
 		printk("pci_alloc_consistent: no %ld pages\n", len_total>>PAGE_SHIFT);
 		goto err_nopages;
-- 
cgit v0.10.2


From 4ccb9272892c33ef1c19a783cfa87103b30c2784 Mon Sep 17 00:00:00 2001
From: bob picco <bpicco@meloft.net>
Date: Tue, 16 Sep 2014 09:26:47 -0400
Subject: sparc64: sun4v TLB error power off events

We've witnessed a few TLB events causing the machine to power off because
of prom_halt. In one case it was some nfs related area during rmmod. Another
was an mmapper of /dev/mem. A more recent one is an ITLB issue with
a bad pagesize which could be a hardware bug. Bugs happen but we should
attempt to not power off the machine and/or hang it when possible.

This is a DTLB error from an mmapper of /dev/mem:
[root@sparcie ~]# SUN4V-DTLB: Error at TPC[fffff80100903e6c], tl 1
SUN4V-DTLB: TPC<0xfffff80100903e6c>
SUN4V-DTLB: O7[fffff801081979d0]
SUN4V-DTLB: O7<0xfffff801081979d0>
SUN4V-DTLB: vaddr[fffff80100000000] ctx[1250] pte[98000000000f0610] error[2]
.

This is recent mainline for ITLB:
[ 3708.179864] SUN4V-ITLB: TPC<0xfffffc010071cefc>
[ 3708.188866] SUN4V-ITLB: O7[fffffc010071cee8]
[ 3708.197377] SUN4V-ITLB: O7<0xfffffc010071cee8>
[ 3708.206539] SUN4V-ITLB: vaddr[e0003] ctx[1a3c] pte[2900000dcc800eeb] error[4]
.

Normally sun4v_itlb_error_report() and sun4v_dtlb_error_report() would call
prom_halt() and drop us to OF command prompt "ok". This isn't the case for
LDOMs and the machine powers off.

For the HV reported error of HV_ENORADDR for HV HV_MMU_MAP_ADDR_TRAP we cause
a SIGBUS error by qualifying it within do_sparc64_fault() for fault code mask
of FAULT_CODE_BAD_RA. This is done when trap level (%tl) is less or equal
one("1"). Otherwise, for %tl > 1,  we proceed eventually to die_if_kernel().

The logic of this patch was partially inspired by David Miller's feedback.

Power off of large sparc64 machines is painful. Plus die_if_kernel provides
more context. A reset sequence isn't a brief period on large sparc64 but
better than power-off/power-on sequence.

Cc: sparclinux@vger.kernel.org
Signed-off-by: Bob Picco <bob.picco@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h
index a5f01ac..f85dc85 100644
--- a/arch/sparc/include/asm/thread_info_64.h
+++ b/arch/sparc/include/asm/thread_info_64.h
@@ -102,6 +102,7 @@ struct thread_info {
 #define FAULT_CODE_ITLB		0x04	/* Miss happened in I-TLB	   */
 #define FAULT_CODE_WINFIXUP	0x08	/* Miss happened during spill/fill */
 #define FAULT_CODE_BLKCOMMIT	0x10	/* Use blk-commit ASI in copy_page */
+#define	FAULT_CODE_BAD_RA	0x20	/* Bad RA for sun4v		   */
 
 #if PAGE_SHIFT == 13
 #define THREAD_SIZE (2*PAGE_SIZE)
diff --git a/arch/sparc/kernel/sun4v_tlb_miss.S b/arch/sparc/kernel/sun4v_tlb_miss.S
index e0c09bf8..6179e19 100644
--- a/arch/sparc/kernel/sun4v_tlb_miss.S
+++ b/arch/sparc/kernel/sun4v_tlb_miss.S
@@ -195,6 +195,11 @@ sun4v_tsb_miss_common:
 	 ldx	[%g2 + TRAP_PER_CPU_PGD_PADDR], %g7
 
 sun4v_itlb_error:
+	rdpr	%tl, %g1
+	cmp	%g1, 1
+	ble,pt	%icc, sun4v_bad_ra
+	 or	%g0, FAULT_CODE_BAD_RA | FAULT_CODE_ITLB, %g1
+
 	sethi	%hi(sun4v_err_itlb_vaddr), %g1
 	stx	%g4, [%g1 + %lo(sun4v_err_itlb_vaddr)]
 	sethi	%hi(sun4v_err_itlb_ctx), %g1
@@ -206,15 +211,10 @@ sun4v_itlb_error:
 	sethi	%hi(sun4v_err_itlb_error), %g1
 	stx	%o0, [%g1 + %lo(sun4v_err_itlb_error)]
 
+	sethi	%hi(1f), %g7
 	rdpr	%tl, %g4
-	cmp	%g4, 1
-	ble,pt	%icc, 1f
-	 sethi	%hi(2f), %g7
 	ba,pt	%xcc, etraptl1
-	 or	%g7, %lo(2f), %g7
-
-1:	ba,pt	%xcc, etrap
-2:	 or	%g7, %lo(2b), %g7
+1:	 or	%g7, %lo(1f), %g7
 	mov	%l4, %o1
 	call	sun4v_itlb_error_report
 	 add	%sp, PTREGS_OFF, %o0
@@ -222,6 +222,11 @@ sun4v_itlb_error:
 	/* NOTREACHED */
 
 sun4v_dtlb_error:
+	rdpr	%tl, %g1
+	cmp	%g1, 1
+	ble,pt	%icc, sun4v_bad_ra
+	 or	%g0, FAULT_CODE_BAD_RA | FAULT_CODE_DTLB, %g1
+
 	sethi	%hi(sun4v_err_dtlb_vaddr), %g1
 	stx	%g4, [%g1 + %lo(sun4v_err_dtlb_vaddr)]
 	sethi	%hi(sun4v_err_dtlb_ctx), %g1
@@ -233,21 +238,23 @@ sun4v_dtlb_error:
 	sethi	%hi(sun4v_err_dtlb_error), %g1
 	stx	%o0, [%g1 + %lo(sun4v_err_dtlb_error)]
 
+	sethi	%hi(1f), %g7
 	rdpr	%tl, %g4
-	cmp	%g4, 1
-	ble,pt	%icc, 1f
-	 sethi	%hi(2f), %g7
 	ba,pt	%xcc, etraptl1
-	 or	%g7, %lo(2f), %g7
-
-1:	ba,pt	%xcc, etrap
-2:	 or	%g7, %lo(2b), %g7
+1:	 or	%g7, %lo(1f), %g7
 	mov	%l4, %o1
 	call	sun4v_dtlb_error_report
 	 add	%sp, PTREGS_OFF, %o0
 
 	/* NOTREACHED */
 
+sun4v_bad_ra:
+	or	%g0, %g4, %g5
+	ba,pt	%xcc, sparc64_realfault_common
+	 or	%g1, %g0, %g4
+
+	/* NOTREACHED */
+
 	/* Instruction Access Exception, tl0. */
 sun4v_iacc:
 	ldxa	[%g0] ASI_SCRATCHPAD, %g2
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index fb6640e..981a769 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -2104,6 +2104,11 @@ void sun4v_nonresum_overflow(struct pt_regs *regs)
 	atomic_inc(&sun4v_nonresum_oflow_cnt);
 }
 
+static void sun4v_tlb_error(struct pt_regs *regs)
+{
+	die_if_kernel("TLB/TSB error", regs);
+}
+
 unsigned long sun4v_err_itlb_vaddr;
 unsigned long sun4v_err_itlb_ctx;
 unsigned long sun4v_err_itlb_pte;
@@ -2111,8 +2116,7 @@ unsigned long sun4v_err_itlb_error;
 
 void sun4v_itlb_error_report(struct pt_regs *regs, int tl)
 {
-	if (tl > 1)
-		dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
+	dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
 
 	printk(KERN_EMERG "SUN4V-ITLB: Error at TPC[%lx], tl %d\n",
 	       regs->tpc, tl);
@@ -2125,7 +2129,7 @@ void sun4v_itlb_error_report(struct pt_regs *regs, int tl)
 	       sun4v_err_itlb_vaddr, sun4v_err_itlb_ctx,
 	       sun4v_err_itlb_pte, sun4v_err_itlb_error);
 
-	prom_halt();
+	sun4v_tlb_error(regs);
 }
 
 unsigned long sun4v_err_dtlb_vaddr;
@@ -2135,8 +2139,7 @@ unsigned long sun4v_err_dtlb_error;
 
 void sun4v_dtlb_error_report(struct pt_regs *regs, int tl)
 {
-	if (tl > 1)
-		dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
+	dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
 
 	printk(KERN_EMERG "SUN4V-DTLB: Error at TPC[%lx], tl %d\n",
 	       regs->tpc, tl);
@@ -2149,7 +2152,7 @@ void sun4v_dtlb_error_report(struct pt_regs *regs, int tl)
 	       sun4v_err_dtlb_vaddr, sun4v_err_dtlb_ctx,
 	       sun4v_err_dtlb_pte, sun4v_err_dtlb_error);
 
-	prom_halt();
+	sun4v_tlb_error(regs);
 }
 
 void hypervisor_tlbop_error(unsigned long err, unsigned long op)
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 587cd05..18fcd71 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -346,6 +346,9 @@ retry:
 		down_read(&mm->mmap_sem);
 	}
 
+	if (fault_code & FAULT_CODE_BAD_RA)
+		goto do_sigbus;
+
 	vma = find_vma(mm, address);
 	if (!vma)
 		goto bad_area;
-- 
cgit v0.10.2


From 3dee9df54836d5f844f3d58281d3f3e6331b467f Mon Sep 17 00:00:00 2001
From: bob picco <bpicco@meloft.net>
Date: Tue, 16 Sep 2014 09:28:15 -0400
Subject: sparc64: find_node adjustment

We have seen an issue with guest boot into LDOM that causes early boot failures
because of no matching rules for node identitity of the memory. I analyzed this
on my T4 and concluded there might not be a solution. I saw the issue in
mainline too when booting into the control/primary domain - with guests
configured.  Note, this could be a firmware bug on some older machines.

I'll provide a full explanation of the issues below. Should we not find a
matching BEST latency group for a real address (RA) then we will assume node 0.
On the T4-2 here with the information provided I can't see an alternative.

Technically the LDOM shown below should match the MBLOCK to the
favorable latency group. However other factors must be considered too. Were
the memory controllers configured "fine" grained interleave or "coarse"
grain interleaved -  T4. Also should a "group" MD node be considered a NUMA
node?

There has to be at least one Machine Description (MD) "group" and hence one
NUMA node. The group can have one or more latency groups (lg) - more than one
memory controller. The current code chooses the smallest latency as the most
favorable per group. The latency and lg information is in MLGROUP below.
MBLOCK is the base and size of the RAs for the machine as fetched from OBP
/memory "available" property. My machine has one MBLOCK but more would be
possible - with holes?

For a T4-2 the following information has been gathered:
with LDOM guest
MEMBLOCK configuration:
 memory size = 0x27f870000
 memory.cnt  = 0x3
 memory[0x0]    [0x00000020400000-0x0000029fc67fff], 0x27f868000 bytes
 memory[0x1]    [0x0000029fd8a000-0x0000029fd8bfff], 0x2000 bytes
 memory[0x2]    [0x0000029fd92000-0x0000029fd97fff], 0x6000 bytes
 reserved.cnt  = 0x2
 reserved[0x0]  [0x00000020800000-0x000000216c15c0], 0xec15c1 bytes
 reserved[0x1]  [0x00000024800000-0x0000002c180c1e], 0x7980c1f bytes
MBLOCK[0]: base[20000000] size[280000000] offset[0]
(note: "base" and "size" reported in "MBLOCK" encompass the "memory[X]" values)
(note: (RA + offset) & mask = val is the formula to detect a match for the
memory controller. should there be no match for find_node node, a return
value of -1 resulted for the node - BAD)

There is one group. It has these forward links
MLGROUP[1]: node[545] latency[1f7e8] match[200000000] mask[200000000]
MLGROUP[2]: node[54d] latency[2de60] match[0] mask[200000000]
NUMA NODE[0]: node[545] mask[200000000] val[200000000] (latency[1f7e8])
(note: "val" is the best lg's (smallest latency) "match")

no LDOM guest - bare metal
MEMBLOCK configuration:
 memory size = 0xfdf2d0000
 memory.cnt  = 0x3
 memory[0x0]    [0x00000020400000-0x00000fff6adfff], 0xfdf2ae000 bytes
 memory[0x1]    [0x00000fff6d2000-0x00000fff6e7fff], 0x16000 bytes
 memory[0x2]    [0x00000fff766000-0x00000fff771fff], 0xc000 bytes
 reserved.cnt  = 0x2
 reserved[0x0]  [0x00000020800000-0x00000021a04580], 0x1204581 bytes
 reserved[0x1]  [0x00000024800000-0x0000002c7d29fc], 0x7fd29fd bytes
MBLOCK[0]: base[20000000] size[fe0000000] offset[0]

there are two groups
group node[16d5]
MLGROUP[0]: node[1765] latency[1f7e8] match[0] mask[200000000]
MLGROUP[3]: node[177d] latency[2de60] match[200000000] mask[200000000]
NUMA NODE[0]: node[1765] mask[200000000] val[0] (latency[1f7e8])
group node[171d]
MLGROUP[2]: node[1775] latency[2de60] match[0] mask[200000000]
MLGROUP[1]: node[176d] latency[1f7e8] match[200000000] mask[200000000]
NUMA NODE[1]: node[176d] mask[200000000] val[200000000] (latency[1f7e8])
(note: for this two "group" bare metal machine, 1/2 memory is in group one's
lg and 1/2 memory is in group two's lg).

Cc: sparclinux@vger.kernel.org
Signed-off-by: Bob Picco <bob.picco@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 98ac8e8..3b04505 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -840,7 +840,10 @@ static int find_node(unsigned long addr)
 		if ((addr & p->mask) == p->val)
 			return i;
 	}
-	return -1;
+	/* The following condition has been observed on LDOM guests.*/
+	WARN_ONCE(1, "find_node: A physical address doesn't match a NUMA node"
+		" rule. Some physical memory will be owned by node 0.");
+	return 0;
 }
 
 static u64 memblock_nid_range(u64 start, u64 end, int *nid)
-- 
cgit v0.10.2


From 7c21d533ab2ffa1e681bdaf4a53ce3046f6e0e17 Mon Sep 17 00:00:00 2001
From: bob picco <bpicco@meloft.net>
Date: Tue, 16 Sep 2014 09:29:54 -0400
Subject: sparc64: mem boot option correction

The "mem" boot option can result in many unexpected consequences. This patch
attempts to prevent boot hangs which have been experienced on T4-4 and T5-8.
Basically the boot loader allocates vmlinuz and initrd higher in available
OBP physical memory. For example, on a 2Tb T5-8 it isn't possible to boot
with mem=20G.

The patch utilizes memblock to avoid reserved regions and trim memory which
is only free. Other improvements are possible for a multi-node machine.

This is a snippet of the boot log with mem=20G on T5-8 with the patch applied:
MEMBLOCK configuration:	<- before memory reduction
 memory size = 0x1ffad6ce000 reserved size = 0xa1adf44
 memory.cnt  = 0xb
 memory[0x0]    [0x00000030400000-0x00003fdde47fff], 0x3fada48000 bytes
 memory[0x1]    [0x00003fdde4e000-0x00003fdde4ffff], 0x2000 bytes
 memory[0x2]    [0x00080000000000-0x00083fffffffff], 0x4000000000 bytes
 memory[0x3]    [0x00100000000000-0x00103fffffffff], 0x4000000000 bytes
 memory[0x4]    [0x00180000000000-0x00183fffffffff], 0x4000000000 bytes
 memory[0x5]    [0x00200000000000-0x00203fffffffff], 0x4000000000 bytes
 memory[0x6]    [0x00280000000000-0x00283fffffffff], 0x4000000000 bytes
 memory[0x7]    [0x00300000000000-0x00303fffffffff], 0x4000000000 bytes
 memory[0x8]    [0x00380000000000-0x00383fffc71fff], 0x3fffc72000 bytes
 memory[0x9]    [0x00383fffc92000-0x00383fffca1fff], 0x10000 bytes
 memory[0xa]    [0x00383fffcb4000-0x00383fffcb5fff], 0x2000 bytes
 reserved.cnt  = 0x2
 reserved[0x0]  [0x00380000000000-0x0038000117e7f8], 0x117e7f9 bytes
 reserved[0x1]  [0x00380004000000-0x0038000d02f74a], 0x902f74b bytes
...
MEMBLOCK configuration:	<- after reduction of memory
 memory size = 0x50a1adf44 reserved size = 0xa1adf44
 memory.cnt  = 0x4
 memory[0x0]    [0x00380000000000-0x0038000117e7f8], 0x117e7f9 bytes
 memory[0x1]    [0x00380004000000-0x0038050d01d74a], 0x50901d74b bytes
 memory[0x2]    [0x00383fffc92000-0x00383fffca1fff], 0x10000 bytes
 memory[0x3]    [0x00383fffcb4000-0x00383fffcb5fff], 0x2000 bytes
 reserved.cnt  = 0x2
 reserved[0x0]  [0x00380000000000-0x0038000117e7f8], 0x117e7f9 bytes
 reserved[0x1]  [0x00380004000000-0x0038000d02f74a], 0x902f74b bytes
...
Early memory node ranges
  node   7: [mem 0x380000000000-0x38000117dfff]
  node   7: [mem 0x380004000000-0x380f0d01bfff]
  node   7: [mem 0x383fffc92000-0x383fffca1fff]
  node   7: [mem 0x383fffcb4000-0x383fffcb5fff]
Could not find start_pfn for node 0
Could not find start_pfn for node 1
Could not find start_pfn for node 2
Could not find start_pfn for node 3
Could not find start_pfn for node 4
Could not find start_pfn for node 5
Could not find start_pfn for node 6
.

The patch was tested on T4-1, T5-8 and Jalap?no.

Cc: sparclinux@vger.kernel.org
Signed-off-by: Bob Picco <bob.picco@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/kernel/setup_64.c b/arch/sparc/kernel/setup_64.c
index 1c7bfdf..e629b83 100644
--- a/arch/sparc/kernel/setup_64.c
+++ b/arch/sparc/kernel/setup_64.c
@@ -141,21 +141,9 @@ static void __init boot_flags_init(char *commands)
 				process_switch(*commands++);
 			continue;
 		}
-		if (!strncmp(commands, "mem=", 4)) {
-			/*
-			 * "mem=XXX[kKmM]" overrides the PROM-reported
-			 * memory size.
-			 */
-			cmdline_memory_size = simple_strtoul(commands + 4,
-							     &commands, 0);
-			if (*commands == 'K' || *commands == 'k') {
-				cmdline_memory_size <<= 10;
-				commands++;
-			} else if (*commands=='M' || *commands=='m') {
-				cmdline_memory_size <<= 20;
-				commands++;
-			}
-		}
+		if (!strncmp(commands, "mem=", 4))
+			cmdline_memory_size = memparse(commands + 4, &commands);
+
 		while (*commands && *commands != ' ')
 			commands++;
 	}
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 3b04505..c8bccaf 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1861,6 +1861,52 @@ pgd_t swapper_pg_dir[PTRS_PER_PGD];
 static void sun4u_pgprot_init(void);
 static void sun4v_pgprot_init(void);
 
+static phys_addr_t __init available_memory(void)
+{
+	phys_addr_t available = 0ULL;
+	phys_addr_t pa_start, pa_end;
+	u64 i;
+
+	for_each_free_mem_range(i, NUMA_NO_NODE, &pa_start, &pa_end, NULL)
+		available = available + (pa_end  - pa_start);
+
+	return available;
+}
+
+/* We need to exclude reserved regions. This exclusion will include
+ * vmlinux and initrd. To be more precise the initrd size could be used to
+ * compute a new lower limit because it is freed later during initialization.
+ */
+static void __init reduce_memory(phys_addr_t limit_ram)
+{
+	phys_addr_t avail_ram = available_memory();
+	phys_addr_t pa_start, pa_end;
+	u64 i;
+
+	if (limit_ram >= avail_ram)
+		return;
+
+	for_each_free_mem_range(i, NUMA_NO_NODE, &pa_start, &pa_end, NULL) {
+		phys_addr_t region_size = pa_end - pa_start;
+		phys_addr_t clip_start = pa_start;
+
+		avail_ram = avail_ram - region_size;
+		/* Are we consuming too much? */
+		if (avail_ram < limit_ram) {
+			phys_addr_t give_back = limit_ram - avail_ram;
+
+			region_size = region_size - give_back;
+			clip_start = clip_start + give_back;
+		}
+
+		memblock_remove(clip_start, region_size);
+
+		if (avail_ram <= limit_ram)
+			break;
+		i = 0UL;
+	}
+}
+
 void __init paging_init(void)
 {
 	unsigned long end_pfn, shift, phys_base;
@@ -1940,7 +1986,8 @@ void __init paging_init(void)
 
 	find_ramdisk(phys_base);
 
-	memblock_enforce_memory_limit(cmdline_memory_size);
+	if (cmdline_memory_size)
+		reduce_memory(cmdline_memory_size);
 
 	memblock_allow_resize();
 	memblock_dump_all();
-- 
cgit v0.10.2


From 05aa1651e8b9ca078b1808a2fe7b50703353ec02 Mon Sep 17 00:00:00 2001
From: bob picco <bpicco@meloft.net>
Date: Tue, 16 Sep 2014 10:09:06 -0400
Subject: sparc64: T5 PMU

The T5 (niagara5) has different PCR related HV fast trap values and a new
HV API Group. This patch utilizes these and shares when possible with niagara4.

We use the same sparc_pmu niagara4_pmu. Should there be new effort to
obtain the MCU perf statistics then this would have to be changed.

Cc: sparclinux@vger.kernel.org
Signed-off-by: Bob Picco <bob.picco@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/include/asm/hypervisor.h b/arch/sparc/include/asm/hypervisor.h
index 94b39ca..4f6725f 100644
--- a/arch/sparc/include/asm/hypervisor.h
+++ b/arch/sparc/include/asm/hypervisor.h
@@ -2947,6 +2947,16 @@ unsigned long sun4v_vt_set_perfreg(unsigned long reg_num,
 				   unsigned long reg_val);
 #endif
 
+#define	HV_FAST_T5_GET_PERFREG		0x1a8
+#define	HV_FAST_T5_SET_PERFREG		0x1a9
+
+#ifndef	__ASSEMBLY__
+unsigned long sun4v_t5_get_perfreg(unsigned long reg_num,
+				   unsigned long *reg_val);
+unsigned long sun4v_t5_set_perfreg(unsigned long reg_num,
+				   unsigned long reg_val);
+#endif
+
 /* Function numbers for HV_CORE_TRAP.  */
 #define HV_CORE_SET_VER			0x00
 #define HV_CORE_PUTCHAR			0x01
@@ -2978,6 +2988,7 @@ unsigned long sun4v_vt_set_perfreg(unsigned long reg_num,
 #define HV_GRP_VF_CPU			0x0205
 #define HV_GRP_KT_CPU			0x0209
 #define HV_GRP_VT_CPU			0x020c
+#define HV_GRP_T5_CPU			0x0211
 #define HV_GRP_DIAG			0x0300
 
 #ifndef __ASSEMBLY__
diff --git a/arch/sparc/kernel/hvapi.c b/arch/sparc/kernel/hvapi.c
index c0a2de0..5c55145 100644
--- a/arch/sparc/kernel/hvapi.c
+++ b/arch/sparc/kernel/hvapi.c
@@ -46,6 +46,7 @@ static struct api_info api_table[] = {
 	{ .group = HV_GRP_VF_CPU,				},
 	{ .group = HV_GRP_KT_CPU,				},
 	{ .group = HV_GRP_VT_CPU,				},
+	{ .group = HV_GRP_T5_CPU,				},
 	{ .group = HV_GRP_DIAG,		.flags = FLAG_PRE_API	},
 };
 
diff --git a/arch/sparc/kernel/hvcalls.S b/arch/sparc/kernel/hvcalls.S
index f3ab509..caedf83 100644
--- a/arch/sparc/kernel/hvcalls.S
+++ b/arch/sparc/kernel/hvcalls.S
@@ -821,3 +821,19 @@ ENTRY(sun4v_vt_set_perfreg)
 	retl
 	 nop
 ENDPROC(sun4v_vt_set_perfreg)
+
+ENTRY(sun4v_t5_get_perfreg)
+	mov	%o1, %o4
+	mov	HV_FAST_T5_GET_PERFREG, %o5
+	ta	HV_FAST_TRAP
+	stx	%o1, [%o4]
+	retl
+	 nop
+ENDPROC(sun4v_t5_get_perfreg)
+
+ENTRY(sun4v_t5_set_perfreg)
+	mov	HV_FAST_T5_SET_PERFREG, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 nop
+ENDPROC(sun4v_t5_set_perfreg)
diff --git a/arch/sparc/kernel/pcr.c b/arch/sparc/kernel/pcr.c
index 269af58..7e967c8 100644
--- a/arch/sparc/kernel/pcr.c
+++ b/arch/sparc/kernel/pcr.c
@@ -191,12 +191,41 @@ static const struct pcr_ops n4_pcr_ops = {
 	.pcr_nmi_disable	= PCR_N4_PICNPT,
 };
 
+static u64 n5_pcr_read(unsigned long reg_num)
+{
+	unsigned long val;
+
+	(void) sun4v_t5_get_perfreg(reg_num, &val);
+
+	return val;
+}
+
+static void n5_pcr_write(unsigned long reg_num, u64 val)
+{
+	(void) sun4v_t5_set_perfreg(reg_num, val);
+}
+
+static const struct pcr_ops n5_pcr_ops = {
+	.read_pcr		= n5_pcr_read,
+	.write_pcr		= n5_pcr_write,
+	.read_pic		= n4_pic_read,
+	.write_pic		= n4_pic_write,
+	.nmi_picl_value		= n4_picl_value,
+	.pcr_nmi_enable		= (PCR_N4_PICNPT | PCR_N4_STRACE |
+				   PCR_N4_UTRACE | PCR_N4_TOE |
+				   (26 << PCR_N4_SL_SHIFT)),
+	.pcr_nmi_disable	= PCR_N4_PICNPT,
+};
+
+
 static unsigned long perf_hsvc_group;
 static unsigned long perf_hsvc_major;
 static unsigned long perf_hsvc_minor;
 
 static int __init register_perf_hsvc(void)
 {
+	unsigned long hverror;
+
 	if (tlb_type == hypervisor) {
 		switch (sun4v_chip_type) {
 		case SUN4V_CHIP_NIAGARA1:
@@ -215,6 +244,10 @@ static int __init register_perf_hsvc(void)
 			perf_hsvc_group = HV_GRP_VT_CPU;
 			break;
 
+		case SUN4V_CHIP_NIAGARA5:
+			perf_hsvc_group = HV_GRP_T5_CPU;
+			break;
+
 		default:
 			return -ENODEV;
 		}
@@ -222,10 +255,12 @@ static int __init register_perf_hsvc(void)
 
 		perf_hsvc_major = 1;
 		perf_hsvc_minor = 0;
-		if (sun4v_hvapi_register(perf_hsvc_group,
-					 perf_hsvc_major,
-					 &perf_hsvc_minor)) {
-			printk("perfmon: Could not register hvapi.\n");
+		hverror = sun4v_hvapi_register(perf_hsvc_group,
+					       perf_hsvc_major,
+					       &perf_hsvc_minor);
+		if (hverror) {
+			pr_err("perfmon: Could not register hvapi(0x%lx).\n",
+			       hverror);
 			return -ENODEV;
 		}
 	}
@@ -254,6 +289,10 @@ static int __init setup_sun4v_pcr_ops(void)
 		pcr_ops = &n4_pcr_ops;
 		break;
 
+	case SUN4V_CHIP_NIAGARA5:
+		pcr_ops = &n5_pcr_ops;
+		break;
+
 	default:
 		ret = -ENODEV;
 		break;
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index d35c490..c9759ad 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1662,7 +1662,8 @@ static bool __init supported_pmu(void)
 		sparc_pmu = &niagara2_pmu;
 		return true;
 	}
-	if (!strcmp(sparc_pmu_type, "niagara4")) {
+	if (!strcmp(sparc_pmu_type, "niagara4") ||
+	    !strcmp(sparc_pmu_type, "niagara5")) {
 		sparc_pmu = &niagara4_pmu;
 		return true;
 	}
-- 
cgit v0.10.2


From c21c4ab0d6921f7160a43216fa6973b5924de561 Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Tue, 16 Sep 2014 11:37:08 -0400
Subject: sparc64: Move request_irq() from ldc_bind() to ldc_alloc()

The request_irq() needs to be done from ldc_alloc()
to avoid the following (caught by lockdep)

 [00000000004a0738] __might_sleep+0xf8/0x120
 [000000000058bea4] kmem_cache_alloc_trace+0x184/0x2c0
 [00000000004faf80] request_threaded_irq+0x80/0x160
 [000000000044f71c] ldc_bind+0x7c/0x220
 [0000000000452454] vio_port_up+0x54/0xe0
 [00000000101f6778] probe_disk+0x38/0x220 [sunvdc]
 [00000000101f6b8c] vdc_port_probe+0x22c/0x300 [sunvdc]
 [0000000000451a88] vio_device_probe+0x48/0x60
 [000000000074c56c] really_probe+0x6c/0x300
 [000000000074c83c] driver_probe_device+0x3c/0xa0
 [000000000074c92c] __driver_attach+0x8c/0xa0
 [000000000074a6ec] bus_for_each_dev+0x6c/0xa0
 [000000000074c1dc] driver_attach+0x1c/0x40
 [000000000074b0fc] bus_add_driver+0xbc/0x280

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Dwight Engen <dwight.engen@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/include/asm/ldc.h b/arch/sparc/include/asm/ldc.h
index c8c67f6..58ab64d 100644
--- a/arch/sparc/include/asm/ldc.h
+++ b/arch/sparc/include/asm/ldc.h
@@ -53,13 +53,14 @@ struct ldc_channel;
 /* Allocate state for a channel.  */
 struct ldc_channel *ldc_alloc(unsigned long id,
 			      const struct ldc_channel_config *cfgp,
-			      void *event_arg);
+			      void *event_arg,
+			      const char *name);
 
 /* Shut down and free state for a channel.  */
 void ldc_free(struct ldc_channel *lp);
 
 /* Register TX and RX queues of the link with the hypervisor.  */
-int ldc_bind(struct ldc_channel *lp, const char *name);
+int ldc_bind(struct ldc_channel *lp);
 
 /* For non-RAW protocols we need to complete a handshake before
  * communication can proceed.  ldc_connect() does that, if the
diff --git a/arch/sparc/kernel/ds.c b/arch/sparc/kernel/ds.c
index dff60ab..f87a55d 100644
--- a/arch/sparc/kernel/ds.c
+++ b/arch/sparc/kernel/ds.c
@@ -1200,14 +1200,14 @@ static int ds_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 	ds_cfg.tx_irq = vdev->tx_irq;
 	ds_cfg.rx_irq = vdev->rx_irq;
 
-	lp = ldc_alloc(vdev->channel_id, &ds_cfg, dp);
+	lp = ldc_alloc(vdev->channel_id, &ds_cfg, dp, "DS");
 	if (IS_ERR(lp)) {
 		err = PTR_ERR(lp);
 		goto out_free_ds_states;
 	}
 	dp->lp = lp;
 
-	err = ldc_bind(lp, "DS");
+	err = ldc_bind(lp);
 	if (err)
 		goto out_free_ldc;
 
diff --git a/arch/sparc/kernel/ldc.c b/arch/sparc/kernel/ldc.c
index 66dacd5..27bb554 100644
--- a/arch/sparc/kernel/ldc.c
+++ b/arch/sparc/kernel/ldc.c
@@ -1078,7 +1078,8 @@ static void ldc_iommu_release(struct ldc_channel *lp)
 
 struct ldc_channel *ldc_alloc(unsigned long id,
 			      const struct ldc_channel_config *cfgp,
-			      void *event_arg)
+			      void *event_arg,
+			      const char *name)
 {
 	struct ldc_channel *lp;
 	const struct ldc_mode_ops *mops;
@@ -1093,6 +1094,8 @@ struct ldc_channel *ldc_alloc(unsigned long id,
 	err = -EINVAL;
 	if (!cfgp)
 		goto out_err;
+	if (!name)
+		goto out_err;
 
 	switch (cfgp->mode) {
 	case LDC_MODE_RAW:
@@ -1185,6 +1188,21 @@ struct ldc_channel *ldc_alloc(unsigned long id,
 
 	INIT_HLIST_HEAD(&lp->mh_list);
 
+	snprintf(lp->rx_irq_name, LDC_IRQ_NAME_MAX, "%s RX", name);
+	snprintf(lp->tx_irq_name, LDC_IRQ_NAME_MAX, "%s TX", name);
+
+	err = request_irq(lp->cfg.rx_irq, ldc_rx, 0,
+			  lp->rx_irq_name, lp);
+	if (err)
+		goto out_free_txq;
+
+	err = request_irq(lp->cfg.tx_irq, ldc_tx, 0,
+			  lp->tx_irq_name, lp);
+	if (err) {
+		free_irq(lp->cfg.rx_irq, lp);
+		goto out_free_txq;
+	}
+
 	return lp;
 
 out_free_txq:
@@ -1237,31 +1255,14 @@ EXPORT_SYMBOL(ldc_free);
  * state.  This does not initiate a handshake, ldc_connect() does
  * that.
  */
-int ldc_bind(struct ldc_channel *lp, const char *name)
+int ldc_bind(struct ldc_channel *lp)
 {
 	unsigned long hv_err, flags;
 	int err = -EINVAL;
 
-	if (!name ||
-	    (lp->state != LDC_STATE_INIT))
+	if (lp->state != LDC_STATE_INIT)
 		return -EINVAL;
 
-	snprintf(lp->rx_irq_name, LDC_IRQ_NAME_MAX, "%s RX", name);
-	snprintf(lp->tx_irq_name, LDC_IRQ_NAME_MAX, "%s TX", name);
-
-	err = request_irq(lp->cfg.rx_irq, ldc_rx, 0,
-			  lp->rx_irq_name, lp);
-	if (err)
-		return err;
-
-	err = request_irq(lp->cfg.tx_irq, ldc_tx, 0,
-			  lp->tx_irq_name, lp);
-	if (err) {
-		free_irq(lp->cfg.rx_irq, lp);
-		return err;
-	}
-
-
 	spin_lock_irqsave(&lp->lock, flags);
 
 	enable_irq(lp->cfg.rx_irq);
diff --git a/arch/sparc/kernel/viohs.c b/arch/sparc/kernel/viohs.c
index f8e7dd5..9c5fbd0 100644
--- a/arch/sparc/kernel/viohs.c
+++ b/arch/sparc/kernel/viohs.c
@@ -714,7 +714,7 @@ int vio_ldc_alloc(struct vio_driver_state *vio,
 	cfg.tx_irq = vio->vdev->tx_irq;
 	cfg.rx_irq = vio->vdev->rx_irq;
 
-	lp = ldc_alloc(vio->vdev->channel_id, &cfg, event_arg);
+	lp = ldc_alloc(vio->vdev->channel_id, &cfg, event_arg, vio->name);
 	if (IS_ERR(lp))
 		return PTR_ERR(lp);
 
@@ -746,7 +746,7 @@ void vio_port_up(struct vio_driver_state *vio)
 
 	err = 0;
 	if (state == LDC_STATE_INIT) {
-		err = ldc_bind(vio->lp, vio->name);
+		err = ldc_bind(vio->lp);
 		if (err)
 			printk(KERN_WARNING "%s: Port %lu bind failed, "
 			       "err=%d\n",
-- 
cgit v0.10.2


From 9d0713edf72461438bc3526e4ea55fec47754cd9 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 24 Sep 2014 21:05:30 -0700
Subject: sparc64: Fix hibernation code refrence to PAGE_OFFSET.

We changed PAGE_OFFSET to be a variable rather than a constant,
but this reference here in the hibernate assembler got missed.

Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/power/hibernate_asm.S b/arch/sparc/power/hibernate_asm.S
index 7994216..d7d9017 100644
--- a/arch/sparc/power/hibernate_asm.S
+++ b/arch/sparc/power/hibernate_asm.S
@@ -54,8 +54,8 @@ ENTRY(swsusp_arch_resume)
 	 nop
 
 	/* Write PAGE_OFFSET to %g7 */
-	sethi	%uhi(PAGE_OFFSET), %g7
-	sllx	%g7, 32, %g7
+	sethi	%hi(PAGE_OFFSET), %g7
+	ldx	[%g7 + %lo(PAGE_OFFSET)], %g7
 
 	setuw	(PAGE_SIZE-8), %g3
 
-- 
cgit v0.10.2


From 163a4e7473061388bba0899a1a063bae44e1715a Mon Sep 17 00:00:00 2001
From: David L Stevens <david.stevens@oracle.com>
Date: Mon, 29 Sep 2014 19:47:59 -0400
Subject: sparc: VIO protocol version 1.6

Add VIO protocol version 1.6 interfaces.

Signed-off-by: David L Stevens <david.stevens@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/include/asm/vio.h b/arch/sparc/include/asm/vio.h
index e0f6c39..6b135a8 100644
--- a/arch/sparc/include/asm/vio.h
+++ b/arch/sparc/include/asm/vio.h
@@ -65,6 +65,7 @@ struct vio_dring_register {
 	u16			options;
 #define VIO_TX_DRING		0x0001
 #define VIO_RX_DRING		0x0002
+#define VIO_RX_DRING_DATA	0x0004
 	u16			resv;
 	u32			num_cookies;
 	struct ldc_trans_cookie	cookies[0];
@@ -80,6 +81,8 @@ struct vio_dring_unregister {
 #define VIO_PKT_MODE		0x01 /* Packet based transfer	*/
 #define VIO_DESC_MODE		0x02 /* In-band descriptors	*/
 #define VIO_DRING_MODE		0x03 /* Descriptor rings	*/
+/* in vers >= 1.2, VIO_DRING_MODE is 0x04 and transfer mode is a bitmask */
+#define VIO_NEW_DRING_MODE	0x04
 
 struct vio_dring_data {
 	struct vio_msg_tag	tag;
@@ -205,10 +208,20 @@ struct vio_net_attr_info {
 	u8			addr_type;
 #define VNET_ADDR_ETHERMAC	0x01
 	u16			ack_freq;
-	u32			resv1;
+	u8			plnk_updt;
+#define PHYSLINK_UPDATE_NONE		0x00
+#define PHYSLINK_UPDATE_STATE		0x01
+#define PHYSLINK_UPDATE_STATE_ACK	0x02
+#define PHYSLINK_UPDATE_STATE_NACK	0x03
+	u8			options;
+	u16			resv1;
 	u64			addr;
 	u64			mtu;
-	u64			resv2[3];
+	u16			cflags;
+#define VNET_LSO_IPV4_CAPAB		0x0001
+	u16			ipv4_lso_maxlen;
+	u32			resv2;
+	u64			resv3[2];
 };
 
 #define VNET_NUM_MCAST		7
@@ -366,6 +379,33 @@ struct vio_driver_state {
 	struct vio_driver_ops	*ops;
 };
 
+static inline bool vio_version_before(struct vio_driver_state *vio,
+				      u16 major, u16 minor)
+{
+	u32 have = (u32)vio->ver.major << 16 | vio->ver.minor;
+	u32 want = (u32)major << 16 | minor;
+
+	return have < want;
+}
+
+static inline bool vio_version_after(struct vio_driver_state *vio,
+				      u16 major, u16 minor)
+{
+	u32 have = (u32)vio->ver.major << 16 | vio->ver.minor;
+	u32 want = (u32)major << 16 | minor;
+
+	return have > want;
+}
+
+static inline bool vio_version_after_eq(struct vio_driver_state *vio,
+					u16 major, u16 minor)
+{
+	u32 have = (u32)vio->ver.major << 16 | vio->ver.minor;
+	u32 want = (u32)major << 16 | minor;
+
+	return have >= want;
+}
+
 #define viodbg(TYPE, f, a...) \
 do {	if (vio->debug & VIO_DEBUG_##TYPE) \
 		printk(KERN_INFO "vio: ID[%lu] " f, \
diff --git a/arch/sparc/kernel/viohs.c b/arch/sparc/kernel/viohs.c
index 9c5fbd0..526fcb5 100644
--- a/arch/sparc/kernel/viohs.c
+++ b/arch/sparc/kernel/viohs.c
@@ -426,6 +426,13 @@ static int process_dreg_info(struct vio_driver_state *vio,
 	if (vio->dr_state & VIO_DR_STATE_RXREG)
 		goto send_nack;
 
+	/* v1.6 and higher, ACK with desired, supported mode, or NACK */
+	if (vio_version_after_eq(vio, 1, 6)) {
+		if (!(pkt->options & VIO_TX_DRING))
+			goto send_nack;
+		pkt->options = VIO_TX_DRING;
+	}
+
 	BUG_ON(vio->desc_buf);
 
 	vio->desc_buf = kzalloc(pkt->descr_size, GFP_ATOMIC);
@@ -453,8 +460,11 @@ static int process_dreg_info(struct vio_driver_state *vio,
 	pkt->tag.stype = VIO_SUBTYPE_ACK;
 	pkt->dring_ident = ++dr->ident;
 
-	viodbg(HS, "SEND DRING_REG ACK ident[%llx]\n",
-	       (unsigned long long) pkt->dring_ident);
+	viodbg(HS, "SEND DRING_REG ACK ident[%llx] "
+	       "ndesc[%u] dsz[%u] opt[0x%x] ncookies[%u]\n",
+	       (unsigned long long) pkt->dring_ident,
+	       pkt->num_descr, pkt->descr_size, pkt->options,
+	       pkt->num_cookies);
 
 	len = (sizeof(*pkt) +
 	       (dr->ncookies * sizeof(struct ldc_trans_cookie)));
-- 
cgit v0.10.2


From 9bce21828d54a95143f1b74619705c2dd8e88b92 Mon Sep 17 00:00:00 2001
From: Allen Pais <allen.pais@oracle.com>
Date: Fri, 19 Sep 2014 09:42:14 -0400
Subject: sunvdc: add cdrom and v1.1 protocol support

Interpret the media type from v1.1 protocol to support CDROM/DVD.

For v1.0 protocol, a disk's size continues to be calculated from the
geometry returned by the vdisk server. The geometry returned by the server
can be less than the actual number of sectors available in the backing
image/device due to the rounding in the division used to compute the
geometry in the vdisk server.

In v1.1 protocol a disk's actual size in sectors is returned during the
handshake. Use this size when v1.1 protocol is negotiated. Since this size
will always be larger than the former geometry computed size, disks created
under v1.0 will be forwards compatible to v1.1, but not vice versa.

Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/include/asm/vio.h b/arch/sparc/include/asm/vio.h
index 6b135a8..94a02db 100644
--- a/arch/sparc/include/asm/vio.h
+++ b/arch/sparc/include/asm/vio.h
@@ -121,12 +121,18 @@ struct vio_disk_attr_info {
 	u8			vdisk_type;
 #define VD_DISK_TYPE_SLICE	0x01 /* Slice in block device	*/
 #define VD_DISK_TYPE_DISK	0x02 /* Entire block device	*/
-	u16			resv1;
+	u8			vdisk_mtype;		/* v1.1 */
+#define VD_MEDIA_TYPE_FIXED	0x01 /* Fixed device */
+#define VD_MEDIA_TYPE_CD	0x02 /* CD Device    */
+#define VD_MEDIA_TYPE_DVD	0x03 /* DVD Device   */
+	u8			resv1;
 	u32			vdisk_block_size;
 	u64			operations;
-	u64			vdisk_size;
+	u64			vdisk_size;		/* v1.1 */
 	u64			max_xfer_size;
-	u64			resv2[2];
+	u32			phys_block_size;	/* v1.2 */
+	u32			resv2;
+	u64			resv3[1];
 };
 
 struct vio_disk_desc {
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 5814deb..66ddf70 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -9,6 +9,7 @@
 #include <linux/blkdev.h>
 #include <linux/hdreg.h>
 #include <linux/genhd.h>
+#include <linux/cdrom.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
@@ -22,8 +23,8 @@
 
 #define DRV_MODULE_NAME		"sunvdc"
 #define PFX DRV_MODULE_NAME	": "
-#define DRV_MODULE_VERSION	"1.0"
-#define DRV_MODULE_RELDATE	"June 25, 2007"
+#define DRV_MODULE_VERSION	"1.1"
+#define DRV_MODULE_RELDATE	"February 13, 2013"
 
 static char version[] =
 	DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
@@ -65,6 +66,7 @@ struct vdc_port {
 	u64			operations;
 	u32			vdisk_size;
 	u8			vdisk_type;
+	u8			vdisk_mtype;
 
 	char			disk_name[32];
 
@@ -79,9 +81,16 @@ static inline struct vdc_port *to_vdc_port(struct vio_driver_state *vio)
 
 /* Ordered from largest major to lowest */
 static struct vio_version vdc_versions[] = {
+	{ .major = 1, .minor = 1 },
 	{ .major = 1, .minor = 0 },
 };
 
+static inline int vdc_version_supported(struct vdc_port *port,
+					u16 major, u16 minor)
+{
+	return port->vio.ver.major == major && port->vio.ver.minor >= minor;
+}
+
 #define VDCBLK_NAME	"vdisk"
 static int vdc_major;
 #define PARTITION_SHIFT	3
@@ -103,9 +112,41 @@ static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
+/* Add ioctl/CDROM_GET_CAPABILITY to support cdrom_id in udev
+ * when vdisk_mtype is VD_MEDIA_TYPE_CD or VD_MEDIA_TYPE_DVD.
+ * Needed to be able to install inside an ldom from an iso image.
+ */
+static int vdc_ioctl(struct block_device *bdev, fmode_t mode,
+		     unsigned command, unsigned long argument)
+{
+	int i;
+	struct gendisk *disk;
+
+	switch (command) {
+	case CDROMMULTISESSION:
+		pr_debug(PFX "Multisession CDs not supported\n");
+		for (i = 0; i < sizeof(struct cdrom_multisession); i++)
+			if (put_user(0, (char __user *)(argument + i)))
+				return -EFAULT;
+		return 0;
+
+	case CDROM_GET_CAPABILITY:
+		disk = bdev->bd_disk;
+
+		if (bdev->bd_disk && (disk->flags & GENHD_FL_CD))
+			return 0;
+		return -EINVAL;
+
+	default:
+		pr_debug(PFX "ioctl %08x not supported\n", command);
+		return -EINVAL;
+	}
+}
+
 static const struct block_device_operations vdc_fops = {
 	.owner		= THIS_MODULE,
 	.getgeo		= vdc_getgeo,
+	.ioctl		= vdc_ioctl,
 };
 
 static void vdc_finish(struct vio_driver_state *vio, int err, int waiting_for)
@@ -165,9 +206,9 @@ static int vdc_handle_attr(struct vio_driver_state *vio, void *arg)
 	struct vio_disk_attr_info *pkt = arg;
 
 	viodbg(HS, "GOT ATTR stype[0x%x] ops[%llx] disk_size[%llu] disk_type[%x] "
-	       "xfer_mode[0x%x] blksz[%u] max_xfer[%llu]\n",
+	       "mtype[0x%x] xfer_mode[0x%x] blksz[%u] max_xfer[%llu]\n",
 	       pkt->tag.stype, pkt->operations,
-	       pkt->vdisk_size, pkt->vdisk_type,
+	       pkt->vdisk_size, pkt->vdisk_type, pkt->vdisk_mtype,
 	       pkt->xfer_mode, pkt->vdisk_block_size,
 	       pkt->max_xfer_size);
 
@@ -192,8 +233,11 @@ static int vdc_handle_attr(struct vio_driver_state *vio, void *arg)
 		}
 
 		port->operations = pkt->operations;
-		port->vdisk_size = pkt->vdisk_size;
 		port->vdisk_type = pkt->vdisk_type;
+		if (vdc_version_supported(port, 1, 1)) {
+			port->vdisk_size = pkt->vdisk_size;
+			port->vdisk_mtype = pkt->vdisk_mtype;
+		}
 		if (pkt->max_xfer_size < port->max_xfer_size)
 			port->max_xfer_size = pkt->max_xfer_size;
 		port->vdisk_block_size = pkt->vdisk_block_size;
@@ -663,18 +707,25 @@ static int probe_disk(struct vdc_port *port)
 		return err;
 	}
 
-	err = generic_request(port, VD_OP_GET_DISKGEOM,
-			      &port->geom, sizeof(port->geom));
-	if (err < 0) {
-		printk(KERN_ERR PFX "VD_OP_GET_DISKGEOM returns "
-		       "error %d\n", err);
-		return err;
+	if (vdc_version_supported(port, 1, 1)) {
+		/* vdisk_size should be set during the handshake, if it wasn't
+		 * then the underlying disk is reserved by another system
+		 */
+		if (port->vdisk_size == -1)
+			return -ENODEV;
+	} else {
+		err = generic_request(port, VD_OP_GET_DISKGEOM,
+				      &port->geom, sizeof(port->geom));
+		if (err < 0) {
+			printk(KERN_ERR PFX "VD_OP_GET_DISKGEOM returns "
+			       "error %d\n", err);
+			return err;
+		}
+		port->vdisk_size = ((u64)port->geom.num_cyl *
+				    (u64)port->geom.num_hd *
+				    (u64)port->geom.num_sec);
 	}
 
-	port->vdisk_size = ((u64)port->geom.num_cyl *
-			    (u64)port->geom.num_hd *
-			    (u64)port->geom.num_sec);
-
 	q = blk_init_queue(do_vdc_request, &port->vio.lock);
 	if (!q) {
 		printk(KERN_ERR PFX "%s: Could not allocate queue.\n",
@@ -704,9 +755,32 @@ static int probe_disk(struct vdc_port *port)
 
 	set_capacity(g, port->vdisk_size);
 
-	printk(KERN_INFO PFX "%s: %u sectors (%u MB)\n",
+	if (vdc_version_supported(port, 1, 1)) {
+		switch (port->vdisk_mtype) {
+		case VD_MEDIA_TYPE_CD:
+			pr_info(PFX "Virtual CDROM %s\n", port->disk_name);
+			g->flags |= GENHD_FL_CD;
+			g->flags |= GENHD_FL_REMOVABLE;
+			set_disk_ro(g, 1);
+			break;
+
+		case VD_MEDIA_TYPE_DVD:
+			pr_info(PFX "Virtual DVD %s\n", port->disk_name);
+			g->flags |= GENHD_FL_CD;
+			g->flags |= GENHD_FL_REMOVABLE;
+			set_disk_ro(g, 1);
+			break;
+
+		case VD_MEDIA_TYPE_FIXED:
+			pr_info(PFX "Virtual Hard disk %s\n", port->disk_name);
+			break;
+		}
+	}
+
+	pr_info(PFX "%s: %u sectors (%u MB) protocol %d.%d\n",
 	       g->disk_name,
-	       port->vdisk_size, (port->vdisk_size >> (20 - 9)));
+	       port->vdisk_size, (port->vdisk_size >> (20 - 9)),
+	       port->vio.ver.major, port->vio.ver.minor);
 
 	add_disk(g);
 
@@ -765,6 +839,7 @@ static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 	else
 		snprintf(port->disk_name, sizeof(port->disk_name),
 			 VDCBLK_NAME "%c", 'a' + ((int)vdev->dev_no % 26));
+	port->vdisk_size = -1;
 
 	err = vio_driver_init(&port->vio, vdev, VDEV_DISK,
 			      vdc_versions, ARRAY_SIZE(vdc_versions),
-- 
cgit v0.10.2


From de5b73f08468b4fc5e2f6d1505f650262622f78b Mon Sep 17 00:00:00 2001
From: Allen Pais <allen.pais@oracle.com>
Date: Fri, 19 Sep 2014 09:42:26 -0400
Subject: sunvdc: compute vdisk geometry from capacity

The LDom diskserver doesn't return reliable geometry data. In addition,
the types for all fields in the vio_disk_geom are u16, which were being
truncated in the cast into the u8's of the Linux struct hd_geometry.

Modify vdc_getgeo() to compute the geometry from the disk's capacity in a
manner consistent with xen-blkfront::blkif_getgeo().

Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 66ddf70..1616ad0 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -70,7 +70,6 @@ struct vdc_port {
 
 	char			disk_name[32];
 
-	struct vio_disk_geom	geom;
 	struct vio_disk_vtoc	label;
 };
 
@@ -103,11 +102,15 @@ static inline u32 vdc_tx_dring_avail(struct vio_dring_state *dr)
 static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
 	struct gendisk *disk = bdev->bd_disk;
-	struct vdc_port *port = disk->private_data;
+	sector_t nsect = get_capacity(disk);
+	sector_t cylinders = nsect;
 
-	geo->heads = (u8) port->geom.num_hd;
-	geo->sectors = (u8) port->geom.num_sec;
-	geo->cylinders = port->geom.num_cyl;
+	geo->heads = 0xff;
+	geo->sectors = 0x3f;
+	sector_div(cylinders, geo->heads * geo->sectors);
+	geo->cylinders = cylinders;
+	if ((sector_t)(geo->cylinders + 1) * geo->heads * geo->sectors < nsect)
+		geo->cylinders = 0xffff;
 
 	return 0;
 }
@@ -714,16 +717,18 @@ static int probe_disk(struct vdc_port *port)
 		if (port->vdisk_size == -1)
 			return -ENODEV;
 	} else {
+		struct vio_disk_geom geom;
+
 		err = generic_request(port, VD_OP_GET_DISKGEOM,
-				      &port->geom, sizeof(port->geom));
+				      &geom, sizeof(geom));
 		if (err < 0) {
 			printk(KERN_ERR PFX "VD_OP_GET_DISKGEOM returns "
 			       "error %d\n", err);
 			return err;
 		}
-		port->vdisk_size = ((u64)port->geom.num_cyl *
-				    (u64)port->geom.num_hd *
-				    (u64)port->geom.num_sec);
+		port->vdisk_size = ((u64)geom.num_cyl *
+				    (u64)geom.num_hd *
+				    (u64)geom.num_sec);
 	}
 
 	q = blk_init_queue(do_vdc_request, &port->vio.lock);
-- 
cgit v0.10.2


From 5eed69ffd248c9f68f56c710caf07db134aef28b Mon Sep 17 00:00:00 2001
From: Dwight Engen <dwight.engen@oracle.com>
Date: Fri, 19 Sep 2014 09:42:53 -0400
Subject: sunvdc: limit each sg segment to a page

ldc_map_sg() could fail its check that the number of pages referred to
by the sg scatterlist was <= the number of cookies.

This fixes the issue by doing a similar thing to the xen-blkfront driver,
ensuring that the scatterlist will only ever contain a segment count <=
port->ring_cookies, and each segment will be page aligned, and <= page
size. This ensures that the scatterlist is always mappable.

Orabug: 19347817
OraBZ: 15945

Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 1616ad0..1a9360d 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -747,6 +747,10 @@ static int probe_disk(struct vdc_port *port)
 
 	port->disk = g;
 
+	/* Each segment in a request is up to an aligned page in size. */
+	blk_queue_segment_boundary(q, PAGE_SIZE - 1);
+	blk_queue_max_segment_size(q, PAGE_SIZE);
+
 	blk_queue_max_segments(q, port->ring_cookies);
 	blk_queue_max_hw_sectors(q, port->max_xfer_size);
 	g->major = vdc_major;
-- 
cgit v0.10.2


From d0aedcd4f14a22e23b313f42b7e6e6ebfc0fbc31 Mon Sep 17 00:00:00 2001
From: Dwight Engen <dwight.engen@oracle.com>
Date: Fri, 19 Sep 2014 09:43:02 -0400
Subject: vio: fix reuse of vio_dring slot

vio_dring_avail() will allow use of every dring entry, but when the last
entry is allocated then dr->prod == dr->cons which is indistinguishable from
the ring empty condition. This causes the next allocation to reuse an entry.
When this happens in sunvdc, the server side vds driver begins nack'ing the
messages and ends up resetting the ldc channel. This problem does not effect
sunvnet since it checks for < 2.

The fix here is to just never allocate the very last dring slot so that full
and empty are not the same condition. The request start path was changed to
check for the ring being full a bit earlier, and to stop the blk_queue if
there is no space left. The blk_queue will be restarted once the ring is
only half full again. The number of ring entries was increased to 512 which
matches the sunvnet and Solaris vdc drivers, and greatly reduces the
frequency of hitting the ring full condition and the associated blk_queue
stop/starting. The checks in sunvent were adjusted to account for
vio_dring_avail() returning 1 less.

Orabug: 19441666
OraBZ: 14983

Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/include/asm/vio.h b/arch/sparc/include/asm/vio.h
index 94a02db..7fce9b1 100644
--- a/arch/sparc/include/asm/vio.h
+++ b/arch/sparc/include/asm/vio.h
@@ -278,7 +278,7 @@ static inline u32 vio_dring_avail(struct vio_dring_state *dr,
 				  unsigned int ring_size)
 {
 	return (dr->pending -
-		((dr->prod - dr->cons) & (ring_size - 1)));
+		((dr->prod - dr->cons) & (ring_size - 1)) - 1);
 }
 
 #define VIO_MAX_TYPE_LEN	32
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 1a9360d..756b8ec 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -33,7 +33,7 @@ MODULE_DESCRIPTION("Sun LDOM virtual disk client driver");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(DRV_MODULE_VERSION);
 
-#define VDC_TX_RING_SIZE	256
+#define VDC_TX_RING_SIZE	512
 
 #define WAITING_FOR_LINK_UP	0x01
 #define WAITING_FOR_TX_SPACE	0x02
@@ -283,7 +283,9 @@ static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr,
 
 	__blk_end_request(req, (desc->status ? -EIO : 0), desc->size);
 
-	if (blk_queue_stopped(port->disk->queue))
+	/* restart blk queue when ring is half emptied */
+	if (blk_queue_stopped(port->disk->queue) &&
+	    vdc_tx_dring_avail(dr) * 100 / VDC_TX_RING_SIZE >= 50)
 		blk_start_queue(port->disk->queue);
 }
 
@@ -435,12 +437,6 @@ static int __send_request(struct request *req)
 	for (i = 0; i < nsg; i++)
 		len += sg[i].length;
 
-	if (unlikely(vdc_tx_dring_avail(dr) < 1)) {
-		blk_stop_queue(port->disk->queue);
-		err = -ENOMEM;
-		goto out;
-	}
-
 	desc = vio_dring_cur(dr);
 
 	err = ldc_map_sg(port->vio.lp, sg, nsg,
@@ -480,21 +476,32 @@ static int __send_request(struct request *req)
 		port->req_id++;
 		dr->prod = (dr->prod + 1) & (VDC_TX_RING_SIZE - 1);
 	}
-out:
 
 	return err;
 }
 
-static void do_vdc_request(struct request_queue *q)
+static void do_vdc_request(struct request_queue *rq)
 {
-	while (1) {
-		struct request *req = blk_fetch_request(q);
+	struct request *req;
 
-		if (!req)
-			break;
+	while ((req = blk_peek_request(rq)) != NULL) {
+		struct vdc_port *port;
+		struct vio_dring_state *dr;
 
-		if (__send_request(req) < 0)
-			__blk_end_request_all(req, -EIO);
+		port = req->rq_disk->private_data;
+		dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+		if (unlikely(vdc_tx_dring_avail(dr) < 1))
+			goto wait;
+
+		blk_start_request(req);
+
+		if (__send_request(req) < 0) {
+			blk_requeue_request(rq, req);
+wait:
+			/* Avoid pointless unplugs. */
+			blk_stop_queue(rq);
+			break;
+		}
 	}
 }
 
diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c
index 23c89ab..b7cca71 100644
--- a/drivers/net/ethernet/sun/sunvnet.c
+++ b/drivers/net/ethernet/sun/sunvnet.c
@@ -690,7 +690,7 @@ static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	spin_lock_irqsave(&port->vio.lock, flags);
 
 	dr = &port->vio.drings[VIO_DRIVER_TX_RING];
-	if (unlikely(vnet_tx_dring_avail(dr) < 2)) {
+	if (unlikely(vnet_tx_dring_avail(dr) < 1)) {
 		if (!netif_queue_stopped(dev)) {
 			netif_stop_queue(dev);
 
@@ -746,7 +746,7 @@ static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	dev->stats.tx_bytes += skb->len;
 
 	dr->prod = (dr->prod + 1) & (VNET_TX_RING_SIZE - 1);
-	if (unlikely(vnet_tx_dring_avail(dr) < 2)) {
+	if (unlikely(vnet_tx_dring_avail(dr) < 1)) {
 		netif_stop_queue(dev);
 		if (vnet_tx_dring_avail(dr) > VNET_TX_WAKEUP_THRESH(dr))
 			netif_wake_queue(dev);
-- 
cgit v0.10.2


From ca605b7dd740c8909408d67911d8ddd272c2b320 Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Tue, 23 Sep 2014 11:37:44 -0400
Subject: sparc64: Add vio_set_intr() to enable/disable Rx interrupts

The vio_set_intr() API should be used by VIO consumers to enable/disable
Rx interrupts to facilitate deferred processing in softirq/bottom-half
context.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/include/asm/vio.h b/arch/sparc/include/asm/vio.h
index 7fce9b1..d758c8d 100644
--- a/arch/sparc/include/asm/vio.h
+++ b/arch/sparc/include/asm/vio.h
@@ -298,6 +298,7 @@ struct vio_dev {
 
 	unsigned int		tx_irq;
 	unsigned int		rx_irq;
+	u64			rx_ino;
 
 	struct device		dev;
 };
@@ -453,5 +454,6 @@ int vio_driver_init(struct vio_driver_state *vio, struct vio_dev *vdev,
 		    char *name);
 
 void vio_port_up(struct vio_driver_state *vio);
+int vio_set_intr(unsigned long dev_ino, int state);
 
 #endif /* _SPARC64_VIO_H */
diff --git a/arch/sparc/kernel/vio.c b/arch/sparc/kernel/vio.c
index 8647fcc..cb5789c 100644
--- a/arch/sparc/kernel/vio.c
+++ b/arch/sparc/kernel/vio.c
@@ -180,8 +180,10 @@ static void vio_fill_channel_info(struct mdesc_handle *hp, u64 mp,
 			vdev->tx_irq = sun4v_build_virq(cdev_cfg_handle, *irq);
 
 		irq = mdesc_get_property(hp, target, "rx-ino", NULL);
-		if (irq)
+		if (irq) {
 			vdev->rx_irq = sun4v_build_virq(cdev_cfg_handle, *irq);
+			vdev->rx_ino = *irq;
+		}
 
 		chan_id = mdesc_get_property(hp, target, "id", NULL);
 		if (chan_id)
@@ -189,6 +191,15 @@ static void vio_fill_channel_info(struct mdesc_handle *hp, u64 mp,
 	}
 }
 
+int vio_set_intr(unsigned long dev_ino, int state)
+{
+	int err;
+
+	err = sun4v_vintr_set_valid(cdev_cfg_handle, dev_ino, state);
+	return err;
+}
+EXPORT_SYMBOL(vio_set_intr);
+
 static struct vio_dev *vio_create_one(struct mdesc_handle *hp, u64 mp,
 				      struct device *parent)
 {
-- 
cgit v0.10.2


From 473ad7f4fb005d1bb727e4ef27d370d28703a062 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 4 Oct 2014 21:05:14 -0700
Subject: sparc64: Fix reversed start/end in flush_tlb_kernel_range()

When we have to split up a flush request into multiple pieces
(in order to avoid the firmware range) we don't specify the
arguments in the right order for the second piece.

Fix the order, or else we get hangs as the code tries to
flush "a lot" of entries and we get lockups like this:

[ 4422.981276] NMI watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [expect:117032]
[ 4422.996130] Modules linked in: ipv6 loop usb_storage igb ptp sg sr_mod ehci_pci ehci_hcd pps_core n2_rng rng_core
[ 4423.016617] CPU: 12 PID: 117032 Comm: expect Not tainted 3.17.0-rc4+ #1608
[ 4423.030331] task: fff8003cc730e220 ti: fff8003d99d54000 task.ti: fff8003d99d54000
[ 4423.045282] TSTATE: 0000000011001602 TPC: 00000000004521e8 TNPC: 00000000004521ec Y: 00000000    Not tainted
[ 4423.064905] TPC: <__flush_tlb_kernel_range+0x28/0x40>
[ 4423.074964] g0: 000000000052fd10 g1: 00000001295a8000 g2: ffffff7176ffc000 g3: 0000000000002000
[ 4423.092324] g4: fff8003cc730e220 g5: fff8003dfedcc000 g6: fff8003d99d54000 g7: 0000000000000006
[ 4423.109687] o0: 0000000000000000 o1: 0000000000000000 o2: 0000000000000003 o3: 00000000f0000000
[ 4423.127058] o4: 0000000000000080 o5: 00000001295a8000 sp: fff8003d99d56d01 ret_pc: 000000000052ff54
[ 4423.145121] RPC: <__purge_vmap_area_lazy+0x314/0x3a0>
[ 4423.155185] l0: 0000000000000000 l1: 0000000000000000 l2: 0000000000a38040 l3: 0000000000000000
[ 4423.172559] l4: fff8003dae8965e0 l5: ffffffffffffffff l6: 0000000000000000 l7: 00000000f7e2b138
[ 4423.189913] i0: fff8003d99d576a0 i1: fff8003d99d576a8 i2: fff8003d99d575e8 i3: 0000000000000000
[ 4423.207284] i4: 0000000000008008 i5: fff8003d99d575c8 i6: fff8003d99d56df1 i7: 0000000000530c24
[ 4423.224640] I7: <free_vmap_area_noflush+0x64/0x80>
[ 4423.234193] Call Trace:
[ 4423.239051]  [0000000000530c24] free_vmap_area_noflush+0x64/0x80
[ 4423.251029]  [0000000000531a7c] remove_vm_area+0x5c/0x80
[ 4423.261628]  [0000000000531b80] __vunmap+0x20/0x120
[ 4423.271352]  [000000000071cf18] n_tty_close+0x18/0x40
[ 4423.281423]  [00000000007222b0] tty_ldisc_close+0x30/0x60
[ 4423.292183]  [00000000007225a4] tty_ldisc_reinit+0x24/0xa0
[ 4423.303120]  [0000000000722ab4] tty_ldisc_hangup+0xd4/0x1e0
[ 4423.314232]  [0000000000719aa0] __tty_hangup+0x280/0x3c0
[ 4423.324835]  [0000000000724cb4] pty_close+0x134/0x1a0
[ 4423.334905]  [000000000071aa24] tty_release+0x104/0x500
[ 4423.345316]  [00000000005511d0] __fput+0x90/0x1e0
[ 4423.354701]  [000000000047fa54] task_work_run+0x94/0xe0
[ 4423.365126]  [0000000000404b44] __handle_signal+0xc/0x2c

Fixes: 4ca9a23765da ("sparc64: Guard against flushing openfirmware mappings.")
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index c8bccaf..bd08ed4 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2837,8 +2837,8 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 			do_flush_tlb_kernel_range(start, LOW_OBP_ADDRESS);
 		}
 		if (end > HI_OBP_ADDRESS) {
-			flush_tsb_kernel_range(end, HI_OBP_ADDRESS);
-			do_flush_tlb_kernel_range(end, HI_OBP_ADDRESS);
+			flush_tsb_kernel_range(HI_OBP_ADDRESS, end);
+			do_flush_tlb_kernel_range(HI_OBP_ADDRESS, end);
 		}
 	} else {
 		flush_tsb_kernel_range(start, end);
-- 
cgit v0.10.2


From ac55c768143aa34cc3789c4820cbb0809a76fd9c Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 26 Sep 2014 21:19:46 -0700
Subject: sparc64: Switch to 4-level page tables.

This has become necessary with chips that support more than 43-bits
of physical addressing.

Based almost entirely upon a patch by Bob Picco.

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Bob Picco <bob.picco@oracle.com>

diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h
index bf10998..09ceb68 100644
--- a/arch/sparc/include/asm/page_64.h
+++ b/arch/sparc/include/asm/page_64.h
@@ -57,18 +57,21 @@ void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *topa
 typedef struct { unsigned long pte; } pte_t;
 typedef struct { unsigned long iopte; } iopte_t;
 typedef struct { unsigned long pmd; } pmd_t;
+typedef struct { unsigned long pud; } pud_t;
 typedef struct { unsigned long pgd; } pgd_t;
 typedef struct { unsigned long pgprot; } pgprot_t;
 
 #define pte_val(x)	((x).pte)
 #define iopte_val(x)	((x).iopte)
 #define pmd_val(x)      ((x).pmd)
+#define pud_val(x)      ((x).pud)
 #define pgd_val(x)	((x).pgd)
 #define pgprot_val(x)	((x).pgprot)
 
 #define __pte(x)	((pte_t) { (x) } )
 #define __iopte(x)	((iopte_t) { (x) } )
 #define __pmd(x)        ((pmd_t) { (x) } )
+#define __pud(x)        ((pud_t) { (x) } )
 #define __pgd(x)	((pgd_t) { (x) } )
 #define __pgprot(x)	((pgprot_t) { (x) } )
 
@@ -77,18 +80,21 @@ typedef struct { unsigned long pgprot; } pgprot_t;
 typedef unsigned long pte_t;
 typedef unsigned long iopte_t;
 typedef unsigned long pmd_t;
+typedef unsigned long pud_t;
 typedef unsigned long pgd_t;
 typedef unsigned long pgprot_t;
 
 #define pte_val(x)	(x)
 #define iopte_val(x)	(x)
 #define pmd_val(x)      (x)
+#define pud_val(x)      (x)
 #define pgd_val(x)	(x)
 #define pgprot_val(x)	(x)
 
 #define __pte(x)	(x)
 #define __iopte(x)	(x)
 #define __pmd(x)        (x)
+#define __pud(x)        (x)
 #define __pgd(x)	(x)
 #define __pgprot(x)	(x)
 
diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h
index 39a7ac4..5e31871 100644
--- a/arch/sparc/include/asm/pgalloc_64.h
+++ b/arch/sparc/include/asm/pgalloc_64.h
@@ -15,6 +15,13 @@
 
 extern struct kmem_cache *pgtable_cache;
 
+static inline void __pgd_populate(pgd_t *pgd, pud_t *pud)
+{
+	pgd_set(pgd, pud);
+}
+
+#define pgd_populate(MM, PGD, PUD)	__pgd_populate(PGD, PUD)
+
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	return kmem_cache_alloc(pgtable_cache, GFP_KERNEL);
@@ -25,7 +32,23 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 	kmem_cache_free(pgtable_cache, pgd);
 }
 
-#define pud_populate(MM, PUD, PMD)	pud_set(PUD, PMD)
+static inline void __pud_populate(pud_t *pud, pmd_t *pmd)
+{
+	pud_set(pud, pmd);
+}
+
+#define pud_populate(MM, PUD, PMD)	__pud_populate(PUD, PMD)
+
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+	return kmem_cache_alloc(pgtable_cache,
+				GFP_KERNEL|__GFP_REPEAT);
+}
+
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+	kmem_cache_free(pgtable_cache, pud);
+}
 
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
@@ -91,4 +114,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pte_t *pte,
 #define __pmd_free_tlb(tlb, pmd, addr)		      \
 	pgtable_free_tlb(tlb, pmd, false)
 
+#define __pud_free_tlb(tlb, pud, addr)		      \
+	pgtable_free_tlb(tlb, pud, false)
+
 #endif /* _SPARC64_PGALLOC_H */
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 3770bf5..31ac919 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -20,8 +20,6 @@
 #include <asm/page.h>
 #include <asm/processor.h>
 
-#include <asm-generic/pgtable-nopud.h>
-
 /* The kernel image occupies 0x4000000 to 0x6000000 (4MB --> 96MB).
  * The page copy blockops can use 0x6000000 to 0x8000000.
  * The 8K TSB is mapped in the 0x8000000 to 0x8400000 range.
@@ -55,13 +53,21 @@
 #define PMD_MASK	(~(PMD_SIZE-1))
 #define PMD_BITS	(PAGE_SHIFT - 3)
 
-/* PGDIR_SHIFT determines what a third-level page table entry can map */
-#define PGDIR_SHIFT	(PAGE_SHIFT + (PAGE_SHIFT-3) + PMD_BITS)
+/* PUD_SHIFT determines the size of the area a third-level page
+ * table can map
+ */
+#define PUD_SHIFT	(PMD_SHIFT + PMD_BITS)
+#define PUD_SIZE	(_AC(1,UL) << PUD_SHIFT)
+#define PUD_MASK	(~(PUD_SIZE-1))
+#define PUD_BITS	(PAGE_SHIFT - 3)
+
+/* PGDIR_SHIFT determines what a fourth-level page table entry can map */
+#define PGDIR_SHIFT	(PUD_SHIFT + PUD_BITS)
 #define PGDIR_SIZE	(_AC(1,UL) << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 #define PGDIR_BITS	(PAGE_SHIFT - 3)
 
-#if (PGDIR_SHIFT + PGDIR_BITS) != 43
+#if (PGDIR_SHIFT + PGDIR_BITS) != 53
 #error Page table parameters do not cover virtual address space properly.
 #endif
 
@@ -93,6 +99,7 @@ static inline bool kern_addr_valid(unsigned long addr)
 /* Entries per page directory level. */
 #define PTRS_PER_PTE	(1UL << (PAGE_SHIFT-3))
 #define PTRS_PER_PMD	(1UL << PMD_BITS)
+#define PTRS_PER_PUD	(1UL << PUD_BITS)
 #define PTRS_PER_PGD	(1UL << PGDIR_BITS)
 
 /* Kernel has a separate 44bit address space. */
@@ -101,6 +108,9 @@ static inline bool kern_addr_valid(unsigned long addr)
 #define pmd_ERROR(e)							\
 	pr_err("%s:%d: bad pmd %p(%016lx) seen at (%pS)\n",		\
 	       __FILE__, __LINE__, &(e), pmd_val(e), __builtin_return_address(0))
+#define pud_ERROR(e)							\
+	pr_err("%s:%d: bad pud %p(%016lx) seen at (%pS)\n",		\
+	       __FILE__, __LINE__, &(e), pud_val(e), __builtin_return_address(0))
 #define pgd_ERROR(e)							\
 	pr_err("%s:%d: bad pgd %p(%016lx) seen at (%pS)\n",		\
 	       __FILE__, __LINE__, &(e), pgd_val(e), __builtin_return_address(0))
@@ -779,6 +789,11 @@ static inline int pmd_present(pmd_t pmd)
 #define pud_bad(pud)			((pud_val(pud) & ~PAGE_MASK) || \
 					 !__kern_addr_valid(pud_val(pud)))
 
+#define pgd_none(pgd)			(!pgd_val(pgd))
+
+#define pgd_bad(pgd)			((pgd_val(pgd) & ~PAGE_MASK) || \
+					 !__kern_addr_valid(pgd_val(pgd)))
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 		pmd_t *pmdp, pmd_t pmd);
@@ -815,10 +830,17 @@ static inline unsigned long __pmd_page(pmd_t pmd)
 #define pmd_clear(pmdp)			(pmd_val(*(pmdp)) = 0UL)
 #define pud_present(pud)		(pud_val(pud) != 0U)
 #define pud_clear(pudp)			(pud_val(*(pudp)) = 0UL)
+#define pgd_page_vaddr(pgd)		\
+	((unsigned long) __va(pgd_val(pgd)))
+#define pgd_present(pgd)		(pgd_val(pgd) != 0U)
+#define pgd_clear(pgdp)			(pgd_val(*(pgd)) = 0UL)
 
 /* Same in both SUN4V and SUN4U.  */
 #define pte_none(pte) 			(!pte_val(pte))
 
+#define pgd_set(pgdp, pudp)	\
+	(pgd_val(*(pgdp)) = (__pa((unsigned long) (pudp))))
+
 /* to find an entry in a page-table-directory. */
 #define pgd_index(address)	(((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
 #define pgd_offset(mm, address)	((mm)->pgd + pgd_index(address))
@@ -826,6 +848,11 @@ static inline unsigned long __pmd_page(pmd_t pmd)
 /* to find an entry in a kernel page-table-directory */
 #define pgd_offset_k(address) pgd_offset(&init_mm, address)
 
+/* Find an entry in the third-level page table.. */
+#define pud_index(address)	(((address) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
+#define pud_offset(pgdp, address)	\
+	((pud_t *) pgd_page_vaddr(*(pgdp)) + pud_index(address))
+
 /* Find an entry in the second-level page table.. */
 #define pmd_offset(pudp, address)	\
 	((pmd_t *) pud_page_vaddr(*(pudp)) + \
diff --git a/arch/sparc/include/asm/tsb.h b/arch/sparc/include/asm/tsb.h
index 90916f9..2e268b6 100644
--- a/arch/sparc/include/asm/tsb.h
+++ b/arch/sparc/include/asm/tsb.h
@@ -145,6 +145,11 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end;
 	andn		REG2, 0x7, REG2; \
 	ldx		[REG1 + REG2], REG1; \
 	brz,pn		REG1, FAIL_LABEL; \
+	 sllx		VADDR, 64 - (PUD_SHIFT + PUD_BITS), REG2; \
+	srlx		REG2, 64 - PAGE_SHIFT, REG2; \
+	andn		REG2, 0x7, REG2; \
+	ldxa		[REG1 + REG2] ASI_PHYS_USE_EC, REG1; \
+	brz,pn		REG1, FAIL_LABEL; \
 	 sllx		VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \
 	srlx		REG2, 64 - PAGE_SHIFT, REG2; \
 	andn		REG2, 0x7, REG2; \
@@ -198,6 +203,11 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end;
 	andn		REG2, 0x7, REG2; \
 	ldxa		[PHYS_PGD + REG2] ASI_PHYS_USE_EC, REG1; \
 	brz,pn		REG1, FAIL_LABEL; \
+	 sllx		VADDR, 64 - (PUD_SHIFT + PUD_BITS), REG2; \
+	srlx		REG2, 64 - PAGE_SHIFT, REG2; \
+	andn		REG2, 0x7, REG2; \
+	ldxa		[REG1 + REG2] ASI_PHYS_USE_EC, REG1; \
+	brz,pn		REG1, FAIL_LABEL; \
 	 sllx		VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \
 	srlx		REG2, 64 - PAGE_SHIFT, REG2; \
 	andn		REG2, 0x7, REG2; \
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index f7ba875..c9300bf 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1467,6 +1467,13 @@ static void __init pcpu_populate_pte(unsigned long addr)
 	pud_t *pud;
 	pmd_t *pmd;
 
+	if (pgd_none(*pgd)) {
+		pud_t *new;
+
+		new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
+		pgd_populate(&init_mm, pgd, new);
+	}
+
 	pud = pud_offset(pgd, addr);
 	if (pud_none(*pud)) {
 		pmd_t *new;
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index bd08ed4..091f846 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1390,6 +1390,13 @@ static unsigned long __ref kernel_map_range(unsigned long pstart,
 		pmd_t *pmd;
 		pte_t *pte;
 
+		if (pgd_none(*pgd)) {
+			pud_t *new;
+
+			new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
+			alloc_bytes += PAGE_SIZE;
+			pgd_populate(&init_mm, pgd, new);
+		}
 		pud = pud_offset(pgd, vstart);
 		if (pud_none(*pud)) {
 			pmd_t *new;
@@ -1856,7 +1863,12 @@ static void __init sun4v_linear_pte_xor_finalize(void)
 /* paging_init() sets up the page tables */
 
 static unsigned long last_valid_pfn;
-pgd_t swapper_pg_dir[PTRS_PER_PGD];
+
+/* These must be page aligned in order to not trigger the
+ * alignment tests of pgd_bad() and pud_bad().
+ */
+pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__ ((aligned (PAGE_SIZE)));
+static pud_t swapper_pud_dir[PTRS_PER_PUD] __attribute__ ((aligned (PAGE_SIZE)));
 
 static void sun4u_pgprot_init(void);
 static void sun4v_pgprot_init(void);
@@ -1911,6 +1923,8 @@ void __init paging_init(void)
 {
 	unsigned long end_pfn, shift, phys_base;
 	unsigned long real_end, i;
+	pud_t *pud;
+	pmd_t *pmd;
 	int node;
 
 	setup_page_offset();
@@ -2008,9 +2022,18 @@ void __init paging_init(void)
 	
 	memset(swapper_low_pmd_dir, 0, sizeof(swapper_low_pmd_dir));
 
-	/* Now can init the kernel/bad page tables. */
-	pud_set(pud_offset(&swapper_pg_dir[0], 0),
-		swapper_low_pmd_dir + (shift / sizeof(pgd_t)));
+	/* The kernel page tables we publish into what the rest of the
+	 * world sees must be adjusted so that they see the PAGE_OFFSET
+	 * address of these in-kerenel data structures.  However right
+	 * here we must access them from the kernel image side, because
+	 * the trap tables haven't been taken over and therefore we cannot
+	 * take TLB misses in the PAGE_OFFSET linear mappings yet.
+	 */
+	pud = swapper_pud_dir + (shift / sizeof(pud_t));
+	pgd_set(&swapper_pg_dir[0], pud);
+
+	pmd = swapper_low_pmd_dir + (shift / sizeof(pmd_t));
+	pud_set(&swapper_pud_dir[0], pmd);
 	
 	inherit_prom_mappings();
 	
-- 
cgit v0.10.2


From 4397bed080598001e88f612deb8b080bb1cc2322 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 26 Sep 2014 21:58:33 -0700
Subject: sparc64: Define VA hole at run time, rather than at compile time.

Now that we use 4-level page tables, we can provide up to 53-bits of
virtual address space to the user.

Adjust the VA hole based upon the capabilities of the cpu type probed.

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Bob Picco <bob.picco@oracle.com>

diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h
index 09ceb68..2211a80 100644
--- a/arch/sparc/include/asm/page_64.h
+++ b/arch/sparc/include/asm/page_64.h
@@ -102,21 +102,14 @@ typedef unsigned long pgprot_t;
 
 typedef pte_t *pgtable_t;
 
-/* These two values define the virtual address space range in which we
- * must forbid 64-bit user processes from making mappings.  It used to
- * represent precisely the virtual address space hole present in most
- * early sparc64 chips including UltraSPARC-I.  But now it also is
- * further constrained by the limits of our page tables, which is
- * 43-bits of virtual address.
- */
-#define SPARC64_VA_HOLE_TOP	_AC(0xfffffc0000000000,UL)
-#define SPARC64_VA_HOLE_BOTTOM	_AC(0x0000040000000000,UL)
+extern unsigned long sparc64_va_hole_top;
+extern unsigned long sparc64_va_hole_bottom;
 
 /* The next two defines specify the actual exclusion region we
  * enforce, wherein we use a 4GB red zone on each side of the VA hole.
  */
-#define VA_EXCLUDE_START (SPARC64_VA_HOLE_BOTTOM - (1UL << 32UL))
-#define VA_EXCLUDE_END   (SPARC64_VA_HOLE_TOP + (1UL << 32UL))
+#define VA_EXCLUDE_START (sparc64_va_hole_bottom - (1UL << 32UL))
+#define VA_EXCLUDE_END   (sparc64_va_hole_top + (1UL << 32UL))
 
 #define TASK_UNMAPPED_BASE	(test_thread_flag(TIF_32BIT) ? \
 				 _AC(0x0000000070000000,UL) : \
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 091f846..c241c57 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1630,25 +1630,46 @@ static void __init page_offset_shift_patch(unsigned long phys_bits)
 	}
 }
 
+unsigned long sparc64_va_hole_top =    0xfffff80000000000UL;
+unsigned long sparc64_va_hole_bottom = 0x0000080000000000UL;
+
 static void __init setup_page_offset(void)
 {
 	unsigned long max_phys_bits = 40;
 
 	if (tlb_type == cheetah || tlb_type == cheetah_plus) {
+		/* Cheetah/Panther support a full 64-bit virtual
+		 * address, so we can use all that our page tables
+		 * support.
+		 */
+		sparc64_va_hole_top =    0xfff0000000000000UL;
+		sparc64_va_hole_bottom = 0x0010000000000000UL;
+
 		max_phys_bits = 42;
 	} else if (tlb_type == hypervisor) {
 		switch (sun4v_chip_type) {
 		case SUN4V_CHIP_NIAGARA1:
 		case SUN4V_CHIP_NIAGARA2:
+			/* T1 and T2 support 48-bit virtual addresses.  */
+			sparc64_va_hole_top =    0xffff800000000000UL;
+			sparc64_va_hole_bottom = 0x0000800000000000UL;
+
 			max_phys_bits = 39;
 			break;
 		case SUN4V_CHIP_NIAGARA3:
+			/* T3 supports 48-bit virtual addresses.  */
+			sparc64_va_hole_top =    0xffff800000000000UL;
+			sparc64_va_hole_bottom = 0x0000800000000000UL;
+
 			max_phys_bits = 43;
 			break;
 		case SUN4V_CHIP_NIAGARA4:
 		case SUN4V_CHIP_NIAGARA5:
 		case SUN4V_CHIP_SPARC64X:
 		default:
+			/* T4 and later support 52-bit virtual addresses.  */
+			sparc64_va_hole_top =    0xfff8000000000000UL;
+			sparc64_va_hole_bottom = 0x0008000000000000UL;
 			max_phys_bits = 47;
 			break;
 		}
-- 
cgit v0.10.2


From 8c82dc0e883821c098c8b0b130ffebabf9aab5df Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 17 Sep 2014 10:14:56 -0700
Subject: sparc64: Adjust KTSB assembler to support larger physical addresses.

As currently coded the KTSB accesses in the kernel only support up to
47 bits of physical addressing.

Adjust the instruction and patching sequence in order to support
arbitrary 64 bits addresses.

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Bob Picco <bob.picco@oracle.com>

diff --git a/arch/sparc/include/asm/tsb.h b/arch/sparc/include/asm/tsb.h
index 2e268b6..a2f5419 100644
--- a/arch/sparc/include/asm/tsb.h
+++ b/arch/sparc/include/asm/tsb.h
@@ -256,8 +256,6 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end;
 	(KERNEL_TSB_SIZE_BYTES / 16)
 #define KERNEL_TSB4M_NENTRIES	4096
 
-#define KTSB_PHYS_SHIFT		15
-
 	/* Do a kernel TSB lookup at tl>0 on VADDR+TAG, branch to OK_LABEL
 	 * on TSB hit.  REG1, REG2, REG3, and REG4 are used as temporaries
 	 * and the found TTE will be left in REG1.  REG3 and REG4 must
@@ -266,17 +264,15 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end;
 	 * VADDR and TAG will be preserved and not clobbered by this macro.
 	 */
 #define KERN_TSB_LOOKUP_TL1(VADDR, TAG, REG1, REG2, REG3, REG4, OK_LABEL) \
-661:	sethi		%hi(swapper_tsb), REG1;			\
-	or		REG1, %lo(swapper_tsb), REG1; \
+661:	sethi		%uhi(swapper_tsb), REG1; \
+	sethi		%hi(swapper_tsb), REG2; \
+	or		REG1, %ulo(swapper_tsb), REG1; \
+	or		REG2, %lo(swapper_tsb), REG2; \
 	.section	.swapper_tsb_phys_patch, "ax"; \
 	.word		661b; \
 	.previous; \
-661:	nop; \
-	.section	.tsb_ldquad_phys_patch, "ax"; \
-	.word		661b; \
-	sllx		REG1, KTSB_PHYS_SHIFT, REG1; \
-	sllx		REG1, KTSB_PHYS_SHIFT, REG1; \
-	.previous; \
+	sllx		REG1, 32, REG1; \
+	or		REG1, REG2, REG1; \
 	srlx		VADDR, PAGE_SHIFT, REG2; \
 	and		REG2, (KERNEL_TSB_NENTRIES - 1), REG2; \
 	sllx		REG2, 4, REG2; \
@@ -291,17 +287,15 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end;
 	 * we can make use of that for the index computation.
 	 */
 #define KERN_TSB4M_LOOKUP_TL1(TAG, REG1, REG2, REG3, REG4, OK_LABEL) \
-661:	sethi		%hi(swapper_4m_tsb), REG1;	     \
-	or		REG1, %lo(swapper_4m_tsb), REG1; \
+661:	sethi		%uhi(swapper_4m_tsb), REG1; \
+	sethi		%hi(swapper_4m_tsb), REG2; \
+	or		REG1, %ulo(swapper_4m_tsb), REG1; \
+	or		REG2, %lo(swapper_4m_tsb), REG2; \
 	.section	.swapper_4m_tsb_phys_patch, "ax"; \
 	.word		661b; \
 	.previous; \
-661:	nop; \
-	.section	.tsb_ldquad_phys_patch, "ax"; \
-	.word		661b; \
-	sllx		REG1, KTSB_PHYS_SHIFT, REG1; \
-	sllx		REG1, KTSB_PHYS_SHIFT, REG1; \
-	.previous; \
+	sllx		REG1, 32, REG1; \
+	or		REG1, REG2, REG1; \
 	and		TAG, (KERNEL_TSB4M_NENTRIES - 1), REG2; \
 	sllx		REG2, 4, REG2; \
 	add		REG1, REG2, REG2; \
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index c241c57..35fcc9c 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1733,19 +1733,41 @@ static void __init tsb_phys_patch(void)
 static struct hv_tsb_descr ktsb_descr[NUM_KTSB_DESCR];
 extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES];
 
+/* The swapper TSBs are loaded with a base sequence of:
+ *
+ *	sethi	%uhi(SYMBOL), REG1
+ *	sethi	%hi(SYMBOL), REG2
+ *	or	REG1, %ulo(SYMBOL), REG1
+ *	or	REG2, %lo(SYMBOL), REG2
+ *	sllx	REG1, 32, REG1
+ *	or	REG1, REG2, REG1
+ *
+ * When we use physical addressing for the TSB accesses, we patch the
+ * first four instructions in the above sequence.
+ */
+
 static void patch_one_ktsb_phys(unsigned int *start, unsigned int *end, unsigned long pa)
 {
-	pa >>= KTSB_PHYS_SHIFT;
+	unsigned long high_bits, low_bits;
+
+	high_bits = (pa >> 32) & 0xffffffff;
+	low_bits = (pa >> 0) & 0xffffffff;
 
 	while (start < end) {
 		unsigned int *ia = (unsigned int *)(unsigned long)*start;
 
-		ia[0] = (ia[0] & ~0x3fffff) | (pa >> 10);
+		ia[0] = (ia[0] & ~0x3fffff) | (high_bits >> 10);
 		__asm__ __volatile__("flush	%0" : : "r" (ia));
 
-		ia[1] = (ia[1] & ~0x3ff) | (pa & 0x3ff);
+		ia[1] = (ia[1] & ~0x3fffff) | (low_bits >> 10);
 		__asm__ __volatile__("flush	%0" : : "r" (ia + 1));
 
+		ia[2] = (ia[2] & ~0x1fff) | (high_bits & 0x3ff);
+		__asm__ __volatile__("flush	%0" : : "r" (ia + 2));
+
+		ia[3] = (ia[3] & ~0x1fff) | (low_bits & 0x3ff);
+		__asm__ __volatile__("flush	%0" : : "r" (ia + 3));
+
 		start++;
 	}
 }
-- 
cgit v0.10.2


From 0dd5b7b09e13dae32869371e08e1048349fd040c Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 24 Sep 2014 20:56:11 -0700
Subject: sparc64: Fix physical memory management regressions with large
 max_phys_bits.

If max_phys_bits needs to be > 43 (f.e. for T4 chips), things like
DEBUG_PAGEALLOC stop working because the 3-level page tables only
can cover up to 43 bits.

Another problem is that when we increased MAX_PHYS_ADDRESS_BITS up to
47, several statically allocated tables became enormous.

Compounding this is that we will need to support up to 49 bits of
physical addressing for M7 chips.

The two tables in question are sparc64_valid_addr_bitmap and
kpte_linear_bitmap.

The first holds a bitmap, with 1 bit for each 4MB chunk of physical
memory, indicating whether that chunk actually exists in the machine
and is valid.

The second table is a set of 2-bit values which tell how large of a
mapping (4MB, 256MB, 2GB, 16GB, respectively) we can use at each 256MB
chunk of ram in the system.

These tables are huge and take up an enormous amount of the BSS
section of the sparc64 kernel image.  Specifically, the
sparc64_valid_addr_bitmap is 4MB, and the kpte_linear_bitmap is 128K.

So let's solve the space wastage and the DEBUG_PAGEALLOC problem
at the same time, by using the kernel page tables (as designed) to
manage this information.

We have to keep using large mappings when DEBUG_PAGEALLOC is disabled,
and we do this by encoding huge PMDs and PUDs.

On a T4-2 with 256GB of ram the kernel page table takes up 16K with
DEBUG_PAGEALLOC disabled and 256MB with it enabled.  Furthermore, this
memory is dynamically allocated at run time rather than coded
statically into the kernel image.

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Bob Picco <bob.picco@oracle.com>

diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h
index 2211a80..732ba17 100644
--- a/arch/sparc/include/asm/page_64.h
+++ b/arch/sparc/include/asm/page_64.h
@@ -128,9 +128,6 @@ extern unsigned long PAGE_OFFSET;
  */
 #define MAX_PHYS_ADDRESS_BITS	47
 
-/* These two shift counts are used when indexing sparc64_valid_addr_bitmap
- * and kpte_linear_bitmap.
- */
 #define ILOG2_4MB		22
 #define ILOG2_256MB		28
 
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 31ac919..a305b22 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -79,22 +79,7 @@
 
 #include <linux/sched.h>
 
-extern unsigned long sparc64_valid_addr_bitmap[];
-
-/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
-static inline bool __kern_addr_valid(unsigned long paddr)
-{
-	if ((paddr >> MAX_PHYS_ADDRESS_BITS) != 0UL)
-		return false;
-	return test_bit(paddr >> ILOG2_4MB, sparc64_valid_addr_bitmap);
-}
-
-static inline bool kern_addr_valid(unsigned long addr)
-{
-	unsigned long paddr = __pa(addr);
-
-	return __kern_addr_valid(paddr);
-}
+bool kern_addr_valid(unsigned long addr);
 
 /* Entries per page directory level. */
 #define PTRS_PER_PTE	(1UL << (PAGE_SHIFT-3))
@@ -122,6 +107,7 @@ static inline bool kern_addr_valid(unsigned long addr)
 #define _PAGE_R	  	  _AC(0x8000000000000000,UL) /* Keep ref bit uptodate*/
 #define _PAGE_SPECIAL     _AC(0x0200000000000000,UL) /* Special page         */
 #define _PAGE_PMD_HUGE    _AC(0x0100000000000000,UL) /* Huge page            */
+#define _PAGE_PUD_HUGE    _PAGE_PMD_HUGE
 
 /* Advertise support for _PAGE_SPECIAL */
 #define __HAVE_ARCH_PTE_SPECIAL
@@ -668,26 +654,26 @@ static inline unsigned long pmd_large(pmd_t pmd)
 	return pte_val(pte) & _PAGE_PMD_HUGE;
 }
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline unsigned long pmd_young(pmd_t pmd)
+static inline unsigned long pmd_pfn(pmd_t pmd)
 {
 	pte_t pte = __pte(pmd_val(pmd));
 
-	return pte_young(pte);
+	return pte_pfn(pte);
 }
 
-static inline unsigned long pmd_write(pmd_t pmd)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline unsigned long pmd_young(pmd_t pmd)
 {
 	pte_t pte = __pte(pmd_val(pmd));
 
-	return pte_write(pte);
+	return pte_young(pte);
 }
 
-static inline unsigned long pmd_pfn(pmd_t pmd)
+static inline unsigned long pmd_write(pmd_t pmd)
 {
 	pte_t pte = __pte(pmd_val(pmd));
 
-	return pte_pfn(pte);
+	return pte_write(pte);
 }
 
 static inline unsigned long pmd_trans_huge(pmd_t pmd)
@@ -781,18 +767,15 @@ static inline int pmd_present(pmd_t pmd)
  * the top bits outside of the range of any physical address size we
  * support are clear as well.  We also validate the physical itself.
  */
-#define pmd_bad(pmd)			((pmd_val(pmd) & ~PAGE_MASK) || \
-					 !__kern_addr_valid(pmd_val(pmd)))
+#define pmd_bad(pmd)			(pmd_val(pmd) & ~PAGE_MASK)
 
 #define pud_none(pud)			(!pud_val(pud))
 
-#define pud_bad(pud)			((pud_val(pud) & ~PAGE_MASK) || \
-					 !__kern_addr_valid(pud_val(pud)))
+#define pud_bad(pud)			(pud_val(pud) & ~PAGE_MASK)
 
 #define pgd_none(pgd)			(!pgd_val(pgd))
 
-#define pgd_bad(pgd)			((pgd_val(pgd) & ~PAGE_MASK) || \
-					 !__kern_addr_valid(pgd_val(pgd)))
+#define pgd_bad(pgd)			(pgd_val(pgd) & ~PAGE_MASK)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void set_pmd_at(struct mm_struct *mm, unsigned long addr,
@@ -835,6 +818,20 @@ static inline unsigned long __pmd_page(pmd_t pmd)
 #define pgd_present(pgd)		(pgd_val(pgd) != 0U)
 #define pgd_clear(pgdp)			(pgd_val(*(pgd)) = 0UL)
 
+static inline unsigned long pud_large(pud_t pud)
+{
+	pte_t pte = __pte(pud_val(pud));
+
+	return pte_val(pte) & _PAGE_PMD_HUGE;
+}
+
+static inline unsigned long pud_pfn(pud_t pud)
+{
+	pte_t pte = __pte(pud_val(pud));
+
+	return pte_pfn(pte);
+}
+
 /* Same in both SUN4V and SUN4U.  */
 #define pte_none(pte) 			(!pte_val(pte))
 
diff --git a/arch/sparc/include/asm/tsb.h b/arch/sparc/include/asm/tsb.h
index a2f5419..ecb49cf 100644
--- a/arch/sparc/include/asm/tsb.h
+++ b/arch/sparc/include/asm/tsb.h
@@ -133,9 +133,24 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end;
 	sub	TSB, 0x8, TSB;   \
 	TSB_STORE(TSB, TAG);
 
-	/* Do a kernel page table walk.  Leaves physical PTE pointer in
-	 * REG1.  Jumps to FAIL_LABEL on early page table walk termination.
-	 * VADDR will not be clobbered, but REG2 will.
+	/* Do a kernel page table walk.  Leaves valid PTE value in
+	 * REG1.  Jumps to FAIL_LABEL on early page table walk
+	 * termination.  VADDR will not be clobbered, but REG2 will.
+	 *
+	 * There are two masks we must apply to propagate bits from
+	 * the virtual address into the PTE physical address field
+	 * when dealing with huge pages.  This is because the page
+	 * table boundaries do not match the huge page size(s) the
+	 * hardware supports.
+	 *
+	 * In these cases we propagate the bits that are below the
+	 * page table level where we saw the huge page mapping, but
+	 * are still within the relevant physical bits for the huge
+	 * page size in question.  So for PMD mappings (which fall on
+	 * bit 23, for 8MB per PMD) we must propagate bit 22 for a
+	 * 4MB huge page.  For huge PUDs (which fall on bit 33, for
+	 * 8GB per PUD), we have to accomodate 256MB and 2GB huge
+	 * pages.  So for those we propagate bits 32 to 28.
 	 */
 #define KERN_PGTABLE_WALK(VADDR, REG1, REG2, FAIL_LABEL)	\
 	sethi		%hi(swapper_pg_dir), REG1; \
@@ -150,15 +165,35 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end;
 	andn		REG2, 0x7, REG2; \
 	ldxa		[REG1 + REG2] ASI_PHYS_USE_EC, REG1; \
 	brz,pn		REG1, FAIL_LABEL; \
-	 sllx		VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \
+	sethi		%uhi(_PAGE_PUD_HUGE), REG2; \
+	brz,pn		REG1, FAIL_LABEL; \
+	 sllx		REG2, 32, REG2; \
+	andcc		REG1, REG2, %g0; \
+	sethi		%hi(0xf8000000), REG2; \
+	bne,pt		%xcc, 697f; \
+	 sllx		REG2, 1, REG2; \
+	sllx		VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \
 	srlx		REG2, 64 - PAGE_SHIFT, REG2; \
 	andn		REG2, 0x7, REG2; \
 	ldxa		[REG1 + REG2] ASI_PHYS_USE_EC, REG1; \
+	sethi		%uhi(_PAGE_PMD_HUGE), REG2; \
 	brz,pn		REG1, FAIL_LABEL; \
-	 sllx		VADDR, 64 - PMD_SHIFT, REG2; \
+	 sllx		REG2, 32, REG2; \
+	andcc		REG1, REG2, %g0; \
+	be,pn		%xcc, 698f; \
+	 sethi		%hi(0x400000), REG2; \
+697:	brgez,pn	REG1, FAIL_LABEL; \
+	 andn		REG1, REG2, REG1; \
+	and		VADDR, REG2, REG2; \
+	ba,pt		%xcc, 699f; \
+	 or		REG1, REG2, REG1; \
+698:	sllx		VADDR, 64 - PMD_SHIFT, REG2; \
 	srlx		REG2, 64 - PAGE_SHIFT, REG2; \
 	andn		REG2, 0x7, REG2; \
-	add		REG1, REG2, REG1;
+	ldxa		[REG1 + REG2] ASI_PHYS_USE_EC, REG1; \
+	brgez,pn	REG1, FAIL_LABEL; \
+	 nop; \
+699:
 
 	/* PMD has been loaded into REG1, interpret the value, seeing
 	 * if it is a HUGE PMD or a normal one.  If it is not valid
diff --git a/arch/sparc/kernel/ktlb.S b/arch/sparc/kernel/ktlb.S
index 605d492..94a1e66 100644
--- a/arch/sparc/kernel/ktlb.S
+++ b/arch/sparc/kernel/ktlb.S
@@ -47,14 +47,6 @@ kvmap_itlb_vmalloc_addr:
 	KERN_PGTABLE_WALK(%g4, %g5, %g2, kvmap_itlb_longpath)
 
 	TSB_LOCK_TAG(%g1, %g2, %g7)
-
-	/* Load and check PTE.  */
-	ldxa		[%g5] ASI_PHYS_USE_EC, %g5
-	mov		1, %g7
-	sllx		%g7, TSB_TAG_INVALID_BIT, %g7
-	brgez,a,pn	%g5, kvmap_itlb_longpath
-	 TSB_STORE(%g1, %g7)
-
 	TSB_WRITE(%g1, %g5, %g6)
 
 	/* fallthrough to TLB load */
@@ -118,6 +110,12 @@ kvmap_dtlb_obp:
 	ba,pt		%xcc, kvmap_dtlb_load
 	 nop
 
+kvmap_linear_early:
+	sethi		%hi(kern_linear_pte_xor), %g7
+	ldx		[%g7 + %lo(kern_linear_pte_xor)], %g2
+	ba,pt		%xcc, kvmap_dtlb_tsb4m_load
+	 xor		%g2, %g4, %g5
+
 	.align		32
 kvmap_dtlb_tsb4m_load:
 	TSB_LOCK_TAG(%g1, %g2, %g7)
@@ -146,105 +144,17 @@ kvmap_dtlb_4v:
 	/* Correct TAG_TARGET is already in %g6, check 4mb TSB.  */
 	KERN_TSB4M_LOOKUP_TL1(%g6, %g5, %g1, %g2, %g3, kvmap_dtlb_load)
 #endif
-	/* TSB entry address left in %g1, lookup linear PTE.
-	 * Must preserve %g1 and %g6 (TAG).
-	 */
-kvmap_dtlb_tsb4m_miss:
-	/* Clear the PAGE_OFFSET top virtual bits, shift
-	 * down to get PFN, and make sure PFN is in range.
-	 */
-661:	sllx		%g4, 0, %g5
-	.section	.page_offset_shift_patch, "ax"
-	.word		661b
-	.previous
-
-	/* Check to see if we know about valid memory at the 4MB
-	 * chunk this physical address will reside within.
+	/* Linear mapping TSB lookup failed.  Fallthrough to kernel
+	 * page table based lookup.
 	 */
-661:	srlx		%g5, MAX_PHYS_ADDRESS_BITS, %g2
-	.section	.page_offset_shift_patch, "ax"
-	.word		661b
-	.previous
-
-	brnz,pn		%g2, kvmap_dtlb_longpath
-	 nop
-
-	/* This unconditional branch and delay-slot nop gets patched
-	 * by the sethi sequence once the bitmap is properly setup.
-	 */
-	.globl		valid_addr_bitmap_insn
-valid_addr_bitmap_insn:
-	ba,pt		%xcc, 2f
-	 nop
-	.subsection	2
-	.globl		valid_addr_bitmap_patch
-valid_addr_bitmap_patch:
-	sethi		%hi(sparc64_valid_addr_bitmap), %g7
-	or		%g7, %lo(sparc64_valid_addr_bitmap), %g7
-	.previous
-
-661:	srlx		%g5, ILOG2_4MB, %g2
-	.section	.page_offset_shift_patch, "ax"
-	.word		661b
-	.previous
-
-	srlx		%g2, 6, %g5
-	and		%g2, 63, %g2
-	sllx		%g5, 3, %g5
-	ldx		[%g7 + %g5], %g5
-	mov		1, %g7
-	sllx		%g7, %g2, %g7
-	andcc		%g5, %g7, %g0
-	be,pn		%xcc, kvmap_dtlb_longpath
-
-2:	 sethi		%hi(kpte_linear_bitmap), %g2
-
-	/* Get the 256MB physical address index. */
-661:	sllx		%g4, 0, %g5
-	.section	.page_offset_shift_patch, "ax"
-	.word		661b
-	.previous
-
-	or		%g2, %lo(kpte_linear_bitmap), %g2
-
-661:	srlx		%g5, ILOG2_256MB, %g5
-	.section	.page_offset_shift_patch, "ax"
-	.word		661b
-	.previous
-
-	and		%g5, (32 - 1), %g7
-
-	/* Divide by 32 to get the offset into the bitmask.  */
-	srlx		%g5, 5, %g5
-	add		%g7, %g7, %g7
-	sllx		%g5, 3, %g5
-
-	/* kern_linear_pte_xor[(mask >> shift) & 3)] */
-	ldx		[%g2 + %g5], %g2
-	srlx		%g2, %g7, %g7
-	sethi		%hi(kern_linear_pte_xor), %g5
-	and		%g7, 3, %g7
-	or		%g5, %lo(kern_linear_pte_xor), %g5
-	sllx		%g7, 3, %g7
-	ldx		[%g5 + %g7], %g2
-
 	.globl		kvmap_linear_patch
 kvmap_linear_patch:
-	ba,pt		%xcc, kvmap_dtlb_tsb4m_load
-	 xor		%g2, %g4, %g5
+	ba,a,pt		%xcc, kvmap_linear_early
 
 kvmap_dtlb_vmalloc_addr:
 	KERN_PGTABLE_WALK(%g4, %g5, %g2, kvmap_dtlb_longpath)
 
 	TSB_LOCK_TAG(%g1, %g2, %g7)
-
-	/* Load and check PTE.  */
-	ldxa		[%g5] ASI_PHYS_USE_EC, %g5
-	mov		1, %g7
-	sllx		%g7, TSB_TAG_INVALID_BIT, %g7
-	brgez,a,pn	%g5, kvmap_dtlb_longpath
-	 TSB_STORE(%g1, %g7)
-
 	TSB_WRITE(%g1, %g5, %g6)
 
 	/* fallthrough to TLB load */
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index 932ff90..0bacceb 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -122,11 +122,6 @@ SECTIONS
 		*(.swapper_4m_tsb_phys_patch)
 		__swapper_4m_tsb_phys_patch_end = .;
 	}
-	.page_offset_shift_patch : {
-		__page_offset_shift_patch = .;
-		*(.page_offset_shift_patch)
-		__page_offset_shift_patch_end = .;
-	}
 	.popc_3insn_patch : {
 		__popc_3insn_patch = .;
 		*(.popc_3insn_patch)
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 35fcc9c..848440a 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -75,7 +75,6 @@ unsigned long kern_linear_pte_xor[4] __read_mostly;
  * 'cpu' properties, but we need to have this table setup before the
  * MDESC is initialized.
  */
-unsigned long kpte_linear_bitmap[KPTE_BITMAP_BYTES / sizeof(unsigned long)];
 
 #ifndef CONFIG_DEBUG_PAGEALLOC
 /* A special kernel TSB for 4MB, 256MB, 2GB and 16GB linear mappings.
@@ -84,6 +83,7 @@ unsigned long kpte_linear_bitmap[KPTE_BITMAP_BYTES / sizeof(unsigned long)];
  */
 extern struct tsb swapper_4m_tsb[KERNEL_TSB4M_NENTRIES];
 #endif
+extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES];
 
 static unsigned long cpu_pgsz_mask;
 
@@ -165,10 +165,6 @@ static void __init read_obp_memory(const char *property,
 	     cmp_p64, NULL);
 }
 
-unsigned long sparc64_valid_addr_bitmap[VALID_ADDR_BITMAP_BYTES /
-					sizeof(unsigned long)];
-EXPORT_SYMBOL(sparc64_valid_addr_bitmap);
-
 /* Kernel physical address base and size in bytes.  */
 unsigned long kern_base __read_mostly;
 unsigned long kern_size __read_mostly;
@@ -1369,9 +1365,145 @@ static unsigned long __init bootmem_init(unsigned long phys_base)
 static struct linux_prom64_registers pall[MAX_BANKS] __initdata;
 static int pall_ents __initdata;
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
+static unsigned long max_phys_bits = 40;
+
+bool kern_addr_valid(unsigned long addr)
+{
+	unsigned long above = ((long)addr) >> max_phys_bits;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	if (above != 0 && above != -1UL)
+		return false;
+
+	if (addr >= (unsigned long) KERNBASE &&
+	    addr < (unsigned long)&_end)
+		return true;
+
+	if (addr >= PAGE_OFFSET) {
+		unsigned long pa = __pa(addr);
+
+		return pfn_valid(pa >> PAGE_SHIFT);
+	}
+
+	pgd = pgd_offset_k(addr);
+	if (pgd_none(*pgd))
+		return 0;
+
+	pud = pud_offset(pgd, addr);
+	if (pud_none(*pud))
+		return 0;
+
+	if (pud_large(*pud))
+		return pfn_valid(pud_pfn(*pud));
+
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd))
+		return 0;
+
+	if (pmd_large(*pmd))
+		return pfn_valid(pmd_pfn(*pmd));
+
+	pte = pte_offset_kernel(pmd, addr);
+	if (pte_none(*pte))
+		return 0;
+
+	return pfn_valid(pte_pfn(*pte));
+}
+EXPORT_SYMBOL(kern_addr_valid);
+
+static unsigned long __ref kernel_map_hugepud(unsigned long vstart,
+					      unsigned long vend,
+					      pud_t *pud)
+{
+	const unsigned long mask16gb = (1UL << 34) - 1UL;
+	u64 pte_val = vstart;
+
+	/* Each PUD is 8GB */
+	if ((vstart & mask16gb) ||
+	    (vend - vstart <= mask16gb)) {
+		pte_val ^= kern_linear_pte_xor[2];
+		pud_val(*pud) = pte_val | _PAGE_PUD_HUGE;
+
+		return vstart + PUD_SIZE;
+	}
+
+	pte_val ^= kern_linear_pte_xor[3];
+	pte_val |= _PAGE_PUD_HUGE;
+
+	vend = vstart + mask16gb + 1UL;
+	while (vstart < vend) {
+		pud_val(*pud) = pte_val;
+
+		pte_val += PUD_SIZE;
+		vstart += PUD_SIZE;
+		pud++;
+	}
+	return vstart;
+}
+
+static bool kernel_can_map_hugepud(unsigned long vstart, unsigned long vend,
+				   bool guard)
+{
+	if (guard && !(vstart & ~PUD_MASK) && (vend - vstart) >= PUD_SIZE)
+		return true;
+
+	return false;
+}
+
+static unsigned long __ref kernel_map_hugepmd(unsigned long vstart,
+					      unsigned long vend,
+					      pmd_t *pmd)
+{
+	const unsigned long mask256mb = (1UL << 28) - 1UL;
+	const unsigned long mask2gb = (1UL << 31) - 1UL;
+	u64 pte_val = vstart;
+
+	/* Each PMD is 8MB */
+	if ((vstart & mask256mb) ||
+	    (vend - vstart <= mask256mb)) {
+		pte_val ^= kern_linear_pte_xor[0];
+		pmd_val(*pmd) = pte_val | _PAGE_PMD_HUGE;
+
+		return vstart + PMD_SIZE;
+	}
+
+	if ((vstart & mask2gb) ||
+	    (vend - vstart <= mask2gb)) {
+		pte_val ^= kern_linear_pte_xor[1];
+		pte_val |= _PAGE_PMD_HUGE;
+		vend = vstart + mask256mb + 1UL;
+	} else {
+		pte_val ^= kern_linear_pte_xor[2];
+		pte_val |= _PAGE_PMD_HUGE;
+		vend = vstart + mask2gb + 1UL;
+	}
+
+	while (vstart < vend) {
+		pmd_val(*pmd) = pte_val;
+
+		pte_val += PMD_SIZE;
+		vstart += PMD_SIZE;
+		pmd++;
+	}
+
+	return vstart;
+}
+
+static bool kernel_can_map_hugepmd(unsigned long vstart, unsigned long vend,
+				   bool guard)
+{
+	if (guard && !(vstart & ~PMD_MASK) && (vend - vstart) >= PMD_SIZE)
+		return true;
+
+	return false;
+}
+
 static unsigned long __ref kernel_map_range(unsigned long pstart,
-					    unsigned long pend, pgprot_t prot)
+					    unsigned long pend, pgprot_t prot,
+					    bool use_huge)
 {
 	unsigned long vstart = PAGE_OFFSET + pstart;
 	unsigned long vend = PAGE_OFFSET + pend;
@@ -1401,15 +1533,23 @@ static unsigned long __ref kernel_map_range(unsigned long pstart,
 		if (pud_none(*pud)) {
 			pmd_t *new;
 
+			if (kernel_can_map_hugepud(vstart, vend, use_huge)) {
+				vstart = kernel_map_hugepud(vstart, vend, pud);
+				continue;
+			}
 			new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
 			alloc_bytes += PAGE_SIZE;
 			pud_populate(&init_mm, pud, new);
 		}
 
 		pmd = pmd_offset(pud, vstart);
-		if (!pmd_present(*pmd)) {
+		if (pmd_none(*pmd)) {
 			pte_t *new;
 
+			if (kernel_can_map_hugepmd(vstart, vend, use_huge)) {
+				vstart = kernel_map_hugepmd(vstart, vend, pmd);
+				continue;
+			}
 			new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
 			alloc_bytes += PAGE_SIZE;
 			pmd_populate_kernel(&init_mm, pmd, new);
@@ -1432,100 +1572,34 @@ static unsigned long __ref kernel_map_range(unsigned long pstart,
 	return alloc_bytes;
 }
 
-extern unsigned int kvmap_linear_patch[1];
-#endif /* CONFIG_DEBUG_PAGEALLOC */
-
-static void __init kpte_set_val(unsigned long index, unsigned long val)
-{
-	unsigned long *ptr = kpte_linear_bitmap;
-
-	val <<= ((index % (BITS_PER_LONG / 2)) * 2);
-	ptr += (index / (BITS_PER_LONG / 2));
-
-	*ptr |= val;
-}
-
-static const unsigned long kpte_shift_min = 28; /* 256MB */
-static const unsigned long kpte_shift_max = 34; /* 16GB */
-static const unsigned long kpte_shift_incr = 3;
-
-static unsigned long kpte_mark_using_shift(unsigned long start, unsigned long end,
-					   unsigned long shift)
+static void __init flush_all_kernel_tsbs(void)
 {
-	unsigned long size = (1UL << shift);
-	unsigned long mask = (size - 1UL);
-	unsigned long remains = end - start;
-	unsigned long val;
-
-	if (remains < size || (start & mask))
-		return start;
-
-	/* VAL maps:
-	 *
-	 *	shift 28 --> kern_linear_pte_xor index 1
-	 *	shift 31 --> kern_linear_pte_xor index 2
-	 *	shift 34 --> kern_linear_pte_xor index 3
-	 */
-	val = ((shift - kpte_shift_min) / kpte_shift_incr) + 1;
-
-	remains &= ~mask;
-	if (shift != kpte_shift_max)
-		remains = size;
-
-	while (remains) {
-		unsigned long index = start >> kpte_shift_min;
+	int i;
 
-		kpte_set_val(index, val);
+	for (i = 0; i < KERNEL_TSB_NENTRIES; i++) {
+		struct tsb *ent = &swapper_tsb[i];
 
-		start += 1UL << kpte_shift_min;
-		remains -= 1UL << kpte_shift_min;
+		ent->tag = (1UL << TSB_TAG_INVALID_BIT);
 	}
+#ifndef CONFIG_DEBUG_PAGEALLOC
+	for (i = 0; i < KERNEL_TSB4M_NENTRIES; i++) {
+		struct tsb *ent = &swapper_4m_tsb[i];
 
-	return start;
-}
-
-static void __init mark_kpte_bitmap(unsigned long start, unsigned long end)
-{
-	unsigned long smallest_size, smallest_mask;
-	unsigned long s;
-
-	smallest_size = (1UL << kpte_shift_min);
-	smallest_mask = (smallest_size - 1UL);
-
-	while (start < end) {
-		unsigned long orig_start = start;
-
-		for (s = kpte_shift_max; s >= kpte_shift_min; s -= kpte_shift_incr) {
-			start = kpte_mark_using_shift(start, end, s);
-
-			if (start != orig_start)
-				break;
-		}
-
-		if (start == orig_start)
-			start = (start + smallest_size) & ~smallest_mask;
+		ent->tag = (1UL << TSB_TAG_INVALID_BIT);
 	}
+#endif
 }
 
-static void __init init_kpte_bitmap(void)
-{
-	unsigned long i;
-
-	for (i = 0; i < pall_ents; i++) {
-		unsigned long phys_start, phys_end;
-
-		phys_start = pall[i].phys_addr;
-		phys_end = phys_start + pall[i].reg_size;
-
-		mark_kpte_bitmap(phys_start, phys_end);
-	}
-}
+extern unsigned int kvmap_linear_patch[1];
 
 static void __init kernel_physical_mapping_init(void)
 {
-#ifdef CONFIG_DEBUG_PAGEALLOC
 	unsigned long i, mem_alloced = 0UL;
+	bool use_huge = true;
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	use_huge = false;
+#endif
 	for (i = 0; i < pall_ents; i++) {
 		unsigned long phys_start, phys_end;
 
@@ -1533,7 +1607,7 @@ static void __init kernel_physical_mapping_init(void)
 		phys_end = phys_start + pall[i].reg_size;
 
 		mem_alloced += kernel_map_range(phys_start, phys_end,
-						PAGE_KERNEL);
+						PAGE_KERNEL, use_huge);
 	}
 
 	printk("Allocated %ld bytes for kernel page tables.\n",
@@ -1542,8 +1616,9 @@ static void __init kernel_physical_mapping_init(void)
 	kvmap_linear_patch[0] = 0x01000000; /* nop */
 	flushi(&kvmap_linear_patch[0]);
 
+	flush_all_kernel_tsbs();
+
 	__flush_tlb_all();
-#endif
 }
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
@@ -1553,7 +1628,7 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
 	unsigned long phys_end = phys_start + (numpages * PAGE_SIZE);
 
 	kernel_map_range(phys_start, phys_end,
-			 (enable ? PAGE_KERNEL : __pgprot(0)));
+			 (enable ? PAGE_KERNEL : __pgprot(0)), false);
 
 	flush_tsb_kernel_range(PAGE_OFFSET + phys_start,
 			       PAGE_OFFSET + phys_end);
@@ -1581,62 +1656,11 @@ unsigned long __init find_ecache_flush_span(unsigned long size)
 unsigned long PAGE_OFFSET;
 EXPORT_SYMBOL(PAGE_OFFSET);
 
-static void __init page_offset_shift_patch_one(unsigned int *insn, unsigned long phys_bits)
-{
-	unsigned long final_shift;
-	unsigned int val = *insn;
-	unsigned int cnt;
-
-	/* We are patching in ilog2(max_supported_phys_address), and
-	 * we are doing so in a manner similar to a relocation addend.
-	 * That is, we are adding the shift value to whatever value
-	 * is in the shift instruction count field already.
-	 */
-	cnt = (val & 0x3f);
-	val &= ~0x3f;
-
-	/* If we are trying to shift >= 64 bits, clear the destination
-	 * register.  This can happen when phys_bits ends up being equal
-	 * to MAX_PHYS_ADDRESS_BITS.
-	 */
-	final_shift = (cnt + (64 - phys_bits));
-	if (final_shift >= 64) {
-		unsigned int rd = (val >> 25) & 0x1f;
-
-		val = 0x80100000 | (rd << 25);
-	} else {
-		val |= final_shift;
-	}
-	*insn = val;
-
-	__asm__ __volatile__("flush	%0"
-			     : /* no outputs */
-			     : "r" (insn));
-}
-
-static void __init page_offset_shift_patch(unsigned long phys_bits)
-{
-	extern unsigned int __page_offset_shift_patch;
-	extern unsigned int __page_offset_shift_patch_end;
-	unsigned int *p;
-
-	p = &__page_offset_shift_patch;
-	while (p < &__page_offset_shift_patch_end) {
-		unsigned int *insn = (unsigned int *)(unsigned long)*p;
-
-		page_offset_shift_patch_one(insn, phys_bits);
-
-		p++;
-	}
-}
-
 unsigned long sparc64_va_hole_top =    0xfffff80000000000UL;
 unsigned long sparc64_va_hole_bottom = 0x0000080000000000UL;
 
 static void __init setup_page_offset(void)
 {
-	unsigned long max_phys_bits = 40;
-
 	if (tlb_type == cheetah || tlb_type == cheetah_plus) {
 		/* Cheetah/Panther support a full 64-bit virtual
 		 * address, so we can use all that our page tables
@@ -1685,8 +1709,6 @@ static void __init setup_page_offset(void)
 
 	pr_info("PAGE_OFFSET is 0x%016lx (max_phys_bits == %lu)\n",
 		PAGE_OFFSET, max_phys_bits);
-
-	page_offset_shift_patch(max_phys_bits);
 }
 
 static void __init tsb_phys_patch(void)
@@ -1731,7 +1753,6 @@ static void __init tsb_phys_patch(void)
 #define NUM_KTSB_DESCR	1
 #endif
 static struct hv_tsb_descr ktsb_descr[NUM_KTSB_DESCR];
-extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES];
 
 /* The swapper TSBs are loaded with a base sequence of:
  *
@@ -2077,11 +2098,9 @@ void __init paging_init(void)
 
 	pmd = swapper_low_pmd_dir + (shift / sizeof(pmd_t));
 	pud_set(&swapper_pud_dir[0], pmd);
-	
+
 	inherit_prom_mappings();
 	
-	init_kpte_bitmap();
-
 	/* Ok, we can use our TLB miss and window trap handlers safely.  */
 	setup_tba();
 
@@ -2188,70 +2207,6 @@ int page_in_phys_avail(unsigned long paddr)
 	return 0;
 }
 
-static struct linux_prom64_registers pavail_rescan[MAX_BANKS] __initdata;
-static int pavail_rescan_ents __initdata;
-
-/* Certain OBP calls, such as fetching "available" properties, can
- * claim physical memory.  So, along with initializing the valid
- * address bitmap, what we do here is refetch the physical available
- * memory list again, and make sure it provides at least as much
- * memory as 'pavail' does.
- */
-static void __init setup_valid_addr_bitmap_from_pavail(unsigned long *bitmap)
-{
-	int i;
-
-	read_obp_memory("available", &pavail_rescan[0], &pavail_rescan_ents);
-
-	for (i = 0; i < pavail_ents; i++) {
-		unsigned long old_start, old_end;
-
-		old_start = pavail[i].phys_addr;
-		old_end = old_start + pavail[i].reg_size;
-		while (old_start < old_end) {
-			int n;
-
-			for (n = 0; n < pavail_rescan_ents; n++) {
-				unsigned long new_start, new_end;
-
-				new_start = pavail_rescan[n].phys_addr;
-				new_end = new_start +
-					pavail_rescan[n].reg_size;
-
-				if (new_start <= old_start &&
-				    new_end >= (old_start + PAGE_SIZE)) {
-					set_bit(old_start >> ILOG2_4MB, bitmap);
-					goto do_next_page;
-				}
-			}
-
-			prom_printf("mem_init: Lost memory in pavail\n");
-			prom_printf("mem_init: OLD start[%lx] size[%lx]\n",
-				    pavail[i].phys_addr,
-				    pavail[i].reg_size);
-			prom_printf("mem_init: NEW start[%lx] size[%lx]\n",
-				    pavail_rescan[i].phys_addr,
-				    pavail_rescan[i].reg_size);
-			prom_printf("mem_init: Cannot continue, aborting.\n");
-			prom_halt();
-
-		do_next_page:
-			old_start += PAGE_SIZE;
-		}
-	}
-}
-
-static void __init patch_tlb_miss_handler_bitmap(void)
-{
-	extern unsigned int valid_addr_bitmap_insn[];
-	extern unsigned int valid_addr_bitmap_patch[];
-
-	valid_addr_bitmap_insn[1] = valid_addr_bitmap_patch[1];
-	mb();
-	valid_addr_bitmap_insn[0] = valid_addr_bitmap_patch[0];
-	flushi(&valid_addr_bitmap_insn[0]);
-}
-
 static void __init register_page_bootmem_info(void)
 {
 #ifdef CONFIG_NEED_MULTIPLE_NODES
@@ -2264,18 +2219,6 @@ static void __init register_page_bootmem_info(void)
 }
 void __init mem_init(void)
 {
-	unsigned long addr, last;
-
-	addr = PAGE_OFFSET + kern_base;
-	last = PAGE_ALIGN(kern_size) + addr;
-	while (addr < last) {
-		set_bit(__pa(addr) >> ILOG2_4MB, sparc64_valid_addr_bitmap);
-		addr += PAGE_SIZE;
-	}
-
-	setup_valid_addr_bitmap_from_pavail(sparc64_valid_addr_bitmap);
-	patch_tlb_miss_handler_bitmap();
-
 	high_memory = __va(last_valid_pfn << PAGE_SHIFT);
 
 	register_page_bootmem_info();
diff --git a/arch/sparc/mm/init_64.h b/arch/sparc/mm/init_64.h
index 0668b36..29ff73f 100644
--- a/arch/sparc/mm/init_64.h
+++ b/arch/sparc/mm/init_64.h
@@ -8,15 +8,8 @@
  */
 
 #define MAX_PHYS_ADDRESS	(1UL << MAX_PHYS_ADDRESS_BITS)
-#define KPTE_BITMAP_CHUNK_SZ		(256UL * 1024UL * 1024UL)
-#define KPTE_BITMAP_BYTES	\
-	((MAX_PHYS_ADDRESS / KPTE_BITMAP_CHUNK_SZ) / 4)
-#define VALID_ADDR_BITMAP_CHUNK_SZ	(4UL * 1024UL * 1024UL)
-#define VALID_ADDR_BITMAP_BYTES	\
-	((MAX_PHYS_ADDRESS / VALID_ADDR_BITMAP_CHUNK_SZ) / 8)
 
 extern unsigned long kern_linear_pte_xor[4];
-extern unsigned long kpte_linear_bitmap[KPTE_BITMAP_BYTES / sizeof(unsigned long)];
 extern unsigned int sparc64_highest_unlocked_tlb_ent;
 extern unsigned long sparc64_kern_pri_context;
 extern unsigned long sparc64_kern_pri_nuc_bits;
-- 
cgit v0.10.2


From c06240c7f5c39c83dfd7849c0770775562441b96 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 24 Sep 2014 21:20:14 -0700
Subject: sparc64: Use kernel page tables for vmemmap.

For sparse memory configurations, the vmemmap array behaves terribly
and it takes up an inordinate amount of space in the BSS section of
the kernel image unconditionally.

Just build huge PMDs and look them up just like we do for TLB misses
in the vmalloc area.

Kernel BSS shrinks by about 2MB.

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Bob Picco <bob.picco@oracle.com>

diff --git a/arch/sparc/kernel/ktlb.S b/arch/sparc/kernel/ktlb.S
index 94a1e66..2627a7f 100644
--- a/arch/sparc/kernel/ktlb.S
+++ b/arch/sparc/kernel/ktlb.S
@@ -186,13 +186,8 @@ kvmap_dtlb_load:
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 kvmap_vmemmap:
-	sub		%g4, %g5, %g5
-	srlx		%g5, ILOG2_4MB, %g5
-	sethi		%hi(vmemmap_table), %g1
-	sllx		%g5, 3, %g5
-	or		%g1, %lo(vmemmap_table), %g1
-	ba,pt		%xcc, kvmap_dtlb_load
-	 ldx		[%g1 + %g5], %g5
+	KERN_PGTABLE_WALK(%g4, %g5, %g2, kvmap_dtlb_longpath)
+	ba,a,pt		%xcc, kvmap_dtlb_load
 #endif
 
 kvmap_dtlb_nonlinear:
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 848440a..6d5d562 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2308,18 +2308,9 @@ unsigned long _PAGE_CACHE __read_mostly;
 EXPORT_SYMBOL(_PAGE_CACHE);
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
-unsigned long vmemmap_table[VMEMMAP_SIZE];
-
-static long __meminitdata addr_start, addr_end;
-static int __meminitdata node_start;
-
 int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend,
 			       int node)
 {
-	unsigned long phys_start = (vstart - VMEMMAP_BASE);
-	unsigned long phys_end = (vend - VMEMMAP_BASE);
-	unsigned long addr = phys_start & VMEMMAP_CHUNK_MASK;
-	unsigned long end = VMEMMAP_ALIGN(phys_end);
 	unsigned long pte_base;
 
 	pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4U |
@@ -2330,47 +2321,52 @@ int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend,
 			    _PAGE_CP_4V | _PAGE_CV_4V |
 			    _PAGE_P_4V | _PAGE_W_4V);
 
-	for (; addr < end; addr += VMEMMAP_CHUNK) {
-		unsigned long *vmem_pp =
-			vmemmap_table + (addr >> VMEMMAP_CHUNK_SHIFT);
-		void *block;
+	pte_base |= _PAGE_PMD_HUGE;
 
-		if (!(*vmem_pp & _PAGE_VALID)) {
-			block = vmemmap_alloc_block(1UL << ILOG2_4MB, node);
-			if (!block)
+	vstart = vstart & PMD_MASK;
+	vend = ALIGN(vend, PMD_SIZE);
+	for (; vstart < vend; vstart += PMD_SIZE) {
+		pgd_t *pgd = pgd_offset_k(vstart);
+		unsigned long pte;
+		pud_t *pud;
+		pmd_t *pmd;
+
+		if (pgd_none(*pgd)) {
+			pud_t *new = vmemmap_alloc_block(PAGE_SIZE, node);
+
+			if (!new)
 				return -ENOMEM;
+			pgd_populate(&init_mm, pgd, new);
+		}
 
-			*vmem_pp = pte_base | __pa(block);
+		pud = pud_offset(pgd, vstart);
+		if (pud_none(*pud)) {
+			pmd_t *new = vmemmap_alloc_block(PAGE_SIZE, node);
 
-			/* check to see if we have contiguous blocks */
-			if (addr_end != addr || node_start != node) {
-				if (addr_start)
-					printk(KERN_DEBUG " [%lx-%lx] on node %d\n",
-					       addr_start, addr_end-1, node_start);
-				addr_start = addr;
-				node_start = node;
-			}
-			addr_end = addr + VMEMMAP_CHUNK;
+			if (!new)
+				return -ENOMEM;
+			pud_populate(&init_mm, pud, new);
 		}
-	}
-	return 0;
-}
 
-void __meminit vmemmap_populate_print_last(void)
-{
-	if (addr_start) {
-		printk(KERN_DEBUG " [%lx-%lx] on node %d\n",
-		       addr_start, addr_end-1, node_start);
-		addr_start = 0;
-		addr_end = 0;
-		node_start = 0;
+		pmd = pmd_offset(pud, vstart);
+
+		pte = pmd_val(*pmd);
+		if (!(pte & _PAGE_VALID)) {
+			void *block = vmemmap_alloc_block(PMD_SIZE, node);
+
+			if (!block)
+				return -ENOMEM;
+
+			pmd_val(*pmd) = pte_base | __pa(block);
+		}
 	}
+
+	return 0;
 }
 
 void vmemmap_free(unsigned long start, unsigned long end)
 {
 }
-
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
 static void prot_init_common(unsigned long page_none,
diff --git a/arch/sparc/mm/init_64.h b/arch/sparc/mm/init_64.h
index 29ff73f..a4c0960 100644
--- a/arch/sparc/mm/init_64.h
+++ b/arch/sparc/mm/init_64.h
@@ -31,15 +31,4 @@ extern unsigned long kern_locked_tte_data;
 
 void prom_world(int enter);
 
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-#define VMEMMAP_CHUNK_SHIFT	22
-#define VMEMMAP_CHUNK		(1UL << VMEMMAP_CHUNK_SHIFT)
-#define VMEMMAP_CHUNK_MASK	~(VMEMMAP_CHUNK - 1UL)
-#define VMEMMAP_ALIGN(x)	(((x)+VMEMMAP_CHUNK-1UL)&VMEMMAP_CHUNK_MASK)
-
-#define VMEMMAP_SIZE	((((1UL << MAX_PHYSADDR_BITS) >> PAGE_SHIFT) * \
-			  sizeof(struct page)) >> VMEMMAP_CHUNK_SHIFT)
-extern unsigned long vmemmap_table[VMEMMAP_SIZE];
-#endif
-
 #endif /* _SPARC64_MM_INIT_H */
-- 
cgit v0.10.2


From 7c0fa0f24bb76ce3d67be7f737b799846a04570f Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 24 Sep 2014 21:49:29 -0700
Subject: sparc64: Increase MAX_PHYS_ADDRESS_BITS to 53.

Make sure, at compile time, that the kernel can properly support
whatever MAX_PHYS_ADDRESS_BITS is defined to.

On M7 chips, use a max_phys_bits value of 49.

Based upon a patch by Bob Picco.

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Bob Picco <bob.picco@oracle.com>

diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h
index 732ba17..6784a33 100644
--- a/arch/sparc/include/asm/page_64.h
+++ b/arch/sparc/include/asm/page_64.h
@@ -122,11 +122,11 @@ extern unsigned long PAGE_OFFSET;
 
 #endif /* !(__ASSEMBLY__) */
 
-/* The maximum number of physical memory address bits we support, this
- * is used to size various tables used to manage kernel TLB misses and
- * also the sparsemem code.
+/* The maximum number of physical memory address bits we support.  The
+ * largest value we can support is whatever "KPGD_SHIFT + KPTE_BITS"
+ * evaluates to.
  */
-#define MAX_PHYS_ADDRESS_BITS	47
+#define MAX_PHYS_ADDRESS_BITS	53
 
 #define ILOG2_4MB		22
 #define ILOG2_256MB		28
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index a305b22..0552957 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -67,6 +67,10 @@
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 #define PGDIR_BITS	(PAGE_SHIFT - 3)
 
+#if (MAX_PHYS_ADDRESS_BITS > PGDIR_SHIFT + PGDIR_BITS)
+#error MAX_PHYS_ADDRESS_BITS exceeds what kernel page tables can support
+#endif
+
 #if (PGDIR_SHIFT + PGDIR_BITS) != 53
 #error Page table parameters do not cover virtual address space properly.
 #endif
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 6d5d562..e0c1206 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1690,12 +1690,19 @@ static void __init setup_page_offset(void)
 		case SUN4V_CHIP_NIAGARA4:
 		case SUN4V_CHIP_NIAGARA5:
 		case SUN4V_CHIP_SPARC64X:
-		default:
+		case SUN4V_CHIP_SPARC_M6:
 			/* T4 and later support 52-bit virtual addresses.  */
 			sparc64_va_hole_top =    0xfff8000000000000UL;
 			sparc64_va_hole_bottom = 0x0008000000000000UL;
 			max_phys_bits = 47;
 			break;
+		case SUN4V_CHIP_SPARC_M7:
+		default:
+			/* M7 and later support 52-bit virtual addresses.  */
+			sparc64_va_hole_top =    0xfff8000000000000UL;
+			sparc64_va_hole_bottom = 0x0008000000000000UL;
+			max_phys_bits = 49;
+			break;
 		}
 	}
 
-- 
cgit v0.10.2


From bb4e6e85daa52a9f6210fa06a5ec6269598a202b Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 27 Sep 2014 11:05:21 -0700
Subject: sparc64: Adjust vmalloc region size based upon available virtual
 address bits.

In order to accomodate embedded per-cpu allocation with large numbers
of cpus and numa nodes, we have to use as much virtual address space
as possible for the vmalloc region.  Otherwise we can get things like:

PERCPU: max_distance=0x380001c10000 too large for vmalloc space 0xff00000000

So, once we select a value for PAGE_OFFSET, derive the size of the
vmalloc region based upon that.

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Bob Picco <bob.picco@oracle.com>

diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h
index 6784a33..8c2a8c9 100644
--- a/arch/sparc/include/asm/page_64.h
+++ b/arch/sparc/include/asm/page_64.h
@@ -117,7 +117,6 @@ extern unsigned long sparc64_va_hole_bottom;
 
 #include <asm-generic/memory_model.h>
 
-#define PAGE_OFFSET_BY_BITS(X)	(-(_AC(1,UL) << (X)))
 extern unsigned long PAGE_OFFSET;
 
 #endif /* !(__ASSEMBLY__) */
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 0552957..c093922 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -40,10 +40,7 @@
 #define LOW_OBP_ADDRESS		_AC(0x00000000f0000000,UL)
 #define HI_OBP_ADDRESS		_AC(0x0000000100000000,UL)
 #define VMALLOC_START		_AC(0x0000000100000000,UL)
-#define VMALLOC_END		_AC(0x0000010000000000,UL)
-#define VMEMMAP_BASE		_AC(0x0000010000000000,UL)
-
-#define vmemmap			((struct page *)VMEMMAP_BASE)
+#define VMEMMAP_BASE		VMALLOC_END
 
 /* PMD_SHIFT determines the size of the area a second-level page
  * table can map
@@ -81,6 +78,10 @@
 
 #ifndef __ASSEMBLY__
 
+extern unsigned long VMALLOC_END;
+
+#define vmemmap			((struct page *)VMEMMAP_BASE)
+
 #include <linux/sched.h>
 
 bool kern_addr_valid(unsigned long addr);
diff --git a/arch/sparc/kernel/ktlb.S b/arch/sparc/kernel/ktlb.S
index 2627a7f..ef0d8e9 100644
--- a/arch/sparc/kernel/ktlb.S
+++ b/arch/sparc/kernel/ktlb.S
@@ -199,8 +199,8 @@ kvmap_dtlb_nonlinear:
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 	/* Do not use the TSB for vmemmap.  */
-	mov		(VMEMMAP_BASE >> 40), %g5
-	sllx		%g5, 40, %g5
+	sethi		%hi(VMEMMAP_BASE), %g5
+	ldx		[%g5 + %lo(VMEMMAP_BASE)], %g5
 	cmp		%g4,%g5
 	bgeu,pn		%xcc, kvmap_vmemmap
 	 nop
@@ -212,8 +212,8 @@ kvmap_dtlb_tsbmiss:
 	sethi		%hi(MODULES_VADDR), %g5
 	cmp		%g4, %g5
 	blu,pn		%xcc, kvmap_dtlb_longpath
-	 mov		(VMALLOC_END >> 40), %g5
-	sllx		%g5, 40, %g5
+	 sethi		%hi(VMALLOC_END), %g5
+	ldx		[%g5 + %lo(VMALLOC_END)], %g5
 	cmp		%g4, %g5
 	bgeu,pn		%xcc, kvmap_dtlb_longpath
 	 nop
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index e0c1206..0ead74b 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1369,25 +1369,24 @@ static unsigned long max_phys_bits = 40;
 
 bool kern_addr_valid(unsigned long addr)
 {
-	unsigned long above = ((long)addr) >> max_phys_bits;
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
 
-	if (above != 0 && above != -1UL)
-		return false;
-
-	if (addr >= (unsigned long) KERNBASE &&
-	    addr < (unsigned long)&_end)
-		return true;
-
-	if (addr >= PAGE_OFFSET) {
+	if ((long)addr < 0L) {
 		unsigned long pa = __pa(addr);
 
+		if ((addr >> max_phys_bits) != 0UL)
+			return false;
+
 		return pfn_valid(pa >> PAGE_SHIFT);
 	}
 
+	if (addr >= (unsigned long) KERNBASE &&
+	    addr < (unsigned long)&_end)
+		return true;
+
 	pgd = pgd_offset_k(addr);
 	if (pgd_none(*pgd))
 		return 0;
@@ -1656,6 +1655,9 @@ unsigned long __init find_ecache_flush_span(unsigned long size)
 unsigned long PAGE_OFFSET;
 EXPORT_SYMBOL(PAGE_OFFSET);
 
+unsigned long VMALLOC_END   = 0x0000010000000000UL;
+EXPORT_SYMBOL(VMALLOC_END);
+
 unsigned long sparc64_va_hole_top =    0xfffff80000000000UL;
 unsigned long sparc64_va_hole_bottom = 0x0000080000000000UL;
 
@@ -1712,10 +1714,16 @@ static void __init setup_page_offset(void)
 		prom_halt();
 	}
 
-	PAGE_OFFSET = PAGE_OFFSET_BY_BITS(max_phys_bits);
+	PAGE_OFFSET = sparc64_va_hole_top;
+	VMALLOC_END = ((sparc64_va_hole_bottom >> 1) +
+		       (sparc64_va_hole_bottom >> 2));
 
-	pr_info("PAGE_OFFSET is 0x%016lx (max_phys_bits == %lu)\n",
+	pr_info("MM: PAGE_OFFSET is 0x%016lx (max_phys_bits == %lu)\n",
 		PAGE_OFFSET, max_phys_bits);
+	pr_info("MM: VMALLOC [0x%016lx --> 0x%016lx]\n",
+		VMALLOC_START, VMALLOC_END);
+	pr_info("MM: VMEMMAP [0x%016lx --> 0x%016lx]\n",
+		VMEMMAP_BASE, VMEMMAP_BASE << 1);
 }
 
 static void __init tsb_phys_patch(void)
-- 
cgit v0.10.2


From ee6a9333fa58e11577c1b531b8e0f5ffc0fd6f50 Mon Sep 17 00:00:00 2001
From: bob picco <bpicco@meloft.net>
Date: Thu, 25 Sep 2014 12:25:03 -0700
Subject: sparc64: sparse irq

This patch attempts to do a few things. The highlights are: 1) enable
SPARSE_IRQ unconditionally, 2) kills off !SPARSE_IRQ code 3) allocates
ivector_table at boot time and 4) default to cookie only VIRQ mechanism
for supported firmware. The first firmware with cookie only support for
me appears on T5. You can optionally force the HV firmware to not cookie
only mode which is the sysino support.

The sysino is a deprecated HV mechanism according to the most recent
SPARC Virtual Machine Specification. HV_GRP_INTR is what controls the
cookie/sysino firmware versioning.

The history of this interface is:

1) Major version 1.0 only supported sysino based interrupt interfaces.

2) Major version 2.0 added cookie based VIRQs, however due to the fact
   that OSs were using the VIRQs without negoatiating major version
   2.0 (Linux and Solaris are both guilty), the VIRQs calls were
   allowed even with major version 1.0

   To complicate things even further, the VIRQ interfaces were only
   actually hooked up in the hypervisor for LDC interrupt sources.
   VIRQ calls on other device types would result in HV_EINVAL errors.

   So effectively, major version 2.0 is unusable.

3) Major version 3.0 was created to signal use of VIRQs and the fact
   that the hypervisor has these calls hooked up for all interrupt
   sources, not just those for LDC devices.

A new boot option is provided should cookie only HV support have issues.
hvirq - this is the version for HV_GRP_INTR. This is related to HV API
versioning.  The code attempts major=3 first by default. The option can
be used to override this default.

I've tested with SPARSE_IRQ on T5-8, M7-4 and T4-X and Jalap?no.

Signed-off-by: Bob Picco <bob.picco@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index a537816..96ac69c 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -67,6 +67,7 @@ config SPARC64
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_CONTEXT_TRACKING
 	select HAVE_DEBUG_KMEMLEAK
+	select SPARSE_IRQ
 	select RTC_DRV_CMOS
 	select RTC_DRV_BQ4802
 	select RTC_DRV_SUN4V
diff --git a/arch/sparc/include/asm/irq_64.h b/arch/sparc/include/asm/irq_64.h
index 91d2193..3f70f90 100644
--- a/arch/sparc/include/asm/irq_64.h
+++ b/arch/sparc/include/asm/irq_64.h
@@ -37,7 +37,7 @@
  *
  * ino_bucket->irq allocation is made during {sun4v_,}build_irq().
  */
-#define NR_IRQS    255
+#define NR_IRQS		(2048)
 
 void irq_install_pre_handler(int irq,
 			     void (*func)(unsigned int, void *, void *),
@@ -57,11 +57,8 @@ unsigned int sun4u_build_msi(u32 portid, unsigned int *irq_p,
 			     unsigned long iclr_base);
 void sun4u_destroy_msi(unsigned int irq);
 
-unsigned char irq_alloc(unsigned int dev_handle,
-			unsigned int dev_ino);
-#ifdef CONFIG_PCI_MSI
+unsigned int irq_alloc(unsigned int dev_handle, unsigned int dev_ino);
 void irq_free(unsigned int irq);
-#endif
 
 void __init init_IRQ(void);
 void fixup_irqs(void);
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index 666193f..4033c23 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -47,8 +47,6 @@
 #include "cpumap.h"
 #include "kstack.h"
 
-#define NUM_IVECS	(IMAP_INR + 1)
-
 struct ino_bucket *ivector_table;
 unsigned long ivector_table_pa;
 
@@ -107,55 +105,196 @@ static void bucket_set_irq(unsigned long bucket_pa, unsigned int irq)
 
 #define irq_work_pa(__cpu)	&(trap_block[(__cpu)].irq_worklist_pa)
 
-static struct {
-	unsigned int dev_handle;
-	unsigned int dev_ino;
-	unsigned int in_use;
-} irq_table[NR_IRQS];
-static DEFINE_SPINLOCK(irq_alloc_lock);
+static unsigned long hvirq_major __initdata;
+static int __init early_hvirq_major(char *p)
+{
+	int rc = kstrtoul(p, 10, &hvirq_major);
+
+	return rc;
+}
+early_param("hvirq", early_hvirq_major);
+
+static int hv_irq_version;
+
+/* Major version 2.0 of HV_GRP_INTR added support for the VIRQ cookie
+ * based interfaces, but:
+ *
+ * 1) Several OSs, Solaris and Linux included, use them even when only
+ *    negotiating version 1.0 (or failing to negotiate at all).  So the
+ *    hypervisor has a workaround that provides the VIRQ interfaces even
+ *    when only verion 1.0 of the API is in use.
+ *
+ * 2) Second, and more importantly, with major version 2.0 these VIRQ
+ *    interfaces only were actually hooked up for LDC interrupts, even
+ *    though the Hypervisor specification clearly stated:
+ *
+ *	The new interrupt API functions will be available to a guest
+ *	when it negotiates version 2.0 in the interrupt API group 0x2. When
+ *	a guest negotiates version 2.0, all interrupt sources will only
+ *	support using the cookie interface, and any attempt to use the
+ *	version 1.0 interrupt APIs numbered 0xa0 to 0xa6 will result in the
+ *	ENOTSUPPORTED error being returned.
+ *
+ *   with an emphasis on "all interrupt sources".
+ *
+ * To correct this, major version 3.0 was created which does actually
+ * support VIRQs for all interrupt sources (not just LDC devices).  So
+ * if we want to move completely over the cookie based VIRQs we must
+ * negotiate major version 3.0 or later of HV_GRP_INTR.
+ */
+static bool sun4v_cookie_only_virqs(void)
+{
+	if (hv_irq_version >= 3)
+		return true;
+	return false;
+}
 
-unsigned char irq_alloc(unsigned int dev_handle, unsigned int dev_ino)
+static void __init irq_init_hv(void)
 {
-	unsigned long flags;
-	unsigned char ent;
+	unsigned long hv_error, major, minor = 0;
+
+	if (tlb_type != hypervisor)
+		return;
 
-	BUILD_BUG_ON(NR_IRQS >= 256);
+	if (hvirq_major)
+		major = hvirq_major;
+	else
+		major = 3;
 
-	spin_lock_irqsave(&irq_alloc_lock, flags);
+	hv_error = sun4v_hvapi_register(HV_GRP_INTR, major, &minor);
+	if (!hv_error)
+		hv_irq_version = major;
+	else
+		hv_irq_version = 1;
 
-	for (ent = 1; ent < NR_IRQS; ent++) {
-		if (!irq_table[ent].in_use)
+	pr_info("SUN4V: Using IRQ API major %d, cookie only virqs %s\n",
+		hv_irq_version,
+		sun4v_cookie_only_virqs() ? "enabled" : "disabled");
+}
+
+/* This function is for the timer interrupt.*/
+int __init arch_probe_nr_irqs(void)
+{
+	return 1;
+}
+
+#define DEFAULT_NUM_IVECS	(0xfffU)
+static unsigned int nr_ivec = DEFAULT_NUM_IVECS;
+#define NUM_IVECS (nr_ivec)
+
+static unsigned int __init size_nr_ivec(void)
+{
+	if (tlb_type == hypervisor) {
+		switch (sun4v_chip_type) {
+		/* Athena's devhandle|devino is large.*/
+		case SUN4V_CHIP_SPARC64X:
+			nr_ivec = 0xffff;
 			break;
+		}
 	}
-	if (ent >= NR_IRQS) {
-		printk(KERN_ERR "IRQ: Out of virtual IRQs.\n");
-		ent = 0;
-	} else {
-		irq_table[ent].dev_handle = dev_handle;
-		irq_table[ent].dev_ino = dev_ino;
-		irq_table[ent].in_use = 1;
-	}
+	return nr_ivec;
+}
+
+struct irq_handler_data {
+	union {
+		struct {
+			unsigned int dev_handle;
+			unsigned int dev_ino;
+		};
+		unsigned long sysino;
+	};
+	struct ino_bucket bucket;
+	unsigned long	iclr;
+	unsigned long	imap;
+};
+
+static inline unsigned int irq_data_to_handle(struct irq_data *data)
+{
+	struct irq_handler_data *ihd = data->handler_data;
+
+	return ihd->dev_handle;
+}
+
+static inline unsigned int irq_data_to_ino(struct irq_data *data)
+{
+	struct irq_handler_data *ihd = data->handler_data;
 
-	spin_unlock_irqrestore(&irq_alloc_lock, flags);
+	return ihd->dev_ino;
+}
+
+static inline unsigned long irq_data_to_sysino(struct irq_data *data)
+{
+	struct irq_handler_data *ihd = data->handler_data;
 
-	return ent;
+	return ihd->sysino;
 }
 
-#ifdef CONFIG_PCI_MSI
 void irq_free(unsigned int irq)
 {
-	unsigned long flags;
+	void *data = irq_get_handler_data(irq);
 
-	if (irq >= NR_IRQS)
-		return;
+	kfree(data);
+	irq_set_handler_data(irq, NULL);
+	irq_free_descs(irq, 1);
+}
 
-	spin_lock_irqsave(&irq_alloc_lock, flags);
+unsigned int irq_alloc(unsigned int dev_handle, unsigned int dev_ino)
+{
+	int irq;
 
-	irq_table[irq].in_use = 0;
+	irq = __irq_alloc_descs(-1, 1, 1, numa_node_id(), NULL);
+	if (irq <= 0)
+		goto out;
 
-	spin_unlock_irqrestore(&irq_alloc_lock, flags);
+	return irq;
+out:
+	return 0;
+}
+
+static unsigned int cookie_exists(u32 devhandle, unsigned int devino)
+{
+	unsigned long hv_err, cookie;
+	struct ino_bucket *bucket;
+	unsigned int irq = 0U;
+
+	hv_err = sun4v_vintr_get_cookie(devhandle, devino, &cookie);
+	if (hv_err) {
+		pr_err("HV get cookie failed hv_err = %ld\n", hv_err);
+		goto out;
+	}
+
+	if (cookie & ((1UL << 63UL))) {
+		cookie = ~cookie;
+		bucket = (struct ino_bucket *) __va(cookie);
+		irq = bucket->__irq;
+	}
+out:
+	return irq;
+}
+
+static unsigned int sysino_exists(u32 devhandle, unsigned int devino)
+{
+	unsigned long sysino = sun4v_devino_to_sysino(devhandle, devino);
+	struct ino_bucket *bucket;
+	unsigned int irq;
+
+	bucket = &ivector_table[sysino];
+	irq = bucket_get_irq(__pa(bucket));
+
+	return irq;
+}
+
+void ack_bad_irq(unsigned int irq)
+{
+	pr_crit("BAD IRQ ack %d\n", irq);
+}
+
+void irq_install_pre_handler(int irq,
+			     void (*func)(unsigned int, void *, void *),
+			     void *arg1, void *arg2)
+{
+	pr_warn("IRQ pre handler NOT supported.\n");
 }
-#endif
 
 /*
  * /proc/interrupts printing:
@@ -206,15 +345,6 @@ static unsigned int sun4u_compute_tid(unsigned long imap, unsigned long cpuid)
 	return tid;
 }
 
-struct irq_handler_data {
-	unsigned long	iclr;
-	unsigned long	imap;
-
-	void		(*pre_handler)(unsigned int, void *, void *);
-	void		*arg1;
-	void		*arg2;
-};
-
 #ifdef CONFIG_SMP
 static int irq_choose_cpu(unsigned int irq, const struct cpumask *affinity)
 {
@@ -316,8 +446,8 @@ static void sun4u_irq_eoi(struct irq_data *data)
 
 static void sun4v_irq_enable(struct irq_data *data)
 {
-	unsigned int ino = irq_table[data->irq].dev_ino;
 	unsigned long cpuid = irq_choose_cpu(data->irq, data->affinity);
+	unsigned int ino = irq_data_to_sysino(data);
 	int err;
 
 	err = sun4v_intr_settarget(ino, cpuid);
@@ -337,8 +467,8 @@ static void sun4v_irq_enable(struct irq_data *data)
 static int sun4v_set_affinity(struct irq_data *data,
 			       const struct cpumask *mask, bool force)
 {
-	unsigned int ino = irq_table[data->irq].dev_ino;
 	unsigned long cpuid = irq_choose_cpu(data->irq, mask);
+	unsigned int ino = irq_data_to_sysino(data);
 	int err;
 
 	err = sun4v_intr_settarget(ino, cpuid);
@@ -351,7 +481,7 @@ static int sun4v_set_affinity(struct irq_data *data,
 
 static void sun4v_irq_disable(struct irq_data *data)
 {
-	unsigned int ino = irq_table[data->irq].dev_ino;
+	unsigned int ino = irq_data_to_sysino(data);
 	int err;
 
 	err = sun4v_intr_setenabled(ino, HV_INTR_DISABLED);
@@ -362,7 +492,7 @@ static void sun4v_irq_disable(struct irq_data *data)
 
 static void sun4v_irq_eoi(struct irq_data *data)
 {
-	unsigned int ino = irq_table[data->irq].dev_ino;
+	unsigned int ino = irq_data_to_sysino(data);
 	int err;
 
 	err = sun4v_intr_setstate(ino, HV_INTR_STATE_IDLE);
@@ -373,14 +503,13 @@ static void sun4v_irq_eoi(struct irq_data *data)
 
 static void sun4v_virq_enable(struct irq_data *data)
 {
-	unsigned long cpuid, dev_handle, dev_ino;
+	unsigned long dev_handle = irq_data_to_handle(data);
+	unsigned long dev_ino = irq_data_to_ino(data);
+	unsigned long cpuid;
 	int err;
 
 	cpuid = irq_choose_cpu(data->irq, data->affinity);
 
-	dev_handle = irq_table[data->irq].dev_handle;
-	dev_ino = irq_table[data->irq].dev_ino;
-
 	err = sun4v_vintr_set_target(dev_handle, dev_ino, cpuid);
 	if (err != HV_EOK)
 		printk(KERN_ERR "sun4v_vintr_set_target(%lx,%lx,%lu): "
@@ -403,14 +532,13 @@ static void sun4v_virq_enable(struct irq_data *data)
 static int sun4v_virt_set_affinity(struct irq_data *data,
 				    const struct cpumask *mask, bool force)
 {
-	unsigned long cpuid, dev_handle, dev_ino;
+	unsigned long dev_handle = irq_data_to_handle(data);
+	unsigned long dev_ino = irq_data_to_ino(data);
+	unsigned long cpuid;
 	int err;
 
 	cpuid = irq_choose_cpu(data->irq, mask);
 
-	dev_handle = irq_table[data->irq].dev_handle;
-	dev_ino = irq_table[data->irq].dev_ino;
-
 	err = sun4v_vintr_set_target(dev_handle, dev_ino, cpuid);
 	if (err != HV_EOK)
 		printk(KERN_ERR "sun4v_vintr_set_target(%lx,%lx,%lu): "
@@ -422,11 +550,10 @@ static int sun4v_virt_set_affinity(struct irq_data *data,
 
 static void sun4v_virq_disable(struct irq_data *data)
 {
-	unsigned long dev_handle, dev_ino;
+	unsigned long dev_handle = irq_data_to_handle(data);
+	unsigned long dev_ino = irq_data_to_ino(data);
 	int err;
 
-	dev_handle = irq_table[data->irq].dev_handle;
-	dev_ino = irq_table[data->irq].dev_ino;
 
 	err = sun4v_vintr_set_valid(dev_handle, dev_ino,
 				    HV_INTR_DISABLED);
@@ -438,12 +565,10 @@ static void sun4v_virq_disable(struct irq_data *data)
 
 static void sun4v_virq_eoi(struct irq_data *data)
 {
-	unsigned long dev_handle, dev_ino;
+	unsigned long dev_handle = irq_data_to_handle(data);
+	unsigned long dev_ino = irq_data_to_ino(data);
 	int err;
 
-	dev_handle = irq_table[data->irq].dev_handle;
-	dev_ino = irq_table[data->irq].dev_ino;
-
 	err = sun4v_vintr_set_state(dev_handle, dev_ino,
 				    HV_INTR_STATE_IDLE);
 	if (err != HV_EOK)
@@ -479,31 +604,10 @@ static struct irq_chip sun4v_virq = {
 	.flags			= IRQCHIP_EOI_IF_HANDLED,
 };
 
-static void pre_flow_handler(struct irq_data *d)
-{
-	struct irq_handler_data *handler_data = irq_data_get_irq_handler_data(d);
-	unsigned int ino = irq_table[d->irq].dev_ino;
-
-	handler_data->pre_handler(ino, handler_data->arg1, handler_data->arg2);
-}
-
-void irq_install_pre_handler(int irq,
-			     void (*func)(unsigned int, void *, void *),
-			     void *arg1, void *arg2)
-{
-	struct irq_handler_data *handler_data = irq_get_handler_data(irq);
-
-	handler_data->pre_handler = func;
-	handler_data->arg1 = arg1;
-	handler_data->arg2 = arg2;
-
-	__irq_set_preflow_handler(irq, pre_flow_handler);
-}
-
 unsigned int build_irq(int inofixup, unsigned long iclr, unsigned long imap)
 {
-	struct ino_bucket *bucket;
 	struct irq_handler_data *handler_data;
+	struct ino_bucket *bucket;
 	unsigned int irq;
 	int ino;
 
@@ -537,119 +641,166 @@ out:
 	return irq;
 }
 
-static unsigned int sun4v_build_common(unsigned long sysino,
-				       struct irq_chip *chip)
+static unsigned int sun4v_build_common(u32 devhandle, unsigned int devino,
+		void (*handler_data_init)(struct irq_handler_data *data,
+		u32 devhandle, unsigned int devino),
+		struct irq_chip *chip)
 {
-	struct ino_bucket *bucket;
-	struct irq_handler_data *handler_data;
+	struct irq_handler_data *data;
 	unsigned int irq;
 
-	BUG_ON(tlb_type != hypervisor);
+	irq = irq_alloc(devhandle, devino);
+	if (!irq)
+		goto out;
 
-	bucket = &ivector_table[sysino];
-	irq = bucket_get_irq(__pa(bucket));
-	if (!irq) {
-		irq = irq_alloc(0, sysino);
-		bucket_set_irq(__pa(bucket), irq);
-		irq_set_chip_and_handler_name(irq, chip, handle_fasteoi_irq,
-					      "IVEC");
+	data = kzalloc(sizeof(struct irq_handler_data), GFP_ATOMIC);
+	if (unlikely(!data)) {
+		pr_err("IRQ handler data allocation failed.\n");
+		irq_free(irq);
+		irq = 0;
+		goto out;
 	}
 
-	handler_data = irq_get_handler_data(irq);
-	if (unlikely(handler_data))
-		goto out;
+	irq_set_handler_data(irq, data);
+	handler_data_init(data, devhandle, devino);
+	irq_set_chip_and_handler_name(irq, chip, handle_fasteoi_irq, "IVEC");
+	data->imap = ~0UL;
+	data->iclr = ~0UL;
+out:
+	return irq;
+}
 
-	handler_data = kzalloc(sizeof(struct irq_handler_data), GFP_ATOMIC);
-	if (unlikely(!handler_data)) {
-		prom_printf("IRQ: kzalloc(irq_handler_data) failed.\n");
-		prom_halt();
-	}
-	irq_set_handler_data(irq, handler_data);
+static unsigned long cookie_assign(unsigned int irq, u32 devhandle,
+		unsigned int devino)
+{
+	struct irq_handler_data *ihd = irq_get_handler_data(irq);
+	unsigned long hv_error, cookie;
 
-	/* Catch accidental accesses to these things.  IMAP/ICLR handling
-	 * is done by hypervisor calls on sun4v platforms, not by direct
-	 * register accesses.
+	/* handler_irq needs to find the irq. cookie is seen signed in
+	 * sun4v_dev_mondo and treated as a non ivector_table delivery.
 	 */
-	handler_data->imap = ~0UL;
-	handler_data->iclr = ~0UL;
+	ihd->bucket.__irq = irq;
+	cookie = ~__pa(&ihd->bucket);
 
-out:
-	return irq;
+	hv_error = sun4v_vintr_set_cookie(devhandle, devino, cookie);
+	if (hv_error)
+		pr_err("HV vintr set cookie failed = %ld\n", hv_error);
+
+	return hv_error;
 }
 
-unsigned int sun4v_build_irq(u32 devhandle, unsigned int devino)
+static void cookie_handler_data(struct irq_handler_data *data,
+				u32 devhandle, unsigned int devino)
 {
-	unsigned long sysino = sun4v_devino_to_sysino(devhandle, devino);
+	data->dev_handle = devhandle;
+	data->dev_ino = devino;
+}
 
-	return sun4v_build_common(sysino, &sun4v_irq);
+static unsigned int cookie_build_irq(u32 devhandle, unsigned int devino,
+				     struct irq_chip *chip)
+{
+	unsigned long hv_error;
+	unsigned int irq;
+
+	irq = sun4v_build_common(devhandle, devino, cookie_handler_data, chip);
+
+	hv_error = cookie_assign(irq, devhandle, devino);
+	if (hv_error) {
+		irq_free(irq);
+		irq = 0;
+	}
+
+	return irq;
 }
 
-unsigned int sun4v_build_virq(u32 devhandle, unsigned int devino)
+static unsigned int sun4v_build_cookie(u32 devhandle, unsigned int devino)
 {
-	struct irq_handler_data *handler_data;
-	unsigned long hv_err, cookie;
-	struct ino_bucket *bucket;
 	unsigned int irq;
 
-	bucket = kzalloc(sizeof(struct ino_bucket), GFP_ATOMIC);
-	if (unlikely(!bucket))
-		return 0;
+	irq = cookie_exists(devhandle, devino);
+	if (irq)
+		goto out;
 
-	/* The only reference we store to the IRQ bucket is
-	 * by physical address which kmemleak can't see, tell
-	 * it that this object explicitly is not a leak and
-	 * should be scanned.
-	 */
-	kmemleak_not_leak(bucket);
+	irq = cookie_build_irq(devhandle, devino, &sun4v_virq);
 
-	__flush_dcache_range((unsigned long) bucket,
-			     ((unsigned long) bucket +
-			      sizeof(struct ino_bucket)));
+out:
+	return irq;
+}
 
-	irq = irq_alloc(devhandle, devino);
+static void sysino_set_bucket(unsigned int irq)
+{
+	struct irq_handler_data *ihd = irq_get_handler_data(irq);
+	struct ino_bucket *bucket;
+	unsigned long sysino;
+
+	sysino = sun4v_devino_to_sysino(ihd->dev_handle, ihd->dev_ino);
+	BUG_ON(sysino >= nr_ivec);
+	bucket = &ivector_table[sysino];
 	bucket_set_irq(__pa(bucket), irq);
+}
 
-	irq_set_chip_and_handler_name(irq, &sun4v_virq, handle_fasteoi_irq,
-				      "IVEC");
+static void sysino_handler_data(struct irq_handler_data *data,
+				u32 devhandle, unsigned int devino)
+{
+	unsigned long sysino;
 
-	handler_data = kzalloc(sizeof(struct irq_handler_data), GFP_ATOMIC);
-	if (unlikely(!handler_data))
-		return 0;
+	sysino = sun4v_devino_to_sysino(devhandle, devino);
+	data->sysino = sysino;
+}
 
-	/* In order to make the LDC channel startup sequence easier,
-	 * especially wrt. locking, we do not let request_irq() enable
-	 * the interrupt.
-	 */
-	irq_set_status_flags(irq, IRQ_NOAUTOEN);
-	irq_set_handler_data(irq, handler_data);
+static unsigned int sysino_build_irq(u32 devhandle, unsigned int devino,
+				     struct irq_chip *chip)
+{
+	unsigned int irq;
 
-	/* Catch accidental accesses to these things.  IMAP/ICLR handling
-	 * is done by hypervisor calls on sun4v platforms, not by direct
-	 * register accesses.
-	 */
-	handler_data->imap = ~0UL;
-	handler_data->iclr = ~0UL;
+	irq = sun4v_build_common(devhandle, devino, sysino_handler_data, chip);
+	if (!irq)
+		goto out;
 
-	cookie = ~__pa(bucket);
-	hv_err = sun4v_vintr_set_cookie(devhandle, devino, cookie);
-	if (hv_err) {
-		prom_printf("IRQ: Fatal, cannot set cookie for [%x:%x] "
-			    "err=%lu\n", devhandle, devino, hv_err);
-		prom_halt();
-	}
+	sysino_set_bucket(irq);
+out:
+	return irq;
+}
 
+static int sun4v_build_sysino(u32 devhandle, unsigned int devino)
+{
+	int irq;
+
+	irq = sysino_exists(devhandle, devino);
+	if (irq)
+		goto out;
+
+	irq = sysino_build_irq(devhandle, devino, &sun4v_irq);
+out:
 	return irq;
 }
 
-void ack_bad_irq(unsigned int irq)
+unsigned int sun4v_build_irq(u32 devhandle, unsigned int devino)
 {
-	unsigned int ino = irq_table[irq].dev_ino;
+	unsigned int irq;
 
-	if (!ino)
-		ino = 0xdeadbeef;
+	if (sun4v_cookie_only_virqs())
+		irq = sun4v_build_cookie(devhandle, devino);
+	else
+		irq = sun4v_build_sysino(devhandle, devino);
 
-	printk(KERN_CRIT "Unexpected IRQ from ino[%x] irq[%u]\n",
-	       ino, irq);
+	return irq;
+}
+
+unsigned int sun4v_build_virq(u32 devhandle, unsigned int devino)
+{
+	int irq;
+
+	irq = cookie_build_irq(devhandle, devino, &sun4v_virq);
+	if (!irq)
+		goto out;
+
+	/* This is borrowed from the original function.
+	 */
+	irq_set_status_flags(irq, IRQ_NOAUTOEN);
+
+out:
+	return irq;
 }
 
 void *hardirq_stack[NR_CPUS];
@@ -720,9 +871,12 @@ void fixup_irqs(void)
 
 	for (irq = 0; irq < NR_IRQS; irq++) {
 		struct irq_desc *desc = irq_to_desc(irq);
-		struct irq_data *data = irq_desc_get_irq_data(desc);
+		struct irq_data *data;
 		unsigned long flags;
 
+		if (!desc)
+			continue;
+		data = irq_desc_get_irq_data(desc);
 		raw_spin_lock_irqsave(&desc->lock, flags);
 		if (desc->action && !irqd_is_per_cpu(data)) {
 			if (data->chip->irq_set_affinity)
@@ -922,16 +1076,22 @@ static struct irqaction timer_irq_action = {
 	.name = "timer",
 };
 
-/* Only invoked on boot processor. */
-void __init init_IRQ(void)
+static void __init irq_ivector_init(void)
 {
-	unsigned long size;
+	unsigned long size, order;
+	unsigned int ivecs;
 
-	map_prom_timers();
-	kill_prom_timer();
+	/* If we are doing cookie only VIRQs then we do not need the ivector
+	 * table to process interrupts.
+	 */
+	if (sun4v_cookie_only_virqs())
+		return;
 
-	size = sizeof(struct ino_bucket) * NUM_IVECS;
-	ivector_table = kzalloc(size, GFP_KERNEL);
+	ivecs = size_nr_ivec();
+	size = sizeof(struct ino_bucket) * ivecs;
+	order = get_order(size);
+	ivector_table = (struct ino_bucket *)
+		__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
 	if (!ivector_table) {
 		prom_printf("Fatal error, cannot allocate ivector_table\n");
 		prom_halt();
@@ -940,6 +1100,15 @@ void __init init_IRQ(void)
 			     ((unsigned long) ivector_table) + size);
 
 	ivector_table_pa = __pa(ivector_table);
+}
+
+/* Only invoked on boot processor.*/
+void __init init_IRQ(void)
+{
+	irq_init_hv();
+	irq_ivector_init();
+	map_prom_timers();
+	kill_prom_timer();
 
 	if (tlb_type == hypervisor)
 		sun4v_init_mondo_queues();
-- 
cgit v0.10.2


From d195b71bad4347d2df51072a537f922546a904f1 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 27 Sep 2014 21:30:57 -0700
Subject: sparc64: Kill unnecessary tables and increase MAX_BANKS.

swapper_low_pmd_dir and swapper_pud_dir are actually completely
useless and unnecessary.

We just need swapper_pg_dir[].  Naturally the other page table chunks
will be allocated on an as-needed basis.  Since the kernel actually
accesses these tables in the PAGE_OFFSET view, there is not even a TLB
locality advantage of placing them in the kernel image.

Use the hard coded vmlinux.ld.S slot for swapper_pg_dir which is
naturally page aligned.

Increase MAX_BANKS to 1024 in order to handle heavily fragmented
virtual guests.

Even with this MAX_BANKS increase, the kernel is 20K+ smaller.

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Bob Picco <bob.picco@oracle.com>

diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index c093922..bfeb626 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -927,7 +927,6 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
 #endif
 
 extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
-extern pmd_t swapper_low_pmd_dir[PTRS_PER_PMD];
 
 void paging_init(void);
 unsigned long find_ecache_flush_span(unsigned long size);
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index 0bacceb..0924305 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -35,8 +35,9 @@ jiffies = jiffies_64;
 
 SECTIONS
 {
-	/* swapper_low_pmd_dir is sparc64 only */
-	swapper_low_pmd_dir = 0x0000000000402000;
+#ifdef CONFIG_SPARC64
+	swapper_pg_dir = 0x0000000000402000;
+#endif
 	. = INITIAL_ADDRESS;
 	.text TEXTSTART :
 	{
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 0ead74b..2d91c62 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -87,7 +87,7 @@ extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES];
 
 static unsigned long cpu_pgsz_mask;
 
-#define MAX_BANKS	32
+#define MAX_BANKS	1024
 
 static struct linux_prom64_registers pavail[MAX_BANKS];
 static int pavail_ents;
@@ -1943,12 +1943,6 @@ static void __init sun4v_linear_pte_xor_finalize(void)
 
 static unsigned long last_valid_pfn;
 
-/* These must be page aligned in order to not trigger the
- * alignment tests of pgd_bad() and pud_bad().
- */
-pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__ ((aligned (PAGE_SIZE)));
-static pud_t swapper_pud_dir[PTRS_PER_PUD] __attribute__ ((aligned (PAGE_SIZE)));
-
 static void sun4u_pgprot_init(void);
 static void sun4v_pgprot_init(void);
 
@@ -2002,8 +1996,6 @@ void __init paging_init(void)
 {
 	unsigned long end_pfn, shift, phys_base;
 	unsigned long real_end, i;
-	pud_t *pud;
-	pmd_t *pmd;
 	int node;
 
 	setup_page_offset();
@@ -2099,20 +2091,7 @@ void __init paging_init(void)
 	 */
 	init_mm.pgd += ((shift) / (sizeof(pgd_t)));
 	
-	memset(swapper_low_pmd_dir, 0, sizeof(swapper_low_pmd_dir));
-
-	/* The kernel page tables we publish into what the rest of the
-	 * world sees must be adjusted so that they see the PAGE_OFFSET
-	 * address of these in-kerenel data structures.  However right
-	 * here we must access them from the kernel image side, because
-	 * the trap tables haven't been taken over and therefore we cannot
-	 * take TLB misses in the PAGE_OFFSET linear mappings yet.
-	 */
-	pud = swapper_pud_dir + (shift / sizeof(pud_t));
-	pgd_set(&swapper_pg_dir[0], pud);
-
-	pmd = swapper_low_pmd_dir + (shift / sizeof(pmd_t));
-	pud_set(&swapper_pud_dir[0], pmd);
+	memset(swapper_pg_dir, 0, sizeof(swapper_pg_dir));
 
 	inherit_prom_mappings();
 	
-- 
cgit v0.10.2


From 1cef94c36bd4d79b5ae3a3df99ee0d76d6a4a6dc Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <dave.kleikamp@oracle.com>
Date: Tue, 7 Oct 2014 08:12:37 -0500
Subject: sparc64: Increase size of boot string to 1024 bytes

This is the longest boot string that silo supports.

Signed-off-by: Dave Kleikamp <dave.kleikamp@oracle.com>
Cc: Bob Picco <bob.picco@oracle.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: sparclinux@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/prom/bootstr_64.c b/arch/sparc/prom/bootstr_64.c
index ab9ccc6..7149e77 100644
--- a/arch/sparc/prom/bootstr_64.c
+++ b/arch/sparc/prom/bootstr_64.c
@@ -14,7 +14,10 @@
  *          the .bss section or it will break things.
  */
 
-#define BARG_LEN  256
+/* We limit BARG_LEN to 1024 because this is the size of the
+ * 'barg_out' command line buffer in the SILO bootloader.
+ */
+#define BARG_LEN 1024
 struct {
 	int bootstr_len;
 	int bootstr_valid;
-- 
cgit v0.10.2


From bdcf81b658ebc4c2640c3c2c55c8b31c601b6996 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 10 Oct 2014 15:49:16 -0400
Subject: sparc64: Fix lockdep warnings on reboot on Ultra-5

Inconsistently, the raw_* IRQ routines do not interact with and update
the irqflags tracing and lockdep state, whereas the raw_* spinlock
interfaces do.

This causes problems in p1275_cmd_direct() because we disable hardirqs
by hand using raw_local_irq_restore() and then do a raw_spin_lock()
which triggers a lockdep trace because the CPU's hw IRQ state doesn't
match IRQ tracing's internal software copy of that state.

The CPU's irqs are disabled, yet current->hardirqs_enabled is true.

====================
reboot: Restarting system
------------[ cut here ]------------
WARNING: CPU: 0 PID: 1 at kernel/locking/lockdep.c:3536 check_flags+0x7c/0x240()
DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)
Modules linked in: openpromfs
CPU: 0 PID: 1 Comm: systemd-shutdow Tainted: G        W      3.17.0-dirty #145
Call Trace:
 [000000000045919c] warn_slowpath_common+0x5c/0xa0
 [0000000000459210] warn_slowpath_fmt+0x30/0x40
 [000000000048f41c] check_flags+0x7c/0x240
 [0000000000493280] lock_acquire+0x20/0x1c0
 [0000000000832b70] _raw_spin_lock+0x30/0x60
 [000000000068f2fc] p1275_cmd_direct+0x1c/0x60
 [000000000068ed28] prom_reboot+0x28/0x40
 [000000000043610c] machine_restart+0x4c/0x80
 [000000000047d2d4] kernel_restart+0x54/0x80
 [000000000047d618] SyS_reboot+0x138/0x200
 [00000000004060b4] linux_sparc_syscall32+0x34/0x60
---[ end trace 5c439fe81c05a100 ]---
possible reason: unannotated irqs-off.
irq event stamp: 2010267
hardirqs last  enabled at (2010267): [<000000000049a358>] vprintk_emit+0x4b8/0x580
hardirqs last disabled at (2010266): [<0000000000499f08>] vprintk_emit+0x68/0x580
softirqs last  enabled at (2010046): [<000000000045d278>] __do_softirq+0x378/0x4a0
softirqs last disabled at (2010039): [<000000000042bf08>] do_softirq_own_stack+0x28/0x40
Resetting ...
====================

Use local_* variables of the hw IRQ interfaces so that IRQ tracing sees
all of our changes.

Reported-by: Meelis Roos <mroos@linux.ee>
Tested-by: Meelis Roos <mroos@linux.ee>
Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/prom/p1275.c b/arch/sparc/prom/p1275.c
index e58b817..b2340f0 100644
--- a/arch/sparc/prom/p1275.c
+++ b/arch/sparc/prom/p1275.c
@@ -9,6 +9,7 @@
 #include <linux/smp.h>
 #include <linux/string.h>
 #include <linux/spinlock.h>
+#include <linux/irqflags.h>
 
 #include <asm/openprom.h>
 #include <asm/oplib.h>
@@ -36,8 +37,8 @@ void p1275_cmd_direct(unsigned long *args)
 {
 	unsigned long flags;
 
-	raw_local_save_flags(flags);
-	raw_local_irq_restore((unsigned long)PIL_NMI);
+	local_save_flags(flags);
+	local_irq_restore((unsigned long)PIL_NMI);
 	raw_spin_lock(&prom_entry_lock);
 
 	prom_world(1);
@@ -45,7 +46,7 @@ void p1275_cmd_direct(unsigned long *args)
 	prom_world(0);
 
 	raw_spin_unlock(&prom_entry_lock);
-	raw_local_irq_restore(flags);
+	local_irq_restore(flags);
 }
 
 void prom_cif_init(void *cif_handler, void *cif_stack)
-- 
cgit v0.10.2