From 1a18a66446f3f289b05b634f18012424d82aa63a Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Fri, 17 Jan 2014 12:25:28 +0800
Subject: powerpc: Set the correct ksp_limit on ppc32 when switching to irq
 stack

Guenter Roeck has got the following call trace on a p2020 board:
  Kernel stack overflow in process eb3e5a00, r1=eb79df90
  CPU: 0 PID: 2838 Comm: ssh Not tainted 3.13.0-rc8-juniper-00146-g19eca00 #4
  task: eb3e5a00 ti: c0616000 task.ti: ef440000
  NIP: c003a420 LR: c003a410 CTR: c0017518
  REGS: eb79dee0 TRAP: 0901   Not tainted (3.13.0-rc8-juniper-00146-g19eca00)
  MSR: 00029000 <CE,EE,ME>  CR: 24008444  XER: 00000000
  GPR00: c003a410 eb79df90 eb3e5a00 00000000 eb05d900 00000001 65d87646 00000000
  GPR08: 00000000 020b8000 00000000 00000000 44008442
  NIP [c003a420] __do_softirq+0x94/0x1ec
  LR [c003a410] __do_softirq+0x84/0x1ec
  Call Trace:
  [eb79df90] [c003a410] __do_softirq+0x84/0x1ec (unreliable)
  [eb79dfe0] [c003a970] irq_exit+0xbc/0xc8
  [eb79dff0] [c000cc1c] call_do_irq+0x24/0x3c
  [ef441f20] [c00046a8] do_IRQ+0x8c/0xf8
  [ef441f40] [c000e7f4] ret_from_except+0x0/0x18
  --- Exception: 501 at 0xfcda524
      LR = 0x10024900
  Instruction dump:
  7c781b78 3b40000a 3a73b040 543c0024 3a800000 3b3913a0 7ef5bb78 48201bf9
  5463103a 7d3b182e 7e89b92e 7c008146 <3ba00000> 7e7e9b78 48000014 57fff87f
  Kernel panic - not syncing: kernel stack overflow
  CPU: 0 PID: 2838 Comm: ssh Not tainted 3.13.0-rc8-juniper-00146-g19eca00 #4
  Call Trace:

The reason is that we have used the wrong register to calculate the
ksp_limit in commit cbc9565ee826 (powerpc: Remove ksp_limit on ppc64).
Just fix it.

As suggested by Benjamin Herrenschmidt, also add the C prototype of the
function in the comment in order to avoid such kind of errors in the
future.

Cc: stable@vger.kernel.org # 3.12
Reported-by: Guenter Roeck <linux@roeck-us.net>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Kevin Hao <haokexin@gmail.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index 879f096..7c6bb4b 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -57,11 +57,14 @@ _GLOBAL(call_do_softirq)
 	mtlr	r0
 	blr
 
+/*
+ * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
+ */
 _GLOBAL(call_do_irq)
 	mflr	r0
 	stw	r0,4(r1)
 	lwz	r10,THREAD+KSP_LIMIT(r2)
-	addi	r11,r3,THREAD_INFO_GAP
+	addi	r11,r4,THREAD_INFO_GAP
 	stwu	r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4)
 	mr	r1,r4
 	stw	r10,8(r1)
-- 
cgit v0.10.2


From b020cc6c03a37c3526fcb1dff274f649257949e0 Mon Sep 17 00:00:00 2001
From: Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com>
Date: Fri, 17 Jan 2014 11:56:51 -0200
Subject: powerpc/pseries: Fix regression on PCI link speed

Commit 5091f0c (powerpc/pseries: Fix PCIE link speed endian issue)
introduced a regression on the PCI link speed detection using the
device-tree property. The ibm,pcie-link-speed-stats property is composed
of two 32-bit integers, the first one being the maxinum link speed and
the second the current link speed. The changes introduced by the
aforementioned commit are considering just the first integer.

Fix this issue by changing how the property is accessed, using the
helper functions to properly access the array of values. The explicit
byte swapping is not needed anymore here, since it's done by the helper
functions.

Signed-off-by: Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c
index 70670a2..a6f7a14 100644
--- a/arch/powerpc/platforms/pseries/pci.c
+++ b/arch/powerpc/platforms/pseries/pci.c
@@ -113,7 +113,8 @@ int pseries_root_bridge_prepare(struct pci_host_bridge *bridge)
 {
 	struct device_node *dn, *pdn;
 	struct pci_bus *bus;
-	const __be32 *pcie_link_speed_stats;
+	u32 pcie_link_speed_stats[2];
+	int rc;
 
 	bus = bridge->bus;
 
@@ -122,20 +123,21 @@ int pseries_root_bridge_prepare(struct pci_host_bridge *bridge)
 		return 0;
 
 	for (pdn = dn; pdn != NULL; pdn = of_get_next_parent(pdn)) {
-		pcie_link_speed_stats = of_get_property(pdn,
-			"ibm,pcie-link-speed-stats", NULL);
-		if (pcie_link_speed_stats)
+		rc = of_property_read_u32_array(pdn,
+				"ibm,pcie-link-speed-stats",
+				&pcie_link_speed_stats[0], 2);
+		if (!rc)
 			break;
 	}
 
 	of_node_put(pdn);
 
-	if (!pcie_link_speed_stats) {
+	if (rc) {
 		pr_err("no ibm,pcie-link-speed-stats property\n");
 		return 0;
 	}
 
-	switch (be32_to_cpup(pcie_link_speed_stats)) {
+	switch (pcie_link_speed_stats[0]) {
 	case 0x01:
 		bus->max_bus_speed = PCIE_SPEED_2_5GT;
 		break;
@@ -147,7 +149,7 @@ int pseries_root_bridge_prepare(struct pci_host_bridge *bridge)
 		break;
 	}
 
-	switch (be32_to_cpup(pcie_link_speed_stats)) {
+	switch (pcie_link_speed_stats[1]) {
 	case 0x01:
 		bus->cur_bus_speed = PCIE_SPEED_2_5GT;
 		break;
-- 
cgit v0.10.2


From 49d9684a54d21930372b7fb0d3d7b5617f621706 Mon Sep 17 00:00:00 2001
From: Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com>
Date: Fri, 17 Jan 2014 11:56:52 -0200
Subject: powerpc/pseries: Add Gen3 definitions for PCIE link speed

Rev3 of the PCI Express Base Specification defines a Supported Link
Speeds Vector where the bit definitions within this field are:

Bit 0 - 2.5 GT/s
Bit 1 - 5.0 GT/s
Bit 2 - 8.0 GT/s

This vector definition is used by the platform firmware to export the
maximum and current link speeds of the PCI bus via the
"ibm,pcie-link-speed-stats" device-tree property.

This patch updates pseries_root_bridge_prepare() to detect Gen3
speed buses (defined by 0x04).

Signed-off-by: Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c
index a6f7a14..c413ec1 100644
--- a/arch/powerpc/platforms/pseries/pci.c
+++ b/arch/powerpc/platforms/pseries/pci.c
@@ -144,6 +144,9 @@ int pseries_root_bridge_prepare(struct pci_host_bridge *bridge)
 	case 0x02:
 		bus->max_bus_speed = PCIE_SPEED_5_0GT;
 		break;
+	case 0x04:
+		bus->max_bus_speed = PCIE_SPEED_8_0GT;
+		break;
 	default:
 		bus->max_bus_speed = PCI_SPEED_UNKNOWN;
 		break;
@@ -156,6 +159,9 @@ int pseries_root_bridge_prepare(struct pci_host_bridge *bridge)
 	case 0x02:
 		bus->cur_bus_speed = PCIE_SPEED_5_0GT;
 		break;
+	case 0x04:
+		bus->cur_bus_speed = PCIE_SPEED_8_0GT;
+		break;
 	default:
 		bus->cur_bus_speed = PCI_SPEED_UNKNOWN;
 		break;
-- 
cgit v0.10.2


From 88247e8d7ba6639f2c199e147ebbc91f7673150c Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 12 Feb 2014 09:13:36 +0530
Subject: powerpc/mm: Add new "set" flag argument to pte/pmd update function

pte_update() is a powerpc-ism used to change the bits of a PTE
when the access permission is being restricted (a flush is
potentially needed).

It uses atomic operations on when needed and handles the hash
synchronization on hash based processors.

It is currently only used to clear PTE bits and so the current
implementation doesn't provide a way to also set PTE bits.

The new _PAGE_NUMA bit, when set, is actually restricting access
so it must use that function too, so this change adds the ability
for pte_update() to also set bits.

We will use this later to set the _PAGE_NUMA bit.

Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index d750336..623f297 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -127,7 +127,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
 					    unsigned long addr, pte_t *ptep)
 {
 #ifdef CONFIG_PPC64
-	return __pte(pte_update(mm, addr, ptep, ~0UL, 1));
+	return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1));
 #else
 	return __pte(pte_update(ptep, ~0UL, 0));
 #endif
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index bc141c9..eb92610 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -195,6 +195,7 @@ extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 static inline unsigned long pte_update(struct mm_struct *mm,
 				       unsigned long addr,
 				       pte_t *ptep, unsigned long clr,
+				       unsigned long set,
 				       int huge)
 {
 #ifdef PTE_ATOMIC_UPDATES
@@ -205,14 +206,15 @@ static inline unsigned long pte_update(struct mm_struct *mm,
 	andi.	%1,%0,%6\n\
 	bne-	1b \n\
 	andc	%1,%0,%4 \n\
+	or	%1,%1,%7\n\
 	stdcx.	%1,0,%3 \n\
 	bne-	1b"
 	: "=&r" (old), "=&r" (tmp), "=m" (*ptep)
-	: "r" (ptep), "r" (clr), "m" (*ptep), "i" (_PAGE_BUSY)
+	: "r" (ptep), "r" (clr), "m" (*ptep), "i" (_PAGE_BUSY), "r" (set)
 	: "cc" );
 #else
 	unsigned long old = pte_val(*ptep);
-	*ptep = __pte(old & ~clr);
+	*ptep = __pte((old & ~clr) | set);
 #endif
 	/* huge pages use the old page table lock */
 	if (!huge)
@@ -231,9 +233,9 @@ static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
 {
 	unsigned long old;
 
-       	if ((pte_val(*ptep) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
+	if ((pte_val(*ptep) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
 		return 0;
-	old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0);
+	old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0);
 	return (old & _PAGE_ACCESSED) != 0;
 }
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
@@ -252,7 +254,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
 	if ((pte_val(*ptep) & _PAGE_RW) == 0)
 		return;
 
-	pte_update(mm, addr, ptep, _PAGE_RW, 0);
+	pte_update(mm, addr, ptep, _PAGE_RW, 0, 0);
 }
 
 static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
@@ -261,7 +263,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
 	if ((pte_val(*ptep) & _PAGE_RW) == 0)
 		return;
 
-	pte_update(mm, addr, ptep, _PAGE_RW, 1);
+	pte_update(mm, addr, ptep, _PAGE_RW, 0, 1);
 }
 
 /*
@@ -284,14 +286,14 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 				       unsigned long addr, pte_t *ptep)
 {
-	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0);
+	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0);
 	return __pte(old);
 }
 
 static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
 			     pte_t * ptep)
 {
-	pte_update(mm, addr, ptep, ~0UL, 0);
+	pte_update(mm, addr, ptep, ~0UL, 0, 0);
 }
 
 
@@ -506,7 +508,9 @@ extern int pmdp_set_access_flags(struct vm_area_struct *vma,
 
 extern unsigned long pmd_hugepage_update(struct mm_struct *mm,
 					 unsigned long addr,
-					 pmd_t *pmdp, unsigned long clr);
+					 pmd_t *pmdp,
+					 unsigned long clr,
+					 unsigned long set);
 
 static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
 					      unsigned long addr, pmd_t *pmdp)
@@ -515,7 +519,7 @@ static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
 
 	if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
 		return 0;
-	old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED);
+	old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0);
 	return ((old & _PAGE_ACCESSED) != 0);
 }
 
@@ -542,7 +546,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
 	if ((pmd_val(*pmdp) & _PAGE_RW) == 0)
 		return;
 
-	pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW);
+	pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW, 0);
 }
 
 #define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 65b7b65..62bf5e8 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -510,7 +510,8 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
 }
 
 unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
-				  pmd_t *pmdp, unsigned long clr)
+				  pmd_t *pmdp, unsigned long clr,
+				  unsigned long set)
 {
 
 	unsigned long old, tmp;
@@ -526,14 +527,15 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
 		andi.	%1,%0,%6\n\
 		bne-	1b \n\
 		andc	%1,%0,%4 \n\
+		or	%1,%1,%7\n\
 		stdcx.	%1,0,%3 \n\
 		bne-	1b"
 	: "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
-	: "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY)
+	: "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set)
 	: "cc" );
 #else
 	old = pmd_val(*pmdp);
-	*pmdp = __pmd(old & ~clr);
+	*pmdp = __pmd((old & ~clr) | set);
 #endif
 	if (old & _PAGE_HASHPTE)
 		hpte_do_hugepage_flush(mm, addr, pmdp);
@@ -708,7 +710,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 		     pmd_t *pmdp)
 {
-	pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT);
+	pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
 }
 
 /*
@@ -835,7 +837,7 @@ pmd_t pmdp_get_and_clear(struct mm_struct *mm,
 	unsigned long old;
 	pgtable_t *pgtable_slot;
 
-	old = pmd_hugepage_update(mm, addr, pmdp, ~0UL);
+	old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
 	old_pmd = __pmd(old);
 	/*
 	 * We have pmd == none and we are holding page_table_lock.
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index a770df2d..6c0b1f5 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -78,7 +78,7 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	arch_enter_lazy_mmu_mode();
 	for (; npages > 0; --npages) {
-		pte_update(mm, addr, pte, 0, 0);
+		pte_update(mm, addr, pte, 0, 0, 0);
 		addr += PAGE_SIZE;
 		++pte;
 	}
-- 
cgit v0.10.2


From 9d85d5863fa4818eb7fa306563bf830c0210c3a6 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 12 Feb 2014 09:13:37 +0530
Subject: mm: Dirty accountable change only apply to non prot numa case

So move it within the if loop

Acked-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/mm/mprotect.c b/mm/mprotect.c
index 7332c17..33eab90 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -58,6 +58,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				if (pte_numa(ptent))
 					ptent = pte_mknonnuma(ptent);
 				ptent = pte_modify(ptent, newprot);
+				/*
+				 * Avoid taking write faults for pages we
+				 * know to be dirty.
+				 */
+				if (dirty_accountable && pte_dirty(ptent))
+					ptent = pte_mkwrite(ptent);
+				ptep_modify_prot_commit(mm, addr, pte, ptent);
 				updated = true;
 			} else {
 				struct page *page;
@@ -72,22 +79,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 					}
 				}
 			}
-
-			/*
-			 * Avoid taking write faults for pages we know to be
-			 * dirty.
-			 */
-			if (dirty_accountable && pte_dirty(ptent)) {
-				ptent = pte_mkwrite(ptent);
-				updated = true;
-			}
-
 			if (updated)
 				pages++;
-
-			/* Only !prot_numa always clears the pte */
-			if (!prot_numa)
-				ptep_modify_prot_commit(mm, addr, pte, ptent);
 		} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
 
-- 
cgit v0.10.2


From 56eecdb912b536a4fa97fb5bfe5a940a54d79be6 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 12 Feb 2014 09:13:38 +0530
Subject: mm: Use ptep/pmdp_set_numa() for updating _PAGE_NUMA bit

Archs like ppc64 doesn't do tlb flush in set_pte/pmd functions when using
a hash table MMU for various reasons (the flush is handled as part of
the PTE modification when necessary).

ppc64 thus doesn't implement flush_tlb_range for hash based MMUs.

Additionally ppc64 require the tlb flushing to be batched within ptl locks.

The reason to do that is to ensure that the hash page table is in sync with
linux page table.

We track the hpte index in linux pte and if we clear them without flushing
hash and drop the ptl lock, we can have another cpu update the pte and can
end up with duplicate entry in the hash table, which is fatal.

We also want to keep set_pte_at simpler by not requiring them to do hash
flush for performance reason. We do that by assuming that set_pte_at() is
never *ever* called on a PTE that is already valid.

This was the case until the NUMA code went in which broke that assumption.

Fix that by introducing a new pair of helpers to set _PAGE_NUMA in a
way similar to ptep/pmdp_set_wrprotect(), with a generic implementation
using set_pte_at() and a powerpc specific one using the appropriate
mechanism needed to keep the hash table in sync.

Acked-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index f83b6f3..3ebb188 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -75,12 +75,34 @@ static inline pte_t pte_mknuma(pte_t pte)
 	return pte;
 }
 
+#define ptep_set_numa ptep_set_numa
+static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
+				 pte_t *ptep)
+{
+	if ((pte_val(*ptep) & _PAGE_PRESENT) == 0)
+		VM_BUG_ON(1);
+
+	pte_update(mm, addr, ptep, _PAGE_PRESENT, _PAGE_NUMA, 0);
+	return;
+}
+
 #define pmd_numa pmd_numa
 static inline int pmd_numa(pmd_t pmd)
 {
 	return pte_numa(pmd_pte(pmd));
 }
 
+#define pmdp_set_numa pmdp_set_numa
+static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
+				 pmd_t *pmdp)
+{
+	if ((pmd_val(*pmdp) & _PAGE_PRESENT) == 0)
+		VM_BUG_ON(1);
+
+	pmd_hugepage_update(mm, addr, pmdp, _PAGE_PRESENT, _PAGE_NUMA);
+	return;
+}
+
 #define pmd_mknonnuma pmd_mknonnuma
 static inline pmd_t pmd_mknonnuma(pmd_t pmd)
 {
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 8e4f41d..34c7bdc 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -701,6 +701,18 @@ static inline pte_t pte_mknuma(pte_t pte)
 }
 #endif
 
+#ifndef ptep_set_numa
+static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
+				 pte_t *ptep)
+{
+	pte_t ptent = *ptep;
+
+	ptent = pte_mknuma(ptent);
+	set_pte_at(mm, addr, ptep, ptent);
+	return;
+}
+#endif
+
 #ifndef pmd_mknuma
 static inline pmd_t pmd_mknuma(pmd_t pmd)
 {
@@ -708,6 +720,18 @@ static inline pmd_t pmd_mknuma(pmd_t pmd)
 	return pmd_clear_flags(pmd, _PAGE_PRESENT);
 }
 #endif
+
+#ifndef pmdp_set_numa
+static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
+				 pmd_t *pmdp)
+{
+	pmd_t pmd = *pmdp;
+
+	pmd = pmd_mknuma(pmd);
+	set_pmd_at(mm, addr, pmdp, pmd);
+	return;
+}
+#endif
 #else
 extern int pte_numa(pte_t pte);
 extern int pmd_numa(pmd_t pmd);
@@ -715,6 +739,8 @@ extern pte_t pte_mknonnuma(pte_t pte);
 extern pmd_t pmd_mknonnuma(pmd_t pmd);
 extern pte_t pte_mknuma(pte_t pte);
 extern pmd_t pmd_mknuma(pmd_t pmd);
+extern void ptep_set_numa(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+extern void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp);
 #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
 #else
 static inline int pmd_numa(pmd_t pmd)
@@ -742,10 +768,23 @@ static inline pte_t pte_mknuma(pte_t pte)
 	return pte;
 }
 
+static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
+				 pte_t *ptep)
+{
+	return;
+}
+
+
 static inline pmd_t pmd_mknuma(pmd_t pmd)
 {
 	return pmd;
 }
+
+static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
+				 pmd_t *pmdp)
+{
+	return ;
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #endif /* CONFIG_MMU */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 82166bf..da23eb9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1545,6 +1545,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 				entry = pmd_mknonnuma(entry);
 			entry = pmd_modify(entry, newprot);
 			ret = HPAGE_PMD_NR;
+			set_pmd_at(mm, addr, pmd, entry);
 			BUG_ON(pmd_write(entry));
 		} else {
 			struct page *page = pmd_page(*pmd);
@@ -1557,16 +1558,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			 */
 			if (!is_huge_zero_page(page) &&
 			    !pmd_numa(*pmd)) {
-				entry = *pmd;
-				entry = pmd_mknuma(entry);
+				pmdp_set_numa(mm, addr, pmd);
 				ret = HPAGE_PMD_NR;
 			}
 		}
-
-		/* Set PMD if cleared earlier */
-		if (ret == HPAGE_PMD_NR)
-			set_pmd_at(mm, addr, pmd, entry);
-
 		spin_unlock(ptl);
 	}
 
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 33eab90..769a67a 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -69,12 +69,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			} else {
 				struct page *page;
 
-				ptent = *pte;
 				page = vm_normal_page(vma, addr, oldpte);
 				if (page && !PageKsm(page)) {
 					if (!pte_numa(oldpte)) {
-						ptent = pte_mknuma(ptent);
-						set_pte_at(mm, addr, pte, ptent);
+						ptep_set_numa(mm, addr, pte);
 						updated = true;
 					}
 				}
-- 
cgit v0.10.2


From a0a4419e302fedb548d56129e02130347810f892 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@au1.ibm.com>
Date: Wed, 12 Feb 2014 17:17:05 +1100
Subject: powerpc: Link VDSOs at 0x0

perf is failing to resolve symbols in the VDSO. A while (1)
gettimeofday() loop shows:

93.99%  [vdso]  [.] 0x00000000000005e0
 3.12%  test    [.] 00000037.plt_call.gettimeofday@@GLIBC_2.18
 2.81%  test    [.] main

The reason for this is that we are linking our VDSO shared libraries
at 1MB, which is a little weird. Even though this is uncommon, Alan
points out that it is valid and we should probably fix perf userspace.

Regardless, I can't see a reason why we are doing this. The code
is all position independent and we never rely on the VDSO ending
up at 1M (and we never place it there on 64bit tasks).

Changing our link address to 0x0 fixes perf VDSO symbol resolution:

73.18%  [vdso]  [.] 0x000000000000060c
12.39%  [vdso]  [.] __kernel_gettimeofday
 3.58%  test    [.] 00000037.plt_call.gettimeofday@@GLIBC_2.18
 2.94%  [vdso]  [.] __kernel_datapage_offset
 2.90%  test    [.] main

We still have some local symbol resolution issues that will be
fixed in a subsequent patch.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/vdso.h b/arch/powerpc/include/asm/vdso.h
index 0d9cecd..c53f5f6 100644
--- a/arch/powerpc/include/asm/vdso.h
+++ b/arch/powerpc/include/asm/vdso.h
@@ -4,11 +4,11 @@
 #ifdef __KERNEL__
 
 /* Default link addresses for the vDSOs */
-#define VDSO32_LBASE	0x100000
-#define VDSO64_LBASE	0x100000
+#define VDSO32_LBASE	0x0
+#define VDSO64_LBASE	0x0
 
 /* Default map addresses for 32bit vDSO */
-#define VDSO32_MBASE	VDSO32_LBASE
+#define VDSO32_MBASE	0x100000
 
 #define VDSO_VERSION_STRING	LINUX_2.6.15
 
-- 
cgit v0.10.2


From 24b659a13866b935eca72748ce725279bd3c4466 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@au1.ibm.com>
Date: Wed, 12 Feb 2014 17:18:50 +1100
Subject: powerpc: Use unstripped VDSO image for more accurate profiling data

We are seeing a lot of hits in the VDSO that are not resolved by perf.
A while(1) gettimeofday() loop shows the issue:

27.64%  [vdso]  [.] 0x000000000000060c
22.57%  [vdso]  [.] 0x0000000000000628
16.88%  [vdso]  [.] 0x0000000000000610
12.39%  [vdso]  [.] __kernel_gettimeofday
 6.09%  [vdso]  [.] 0x00000000000005f8
 3.58%  test    [.] 00000037.plt_call.gettimeofday@@GLIBC_2.18
 2.94%  [vdso]  [.] __kernel_datapage_offset
 2.90%  test    [.] main

We are using a stripped VDSO image which means only symbols with
relocation info can be resolved. There isn't a lot of point to
stripping the VDSO, the debug info is only about 1kB:

4680 arch/powerpc/kernel/vdso64/vdso64.so
5815 arch/powerpc/kernel/vdso64/vdso64.so.dbg

By using the unstripped image, we can resolve all the symbols in the
VDSO and the perf profile data looks much better:

76.53%  [vdso]  [.] __do_get_tspec
12.20%  [vdso]  [.] __kernel_gettimeofday
 5.05%  [vdso]  [.] __get_datapage
 3.20%  test    [.] main
 2.92%  test    [.] 00000037.plt_call.gettimeofday@@GLIBC_2.18

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S
index 79683d0..6ac107a 100644
--- a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S
+++ b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S
@@ -6,7 +6,7 @@
 	.globl vdso32_start, vdso32_end
 	.balign PAGE_SIZE
 vdso32_start:
-	.incbin "arch/powerpc/kernel/vdso32/vdso32.so"
+	.incbin "arch/powerpc/kernel/vdso32/vdso32.so.dbg"
 	.balign PAGE_SIZE
 vdso32_end:
 
diff --git a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S
index 8df9e24..df60fca 100644
--- a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S
+++ b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S
@@ -6,7 +6,7 @@
 	.globl vdso64_start, vdso64_end
 	.balign PAGE_SIZE
 vdso64_start:
-	.incbin "arch/powerpc/kernel/vdso64/vdso64.so"
+	.incbin "arch/powerpc/kernel/vdso64/vdso64.so.dbg"
 	.balign PAGE_SIZE
 vdso64_end:
 
-- 
cgit v0.10.2


From 5b2e198e50f6ba57081586b853163ea1bb95f1a8 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Wed, 12 Feb 2014 15:24:54 +0800
Subject: powerpc/powernv: Rework EEH reset

When doing reset in order to recover the affected PE, we issue
hot reset on PE primary bus if it's not root bus. Otherwise, we
issue hot or fundamental reset on root port or PHB accordingly.
For the later case, we didn't cover the situation where PE only
includes root port and it potentially causes kernel crash upon
EEH error to the PE.

The patch reworks the logic of EEH reset to improve the code
readability and also avoid the kernel crash.

Cc: stable@vger.kernel.org
Reported-by: Thadeu Lima de Souza Cascardo <cascardo@linux.vnet.ibm.com>
Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index e1e7161..fcb79cf 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -489,8 +489,7 @@ static int ioda_eeh_bridge_reset(struct pci_controller *hose,
 static int ioda_eeh_reset(struct eeh_pe *pe, int option)
 {
 	struct pci_controller *hose = pe->phb;
-	struct eeh_dev *edev;
-	struct pci_dev *dev;
+	struct pci_bus *bus;
 	int ret;
 
 	/*
@@ -519,31 +518,11 @@ static int ioda_eeh_reset(struct eeh_pe *pe, int option)
 	if (pe->type & EEH_PE_PHB) {
 		ret = ioda_eeh_phb_reset(hose, option);
 	} else {
-		if (pe->type & EEH_PE_DEVICE) {
-			/*
-			 * If it's device PE, we didn't refer to the parent
-			 * PCI bus yet. So we have to figure it out indirectly.
-			 */
-			edev = list_first_entry(&pe->edevs,
-					struct eeh_dev, list);
-			dev = eeh_dev_to_pci_dev(edev);
-			dev = dev->bus->self;
-		} else {
-			/*
-			 * If it's bus PE, the parent PCI bus is already there
-			 * and just pick it up.
-			 */
-			dev = pe->bus->self;
-		}
-
-		/*
-		 * Do reset based on the fact that the direct upstream bridge
-		 * is root bridge (port) or not.
-		 */
-		if (dev->bus->number == 0)
+		bus = eeh_pe_bus_get(pe);
+		if (pci_is_root_bus(bus))
 			ret = ioda_eeh_root_reset(hose, option);
 		else
-			ret = ioda_eeh_bridge_reset(hose, dev, option);
+			ret = ioda_eeh_bridge_reset(hose, bus->self, option);
 	}
 
 	return ret;
-- 
cgit v0.10.2


From 2ec5a0adf60c23bb6b0a95d3b96a8c1ff1e1aa5a Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Wed, 12 Feb 2014 15:24:55 +0800
Subject: powerpc/eeh: Cleanup on eeh_subsystem_enabled

The patch cleans up variable eeh_subsystem_enabled so that we needn't
refer the variable directly from external. Instead, we will use
function eeh_enabled() and eeh_set_enable() to operate the variable.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 9e39ceb..d4dd41f 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -172,10 +172,20 @@ struct eeh_ops {
 };
 
 extern struct eeh_ops *eeh_ops;
-extern int eeh_subsystem_enabled;
+extern bool eeh_subsystem_enabled;
 extern raw_spinlock_t confirm_error_lock;
 extern int eeh_probe_mode;
 
+static inline bool eeh_enabled(void)
+{
+	return eeh_subsystem_enabled;
+}
+
+static inline void eeh_set_enable(bool mode)
+{
+	eeh_subsystem_enabled = mode;
+}
+
 #define EEH_PROBE_MODE_DEV	(1<<0)	/* From PCI device	*/
 #define EEH_PROBE_MODE_DEVTREE	(1<<1)	/* From device tree	*/
 
@@ -246,7 +256,7 @@ void eeh_remove_device(struct pci_dev *);
  * If this macro yields TRUE, the caller relays to eeh_check_failure()
  * which does further tests out of line.
  */
-#define EEH_POSSIBLE_ERROR(val, type)	((val) == (type)~0 && eeh_subsystem_enabled)
+#define EEH_POSSIBLE_ERROR(val, type)	((val) == (type)~0 && eeh_enabled())
 
 /*
  * Reads from a device which has been isolated by EEH will return
@@ -257,6 +267,13 @@ void eeh_remove_device(struct pci_dev *);
 
 #else /* !CONFIG_EEH */
 
+static inline bool eeh_enabled(void)
+{
+        return false;
+}
+
+static inline void eeh_set_enable(bool mode) { }
+
 static inline int eeh_init(void)
 {
 	return 0;
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 148db72..f22f7b6 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -89,7 +89,7 @@
 /* Platform dependent EEH operations */
 struct eeh_ops *eeh_ops = NULL;
 
-int eeh_subsystem_enabled;
+bool eeh_subsystem_enabled = false;
 EXPORT_SYMBOL(eeh_subsystem_enabled);
 
 /*
@@ -364,7 +364,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 
 	eeh_stats.total_mmio_ffs++;
 
-	if (!eeh_subsystem_enabled)
+	if (!eeh_enabled())
 		return 0;
 
 	if (!edev) {
@@ -822,7 +822,7 @@ int eeh_init(void)
 			return ret;
 	}
 
-	if (eeh_subsystem_enabled)
+	if (eeh_enabled())
 		pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
 	else
 		pr_warning("EEH: No capable adapters found\n");
@@ -897,7 +897,7 @@ void eeh_add_device_late(struct pci_dev *dev)
 	struct device_node *dn;
 	struct eeh_dev *edev;
 
-	if (!dev || !eeh_subsystem_enabled)
+	if (!dev || !eeh_enabled())
 		return;
 
 	pr_debug("EEH: Adding device %s\n", pci_name(dev));
@@ -1005,7 +1005,7 @@ void eeh_remove_device(struct pci_dev *dev)
 {
 	struct eeh_dev *edev;
 
-	if (!dev || !eeh_subsystem_enabled)
+	if (!dev || !eeh_enabled())
 		return;
 	edev = pci_dev_to_eeh_dev(dev);
 
@@ -1045,7 +1045,7 @@ void eeh_remove_device(struct pci_dev *dev)
 
 static int proc_eeh_show(struct seq_file *m, void *v)
 {
-	if (0 == eeh_subsystem_enabled) {
+	if (!eeh_enabled()) {
 		seq_printf(m, "EEH Subsystem is globally disabled\n");
 		seq_printf(m, "eeh_total_mmio_ffs=%llu\n", eeh_stats.total_mmio_ffs);
 	} else {
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index a79fddc..a59788e 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -145,7 +145,7 @@ static int powernv_eeh_dev_probe(struct pci_dev *dev, void *flag)
 	 * Enable EEH explicitly so that we will do EEH check
 	 * while accessing I/O stuff
 	 */
-	eeh_subsystem_enabled = 1;
+	eeh_set_enable(true);
 
 	/* Save memory bars */
 	eeh_save_bars(edev);
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 9ef3cc8..8a8f047 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -265,7 +265,7 @@ static void *pseries_eeh_of_probe(struct device_node *dn, void *flag)
 			enable = 1;
 
 		if (enable) {
-			eeh_subsystem_enabled = 1;
+			eeh_set_enable(true);
 			eeh_add_to_parent_pe(edev);
 
 			pr_debug("%s: EEH enabled on %s PHB#%d-PE#%x, config addr#%x\n",
-- 
cgit v0.10.2


From 66f9af83e56bfa12964d251df9d60fb571579913 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Wed, 12 Feb 2014 15:24:56 +0800
Subject: powerpc/eeh: Disable EEH on reboot

We possiblly detect EEH errors during reboot, particularly in kexec
path, but it's impossible for device drivers and EEH core to handle
or recover them properly.

The patch registers one reboot notifier for EEH and disable EEH
subsystem during reboot. That means the EEH errors is going to be
cleared by hardware reset or second kernel during early stage of
PCI probe.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index f22f7b6..e7b76a6 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -28,6 +28,7 @@
 #include <linux/pci.h>
 #include <linux/proc_fs.h>
 #include <linux/rbtree.h>
+#include <linux/reboot.h>
 #include <linux/seq_file.h>
 #include <linux/spinlock.h>
 #include <linux/export.h>
@@ -747,6 +748,17 @@ int __exit eeh_ops_unregister(const char *name)
 	return -EEXIST;
 }
 
+static int eeh_reboot_notifier(struct notifier_block *nb,
+			       unsigned long action, void *unused)
+{
+	eeh_set_enable(false);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block eeh_reboot_nb = {
+	.notifier_call = eeh_reboot_notifier,
+};
+
 /**
  * eeh_init - EEH initialization
  *
@@ -778,6 +790,14 @@ int eeh_init(void)
 	if (machine_is(powernv) && cnt++ <= 0)
 		return ret;
 
+	/* Register reboot notifier */
+	ret = register_reboot_notifier(&eeh_reboot_nb);
+	if (ret) {
+		pr_warn("%s: Failed to register notifier (%d)\n",
+			__func__, ret);
+		return ret;
+	}
+
 	/* call platform initialization function */
 	if (!eeh_ops) {
 		pr_warning("%s: Platform EEH operation not found\n",
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index fcb79cf..f514743 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -44,7 +44,8 @@ static int ioda_eeh_event(struct notifier_block *nb,
 
 	/* We simply send special EEH event */
 	if ((changed_evts & OPAL_EVENT_PCI_ERROR) &&
-	    (events & OPAL_EVENT_PCI_ERROR))
+	    (events & OPAL_EVENT_PCI_ERROR) &&
+	    eeh_enabled())
 		eeh_send_failure_event(NULL);
 
 	return 0;
-- 
cgit v0.10.2