From 2abbbb63c90ab55ca3f054772c2e5ba7df810c48 Mon Sep 17 00:00:00 2001
From: Gerhard Sittig <gsi@denx.de>
Date: Tue, 14 May 2013 04:40:53 +0000
Subject: powerpc/mpc512x: move common code to shared.c file

- implement all of the init, init early, and setup arch routines in the
  shared source file for the MPC512x PowerPC platform, and make all
  MPC512x based boards (ADS, PDM, generic) use those common routines
- remove declarations from header files for routines which aren't
  referenced from external callers any longer

this modification concentrates knowledge about the optional FSL DIU
support in one spot within the shared code, and makes all boards benefit
transparently from future improvements in the shared platform code

the change does not modify any behaviour but preserves all code paths

Signed-off-by: Gerhard Sittig <gsi@denx.de>
Signed-off-by: Anatolij Gustschin <agust@denx.de>

diff --git a/arch/powerpc/include/asm/mpc5121.h b/arch/powerpc/include/asm/mpc5121.h
index 885c040..8ae133e 100644
--- a/arch/powerpc/include/asm/mpc5121.h
+++ b/arch/powerpc/include/asm/mpc5121.h
@@ -68,6 +68,5 @@ struct mpc512x_lpc {
 };
 
 int mpc512x_cs_config(unsigned int cs, u32 val);
-int __init mpc5121_clk_init(void);
 
 #endif /* __ASM_POWERPC_MPC5121_H__ */
diff --git a/arch/powerpc/platforms/512x/mpc5121_ads.c b/arch/powerpc/platforms/512x/mpc5121_ads.c
index 0a134e0..3e90ece 100644
--- a/arch/powerpc/platforms/512x/mpc5121_ads.c
+++ b/arch/powerpc/platforms/512x/mpc5121_ads.c
@@ -43,9 +43,7 @@ static void __init mpc5121_ads_setup_arch(void)
 		mpc83xx_add_bridge(np);
 #endif
 
-#if defined(CONFIG_FB_FSL_DIU) || defined(CONFIG_FB_FSL_DIU_MODULE)
-	mpc512x_setup_diu();
-#endif
+	mpc512x_setup_arch();
 }
 
 static void __init mpc5121_ads_init_IRQ(void)
@@ -69,7 +67,7 @@ define_machine(mpc5121_ads) {
 	.probe			= mpc5121_ads_probe,
 	.setup_arch		= mpc5121_ads_setup_arch,
 	.init			= mpc512x_init,
-	.init_early		= mpc512x_init_diu,
+	.init_early		= mpc512x_init_early,
 	.init_IRQ		= mpc5121_ads_init_IRQ,
 	.get_irq		= ipic_get_irq,
 	.calibrate_decr		= generic_calibrate_decr,
diff --git a/arch/powerpc/platforms/512x/mpc512x.h b/arch/powerpc/platforms/512x/mpc512x.h
index 0a8e600..fdb4303 100644
--- a/arch/powerpc/platforms/512x/mpc512x.h
+++ b/arch/powerpc/platforms/512x/mpc512x.h
@@ -12,18 +12,11 @@
 #ifndef __MPC512X_H__
 #define __MPC512X_H__
 extern void __init mpc512x_init_IRQ(void);
+extern void __init mpc512x_init_early(void);
 extern void __init mpc512x_init(void);
+extern void __init mpc512x_setup_arch(void);
 extern int __init mpc5121_clk_init(void);
-void __init mpc512x_declare_of_platform_devices(void);
 extern const char *mpc512x_select_psc_compat(void);
 extern void mpc512x_restart(char *cmd);
 
-#if defined(CONFIG_FB_FSL_DIU) || defined(CONFIG_FB_FSL_DIU_MODULE)
-void mpc512x_init_diu(void);
-void mpc512x_setup_diu(void);
-#else
-#define mpc512x_init_diu NULL
-#define mpc512x_setup_diu NULL
-#endif
-
 #endif				/* __MPC512X_H__ */
diff --git a/arch/powerpc/platforms/512x/mpc512x_generic.c b/arch/powerpc/platforms/512x/mpc512x_generic.c
index 5fb919b..ce71408 100644
--- a/arch/powerpc/platforms/512x/mpc512x_generic.c
+++ b/arch/powerpc/platforms/512x/mpc512x_generic.c
@@ -45,8 +45,8 @@ define_machine(mpc512x_generic) {
 	.name			= "MPC512x generic",
 	.probe			= mpc512x_generic_probe,
 	.init			= mpc512x_init,
-	.init_early		= mpc512x_init_diu,
-	.setup_arch		= mpc512x_setup_diu,
+	.init_early		= mpc512x_init_early,
+	.setup_arch		= mpc512x_setup_arch,
 	.init_IRQ		= mpc512x_init_IRQ,
 	.get_irq		= ipic_get_irq,
 	.calibrate_decr		= generic_calibrate_decr,
diff --git a/arch/powerpc/platforms/512x/mpc512x_shared.c b/arch/powerpc/platforms/512x/mpc512x_shared.c
index 6eb94ab..09622d3 100644
--- a/arch/powerpc/platforms/512x/mpc512x_shared.c
+++ b/arch/powerpc/platforms/512x/mpc512x_shared.c
@@ -58,7 +58,7 @@ void mpc512x_restart(char *cmd)
 		;
 }
 
-#if defined(CONFIG_FB_FSL_DIU) || defined(CONFIG_FB_FSL_DIU_MODULE)
+#if IS_ENABLED(CONFIG_FB_FSL_DIU)
 
 struct fsl_diu_shared_fb {
 	u8		gamma[0x300];	/* 32-bit aligned! */
@@ -436,6 +436,12 @@ void __init mpc512x_psc_fifo_init(void)
 	}
 }
 
+void __init mpc512x_init_early(void)
+{
+	if (IS_ENABLED(CONFIG_FB_FSL_DIU))
+		mpc512x_init_diu();
+}
+
 void __init mpc512x_init(void)
 {
 	mpc5121_clk_init();
@@ -444,6 +450,12 @@ void __init mpc512x_init(void)
 	mpc512x_psc_fifo_init();
 }
 
+void __init mpc512x_setup_arch(void)
+{
+	if (IS_ENABLED(CONFIG_FB_FSL_DIU))
+		mpc512x_setup_diu();
+}
+
 /**
  * mpc512x_cs_config - Setup chip select configuration
  * @cs: chip select number
diff --git a/arch/powerpc/platforms/512x/pdm360ng.c b/arch/powerpc/platforms/512x/pdm360ng.c
index 0575e85..24b314d 100644
--- a/arch/powerpc/platforms/512x/pdm360ng.c
+++ b/arch/powerpc/platforms/512x/pdm360ng.c
@@ -119,9 +119,9 @@ static int __init pdm360ng_probe(void)
 define_machine(pdm360ng) {
 	.name			= "PDM360NG",
 	.probe			= pdm360ng_probe,
-	.setup_arch		= mpc512x_setup_diu,
+	.setup_arch		= mpc512x_setup_arch,
 	.init			= pdm360ng_init,
-	.init_early		= mpc512x_init_diu,
+	.init_early		= mpc512x_init_early,
 	.init_IRQ		= mpc512x_init_IRQ,
 	.get_irq		= ipic_get_irq,
 	.calibrate_decr		= generic_calibrate_decr,
-- 
cgit v0.10.2


From a4f4124cf308275b4a2219d1e332dfc01d8bd6b7 Mon Sep 17 00:00:00 2001
From: Gerhard Sittig <gsi@denx.de>
Date: Tue, 14 May 2013 04:40:54 +0000
Subject: powerpc/mpc512x: initialize board restart earlier

move the MPC512x restart initialization from the shared init routine
to the shared init_early routine

recent problems in the proc(5) filesystem initialization led to the
situation where the platform's restart routine was invoked yet the
registers required for software reset were not yet available, which
made the board hang instead of reboot

Signed-off-by: Gerhard Sittig <gsi@denx.de>
Signed-off-by: Anatolij Gustschin <agust@denx.de>

diff --git a/arch/powerpc/platforms/512x/mpc512x_shared.c b/arch/powerpc/platforms/512x/mpc512x_shared.c
index 09622d3..a8b5110 100644
--- a/arch/powerpc/platforms/512x/mpc512x_shared.c
+++ b/arch/powerpc/platforms/512x/mpc512x_shared.c
@@ -438,6 +438,7 @@ void __init mpc512x_psc_fifo_init(void)
 
 void __init mpc512x_init_early(void)
 {
+	mpc512x_restart_init();
 	if (IS_ENABLED(CONFIG_FB_FSL_DIU))
 		mpc512x_init_diu();
 }
@@ -446,7 +447,6 @@ void __init mpc512x_init(void)
 {
 	mpc5121_clk_init();
 	mpc512x_declare_of_platform_devices();
-	mpc512x_restart_init();
 	mpc512x_psc_fifo_init();
 }
 
-- 
cgit v0.10.2


From 8663890a9e9278623d20c67aa9fbeeb31ff3be97 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 6 Jun 2013 00:20:34 -0700
Subject: mm/thp: use the correct function when updating access flags

We should use pmdp_set_access_flags to update access flags.  Archs like
powerpc use extra checks(_PAGE_BUSY) when updating a hugepage PTE.  A
set_pmd_at doesn't do those checks.  We should use set_pmd_at only when
updating a none hugepage PTE.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>a
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 362c329..dab90fd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1265,7 +1265,9 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 		 * young bit, instead of the current set_pmd_at.
 		 */
 		_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
-		set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
+		if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
+					  pmd, _pmd,  1))
+			update_mmu_cache_pmd(vma, addr, pmd);
 	}
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		if (page->mapping && trylock_page(page)) {
-- 
cgit v0.10.2


From 6b0b50b0617fad5f2af3b928596a25f7de8dbf50 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 5 Jun 2013 17:14:02 -0700
Subject: mm/THP: add pmd args to pgtable deposit and withdraw APIs

This will be later used by powerpc THP support.  In powerpc we want to use
pgtable for storing the hash index values.  So instead of adding them to
mm_context list, we would like to store them in the second half of pmd

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index e8b6e5b..2080dfe 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1370,10 +1370,11 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
 #define __HAVE_ARCH_PGTABLE_DEPOSIT
-extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable);
+extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+				       pgtable_t pgtable);
 
 #define __HAVE_ARCH_PGTABLE_WITHDRAW
-extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm);
+extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 
 static inline int pmd_trans_splitting(pmd_t pmd)
 {
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index a938b54..1ccbffe 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -1117,7 +1117,8 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
 	}
 }
 
-void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+				pgtable_t pgtable)
 {
 	struct list_head *lh = (struct list_head *) pgtable;
 
@@ -1131,7 +1132,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
 	mm->pmd_huge_pte = pgtable;
 }
 
-pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 {
 	struct list_head *lh;
 	pgtable_t pgtable;
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 7619f2f..d22b92d 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -853,10 +853,11 @@ extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
 				 pmd_t *pmd);
 
 #define __HAVE_ARCH_PGTABLE_DEPOSIT
-extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable);
+extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+				       pgtable_t pgtable);
 
 #define __HAVE_ARCH_PGTABLE_WITHDRAW
-extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm);
+extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 #endif
 
 /* Encode and de-code a swap entry */
diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c
index 83d89bc..f828dd3 100644
--- a/arch/sparc/mm/tlb.c
+++ b/arch/sparc/mm/tlb.c
@@ -188,7 +188,8 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 	}
 }
 
-void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+				pgtable_t pgtable)
 {
 	struct list_head *lh = (struct list_head *) pgtable;
 
@@ -202,7 +203,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
 	mm->pmd_huge_pte = pgtable;
 }
 
-pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 {
 	struct list_head *lh;
 	pgtable_t pgtable;
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index a59ff51..18e27c2 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -173,11 +173,12 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma,
 #endif
 
 #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
-extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable);
+extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+				       pgtable_t pgtable);
 #endif
 
 #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
-extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm);
+extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 #endif
 
 #ifndef __HAVE_ARCH_PMDP_INVALIDATE
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index dab90fd..6b785e1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -730,7 +730,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 		entry = mk_huge_pmd(page, vma);
 		page_add_new_anon_rmap(page, vma, haddr);
 		set_pmd_at(mm, haddr, pmd, entry);
-		pgtable_trans_huge_deposit(mm, pgtable);
+		pgtable_trans_huge_deposit(mm, pmd, pgtable);
 		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
 		mm->nr_ptes++;
 		spin_unlock(&mm->page_table_lock);
@@ -772,7 +772,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 	entry = pmd_wrprotect(entry);
 	entry = pmd_mkhuge(entry);
 	set_pmd_at(mm, haddr, pmd, entry);
-	pgtable_trans_huge_deposit(mm, pgtable);
+	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	mm->nr_ptes++;
 	return true;
 }
@@ -917,7 +917,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	pmdp_set_wrprotect(src_mm, addr, src_pmd);
 	pmd = pmd_mkold(pmd_wrprotect(pmd));
 	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
-	pgtable_trans_huge_deposit(dst_mm, pgtable);
+	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
 	dst_mm->nr_ptes++;
 
 	ret = 0;
@@ -987,7 +987,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
 	pmdp_clear_flush(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
-	pgtable = pgtable_trans_huge_withdraw(mm);
+	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
 
 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1085,7 +1085,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 	pmdp_clear_flush(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
-	pgtable = pgtable_trans_huge_withdraw(mm);
+	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
 
 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1360,7 +1360,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		struct page *page;
 		pgtable_t pgtable;
 		pmd_t orig_pmd;
-		pgtable = pgtable_trans_huge_withdraw(tlb->mm);
+		pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
 		orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
 		if (is_huge_zero_pmd(orig_pmd)) {
@@ -1693,7 +1693,7 @@ static int __split_huge_page_map(struct page *page,
 	pmd = page_check_address_pmd(page, mm, address,
 				     PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
 	if (pmd) {
-		pgtable = pgtable_trans_huge_withdraw(mm);
+		pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 		pmd_populate(mm, &_pmd, pgtable);
 
 		haddr = address;
@@ -2363,7 +2363,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	page_add_new_anon_rmap(new_page, vma, address);
 	set_pmd_at(mm, address, pmd, _pmd);
 	update_mmu_cache_pmd(vma, address, pmd);
-	pgtable_trans_huge_deposit(mm, pgtable);
+	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	spin_unlock(&mm->page_table_lock);
 
 	*hpage = NULL;
@@ -2669,7 +2669,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	pmdp_clear_flush(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
-	pgtable = pgtable_trans_huge_withdraw(mm);
+	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
 
 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 0c8323f..e1a6e4f 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -124,7 +124,8 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
 
 #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+				pgtable_t pgtable)
 {
 	assert_spin_locked(&mm->page_table_lock);
 
@@ -141,7 +142,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
 #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /* no "address" argument so destroys page coloring of some arch */
-pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 {
 	pgtable_t pgtable;
 
-- 
cgit v0.10.2


From a6bf2bb03e5bad7e9289d80ecb5faac11630c7ab Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 5 Jun 2013 17:14:04 -0700
Subject: mm/THP: withdraw the pgtable after pmdp related operations

For architectures like ppc64 we look at deposited pgtable when calling
pmdp_get_and_clear.  So do the pgtable_trans_huge_withdraw after finishing
pmdp related operations.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6b785e1..5c4fac2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1360,9 +1360,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		struct page *page;
 		pgtable_t pgtable;
 		pmd_t orig_pmd;
-		pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
+		/*
+		 * For architectures like ppc64 we look at deposited pgtable
+		 * when calling pmdp_get_and_clear. So do the
+		 * pgtable_trans_huge_withdraw after finishing pmdp related
+		 * operations.
+		 */
 		orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+		pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
 		if (is_huge_zero_pmd(orig_pmd)) {
 			tlb->mm->nr_ptes--;
 			spin_unlock(&tlb->mm->page_table_lock);
-- 
cgit v0.10.2


From fde52796d487b675cde55427e3347ff3e59f9a7f Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 5 Jun 2013 17:14:05 -0700
Subject: mm/THP: don't use HPAGE_SHIFT in transparent hugepage code

For architectures like powerpc that support multiple explicit hugepage
sizes, HPAGE_SHIFT indicate the default explicit hugepage shift.  For THP
to work the hugepage size should be same as PMD_SIZE.  So use PMD_SHIFT
directly.  So move the define outside CONFIG_TRANSPARENT_HUGEPAGE #ifdef
because we want to use these defines in generic code with if
(pmd_trans_huge()) conditional.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 528454c..cc276d2 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -58,12 +58,11 @@ extern pmd_t *page_check_address_pmd(struct page *page,
 
 #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
+#define HPAGE_PMD_SHIFT PMD_SHIFT
+#define HPAGE_PMD_SIZE	((1UL) << HPAGE_PMD_SHIFT)
+#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define HPAGE_PMD_SHIFT HPAGE_SHIFT
-#define HPAGE_PMD_MASK HPAGE_MASK
-#define HPAGE_PMD_SIZE HPAGE_SIZE
-
 extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
 
 #define transparent_hugepage_enabled(__vma)				\
@@ -181,9 +180,6 @@ extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vm
 				unsigned long addr, pmd_t pmd, pmd_t *pmdp);
 
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
-#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
-#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
-#define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; })
 
 #define hpage_nr_pages(x) 1
 
-- 
cgit v0.10.2


From fce144b477fb0313f6612d5e3e22b67d7bdf935e Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 5 Jun 2013 17:14:06 -0700
Subject: mm/THP: deposit the transpare huge pgtable before set_pmd

Architectures like powerpc use the deposited pgtable to store hash index
values.  We need to make the deposted pgtable is visible to other cpus
before we are ready to take a hash fault.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5c4fac2..59d9384 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -729,8 +729,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 		pmd_t entry;
 		entry = mk_huge_pmd(page, vma);
 		page_add_new_anon_rmap(page, vma, haddr);
-		set_pmd_at(mm, haddr, pmd, entry);
 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
+		set_pmd_at(mm, haddr, pmd, entry);
 		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
 		mm->nr_ptes++;
 		spin_unlock(&mm->page_table_lock);
@@ -771,8 +771,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 	entry = mk_pmd(zero_page, vma->vm_page_prot);
 	entry = pmd_wrprotect(entry);
 	entry = pmd_mkhuge(entry);
-	set_pmd_at(mm, haddr, pmd, entry);
 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
+	set_pmd_at(mm, haddr, pmd, entry);
 	mm->nr_ptes++;
 	return true;
 }
@@ -916,8 +916,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
 	pmdp_set_wrprotect(src_mm, addr, src_pmd);
 	pmd = pmd_mkold(pmd_wrprotect(pmd));
-	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
 	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
 	dst_mm->nr_ptes++;
 
 	ret = 0;
@@ -2367,9 +2367,9 @@ static void collapse_huge_page(struct mm_struct *mm,
 	spin_lock(&mm->page_table_lock);
 	BUG_ON(!pmd_none(*pmd));
 	page_add_new_anon_rmap(new_page, vma, address);
+	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, address, pmd, _pmd);
 	update_mmu_cache_pmd(vma, address, pmd);
-	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	spin_unlock(&mm->page_table_lock);
 
 	*hpage = NULL;
-- 
cgit v0.10.2


From e80034047bee9ceacfc1bfff873ebfdd049817ca Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Feb 2013 23:38:51 +0100
Subject: powerpc: Mark low level irq handlers NO_THREAD

These low level handlers cannot be threaded. Mark them NO_THREAD

Reported-by: leroy christophe <christophe.leroy@c-s.fr>
Tested-by: leroy christophe <christophe.leroy@c-s.fr>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/8xx/m8xx_setup.c b/arch/powerpc/platforms/8xx/m8xx_setup.c
index 1e12108..806cbbd 100644
--- a/arch/powerpc/platforms/8xx/m8xx_setup.c
+++ b/arch/powerpc/platforms/8xx/m8xx_setup.c
@@ -43,6 +43,7 @@ static irqreturn_t timebase_interrupt(int irq, void *dev)
 
 static struct irqaction tbint_irqaction = {
 	.handler = timebase_interrupt,
+	.flags = IRQF_NO_THREAD,
 	.name = "tbint",
 };
 
diff --git a/arch/powerpc/sysdev/cpm1.c b/arch/powerpc/sysdev/cpm1.c
index d4fa03f..5e6ff38 100644
--- a/arch/powerpc/sysdev/cpm1.c
+++ b/arch/powerpc/sysdev/cpm1.c
@@ -120,6 +120,7 @@ static irqreturn_t cpm_error_interrupt(int irq, void *dev)
 
 static struct irqaction cpm_error_irqaction = {
 	.handler = cpm_error_interrupt,
+	.flags = IRQF_NO_THREAD,
 	.name = "error",
 };
 
-- 
cgit v0.10.2


From 85f395c5b0a26b3a80f9e2d35333981a2a75c0ae Mon Sep 17 00:00:00 2001
From: "Suzuki K. Poulose" <suzuki@in.ibm.com>
Date: Mon, 3 Dec 2012 20:37:42 +0530
Subject: powerpc/kprobes: Do not disable External interrupts during single
 step

External/Decrement exceptions have lower priority than the Debug Exception.
So, we don't have to disable the External interrupts before a single step.
However, on BookE, Critical Input Exception(CE) has higher priority than a
Debug Exception. Hence we mask them.

Signed-off-by: 	Suzuki K. Poulose <suzuki@in.ibm.com>
Cc:		Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc:		Ananth N Mavinakaynahalli <ananth@in.ibm.com>
Cc:		Kumar Gala <galak@kernel.crashing.org>
Cc:		linuxppc-dev@ozlabs.org
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 11f5b03..560f430 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -104,13 +104,13 @@ void __kprobes arch_remove_kprobe(struct kprobe *p)
 
 static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 {
-	/* We turn off async exceptions to ensure that the single step will
-	 * be for the instruction we have the kprobe on, if we dont its
-	 * possible we'd get the single step reported for an exception handler
-	 * like Decrementer or External Interrupt */
-	regs->msr &= ~MSR_EE;
 	regs->msr |= MSR_SINGLESTEP;
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
+	/*
+	 * We turn off Critical Input Exception(CE) to ensure that the single
+	 * step will be for the instruction we have the probe on; if we don't,
+	 * it is possible we'd get the single step reported for CE.
+	 */
 	regs->msr &= ~MSR_CE;
 	mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM);
 #ifdef CONFIG_PPC_47x
-- 
cgit v0.10.2


From 35fd219a268cc82cef842518cd64ea6949629ba2 Mon Sep 17 00:00:00 2001
From: "Suzuki K. Poulose" <suzuki@in.ibm.com>
Date: Mon, 3 Dec 2012 20:38:37 +0530
Subject: powerpc: Move the single step enable code to a generic path

This patch moves the single step enable code used by kprobe to a generic
routine header so that, it can be re-used by other code, in this case,
uprobes. No functional changes.

Signed-off-by: Suzuki K. Poulose <suzuki@in.ibm.com>
Cc:	Ananth N Mavinakaynahalli <ananth@in.ibm.com>
Cc:	Kumar Gala <galak@kernel.crashing.org>
Cc:	linuxppc-dev@ozlabs.org
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/probes.h b/arch/powerpc/include/asm/probes.h
index 5f1e15b..3421637 100644
--- a/arch/powerpc/include/asm/probes.h
+++ b/arch/powerpc/include/asm/probes.h
@@ -38,5 +38,30 @@ typedef u32 ppc_opcode_t;
 #define is_trap(instr)		(IS_TW(instr) || IS_TWI(instr))
 #endif /* CONFIG_PPC64 */
 
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+#define MSR_SINGLESTEP	(MSR_DE)
+#else
+#define MSR_SINGLESTEP	(MSR_SE)
+#endif
+
+/* Enable single stepping for the current task */
+static inline void enable_single_step(struct pt_regs *regs)
+{
+	regs->msr |= MSR_SINGLESTEP;
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+	/*
+	 * We turn off Critical Input Exception(CE) to ensure that the single
+	 * step will be for the instruction we have the probe on; if we don't,
+	 * it is possible we'd get the single step reported for CE.
+	 */
+	regs->msr &= ~MSR_CE;
+	mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM);
+#ifdef CONFIG_PPC_47x
+	isync();
+#endif
+#endif
+}
+
+
 #endif /* __KERNEL__ */
 #endif	/* _ASM_POWERPC_PROBES_H */
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 560f430..2156ea9 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -36,12 +36,6 @@
 #include <asm/sstep.h>
 #include <asm/uaccess.h>
 
-#ifdef CONFIG_PPC_ADV_DEBUG_REGS
-#define MSR_SINGLESTEP	(MSR_DE)
-#else
-#define MSR_SINGLESTEP	(MSR_SE)
-#endif
-
 DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
 DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 
@@ -104,19 +98,7 @@ void __kprobes arch_remove_kprobe(struct kprobe *p)
 
 static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 {
-	regs->msr |= MSR_SINGLESTEP;
-#ifdef CONFIG_PPC_ADV_DEBUG_REGS
-	/*
-	 * We turn off Critical Input Exception(CE) to ensure that the single
-	 * step will be for the instruction we have the probe on; if we don't,
-	 * it is possible we'd get the single step reported for CE.
-	 */
-	regs->msr &= ~MSR_CE;
-	mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM);
-#ifdef CONFIG_PPC_47x
-	isync();
-#endif
-#endif
+	enable_single_step(regs);
 
 	/*
 	 * On powerpc we should single step on the original
-- 
cgit v0.10.2


From 39a421ff0b7cb056e687894a9d5f57aa1303e1c8 Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Wed, 20 Mar 2013 19:06:12 -0500
Subject: powerpc/mm/nohash: Ignore NULL stale_map entries

This happens with threads that are offline due to CPU hotplug
(including threads that were never "plugged in" to begin with because
SMT is disabled).

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index e779642..810f8e4 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -112,8 +112,10 @@ static unsigned int steal_context_smp(unsigned int id)
 		 */
 		for_each_cpu(cpu, mm_cpumask(mm)) {
 			for (i = cpu_first_thread_sibling(cpu);
-			     i <= cpu_last_thread_sibling(cpu); i++)
-				__set_bit(id, stale_map[i]);
+			     i <= cpu_last_thread_sibling(cpu); i++) {
+				if (stale_map[i])
+					__set_bit(id, stale_map[i]);
+			}
 			cpu = i - 1;
 		}
 		return id;
@@ -272,7 +274,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 		/* XXX This clear should ultimately be part of local_flush_tlb_mm */
 		for (i = cpu_first_thread_sibling(cpu);
 		     i <= cpu_last_thread_sibling(cpu); i++) {
-			__clear_bit(id, stale_map[i]);
+			if (stale_map[i])
+				__clear_bit(id, stale_map[i]);
 		}
 	}
 
-- 
cgit v0.10.2


From 3139b0a797d6826519ed98a13623a92f12269613 Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Wed, 17 Apr 2013 17:50:35 +0800
Subject: powerpc: Remove the unneeded trigger of decrementer interrupt in
 decrementer_check_overflow

Previously in order to handle the edge sensitive decrementers,
we choose to set the decrementer to 1 to trigger a decrementer
interrupt when re-enabling interrupts. But with the rework of the
lazy EE, we would replay the decrementer interrupt when re-enabling
interrupts if a decrementer interrupt occurs with irq soft-disabled.
So there is no need to trigger a decrementer interrupt in this case
any more.

Signed-off-by: Kevin Hao <haokexin@gmail.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 5cbcf4d..32fa52e 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -116,8 +116,6 @@ static inline notrace int decrementer_check_overflow(void)
  	u64 now = get_tb_or_rtc();
  	u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
  
-	if (now >= *next_tb)
-		set_dec(1);
 	return now >= *next_tb;
 }
 
-- 
cgit v0.10.2


From d5d8ec895ca599fbde43efe3a2f9714315e3d298 Mon Sep 17 00:00:00 2001
From: Daniel Walker <dwalker@fifo99.com>
Date: Tue, 23 Apr 2013 17:50:33 -0700
Subject: powerpc/mm: Make mmap_64.c compile on 32bit powerpc

There appears to be no good reason to keep this as 64bit only. It works
on 32bit also, and has checks so that it can work correctly with 32bit
binaries on 64bit hardware which is why I think this works.

I tested this on qemu using the virtex-ml507 machine type.

Before,

/bin2 # ./test & cat /proc/${!}/maps
00100000-00103000 r-xp 00000000 00:00 0          [vdso]
10000000-10007000 r-xp 00000000 00:01 454        /bin2/test
10017000-10018000 rw-p 00007000 00:01 454        /bin2/test
48000000-48020000 r-xp 00000000 00:01 224        /lib/ld-2.11.3.so
48021000-48023000 rw-p 00021000 00:01 224        /lib/ld-2.11.3.so
bfd03000-bfd24000 rw-p 00000000 00:00 0          [stack]
/bin2 # ./test & cat /proc/${!}/maps
00100000-00103000 r-xp 00000000 00:00 0          [vdso]
0fe6e000-0ffd8000 r-xp 00000000 00:01 214        /lib/libc-2.11.3.so
0ffd8000-0ffe8000 ---p 0016a000 00:01 214        /lib/libc-2.11.3.so
0ffe8000-0ffed000 rw-p 0016a000 00:01 214        /lib/libc-2.11.3.so
0ffed000-0fff0000 rw-p 00000000 00:00 0
10000000-10007000 r-xp 00000000 00:01 454        /bin2/test
10017000-10018000 rw-p 00007000 00:01 454        /bin2/test
48000000-48020000 r-xp 00000000 00:01 224        /lib/ld-2.11.3.so
48020000-48021000 rw-p 00000000 00:00 0
48021000-48023000 rw-p 00021000 00:01 224        /lib/ld-2.11.3.so
bf98a000-bf9ab000 rw-p 00000000 00:00 0          [stack]
/bin2 # ./test & cat /proc/${!}/maps
00100000-00103000 r-xp 00000000 00:00 0          [vdso]
0fe6e000-0ffd8000 r-xp 00000000 00:01 214        /lib/libc-2.11.3.so
0ffd8000-0ffe8000 ---p 0016a000 00:01 214        /lib/libc-2.11.3.so
0ffe8000-0ffed000 rw-p 0016a000 00:01 214        /lib/libc-2.11.3.so
0ffed000-0fff0000 rw-p 00000000 00:00 0
10000000-10007000 r-xp 00000000 00:01 454        /bin2/test
10017000-10018000 rw-p 00007000 00:01 454        /bin2/test
48000000-48020000 r-xp 00000000 00:01 224        /lib/ld-2.11.3.so
48020000-48021000 rw-p 00000000 00:00 0
48021000-48023000 rw-p 00021000 00:01 224        /lib/ld-2.11.3.so
bfa54000-bfa75000 rw-p 00000000 00:00 0          [stack]

After,

bash-4.1# ./test & cat /proc/${!}/maps
[7] 803
00100000-00103000 r-xp 00000000 00:00 0          [vdso]
10000000-10007000 r-xp 00000000 00:01 454        /bin2/test
10017000-10018000 rw-p 00007000 00:01 454        /bin2/test
b7eb0000-b7ed0000 r-xp 00000000 00:01 224        /lib/ld-2.11.3.so
b7ed1000-b7ed3000 rw-p 00021000 00:01 224        /lib/ld-2.11.3.so
bfbc0000-bfbe1000 rw-p 00000000 00:00 0          [stack]
bash-4.1# ./test & cat /proc/${!}/maps
[8] 805
00100000-00103000 r-xp 00000000 00:00 0          [vdso]
10000000-10007000 r-xp 00000000 00:01 454        /bin2/test
10017000-10018000 rw-p 00007000 00:01 454        /bin2/test
b7b03000-b7b23000 r-xp 00000000 00:01 224        /lib/ld-2.11.3.so
b7b24000-b7b26000 rw-p 00021000 00:01 224        /lib/ld-2.11.3.so
bfc27000-bfc48000 rw-p 00000000 00:00 0          [stack]
bash-4.1# ./test & cat /proc/${!}/maps
[9] 807
00100000-00103000 r-xp 00000000 00:00 0          [vdso]
10000000-10007000 r-xp 00000000 00:01 454        /bin2/test
10017000-10018000 rw-p 00007000 00:01 454        /bin2/test
b7f37000-b7f57000 r-xp 00000000 00:01 224        /lib/ld-2.11.3.so
b7f58000-b7f5a000 rw-p 00021000 00:01 224        /lib/ld-2.11.3.so
bff96000-bffb7000 rw-p 00000000 00:00 0          [stack]

Signed-off-by: Daniel Walker <dwalker@fifo90.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 14a6583..7135a25 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -404,9 +404,7 @@ static inline void prefetchw(const void *x)
 
 #define spin_lock_prefetch(x)	prefetchw(x)
 
-#ifdef CONFIG_PPC64
 #define HAVE_ARCH_PICK_MMAP_LAYOUT
-#endif
 
 #ifdef CONFIG_PPC64
 static inline unsigned long get_clean_sp(unsigned long sp, int is_32)
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index cf16b57..26f29a7 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -6,17 +6,16 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 
 ccflags-$(CONFIG_PPC64)	:= $(NO_MINIMAL_TOC)
 
-obj-y				:= fault.o mem.o pgtable.o gup.o \
+obj-y				:= fault.o mem.o pgtable.o gup.o mmap.o \
 				   init_$(CONFIG_WORD_SIZE).o \
 				   pgtable_$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
 				   tlb_nohash_low.o
 obj-$(CONFIG_PPC_BOOK3E)	+= tlb_low_$(CONFIG_WORD_SIZE)e.o
-obj-$(CONFIG_PPC64)		+= mmap_64.o
 hash64-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
 obj-$(CONFIG_PPC_STD_MMU_64)	+= hash_utils_64.o \
 				   slb_low.o slb.o stab.o \
-				   mmap_64.o $(hash64-y)
+				   $(hash64-y)
 obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o
 obj-$(CONFIG_PPC_STD_MMU)	+= hash_low_$(CONFIG_WORD_SIZE).o \
 				   tlb_hash$(CONFIG_WORD_SIZE).o \
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
new file mode 100644
index 0000000..67a42ed
--- /dev/null
+++ b/arch/powerpc/mm/mmap.c
@@ -0,0 +1,101 @@
+/*
+ *  flexible mmap layout support
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *
+ * Started by Ingo Molnar <mingo@elte.hu>
+ */
+
+#include <linux/personality.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+
+/*
+ * Top of mmap area (just below the process stack).
+ *
+ * Leave at least a ~128 MB hole on 32bit applications.
+ *
+ * On 64bit applications we randomise the stack by 1GB so we need to
+ * space our mmap start address by a further 1GB, otherwise there is a
+ * chance the mmap area will end up closer to the stack than our ulimit
+ * requires.
+ */
+#define MIN_GAP32 (128*1024*1024)
+#define MIN_GAP64 ((128 + 1024)*1024*1024UL)
+#define MIN_GAP ((is_32bit_task()) ? MIN_GAP32 : MIN_GAP64)
+#define MAX_GAP (TASK_SIZE/6*5)
+
+static inline int mmap_is_legacy(void)
+{
+	if (current->personality & ADDR_COMPAT_LAYOUT)
+		return 1;
+
+	if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
+		return 1;
+
+	return sysctl_legacy_va_layout;
+}
+
+static unsigned long mmap_rnd(void)
+{
+	unsigned long rnd = 0;
+
+	if (current->flags & PF_RANDOMIZE) {
+		/* 8MB for 32bit, 1GB for 64bit */
+		if (is_32bit_task())
+			rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT)));
+		else
+			rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT)));
+	}
+	return rnd << PAGE_SHIFT;
+}
+
+static inline unsigned long mmap_base(void)
+{
+	unsigned long gap = rlimit(RLIMIT_STACK);
+
+	if (gap < MIN_GAP)
+		gap = MIN_GAP;
+	else if (gap > MAX_GAP)
+		gap = MAX_GAP;
+
+	return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
+}
+
+/*
+ * This function, called very early during the creation of a new
+ * process VM image, sets up which VM layout function to use:
+ */
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+	/*
+	 * Fall back to the standard layout if the personality
+	 * bit is set, or if the expected stack growth is unlimited:
+	 */
+	if (mmap_is_legacy()) {
+		mm->mmap_base = TASK_UNMAPPED_BASE;
+		mm->get_unmapped_area = arch_get_unmapped_area;
+		mm->unmap_area = arch_unmap_area;
+	} else {
+		mm->mmap_base = mmap_base();
+		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+		mm->unmap_area = arch_unmap_area_topdown;
+	}
+}
diff --git a/arch/powerpc/mm/mmap_64.c b/arch/powerpc/mm/mmap_64.c
deleted file mode 100644
index 67a42ed..0000000
--- a/arch/powerpc/mm/mmap_64.c
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- *  flexible mmap layout support
- *
- * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- *
- * Started by Ingo Molnar <mingo@elte.hu>
- */
-
-#include <linux/personality.h>
-#include <linux/mm.h>
-#include <linux/random.h>
-#include <linux/sched.h>
-
-/*
- * Top of mmap area (just below the process stack).
- *
- * Leave at least a ~128 MB hole on 32bit applications.
- *
- * On 64bit applications we randomise the stack by 1GB so we need to
- * space our mmap start address by a further 1GB, otherwise there is a
- * chance the mmap area will end up closer to the stack than our ulimit
- * requires.
- */
-#define MIN_GAP32 (128*1024*1024)
-#define MIN_GAP64 ((128 + 1024)*1024*1024UL)
-#define MIN_GAP ((is_32bit_task()) ? MIN_GAP32 : MIN_GAP64)
-#define MAX_GAP (TASK_SIZE/6*5)
-
-static inline int mmap_is_legacy(void)
-{
-	if (current->personality & ADDR_COMPAT_LAYOUT)
-		return 1;
-
-	if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
-		return 1;
-
-	return sysctl_legacy_va_layout;
-}
-
-static unsigned long mmap_rnd(void)
-{
-	unsigned long rnd = 0;
-
-	if (current->flags & PF_RANDOMIZE) {
-		/* 8MB for 32bit, 1GB for 64bit */
-		if (is_32bit_task())
-			rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT)));
-		else
-			rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT)));
-	}
-	return rnd << PAGE_SHIFT;
-}
-
-static inline unsigned long mmap_base(void)
-{
-	unsigned long gap = rlimit(RLIMIT_STACK);
-
-	if (gap < MIN_GAP)
-		gap = MIN_GAP;
-	else if (gap > MAX_GAP)
-		gap = MAX_GAP;
-
-	return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
-}
-
-/*
- * This function, called very early during the creation of a new
- * process VM image, sets up which VM layout function to use:
- */
-void arch_pick_mmap_layout(struct mm_struct *mm)
-{
-	/*
-	 * Fall back to the standard layout if the personality
-	 * bit is set, or if the expected stack growth is unlimited:
-	 */
-	if (mmap_is_legacy()) {
-		mm->mmap_base = TASK_UNMAPPED_BASE;
-		mm->get_unmapped_area = arch_get_unmapped_area;
-		mm->unmap_area = arch_unmap_area;
-	} else {
-		mm->mmap_base = mmap_base();
-		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-		mm->unmap_area = arch_unmap_area_topdown;
-	}
-}
-- 
cgit v0.10.2


From 0962e8004e97409072bb6caee7b3ba948a5fb93a Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@ozlabs.org>
Date: Wed, 24 Apr 2013 14:26:30 +0800
Subject: powerpc/prom: Scan reserved-ranges node for memory reservations

Based on benh's proposal at
https://lists.ozlabs.org/pipermail/linuxppc-dev/2012-September/101237.html,
this change provides support for reserving memory from the
reserved-ranges node at the root of the device tree.

We just call memblock_reserve on these ranges for now.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 8b6f7a9..9c753bc 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -559,6 +559,33 @@ void __init early_init_dt_setup_initrd_arch(unsigned long start,
 }
 #endif
 
+static bool __init early_reserve_mem_dt(void)
+{
+	unsigned long i, len, dt_root;
+	const __be32 *prop;
+
+	dt_root = of_get_flat_dt_root();
+
+	prop = of_get_flat_dt_prop(dt_root, "reserved-ranges", &len);
+
+	if (!prop)
+		return false;
+
+	/* Each reserved range is an (address,size) pair, 2 cells each,
+	 * totalling 4 cells per range. */
+	for (i = 0; i < len / (sizeof(*prop) * 4); i++) {
+		u64 base, size;
+
+		base = of_read_number(prop + (i * 4) + 0, 2);
+		size = of_read_number(prop + (i * 4) + 2, 2);
+
+		if (size)
+			memblock_reserve(base, size);
+	}
+
+	return true;
+}
+
 static void __init early_reserve_mem(void)
 {
 	u64 base, size;
@@ -574,6 +601,14 @@ static void __init early_reserve_mem(void)
 	self_size = initial_boot_params->totalsize;
 	memblock_reserve(self_base, self_size);
 
+	/*
+	 * Try looking for reserved-regions property in the DT first; if
+	 * it's present, it'll contain all of the necessary reservation
+	 * info
+	 */
+	if (early_reserve_mem_dt())
+		return;
+
 #ifdef CONFIG_BLK_DEV_INITRD
 	/* then reserve the initrd, if any */
 	if (initrd_start && (initrd_end > initrd_start))
-- 
cgit v0.10.2


From 071df9422ac91c0d290e81f5ae2635c74cda6d00 Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Mon, 29 Apr 2013 13:42:43 +1000
Subject: powerpc: Add a configuration option for early BootX/OpenFirmware
 debug

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 863d877..d86875f 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -147,6 +147,13 @@ choice
 	  enable debugging for the wrong type of machine your kernel
 	  _will not boot_.
 
+config PPC_EARLY_DEBUG_BOOTX
+	bool "BootX or OpenFirmware"
+	depends on BOOTX_TEXT
+	help
+	  Select this to enable early debugging for a machine using BootX
+	  or OpenFirmware.
+
 config PPC_EARLY_DEBUG_LPAR
 	bool "LPAR HV Console"
 	depends on PPC_PSERIES
diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c
index 9d3fdcd..a158375 100644
--- a/arch/powerpc/kernel/udbg.c
+++ b/arch/powerpc/kernel/udbg.c
@@ -50,7 +50,7 @@ void __init udbg_early_init(void)
 	udbg_init_debug_beat();
 #elif defined(CONFIG_PPC_EARLY_DEBUG_PAS_REALMODE)
 	udbg_init_pas_realmode();
-#elif defined(CONFIG_BOOTX_TEXT)
+#elif defined(CONFIG_PPC_EARLY_DEBUG_BOOTX)
 	udbg_init_btext();
 #elif defined(CONFIG_PPC_EARLY_DEBUG_44x)
 	/* PPC44x debug */
-- 
cgit v0.10.2


From b9ef7d6b11c120cc402a76013062061bbe0fbaad Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Mon, 29 Apr 2013 13:42:44 +1000
Subject: powerpc: Update default configurations

Update default configurations for systems with CONFIG_BOOTX_TEXT
selected so that they continue to print early debug messages as is
currently the case.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/configs/c2k_defconfig b/arch/powerpc/configs/c2k_defconfig
index 2a84fd7..671a8f9 100644
--- a/arch/powerpc/configs/c2k_defconfig
+++ b/arch/powerpc/configs/c2k_defconfig
@@ -423,6 +423,8 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_BOOTX_TEXT=y
+CONFIG_PPC_EARLY_DEBUG=y
+CONFIG_PPC_EARLY_DEBUG_BOOTX=y
 CONFIG_KEYS=y
 CONFIG_KEYS_DEBUG_PROC_KEYS=y
 CONFIG_SECURITY=y
diff --git a/arch/powerpc/configs/g5_defconfig b/arch/powerpc/configs/g5_defconfig
index 07b7f2a..1ea22fc 100644
--- a/arch/powerpc/configs/g5_defconfig
+++ b/arch/powerpc/configs/g5_defconfig
@@ -284,6 +284,8 @@ CONFIG_DEBUG_MUTEXES=y
 CONFIG_LATENCYTOP=y
 CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_BOOTX_TEXT=y
+CONFIG_PPC_EARLY_DEBUG=y
+CONFIG_PPC_EARLY_DEBUG_BOOTX=y
 CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_ECB=m
diff --git a/arch/powerpc/configs/maple_defconfig b/arch/powerpc/configs/maple_defconfig
index 02ac96b..2a5afac 100644
--- a/arch/powerpc/configs/maple_defconfig
+++ b/arch/powerpc/configs/maple_defconfig
@@ -138,6 +138,8 @@ CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_XMON=y
 CONFIG_XMON_DEFAULT=y
 CONFIG_BOOTX_TEXT=y
+CONFIG_PPC_EARLY_DEBUG=y
+CONFIG_PPC_EARLY_DEBUG_BOOTX=y
 CONFIG_CRYPTO_ECB=m
 CONFIG_CRYPTO_PCBC=m
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/powerpc/configs/pmac32_defconfig b/arch/powerpc/configs/pmac32_defconfig
index 29767a8..a73626b 100644
--- a/arch/powerpc/configs/pmac32_defconfig
+++ b/arch/powerpc/configs/pmac32_defconfig
@@ -350,6 +350,8 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_XMON=y
 CONFIG_XMON_DEFAULT=y
 CONFIG_BOOTX_TEXT=y
+CONFIG_PPC_EARLY_DEBUG=y
+CONFIG_PPC_EARLY_DEBUG_BOOTX=y
 CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_MD4=m
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index aef3f71..c86fcb9 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -398,6 +398,8 @@ CONFIG_FTR_FIXUP_SELFTEST=y
 CONFIG_MSI_BITMAP_SELFTEST=y
 CONFIG_XMON=y
 CONFIG_BOOTX_TEXT=y
+CONFIG_PPC_EARLY_DEBUG=y
+CONFIG_PPC_EARLY_DEBUG_BOOTX=y
 CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_PCBC=m
diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig
index be1cb6e..20ebfaf 100644
--- a/arch/powerpc/configs/ppc6xx_defconfig
+++ b/arch/powerpc/configs/ppc6xx_defconfig
@@ -1264,6 +1264,8 @@ CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_XMON=y
 CONFIG_BOOTX_TEXT=y
+CONFIG_PPC_EARLY_DEBUG=y
+CONFIG_PPC_EARLY_DEBUG_BOOTX=y
 CONFIG_KEYS=y
 CONFIG_KEYS_DEBUG_PROC_KEYS=y
 CONFIG_SECURITY=y
-- 
cgit v0.10.2


From 70a54a4faec72ee9d12b9c4dfa27bc241deb79a6 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Mon, 6 May 2013 21:32:40 +1000
Subject: powerpc: Fix single step emulation of 32bit overflowed branches

Check truncate_if_32bit() on final write to nip.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index e15c521..99c7fc1 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -580,7 +580,7 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 		if (instr & 1)
 			regs->link = regs->nip;
 		if (branch_taken(instr, regs))
-			regs->nip = imm;
+			regs->nip = truncate_if_32bit(regs->msr, imm);
 		return 1;
 #ifdef CONFIG_PPC64
 	case 17:	/* sc */
-- 
cgit v0.10.2


From ab9a4183fddf232a46b6255e0d3da5a09f85ecbd Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Thu, 9 May 2013 10:42:13 +1000
Subject: powerpc: Update currituck pci/usb fixup for new board revision

The currituck board uses a different IRQ for the pci usb host
controller depending on the board revision. This patch adds support
for newer board revisions by retrieving the board revision from the
FPGA and mapping the appropriate IRQ.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Acked-by: Tony Breeds <tony@bakeyournoodle.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/boot/dts/currituck.dts b/arch/powerpc/boot/dts/currituck.dts
index b801dd0..d2c8a87 100644
--- a/arch/powerpc/boot/dts/currituck.dts
+++ b/arch/powerpc/boot/dts/currituck.dts
@@ -103,6 +103,11 @@
 				interrupts = <34 2>;
 			};
 
+			FPGA0: fpga@50000000 {
+				compatible = "ibm,currituck-fpga";
+				reg = <0x50000000 0x4>;
+			};
+
 			IIC0: i2c@00000000 {
 				compatible = "ibm,iic-currituck", "ibm,iic";
 				reg = <0x0 0x00000014>;
diff --git a/arch/powerpc/platforms/44x/currituck.c b/arch/powerpc/platforms/44x/currituck.c
index ecd3890..c52e1b3 100644
--- a/arch/powerpc/platforms/44x/currituck.c
+++ b/arch/powerpc/platforms/44x/currituck.c
@@ -176,13 +176,48 @@ static int __init ppc47x_probe(void)
 	return 1;
 }
 
+static int board_rev = -1;
+static int __init ppc47x_get_board_rev(void)
+{
+	u8 fpga_reg0;
+	void *fpga;
+	struct device_node *np;
+
+	np = of_find_compatible_node(NULL, NULL, "ibm,currituck-fpga");
+	if (!np)
+		goto fail;
+
+	fpga = of_iomap(np, 0);
+	of_node_put(np);
+	if (!fpga)
+		goto fail;
+
+	fpga_reg0 = ioread8(fpga);
+	board_rev = fpga_reg0 & 0x03;
+	pr_info("%s: Found board revision %d\n", __func__, board_rev);
+	iounmap(fpga);
+	return 0;
+
+fail:
+	pr_info("%s: Unable to find board revision\n", __func__);
+	return 0;
+}
+machine_arch_initcall(ppc47x, ppc47x_get_board_rev);
+
 /* Use USB controller should have been hardware swizzled but it wasn't :( */
 static void ppc47x_pci_irq_fixup(struct pci_dev *dev)
 {
 	if (dev->vendor == 0x1033 && (dev->device == 0x0035 ||
 	                              dev->device == 0x00e0)) {
-		dev->irq = irq_create_mapping(NULL, 47);
-		pr_info("%s: Mapping irq 47 %d\n", __func__, dev->irq);
+		if (board_rev == 0) {
+			dev->irq = irq_create_mapping(NULL, 47);
+			pr_info("%s: Mapping irq %d\n", __func__, dev->irq);
+		} else if (board_rev == 2) {
+			dev->irq = irq_create_mapping(NULL, 49);
+			pr_info("%s: Mapping irq %d\n", __func__, dev->irq);
+		} else {
+			pr_alert("%s: Unknown board revision\n", __func__);
+		}
 	}
 }
 
-- 
cgit v0.10.2


From 4e13c1ac6baa1d6c2b650d66ca89e1e12727ec19 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Tue, 21 May 2013 13:33:09 +1000
Subject: powerpc/vfio: Enable on PowerNV platform

This initializes IOMMU groups based on the IOMMU configuration
discovered during the PCI scan on POWERNV (POWER non virtualized)
platform.  The IOMMU groups are to be used later by the VFIO driver,
which is used for PCI pass through.

It also implements an API for mapping/unmapping pages for
guest PCI drivers and providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.

The iommu_put_tce_user_mode() does only a single page mapping
as an API for adding many mappings at once is going to be
added later.

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.  As h_put_tce hypercall is received by the host
kernel and processed by the QEMU (what involves calling
the host kernel again), performance is not the best -
circa 220MB/s on 10Gb ethernet network.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..98d1422 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
 	struct iommu_pool large_pool;
 	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+	struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
@@ -98,6 +101,8 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
 					    int nid);
+extern void iommu_register_group(struct iommu_table *tbl,
+				 int pci_domain_number, unsigned long pe_num);
 
 extern int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 			struct scatterlist *sglist, int nelems,
@@ -147,5 +152,26 @@ static inline void iommu_restore(void)
 }
 #endif
 
+/* The API to support IOMMU operations for VFIO */
+extern int iommu_tce_clear_param_check(struct iommu_table *tbl,
+		unsigned long ioba, unsigned long tce_value,
+		unsigned long npages);
+extern int iommu_tce_put_param_check(struct iommu_table *tbl,
+		unsigned long ioba, unsigned long tce);
+extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
+		unsigned long hwaddr, enum dma_data_direction direction);
+extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
+		unsigned long entry);
+extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
+		unsigned long entry, unsigned long pages);
+extern int iommu_put_tce_user_mode(struct iommu_table *tbl,
+		unsigned long entry, unsigned long tce);
+
+extern void iommu_flush_tce(struct iommu_table *tbl);
+extern int iommu_take_ownership(struct iommu_table *tbl);
+extern void iommu_release_ownership(struct iommu_table *tbl);
+
+extern enum dma_data_direction iommu_tce_direction(unsigned long tce);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index c0d0dbd..b20ff17 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -36,6 +36,8 @@
 #include <linux/hash.h>
 #include <linux/fault-inject.h>
 #include <linux/pci.h>
+#include <linux/iommu.h>
+#include <linux/sched.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/iommu.h>
@@ -44,6 +46,7 @@
 #include <asm/kdump.h>
 #include <asm/fadump.h>
 #include <asm/vio.h>
+#include <asm/tce.h>
 
 #define DBG(...)
 
@@ -724,6 +727,13 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 	if (tbl->it_offset == 0)
 		clear_bit(0, tbl->it_map);
 
+#ifdef CONFIG_IOMMU_API
+	if (tbl->it_group) {
+		iommu_group_put(tbl->it_group);
+		BUG_ON(tbl->it_group);
+	}
+#endif
+
 	/* verify that table contains no entries */
 	if (!bitmap_empty(tbl->it_map, tbl->it_size))
 		pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name);
@@ -860,3 +870,316 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		free_pages((unsigned long)vaddr, get_order(size));
 	}
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+static void group_release(void *iommu_data)
+{
+	struct iommu_table *tbl = iommu_data;
+	tbl->it_group = NULL;
+}
+
+void iommu_register_group(struct iommu_table *tbl,
+		int pci_domain_number, unsigned long pe_num)
+{
+	struct iommu_group *grp;
+	char *name;
+
+	grp = iommu_group_alloc();
+	if (IS_ERR(grp)) {
+		pr_warn("powerpc iommu api: cannot create new group, err=%ld\n",
+				PTR_ERR(grp));
+		return;
+	}
+	tbl->it_group = grp;
+	iommu_group_set_iommudata(grp, tbl, group_release);
+	name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
+			pci_domain_number, pe_num);
+	if (!name)
+		return;
+	iommu_group_set_name(grp, name);
+	kfree(name);
+}
+
+enum dma_data_direction iommu_tce_direction(unsigned long tce)
+{
+	if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE))
+		return DMA_BIDIRECTIONAL;
+	else if (tce & TCE_PCI_READ)
+		return DMA_TO_DEVICE;
+	else if (tce & TCE_PCI_WRITE)
+		return DMA_FROM_DEVICE;
+	else
+		return DMA_NONE;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_direction);
+
+void iommu_flush_tce(struct iommu_table *tbl)
+{
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+}
+EXPORT_SYMBOL_GPL(iommu_flush_tce);
+
+int iommu_tce_clear_param_check(struct iommu_table *tbl,
+		unsigned long ioba, unsigned long tce_value,
+		unsigned long npages)
+{
+	/* ppc_md.tce_free() does not support any value but 0 */
+	if (tce_value)
+		return -EINVAL;
+
+	if (ioba & ~IOMMU_PAGE_MASK)
+		return -EINVAL;
+
+	ioba >>= IOMMU_PAGE_SHIFT;
+	if (ioba < tbl->it_offset)
+		return -EINVAL;
+
+	if ((ioba + npages) > (tbl->it_offset + tbl->it_size))
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check);
+
+int iommu_tce_put_param_check(struct iommu_table *tbl,
+		unsigned long ioba, unsigned long tce)
+{
+	if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+		return -EINVAL;
+
+	if (tce & ~(IOMMU_PAGE_MASK | TCE_PCI_WRITE | TCE_PCI_READ))
+		return -EINVAL;
+
+	if (ioba & ~IOMMU_PAGE_MASK)
+		return -EINVAL;
+
+	ioba >>= IOMMU_PAGE_SHIFT;
+	if (ioba < tbl->it_offset)
+		return -EINVAL;
+
+	if ((ioba + 1) > (tbl->it_offset + tbl->it_size))
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_put_param_check);
+
+unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
+{
+	unsigned long oldtce;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	spin_lock(&(pool->lock));
+
+	oldtce = ppc_md.tce_get(tbl, entry);
+	if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
+		ppc_md.tce_free(tbl, entry, 1);
+	else
+		oldtce = 0;
+
+	spin_unlock(&(pool->lock));
+
+	return oldtce;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tce);
+
+int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
+		unsigned long entry, unsigned long pages)
+{
+	unsigned long oldtce;
+	struct page *page;
+
+	for ( ; pages; --pages, ++entry) {
+		oldtce = iommu_clear_tce(tbl, entry);
+		if (!oldtce)
+			continue;
+
+		page = pfn_to_page(oldtce >> PAGE_SHIFT);
+		WARN_ON(!page);
+		if (page) {
+			if (oldtce & TCE_PCI_WRITE)
+				SetPageDirty(page);
+			put_page(page);
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages);
+
+/*
+ * hwaddr is a kernel virtual address here (0xc... bazillion),
+ * tce_build converts it to a physical address.
+ */
+int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
+		unsigned long hwaddr, enum dma_data_direction direction)
+{
+	int ret = -EBUSY;
+	unsigned long oldtce;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	spin_lock(&(pool->lock));
+
+	oldtce = ppc_md.tce_get(tbl, entry);
+	/* Add new entry if it is not busy */
+	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+		ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, direction, NULL);
+
+	spin_unlock(&(pool->lock));
+
+	/* if (unlikely(ret))
+		pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n",
+				__func__, hwaddr, entry << IOMMU_PAGE_SHIFT,
+				hwaddr, ret); */
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_build);
+
+int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
+		unsigned long tce)
+{
+	int ret;
+	struct page *page = NULL;
+	unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK & ~PAGE_MASK;
+	enum dma_data_direction direction = iommu_tce_direction(tce);
+
+	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+			direction != DMA_TO_DEVICE, &page);
+	if (unlikely(ret != 1)) {
+		/* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, ret); */
+		return -EFAULT;
+	}
+	hwaddr = (unsigned long) page_address(page) + offset;
+
+	ret = iommu_tce_build(tbl, entry, hwaddr, direction);
+	if (ret)
+		put_page(page);
+
+	if (ret < 0)
+		pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n",
+				__func__, entry << IOMMU_PAGE_SHIFT, tce, ret);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode);
+
+int iommu_take_ownership(struct iommu_table *tbl)
+{
+	unsigned long sz = (tbl->it_size + 7) >> 3;
+
+	if (tbl->it_offset == 0)
+		clear_bit(0, tbl->it_map);
+
+	if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
+		pr_err("iommu_tce: it_map is not empty");
+		return -EBUSY;
+	}
+
+	memset(tbl->it_map, 0xff, sz);
+	iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_take_ownership);
+
+void iommu_release_ownership(struct iommu_table *tbl)
+{
+	unsigned long sz = (tbl->it_size + 7) >> 3;
+
+	iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
+	memset(tbl->it_map, 0, sz);
+
+	/* Restore bit#0 set by iommu_init_table() */
+	if (tbl->it_offset == 0)
+		set_bit(0, tbl->it_map);
+}
+EXPORT_SYMBOL_GPL(iommu_release_ownership);
+
+static int iommu_add_device(struct device *dev)
+{
+	struct iommu_table *tbl;
+	int ret = 0;
+
+	if (WARN_ON(dev->iommu_group)) {
+		pr_warn("iommu_tce: device %s is already in iommu group %d, skipping\n",
+				dev_name(dev),
+				iommu_group_id(dev->iommu_group));
+		return -EBUSY;
+	}
+
+	tbl = get_iommu_table_base(dev);
+	if (!tbl || !tbl->it_group) {
+		pr_debug("iommu_tce: skipping device %s with no tbl\n",
+				dev_name(dev));
+		return 0;
+	}
+
+	pr_debug("iommu_tce: adding %s to iommu group %d\n",
+			dev_name(dev), iommu_group_id(tbl->it_group));
+
+	ret = iommu_group_add_device(tbl->it_group, dev);
+	if (ret < 0)
+		pr_err("iommu_tce: %s has not been added, ret=%d\n",
+				dev_name(dev), ret);
+
+	return ret;
+}
+
+static void iommu_del_device(struct device *dev)
+{
+	iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+			      unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		return iommu_add_device(dev);
+	case BUS_NOTIFY_DEL_DEVICE:
+		iommu_del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+	.notifier_call = iommu_bus_notifier,
+};
+
+static int __init tce_iommu_init(void)
+{
+	struct pci_dev *pdev = NULL;
+
+	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+
+	for_each_pci_dev(pdev)
+		iommu_add_device(&pdev->dev);
+
+	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+	return 0;
+}
+
+subsys_initcall_sync(tce_iommu_init);
+
+#else
+
+void iommu_register_group(struct iommu_table *tbl,
+		int pci_domain_number, unsigned long pe_num)
+{
+}
+
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 9c9d15e..2931d97 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -595,6 +595,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 			       TCE_PCI_SWINV_PAIR;
 	}
 	iommu_init_table(tbl, phb->hose->node);
+	iommu_register_group(tbl, pci_domain_nr(pe->pbus), pe->pe_number);
 
 	return;
  fail:
diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
index 92b37a0..5d378f2 100644
--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
@@ -86,8 +86,11 @@ static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { }
 static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
 					 struct pci_dev *pdev)
 {
-	if (phb->p5ioc2.iommu_table.it_map == NULL)
+	if (phb->p5ioc2.iommu_table.it_map == NULL) {
 		iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node);
+		iommu_register_group(&phb->p5ioc2.iommu_table,
+				pci_domain_nr(phb->hose->bus), phb->opal_id);
+	}
 
 	set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table);
 }
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 277343c..e16b729 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -20,6 +20,7 @@
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/msi.h>
+#include <linux/iommu.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -412,6 +413,7 @@ static struct iommu_table *pnv_pci_setup_bml_iommu(struct pci_controller *hose)
 	pnv_pci_setup_iommu_table(tbl, __va(be64_to_cpup(basep)),
 				  be32_to_cpup(sizep), 0);
 	iommu_init_table(tbl, hose->node);
+	iommu_register_group(tbl, pci_domain_nr(hose->bus), 0);
 
 	/* Deal with SW invalidated TCEs when needed (BML way) */
 	swinvp = of_get_property(hose->dn, "linux,tce-sw-invalidate-info",
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index c332fb9..3f3abde 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -261,4 +261,12 @@ config SHMOBILE_IOMMU_L1SIZE
 	default 256 if SHMOBILE_IOMMU_ADDRSIZE_64MB
 	default 128 if SHMOBILE_IOMMU_ADDRSIZE_32MB
 
+config SPAPR_TCE_IOMMU
+	bool "sPAPR TCE IOMMU Support"
+	depends on PPC_POWERNV
+	select IOMMU_API
+	help
+	  Enables bits of IOMMU API required by VFIO. The iommu_ops
+	  is not implemented as it is not necessary for VFIO.
+
 endif # IOMMU_SUPPORT
-- 
cgit v0.10.2


From 5ffd229c02731a91d08ca21e76b503c5bbb5c095 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Tue, 21 May 2013 13:33:10 +1000
Subject: powerpc/vfio: Implement IOMMU driver for VFIO

VFIO implements platform independent stuff such as
a PCI driver, BAR access (via read/write on a file descriptor
or direct mapping when possible) and IRQ signaling.

The platform dependent part includes IOMMU initialization
and handling.  This implements an IOMMU driver for VFIO
which does mapping/unmapping pages for the guest IO and
provides information about DMA window (required by a POWER
guest).

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 8eda363..c55533c 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -283,6 +283,69 @@ a direct pass through for VFIO_DEVICE_* ioctls.  The read/write/mmap
 interfaces implement the device region access defined by the device's
 own VFIO_DEVICE_GET_REGION_INFO ioctl.
 
+
+PPC64 sPAPR implementation note
+-------------------------------------------------------------------------------
+
+This implementation has some specifics:
+
+1) Only one IOMMU group per container is supported as an IOMMU group
+represents the minimal entity which isolation can be guaranteed for and
+groups are allocated statically, one per a Partitionable Endpoint (PE)
+(PE is often a PCI domain but not always).
+
+2) The hardware supports so called DMA windows - the PCI address range
+within which DMA transfer is allowed, any attempt to access address space
+out of the window leads to the whole PE isolation.
+
+3) PPC64 guests are paravirtualized but not fully emulated. There is an API
+to map/unmap pages for DMA, and it normally maps 1..32 pages per call and
+currently there is no way to reduce the number of calls. In order to make things
+faster, the map/unmap handling has been implemented in real mode which provides
+an excellent performance which has limitations such as inability to do
+locked pages accounting in real time.
+
+So 3 additional ioctls have been added:
+
+	VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
+		of the DMA window on the PCI bus.
+
+	VFIO_IOMMU_ENABLE - enables the container. The locked pages accounting
+		is done at this point. This lets user first to know what
+		the DMA window is and adjust rlimit before doing any real job.
+
+	VFIO_IOMMU_DISABLE - disables the container.
+
+
+The code flow from the example above should be slightly changed:
+
+	.....
+	/* Add the group to the container */
+	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
+
+	/* Enable the IOMMU model we want */
+	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU)
+
+	/* Get addition sPAPR IOMMU info */
+	vfio_iommu_spapr_tce_info spapr_iommu_info;
+	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &spapr_iommu_info);
+
+	if (ioctl(container, VFIO_IOMMU_ENABLE))
+		/* Cannot enable container, may be low rlimit */
+
+	/* Allocate some space and setup a DMA mapping */
+	dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE,
+			     MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+
+	dma_map.size = 1024 * 1024;
+	dma_map.iova = 0; /* 1MB starting at 0x0 from device view */
+	dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+	/* Check here is .iova/.size are within DMA window from spapr_iommu_info */
+
+	ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
+	.....
+
 -------------------------------------------------------------------------------
 
 [1] VFIO was originally an acronym for "Virtual Function I/O" in its
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec..b464687 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
 	depends on VFIO
 	default n
 
+config VFIO_IOMMU_SPAPR_TCE
+	tristate
+	depends on VFIO && SPAPR_TCE_IOMMU
+	default n
+
 menuconfig VFIO
 	tristate "VFIO Non-Privileged userspace driver framework"
 	depends on IOMMU_API
 	select VFIO_IOMMU_TYPE1 if X86
+	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
 	help
 	  VFIO provides a framework for secure userspace device drivers.
 	  See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a..72bfabc 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 6d78736..259ad28 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1415,6 +1415,7 @@ static int __init vfio_init(void)
 	 * drivers.
 	 */
 	request_module_nowait("vfio_iommu_type1");
+	request_module_nowait("vfio_iommu_spapr_tce");
 
 	return 0;
 
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 0000000..bdae7a0
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,377 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2013 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_type1.c:
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/vfio.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "aik@ozlabs.ru"
+#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group);
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ *
+ * This code handles mapping and unmapping of user data buffers
+ * into DMA'ble space using the IOMMU
+ */
+
+/*
+ * The container descriptor supports only a single group per container.
+ * Required by the API as the container is not supplied with the IOMMU group
+ * at the moment of initialization.
+ */
+struct tce_container {
+	struct mutex lock;
+	struct iommu_table *tbl;
+	bool enabled;
+};
+
+static int tce_iommu_enable(struct tce_container *container)
+{
+	int ret = 0;
+	unsigned long locked, lock_limit, npages;
+	struct iommu_table *tbl = container->tbl;
+
+	if (!container->tbl)
+		return -ENXIO;
+
+	if (!current->mm)
+		return -ESRCH; /* process exited */
+
+	if (container->enabled)
+		return -EBUSY;
+
+	/*
+	 * When userspace pages are mapped into the IOMMU, they are effectively
+	 * locked memory, so, theoretically, we need to update the accounting
+	 * of locked pages on each map and unmap.  For powerpc, the map unmap
+	 * paths can be very hot, though, and the accounting would kill
+	 * performance, especially since it would be difficult to impossible
+	 * to handle the accounting in real mode only.
+	 *
+	 * To address that, rather than precisely accounting every page, we
+	 * instead account for a worst case on locked memory when the iommu is
+	 * enabled and disabled.  The worst case upper bound on locked memory
+	 * is the size of the whole iommu window, which is usually relatively
+	 * small (compared to total memory sizes) on POWER hardware.
+	 *
+	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
+	 * that would effectively kill the guest at random points, much better
+	 * enforcing the limit based on the max that the guest can map.
+	 */
+	down_write(&current->mm->mmap_sem);
+	npages = (tbl->it_size << IOMMU_PAGE_SHIFT) >> PAGE_SHIFT;
+	locked = current->mm->locked_vm + npages;
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+		pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
+				rlimit(RLIMIT_MEMLOCK));
+		ret = -ENOMEM;
+	} else {
+
+		current->mm->locked_vm += npages;
+		container->enabled = true;
+	}
+	up_write(&current->mm->mmap_sem);
+
+	return ret;
+}
+
+static void tce_iommu_disable(struct tce_container *container)
+{
+	if (!container->enabled)
+		return;
+
+	container->enabled = false;
+
+	if (!container->tbl || !current->mm)
+		return;
+
+	down_write(&current->mm->mmap_sem);
+	current->mm->locked_vm -= (container->tbl->it_size <<
+			IOMMU_PAGE_SHIFT) >> PAGE_SHIFT;
+	up_write(&current->mm->mmap_sem);
+}
+
+static void *tce_iommu_open(unsigned long arg)
+{
+	struct tce_container *container;
+
+	if (arg != VFIO_SPAPR_TCE_IOMMU) {
+		pr_err("tce_vfio: Wrong IOMMU type\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	container = kzalloc(sizeof(*container), GFP_KERNEL);
+	if (!container)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&container->lock);
+
+	return container;
+}
+
+static void tce_iommu_release(void *iommu_data)
+{
+	struct tce_container *container = iommu_data;
+
+	WARN_ON(container->tbl && !container->tbl->it_group);
+	tce_iommu_disable(container);
+
+	if (container->tbl && container->tbl->it_group)
+		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
+
+	mutex_destroy(&container->lock);
+
+	kfree(container);
+}
+
+static long tce_iommu_ioctl(void *iommu_data,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct tce_container *container = iommu_data;
+	unsigned long minsz;
+	long ret;
+
+	switch (cmd) {
+	case VFIO_CHECK_EXTENSION:
+		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
+
+	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
+		struct vfio_iommu_spapr_tce_info info;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
+				dma32_window_size);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
+		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+		info.flags = 0;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+	case VFIO_IOMMU_MAP_DMA: {
+		struct vfio_iommu_type1_dma_map param;
+		struct iommu_table *tbl = container->tbl;
+		unsigned long tce, i;
+
+		if (!tbl)
+			return -ENXIO;
+
+		BUG_ON(!tbl->it_group);
+
+		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
+				VFIO_DMA_MAP_FLAG_WRITE))
+			return -EINVAL;
+
+		if ((param.size & ~IOMMU_PAGE_MASK) ||
+				(param.vaddr & ~IOMMU_PAGE_MASK))
+			return -EINVAL;
+
+		/* iova is checked by the IOMMU API */
+		tce = param.vaddr;
+		if (param.flags & VFIO_DMA_MAP_FLAG_READ)
+			tce |= TCE_PCI_READ;
+		if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
+			tce |= TCE_PCI_WRITE;
+
+		ret = iommu_tce_put_param_check(tbl, param.iova, tce);
+		if (ret)
+			return ret;
+
+		for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT); ++i) {
+			ret = iommu_put_tce_user_mode(tbl,
+					(param.iova >> IOMMU_PAGE_SHIFT) + i,
+					tce);
+			if (ret)
+				break;
+			tce += IOMMU_PAGE_SIZE;
+		}
+		if (ret)
+			iommu_clear_tces_and_put_pages(tbl,
+					param.iova >> IOMMU_PAGE_SHIFT,	i);
+
+		iommu_flush_tce(tbl);
+
+		return ret;
+	}
+	case VFIO_IOMMU_UNMAP_DMA: {
+		struct vfio_iommu_type1_dma_unmap param;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
+				size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		/* No flag is supported now */
+		if (param.flags)
+			return -EINVAL;
+
+		if (param.size & ~IOMMU_PAGE_MASK)
+			return -EINVAL;
+
+		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
+				param.size >> IOMMU_PAGE_SHIFT);
+		if (ret)
+			return ret;
+
+		ret = iommu_clear_tces_and_put_pages(tbl,
+				param.iova >> IOMMU_PAGE_SHIFT,
+				param.size >> IOMMU_PAGE_SHIFT);
+		iommu_flush_tce(tbl);
+
+		return ret;
+	}
+	case VFIO_IOMMU_ENABLE:
+		mutex_lock(&container->lock);
+		ret = tce_iommu_enable(container);
+		mutex_unlock(&container->lock);
+		return ret;
+
+
+	case VFIO_IOMMU_DISABLE:
+		mutex_lock(&container->lock);
+		tce_iommu_disable(container);
+		mutex_unlock(&container->lock);
+		return 0;
+	}
+
+	return -ENOTTY;
+}
+
+static int tce_iommu_attach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	int ret;
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+
+	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
+			iommu_group_id(iommu_group), iommu_group); */
+	if (container->tbl) {
+		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
+				iommu_group_id(container->tbl->it_group),
+				iommu_group_id(iommu_group));
+		ret = -EBUSY;
+	} else if (container->enabled) {
+		pr_err("tce_vfio: attaching group #%u to enabled container\n",
+				iommu_group_id(iommu_group));
+		ret = -EBUSY;
+	} else {
+		ret = iommu_take_ownership(tbl);
+		if (!ret)
+			container->tbl = tbl;
+	}
+
+	mutex_unlock(&container->lock);
+
+	return ret;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	if (tbl != container->tbl) {
+		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
+				iommu_group_id(iommu_group),
+				iommu_group_id(tbl->it_group));
+	} else {
+		if (container->enabled) {
+			pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
+					iommu_group_id(tbl->it_group));
+			tce_iommu_disable(container);
+		}
+
+		/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+				iommu_group_id(iommu_group), iommu_group); */
+		container->tbl = NULL;
+		iommu_release_ownership(tbl);
+	}
+	mutex_unlock(&container->lock);
+}
+
+const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+	.name		= "iommu-vfio-powerpc",
+	.owner		= THIS_MODULE,
+	.open		= tce_iommu_open,
+	.release	= tce_iommu_release,
+	.ioctl		= tce_iommu_ioctl,
+	.attach_group	= tce_iommu_attach_group,
+	.detach_group	= tce_iommu_detach_group,
+};
+
+static int __init tce_iommu_init(void)
+{
+	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 284ff24..87ee4f4 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -22,6 +22,7 @@
 /* Extensions */
 
 #define VFIO_TYPE1_IOMMU		1
+#define VFIO_SPAPR_TCE_IOMMU		2
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
@@ -375,4 +376,37 @@ struct vfio_iommu_type1_dma_unmap {
 
 #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
 
+/*
+ * IOCTLs to enable/disable IOMMU container usage.
+ * No parameters are supported.
+ */
+#define VFIO_IOMMU_ENABLE	_IO(VFIO_TYPE, VFIO_BASE + 15)
+#define VFIO_IOMMU_DISABLE	_IO(VFIO_TYPE, VFIO_BASE + 16)
+
+/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+/*
+ * The SPAPR TCE info struct provides the information about the PCI bus
+ * address ranges available for DMA, these values are programmed into
+ * the hardware so the guest has to know that information.
+ *
+ * The DMA 32 bit window start is an absolute PCI bus address.
+ * The IOVA address passed via map/unmap ioctls are absolute PCI bus
+ * addresses too so the window works as a filter rather than an offset
+ * for IOVA addresses.
+ *
+ * A flag will need to be added if other page sizes are supported,
+ * so as defined here, it is always 4k.
+ */
+struct vfio_iommu_spapr_tce_info {
+	__u32 argsz;
+	__u32 flags;			/* reserved for future use */
+	__u32 dma32_window_start;	/* 32 bit window start (bytes) */
+	__u32 dma32_window_size;	/* 32 bit window size (bytes) */
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/* ***************************************************************** */
+
 #endif /* _UAPIVFIO_H */
-- 
cgit v0.10.2


From 5b25199eff8e124297e6e95392f1719d20daca89 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Tue, 21 May 2013 13:33:11 +1000
Subject: powerpc/vfio: Enable on pSeries platform

The enables VFIO on the pSeries platform, enabling user space
programs to access PCI devices directly.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 86ae364..23fc1dc 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -614,6 +614,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 
 	iommu_table_setparms(pci->phb, dn, tbl);
 	pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
+	iommu_register_group(tbl, pci_domain_nr(bus), 0);
 
 	/* Divide the rest (1.75GB) among the children */
 	pci->phb->dma_window_size = 0x80000000ul;
@@ -658,6 +659,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 				   ppci->phb->node);
 		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
 		ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node);
+		iommu_register_group(tbl, pci_domain_nr(bus), 0);
 		pr_debug("  created table: %p\n", ppci->iommu_table);
 	}
 }
@@ -684,6 +686,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 				   phb->node);
 		iommu_table_setparms(phb, dn, tbl);
 		PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
+		iommu_register_group(tbl, pci_domain_nr(phb->bus), 0);
 		set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);
 		return;
 	}
@@ -1184,6 +1187,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 				   pci->phb->node);
 		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
 		pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
+		iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0);
 		pr_debug("  created table: %p\n", pci->iommu_table);
 	} else {
 		pr_debug("  found DMA window, table: %p\n", pci->iommu_table);
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 3f3abde..01730b2 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -263,7 +263,7 @@ config SHMOBILE_IOMMU_L1SIZE
 
 config SPAPR_TCE_IOMMU
 	bool "sPAPR TCE IOMMU Support"
-	depends on PPC_POWERNV
+	depends on PPC_POWERNV || PPC_PSERIES
 	select IOMMU_API
 	help
 	  Enables bits of IOMMU API required by VFIO. The iommu_ops
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index b464687..26b3d9d 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -12,7 +12,7 @@ menuconfig VFIO
 	tristate "VFIO Non-Privileged userspace driver framework"
 	depends on IOMMU_API
 	select VFIO_IOMMU_TYPE1 if X86
-	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
+	select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES)
 	help
 	  VFIO provides a framework for secure userspace device drivers.
 	  See Documentation/vfio.txt for more details.
-- 
cgit v0.10.2


From d8899bb2be91b3a19ebf82b138232919ffcf833a Mon Sep 17 00:00:00 2001
From: Bharat Bhushan <r65777@freescale.com>
Date: Wed, 22 May 2013 09:50:58 +0530
Subject: powerpc: Debug control and status registers are 32bit

Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 7135a25..2b5a39c 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -168,10 +168,10 @@ struct thread_struct {
 	 * The following help to manage the use of Debug Control Registers
 	 * om the BookE platforms.
 	 */
-	unsigned long	dbcr0;
-	unsigned long	dbcr1;
+	uint32_t	dbcr0;
+	uint32_t	dbcr1;
 #ifdef CONFIG_BOOKE
-	unsigned long	dbcr2;
+	uint32_t	dbcr2;
 #endif
 	/*
 	 * The stored value of the DBSR register will be the value at the
@@ -179,7 +179,7 @@ struct thread_struct {
 	 * user (will never be written to) and has value while helping to
 	 * describe the reason for the last debug trap.  Torez
 	 */
-	unsigned long	dbsr;
+	uint32_t	dbsr;
 	/*
 	 * The following will contain addresses used by debug applications
 	 * to help trace and trap on particular address locations.
-- 
cgit v0.10.2


From 13d543cd79963133cd26748547552c99df8d23a7 Mon Sep 17 00:00:00 2001
From: Bharat Bhushan <r65777@freescale.com>
Date: Wed, 22 May 2013 09:50:59 +0530
Subject: powerpc: Restore dbcr0 on user space exit

On BookE (Branch taken + Single Step) is as same as Branch Taken
on BookS and in Linux we simulate BookS behavior for BookE as well.
When doing so, in Branch taken handling we want to set DBCR0_IC but
we update the current->thread->dbcr0 and not DBCR0.

Now on 64bit the current->thread.dbcr0 (and other debug registers)
is synchronized ONLY on context switch flow. But after handling
Branch taken in debug exception if we return back to user space
without context switch then single stepping change (DBCR0_ICMP)
does not get written in h/w DBCR0 and Instruction Complete exception
does not happen.

This fixes using ptrace reliably on BookE-PowerPC

lmbench latency test (lat_syscall) Results are (they varies a little
on each run)

1) ./lat_syscall <action> /dev/shm/uImage

action:	Open	read	write	stat	fstat	null
Before:	3.8618	0.2017	0.2851	1.6789	0.2256	0.0856
After:	3.8580	0.2017	0.2851	1.6955	0.2255	0.0856

1) ./lat_syscall -P 2 -N 10 <action> /dev/shm/uImage
action:	Open	read	write	stat	fstat	null
Before:	4.1388	0.2238	0.3066	1.7106	0.2256	0.0856
After:	4.1413	0.2236	0.3062	1.7107	0.2256	0.0856

[ Slightly modified to avoid extra branch in the fast path
  on Book3S and fix build on all non-BookE 64-bit -- BenH
]

Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 6f16ffa..dc684b1 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -105,9 +105,6 @@ int main(void)
 	DEFINE(KSP_VSID, offsetof(struct thread_struct, ksp_vsid));
 #else /* CONFIG_PPC64 */
 	DEFINE(PGDIR, offsetof(struct thread_struct, pgdir));
-#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
-	DEFINE(THREAD_DBCR0, offsetof(struct thread_struct, dbcr0));
-#endif
 #ifdef CONFIG_SPE
 	DEFINE(THREAD_EVR0, offsetof(struct thread_struct, evr[0]));
 	DEFINE(THREAD_ACC, offsetof(struct thread_struct, acc));
@@ -115,6 +112,9 @@ int main(void)
 	DEFINE(THREAD_USED_SPE, offsetof(struct thread_struct, used_spe));
 #endif /* CONFIG_SPE */
 #endif /* CONFIG_PPC64 */
+#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
+	DEFINE(THREAD_DBCR0, offsetof(struct thread_struct, dbcr0));
+#endif
 #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
 	DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu));
 #endif
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 8741c85..ab15b8d 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -629,21 +629,43 @@ _GLOBAL(ret_from_except_lite)
 
 	CURRENT_THREAD_INFO(r9, r1)
 	ld	r3,_MSR(r1)
+#ifdef CONFIG_PPC_BOOK3E
+	ld	r10,PACACURRENT(r13)
+#endif /* CONFIG_PPC_BOOK3E */
 	ld	r4,TI_FLAGS(r9)
 	andi.	r3,r3,MSR_PR
 	beq	resume_kernel
+#ifdef CONFIG_PPC_BOOK3E
+	lwz	r3,(THREAD+THREAD_DBCR0)(r10)
+#endif /* CONFIG_PPC_BOOK3E */
 
 	/* Check current_thread_info()->flags */
 	andi.	r0,r4,_TIF_USER_WORK_MASK
+#ifdef CONFIG_PPC_BOOK3E
+	bne	1f
+	/*
+	 * Check to see if the dbcr0 register is set up to debug.
+	 * Use the internal debug mode bit to do this.
+	 */
+	andis.	r0,r3,DBCR0_IDM@h
 	beq	restore
-
-	andi.	r0,r4,_TIF_NEED_RESCHED
-	beq	1f
+	mfmsr	r0
+	rlwinm	r0,r0,0,~MSR_DE	/* Clear MSR.DE */
+	mtmsr	r0
+	mtspr	SPRN_DBCR0,r3
+	li	r10, -1
+	mtspr	SPRN_DBSR,r10
+	b	restore
+#else
+	beq	restore
+#endif
+1:	andi.	r0,r4,_TIF_NEED_RESCHED
+	beq	2f
 	bl	.restore_interrupts
 	SCHEDULE_USER
 	b	.ret_from_except_lite
 
-1:	bl	.save_nvgprs
+2:	bl	.save_nvgprs
 	bl	.restore_interrupts
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	.do_notify_resume
-- 
cgit v0.10.2


From a5b45ded097908d40803b5c2770259398811b24e Mon Sep 17 00:00:00 2001
From: liguang <lig.fnst@cn.fujitsu.com>
Date: Thu, 30 May 2013 14:47:53 +0800
Subject: powerpc/smp: Use '==' instead of '<' for system_state

'system_state < SYSTEM_RUNNING' will have same effect
with 'system_state == SYSTEM_BOOTING', but the later
one is more clearer.

Signed-off-by: liguang <lig.fnst@cn.fujitsu.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c
index d35dbbc..f75f6fc 100644
--- a/arch/powerpc/platforms/cell/smp.c
+++ b/arch/powerpc/platforms/cell/smp.c
@@ -142,7 +142,7 @@ static int smp_cell_cpu_bootable(unsigned int nr)
 	 * during boot if the user requests it.  Odd-numbered
 	 * cpus are assumed to be secondary threads.
 	 */
-	if (system_state < SYSTEM_RUNNING &&
+	if (system_state == SYSTEM_BOOTING &&
 	    cpu_has_feature(CPU_FTR_SMT) &&
 	    !smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
 		return 0;
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 88c9459..77784ae 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -51,7 +51,7 @@ static int pnv_smp_cpu_bootable(unsigned int nr)
 	/* Special case - we inhibit secondary thread startup
 	 * during boot if the user requests it.
 	 */
-	if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) {
+	if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) {
 		if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
 			return 0;
 		if (smt_enabled_at_boot
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index 12bc8c3..306643c 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -192,7 +192,7 @@ static int smp_pSeries_cpu_bootable(unsigned int nr)
 	/* Special case - we inhibit secondary thread startup
 	 * during boot if the user requests it.
 	 */
-	if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) {
+	if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) {
 		if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
 			return 0;
 		if (smt_enabled_at_boot
-- 
cgit v0.10.2


From 1b7e0cbe4969dd4c2945b2fb8d6881a03b4d0ac7 Mon Sep 17 00:00:00 2001
From: liguang <lig.fnst@cn.fujitsu.com>
Date: Thu, 30 May 2013 15:20:33 +0800
Subject: powerpc/pseries: Use 'true' instead of '1' for orderly_poweroff

orderly_poweroff is expecting a bool parameter, so
use 'true' instead '1'

Signed-off-by: liguang <lig.fnst@cn.fujitsu.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index c4dfccd..7b3cbde 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -83,7 +83,7 @@ static void handle_system_shutdown(char event_modifier)
 	switch (event_modifier) {
 	case EPOW_SHUTDOWN_NORMAL:
 		pr_emerg("Firmware initiated power off");
-		orderly_poweroff(1);
+		orderly_poweroff(true);
 		break;
 
 	case EPOW_SHUTDOWN_ON_UPS:
@@ -95,13 +95,13 @@ static void handle_system_shutdown(char event_modifier)
 		pr_emerg("Loss of system critical functions reported by "
 			"firmware");
 		pr_emerg("Check RTAS error log for details");
-		orderly_poweroff(1);
+		orderly_poweroff(true);
 		break;
 
 	case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH:
 		pr_emerg("Ambient temperature too high reported by firmware");
 		pr_emerg("Check RTAS error log for details");
-		orderly_poweroff(1);
+		orderly_poweroff(true);
 		break;
 
 	default:
@@ -162,7 +162,7 @@ void rtas_parse_epow_errlog(struct rtas_error_log *log)
 
 	case EPOW_SYSTEM_HALT:
 		pr_emerg("Firmware initiated power off");
-		orderly_poweroff(1);
+		orderly_poweroff(true);
 		break;
 
 	case EPOW_MAIN_ENCLOSURE:
-- 
cgit v0.10.2


From 475e68cfdde39e6d95055999b0cb42fdb2bea0ca Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Wed, 5 Jun 2013 13:02:26 +1000
Subject: powerpc: Align thread->fpr to 16 bytes

On newer CPUs we use VSX loads and stores to the thread->fpr array.
For best performance we need to ensure 16 byte alignment.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 2b5a39c..9fe1129 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -200,7 +200,7 @@ struct thread_struct {
 #endif
 #endif
 	/* FP and VSX 0-31 register set */
-	double		fpr[32][TS_FPRWIDTH];
+	double		fpr[32][TS_FPRWIDTH] __attribute__((aligned(16)));
 	struct {
 
 		unsigned int pad;
-- 
cgit v0.10.2


From 5fb621698e94e3af8b413d9439041fde48e2784d Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Wed, 5 Jun 2013 15:34:02 +0800
Subject: powerpc/eeh: Fix fetching bus for single-dev-PE

While running Linux as guest on top of phyp, we possiblly have
PE that includes single PCI device. However, we didn't return
its PCI bus correctly and it leads to failure on recovery from
EEH errors for single-dev-PE. The patch fixes the issue.

Cc: <stable@vger.kernel.org> # v3.7+
Cc: Steve Best <sbest@us.ibm.com>
Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/pseries/eeh_pe.c b/arch/powerpc/platforms/pseries/eeh_pe.c
index fe43d1a..9d4a9e8 100644
--- a/arch/powerpc/platforms/pseries/eeh_pe.c
+++ b/arch/powerpc/platforms/pseries/eeh_pe.c
@@ -639,7 +639,8 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
 
 	if (pe->type & EEH_PE_PHB) {
 		bus = pe->phb->bus;
-	} else if (pe->type & EEH_PE_BUS) {
+	} else if (pe->type & EEH_PE_BUS ||
+		   pe->type & EEH_PE_DEVICE) {
 		edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
 		pdev = eeh_dev_to_pci_dev(edev);
 		if (pdev)
-- 
cgit v0.10.2


From 2d5c121678ce2c7e12ecc174121badeea0b98fe0 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Wed, 5 Jun 2013 15:34:03 +0800
Subject: powerpc/eeh: Enhance converting EEH dev

Under some special circumstances, the EEH device doesn't have the
associated device tree node or PCI device. The patch enhances those
functions converting EEH device to device tree node or PCI device
accordingly to avoid unnecessary system crash.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index a80e32b4..e32c3c5 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -95,12 +95,12 @@ struct eeh_dev {
 
 static inline struct device_node *eeh_dev_to_of_node(struct eeh_dev *edev)
 {
-	return edev->dn;
+	return edev ? edev->dn : NULL;
 }
 
 static inline struct pci_dev *eeh_dev_to_pci_dev(struct eeh_dev *edev)
 {
-	return edev->pdev;
+	return edev ? edev->pdev : NULL;
 }
 
 /*
-- 
cgit v0.10.2


From 1bf247f8df2e37b53d0415015e56910677626108 Mon Sep 17 00:00:00 2001
From: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Date: Thu, 6 Jun 2013 00:20:55 +0530
Subject: powerpc/pseries: Remove syslog prefix in uncompressed oops text

Removal of syslog prefix in the uncompressed oops text will
help in capturing more oops data.

Signed-off-by: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Reviewed-by: Jim Keniston <jkenisto@us.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 8733a86..e54a8b7 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -619,7 +619,7 @@ static void oops_to_nvram(struct kmsg_dumper *dumper,
 	}
 	if (rc != 0) {
 		kmsg_dump_rewind(dumper);
-		kmsg_dump_get_buffer(dumper, true,
+		kmsg_dump_get_buffer(dumper, false,
 				     oops_data, oops_data_sz, &text_len);
 		err_type = ERR_TYPE_KERNEL_PANIC;
 		*oops_len = (u16) text_len;
-- 
cgit v0.10.2


From b1f70e1f72179f7afe02f4131ed15da406f93e0d Mon Sep 17 00:00:00 2001
From: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Date: Thu, 6 Jun 2013 00:21:05 +0530
Subject: powerpc/pseries: Add version and timestamp to oops header

Introduce version and timestamp information in the oops header.
oops_log_info (oops header) holds version (to distinguish between old
and new format oops header), length of the oops text
(compressed or uncompressed) and timestamp.

The version field will sit in the same place as the length in old
headers. version is assigned 5000 (greater than oops partition size)
so that existing tools will refuse to dump new style partitions as
the length is too large. The updated tools will work with both
old and new format headers.

Signed-off-by: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Reviewed-by: Jim Keniston <jkenisto@us.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index e54a8b7..742735a 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -29,6 +29,13 @@
 /* Max bytes to read/write in one go */
 #define NVRW_CNT 0x20
 
+/*
+ * Set oops header version to distingush between old and new format header.
+ * lnx,oops-log partition max size is 4000, header version > 4000 will
+ * help in identifying new header.
+ */
+#define OOPS_HDR_VERSION 5000
+
 static unsigned int nvram_size;
 static int nvram_fetch, nvram_store;
 static char nvram_buf[NVRW_CNT];	/* assume this is in the first 4GB */
@@ -67,6 +74,12 @@ static const char *pseries_nvram_os_partitions[] = {
 	NULL
 };
 
+struct oops_log_info {
+	u16 version;
+	u16 report_length;
+	u64 timestamp;
+} __attribute__((packed));
+
 static void oops_to_nvram(struct kmsg_dumper *dumper,
 			  enum kmsg_dump_reason reason);
 
@@ -83,28 +96,28 @@ static unsigned long last_unread_rtas_event;	/* timestamp */
 
  * big_oops_buf[] holds the uncompressed text we're capturing.
  *
- * oops_buf[] holds the compressed text, preceded by a prefix.
- * The prefix is just a u16 holding the length of the compressed* text.
- * (*Or uncompressed, if compression fails.)  oops_buf[] gets written
- * to NVRAM.
+ * oops_buf[] holds the compressed text, preceded by a oops header.
+ * oops header has u16 holding the version of oops header (to differentiate
+ * between old and new format header) followed by u16 holding the length of
+ * the compressed* text (*Or uncompressed, if compression fails.) and u64
+ * holding the timestamp. oops_buf[] gets written to NVRAM.
  *
- * oops_len points to the prefix.  oops_data points to the compressed text.
+ * oops_log_info points to the header. oops_data points to the compressed text.
  *
  * +- oops_buf
- * |		+- oops_data
- * v		v
- * +------------+-----------------------------------------------+
- * | length	| text                                          |
- * | (2 bytes)	| (oops_data_sz bytes)                          |
- * +------------+-----------------------------------------------+
+ * |                                   +- oops_data
+ * v                                   v
+ * +-----------+-----------+-----------+------------------------+
+ * | version   | length    | timestamp | text                   |
+ * | (2 bytes) | (2 bytes) | (8 bytes) | (oops_data_sz bytes)   |
+ * +-----------+-----------+-----------+------------------------+
  * ^
- * +- oops_len
+ * +- oops_log_info
  *
  * We preallocate these buffers during init to avoid kmalloc during oops/panic.
  */
 static size_t big_oops_buf_sz;
 static char *big_oops_buf, *oops_buf;
-static u16 *oops_len;
 static char *oops_data;
 static size_t oops_data_sz;
 
@@ -425,9 +438,8 @@ static void __init nvram_init_oops_partition(int rtas_partition_exists)
 						oops_log_partition.name);
 		return;
 	}
-	oops_len = (u16*) oops_buf;
-	oops_data = oops_buf + sizeof(u16);
-	oops_data_sz = oops_log_partition.size - sizeof(u16);
+	oops_data = oops_buf + sizeof(struct oops_log_info);
+	oops_data_sz = oops_log_partition.size - sizeof(struct oops_log_info);
 
 	/*
 	 * Figure compression (preceded by elimination of each line's <n>
@@ -555,6 +567,7 @@ error:
 /* Compress the text from big_oops_buf into oops_buf. */
 static int zip_oops(size_t text_len)
 {
+	struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf;
 	int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len,
 								oops_data_sz);
 	if (zipped_len < 0) {
@@ -562,7 +575,9 @@ static int zip_oops(size_t text_len)
 		pr_err("nvram: logging uncompressed oops/panic report\n");
 		return -1;
 	}
-	*oops_len = (u16) zipped_len;
+	oops_hdr->version = OOPS_HDR_VERSION;
+	oops_hdr->report_length = (u16) zipped_len;
+	oops_hdr->timestamp = get_seconds();
 	return 0;
 }
 
@@ -576,6 +591,7 @@ static int zip_oops(size_t text_len)
 static void oops_to_nvram(struct kmsg_dumper *dumper,
 			  enum kmsg_dump_reason reason)
 {
+	struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf;
 	static unsigned int oops_count = 0;
 	static bool panicking = false;
 	static DEFINE_SPINLOCK(lock);
@@ -622,11 +638,14 @@ static void oops_to_nvram(struct kmsg_dumper *dumper,
 		kmsg_dump_get_buffer(dumper, false,
 				     oops_data, oops_data_sz, &text_len);
 		err_type = ERR_TYPE_KERNEL_PANIC;
-		*oops_len = (u16) text_len;
+		oops_hdr->version = OOPS_HDR_VERSION;
+		oops_hdr->report_length = (u16) text_len;
+		oops_hdr->timestamp = get_seconds();
 	}
 
 	(void) nvram_write_os_partition(&oops_log_partition, oops_buf,
-		(int) (sizeof(*oops_len) + *oops_len), err_type, ++oops_count);
+		(int) (sizeof(*oops_hdr) + oops_hdr->report_length), err_type,
+		++oops_count);
 
 	spin_unlock_irqrestore(&lock, flags);
 }
-- 
cgit v0.10.2


From 126746101e89991f179209ef58ab5937bc5ea4a3 Mon Sep 17 00:00:00 2001
From: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Date: Thu, 6 Jun 2013 00:21:16 +0530
Subject: powerpc/pseries: Introduce generic read function to read
 nvram-partitions

Introduce generic read function to read nvram partitions other than rtas.
nvram_read_error_log will be retained which is used to read rtas partition
from rtasd. nvram_read_partition is the generic read function to read from
any nvram partition.

Signed-off-by: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Reviewed-by: Jim Keniston <jkenisto@us.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 742735a..088f023 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -293,34 +293,35 @@ int nvram_write_error_log(char * buff, int length,
 	return rc;
 }
 
-/* nvram_read_error_log
+/* nvram_read_partition
  *
- * Reads nvram for error log for at most 'length'
+ * Reads nvram partition for at most 'length'
  */
-int nvram_read_error_log(char * buff, int length,
-                         unsigned int * err_type, unsigned int * error_log_cnt)
+int nvram_read_partition(struct nvram_os_partition *part, char *buff,
+			int length, unsigned int *err_type,
+			unsigned int *error_log_cnt)
 {
 	int rc;
 	loff_t tmp_index;
 	struct err_log_info info;
 	
-	if (rtas_log_partition.index == -1)
+	if (part->index == -1)
 		return -1;
 
-	if (length > rtas_log_partition.size)
-		length = rtas_log_partition.size;
+	if (length > part->size)
+		length = part->size;
 
-	tmp_index = rtas_log_partition.index;
+	tmp_index = part->index;
 
 	rc = ppc_md.nvram_read((char *)&info, sizeof(struct err_log_info), &tmp_index);
 	if (rc <= 0) {
-		printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc);
+		pr_err("%s: Failed nvram_read (%d)\n", __FUNCTION__, rc);
 		return rc;
 	}
 
 	rc = ppc_md.nvram_read(buff, length, &tmp_index);
 	if (rc <= 0) {
-		printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc);
+		pr_err("%s: Failed nvram_read (%d)\n", __FUNCTION__, rc);
 		return rc;
 	}
 
@@ -330,6 +331,17 @@ int nvram_read_error_log(char * buff, int length,
 	return 0;
 }
 
+/* nvram_read_error_log
+ *
+ * Reads nvram for error log for at most 'length'
+ */
+int nvram_read_error_log(char *buff, int length,
+			unsigned int *err_type, unsigned int *error_log_cnt)
+{
+	return nvram_read_partition(&rtas_log_partition, buff, length,
+						err_type, error_log_cnt);
+}
+
 /* This doesn't actually zero anything, but it sets the event_logged
  * word to tell that this event is safely in syslog.
  */
-- 
cgit v0.10.2


From d7563c94f728d8c9228cfddbb044c843b2e243e8 Mon Sep 17 00:00:00 2001
From: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Date: Thu, 6 Jun 2013 00:21:32 +0530
Subject: powerpc/pseries: Read/Write oops nvram partition via pstore

IBM's p series machines provide persistent storage for LPARs through NVRAM.
NVRAM's lnx,oops-log partition is used to log oops messages.
Currently the kernel provides the contents of p-series NVRAM only as a
simple stream of bytes via /dev/nvram, which must be interpreted in user
space by the nvram command in the powerpc-utils package.

This patch set exploits the pstore subsystem to expose oops partition in
NVRAM as a separate file in /dev/pstore. For instance, Oops messages will be
stored in a file named [dmesg-nvram-2]. In case pstore registration fails it
will fall back to kmsg_dump mechanism.

This patch will read/write the oops messages from/to this partition via pstore.

Signed-off-by: Jim Keniston <jkenisto@us.ibm.com>
Signed-off-by: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 088f023..9edec8e 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -18,6 +18,7 @@
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/kmsg_dump.h>
+#include <linux/pstore.h>
 #include <linux/ctype.h>
 #include <linux/zlib.h>
 #include <asm/uaccess.h>
@@ -127,6 +128,14 @@ static size_t oops_data_sz;
 #define MEM_LEVEL 4
 static struct z_stream_s stream;
 
+#ifdef CONFIG_PSTORE
+static enum pstore_type_id nvram_type_ids[] = {
+	PSTORE_TYPE_DMESG,
+	-1
+};
+static int read_type;
+#endif
+
 static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index)
 {
 	unsigned int i;
@@ -430,6 +439,149 @@ static int __init pseries_nvram_init_os_partition(struct nvram_os_partition
 	return 0;
 }
 
+/*
+ * Are we using the ibm,rtas-log for oops/panic reports?  And if so,
+ * would logging this oops/panic overwrite an RTAS event that rtas_errd
+ * hasn't had a chance to read and process?  Return 1 if so, else 0.
+ *
+ * We assume that if rtas_errd hasn't read the RTAS event in
+ * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to.
+ */
+static int clobbering_unread_rtas_event(void)
+{
+	return (oops_log_partition.index == rtas_log_partition.index
+		&& last_unread_rtas_event
+		&& get_seconds() - last_unread_rtas_event <=
+						NVRAM_RTAS_READ_TIMEOUT);
+}
+
+#ifdef CONFIG_PSTORE
+static int nvram_pstore_open(struct pstore_info *psi)
+{
+	/* Reset the iterator to start reading partitions again */
+	read_type = -1;
+	return 0;
+}
+
+/**
+ * nvram_pstore_write - pstore write callback for nvram
+ * @type:               Type of message logged
+ * @reason:             reason behind dump (oops/panic)
+ * @id:                 identifier to indicate the write performed
+ * @part:               pstore writes data to registered buffer in parts,
+ *                      part number will indicate the same.
+ * @count:              Indicates oops count
+ * @size:               number of bytes written to the registered buffer
+ * @psi:                registered pstore_info structure
+ *
+ * Called by pstore_dump() when an oops or panic report is logged in the
+ * printk buffer.
+ * Returns 0 on successful write.
+ */
+static int nvram_pstore_write(enum pstore_type_id type,
+				enum kmsg_dump_reason reason,
+				u64 *id, unsigned int part, int count,
+				size_t size, struct pstore_info *psi)
+{
+	int rc;
+	struct oops_log_info *oops_hdr = (struct oops_log_info *) oops_buf;
+
+	/* part 1 has the recent messages from printk buffer */
+	if (part > 1 || type != PSTORE_TYPE_DMESG ||
+				clobbering_unread_rtas_event())
+		return -1;
+
+	oops_hdr->version = OOPS_HDR_VERSION;
+	oops_hdr->report_length = (u16) size;
+	oops_hdr->timestamp = get_seconds();
+	rc = nvram_write_os_partition(&oops_log_partition, oops_buf,
+		(int) (sizeof(*oops_hdr) + size), ERR_TYPE_KERNEL_PANIC,
+		count);
+
+	if (rc != 0)
+		return rc;
+
+	*id = part;
+	return 0;
+}
+
+/*
+ * Reads the oops/panic report.
+ * Returns the length of the data we read from each partition.
+ * Returns 0 if we've been called before.
+ */
+static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type,
+				int *count, struct timespec *time, char **buf,
+				struct pstore_info *psi)
+{
+	struct oops_log_info *oops_hdr;
+	unsigned int err_type, id_no;
+	struct nvram_os_partition *part = NULL;
+	char *buff = NULL;
+
+	read_type++;
+
+	switch (nvram_type_ids[read_type]) {
+	case PSTORE_TYPE_DMESG:
+		part = &oops_log_partition;
+		*type = PSTORE_TYPE_DMESG;
+		break;
+	default:
+		return 0;
+	}
+
+	buff = kmalloc(part->size, GFP_KERNEL);
+
+	if (!buff)
+		return -ENOMEM;
+
+	if (nvram_read_partition(part, buff, part->size, &err_type, &id_no)) {
+		kfree(buff);
+		return 0;
+	}
+
+	*count = 0;
+	*id = id_no;
+	oops_hdr = (struct oops_log_info *)buff;
+	*buf = buff + sizeof(*oops_hdr);
+	time->tv_sec = oops_hdr->timestamp;
+	time->tv_nsec = 0;
+	return oops_hdr->report_length;
+}
+
+static struct pstore_info nvram_pstore_info = {
+	.owner = THIS_MODULE,
+	.name = "nvram",
+	.open = nvram_pstore_open,
+	.read = nvram_pstore_read,
+	.write = nvram_pstore_write,
+};
+
+static int nvram_pstore_init(void)
+{
+	int rc = 0;
+
+	nvram_pstore_info.buf = oops_data;
+	nvram_pstore_info.bufsize = oops_data_sz;
+
+	rc = pstore_register(&nvram_pstore_info);
+	if (rc != 0)
+		pr_err("nvram: pstore_register() failed, defaults to "
+				"kmsg_dump; returned %d\n", rc);
+	else
+		/*TODO: Support compression when pstore is configured */
+		pr_info("nvram: Compression of oops text supported only when "
+				"pstore is not configured");
+
+	return rc;
+}
+#else
+static int nvram_pstore_init(void)
+{
+	return -1;
+}
+#endif
+
 static void __init nvram_init_oops_partition(int rtas_partition_exists)
 {
 	int rc;
@@ -453,6 +605,11 @@ static void __init nvram_init_oops_partition(int rtas_partition_exists)
 	oops_data = oops_buf + sizeof(struct oops_log_info);
 	oops_data_sz = oops_log_partition.size - sizeof(struct oops_log_info);
 
+	rc = nvram_pstore_init();
+
+	if (!rc)
+		return;
+
 	/*
 	 * Figure compression (preceded by elimination of each line's <n>
 	 * severity prefix) will reduce the oops/panic report to at most
@@ -525,21 +682,6 @@ int __init pSeries_nvram_init(void)
 	return 0;
 }
 
-/*
- * Are we using the ibm,rtas-log for oops/panic reports?  And if so,
- * would logging this oops/panic overwrite an RTAS event that rtas_errd
- * hasn't had a chance to read and process?  Return 1 if so, else 0.
- *
- * We assume that if rtas_errd hasn't read the RTAS event in
- * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to.
- */
-static int clobbering_unread_rtas_event(void)
-{
-	return (oops_log_partition.index == rtas_log_partition.index
-		&& last_unread_rtas_event
-		&& get_seconds() - last_unread_rtas_event <=
-						NVRAM_RTAS_READ_TIMEOUT);
-}
 
 /* Derived from logfs_compress() */
 static int nvram_compress(const void *in, void *out, size_t inlen,
-- 
cgit v0.10.2


From 69020eea973d95766e905ee0ce7773e0027377a3 Mon Sep 17 00:00:00 2001
From: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Date: Thu, 6 Jun 2013 00:21:44 +0530
Subject: powerpc/pseries: Read rtas partition via pstore

This patch set exploits the pstore subsystem to read details of rtas partition
in NVRAM to a separate file in /dev/pstore. For instance, rtas details will be
stored in a file named [rtas-nvram-4].

Signed-off-by: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Reviewed-by: Jim Keniston <jkenisto@us.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 9edec8e..78d72f0 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -131,9 +131,11 @@ static struct z_stream_s stream;
 #ifdef CONFIG_PSTORE
 static enum pstore_type_id nvram_type_ids[] = {
 	PSTORE_TYPE_DMESG,
+	PSTORE_TYPE_PPC_RTAS,
 	-1
 };
 static int read_type;
+static unsigned long last_rtas_event;
 #endif
 
 static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index)
@@ -297,8 +299,13 @@ int nvram_write_error_log(char * buff, int length,
 {
 	int rc = nvram_write_os_partition(&rtas_log_partition, buff, length,
 						err_type, error_log_cnt);
-	if (!rc)
+	if (!rc) {
 		last_unread_rtas_event = get_seconds();
+#ifdef CONFIG_PSTORE
+		last_rtas_event = get_seconds();
+#endif
+	}
+
 	return rc;
 }
 
@@ -506,7 +513,7 @@ static int nvram_pstore_write(enum pstore_type_id type,
 }
 
 /*
- * Reads the oops/panic report.
+ * Reads the oops/panic report and ibm,rtas-log partition.
  * Returns the length of the data we read from each partition.
  * Returns 0 if we've been called before.
  */
@@ -526,6 +533,12 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type,
 		part = &oops_log_partition;
 		*type = PSTORE_TYPE_DMESG;
 		break;
+	case PSTORE_TYPE_PPC_RTAS:
+		part = &rtas_log_partition;
+		*type = PSTORE_TYPE_PPC_RTAS;
+		time->tv_sec = last_rtas_event;
+		time->tv_nsec = 0;
+		break;
 	default:
 		return 0;
 	}
@@ -542,11 +555,17 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type,
 
 	*count = 0;
 	*id = id_no;
-	oops_hdr = (struct oops_log_info *)buff;
-	*buf = buff + sizeof(*oops_hdr);
-	time->tv_sec = oops_hdr->timestamp;
-	time->tv_nsec = 0;
-	return oops_hdr->report_length;
+
+	if (nvram_type_ids[read_type] == PSTORE_TYPE_DMESG) {
+		oops_hdr = (struct oops_log_info *)buff;
+		*buf = buff + sizeof(*oops_hdr);
+		time->tv_sec = oops_hdr->timestamp;
+		time->tv_nsec = 0;
+		return oops_hdr->report_length;
+	}
+
+	*buf = buff;
+	return part->size;
 }
 
 static struct pstore_info nvram_pstore_info = {
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index e4bcb2c..ec24f9c 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -324,6 +324,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
 	case PSTORE_TYPE_MCE:
 		sprintf(name, "mce-%s-%lld", psname, id);
 		break;
+	case PSTORE_TYPE_PPC_RTAS:
+		sprintf(name, "rtas-%s-%lld", psname, id);
+		break;
 	case PSTORE_TYPE_UNKNOWN:
 		sprintf(name, "unknown-%s-%lld", psname, id);
 		break;
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index 75d0176..d7a8fe9 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -35,6 +35,8 @@ enum pstore_type_id {
 	PSTORE_TYPE_MCE		= 1,
 	PSTORE_TYPE_CONSOLE	= 2,
 	PSTORE_TYPE_FTRACE	= 3,
+	/* PPC64 partition types */
+	PSTORE_TYPE_PPC_RTAS	= 4,
 	PSTORE_TYPE_UNKNOWN	= 255
 };
 
-- 
cgit v0.10.2


From edf38465a32c2820350da45a2231d2c53ad2d3c0 Mon Sep 17 00:00:00 2001
From: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Date: Thu, 6 Jun 2013 00:21:59 +0530
Subject: powerpc/pseries: Distinguish between a os-partition and non-os
 partition

Introduce os_partition member in nvram_os_partition structure to identify
if the partition is an os partition or not. This will be useful to handle
non-os partitions of-config and common.

Signed-off-by: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Reviewed-by: Jim Keniston <jkenisto@us.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 78d72f0..714ed8a 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -53,20 +53,23 @@ struct nvram_os_partition {
 	int min_size;	/* minimum acceptable size (0 means req_size) */
 	long size;	/* size of data portion (excluding err_log_info) */
 	long index;	/* offset of data portion of partition */
+	bool os_partition; /* partition initialized by OS, not FW */
 };
 
 static struct nvram_os_partition rtas_log_partition = {
 	.name = "ibm,rtas-log",
 	.req_size = 2079,
 	.min_size = 1055,
-	.index = -1
+	.index = -1,
+	.os_partition = true
 };
 
 static struct nvram_os_partition oops_log_partition = {
 	.name = "lnx,oops-log",
 	.req_size = 4000,
 	.min_size = 2000,
-	.index = -1
+	.index = -1,
+	.os_partition = true
 };
 
 static const char *pseries_nvram_os_partitions[] = {
-- 
cgit v0.10.2


From f33f748c964f6a6ee272b1c794b52f54f4da1d04 Mon Sep 17 00:00:00 2001
From: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Date: Thu, 6 Jun 2013 00:22:10 +0530
Subject: powerpc/pseries: Read of-config partition via pstore

This patch set exploits the pstore subsystem to read details of
of-config partition in NVRAM to a separate file in /dev/pstore.
For instance, of-config partition details will be stored in a
file named [of-nvram-5].

Signed-off-by: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Reviewed-by: Jim Keniston <jkenisto@us.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 714ed8a..f7392f6 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -132,9 +132,16 @@ static size_t oops_data_sz;
 static struct z_stream_s stream;
 
 #ifdef CONFIG_PSTORE
+static struct nvram_os_partition of_config_partition = {
+	.name = "of-config",
+	.index = -1,
+	.os_partition = false
+};
+
 static enum pstore_type_id nvram_type_ids[] = {
 	PSTORE_TYPE_DMESG,
 	PSTORE_TYPE_PPC_RTAS,
+	PSTORE_TYPE_PPC_OF,
 	-1
 };
 static int read_type;
@@ -332,10 +339,15 @@ int nvram_read_partition(struct nvram_os_partition *part, char *buff,
 
 	tmp_index = part->index;
 
-	rc = ppc_md.nvram_read((char *)&info, sizeof(struct err_log_info), &tmp_index);
-	if (rc <= 0) {
-		pr_err("%s: Failed nvram_read (%d)\n", __FUNCTION__, rc);
-		return rc;
+	if (part->os_partition) {
+		rc = ppc_md.nvram_read((char *)&info,
+					sizeof(struct err_log_info),
+					&tmp_index);
+		if (rc <= 0) {
+			pr_err("%s: Failed nvram_read (%d)\n", __FUNCTION__,
+									rc);
+			return rc;
+		}
 	}
 
 	rc = ppc_md.nvram_read(buff, length, &tmp_index);
@@ -344,8 +356,10 @@ int nvram_read_partition(struct nvram_os_partition *part, char *buff,
 		return rc;
 	}
 
-	*error_log_cnt = info.seq_num;
-	*err_type = info.error_type;
+	if (part->os_partition) {
+		*error_log_cnt = info.seq_num;
+		*err_type = info.error_type;
+	}
 
 	return 0;
 }
@@ -516,7 +530,7 @@ static int nvram_pstore_write(enum pstore_type_id type,
 }
 
 /*
- * Reads the oops/panic report and ibm,rtas-log partition.
+ * Reads the oops/panic report, rtas and of-config partition.
  * Returns the length of the data we read from each partition.
  * Returns 0 if we've been called before.
  */
@@ -525,9 +539,11 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type,
 				struct pstore_info *psi)
 {
 	struct oops_log_info *oops_hdr;
-	unsigned int err_type, id_no;
+	unsigned int err_type, id_no, size = 0;
 	struct nvram_os_partition *part = NULL;
 	char *buff = NULL;
+	int sig = 0;
+	loff_t p;
 
 	read_type++;
 
@@ -542,10 +558,29 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type,
 		time->tv_sec = last_rtas_event;
 		time->tv_nsec = 0;
 		break;
+	case PSTORE_TYPE_PPC_OF:
+		sig = NVRAM_SIG_OF;
+		part = &of_config_partition;
+		*type = PSTORE_TYPE_PPC_OF;
+		*id = PSTORE_TYPE_PPC_OF;
+		time->tv_sec = 0;
+		time->tv_nsec = 0;
+		break;
 	default:
 		return 0;
 	}
 
+	if (!part->os_partition) {
+		p = nvram_find_partition(part->name, sig, &size);
+		if (p <= 0) {
+			pr_err("nvram: Failed to find partition %s, "
+				"err %d\n", part->name, (int)p);
+			return 0;
+		}
+		part->index = p;
+		part->size = size;
+	}
+
 	buff = kmalloc(part->size, GFP_KERNEL);
 
 	if (!buff)
@@ -557,7 +592,9 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type,
 	}
 
 	*count = 0;
-	*id = id_no;
+
+	if (part->os_partition)
+		*id = id_no;
 
 	if (nvram_type_ids[read_type] == PSTORE_TYPE_DMESG) {
 		oops_hdr = (struct oops_log_info *)buff;
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index ec24f9c..73148ae 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -327,6 +327,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
 	case PSTORE_TYPE_PPC_RTAS:
 		sprintf(name, "rtas-%s-%lld", psname, id);
 		break;
+	case PSTORE_TYPE_PPC_OF:
+		sprintf(name, "powerpc-ofw-%s-%lld", psname, id);
+		break;
 	case PSTORE_TYPE_UNKNOWN:
 		sprintf(name, "unknown-%s-%lld", psname, id);
 		break;
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index d7a8fe9..615dc18 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -37,6 +37,7 @@ enum pstore_type_id {
 	PSTORE_TYPE_FTRACE	= 3,
 	/* PPC64 partition types */
 	PSTORE_TYPE_PPC_RTAS	= 4,
+	PSTORE_TYPE_PPC_OF	= 5,
 	PSTORE_TYPE_UNKNOWN	= 255
 };
 
-- 
cgit v0.10.2


From a5e4797b0f46819a74a7233825137ed5d2f51b51 Mon Sep 17 00:00:00 2001
From: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Date: Thu, 6 Jun 2013 00:22:20 +0530
Subject: powerpc/pseries: Read common partition via pstore

This patch exploits pstore subsystem to read details of common partition
in NVRAM to a separate file in /dev/pstore. For instance, common partition
details will be stored in a file named [common-nvram-6].

Signed-off-by: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Reviewed-by: Jim Keniston <jkenisto@us.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index f7392f6..14cc486 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -138,10 +138,17 @@ static struct nvram_os_partition of_config_partition = {
 	.os_partition = false
 };
 
+static struct nvram_os_partition common_partition = {
+	.name = "common",
+	.index = -1,
+	.os_partition = false
+};
+
 static enum pstore_type_id nvram_type_ids[] = {
 	PSTORE_TYPE_DMESG,
 	PSTORE_TYPE_PPC_RTAS,
 	PSTORE_TYPE_PPC_OF,
+	PSTORE_TYPE_PPC_COMMON,
 	-1
 };
 static int read_type;
@@ -530,7 +537,7 @@ static int nvram_pstore_write(enum pstore_type_id type,
 }
 
 /*
- * Reads the oops/panic report, rtas and of-config partition.
+ * Reads the oops/panic report, rtas, of-config and common partition.
  * Returns the length of the data we read from each partition.
  * Returns 0 if we've been called before.
  */
@@ -566,6 +573,14 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type,
 		time->tv_sec = 0;
 		time->tv_nsec = 0;
 		break;
+	case PSTORE_TYPE_PPC_COMMON:
+		sig = NVRAM_SIG_SYS;
+		part = &common_partition;
+		*type = PSTORE_TYPE_PPC_COMMON;
+		*id = PSTORE_TYPE_PPC_COMMON;
+		time->tv_sec = 0;
+		time->tv_nsec = 0;
+		break;
 	default:
 		return 0;
 	}
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 73148ae..08c3d76 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -330,6 +330,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
 	case PSTORE_TYPE_PPC_OF:
 		sprintf(name, "powerpc-ofw-%s-%lld", psname, id);
 		break;
+	case PSTORE_TYPE_PPC_COMMON:
+		sprintf(name, "powerpc-common-%s-%lld", psname, id);
+		break;
 	case PSTORE_TYPE_UNKNOWN:
 		sprintf(name, "unknown-%s-%lld", psname, id);
 		break;
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index 615dc18..656699f 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -38,6 +38,7 @@ enum pstore_type_id {
 	/* PPC64 partition types */
 	PSTORE_TYPE_PPC_RTAS	= 4,
 	PSTORE_TYPE_PPC_OF	= 5,
+	PSTORE_TYPE_PPC_COMMON	= 6,
 	PSTORE_TYPE_UNKNOWN	= 255
 };
 
-- 
cgit v0.10.2


From 04ae9001719c3f0012d239a7c5aa4136f6b6541d Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sun, 9 Jun 2013 17:00:42 +1000
Subject: powerpc/math-emu: Fix decoding of some instructions

The decoding of some instructions such as fsqrt{s} was incorrect,
using the wrong registers, and thus could not work.

This fixes it and also adds a couple of place holders for missing
instructions.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/math-emu/Makefile b/arch/powerpc/math-emu/Makefile
index 7d1dba0..8d035d2 100644
--- a/arch/powerpc/math-emu/Makefile
+++ b/arch/powerpc/math-emu/Makefile
@@ -4,7 +4,8 @@ obj-$(CONFIG_MATH_EMULATION)	+= fabs.o fadd.o fadds.o fcmpo.o fcmpu.o \
 					fmadd.o fmadds.o fmsub.o fmsubs.o \
 					fmul.o fmuls.o fnabs.o fneg.o \
 					fnmadd.o fnmadds.o fnmsub.o fnmsubs.o \
-					fres.o frsp.o frsqrte.o fsel.o lfs.o \
+					fres.o fre.o frsp.o fsel.o lfs.o \
+					frsqrte.o frsqrtes.o \
 					fsqrt.o	fsqrts.o fsub.o fsubs.o \
 					mcrfs.o mffs.o mtfsb0.o mtfsb1.o \
 					mtfsf.o mtfsfi.o stfiwx.o stfs.o \
diff --git a/arch/powerpc/math-emu/fre.c b/arch/powerpc/math-emu/fre.c
new file mode 100644
index 0000000..49ccf2c
--- /dev/null
+++ b/arch/powerpc/math-emu/fre.c
@@ -0,0 +1,11 @@
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <asm/uaccess.h>
+
+int fre(void *frD, void *frB)
+{
+#ifdef DEBUG
+	printk("%s: %p %p\n", __func__, frD, frB);
+#endif
+	return -ENOSYS;
+}
diff --git a/arch/powerpc/math-emu/frsqrtes.c b/arch/powerpc/math-emu/frsqrtes.c
new file mode 100644
index 0000000..7e838e3
--- /dev/null
+++ b/arch/powerpc/math-emu/frsqrtes.c
@@ -0,0 +1,11 @@
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <asm/uaccess.h>
+
+int frsqrtes(void *frD, void *frB)
+{
+#ifdef DEBUG
+	printk("%s: %p %p\n", __func__, frD, frB);
+#endif
+	return 0;
+}
diff --git a/arch/powerpc/math-emu/math.c b/arch/powerpc/math-emu/math.c
index 164d559..0328e66 100644
--- a/arch/powerpc/math-emu/math.c
+++ b/arch/powerpc/math-emu/math.c
@@ -58,8 +58,10 @@ FLOATFUNC(fnabs);
 FLOATFUNC(fneg);
 
 /* Optional */
+FLOATFUNC(fre);
 FLOATFUNC(fres);
 FLOATFUNC(frsqrte);
+FLOATFUNC(frsqrtes);
 FLOATFUNC(fsel);
 FLOATFUNC(fsqrt);
 FLOATFUNC(fsqrts);
@@ -97,6 +99,7 @@ FLOATFUNC(fsqrts);
 #define FSQRTS		0x016		/*   22 */
 #define FRES		0x018		/*   24 */
 #define FMULS		0x019		/*   25 */
+#define FRSQRTES	0x01a		/*   26 */
 #define FMSUBS		0x01c		/*   28 */
 #define FMADDS		0x01d		/*   29 */
 #define FNMSUBS		0x01e		/*   30 */
@@ -109,6 +112,7 @@ FLOATFUNC(fsqrts);
 #define FADD		0x015		/*   21 */
 #define FSQRT		0x016		/*   22 */
 #define FSEL		0x017		/*   23 */
+#define FRE		0x018		/*   24 */
 #define FMUL		0x019		/*   25 */
 #define FRSQRTE		0x01a		/*   26 */
 #define FMSUB		0x01c		/*   28 */
@@ -299,9 +303,10 @@ do_mathemu(struct pt_regs *regs)
 		case FDIVS:	func = fdivs;	type = AB;	break;
 		case FSUBS:	func = fsubs;	type = AB;	break;
 		case FADDS:	func = fadds;	type = AB;	break;
-		case FSQRTS:	func = fsqrts;	type = AB;	break;
-		case FRES:	func = fres;	type = AB;	break;
+		case FSQRTS:	func = fsqrts;	type = XB;	break;
+		case FRES:	func = fres;	type = XB;	break;
 		case FMULS:	func = fmuls;	type = AC;	break;
+		case FRSQRTES:	func = frsqrtes;type = XB;	break;
 		case FMSUBS:	func = fmsubs;	type = ABC;	break;
 		case FMADDS:	func = fmadds;	type = ABC;	break;
 		case FNMSUBS:	func = fnmsubs;	type = ABC;	break;
@@ -317,10 +322,11 @@ do_mathemu(struct pt_regs *regs)
 			case FDIV:	func = fdiv;	type = AB;	break;
 			case FSUB:	func = fsub;	type = AB;	break;
 			case FADD:	func = fadd;	type = AB;	break;
-			case FSQRT:	func = fsqrt;	type = AB;	break;
+			case FSQRT:	func = fsqrt;	type = XB;	break;
+			case FRE:	func = fre;	type = XB;	break;
 			case FSEL:	func = fsel;	type = ABC;	break;
 			case FMUL:	func = fmul;	type = AC;	break;
-			case FRSQRTE:	func = frsqrte;	type = AB;	break;
+			case FRSQRTE:	func = frsqrte;	type = XB;	break;
 			case FMSUB:	func = fmsub;	type = ABC;	break;
 			case FMADD:	func = fmadd;	type = ABC;	break;
 			case FNMSUB:	func = fnmsub;	type = ABC;	break;
-- 
cgit v0.10.2


From 4e63f8edfe4d6f20b1af176efc022c2b2f5e7aeb Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sun, 9 Jun 2013 17:01:24 +1000
Subject: powerpc/math-emu: Allow math-emu to be used for HW FPU

(Including 64-bit ones)

This allow SW emulation by the kernel of optional instructions
such as fsqrt which aren't implemented on some processors, and
thus fixes some Fedora 19 issues such as Anaconda since the
compiler is set to generate those by default on 64-bit.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index c33e3ad..e3009ab 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -298,7 +298,7 @@ config HUGETLB_PAGE_SIZE_VARIABLE
 
 config MATH_EMULATION
 	bool "Math emulation"
-	depends on 4xx || 8xx || E200 || PPC_MPC832x || E500
+	depends on 4xx || 8xx || PPC_MPC832x || BOOKE
 	---help---
 	  Some PowerPC chips designed for embedded applications do not have
 	  a floating-point unit and therefore do not implement the
@@ -307,6 +307,10 @@ config MATH_EMULATION
 	  unit, which will allow programs that use floating-point
 	  instructions to run.
 
+	  This is also useful to emulate missing (optional) instructions
+	  such as fsqrt on cores that do have an FPU but do not implement
+	  them (such as Freescale BookE).
+
 config PPC_TRANSACTIONAL_MEM
        bool "Transactional Memory support for POWERPC"
        depends on PPC_BOOK3S_64
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index f18c79c..f4b5687 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1125,7 +1125,17 @@ void __kprobes program_check_exception(struct pt_regs *regs)
 	 * ESR_DST (!?) or 0.  In the process of chasing this with the
 	 * hardware people - not sure if it can happen on any illegal
 	 * instruction or only on FP instructions, whether there is a
-	 * pattern to occurrences etc. -dgibson 31/Mar/2003 */
+	 * pattern to occurrences etc. -dgibson 31/Mar/2003
+	 */
+
+	/*
+	 * If we support a HW FPU, we need to ensure the FP state
+	 * if flushed into the thread_struct before attempting
+	 * emulation
+	 */
+#ifdef CONFIG_PPC_FPU
+	flush_fp_to_thread(current);
+#endif
 	switch (do_mathemu(regs)) {
 	case 0:
 		emulate_single_step(regs);
-- 
cgit v0.10.2


From 968219fa334ce8fed3421dd12ea12e9f562c95cb Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sun, 9 Jun 2013 17:04:58 +1000
Subject: powerpc/8xx: Remove 8xx specific "minimal FPU emulation"

This is duplicated code from math-emu and implements such a small
subset of the FPU (load/stores/fmr) that it's essentially pointless
nowdays.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index e3009ab..5374776 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -319,17 +319,6 @@ config PPC_TRANSACTIONAL_MEM
        ---help---
          Support user-mode Transactional Memory on POWERPC.
 
-config 8XX_MINIMAL_FPEMU
-	bool "Minimal math emulation for 8xx"
-	depends on 8xx && !MATH_EMULATION
-	help
-	  Older arch/ppc kernels still emulated a few floating point
-	  instructions such as load and store, even when full math
-	  emulation is disabled.  Say "Y" here if you want to preserve
-	  this behavior.
-
-	  It is recommended that you build a soft-float userspace instead.
-
 config IOMMU_HELPER
 	def_bool PPC64
 
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index f4b5687..071f6e0 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1396,8 +1396,7 @@ void performance_monitor_exception(struct pt_regs *regs)
 void SoftwareEmulation(struct pt_regs *regs)
 {
 	extern int do_mathemu(struct pt_regs *);
-	extern int Soft_emulate_8xx(struct pt_regs *);
-#if defined(CONFIG_MATH_EMULATION) || defined(CONFIG_8XX_MINIMAL_FPEMU)
+#if defined(CONFIG_MATH_EMULATION)
 	int errcode;
 #endif
 
@@ -1430,23 +1429,6 @@ void SoftwareEmulation(struct pt_regs *regs)
 		_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
 		return;
 	}
-
-#elif defined(CONFIG_8XX_MINIMAL_FPEMU)
-	errcode = Soft_emulate_8xx(regs);
-	if (errcode >= 0)
-		PPC_WARN_EMULATED(8xx, regs);
-
-	switch (errcode) {
-	case 0:
-		emulate_single_step(regs);
-		return;
-	case 1:
-		_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
-		return;
-	case -EFAULT:
-		_exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip);
-		return;
-	}
 #else
 	_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
 #endif
@@ -1796,8 +1778,6 @@ struct ppc_emulated ppc_emulated = {
 	WARN_EMULATED_SETUP(unaligned),
 #ifdef CONFIG_MATH_EMULATION
 	WARN_EMULATED_SETUP(math),
-#elif defined(CONFIG_8XX_MINIMAL_FPEMU)
-	WARN_EMULATED_SETUP(8xx),
 #endif
 #ifdef CONFIG_VSX
 	WARN_EMULATED_SETUP(vsx),
-- 
cgit v0.10.2


From 1d25f11fdbcc5390d68efd98c28900bfd29b264c Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Sun, 9 Jun 2013 21:23:15 +1000
Subject: powerpc/tm: Fix writing top half of MSR on 32 bit signals

The MSR TM controls are in the top 32 bits of the MSR hence on 32 bit signals,
we stick the top half of the MSR in the checkpointed signal context so that the
user can access it.

Unfortunately, we don't currently write anything to the checkpointed signal
context when coming in a from a non transactional process and hence the top MSR
bits can contain junk.

This updates the 32 bit signal handling code to always write something to the
top MSR bits so that users know if the process is transactional or not and the
kernel can use it on signal return.

Signed-off-by: Michael Neuling <mikey@neuling.org>
cc: stable@vger.kernel.org (v3.9+)
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 201385c..5bc819f 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -407,7 +407,8 @@ inline unsigned long copy_transact_fpr_from_user(struct task_struct *task,
  * altivec/spe instructions at some point.
  */
 static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame,
-		int sigret, int ctx_has_vsx_region)
+			  struct mcontext __user *tm_frame, int sigret,
+			  int ctx_has_vsx_region)
 {
 	unsigned long msr = regs->msr;
 
@@ -475,6 +476,12 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame,
 
 	if (__put_user(msr, &frame->mc_gregs[PT_MSR]))
 		return 1;
+	/* We need to write 0 the MSR top 32 bits in the tm frame so that we
+	 * can check it on the restore to see if TM is active
+	 */
+	if (tm_frame && __put_user(0, &tm_frame->mc_gregs[PT_MSR]))
+		return 1;
+
 	if (sigret) {
 		/* Set up the sigreturn trampoline: li r0,sigret; sc */
 		if (__put_user(0x38000000UL + sigret, &frame->tramp[0])
@@ -952,6 +959,7 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka,
 {
 	struct rt_sigframe __user *rt_sf;
 	struct mcontext __user *frame;
+	struct mcontext __user *tm_frame = NULL;
 	void __user *addr;
 	unsigned long newsp = 0;
 	int sigret;
@@ -985,23 +993,24 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka,
 	}
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	tm_frame = &rt_sf->uc_transact.uc_mcontext;
 	if (MSR_TM_ACTIVE(regs->msr)) {
-		if (save_tm_user_regs(regs, &rt_sf->uc.uc_mcontext,
-				      &rt_sf->uc_transact.uc_mcontext, sigret))
+		if (save_tm_user_regs(regs, frame, tm_frame, sigret))
 			goto badframe;
 	}
 	else
 #endif
-		if (save_user_regs(regs, frame, sigret, 1))
+	{
+		if (save_user_regs(regs, frame, tm_frame, sigret, 1))
 			goto badframe;
+	}
 	regs->link = tramp;
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	if (MSR_TM_ACTIVE(regs->msr)) {
 		if (__put_user((unsigned long)&rt_sf->uc_transact,
 			       &rt_sf->uc.uc_link)
-		    || __put_user(to_user_ptr(&rt_sf->uc_transact.uc_mcontext),
-				  &rt_sf->uc_transact.uc_regs))
+		    || __put_user((unsigned long)tm_frame, &rt_sf->uc_transact.uc_regs))
 			goto badframe;
 	}
 	else
@@ -1170,7 +1179,7 @@ long sys_swapcontext(struct ucontext __user *old_ctx,
 		mctx = (struct mcontext __user *)
 			((unsigned long) &old_ctx->uc_mcontext & ~0xfUL);
 		if (!access_ok(VERIFY_WRITE, old_ctx, ctx_size)
-		    || save_user_regs(regs, mctx, 0, ctx_has_vsx_region)
+		    || save_user_regs(regs, mctx, NULL, 0, ctx_has_vsx_region)
 		    || put_sigset_t(&old_ctx->uc_sigmask, &current->blocked)
 		    || __put_user(to_user_ptr(mctx), &old_ctx->uc_regs))
 			return -EFAULT;
@@ -1392,6 +1401,7 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka,
 {
 	struct sigcontext __user *sc;
 	struct sigframe __user *frame;
+	struct mcontext __user *tm_mctx = NULL;
 	unsigned long newsp = 0;
 	int sigret;
 	unsigned long tramp;
@@ -1425,6 +1435,7 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka,
 	}
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	tm_mctx = &frame->mctx_transact;
 	if (MSR_TM_ACTIVE(regs->msr)) {
 		if (save_tm_user_regs(regs, &frame->mctx, &frame->mctx_transact,
 				      sigret))
@@ -1432,8 +1443,10 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka,
 	}
 	else
 #endif
-		if (save_user_regs(regs, &frame->mctx, sigret, 1))
+	{
+		if (save_user_regs(regs, &frame->mctx, tm_mctx, sigret, 1))
 			goto badframe;
+	}
 
 	regs->link = tramp;
 
-- 
cgit v0.10.2


From fee55450710dff32a13ae30b4129ec7b5a4b44d0 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Sun, 9 Jun 2013 21:23:16 +1000
Subject: powerpc/tm: Fix 32 bit non-rt signals

Currently sys_sigreturn() is TM unaware.  Therefore, if we take a 32 bit signal
without SIGINFO (non RT) inside a transaction, on signal return we don't
restore the signal frame correctly.

This checks if the signal frame being restoring is an active transaction, and
if so, it copies the additional state to ptregs so it can be restored.

Signed-off-by: Michael Neuling <mikey@neuling.org>
cc: stable@vger.kernel.org (v3.9+)
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 5bc819f..fa81462 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -1494,16 +1494,22 @@ badframe:
 long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
 		       struct pt_regs *regs)
 {
+	struct sigframe __user *sf;
 	struct sigcontext __user *sc;
 	struct sigcontext sigctx;
 	struct mcontext __user *sr;
 	void __user *addr;
 	sigset_t set;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	struct mcontext __user *mcp, *tm_mcp;
+	unsigned long msr_hi;
+#endif
 
 	/* Always make any pending restarted system calls return -EINTR */
 	current_thread_info()->restart_block.fn = do_no_restart_syscall;
 
-	sc = (struct sigcontext __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE);
+	sf = (struct sigframe __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE);
+	sc = &sf->sctx;
 	addr = sc;
 	if (copy_from_user(&sigctx, sc, sizeof(sigctx)))
 		goto badframe;
@@ -1520,11 +1526,25 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
 #endif
 	set_current_blocked(&set);
 
-	sr = (struct mcontext __user *)from_user_ptr(sigctx.regs);
-	addr = sr;
-	if (!access_ok(VERIFY_READ, sr, sizeof(*sr))
-	    || restore_user_regs(regs, sr, 1))
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	mcp = (struct mcontext __user *)&sf->mctx;
+	tm_mcp = (struct mcontext __user *)&sf->mctx_transact;
+	if (__get_user(msr_hi, &tm_mcp->mc_gregs[PT_MSR]))
 		goto badframe;
+	if (MSR_TM_ACTIVE(msr_hi<<32)) {
+		if (!cpu_has_feature(CPU_FTR_TM))
+			goto badframe;
+		if (restore_tm_user_regs(regs, mcp, tm_mcp))
+			goto badframe;
+	} else
+#endif
+	{
+		sr = (struct mcontext __user *)from_user_ptr(sigctx.regs);
+		addr = sr;
+		if (!access_ok(VERIFY_READ, sr, sizeof(*sr))
+		    || restore_user_regs(regs, sr, 1))
+			goto badframe;
+	}
 
 	set_thread_flag(TIF_RESTOREALL);
 	return 0;
-- 
cgit v0.10.2


From 2c27a18f8736da047bef2b997bdd48efc667e3c9 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Sun, 9 Jun 2013 21:23:17 +1000
Subject: powerpc/tm: Fix restoration of MSR on 32bit signal return

Currently we clear out the MSR TM bits on signal return assuming that the
signal should never return to an active transaction.

This is bogus as the user may do this.  It's most likely the transaction will
be doomed due to a treclaim but that's a problem for the HW not the kernel.

The current code is a legacy of earlier kernel implementations which did
software rollback of active transactions in the kernel.  That code has now gone
but we didn't correctly fix up this part of the signals code which still makes
the assumption that it must be returning to a suspended transaction.

This pulls out both MSR TM bits from the user supplied context rather than just
setting TM suspend.  We pull out only the bits needed to ensure the user can't
do anything dangerous to the MSR.

Signed-off-by: Michael Neuling <mikey@neuling.org>
cc: stable@vger.kernel.org (v3.9+)
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index fa81462..364cb1e 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -754,7 +754,7 @@ static long restore_tm_user_regs(struct pt_regs *regs,
 				 struct mcontext __user *tm_sr)
 {
 	long err;
-	unsigned long msr;
+	unsigned long msr, msr_hi;
 #ifdef CONFIG_VSX
 	int i;
 #endif
@@ -859,8 +859,11 @@ static long restore_tm_user_regs(struct pt_regs *regs,
 	tm_enable();
 	/* This loads the checkpointed FP/VEC state, if used */
 	tm_recheckpoint(&current->thread, msr);
-	/* The task has moved into TM state S, so ensure MSR reflects this */
-	regs->msr = (regs->msr & ~MSR_TS_MASK) | MSR_TS_S;
+	/* Get the top half of the MSR */
+	if (__get_user(msr_hi, &tm_sr->mc_gregs[PT_MSR]))
+		return 1;
+	/* Pull in MSR TM from user context */
+	regs->msr = (regs->msr & ~MSR_TS_MASK) | ((msr_hi<<32) & MSR_TS_MASK);
 
 	/* This loads the speculative FP/VEC state, if used */
 	if (msr & MSR_FP) {
-- 
cgit v0.10.2


From 55e4341850ac56e63a3eefe9583a9000042164fa Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Sun, 9 Jun 2013 21:23:18 +1000
Subject: powerpc/tm: Fix return of 32bit rt signals to active transactions

Currently we only restore signals which are transactionally suspended but it's
possible that the transaction can be restored even when it's active.  Most
likely this will result in a transactional rollback by the hardware as the
transaction will have been doomed by an earlier treclaim.

The current code is a legacy of earlier kernel implementations which did
software rollback of active transactions in the kernel.  That code has now gone
but we didn't correctly fix up this part of the signals code which still makes
assumptions based on having software rollback.

This changes the signal return code to always restore both contexts on 32 bit
rt signal return.

Signed-off-by: Michael Neuling <mikey@neuling.org>
cc: stable@vger.kernel.org (v3.9+)
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 364cb1e..0f83122 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -1245,7 +1245,7 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
 		if (__get_user(msr_hi, &mcp->mc_gregs[PT_MSR]))
 			goto bad;
 
-		if (MSR_TM_SUSPENDED(msr_hi<<32)) {
+		if (MSR_TM_ACTIVE(msr_hi<<32)) {
 			/* We only recheckpoint on return if we're
 			 * transaction.
 			 */
-- 
cgit v0.10.2


From 87b4e5393af77f5cba124638f19f6c426e210aec Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Sun, 9 Jun 2013 21:23:19 +1000
Subject: powerpc/tm: Fix return of active 64bit signals

Currently we only restore signals which are transactionally suspended but it's
possible that the transaction can be restored even when it's active.  Most
likely this will result in a transactional rollback by the hardware as the
transaction will have been doomed by an earlier treclaim.

The current code is a legacy of earlier kernel implementations which did
software rollback of active transactions in the kernel.  That code has now gone
but we didn't correctly fix up this part of the signals code which still makes
assumptions based on having software rollback.

This changes the signal return code to always restore both contexts on 64 bit
signal return.  It also ensures that the MSR TM bits are properly restored from
the signal context which they are not currently.

Signed-off-by: Michael Neuling <mikey@neuling.org>
cc: stable@vger.kernel.org (v3.9+)
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 3459473..887e99d 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -410,6 +410,10 @@ static long restore_tm_sigcontexts(struct pt_regs *regs,
 
 	/* get MSR separately, transfer the LE bit if doing signal return */
 	err |= __get_user(msr, &sc->gp_regs[PT_MSR]);
+	/* pull in MSR TM from user context */
+	regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK);
+
+	/* pull in MSR LE from user context */
 	regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);
 
 	/* The following non-GPR non-FPR non-VR state is also checkpointed: */
@@ -505,8 +509,6 @@ static long restore_tm_sigcontexts(struct pt_regs *regs,
 	tm_enable();
 	/* This loads the checkpointed FP/VEC state, if used */
 	tm_recheckpoint(&current->thread, msr);
-	/* The task has moved into TM state S, so ensure MSR reflects this: */
-	regs->msr = (regs->msr & ~MSR_TS_MASK) | __MASK(33);
 
 	/* This loads the speculative FP/VEC state, if used */
 	if (msr & MSR_FP) {
@@ -654,7 +656,7 @@ int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5,
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	if (__get_user(msr, &uc->uc_mcontext.gp_regs[PT_MSR]))
 		goto badframe;
-	if (MSR_TM_SUSPENDED(msr)) {
+	if (MSR_TM_ACTIVE(msr)) {
 		/* We recheckpoint on return. */
 		struct ucontext __user *uc_transact;
 		if (__get_user(uc_transact, &uc->uc_link))
-- 
cgit v0.10.2


From a84f273c30500c0d5816e07232e3326a61956016 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:20:51 +0800
Subject: powerpc/eeh: Cleanup for EEH core

Cleanup on EEH core to remove unnecessary whitespaces.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index 6b73d6c..8a83451 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -368,7 +368,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 	}
 
 	eeh_stats.slot_resets++;
- 
+
 	/* Avoid repeated reports of this failure, including problems
 	 * with other functions on this device, and functions under
 	 * bridges.
@@ -525,7 +525,7 @@ static void eeh_reset_pe_once(struct eeh_pe *pe)
 	 * or a fundamental reset (3).
 	 * A fundamental reset required by any device under
 	 * Partitionable Endpoint trumps hot-reset.
-  	 */
+	 */
 	eeh_pe_dev_traverse(pe, eeh_set_dev_freset, &freset);
 
 	if (freset)
@@ -538,8 +538,8 @@ static void eeh_reset_pe_once(struct eeh_pe *pe)
 	 */
 #define PCI_BUS_RST_HOLD_TIME_MSEC 250
 	msleep(PCI_BUS_RST_HOLD_TIME_MSEC);
-	
-	/* We might get hit with another EEH freeze as soon as the 
+
+	/* We might get hit with another EEH freeze as soon as the
 	 * pci slot reset line is dropped. Make sure we don't miss
 	 * these, and clear the flag now.
 	 */
@@ -604,7 +604,7 @@ void eeh_save_bars(struct eeh_dev *edev)
 	if (!edev)
 		return;
 	dn = eeh_dev_to_of_node(edev);
-	
+
 	for (i = 0; i < 16; i++)
 		eeh_ops->read_config(dn, i * 4, 4, &edev->config_space[i]);
 }
@@ -803,12 +803,12 @@ void eeh_add_device_tree_late(struct pci_bus *bus)
 	struct pci_dev *dev;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
- 		eeh_add_device_late(dev);
- 		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
- 			struct pci_bus *subbus = dev->subordinate;
- 			if (subbus)
- 				eeh_add_device_tree_late(subbus);
- 		}
+		eeh_add_device_late(dev);
+		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+			struct pci_bus *subbus = dev->subordinate;
+			if (subbus)
+				eeh_add_device_tree_late(subbus);
+		}
 	}
 }
 EXPORT_SYMBOL_GPL(eeh_add_device_tree_late);
diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c
index a3fefb6..0acc5a2 100644
--- a/arch/powerpc/platforms/pseries/eeh_driver.c
+++ b/arch/powerpc/platforms/pseries/eeh_driver.c
@@ -154,9 +154,9 @@ static void eeh_enable_irq(struct pci_dev *dev)
  * eeh_report_error - Report pci error to each device driver
  * @data: eeh device
  * @userdata: return value
- * 
- * Report an EEH error to each device driver, collect up and 
- * merge the device driver responses. Cumulative response 
+ *
+ * Report an EEH error to each device driver, collect up and
+ * merge the device driver responses. Cumulative response
  * passed back in "userdata".
  */
 static void *eeh_report_error(void *data, void *userdata)
@@ -376,9 +376,9 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
 	eeh_pe_restore_bars(pe);
 
 	/* Give the system 5 seconds to finish running the user-space
-	 * hotplug shutdown scripts, e.g. ifdown for ethernet.  Yes, 
-	 * this is a hack, but if we don't do this, and try to bring 
-	 * the device up before the scripts have taken it down, 
+	 * hotplug shutdown scripts, e.g. ifdown for ethernet.  Yes,
+	 * this is a hack, but if we don't do this, and try to bring
+	 * the device up before the scripts have taken it down,
 	 * potentially weird things happen.
 	 */
 	if (bus) {
@@ -520,7 +520,7 @@ void eeh_handle_event(struct eeh_pe *pe)
 	eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
 
 	return;
-	
+
 excess_failures:
 	/*
 	 * About 90% of all real-life EEH failures in the field
-- 
cgit v0.10.2


From 317f06de78152e0eb0aab5881d69e4c5cdf9f1fe Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:20:52 +0800
Subject: powerpc/eeh: Move common part to kernel directory

The patch moves the common part of EEH core into arch/powerpc/kernel
directory so that we needn't PPC_PSERIES while compiling POWERNV
platform:

        * Move the EEH common part into arch/powerpc/kernel
        * Move the functions for PCI hotplug from pSeries platform to
          arch/powerpc/kernel/pci-hotplug.c
        * Move CONFIG_EEH from arch/powerpc/platforms/pseries/Kconfig to
          arch/powerpc/platforms/Kconfig
        * Adjust makefile accordingly

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index f960a79..a8619bf 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -58,6 +58,8 @@ obj-$(CONFIG_RTAS_PROC)		+= rtas-proc.o
 obj-$(CONFIG_LPARCFG)		+= lparcfg.o
 obj-$(CONFIG_IBMVIO)		+= vio.o
 obj-$(CONFIG_IBMEBUS)           += ibmebus.o
+obj-$(CONFIG_EEH)              += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \
+				  eeh_driver.o eeh_event.o eeh_sysfs.o
 obj-$(CONFIG_GENERIC_TBSYNC)	+= smp-tbsync.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
 obj-$(CONFIG_FA_DUMP)		+= fadump.o
@@ -100,7 +102,7 @@ obj-$(CONFIG_PPC_UDBG_16550)	+= legacy_serial.o udbg_16550.o
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-$(CONFIG_SWIOTLB)		+= dma-swiotlb.o
 
-pci64-$(CONFIG_PPC64)		+= pci_dn.o isa-bridge.o
+pci64-$(CONFIG_PPC64)		+= pci_dn.o pci-hotplug.o isa-bridge.o
 obj-$(CONFIG_PCI)		+= pci_$(CONFIG_WORD_SIZE).o $(pci64-y) \
 				   pci-common.o pci_of_scan.o
 obj-$(CONFIG_PCI_MSI)		+= msi.o
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
new file mode 100644
index 0000000..8a83451
--- /dev/null
+++ b/arch/powerpc/kernel/eeh.c
@@ -0,0 +1,942 @@
+/*
+ * Copyright IBM Corporation 2001, 2005, 2006
+ * Copyright Dave Engebretsen & Todd Inglett 2001
+ * Copyright Linas Vepstas 2005, 2006
+ * Copyright 2001-2012 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ * Please address comments and feedback to Linas Vepstas <linas@austin.ibm.com>
+ */
+
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/rbtree.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/of.h>
+
+#include <linux/atomic.h>
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/io.h>
+#include <asm/machdep.h>
+#include <asm/ppc-pci.h>
+#include <asm/rtas.h>
+
+
+/** Overview:
+ *  EEH, or "Extended Error Handling" is a PCI bridge technology for
+ *  dealing with PCI bus errors that can't be dealt with within the
+ *  usual PCI framework, except by check-stopping the CPU.  Systems
+ *  that are designed for high-availability/reliability cannot afford
+ *  to crash due to a "mere" PCI error, thus the need for EEH.
+ *  An EEH-capable bridge operates by converting a detected error
+ *  into a "slot freeze", taking the PCI adapter off-line, making
+ *  the slot behave, from the OS'es point of view, as if the slot
+ *  were "empty": all reads return 0xff's and all writes are silently
+ *  ignored.  EEH slot isolation events can be triggered by parity
+ *  errors on the address or data busses (e.g. during posted writes),
+ *  which in turn might be caused by low voltage on the bus, dust,
+ *  vibration, humidity, radioactivity or plain-old failed hardware.
+ *
+ *  Note, however, that one of the leading causes of EEH slot
+ *  freeze events are buggy device drivers, buggy device microcode,
+ *  or buggy device hardware.  This is because any attempt by the
+ *  device to bus-master data to a memory address that is not
+ *  assigned to the device will trigger a slot freeze.   (The idea
+ *  is to prevent devices-gone-wild from corrupting system memory).
+ *  Buggy hardware/drivers will have a miserable time co-existing
+ *  with EEH.
+ *
+ *  Ideally, a PCI device driver, when suspecting that an isolation
+ *  event has occurred (e.g. by reading 0xff's), will then ask EEH
+ *  whether this is the case, and then take appropriate steps to
+ *  reset the PCI slot, the PCI device, and then resume operations.
+ *  However, until that day,  the checking is done here, with the
+ *  eeh_check_failure() routine embedded in the MMIO macros.  If
+ *  the slot is found to be isolated, an "EEH Event" is synthesized
+ *  and sent out for processing.
+ */
+
+/* If a device driver keeps reading an MMIO register in an interrupt
+ * handler after a slot isolation event, it might be broken.
+ * This sets the threshold for how many read attempts we allow
+ * before printing an error message.
+ */
+#define EEH_MAX_FAILS	2100000
+
+/* Time to wait for a PCI slot to report status, in milliseconds */
+#define PCI_BUS_RESET_WAIT_MSEC (60*1000)
+
+/* Platform dependent EEH operations */
+struct eeh_ops *eeh_ops = NULL;
+
+int eeh_subsystem_enabled;
+EXPORT_SYMBOL(eeh_subsystem_enabled);
+
+/*
+ * EEH probe mode support. The intention is to support multiple
+ * platforms for EEH. Some platforms like pSeries do PCI emunation
+ * based on device tree. However, other platforms like powernv probe
+ * PCI devices from hardware. The flag is used to distinguish that.
+ * In addition, struct eeh_ops::probe would be invoked for particular
+ * OF node or PCI device so that the corresponding PE would be created
+ * there.
+ */
+int eeh_probe_mode;
+
+/* Global EEH mutex */
+DEFINE_MUTEX(eeh_mutex);
+
+/* Lock to avoid races due to multiple reports of an error */
+static DEFINE_RAW_SPINLOCK(confirm_error_lock);
+
+/* Buffer for reporting pci register dumps. Its here in BSS, and
+ * not dynamically alloced, so that it ends up in RMO where RTAS
+ * can access it.
+ */
+#define EEH_PCI_REGS_LOG_LEN 4096
+static unsigned char pci_regs_buf[EEH_PCI_REGS_LOG_LEN];
+
+/*
+ * The struct is used to maintain the EEH global statistic
+ * information. Besides, the EEH global statistics will be
+ * exported to user space through procfs
+ */
+struct eeh_stats {
+	u64 no_device;		/* PCI device not found		*/
+	u64 no_dn;		/* OF node not found		*/
+	u64 no_cfg_addr;	/* Config address not found	*/
+	u64 ignored_check;	/* EEH check skipped		*/
+	u64 total_mmio_ffs;	/* Total EEH checks		*/
+	u64 false_positives;	/* Unnecessary EEH checks	*/
+	u64 slot_resets;	/* PE reset			*/
+};
+
+static struct eeh_stats eeh_stats;
+
+#define IS_BRIDGE(class_code) (((class_code)<<16) == PCI_BASE_CLASS_BRIDGE)
+
+/**
+ * eeh_gather_pci_data - Copy assorted PCI config space registers to buff
+ * @edev: device to report data for
+ * @buf: point to buffer in which to log
+ * @len: amount of room in buffer
+ *
+ * This routine captures assorted PCI configuration space data,
+ * and puts them into a buffer for RTAS error logging.
+ */
+static size_t eeh_gather_pci_data(struct eeh_dev *edev, char * buf, size_t len)
+{
+	struct device_node *dn = eeh_dev_to_of_node(edev);
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+	u32 cfg;
+	int cap, i;
+	int n = 0;
+
+	n += scnprintf(buf+n, len-n, "%s\n", dn->full_name);
+	printk(KERN_WARNING "EEH: of node=%s\n", dn->full_name);
+
+	eeh_ops->read_config(dn, PCI_VENDOR_ID, 4, &cfg);
+	n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg);
+	printk(KERN_WARNING "EEH: PCI device/vendor: %08x\n", cfg);
+
+	eeh_ops->read_config(dn, PCI_COMMAND, 4, &cfg);
+	n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg);
+	printk(KERN_WARNING "EEH: PCI cmd/status register: %08x\n", cfg);
+
+	if (!dev) {
+		printk(KERN_WARNING "EEH: no PCI device for this of node\n");
+		return n;
+	}
+
+	/* Gather bridge-specific registers */
+	if (dev->class >> 16 == PCI_BASE_CLASS_BRIDGE) {
+		eeh_ops->read_config(dn, PCI_SEC_STATUS, 2, &cfg);
+		n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg);
+		printk(KERN_WARNING "EEH: Bridge secondary status: %04x\n", cfg);
+
+		eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &cfg);
+		n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg);
+		printk(KERN_WARNING "EEH: Bridge control: %04x\n", cfg);
+	}
+
+	/* Dump out the PCI-X command and status regs */
+	cap = pci_find_capability(dev, PCI_CAP_ID_PCIX);
+	if (cap) {
+		eeh_ops->read_config(dn, cap, 4, &cfg);
+		n += scnprintf(buf+n, len-n, "pcix-cmd:%x\n", cfg);
+		printk(KERN_WARNING "EEH: PCI-X cmd: %08x\n", cfg);
+
+		eeh_ops->read_config(dn, cap+4, 4, &cfg);
+		n += scnprintf(buf+n, len-n, "pcix-stat:%x\n", cfg);
+		printk(KERN_WARNING "EEH: PCI-X status: %08x\n", cfg);
+	}
+
+	/* If PCI-E capable, dump PCI-E cap 10, and the AER */
+	cap = pci_find_capability(dev, PCI_CAP_ID_EXP);
+	if (cap) {
+		n += scnprintf(buf+n, len-n, "pci-e cap10:\n");
+		printk(KERN_WARNING
+		       "EEH: PCI-E capabilities and status follow:\n");
+
+		for (i=0; i<=8; i++) {
+			eeh_ops->read_config(dn, cap+4*i, 4, &cfg);
+			n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg);
+			printk(KERN_WARNING "EEH: PCI-E %02x: %08x\n", i, cfg);
+		}
+
+		cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
+		if (cap) {
+			n += scnprintf(buf+n, len-n, "pci-e AER:\n");
+			printk(KERN_WARNING
+			       "EEH: PCI-E AER capability register set follows:\n");
+
+			for (i=0; i<14; i++) {
+				eeh_ops->read_config(dn, cap+4*i, 4, &cfg);
+				n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg);
+				printk(KERN_WARNING "EEH: PCI-E AER %02x: %08x\n", i, cfg);
+			}
+		}
+	}
+
+	return n;
+}
+
+/**
+ * eeh_slot_error_detail - Generate combined log including driver log and error log
+ * @pe: EEH PE
+ * @severity: temporary or permanent error log
+ *
+ * This routine should be called to generate the combined log, which
+ * is comprised of driver log and error log. The driver log is figured
+ * out from the config space of the corresponding PCI device, while
+ * the error log is fetched through platform dependent function call.
+ */
+void eeh_slot_error_detail(struct eeh_pe *pe, int severity)
+{
+	size_t loglen = 0;
+	struct eeh_dev *edev;
+
+	eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
+	eeh_ops->configure_bridge(pe);
+	eeh_pe_restore_bars(pe);
+
+	pci_regs_buf[0] = 0;
+	eeh_pe_for_each_dev(pe, edev) {
+		loglen += eeh_gather_pci_data(edev, pci_regs_buf,
+				EEH_PCI_REGS_LOG_LEN);
+        }
+
+	eeh_ops->get_log(pe, severity, pci_regs_buf, loglen);
+}
+
+/**
+ * eeh_token_to_phys - Convert EEH address token to phys address
+ * @token: I/O token, should be address in the form 0xA....
+ *
+ * This routine should be called to convert virtual I/O address
+ * to physical one.
+ */
+static inline unsigned long eeh_token_to_phys(unsigned long token)
+{
+	pte_t *ptep;
+	unsigned long pa;
+
+	ptep = find_linux_pte(init_mm.pgd, token);
+	if (!ptep)
+		return token;
+	pa = pte_pfn(*ptep) << PAGE_SHIFT;
+
+	return pa | (token & (PAGE_SIZE-1));
+}
+
+/**
+ * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze
+ * @edev: eeh device
+ *
+ * Check for an EEH failure for the given device node.  Call this
+ * routine if the result of a read was all 0xff's and you want to
+ * find out if this is due to an EEH slot freeze.  This routine
+ * will query firmware for the EEH status.
+ *
+ * Returns 0 if there has not been an EEH error; otherwise returns
+ * a non-zero value and queues up a slot isolation event notification.
+ *
+ * It is safe to call this routine in an interrupt context.
+ */
+int eeh_dev_check_failure(struct eeh_dev *edev)
+{
+	int ret;
+	unsigned long flags;
+	struct device_node *dn;
+	struct pci_dev *dev;
+	struct eeh_pe *pe;
+	int rc = 0;
+	const char *location;
+
+	eeh_stats.total_mmio_ffs++;
+
+	if (!eeh_subsystem_enabled)
+		return 0;
+
+	if (!edev) {
+		eeh_stats.no_dn++;
+		return 0;
+	}
+	dn = eeh_dev_to_of_node(edev);
+	dev = eeh_dev_to_pci_dev(edev);
+	pe = edev->pe;
+
+	/* Access to IO BARs might get this far and still not want checking. */
+	if (!pe) {
+		eeh_stats.ignored_check++;
+		pr_debug("EEH: Ignored check for %s %s\n",
+			eeh_pci_name(dev), dn->full_name);
+		return 0;
+	}
+
+	if (!pe->addr && !pe->config_addr) {
+		eeh_stats.no_cfg_addr++;
+		return 0;
+	}
+
+	/* If we already have a pending isolation event for this
+	 * slot, we know it's bad already, we don't need to check.
+	 * Do this checking under a lock; as multiple PCI devices
+	 * in one slot might report errors simultaneously, and we
+	 * only want one error recovery routine running.
+	 */
+	raw_spin_lock_irqsave(&confirm_error_lock, flags);
+	rc = 1;
+	if (pe->state & EEH_PE_ISOLATED) {
+		pe->check_count++;
+		if (pe->check_count % EEH_MAX_FAILS == 0) {
+			location = of_get_property(dn, "ibm,loc-code", NULL);
+			printk(KERN_ERR "EEH: %d reads ignored for recovering device at "
+				"location=%s driver=%s pci addr=%s\n",
+				pe->check_count, location,
+				eeh_driver_name(dev), eeh_pci_name(dev));
+			printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n",
+				eeh_driver_name(dev));
+			dump_stack();
+		}
+		goto dn_unlock;
+	}
+
+	/*
+	 * Now test for an EEH failure.  This is VERY expensive.
+	 * Note that the eeh_config_addr may be a parent device
+	 * in the case of a device behind a bridge, or it may be
+	 * function zero of a multi-function device.
+	 * In any case they must share a common PHB.
+	 */
+	ret = eeh_ops->get_state(pe, NULL);
+
+	/* Note that config-io to empty slots may fail;
+	 * they are empty when they don't have children.
+	 * We will punt with the following conditions: Failure to get
+	 * PE's state, EEH not support and Permanently unavailable
+	 * state, PE is in good state.
+	 */
+	if ((ret < 0) ||
+	    (ret == EEH_STATE_NOT_SUPPORT) ||
+	    (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) ==
+	    (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) {
+		eeh_stats.false_positives++;
+		pe->false_positives++;
+		rc = 0;
+		goto dn_unlock;
+	}
+
+	eeh_stats.slot_resets++;
+
+	/* Avoid repeated reports of this failure, including problems
+	 * with other functions on this device, and functions under
+	 * bridges.
+	 */
+	eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
+	raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
+
+	eeh_send_failure_event(pe);
+
+	/* Most EEH events are due to device driver bugs.  Having
+	 * a stack trace will help the device-driver authors figure
+	 * out what happened.  So print that out.
+	 */
+	WARN(1, "EEH: failure detected\n");
+	return 1;
+
+dn_unlock:
+	raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
+	return rc;
+}
+
+EXPORT_SYMBOL_GPL(eeh_dev_check_failure);
+
+/**
+ * eeh_check_failure - Check if all 1's data is due to EEH slot freeze
+ * @token: I/O token, should be address in the form 0xA....
+ * @val: value, should be all 1's (XXX why do we need this arg??)
+ *
+ * Check for an EEH failure at the given token address.  Call this
+ * routine if the result of a read was all 0xff's and you want to
+ * find out if this is due to an EEH slot freeze event.  This routine
+ * will query firmware for the EEH status.
+ *
+ * Note this routine is safe to call in an interrupt context.
+ */
+unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val)
+{
+	unsigned long addr;
+	struct eeh_dev *edev;
+
+	/* Finding the phys addr + pci device; this is pretty quick. */
+	addr = eeh_token_to_phys((unsigned long __force) token);
+	edev = eeh_addr_cache_get_dev(addr);
+	if (!edev) {
+		eeh_stats.no_device++;
+		return val;
+	}
+
+	eeh_dev_check_failure(edev);
+
+	pci_dev_put(eeh_dev_to_pci_dev(edev));
+	return val;
+}
+
+EXPORT_SYMBOL(eeh_check_failure);
+
+
+/**
+ * eeh_pci_enable - Enable MMIO or DMA transfers for this slot
+ * @pe: EEH PE
+ *
+ * This routine should be called to reenable frozen MMIO or DMA
+ * so that it would work correctly again. It's useful while doing
+ * recovery or log collection on the indicated device.
+ */
+int eeh_pci_enable(struct eeh_pe *pe, int function)
+{
+	int rc;
+
+	rc = eeh_ops->set_option(pe, function);
+	if (rc)
+		pr_warning("%s: Unexpected state change %d on PHB#%d-PE#%x, err=%d\n",
+			__func__, function, pe->phb->global_number, pe->addr, rc);
+
+	rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
+	if (rc > 0 && (rc & EEH_STATE_MMIO_ENABLED) &&
+	   (function == EEH_OPT_THAW_MMIO))
+		return 0;
+
+	return rc;
+}
+
+/**
+ * pcibios_set_pcie_slot_reset - Set PCI-E reset state
+ * @dev: pci device struct
+ * @state: reset state to enter
+ *
+ * Return value:
+ * 	0 if success
+ */
+int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state)
+{
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
+	struct eeh_pe *pe = edev->pe;
+
+	if (!pe) {
+		pr_err("%s: No PE found on PCI device %s\n",
+			__func__, pci_name(dev));
+		return -EINVAL;
+	}
+
+	switch (state) {
+	case pcie_deassert_reset:
+		eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
+		break;
+	case pcie_hot_reset:
+		eeh_ops->reset(pe, EEH_RESET_HOT);
+		break;
+	case pcie_warm_reset:
+		eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
+		break;
+	default:
+		return -EINVAL;
+	};
+
+	return 0;
+}
+
+/**
+ * eeh_set_pe_freset - Check the required reset for the indicated device
+ * @data: EEH device
+ * @flag: return value
+ *
+ * Each device might have its preferred reset type: fundamental or
+ * hot reset. The routine is used to collected the information for
+ * the indicated device and its children so that the bunch of the
+ * devices could be reset properly.
+ */
+static void *eeh_set_dev_freset(void *data, void *flag)
+{
+	struct pci_dev *dev;
+	unsigned int *freset = (unsigned int *)flag;
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+
+	dev = eeh_dev_to_pci_dev(edev);
+	if (dev)
+		*freset |= dev->needs_freset;
+
+	return NULL;
+}
+
+/**
+ * eeh_reset_pe_once - Assert the pci #RST line for 1/4 second
+ * @pe: EEH PE
+ *
+ * Assert the PCI #RST line for 1/4 second.
+ */
+static void eeh_reset_pe_once(struct eeh_pe *pe)
+{
+	unsigned int freset = 0;
+
+	/* Determine type of EEH reset required for
+	 * Partitionable Endpoint, a hot-reset (1)
+	 * or a fundamental reset (3).
+	 * A fundamental reset required by any device under
+	 * Partitionable Endpoint trumps hot-reset.
+	 */
+	eeh_pe_dev_traverse(pe, eeh_set_dev_freset, &freset);
+
+	if (freset)
+		eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
+	else
+		eeh_ops->reset(pe, EEH_RESET_HOT);
+
+	/* The PCI bus requires that the reset be held high for at least
+	 * a 100 milliseconds. We wait a bit longer 'just in case'.
+	 */
+#define PCI_BUS_RST_HOLD_TIME_MSEC 250
+	msleep(PCI_BUS_RST_HOLD_TIME_MSEC);
+
+	/* We might get hit with another EEH freeze as soon as the
+	 * pci slot reset line is dropped. Make sure we don't miss
+	 * these, and clear the flag now.
+	 */
+	eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
+
+	eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
+
+	/* After a PCI slot has been reset, the PCI Express spec requires
+	 * a 1.5 second idle time for the bus to stabilize, before starting
+	 * up traffic.
+	 */
+#define PCI_BUS_SETTLE_TIME_MSEC 1800
+	msleep(PCI_BUS_SETTLE_TIME_MSEC);
+}
+
+/**
+ * eeh_reset_pe - Reset the indicated PE
+ * @pe: EEH PE
+ *
+ * This routine should be called to reset indicated device, including
+ * PE. A PE might include multiple PCI devices and sometimes PCI bridges
+ * might be involved as well.
+ */
+int eeh_reset_pe(struct eeh_pe *pe)
+{
+	int i, rc;
+
+	/* Take three shots at resetting the bus */
+	for (i=0; i<3; i++) {
+		eeh_reset_pe_once(pe);
+
+		rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
+		if (rc == (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE))
+			return 0;
+
+		if (rc < 0) {
+			pr_err("%s: Unrecoverable slot failure on PHB#%d-PE#%x",
+				__func__, pe->phb->global_number, pe->addr);
+			return -1;
+		}
+		pr_err("EEH: bus reset %d failed on PHB#%d-PE#%x, rc=%d\n",
+			i+1, pe->phb->global_number, pe->addr, rc);
+	}
+
+	return -1;
+}
+
+/**
+ * eeh_save_bars - Save device bars
+ * @edev: PCI device associated EEH device
+ *
+ * Save the values of the device bars. Unlike the restore
+ * routine, this routine is *not* recursive. This is because
+ * PCI devices are added individually; but, for the restore,
+ * an entire slot is reset at a time.
+ */
+void eeh_save_bars(struct eeh_dev *edev)
+{
+	int i;
+	struct device_node *dn;
+
+	if (!edev)
+		return;
+	dn = eeh_dev_to_of_node(edev);
+
+	for (i = 0; i < 16; i++)
+		eeh_ops->read_config(dn, i * 4, 4, &edev->config_space[i]);
+}
+
+/**
+ * eeh_ops_register - Register platform dependent EEH operations
+ * @ops: platform dependent EEH operations
+ *
+ * Register the platform dependent EEH operation callback
+ * functions. The platform should call this function before
+ * any other EEH operations.
+ */
+int __init eeh_ops_register(struct eeh_ops *ops)
+{
+	if (!ops->name) {
+		pr_warning("%s: Invalid EEH ops name for %p\n",
+			__func__, ops);
+		return -EINVAL;
+	}
+
+	if (eeh_ops && eeh_ops != ops) {
+		pr_warning("%s: EEH ops of platform %s already existing (%s)\n",
+			__func__, eeh_ops->name, ops->name);
+		return -EEXIST;
+	}
+
+	eeh_ops = ops;
+
+	return 0;
+}
+
+/**
+ * eeh_ops_unregister - Unreigster platform dependent EEH operations
+ * @name: name of EEH platform operations
+ *
+ * Unregister the platform dependent EEH operation callback
+ * functions.
+ */
+int __exit eeh_ops_unregister(const char *name)
+{
+	if (!name || !strlen(name)) {
+		pr_warning("%s: Invalid EEH ops name\n",
+			__func__);
+		return -EINVAL;
+	}
+
+	if (eeh_ops && !strcmp(eeh_ops->name, name)) {
+		eeh_ops = NULL;
+		return 0;
+	}
+
+	return -EEXIST;
+}
+
+/**
+ * eeh_init - EEH initialization
+ *
+ * Initialize EEH by trying to enable it for all of the adapters in the system.
+ * As a side effect we can determine here if eeh is supported at all.
+ * Note that we leave EEH on so failed config cycles won't cause a machine
+ * check.  If a user turns off EEH for a particular adapter they are really
+ * telling Linux to ignore errors.  Some hardware (e.g. POWER5) won't
+ * grant access to a slot if EEH isn't enabled, and so we always enable
+ * EEH for all slots/all devices.
+ *
+ * The eeh-force-off option disables EEH checking globally, for all slots.
+ * Even if force-off is set, the EEH hardware is still enabled, so that
+ * newer systems can boot.
+ */
+static int __init eeh_init(void)
+{
+	struct pci_controller *hose, *tmp;
+	struct device_node *phb;
+	int ret;
+
+	/* call platform initialization function */
+	if (!eeh_ops) {
+		pr_warning("%s: Platform EEH operation not found\n",
+			__func__);
+		return -EEXIST;
+	} else if ((ret = eeh_ops->init())) {
+		pr_warning("%s: Failed to call platform init function (%d)\n",
+			__func__, ret);
+		return ret;
+	}
+
+	raw_spin_lock_init(&confirm_error_lock);
+
+	/* Enable EEH for all adapters */
+	if (eeh_probe_mode_devtree()) {
+		list_for_each_entry_safe(hose, tmp,
+			&hose_list, list_node) {
+			phb = hose->dn;
+			traverse_pci_devices(phb, eeh_ops->of_probe, NULL);
+		}
+	}
+
+	if (eeh_subsystem_enabled)
+		pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
+	else
+		pr_warning("EEH: No capable adapters found\n");
+
+	return ret;
+}
+
+core_initcall_sync(eeh_init);
+
+/**
+ * eeh_add_device_early - Enable EEH for the indicated device_node
+ * @dn: device node for which to set up EEH
+ *
+ * This routine must be used to perform EEH initialization for PCI
+ * devices that were added after system boot (e.g. hotplug, dlpar).
+ * This routine must be called before any i/o is performed to the
+ * adapter (inluding any config-space i/o).
+ * Whether this actually enables EEH or not for this device depends
+ * on the CEC architecture, type of the device, on earlier boot
+ * command-line arguments & etc.
+ */
+static void eeh_add_device_early(struct device_node *dn)
+{
+	struct pci_controller *phb;
+
+	if (!of_node_to_eeh_dev(dn))
+		return;
+	phb = of_node_to_eeh_dev(dn)->phb;
+
+	/* USB Bus children of PCI devices will not have BUID's */
+	if (NULL == phb || 0 == phb->buid)
+		return;
+
+	/* FIXME: hotplug support on POWERNV */
+	eeh_ops->of_probe(dn, NULL);
+}
+
+/**
+ * eeh_add_device_tree_early - Enable EEH for the indicated device
+ * @dn: device node
+ *
+ * This routine must be used to perform EEH initialization for the
+ * indicated PCI device that was added after system boot (e.g.
+ * hotplug, dlpar).
+ */
+void eeh_add_device_tree_early(struct device_node *dn)
+{
+	struct device_node *sib;
+
+	for_each_child_of_node(dn, sib)
+		eeh_add_device_tree_early(sib);
+	eeh_add_device_early(dn);
+}
+EXPORT_SYMBOL_GPL(eeh_add_device_tree_early);
+
+/**
+ * eeh_add_device_late - Perform EEH initialization for the indicated pci device
+ * @dev: pci device for which to set up EEH
+ *
+ * This routine must be used to complete EEH initialization for PCI
+ * devices that were added after system boot (e.g. hotplug, dlpar).
+ */
+static void eeh_add_device_late(struct pci_dev *dev)
+{
+	struct device_node *dn;
+	struct eeh_dev *edev;
+
+	if (!dev || !eeh_subsystem_enabled)
+		return;
+
+	pr_debug("EEH: Adding device %s\n", pci_name(dev));
+
+	dn = pci_device_to_OF_node(dev);
+	edev = of_node_to_eeh_dev(dn);
+	if (edev->pdev == dev) {
+		pr_debug("EEH: Already referenced !\n");
+		return;
+	}
+	WARN_ON(edev->pdev);
+
+	pci_dev_get(dev);
+	edev->pdev = dev;
+	dev->dev.archdata.edev = edev;
+
+	eeh_addr_cache_insert_dev(dev);
+}
+
+/**
+ * eeh_add_device_tree_late - Perform EEH initialization for the indicated PCI bus
+ * @bus: PCI bus
+ *
+ * This routine must be used to perform EEH initialization for PCI
+ * devices which are attached to the indicated PCI bus. The PCI bus
+ * is added after system boot through hotplug or dlpar.
+ */
+void eeh_add_device_tree_late(struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		eeh_add_device_late(dev);
+		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+			struct pci_bus *subbus = dev->subordinate;
+			if (subbus)
+				eeh_add_device_tree_late(subbus);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(eeh_add_device_tree_late);
+
+/**
+ * eeh_add_sysfs_files - Add EEH sysfs files for the indicated PCI bus
+ * @bus: PCI bus
+ *
+ * This routine must be used to add EEH sysfs files for PCI
+ * devices which are attached to the indicated PCI bus. The PCI bus
+ * is added after system boot through hotplug or dlpar.
+ */
+void eeh_add_sysfs_files(struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		eeh_sysfs_add_device(dev);
+		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+			struct pci_bus *subbus = dev->subordinate;
+			if (subbus)
+				eeh_add_sysfs_files(subbus);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(eeh_add_sysfs_files);
+
+/**
+ * eeh_remove_device - Undo EEH setup for the indicated pci device
+ * @dev: pci device to be removed
+ * @purge_pe: remove the PE or not
+ *
+ * This routine should be called when a device is removed from
+ * a running system (e.g. by hotplug or dlpar).  It unregisters
+ * the PCI device from the EEH subsystem.  I/O errors affecting
+ * this device will no longer be detected after this call; thus,
+ * i/o errors affecting this slot may leave this device unusable.
+ */
+static void eeh_remove_device(struct pci_dev *dev, int purge_pe)
+{
+	struct eeh_dev *edev;
+
+	if (!dev || !eeh_subsystem_enabled)
+		return;
+	edev = pci_dev_to_eeh_dev(dev);
+
+	/* Unregister the device with the EEH/PCI address search system */
+	pr_debug("EEH: Removing device %s\n", pci_name(dev));
+
+	if (!edev || !edev->pdev) {
+		pr_debug("EEH: Not referenced !\n");
+		return;
+	}
+	edev->pdev = NULL;
+	dev->dev.archdata.edev = NULL;
+	pci_dev_put(dev);
+
+	eeh_rmv_from_parent_pe(edev, purge_pe);
+	eeh_addr_cache_rmv_dev(dev);
+	eeh_sysfs_remove_device(dev);
+}
+
+/**
+ * eeh_remove_bus_device - Undo EEH setup for the indicated PCI device
+ * @dev: PCI device
+ * @purge_pe: remove the corresponding PE or not
+ *
+ * This routine must be called when a device is removed from the
+ * running system through hotplug or dlpar. The corresponding
+ * PCI address cache will be removed.
+ */
+void eeh_remove_bus_device(struct pci_dev *dev, int purge_pe)
+{
+	struct pci_bus *bus = dev->subordinate;
+	struct pci_dev *child, *tmp;
+
+	eeh_remove_device(dev, purge_pe);
+
+	if (bus && dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+		list_for_each_entry_safe(child, tmp, &bus->devices, bus_list)
+			 eeh_remove_bus_device(child, purge_pe);
+	}
+}
+EXPORT_SYMBOL_GPL(eeh_remove_bus_device);
+
+static int proc_eeh_show(struct seq_file *m, void *v)
+{
+	if (0 == eeh_subsystem_enabled) {
+		seq_printf(m, "EEH Subsystem is globally disabled\n");
+		seq_printf(m, "eeh_total_mmio_ffs=%llu\n", eeh_stats.total_mmio_ffs);
+	} else {
+		seq_printf(m, "EEH Subsystem is enabled\n");
+		seq_printf(m,
+				"no device=%llu\n"
+				"no device node=%llu\n"
+				"no config address=%llu\n"
+				"check not wanted=%llu\n"
+				"eeh_total_mmio_ffs=%llu\n"
+				"eeh_false_positives=%llu\n"
+				"eeh_slot_resets=%llu\n",
+				eeh_stats.no_device,
+				eeh_stats.no_dn,
+				eeh_stats.no_cfg_addr,
+				eeh_stats.ignored_check,
+				eeh_stats.total_mmio_ffs,
+				eeh_stats.false_positives,
+				eeh_stats.slot_resets);
+	}
+
+	return 0;
+}
+
+static int proc_eeh_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, proc_eeh_show, NULL);
+}
+
+static const struct file_operations proc_eeh_operations = {
+	.open      = proc_eeh_open,
+	.read      = seq_read,
+	.llseek    = seq_lseek,
+	.release   = single_release,
+};
+
+static int __init eeh_init_proc(void)
+{
+	if (machine_is(pseries))
+		proc_create("powerpc/eeh", 0, NULL, &proc_eeh_operations);
+	return 0;
+}
+__initcall(eeh_init_proc);
diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
new file mode 100644
index 0000000..1d5d9a6
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -0,0 +1,318 @@
+/*
+ * PCI address cache; allows the lookup of PCI devices based on I/O address
+ *
+ * Copyright IBM Corporation 2004
+ * Copyright Linas Vepstas <linas@austin.ibm.com> 2004
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+
+/**
+ * The pci address cache subsystem.  This subsystem places
+ * PCI device address resources into a red-black tree, sorted
+ * according to the address range, so that given only an i/o
+ * address, the corresponding PCI device can be **quickly**
+ * found. It is safe to perform an address lookup in an interrupt
+ * context; this ability is an important feature.
+ *
+ * Currently, the only customer of this code is the EEH subsystem;
+ * thus, this code has been somewhat tailored to suit EEH better.
+ * In particular, the cache does *not* hold the addresses of devices
+ * for which EEH is not enabled.
+ *
+ * (Implementation Note: The RB tree seems to be better/faster
+ * than any hash algo I could think of for this problem, even
+ * with the penalty of slow pointer chases for d-cache misses).
+ */
+struct pci_io_addr_range {
+	struct rb_node rb_node;
+	unsigned long addr_lo;
+	unsigned long addr_hi;
+	struct eeh_dev *edev;
+	struct pci_dev *pcidev;
+	unsigned int flags;
+};
+
+static struct pci_io_addr_cache {
+	struct rb_root rb_root;
+	spinlock_t piar_lock;
+} pci_io_addr_cache_root;
+
+static inline struct eeh_dev *__eeh_addr_cache_get_device(unsigned long addr)
+{
+	struct rb_node *n = pci_io_addr_cache_root.rb_root.rb_node;
+
+	while (n) {
+		struct pci_io_addr_range *piar;
+		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+
+		if (addr < piar->addr_lo) {
+			n = n->rb_left;
+		} else {
+			if (addr > piar->addr_hi) {
+				n = n->rb_right;
+			} else {
+				pci_dev_get(piar->pcidev);
+				return piar->edev;
+			}
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * eeh_addr_cache_get_dev - Get device, given only address
+ * @addr: mmio (PIO) phys address or i/o port number
+ *
+ * Given an mmio phys address, or a port number, find a pci device
+ * that implements this address.  Be sure to pci_dev_put the device
+ * when finished.  I/O port numbers are assumed to be offset
+ * from zero (that is, they do *not* have pci_io_addr added in).
+ * It is safe to call this function within an interrupt.
+ */
+struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr)
+{
+	struct eeh_dev *edev;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+	edev = __eeh_addr_cache_get_device(addr);
+	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+	return edev;
+}
+
+#ifdef DEBUG
+/*
+ * Handy-dandy debug print routine, does nothing more
+ * than print out the contents of our addr cache.
+ */
+static void eeh_addr_cache_print(struct pci_io_addr_cache *cache)
+{
+	struct rb_node *n;
+	int cnt = 0;
+
+	n = rb_first(&cache->rb_root);
+	while (n) {
+		struct pci_io_addr_range *piar;
+		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+		pr_debug("PCI: %s addr range %d [%lx-%lx]: %s\n",
+		       (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt,
+		       piar->addr_lo, piar->addr_hi, pci_name(piar->pcidev));
+		cnt++;
+		n = rb_next(n);
+	}
+}
+#endif
+
+/* Insert address range into the rb tree. */
+static struct pci_io_addr_range *
+eeh_addr_cache_insert(struct pci_dev *dev, unsigned long alo,
+		      unsigned long ahi, unsigned int flags)
+{
+	struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct pci_io_addr_range *piar;
+
+	/* Walk tree, find a place to insert into tree */
+	while (*p) {
+		parent = *p;
+		piar = rb_entry(parent, struct pci_io_addr_range, rb_node);
+		if (ahi < piar->addr_lo) {
+			p = &parent->rb_left;
+		} else if (alo > piar->addr_hi) {
+			p = &parent->rb_right;
+		} else {
+			if (dev != piar->pcidev ||
+			    alo != piar->addr_lo || ahi != piar->addr_hi) {
+				pr_warning("PIAR: overlapping address range\n");
+			}
+			return piar;
+		}
+	}
+	piar = kzalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC);
+	if (!piar)
+		return NULL;
+
+	pci_dev_get(dev);
+	piar->addr_lo = alo;
+	piar->addr_hi = ahi;
+	piar->edev = pci_dev_to_eeh_dev(dev);
+	piar->pcidev = dev;
+	piar->flags = flags;
+
+#ifdef DEBUG
+	pr_debug("PIAR: insert range=[%lx:%lx] dev=%s\n",
+	                  alo, ahi, pci_name(dev));
+#endif
+
+	rb_link_node(&piar->rb_node, parent, p);
+	rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
+
+	return piar;
+}
+
+static void __eeh_addr_cache_insert_dev(struct pci_dev *dev)
+{
+	struct device_node *dn;
+	struct eeh_dev *edev;
+	int i;
+
+	dn = pci_device_to_OF_node(dev);
+	if (!dn) {
+		pr_warning("PCI: no pci dn found for dev=%s\n", pci_name(dev));
+		return;
+	}
+
+	edev = of_node_to_eeh_dev(dn);
+	if (!edev) {
+		pr_warning("PCI: no EEH dev found for dn=%s\n",
+			dn->full_name);
+		return;
+	}
+
+	/* Skip any devices for which EEH is not enabled. */
+	if (!edev->pe) {
+#ifdef DEBUG
+		pr_info("PCI: skip building address cache for=%s - %s\n",
+			pci_name(dev), dn->full_name);
+#endif
+		return;
+	}
+
+	/* Walk resources on this device, poke them into the tree */
+	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
+		unsigned long start = pci_resource_start(dev,i);
+		unsigned long end = pci_resource_end(dev,i);
+		unsigned int flags = pci_resource_flags(dev,i);
+
+		/* We are interested only bus addresses, not dma or other stuff */
+		if (0 == (flags & (IORESOURCE_IO | IORESOURCE_MEM)))
+			continue;
+		if (start == 0 || ~start == 0 || end == 0 || ~end == 0)
+			 continue;
+		eeh_addr_cache_insert(dev, start, end, flags);
+	}
+}
+
+/**
+ * eeh_addr_cache_insert_dev - Add a device to the address cache
+ * @dev: PCI device whose I/O addresses we are interested in.
+ *
+ * In order to support the fast lookup of devices based on addresses,
+ * we maintain a cache of devices that can be quickly searched.
+ * This routine adds a device to that cache.
+ */
+void eeh_addr_cache_insert_dev(struct pci_dev *dev)
+{
+	unsigned long flags;
+
+	/* Ignore PCI bridges */
+	if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE)
+		return;
+
+	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+	__eeh_addr_cache_insert_dev(dev);
+	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+}
+
+static inline void __eeh_addr_cache_rmv_dev(struct pci_dev *dev)
+{
+	struct rb_node *n;
+
+restart:
+	n = rb_first(&pci_io_addr_cache_root.rb_root);
+	while (n) {
+		struct pci_io_addr_range *piar;
+		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+
+		if (piar->pcidev == dev) {
+			rb_erase(n, &pci_io_addr_cache_root.rb_root);
+			pci_dev_put(piar->pcidev);
+			kfree(piar);
+			goto restart;
+		}
+		n = rb_next(n);
+	}
+}
+
+/**
+ * eeh_addr_cache_rmv_dev - remove pci device from addr cache
+ * @dev: device to remove
+ *
+ * Remove a device from the addr-cache tree.
+ * This is potentially expensive, since it will walk
+ * the tree multiple times (once per resource).
+ * But so what; device removal doesn't need to be that fast.
+ */
+void eeh_addr_cache_rmv_dev(struct pci_dev *dev)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+	__eeh_addr_cache_rmv_dev(dev);
+	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+}
+
+/**
+ * eeh_addr_cache_build - Build a cache of I/O addresses
+ *
+ * Build a cache of pci i/o addresses.  This cache will be used to
+ * find the pci device that corresponds to a given address.
+ * This routine scans all pci busses to build the cache.
+ * Must be run late in boot process, after the pci controllers
+ * have been scanned for devices (after all device resources are known).
+ */
+void __init eeh_addr_cache_build(void)
+{
+	struct device_node *dn;
+	struct eeh_dev *edev;
+	struct pci_dev *dev = NULL;
+
+	spin_lock_init(&pci_io_addr_cache_root.piar_lock);
+
+	for_each_pci_dev(dev) {
+		eeh_addr_cache_insert_dev(dev);
+
+		dn = pci_device_to_OF_node(dev);
+		if (!dn)
+			continue;
+
+		edev = of_node_to_eeh_dev(dn);
+		if (!edev)
+			continue;
+
+		pci_dev_get(dev);  /* matching put is in eeh_remove_device() */
+		dev->dev.archdata.edev = edev;
+		edev->pdev = dev;
+
+		eeh_sysfs_add_device(dev);
+	}
+
+#ifdef DEBUG
+	/* Verify tree built up above, echo back the list of addrs. */
+	eeh_addr_cache_print(&pci_io_addr_cache_root);
+#endif
+}
diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c
new file mode 100644
index 0000000..1efa28f
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_dev.c
@@ -0,0 +1,112 @@
+/*
+ * The file intends to implement dynamic creation of EEH device, which will
+ * be bound with OF node and PCI device simutaneously. The EEH devices would
+ * be foundamental information for EEH core components to work proerly. Besides,
+ * We have to support multiple situations where dynamic creation of EEH device
+ * is required:
+ *
+ * 1) Before PCI emunation starts, we need create EEH devices according to the
+ *    PCI sensitive OF nodes.
+ * 2) When PCI emunation is done, we need do the binding between PCI device and
+ *    the associated EEH device.
+ * 3) DR (Dynamic Reconfiguration) would create PCI sensitive OF node. EEH device
+ *    will be created while PCI sensitive OF node is detected from DR.
+ * 4) PCI hotplug needs redoing the binding between PCI device and EEH device. If
+ *    PHB is newly inserted, we also need create EEH devices accordingly.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+/**
+ * eeh_dev_init - Create EEH device according to OF node
+ * @dn: device node
+ * @data: PHB
+ *
+ * It will create EEH device according to the given OF node. The function
+ * might be called by PCI emunation, DR, PHB hotplug.
+ */
+void *eeh_dev_init(struct device_node *dn, void *data)
+{
+	struct pci_controller *phb = data;
+	struct eeh_dev *edev;
+
+	/* Allocate EEH device */
+	edev = kzalloc(sizeof(*edev), GFP_KERNEL);
+	if (!edev) {
+		pr_warning("%s: out of memory\n", __func__);
+		return NULL;
+	}
+
+	/* Associate EEH device with OF node */
+	PCI_DN(dn)->edev = edev;
+	edev->dn  = dn;
+	edev->phb = phb;
+	INIT_LIST_HEAD(&edev->list);
+
+	return NULL;
+}
+
+/**
+ * eeh_dev_phb_init_dynamic - Create EEH devices for devices included in PHB
+ * @phb: PHB
+ *
+ * Scan the PHB OF node and its child association, then create the
+ * EEH devices accordingly
+ */
+void eeh_dev_phb_init_dynamic(struct pci_controller *phb)
+{
+	struct device_node *dn = phb->dn;
+
+	/* EEH PE for PHB */
+	eeh_phb_pe_create(phb);
+
+	/* EEH device for PHB */
+	eeh_dev_init(dn, phb);
+
+	/* EEH devices for children OF nodes */
+	traverse_pci_devices(dn, eeh_dev_init, phb);
+}
+
+/**
+ * eeh_dev_phb_init - Create EEH devices for devices included in existing PHBs
+ *
+ * Scan all the existing PHBs and create EEH devices for their OF
+ * nodes and their children OF nodes
+ */
+static int __init eeh_dev_phb_init(void)
+{
+	struct pci_controller *phb, *tmp;
+
+	list_for_each_entry_safe(phb, tmp, &hose_list, list_node)
+		eeh_dev_phb_init_dynamic(phb);
+
+	pr_info("EEH: devices created\n");
+
+	return 0;
+}
+
+core_initcall(eeh_dev_phb_init);
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
new file mode 100644
index 0000000..fb927af
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -0,0 +1,551 @@
+/*
+ * PCI Error Recovery Driver for RPA-compliant PPC64 platform.
+ * Copyright IBM Corp. 2004 2005
+ * Copyright Linas Vepstas <linas@linas.org> 2004, 2005
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
+ */
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/ppc-pci.h>
+#include <asm/pci-bridge.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+
+/**
+ * eeh_pcid_name - Retrieve name of PCI device driver
+ * @pdev: PCI device
+ *
+ * This routine is used to retrieve the name of PCI device driver
+ * if that's valid.
+ */
+static inline const char *eeh_pcid_name(struct pci_dev *pdev)
+{
+	if (pdev && pdev->dev.driver)
+		return pdev->dev.driver->name;
+	return "";
+}
+
+/**
+ * eeh_pcid_get - Get the PCI device driver
+ * @pdev: PCI device
+ *
+ * The function is used to retrieve the PCI device driver for
+ * the indicated PCI device. Besides, we will increase the reference
+ * of the PCI device driver to prevent that being unloaded on
+ * the fly. Otherwise, kernel crash would be seen.
+ */
+static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev)
+{
+	if (!pdev || !pdev->driver)
+		return NULL;
+
+	if (!try_module_get(pdev->driver->driver.owner))
+		return NULL;
+
+	return pdev->driver;
+}
+
+/**
+ * eeh_pcid_put - Dereference on the PCI device driver
+ * @pdev: PCI device
+ *
+ * The function is called to do dereference on the PCI device
+ * driver of the indicated PCI device.
+ */
+static inline void eeh_pcid_put(struct pci_dev *pdev)
+{
+	if (!pdev || !pdev->driver)
+		return;
+
+	module_put(pdev->driver->driver.owner);
+}
+
+#if 0
+static void print_device_node_tree(struct pci_dn *pdn, int dent)
+{
+	int i;
+	struct device_node *pc;
+
+	if (!pdn)
+		return;
+	for (i = 0; i < dent; i++)
+		printk(" ");
+	printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n",
+		pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr,
+		pdn->eeh_pe_config_addr, pdn->node->full_name);
+	dent += 3;
+	pc = pdn->node->child;
+	while (pc) {
+		print_device_node_tree(PCI_DN(pc), dent);
+		pc = pc->sibling;
+	}
+}
+#endif
+
+/**
+ * eeh_disable_irq - Disable interrupt for the recovering device
+ * @dev: PCI device
+ *
+ * This routine must be called when reporting temporary or permanent
+ * error to the particular PCI device to disable interrupt of that
+ * device. If the device has enabled MSI or MSI-X interrupt, we needn't
+ * do real work because EEH should freeze DMA transfers for those PCI
+ * devices encountering EEH errors, which includes MSI or MSI-X.
+ */
+static void eeh_disable_irq(struct pci_dev *dev)
+{
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
+
+	/* Don't disable MSI and MSI-X interrupts. They are
+	 * effectively disabled by the DMA Stopped state
+	 * when an EEH error occurs.
+	 */
+	if (dev->msi_enabled || dev->msix_enabled)
+		return;
+
+	if (!irq_has_action(dev->irq))
+		return;
+
+	edev->mode |= EEH_DEV_IRQ_DISABLED;
+	disable_irq_nosync(dev->irq);
+}
+
+/**
+ * eeh_enable_irq - Enable interrupt for the recovering device
+ * @dev: PCI device
+ *
+ * This routine must be called to enable interrupt while failed
+ * device could be resumed.
+ */
+static void eeh_enable_irq(struct pci_dev *dev)
+{
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
+
+	if ((edev->mode) & EEH_DEV_IRQ_DISABLED) {
+		edev->mode &= ~EEH_DEV_IRQ_DISABLED;
+		enable_irq(dev->irq);
+	}
+}
+
+/**
+ * eeh_report_error - Report pci error to each device driver
+ * @data: eeh device
+ * @userdata: return value
+ *
+ * Report an EEH error to each device driver, collect up and
+ * merge the device driver responses. Cumulative response
+ * passed back in "userdata".
+ */
+static void *eeh_report_error(void *data, void *userdata)
+{
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+	enum pci_ers_result rc, *res = userdata;
+	struct pci_driver *driver;
+
+	/* We might not have the associated PCI device,
+	 * then we should continue for next one.
+	 */
+	if (!dev) return NULL;
+	dev->error_state = pci_channel_io_frozen;
+
+	driver = eeh_pcid_get(dev);
+	if (!driver) return NULL;
+
+	eeh_disable_irq(dev);
+
+	if (!driver->err_handler ||
+	    !driver->err_handler->error_detected) {
+		eeh_pcid_put(dev);
+		return NULL;
+	}
+
+	rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
+
+	/* A driver that needs a reset trumps all others */
+	if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
+	if (*res == PCI_ERS_RESULT_NONE) *res = rc;
+
+	eeh_pcid_put(dev);
+	return NULL;
+}
+
+/**
+ * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled
+ * @data: eeh device
+ * @userdata: return value
+ *
+ * Tells each device driver that IO ports, MMIO and config space I/O
+ * are now enabled. Collects up and merges the device driver responses.
+ * Cumulative response passed back in "userdata".
+ */
+static void *eeh_report_mmio_enabled(void *data, void *userdata)
+{
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+	enum pci_ers_result rc, *res = userdata;
+	struct pci_driver *driver;
+
+	driver = eeh_pcid_get(dev);
+	if (!driver) return NULL;
+
+	if (!driver->err_handler ||
+	    !driver->err_handler->mmio_enabled) {
+		eeh_pcid_put(dev);
+		return NULL;
+	}
+
+	rc = driver->err_handler->mmio_enabled(dev);
+
+	/* A driver that needs a reset trumps all others */
+	if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
+	if (*res == PCI_ERS_RESULT_NONE) *res = rc;
+
+	eeh_pcid_put(dev);
+	return NULL;
+}
+
+/**
+ * eeh_report_reset - Tell device that slot has been reset
+ * @data: eeh device
+ * @userdata: return value
+ *
+ * This routine must be called while EEH tries to reset particular
+ * PCI device so that the associated PCI device driver could take
+ * some actions, usually to save data the driver needs so that the
+ * driver can work again while the device is recovered.
+ */
+static void *eeh_report_reset(void *data, void *userdata)
+{
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+	enum pci_ers_result rc, *res = userdata;
+	struct pci_driver *driver;
+
+	if (!dev) return NULL;
+	dev->error_state = pci_channel_io_normal;
+
+	driver = eeh_pcid_get(dev);
+	if (!driver) return NULL;
+
+	eeh_enable_irq(dev);
+
+	if (!driver->err_handler ||
+	    !driver->err_handler->slot_reset) {
+		eeh_pcid_put(dev);
+		return NULL;
+	}
+
+	rc = driver->err_handler->slot_reset(dev);
+	if ((*res == PCI_ERS_RESULT_NONE) ||
+	    (*res == PCI_ERS_RESULT_RECOVERED)) *res = rc;
+	if (*res == PCI_ERS_RESULT_DISCONNECT &&
+	     rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
+
+	eeh_pcid_put(dev);
+	return NULL;
+}
+
+/**
+ * eeh_report_resume - Tell device to resume normal operations
+ * @data: eeh device
+ * @userdata: return value
+ *
+ * This routine must be called to notify the device driver that it
+ * could resume so that the device driver can do some initialization
+ * to make the recovered device work again.
+ */
+static void *eeh_report_resume(void *data, void *userdata)
+{
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+	struct pci_driver *driver;
+
+	if (!dev) return NULL;
+	dev->error_state = pci_channel_io_normal;
+
+	driver = eeh_pcid_get(dev);
+	if (!driver) return NULL;
+
+	eeh_enable_irq(dev);
+
+	if (!driver->err_handler ||
+	    !driver->err_handler->resume) {
+		eeh_pcid_put(dev);
+		return NULL;
+	}
+
+	driver->err_handler->resume(dev);
+
+	eeh_pcid_put(dev);
+	return NULL;
+}
+
+/**
+ * eeh_report_failure - Tell device driver that device is dead.
+ * @data: eeh device
+ * @userdata: return value
+ *
+ * This informs the device driver that the device is permanently
+ * dead, and that no further recovery attempts will be made on it.
+ */
+static void *eeh_report_failure(void *data, void *userdata)
+{
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+	struct pci_driver *driver;
+
+	if (!dev) return NULL;
+	dev->error_state = pci_channel_io_perm_failure;
+
+	driver = eeh_pcid_get(dev);
+	if (!driver) return NULL;
+
+	eeh_disable_irq(dev);
+
+	if (!driver->err_handler ||
+	    !driver->err_handler->error_detected) {
+		eeh_pcid_put(dev);
+		return NULL;
+	}
+
+	driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
+
+	eeh_pcid_put(dev);
+	return NULL;
+}
+
+/**
+ * eeh_reset_device - Perform actual reset of a pci slot
+ * @pe: EEH PE
+ * @bus: PCI bus corresponding to the isolcated slot
+ *
+ * This routine must be called to do reset on the indicated PE.
+ * During the reset, udev might be invoked because those affected
+ * PCI devices will be removed and then added.
+ */
+static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
+{
+	int cnt, rc;
+
+	/* pcibios will clear the counter; save the value */
+	cnt = pe->freeze_count;
+
+	/*
+	 * We don't remove the corresponding PE instances because
+	 * we need the information afterwords. The attached EEH
+	 * devices are expected to be attached soon when calling
+	 * into pcibios_add_pci_devices().
+	 */
+	if (bus)
+		__pcibios_remove_pci_devices(bus, 0);
+
+	/* Reset the pci controller. (Asserts RST#; resets config space).
+	 * Reconfigure bridges and devices. Don't try to bring the system
+	 * up if the reset failed for some reason.
+	 */
+	rc = eeh_reset_pe(pe);
+	if (rc)
+		return rc;
+
+	/* Restore PE */
+	eeh_ops->configure_bridge(pe);
+	eeh_pe_restore_bars(pe);
+
+	/* Give the system 5 seconds to finish running the user-space
+	 * hotplug shutdown scripts, e.g. ifdown for ethernet.  Yes,
+	 * this is a hack, but if we don't do this, and try to bring
+	 * the device up before the scripts have taken it down,
+	 * potentially weird things happen.
+	 */
+	if (bus) {
+		ssleep(5);
+		pcibios_add_pci_devices(bus);
+	}
+	pe->freeze_count = cnt;
+
+	return 0;
+}
+
+/* The longest amount of time to wait for a pci device
+ * to come back on line, in seconds.
+ */
+#define MAX_WAIT_FOR_RECOVERY 150
+
+/**
+ * eeh_handle_event - Reset a PCI device after hard lockup.
+ * @pe: EEH PE
+ *
+ * While PHB detects address or data parity errors on particular PCI
+ * slot, the associated PE will be frozen. Besides, DMA's occurring
+ * to wild addresses (which usually happen due to bugs in device
+ * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
+ * #PERR or other misc PCI-related errors also can trigger EEH errors.
+ *
+ * Recovery process consists of unplugging the device driver (which
+ * generated hotplug events to userspace), then issuing a PCI #RST to
+ * the device, then reconfiguring the PCI config space for all bridges
+ * & devices under this slot, and then finally restarting the device
+ * drivers (which cause a second set of hotplug events to go out to
+ * userspace).
+ */
+void eeh_handle_event(struct eeh_pe *pe)
+{
+	struct pci_bus *frozen_bus;
+	int rc = 0;
+	enum pci_ers_result result = PCI_ERS_RESULT_NONE;
+
+	frozen_bus = eeh_pe_bus_get(pe);
+	if (!frozen_bus) {
+		pr_err("%s: Cannot find PCI bus for PHB#%d-PE#%x\n",
+			__func__, pe->phb->global_number, pe->addr);
+		return;
+	}
+
+	pe->freeze_count++;
+	if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES)
+		goto excess_failures;
+	pr_warning("EEH: This PCI device has failed %d times in the last hour\n",
+		pe->freeze_count);
+
+	/* Walk the various device drivers attached to this slot through
+	 * a reset sequence, giving each an opportunity to do what it needs
+	 * to accomplish the reset.  Each child gets a report of the
+	 * status ... if any child can't handle the reset, then the entire
+	 * slot is dlpar removed and added.
+	 */
+	eeh_pe_dev_traverse(pe, eeh_report_error, &result);
+
+	/* Get the current PCI slot state. This can take a long time,
+	 * sometimes over 3 seconds for certain systems.
+	 */
+	rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
+	if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
+		printk(KERN_WARNING "EEH: Permanent failure\n");
+		goto hard_fail;
+	}
+
+	/* Since rtas may enable MMIO when posting the error log,
+	 * don't post the error log until after all dev drivers
+	 * have been informed.
+	 */
+	eeh_slot_error_detail(pe, EEH_LOG_TEMP);
+
+	/* If all device drivers were EEH-unaware, then shut
+	 * down all of the device drivers, and hope they
+	 * go down willingly, without panicing the system.
+	 */
+	if (result == PCI_ERS_RESULT_NONE) {
+		rc = eeh_reset_device(pe, frozen_bus);
+		if (rc) {
+			printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc);
+			goto hard_fail;
+		}
+	}
+
+	/* If all devices reported they can proceed, then re-enable MMIO */
+	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
+		rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
+
+		if (rc < 0)
+			goto hard_fail;
+		if (rc) {
+			result = PCI_ERS_RESULT_NEED_RESET;
+		} else {
+			result = PCI_ERS_RESULT_NONE;
+			eeh_pe_dev_traverse(pe, eeh_report_mmio_enabled, &result);
+		}
+	}
+
+	/* If all devices reported they can proceed, then re-enable DMA */
+	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
+		rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
+
+		if (rc < 0)
+			goto hard_fail;
+		if (rc)
+			result = PCI_ERS_RESULT_NEED_RESET;
+		else
+			result = PCI_ERS_RESULT_RECOVERED;
+	}
+
+	/* If any device has a hard failure, then shut off everything. */
+	if (result == PCI_ERS_RESULT_DISCONNECT) {
+		printk(KERN_WARNING "EEH: Device driver gave up\n");
+		goto hard_fail;
+	}
+
+	/* If any device called out for a reset, then reset the slot */
+	if (result == PCI_ERS_RESULT_NEED_RESET) {
+		rc = eeh_reset_device(pe, NULL);
+		if (rc) {
+			printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc);
+			goto hard_fail;
+		}
+		result = PCI_ERS_RESULT_NONE;
+		eeh_pe_dev_traverse(pe, eeh_report_reset, &result);
+	}
+
+	/* All devices should claim they have recovered by now. */
+	if ((result != PCI_ERS_RESULT_RECOVERED) &&
+	    (result != PCI_ERS_RESULT_NONE)) {
+		printk(KERN_WARNING "EEH: Not recovered\n");
+		goto hard_fail;
+	}
+
+	/* Tell all device drivers that they can resume operations */
+	eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
+
+	return;
+
+excess_failures:
+	/*
+	 * About 90% of all real-life EEH failures in the field
+	 * are due to poorly seated PCI cards. Only 10% or so are
+	 * due to actual, failed cards.
+	 */
+	pr_err("EEH: PHB#%d-PE#%x has failed %d times in the\n"
+	       "last hour and has been permanently disabled.\n"
+	       "Please try reseating or replacing it.\n",
+		pe->phb->global_number, pe->addr,
+		pe->freeze_count);
+	goto perm_error;
+
+hard_fail:
+	pr_err("EEH: Unable to recover from failure from PHB#%d-PE#%x.\n"
+	       "Please try reseating or replacing it\n",
+		pe->phb->global_number, pe->addr);
+
+perm_error:
+	eeh_slot_error_detail(pe, EEH_LOG_PERM);
+
+	/* Notify all devices that they're about to go down. */
+	eeh_pe_dev_traverse(pe, eeh_report_failure, NULL);
+
+	/* Shut down the device drivers for good. */
+	if (frozen_bus)
+		pcibios_remove_pci_devices(frozen_bus);
+}
diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
new file mode 100644
index 0000000..185bedd
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -0,0 +1,142 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ * Copyright (c) 2005 Linas Vepstas <linas@linas.org>
+ */
+
+#include <linux/delay.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <asm/eeh_event.h>
+#include <asm/ppc-pci.h>
+
+/** Overview:
+ *  EEH error states may be detected within exception handlers;
+ *  however, the recovery processing needs to occur asynchronously
+ *  in a normal kernel context and not an interrupt context.
+ *  This pair of routines creates an event and queues it onto a
+ *  work-queue, where a worker thread can drive recovery.
+ */
+
+/* EEH event workqueue setup. */
+static DEFINE_SPINLOCK(eeh_eventlist_lock);
+LIST_HEAD(eeh_eventlist);
+static void eeh_thread_launcher(struct work_struct *);
+DECLARE_WORK(eeh_event_wq, eeh_thread_launcher);
+
+/* Serialize reset sequences for a given pci device */
+DEFINE_MUTEX(eeh_event_mutex);
+
+/**
+ * eeh_event_handler - Dispatch EEH events.
+ * @dummy - unused
+ *
+ * The detection of a frozen slot can occur inside an interrupt,
+ * where it can be hard to do anything about it.  The goal of this
+ * routine is to pull these detection events out of the context
+ * of the interrupt handler, and re-dispatch them for processing
+ * at a later time in a normal context.
+ */
+static int eeh_event_handler(void * dummy)
+{
+	unsigned long flags;
+	struct eeh_event *event;
+	struct eeh_pe *pe;
+
+	spin_lock_irqsave(&eeh_eventlist_lock, flags);
+	event = NULL;
+
+	/* Unqueue the event, get ready to process. */
+	if (!list_empty(&eeh_eventlist)) {
+		event = list_entry(eeh_eventlist.next, struct eeh_event, list);
+		list_del(&event->list);
+	}
+	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+
+	if (event == NULL)
+		return 0;
+
+	/* Serialize processing of EEH events */
+	mutex_lock(&eeh_event_mutex);
+	pe = event->pe;
+	eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
+	pr_info("EEH: Detected PCI bus error on PHB#%d-PE#%x\n",
+		pe->phb->global_number, pe->addr);
+
+	set_current_state(TASK_INTERRUPTIBLE);	/* Don't add to load average */
+	eeh_handle_event(pe);
+	eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
+
+	kfree(event);
+	mutex_unlock(&eeh_event_mutex);
+
+	/* If there are no new errors after an hour, clear the counter. */
+	if (pe && pe->freeze_count > 0) {
+		msleep_interruptible(3600*1000);
+		if (pe->freeze_count > 0)
+			pe->freeze_count--;
+
+	}
+
+	return 0;
+}
+
+/**
+ * eeh_thread_launcher - Start kernel thread to handle EEH events
+ * @dummy - unused
+ *
+ * This routine is called to start the kernel thread for processing
+ * EEH event.
+ */
+static void eeh_thread_launcher(struct work_struct *dummy)
+{
+	if (IS_ERR(kthread_run(eeh_event_handler, NULL, "eehd")))
+		printk(KERN_ERR "Failed to start EEH daemon\n");
+}
+
+/**
+ * eeh_send_failure_event - Generate a PCI error event
+ * @pe: EEH PE
+ *
+ * This routine can be called within an interrupt context;
+ * the actual event will be delivered in a normal context
+ * (from a workqueue).
+ */
+int eeh_send_failure_event(struct eeh_pe *pe)
+{
+	unsigned long flags;
+	struct eeh_event *event;
+
+	event = kzalloc(sizeof(*event), GFP_ATOMIC);
+	if (!event) {
+		pr_err("EEH: out of memory, event not handled\n");
+		return -ENOMEM;
+	}
+	event->pe = pe;
+
+	/* We may or may not be called in an interrupt context */
+	spin_lock_irqsave(&eeh_eventlist_lock, flags);
+	list_add(&event->list, &eeh_eventlist);
+	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+
+	schedule_work(&eeh_event_wq);
+
+	return 0;
+}
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
new file mode 100644
index 0000000..9d4a9e8
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -0,0 +1,653 @@
+/*
+ * The file intends to implement PE based on the information from
+ * platforms. Basically, there have 3 types of PEs: PHB/Bus/Device.
+ * All the PEs should be organized as hierarchy tree. The first level
+ * of the tree will be associated to existing PHBs since the particular
+ * PE is only meaningful in one PHB domain.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+static LIST_HEAD(eeh_phb_pe);
+
+/**
+ * eeh_pe_alloc - Allocate PE
+ * @phb: PCI controller
+ * @type: PE type
+ *
+ * Allocate PE instance dynamically.
+ */
+static struct eeh_pe *eeh_pe_alloc(struct pci_controller *phb, int type)
+{
+	struct eeh_pe *pe;
+
+	/* Allocate PHB PE */
+	pe = kzalloc(sizeof(struct eeh_pe), GFP_KERNEL);
+	if (!pe) return NULL;
+
+	/* Initialize PHB PE */
+	pe->type = type;
+	pe->phb = phb;
+	INIT_LIST_HEAD(&pe->child_list);
+	INIT_LIST_HEAD(&pe->child);
+	INIT_LIST_HEAD(&pe->edevs);
+
+	return pe;
+}
+
+/**
+ * eeh_phb_pe_create - Create PHB PE
+ * @phb: PCI controller
+ *
+ * The function should be called while the PHB is detected during
+ * system boot or PCI hotplug in order to create PHB PE.
+ */
+int eeh_phb_pe_create(struct pci_controller *phb)
+{
+	struct eeh_pe *pe;
+
+	/* Allocate PHB PE */
+	pe = eeh_pe_alloc(phb, EEH_PE_PHB);
+	if (!pe) {
+		pr_err("%s: out of memory!\n", __func__);
+		return -ENOMEM;
+	}
+
+	/* Put it into the list */
+	eeh_lock();
+	list_add_tail(&pe->child, &eeh_phb_pe);
+	eeh_unlock();
+
+	pr_debug("EEH: Add PE for PHB#%d\n", phb->global_number);
+
+	return 0;
+}
+
+/**
+ * eeh_phb_pe_get - Retrieve PHB PE based on the given PHB
+ * @phb: PCI controller
+ *
+ * The overall PEs form hierarchy tree. The first layer of the
+ * hierarchy tree is composed of PHB PEs. The function is used
+ * to retrieve the corresponding PHB PE according to the given PHB.
+ */
+static struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb)
+{
+	struct eeh_pe *pe;
+
+	list_for_each_entry(pe, &eeh_phb_pe, child) {
+		/*
+		 * Actually, we needn't check the type since
+		 * the PE for PHB has been determined when that
+		 * was created.
+		 */
+		if ((pe->type & EEH_PE_PHB) && pe->phb == phb)
+			return pe;
+	}
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_next - Retrieve the next PE in the tree
+ * @pe: current PE
+ * @root: root PE
+ *
+ * The function is used to retrieve the next PE in the
+ * hierarchy PE tree.
+ */
+static struct eeh_pe *eeh_pe_next(struct eeh_pe *pe,
+				  struct eeh_pe *root)
+{
+	struct list_head *next = pe->child_list.next;
+
+	if (next == &pe->child_list) {
+		while (1) {
+			if (pe == root)
+				return NULL;
+			next = pe->child.next;
+			if (next != &pe->parent->child_list)
+				break;
+			pe = pe->parent;
+		}
+	}
+
+	return list_entry(next, struct eeh_pe, child);
+}
+
+/**
+ * eeh_pe_traverse - Traverse PEs in the specified PHB
+ * @root: root PE
+ * @fn: callback
+ * @flag: extra parameter to callback
+ *
+ * The function is used to traverse the specified PE and its
+ * child PEs. The traversing is to be terminated once the
+ * callback returns something other than NULL, or no more PEs
+ * to be traversed.
+ */
+static void *eeh_pe_traverse(struct eeh_pe *root,
+			eeh_traverse_func fn, void *flag)
+{
+	struct eeh_pe *pe;
+	void *ret;
+
+	for (pe = root; pe; pe = eeh_pe_next(pe, root)) {
+		ret = fn(pe, flag);
+		if (ret) return ret;
+	}
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_dev_traverse - Traverse the devices from the PE
+ * @root: EEH PE
+ * @fn: function callback
+ * @flag: extra parameter to callback
+ *
+ * The function is used to traverse the devices of the specified
+ * PE and its child PEs.
+ */
+void *eeh_pe_dev_traverse(struct eeh_pe *root,
+		eeh_traverse_func fn, void *flag)
+{
+	struct eeh_pe *pe;
+	struct eeh_dev *edev;
+	void *ret;
+
+	if (!root) {
+		pr_warning("%s: Invalid PE %p\n", __func__, root);
+		return NULL;
+	}
+
+	eeh_lock();
+
+	/* Traverse root PE */
+	for (pe = root; pe; pe = eeh_pe_next(pe, root)) {
+		eeh_pe_for_each_dev(pe, edev) {
+			ret = fn(edev, flag);
+			if (ret) {
+				eeh_unlock();
+				return ret;
+			}
+		}
+	}
+
+	eeh_unlock();
+
+	return NULL;
+}
+
+/**
+ * __eeh_pe_get - Check the PE address
+ * @data: EEH PE
+ * @flag: EEH device
+ *
+ * For one particular PE, it can be identified by PE address
+ * or tranditional BDF address. BDF address is composed of
+ * Bus/Device/Function number. The extra data referred by flag
+ * indicates which type of address should be used.
+ */
+static void *__eeh_pe_get(void *data, void *flag)
+{
+	struct eeh_pe *pe = (struct eeh_pe *)data;
+	struct eeh_dev *edev = (struct eeh_dev *)flag;
+
+	/* Unexpected PHB PE */
+	if (pe->type & EEH_PE_PHB)
+		return NULL;
+
+	/* We prefer PE address */
+	if (edev->pe_config_addr &&
+	   (edev->pe_config_addr == pe->addr))
+		return pe;
+
+	/* Try BDF address */
+	if (edev->pe_config_addr &&
+	   (edev->config_addr == pe->config_addr))
+		return pe;
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_get - Search PE based on the given address
+ * @edev: EEH device
+ *
+ * Search the corresponding PE based on the specified address which
+ * is included in the eeh device. The function is used to check if
+ * the associated PE has been created against the PE address. It's
+ * notable that the PE address has 2 format: traditional PE address
+ * which is composed of PCI bus/device/function number, or unified
+ * PE address.
+ */
+static struct eeh_pe *eeh_pe_get(struct eeh_dev *edev)
+{
+	struct eeh_pe *root = eeh_phb_pe_get(edev->phb);
+	struct eeh_pe *pe;
+
+	pe = eeh_pe_traverse(root, __eeh_pe_get, edev);
+
+	return pe;
+}
+
+/**
+ * eeh_pe_get_parent - Retrieve the parent PE
+ * @edev: EEH device
+ *
+ * The whole PEs existing in the system are organized as hierarchy
+ * tree. The function is used to retrieve the parent PE according
+ * to the parent EEH device.
+ */
+static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
+{
+	struct device_node *dn;
+	struct eeh_dev *parent;
+
+	/*
+	 * It might have the case for the indirect parent
+	 * EEH device already having associated PE, but
+	 * the direct parent EEH device doesn't have yet.
+	 */
+	dn = edev->dn->parent;
+	while (dn) {
+		/* We're poking out of PCI territory */
+		if (!PCI_DN(dn)) return NULL;
+
+		parent = of_node_to_eeh_dev(dn);
+		/* We're poking out of PCI territory */
+		if (!parent) return NULL;
+
+		if (parent->pe)
+			return parent->pe;
+
+		dn = dn->parent;
+	}
+
+	return NULL;
+}
+
+/**
+ * eeh_add_to_parent_pe - Add EEH device to parent PE
+ * @edev: EEH device
+ *
+ * Add EEH device to the parent PE. If the parent PE already
+ * exists, the PE type will be changed to EEH_PE_BUS. Otherwise,
+ * we have to create new PE to hold the EEH device and the new
+ * PE will be linked to its parent PE as well.
+ */
+int eeh_add_to_parent_pe(struct eeh_dev *edev)
+{
+	struct eeh_pe *pe, *parent;
+
+	eeh_lock();
+
+	/*
+	 * Search the PE has been existing or not according
+	 * to the PE address. If that has been existing, the
+	 * PE should be composed of PCI bus and its subordinate
+	 * components.
+	 */
+	pe = eeh_pe_get(edev);
+	if (pe && !(pe->type & EEH_PE_INVALID)) {
+		if (!edev->pe_config_addr) {
+			eeh_unlock();
+			pr_err("%s: PE with addr 0x%x already exists\n",
+				__func__, edev->config_addr);
+			return -EEXIST;
+		}
+
+		/* Mark the PE as type of PCI bus */
+		pe->type = EEH_PE_BUS;
+		edev->pe = pe;
+
+		/* Put the edev to PE */
+		list_add_tail(&edev->list, &pe->edevs);
+		eeh_unlock();
+		pr_debug("EEH: Add %s to Bus PE#%x\n",
+			edev->dn->full_name, pe->addr);
+
+		return 0;
+	} else if (pe && (pe->type & EEH_PE_INVALID)) {
+		list_add_tail(&edev->list, &pe->edevs);
+		edev->pe = pe;
+		/*
+		 * We're running to here because of PCI hotplug caused by
+		 * EEH recovery. We need clear EEH_PE_INVALID until the top.
+		 */
+		parent = pe;
+		while (parent) {
+			if (!(parent->type & EEH_PE_INVALID))
+				break;
+			parent->type &= ~EEH_PE_INVALID;
+			parent = parent->parent;
+		}
+		eeh_unlock();
+		pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n",
+			edev->dn->full_name, pe->addr, pe->parent->addr);
+
+		return 0;
+	}
+
+	/* Create a new EEH PE */
+	pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
+	if (!pe) {
+		eeh_unlock();
+		pr_err("%s: out of memory!\n", __func__);
+		return -ENOMEM;
+	}
+	pe->addr	= edev->pe_config_addr;
+	pe->config_addr	= edev->config_addr;
+
+	/*
+	 * Put the new EEH PE into hierarchy tree. If the parent
+	 * can't be found, the newly created PE will be attached
+	 * to PHB directly. Otherwise, we have to associate the
+	 * PE with its parent.
+	 */
+	parent = eeh_pe_get_parent(edev);
+	if (!parent) {
+		parent = eeh_phb_pe_get(edev->phb);
+		if (!parent) {
+			eeh_unlock();
+			pr_err("%s: No PHB PE is found (PHB Domain=%d)\n",
+				__func__, edev->phb->global_number);
+			edev->pe = NULL;
+			kfree(pe);
+			return -EEXIST;
+		}
+	}
+	pe->parent = parent;
+
+	/*
+	 * Put the newly created PE into the child list and
+	 * link the EEH device accordingly.
+	 */
+	list_add_tail(&pe->child, &parent->child_list);
+	list_add_tail(&edev->list, &pe->edevs);
+	edev->pe = pe;
+	eeh_unlock();
+	pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n",
+		edev->dn->full_name, pe->addr, pe->parent->addr);
+
+	return 0;
+}
+
+/**
+ * eeh_rmv_from_parent_pe - Remove one EEH device from the associated PE
+ * @edev: EEH device
+ * @purge_pe: remove PE or not
+ *
+ * The PE hierarchy tree might be changed when doing PCI hotplug.
+ * Also, the PCI devices or buses could be removed from the system
+ * during EEH recovery. So we have to call the function remove the
+ * corresponding PE accordingly if necessary.
+ */
+int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe)
+{
+	struct eeh_pe *pe, *parent, *child;
+	int cnt;
+
+	if (!edev->pe) {
+		pr_warning("%s: No PE found for EEH device %s\n",
+			__func__, edev->dn->full_name);
+		return -EEXIST;
+	}
+
+	eeh_lock();
+
+	/* Remove the EEH device */
+	pe = edev->pe;
+	edev->pe = NULL;
+	list_del(&edev->list);
+
+	/*
+	 * Check if the parent PE includes any EEH devices.
+	 * If not, we should delete that. Also, we should
+	 * delete the parent PE if it doesn't have associated
+	 * child PEs and EEH devices.
+	 */
+	while (1) {
+		parent = pe->parent;
+		if (pe->type & EEH_PE_PHB)
+			break;
+
+		if (purge_pe) {
+			if (list_empty(&pe->edevs) &&
+			    list_empty(&pe->child_list)) {
+				list_del(&pe->child);
+				kfree(pe);
+			} else {
+				break;
+			}
+		} else {
+			if (list_empty(&pe->edevs)) {
+				cnt = 0;
+				list_for_each_entry(child, &pe->child_list, child) {
+					if (!(child->type & EEH_PE_INVALID)) {
+						cnt++;
+						break;
+					}
+				}
+
+				if (!cnt)
+					pe->type |= EEH_PE_INVALID;
+				else
+					break;
+			}
+		}
+
+		pe = parent;
+	}
+
+	eeh_unlock();
+
+	return 0;
+}
+
+/**
+ * __eeh_pe_state_mark - Mark the state for the PE
+ * @data: EEH PE
+ * @flag: state
+ *
+ * The function is used to mark the indicated state for the given
+ * PE. Also, the associated PCI devices will be put into IO frozen
+ * state as well.
+ */
+static void *__eeh_pe_state_mark(void *data, void *flag)
+{
+	struct eeh_pe *pe = (struct eeh_pe *)data;
+	int state = *((int *)flag);
+	struct eeh_dev *tmp;
+	struct pci_dev *pdev;
+
+	/*
+	 * Mark the PE with the indicated state. Also,
+	 * the associated PCI device will be put into
+	 * I/O frozen state to avoid I/O accesses from
+	 * the PCI device driver.
+	 */
+	pe->state |= state;
+	eeh_pe_for_each_dev(pe, tmp) {
+		pdev = eeh_dev_to_pci_dev(tmp);
+		if (pdev)
+			pdev->error_state = pci_channel_io_frozen;
+	}
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_state_mark - Mark specified state for PE and its associated device
+ * @pe: EEH PE
+ *
+ * EEH error affects the current PE and its child PEs. The function
+ * is used to mark appropriate state for the affected PEs and the
+ * associated devices.
+ */
+void eeh_pe_state_mark(struct eeh_pe *pe, int state)
+{
+	eeh_lock();
+	eeh_pe_traverse(pe, __eeh_pe_state_mark, &state);
+	eeh_unlock();
+}
+
+/**
+ * __eeh_pe_state_clear - Clear state for the PE
+ * @data: EEH PE
+ * @flag: state
+ *
+ * The function is used to clear the indicated state from the
+ * given PE. Besides, we also clear the check count of the PE
+ * as well.
+ */
+static void *__eeh_pe_state_clear(void *data, void *flag)
+{
+	struct eeh_pe *pe = (struct eeh_pe *)data;
+	int state = *((int *)flag);
+
+	pe->state &= ~state;
+	pe->check_count = 0;
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_state_clear - Clear state for the PE and its children
+ * @pe: PE
+ * @state: state to be cleared
+ *
+ * When the PE and its children has been recovered from error,
+ * we need clear the error state for that. The function is used
+ * for the purpose.
+ */
+void eeh_pe_state_clear(struct eeh_pe *pe, int state)
+{
+	eeh_lock();
+	eeh_pe_traverse(pe, __eeh_pe_state_clear, &state);
+	eeh_unlock();
+}
+
+/**
+ * eeh_restore_one_device_bars - Restore the Base Address Registers for one device
+ * @data: EEH device
+ * @flag: Unused
+ *
+ * Loads the PCI configuration space base address registers,
+ * the expansion ROM base address, the latency timer, and etc.
+ * from the saved values in the device node.
+ */
+static void *eeh_restore_one_device_bars(void *data, void *flag)
+{
+	int i;
+	u32 cmd;
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct device_node *dn = eeh_dev_to_of_node(edev);
+
+	for (i = 4; i < 10; i++)
+		eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]);
+	/* 12 == Expansion ROM Address */
+	eeh_ops->write_config(dn, 12*4, 4, edev->config_space[12]);
+
+#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF))
+#define SAVED_BYTE(OFF) (((u8 *)(edev->config_space))[BYTE_SWAP(OFF)])
+
+	eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1,
+		SAVED_BYTE(PCI_CACHE_LINE_SIZE));
+	eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1,
+		SAVED_BYTE(PCI_LATENCY_TIMER));
+
+	/* max latency, min grant, interrupt pin and line */
+	eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]);
+
+	/*
+	 * Restore PERR & SERR bits, some devices require it,
+	 * don't touch the other command bits
+	 */
+	eeh_ops->read_config(dn, PCI_COMMAND, 4, &cmd);
+	if (edev->config_space[1] & PCI_COMMAND_PARITY)
+		cmd |= PCI_COMMAND_PARITY;
+	else
+		cmd &= ~PCI_COMMAND_PARITY;
+	if (edev->config_space[1] & PCI_COMMAND_SERR)
+		cmd |= PCI_COMMAND_SERR;
+	else
+		cmd &= ~PCI_COMMAND_SERR;
+	eeh_ops->write_config(dn, PCI_COMMAND, 4, cmd);
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_restore_bars - Restore the PCI config space info
+ * @pe: EEH PE
+ *
+ * This routine performs a recursive walk to the children
+ * of this device as well.
+ */
+void eeh_pe_restore_bars(struct eeh_pe *pe)
+{
+	/*
+	 * We needn't take the EEH lock since eeh_pe_dev_traverse()
+	 * will take that.
+	 */
+	eeh_pe_dev_traverse(pe, eeh_restore_one_device_bars, NULL);
+}
+
+/**
+ * eeh_pe_bus_get - Retrieve PCI bus according to the given PE
+ * @pe: EEH PE
+ *
+ * Retrieve the PCI bus according to the given PE. Basically,
+ * there're 3 types of PEs: PHB/Bus/Device. For PHB PE, the
+ * primary PCI bus will be retrieved. The parent bus will be
+ * returned for BUS PE. However, we don't have associated PCI
+ * bus for DEVICE PE.
+ */
+struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
+{
+	struct pci_bus *bus = NULL;
+	struct eeh_dev *edev;
+	struct pci_dev *pdev;
+
+	eeh_lock();
+
+	if (pe->type & EEH_PE_PHB) {
+		bus = pe->phb->bus;
+	} else if (pe->type & EEH_PE_BUS ||
+		   pe->type & EEH_PE_DEVICE) {
+		edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
+		pdev = eeh_dev_to_pci_dev(edev);
+		if (pdev)
+			bus = pdev->bus;
+	}
+
+	eeh_unlock();
+
+	return bus;
+}
diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c
new file mode 100644
index 0000000..e7ae348
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_sysfs.c
@@ -0,0 +1,74 @@
+/*
+ * Sysfs entries for PCI Error Recovery for PAPR-compliant platform.
+ * Copyright IBM Corporation 2007
+ * Copyright Linas Vepstas <linas@austin.ibm.com> 2007
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
+ */
+#include <linux/pci.h>
+#include <linux/stat.h>
+#include <asm/ppc-pci.h>
+#include <asm/pci-bridge.h>
+
+/**
+ * EEH_SHOW_ATTR -- Create sysfs entry for eeh statistic
+ * @_name: name of file in sysfs directory
+ * @_memb: name of member in struct pci_dn to access
+ * @_format: printf format for display
+ *
+ * All of the attributes look very similar, so just
+ * auto-gen a cut-n-paste routine to display them.
+ */
+#define EEH_SHOW_ATTR(_name,_memb,_format)               \
+static ssize_t eeh_show_##_name(struct device *dev,      \
+		struct device_attribute *attr, char *buf)          \
+{                                                        \
+	struct pci_dev *pdev = to_pci_dev(dev);               \
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);      \
+	                                                      \
+	if (!edev)                                            \
+		return 0;                                     \
+	                                                      \
+	return sprintf(buf, _format "\n", edev->_memb);       \
+}                                                        \
+static DEVICE_ATTR(_name, S_IRUGO, eeh_show_##_name, NULL);
+
+EEH_SHOW_ATTR(eeh_mode,            mode,            "0x%x");
+EEH_SHOW_ATTR(eeh_config_addr,     config_addr,     "0x%x");
+EEH_SHOW_ATTR(eeh_pe_config_addr,  pe_config_addr,  "0x%x");
+
+void eeh_sysfs_add_device(struct pci_dev *pdev)
+{
+	int rc=0;
+
+	rc += device_create_file(&pdev->dev, &dev_attr_eeh_mode);
+	rc += device_create_file(&pdev->dev, &dev_attr_eeh_config_addr);
+	rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
+
+	if (rc)
+		printk(KERN_WARNING "EEH: Unable to create sysfs entries\n");
+}
+
+void eeh_sysfs_remove_device(struct pci_dev *pdev)
+{
+	device_remove_file(&pdev->dev, &dev_attr_eeh_mode);
+	device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr);
+	device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
+}
diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
new file mode 100644
index 0000000..3f60880
--- /dev/null
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -0,0 +1,111 @@
+/*
+ * Derived from "arch/powerpc/platforms/pseries/pci_dlpar.c"
+ *
+ * Copyright (C) 2003 Linda Xie <lxie@us.ibm.com>
+ * Copyright (C) 2005 International Business Machines
+ *
+ * Updates, 2005, John Rose <johnrose@austin.ibm.com>
+ * Updates, 2005, Linas Vepstas <linas@austin.ibm.com>
+ * Updates, 2013, Gavin Shan <shangw@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/pci.h>
+#include <linux/export.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/firmware.h>
+#include <asm/eeh.h>
+
+/**
+ * __pcibios_remove_pci_devices - remove all devices under this bus
+ * @bus: the indicated PCI bus
+ * @purge_pe: destroy the PE on removal of PCI devices
+ *
+ * Remove all of the PCI devices under this bus both from the
+ * linux pci device tree, and from the powerpc EEH address cache.
+ * By default, the corresponding PE will be destroied during the
+ * normal PCI hotplug path. For PCI hotplug during EEH recovery,
+ * the corresponding PE won't be destroied and deallocated.
+ */
+void __pcibios_remove_pci_devices(struct pci_bus *bus, int purge_pe)
+{
+	struct pci_dev *dev, *tmp;
+	struct pci_bus *child_bus;
+
+	/* First go down child busses */
+	list_for_each_entry(child_bus, &bus->children, node)
+		__pcibios_remove_pci_devices(child_bus, purge_pe);
+
+	pr_debug("PCI: Removing devices on bus %04x:%02x\n",
+		 pci_domain_nr(bus),  bus->number);
+	list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) {
+		pr_debug("     * Removing %s...\n", pci_name(dev));
+		eeh_remove_bus_device(dev, purge_pe);
+		pci_stop_and_remove_bus_device(dev);
+	}
+}
+
+/**
+ * pcibios_remove_pci_devices - remove all devices under this bus
+ * @bus: the indicated PCI bus
+ *
+ * Remove all of the PCI devices under this bus both from the
+ * linux pci device tree, and from the powerpc EEH address cache.
+ */
+void pcibios_remove_pci_devices(struct pci_bus *bus)
+{
+	__pcibios_remove_pci_devices(bus, 1);
+}
+EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices);
+
+/**
+ * pcibios_add_pci_devices - adds new pci devices to bus
+ * @bus: the indicated PCI bus
+ *
+ * This routine will find and fixup new pci devices under
+ * the indicated bus. This routine presumes that there
+ * might already be some devices under this bridge, so
+ * it carefully tries to add only new devices.  (And that
+ * is how this routine differs from other, similar pcibios
+ * routines.)
+ */
+void pcibios_add_pci_devices(struct pci_bus * bus)
+{
+	int slotno, num, mode, pass, max;
+	struct pci_dev *dev;
+	struct device_node *dn = pci_bus_to_OF_node(bus);
+
+	eeh_add_device_tree_early(dn);
+
+	mode = PCI_PROBE_NORMAL;
+	if (ppc_md.pci_probe_mode)
+		mode = ppc_md.pci_probe_mode(bus);
+
+	if (mode == PCI_PROBE_DEVTREE) {
+		/* use ofdt-based probe */
+		of_rescan_bus(dn, bus);
+	} else if (mode == PCI_PROBE_NORMAL) {
+		/* use legacy probe */
+		slotno = PCI_SLOT(PCI_DN(dn->child)->devfn);
+		num = pci_scan_slot(bus, PCI_DEVFN(slotno, 0));
+		if (!num)
+			return;
+		pcibios_setup_bus_devices(bus);
+		max = bus->busn_res.start;
+		for (pass = 0; pass < 2; pass++) {
+			list_for_each_entry(dev, &bus->devices, bus_list) {
+				if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
+				    dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)
+					max = pci_scan_bridge(bus, dev,
+							      max, pass);
+			}
+		}
+	}
+	pcibios_finish_adding_to_bus(bus);
+}
+EXPORT_SYMBOL_GPL(pcibios_add_pci_devices);
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index b62aab3..bed8c60 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -164,6 +164,11 @@ config IBMEBUS
 	help
 	  Bus device driver for GX bus based adapters.
 
+config EEH
+	bool
+	depends on (PPC_POWERNV || PPC_PSERIES) && PCI
+	default y
+
 config PPC_MPC106
 	bool
 	default n
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 4459eff..1bd3399 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -33,11 +33,6 @@ config PPC_SPLPAR
 	  processors, that is, which share physical processors between
 	  two or more partitions.
 
-config EEH
-	bool
-	depends on PPC_PSERIES && PCI
-	default y
-
 config PSERIES_MSI
        bool
        depends on PCI_MSI && EEH
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index 53866e5..8ae0103 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -6,9 +6,7 @@ obj-y			:= lpar.o hvCall.o nvram.o reconfig.o \
 			   firmware.o power.o dlpar.o mobility.o
 obj-$(CONFIG_SMP)	+= smp.o
 obj-$(CONFIG_SCANLOG)	+= scanlog.o
-obj-$(CONFIG_EEH)	+= eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \
-			   eeh_driver.o eeh_event.o eeh_sysfs.o \
-			   eeh_pseries.o
+obj-$(CONFIG_EEH)	+= eeh_pseries.o
 obj-$(CONFIG_KEXEC)	+= kexec.o
 obj-$(CONFIG_PCI)	+= pci.o pci_dlpar.o
 obj-$(CONFIG_PSERIES_MSI)	+= msi.o
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
deleted file mode 100644
index 8a83451..0000000
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ /dev/null
@@ -1,942 +0,0 @@
-/*
- * Copyright IBM Corporation 2001, 2005, 2006
- * Copyright Dave Engebretsen & Todd Inglett 2001
- * Copyright Linas Vepstas 2005, 2006
- * Copyright 2001-2012 IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- *
- * Please address comments and feedback to Linas Vepstas <linas@austin.ibm.com>
- */
-
-#include <linux/delay.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/pci.h>
-#include <linux/proc_fs.h>
-#include <linux/rbtree.h>
-#include <linux/seq_file.h>
-#include <linux/spinlock.h>
-#include <linux/export.h>
-#include <linux/of.h>
-
-#include <linux/atomic.h>
-#include <asm/eeh.h>
-#include <asm/eeh_event.h>
-#include <asm/io.h>
-#include <asm/machdep.h>
-#include <asm/ppc-pci.h>
-#include <asm/rtas.h>
-
-
-/** Overview:
- *  EEH, or "Extended Error Handling" is a PCI bridge technology for
- *  dealing with PCI bus errors that can't be dealt with within the
- *  usual PCI framework, except by check-stopping the CPU.  Systems
- *  that are designed for high-availability/reliability cannot afford
- *  to crash due to a "mere" PCI error, thus the need for EEH.
- *  An EEH-capable bridge operates by converting a detected error
- *  into a "slot freeze", taking the PCI adapter off-line, making
- *  the slot behave, from the OS'es point of view, as if the slot
- *  were "empty": all reads return 0xff's and all writes are silently
- *  ignored.  EEH slot isolation events can be triggered by parity
- *  errors on the address or data busses (e.g. during posted writes),
- *  which in turn might be caused by low voltage on the bus, dust,
- *  vibration, humidity, radioactivity or plain-old failed hardware.
- *
- *  Note, however, that one of the leading causes of EEH slot
- *  freeze events are buggy device drivers, buggy device microcode,
- *  or buggy device hardware.  This is because any attempt by the
- *  device to bus-master data to a memory address that is not
- *  assigned to the device will trigger a slot freeze.   (The idea
- *  is to prevent devices-gone-wild from corrupting system memory).
- *  Buggy hardware/drivers will have a miserable time co-existing
- *  with EEH.
- *
- *  Ideally, a PCI device driver, when suspecting that an isolation
- *  event has occurred (e.g. by reading 0xff's), will then ask EEH
- *  whether this is the case, and then take appropriate steps to
- *  reset the PCI slot, the PCI device, and then resume operations.
- *  However, until that day,  the checking is done here, with the
- *  eeh_check_failure() routine embedded in the MMIO macros.  If
- *  the slot is found to be isolated, an "EEH Event" is synthesized
- *  and sent out for processing.
- */
-
-/* If a device driver keeps reading an MMIO register in an interrupt
- * handler after a slot isolation event, it might be broken.
- * This sets the threshold for how many read attempts we allow
- * before printing an error message.
- */
-#define EEH_MAX_FAILS	2100000
-
-/* Time to wait for a PCI slot to report status, in milliseconds */
-#define PCI_BUS_RESET_WAIT_MSEC (60*1000)
-
-/* Platform dependent EEH operations */
-struct eeh_ops *eeh_ops = NULL;
-
-int eeh_subsystem_enabled;
-EXPORT_SYMBOL(eeh_subsystem_enabled);
-
-/*
- * EEH probe mode support. The intention is to support multiple
- * platforms for EEH. Some platforms like pSeries do PCI emunation
- * based on device tree. However, other platforms like powernv probe
- * PCI devices from hardware. The flag is used to distinguish that.
- * In addition, struct eeh_ops::probe would be invoked for particular
- * OF node or PCI device so that the corresponding PE would be created
- * there.
- */
-int eeh_probe_mode;
-
-/* Global EEH mutex */
-DEFINE_MUTEX(eeh_mutex);
-
-/* Lock to avoid races due to multiple reports of an error */
-static DEFINE_RAW_SPINLOCK(confirm_error_lock);
-
-/* Buffer for reporting pci register dumps. Its here in BSS, and
- * not dynamically alloced, so that it ends up in RMO where RTAS
- * can access it.
- */
-#define EEH_PCI_REGS_LOG_LEN 4096
-static unsigned char pci_regs_buf[EEH_PCI_REGS_LOG_LEN];
-
-/*
- * The struct is used to maintain the EEH global statistic
- * information. Besides, the EEH global statistics will be
- * exported to user space through procfs
- */
-struct eeh_stats {
-	u64 no_device;		/* PCI device not found		*/
-	u64 no_dn;		/* OF node not found		*/
-	u64 no_cfg_addr;	/* Config address not found	*/
-	u64 ignored_check;	/* EEH check skipped		*/
-	u64 total_mmio_ffs;	/* Total EEH checks		*/
-	u64 false_positives;	/* Unnecessary EEH checks	*/
-	u64 slot_resets;	/* PE reset			*/
-};
-
-static struct eeh_stats eeh_stats;
-
-#define IS_BRIDGE(class_code) (((class_code)<<16) == PCI_BASE_CLASS_BRIDGE)
-
-/**
- * eeh_gather_pci_data - Copy assorted PCI config space registers to buff
- * @edev: device to report data for
- * @buf: point to buffer in which to log
- * @len: amount of room in buffer
- *
- * This routine captures assorted PCI configuration space data,
- * and puts them into a buffer for RTAS error logging.
- */
-static size_t eeh_gather_pci_data(struct eeh_dev *edev, char * buf, size_t len)
-{
-	struct device_node *dn = eeh_dev_to_of_node(edev);
-	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
-	u32 cfg;
-	int cap, i;
-	int n = 0;
-
-	n += scnprintf(buf+n, len-n, "%s\n", dn->full_name);
-	printk(KERN_WARNING "EEH: of node=%s\n", dn->full_name);
-
-	eeh_ops->read_config(dn, PCI_VENDOR_ID, 4, &cfg);
-	n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg);
-	printk(KERN_WARNING "EEH: PCI device/vendor: %08x\n", cfg);
-
-	eeh_ops->read_config(dn, PCI_COMMAND, 4, &cfg);
-	n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg);
-	printk(KERN_WARNING "EEH: PCI cmd/status register: %08x\n", cfg);
-
-	if (!dev) {
-		printk(KERN_WARNING "EEH: no PCI device for this of node\n");
-		return n;
-	}
-
-	/* Gather bridge-specific registers */
-	if (dev->class >> 16 == PCI_BASE_CLASS_BRIDGE) {
-		eeh_ops->read_config(dn, PCI_SEC_STATUS, 2, &cfg);
-		n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg);
-		printk(KERN_WARNING "EEH: Bridge secondary status: %04x\n", cfg);
-
-		eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &cfg);
-		n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg);
-		printk(KERN_WARNING "EEH: Bridge control: %04x\n", cfg);
-	}
-
-	/* Dump out the PCI-X command and status regs */
-	cap = pci_find_capability(dev, PCI_CAP_ID_PCIX);
-	if (cap) {
-		eeh_ops->read_config(dn, cap, 4, &cfg);
-		n += scnprintf(buf+n, len-n, "pcix-cmd:%x\n", cfg);
-		printk(KERN_WARNING "EEH: PCI-X cmd: %08x\n", cfg);
-
-		eeh_ops->read_config(dn, cap+4, 4, &cfg);
-		n += scnprintf(buf+n, len-n, "pcix-stat:%x\n", cfg);
-		printk(KERN_WARNING "EEH: PCI-X status: %08x\n", cfg);
-	}
-
-	/* If PCI-E capable, dump PCI-E cap 10, and the AER */
-	cap = pci_find_capability(dev, PCI_CAP_ID_EXP);
-	if (cap) {
-		n += scnprintf(buf+n, len-n, "pci-e cap10:\n");
-		printk(KERN_WARNING
-		       "EEH: PCI-E capabilities and status follow:\n");
-
-		for (i=0; i<=8; i++) {
-			eeh_ops->read_config(dn, cap+4*i, 4, &cfg);
-			n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg);
-			printk(KERN_WARNING "EEH: PCI-E %02x: %08x\n", i, cfg);
-		}
-
-		cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
-		if (cap) {
-			n += scnprintf(buf+n, len-n, "pci-e AER:\n");
-			printk(KERN_WARNING
-			       "EEH: PCI-E AER capability register set follows:\n");
-
-			for (i=0; i<14; i++) {
-				eeh_ops->read_config(dn, cap+4*i, 4, &cfg);
-				n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg);
-				printk(KERN_WARNING "EEH: PCI-E AER %02x: %08x\n", i, cfg);
-			}
-		}
-	}
-
-	return n;
-}
-
-/**
- * eeh_slot_error_detail - Generate combined log including driver log and error log
- * @pe: EEH PE
- * @severity: temporary or permanent error log
- *
- * This routine should be called to generate the combined log, which
- * is comprised of driver log and error log. The driver log is figured
- * out from the config space of the corresponding PCI device, while
- * the error log is fetched through platform dependent function call.
- */
-void eeh_slot_error_detail(struct eeh_pe *pe, int severity)
-{
-	size_t loglen = 0;
-	struct eeh_dev *edev;
-
-	eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
-	eeh_ops->configure_bridge(pe);
-	eeh_pe_restore_bars(pe);
-
-	pci_regs_buf[0] = 0;
-	eeh_pe_for_each_dev(pe, edev) {
-		loglen += eeh_gather_pci_data(edev, pci_regs_buf,
-				EEH_PCI_REGS_LOG_LEN);
-        }
-
-	eeh_ops->get_log(pe, severity, pci_regs_buf, loglen);
-}
-
-/**
- * eeh_token_to_phys - Convert EEH address token to phys address
- * @token: I/O token, should be address in the form 0xA....
- *
- * This routine should be called to convert virtual I/O address
- * to physical one.
- */
-static inline unsigned long eeh_token_to_phys(unsigned long token)
-{
-	pte_t *ptep;
-	unsigned long pa;
-
-	ptep = find_linux_pte(init_mm.pgd, token);
-	if (!ptep)
-		return token;
-	pa = pte_pfn(*ptep) << PAGE_SHIFT;
-
-	return pa | (token & (PAGE_SIZE-1));
-}
-
-/**
- * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze
- * @edev: eeh device
- *
- * Check for an EEH failure for the given device node.  Call this
- * routine if the result of a read was all 0xff's and you want to
- * find out if this is due to an EEH slot freeze.  This routine
- * will query firmware for the EEH status.
- *
- * Returns 0 if there has not been an EEH error; otherwise returns
- * a non-zero value and queues up a slot isolation event notification.
- *
- * It is safe to call this routine in an interrupt context.
- */
-int eeh_dev_check_failure(struct eeh_dev *edev)
-{
-	int ret;
-	unsigned long flags;
-	struct device_node *dn;
-	struct pci_dev *dev;
-	struct eeh_pe *pe;
-	int rc = 0;
-	const char *location;
-
-	eeh_stats.total_mmio_ffs++;
-
-	if (!eeh_subsystem_enabled)
-		return 0;
-
-	if (!edev) {
-		eeh_stats.no_dn++;
-		return 0;
-	}
-	dn = eeh_dev_to_of_node(edev);
-	dev = eeh_dev_to_pci_dev(edev);
-	pe = edev->pe;
-
-	/* Access to IO BARs might get this far and still not want checking. */
-	if (!pe) {
-		eeh_stats.ignored_check++;
-		pr_debug("EEH: Ignored check for %s %s\n",
-			eeh_pci_name(dev), dn->full_name);
-		return 0;
-	}
-
-	if (!pe->addr && !pe->config_addr) {
-		eeh_stats.no_cfg_addr++;
-		return 0;
-	}
-
-	/* If we already have a pending isolation event for this
-	 * slot, we know it's bad already, we don't need to check.
-	 * Do this checking under a lock; as multiple PCI devices
-	 * in one slot might report errors simultaneously, and we
-	 * only want one error recovery routine running.
-	 */
-	raw_spin_lock_irqsave(&confirm_error_lock, flags);
-	rc = 1;
-	if (pe->state & EEH_PE_ISOLATED) {
-		pe->check_count++;
-		if (pe->check_count % EEH_MAX_FAILS == 0) {
-			location = of_get_property(dn, "ibm,loc-code", NULL);
-			printk(KERN_ERR "EEH: %d reads ignored for recovering device at "
-				"location=%s driver=%s pci addr=%s\n",
-				pe->check_count, location,
-				eeh_driver_name(dev), eeh_pci_name(dev));
-			printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n",
-				eeh_driver_name(dev));
-			dump_stack();
-		}
-		goto dn_unlock;
-	}
-
-	/*
-	 * Now test for an EEH failure.  This is VERY expensive.
-	 * Note that the eeh_config_addr may be a parent device
-	 * in the case of a device behind a bridge, or it may be
-	 * function zero of a multi-function device.
-	 * In any case they must share a common PHB.
-	 */
-	ret = eeh_ops->get_state(pe, NULL);
-
-	/* Note that config-io to empty slots may fail;
-	 * they are empty when they don't have children.
-	 * We will punt with the following conditions: Failure to get
-	 * PE's state, EEH not support and Permanently unavailable
-	 * state, PE is in good state.
-	 */
-	if ((ret < 0) ||
-	    (ret == EEH_STATE_NOT_SUPPORT) ||
-	    (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) ==
-	    (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) {
-		eeh_stats.false_positives++;
-		pe->false_positives++;
-		rc = 0;
-		goto dn_unlock;
-	}
-
-	eeh_stats.slot_resets++;
-
-	/* Avoid repeated reports of this failure, including problems
-	 * with other functions on this device, and functions under
-	 * bridges.
-	 */
-	eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
-	raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
-
-	eeh_send_failure_event(pe);
-
-	/* Most EEH events are due to device driver bugs.  Having
-	 * a stack trace will help the device-driver authors figure
-	 * out what happened.  So print that out.
-	 */
-	WARN(1, "EEH: failure detected\n");
-	return 1;
-
-dn_unlock:
-	raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
-	return rc;
-}
-
-EXPORT_SYMBOL_GPL(eeh_dev_check_failure);
-
-/**
- * eeh_check_failure - Check if all 1's data is due to EEH slot freeze
- * @token: I/O token, should be address in the form 0xA....
- * @val: value, should be all 1's (XXX why do we need this arg??)
- *
- * Check for an EEH failure at the given token address.  Call this
- * routine if the result of a read was all 0xff's and you want to
- * find out if this is due to an EEH slot freeze event.  This routine
- * will query firmware for the EEH status.
- *
- * Note this routine is safe to call in an interrupt context.
- */
-unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val)
-{
-	unsigned long addr;
-	struct eeh_dev *edev;
-
-	/* Finding the phys addr + pci device; this is pretty quick. */
-	addr = eeh_token_to_phys((unsigned long __force) token);
-	edev = eeh_addr_cache_get_dev(addr);
-	if (!edev) {
-		eeh_stats.no_device++;
-		return val;
-	}
-
-	eeh_dev_check_failure(edev);
-
-	pci_dev_put(eeh_dev_to_pci_dev(edev));
-	return val;
-}
-
-EXPORT_SYMBOL(eeh_check_failure);
-
-
-/**
- * eeh_pci_enable - Enable MMIO or DMA transfers for this slot
- * @pe: EEH PE
- *
- * This routine should be called to reenable frozen MMIO or DMA
- * so that it would work correctly again. It's useful while doing
- * recovery or log collection on the indicated device.
- */
-int eeh_pci_enable(struct eeh_pe *pe, int function)
-{
-	int rc;
-
-	rc = eeh_ops->set_option(pe, function);
-	if (rc)
-		pr_warning("%s: Unexpected state change %d on PHB#%d-PE#%x, err=%d\n",
-			__func__, function, pe->phb->global_number, pe->addr, rc);
-
-	rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
-	if (rc > 0 && (rc & EEH_STATE_MMIO_ENABLED) &&
-	   (function == EEH_OPT_THAW_MMIO))
-		return 0;
-
-	return rc;
-}
-
-/**
- * pcibios_set_pcie_slot_reset - Set PCI-E reset state
- * @dev: pci device struct
- * @state: reset state to enter
- *
- * Return value:
- * 	0 if success
- */
-int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state)
-{
-	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
-	struct eeh_pe *pe = edev->pe;
-
-	if (!pe) {
-		pr_err("%s: No PE found on PCI device %s\n",
-			__func__, pci_name(dev));
-		return -EINVAL;
-	}
-
-	switch (state) {
-	case pcie_deassert_reset:
-		eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
-		break;
-	case pcie_hot_reset:
-		eeh_ops->reset(pe, EEH_RESET_HOT);
-		break;
-	case pcie_warm_reset:
-		eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
-		break;
-	default:
-		return -EINVAL;
-	};
-
-	return 0;
-}
-
-/**
- * eeh_set_pe_freset - Check the required reset for the indicated device
- * @data: EEH device
- * @flag: return value
- *
- * Each device might have its preferred reset type: fundamental or
- * hot reset. The routine is used to collected the information for
- * the indicated device and its children so that the bunch of the
- * devices could be reset properly.
- */
-static void *eeh_set_dev_freset(void *data, void *flag)
-{
-	struct pci_dev *dev;
-	unsigned int *freset = (unsigned int *)flag;
-	struct eeh_dev *edev = (struct eeh_dev *)data;
-
-	dev = eeh_dev_to_pci_dev(edev);
-	if (dev)
-		*freset |= dev->needs_freset;
-
-	return NULL;
-}
-
-/**
- * eeh_reset_pe_once - Assert the pci #RST line for 1/4 second
- * @pe: EEH PE
- *
- * Assert the PCI #RST line for 1/4 second.
- */
-static void eeh_reset_pe_once(struct eeh_pe *pe)
-{
-	unsigned int freset = 0;
-
-	/* Determine type of EEH reset required for
-	 * Partitionable Endpoint, a hot-reset (1)
-	 * or a fundamental reset (3).
-	 * A fundamental reset required by any device under
-	 * Partitionable Endpoint trumps hot-reset.
-	 */
-	eeh_pe_dev_traverse(pe, eeh_set_dev_freset, &freset);
-
-	if (freset)
-		eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
-	else
-		eeh_ops->reset(pe, EEH_RESET_HOT);
-
-	/* The PCI bus requires that the reset be held high for at least
-	 * a 100 milliseconds. We wait a bit longer 'just in case'.
-	 */
-#define PCI_BUS_RST_HOLD_TIME_MSEC 250
-	msleep(PCI_BUS_RST_HOLD_TIME_MSEC);
-
-	/* We might get hit with another EEH freeze as soon as the
-	 * pci slot reset line is dropped. Make sure we don't miss
-	 * these, and clear the flag now.
-	 */
-	eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
-
-	eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
-
-	/* After a PCI slot has been reset, the PCI Express spec requires
-	 * a 1.5 second idle time for the bus to stabilize, before starting
-	 * up traffic.
-	 */
-#define PCI_BUS_SETTLE_TIME_MSEC 1800
-	msleep(PCI_BUS_SETTLE_TIME_MSEC);
-}
-
-/**
- * eeh_reset_pe - Reset the indicated PE
- * @pe: EEH PE
- *
- * This routine should be called to reset indicated device, including
- * PE. A PE might include multiple PCI devices and sometimes PCI bridges
- * might be involved as well.
- */
-int eeh_reset_pe(struct eeh_pe *pe)
-{
-	int i, rc;
-
-	/* Take three shots at resetting the bus */
-	for (i=0; i<3; i++) {
-		eeh_reset_pe_once(pe);
-
-		rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
-		if (rc == (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE))
-			return 0;
-
-		if (rc < 0) {
-			pr_err("%s: Unrecoverable slot failure on PHB#%d-PE#%x",
-				__func__, pe->phb->global_number, pe->addr);
-			return -1;
-		}
-		pr_err("EEH: bus reset %d failed on PHB#%d-PE#%x, rc=%d\n",
-			i+1, pe->phb->global_number, pe->addr, rc);
-	}
-
-	return -1;
-}
-
-/**
- * eeh_save_bars - Save device bars
- * @edev: PCI device associated EEH device
- *
- * Save the values of the device bars. Unlike the restore
- * routine, this routine is *not* recursive. This is because
- * PCI devices are added individually; but, for the restore,
- * an entire slot is reset at a time.
- */
-void eeh_save_bars(struct eeh_dev *edev)
-{
-	int i;
-	struct device_node *dn;
-
-	if (!edev)
-		return;
-	dn = eeh_dev_to_of_node(edev);
-
-	for (i = 0; i < 16; i++)
-		eeh_ops->read_config(dn, i * 4, 4, &edev->config_space[i]);
-}
-
-/**
- * eeh_ops_register - Register platform dependent EEH operations
- * @ops: platform dependent EEH operations
- *
- * Register the platform dependent EEH operation callback
- * functions. The platform should call this function before
- * any other EEH operations.
- */
-int __init eeh_ops_register(struct eeh_ops *ops)
-{
-	if (!ops->name) {
-		pr_warning("%s: Invalid EEH ops name for %p\n",
-			__func__, ops);
-		return -EINVAL;
-	}
-
-	if (eeh_ops && eeh_ops != ops) {
-		pr_warning("%s: EEH ops of platform %s already existing (%s)\n",
-			__func__, eeh_ops->name, ops->name);
-		return -EEXIST;
-	}
-
-	eeh_ops = ops;
-
-	return 0;
-}
-
-/**
- * eeh_ops_unregister - Unreigster platform dependent EEH operations
- * @name: name of EEH platform operations
- *
- * Unregister the platform dependent EEH operation callback
- * functions.
- */
-int __exit eeh_ops_unregister(const char *name)
-{
-	if (!name || !strlen(name)) {
-		pr_warning("%s: Invalid EEH ops name\n",
-			__func__);
-		return -EINVAL;
-	}
-
-	if (eeh_ops && !strcmp(eeh_ops->name, name)) {
-		eeh_ops = NULL;
-		return 0;
-	}
-
-	return -EEXIST;
-}
-
-/**
- * eeh_init - EEH initialization
- *
- * Initialize EEH by trying to enable it for all of the adapters in the system.
- * As a side effect we can determine here if eeh is supported at all.
- * Note that we leave EEH on so failed config cycles won't cause a machine
- * check.  If a user turns off EEH for a particular adapter they are really
- * telling Linux to ignore errors.  Some hardware (e.g. POWER5) won't
- * grant access to a slot if EEH isn't enabled, and so we always enable
- * EEH for all slots/all devices.
- *
- * The eeh-force-off option disables EEH checking globally, for all slots.
- * Even if force-off is set, the EEH hardware is still enabled, so that
- * newer systems can boot.
- */
-static int __init eeh_init(void)
-{
-	struct pci_controller *hose, *tmp;
-	struct device_node *phb;
-	int ret;
-
-	/* call platform initialization function */
-	if (!eeh_ops) {
-		pr_warning("%s: Platform EEH operation not found\n",
-			__func__);
-		return -EEXIST;
-	} else if ((ret = eeh_ops->init())) {
-		pr_warning("%s: Failed to call platform init function (%d)\n",
-			__func__, ret);
-		return ret;
-	}
-
-	raw_spin_lock_init(&confirm_error_lock);
-
-	/* Enable EEH for all adapters */
-	if (eeh_probe_mode_devtree()) {
-		list_for_each_entry_safe(hose, tmp,
-			&hose_list, list_node) {
-			phb = hose->dn;
-			traverse_pci_devices(phb, eeh_ops->of_probe, NULL);
-		}
-	}
-
-	if (eeh_subsystem_enabled)
-		pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
-	else
-		pr_warning("EEH: No capable adapters found\n");
-
-	return ret;
-}
-
-core_initcall_sync(eeh_init);
-
-/**
- * eeh_add_device_early - Enable EEH for the indicated device_node
- * @dn: device node for which to set up EEH
- *
- * This routine must be used to perform EEH initialization for PCI
- * devices that were added after system boot (e.g. hotplug, dlpar).
- * This routine must be called before any i/o is performed to the
- * adapter (inluding any config-space i/o).
- * Whether this actually enables EEH or not for this device depends
- * on the CEC architecture, type of the device, on earlier boot
- * command-line arguments & etc.
- */
-static void eeh_add_device_early(struct device_node *dn)
-{
-	struct pci_controller *phb;
-
-	if (!of_node_to_eeh_dev(dn))
-		return;
-	phb = of_node_to_eeh_dev(dn)->phb;
-
-	/* USB Bus children of PCI devices will not have BUID's */
-	if (NULL == phb || 0 == phb->buid)
-		return;
-
-	/* FIXME: hotplug support on POWERNV */
-	eeh_ops->of_probe(dn, NULL);
-}
-
-/**
- * eeh_add_device_tree_early - Enable EEH for the indicated device
- * @dn: device node
- *
- * This routine must be used to perform EEH initialization for the
- * indicated PCI device that was added after system boot (e.g.
- * hotplug, dlpar).
- */
-void eeh_add_device_tree_early(struct device_node *dn)
-{
-	struct device_node *sib;
-
-	for_each_child_of_node(dn, sib)
-		eeh_add_device_tree_early(sib);
-	eeh_add_device_early(dn);
-}
-EXPORT_SYMBOL_GPL(eeh_add_device_tree_early);
-
-/**
- * eeh_add_device_late - Perform EEH initialization for the indicated pci device
- * @dev: pci device for which to set up EEH
- *
- * This routine must be used to complete EEH initialization for PCI
- * devices that were added after system boot (e.g. hotplug, dlpar).
- */
-static void eeh_add_device_late(struct pci_dev *dev)
-{
-	struct device_node *dn;
-	struct eeh_dev *edev;
-
-	if (!dev || !eeh_subsystem_enabled)
-		return;
-
-	pr_debug("EEH: Adding device %s\n", pci_name(dev));
-
-	dn = pci_device_to_OF_node(dev);
-	edev = of_node_to_eeh_dev(dn);
-	if (edev->pdev == dev) {
-		pr_debug("EEH: Already referenced !\n");
-		return;
-	}
-	WARN_ON(edev->pdev);
-
-	pci_dev_get(dev);
-	edev->pdev = dev;
-	dev->dev.archdata.edev = edev;
-
-	eeh_addr_cache_insert_dev(dev);
-}
-
-/**
- * eeh_add_device_tree_late - Perform EEH initialization for the indicated PCI bus
- * @bus: PCI bus
- *
- * This routine must be used to perform EEH initialization for PCI
- * devices which are attached to the indicated PCI bus. The PCI bus
- * is added after system boot through hotplug or dlpar.
- */
-void eeh_add_device_tree_late(struct pci_bus *bus)
-{
-	struct pci_dev *dev;
-
-	list_for_each_entry(dev, &bus->devices, bus_list) {
-		eeh_add_device_late(dev);
-		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
-			struct pci_bus *subbus = dev->subordinate;
-			if (subbus)
-				eeh_add_device_tree_late(subbus);
-		}
-	}
-}
-EXPORT_SYMBOL_GPL(eeh_add_device_tree_late);
-
-/**
- * eeh_add_sysfs_files - Add EEH sysfs files for the indicated PCI bus
- * @bus: PCI bus
- *
- * This routine must be used to add EEH sysfs files for PCI
- * devices which are attached to the indicated PCI bus. The PCI bus
- * is added after system boot through hotplug or dlpar.
- */
-void eeh_add_sysfs_files(struct pci_bus *bus)
-{
-	struct pci_dev *dev;
-
-	list_for_each_entry(dev, &bus->devices, bus_list) {
-		eeh_sysfs_add_device(dev);
-		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
-			struct pci_bus *subbus = dev->subordinate;
-			if (subbus)
-				eeh_add_sysfs_files(subbus);
-		}
-	}
-}
-EXPORT_SYMBOL_GPL(eeh_add_sysfs_files);
-
-/**
- * eeh_remove_device - Undo EEH setup for the indicated pci device
- * @dev: pci device to be removed
- * @purge_pe: remove the PE or not
- *
- * This routine should be called when a device is removed from
- * a running system (e.g. by hotplug or dlpar).  It unregisters
- * the PCI device from the EEH subsystem.  I/O errors affecting
- * this device will no longer be detected after this call; thus,
- * i/o errors affecting this slot may leave this device unusable.
- */
-static void eeh_remove_device(struct pci_dev *dev, int purge_pe)
-{
-	struct eeh_dev *edev;
-
-	if (!dev || !eeh_subsystem_enabled)
-		return;
-	edev = pci_dev_to_eeh_dev(dev);
-
-	/* Unregister the device with the EEH/PCI address search system */
-	pr_debug("EEH: Removing device %s\n", pci_name(dev));
-
-	if (!edev || !edev->pdev) {
-		pr_debug("EEH: Not referenced !\n");
-		return;
-	}
-	edev->pdev = NULL;
-	dev->dev.archdata.edev = NULL;
-	pci_dev_put(dev);
-
-	eeh_rmv_from_parent_pe(edev, purge_pe);
-	eeh_addr_cache_rmv_dev(dev);
-	eeh_sysfs_remove_device(dev);
-}
-
-/**
- * eeh_remove_bus_device - Undo EEH setup for the indicated PCI device
- * @dev: PCI device
- * @purge_pe: remove the corresponding PE or not
- *
- * This routine must be called when a device is removed from the
- * running system through hotplug or dlpar. The corresponding
- * PCI address cache will be removed.
- */
-void eeh_remove_bus_device(struct pci_dev *dev, int purge_pe)
-{
-	struct pci_bus *bus = dev->subordinate;
-	struct pci_dev *child, *tmp;
-
-	eeh_remove_device(dev, purge_pe);
-
-	if (bus && dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
-		list_for_each_entry_safe(child, tmp, &bus->devices, bus_list)
-			 eeh_remove_bus_device(child, purge_pe);
-	}
-}
-EXPORT_SYMBOL_GPL(eeh_remove_bus_device);
-
-static int proc_eeh_show(struct seq_file *m, void *v)
-{
-	if (0 == eeh_subsystem_enabled) {
-		seq_printf(m, "EEH Subsystem is globally disabled\n");
-		seq_printf(m, "eeh_total_mmio_ffs=%llu\n", eeh_stats.total_mmio_ffs);
-	} else {
-		seq_printf(m, "EEH Subsystem is enabled\n");
-		seq_printf(m,
-				"no device=%llu\n"
-				"no device node=%llu\n"
-				"no config address=%llu\n"
-				"check not wanted=%llu\n"
-				"eeh_total_mmio_ffs=%llu\n"
-				"eeh_false_positives=%llu\n"
-				"eeh_slot_resets=%llu\n",
-				eeh_stats.no_device,
-				eeh_stats.no_dn,
-				eeh_stats.no_cfg_addr,
-				eeh_stats.ignored_check,
-				eeh_stats.total_mmio_ffs,
-				eeh_stats.false_positives,
-				eeh_stats.slot_resets);
-	}
-
-	return 0;
-}
-
-static int proc_eeh_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_eeh_show, NULL);
-}
-
-static const struct file_operations proc_eeh_operations = {
-	.open      = proc_eeh_open,
-	.read      = seq_read,
-	.llseek    = seq_lseek,
-	.release   = single_release,
-};
-
-static int __init eeh_init_proc(void)
-{
-	if (machine_is(pseries))
-		proc_create("powerpc/eeh", 0, NULL, &proc_eeh_operations);
-	return 0;
-}
-__initcall(eeh_init_proc);
diff --git a/arch/powerpc/platforms/pseries/eeh_cache.c b/arch/powerpc/platforms/pseries/eeh_cache.c
deleted file mode 100644
index 5a4c879..0000000
--- a/arch/powerpc/platforms/pseries/eeh_cache.c
+++ /dev/null
@@ -1,319 +0,0 @@
-/*
- * PCI address cache; allows the lookup of PCI devices based on I/O address
- *
- * Copyright IBM Corporation 2004
- * Copyright Linas Vepstas <linas@austin.ibm.com> 2004
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- */
-
-#include <linux/list.h>
-#include <linux/pci.h>
-#include <linux/rbtree.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/atomic.h>
-#include <asm/pci-bridge.h>
-#include <asm/ppc-pci.h>
-
-
-/**
- * The pci address cache subsystem.  This subsystem places
- * PCI device address resources into a red-black tree, sorted
- * according to the address range, so that given only an i/o
- * address, the corresponding PCI device can be **quickly**
- * found. It is safe to perform an address lookup in an interrupt
- * context; this ability is an important feature.
- *
- * Currently, the only customer of this code is the EEH subsystem;
- * thus, this code has been somewhat tailored to suit EEH better.
- * In particular, the cache does *not* hold the addresses of devices
- * for which EEH is not enabled.
- *
- * (Implementation Note: The RB tree seems to be better/faster
- * than any hash algo I could think of for this problem, even
- * with the penalty of slow pointer chases for d-cache misses).
- */
-struct pci_io_addr_range {
-	struct rb_node rb_node;
-	unsigned long addr_lo;
-	unsigned long addr_hi;
-	struct eeh_dev *edev;
-	struct pci_dev *pcidev;
-	unsigned int flags;
-};
-
-static struct pci_io_addr_cache {
-	struct rb_root rb_root;
-	spinlock_t piar_lock;
-} pci_io_addr_cache_root;
-
-static inline struct eeh_dev *__eeh_addr_cache_get_device(unsigned long addr)
-{
-	struct rb_node *n = pci_io_addr_cache_root.rb_root.rb_node;
-
-	while (n) {
-		struct pci_io_addr_range *piar;
-		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
-
-		if (addr < piar->addr_lo) {
-			n = n->rb_left;
-		} else {
-			if (addr > piar->addr_hi) {
-				n = n->rb_right;
-			} else {
-				pci_dev_get(piar->pcidev);
-				return piar->edev;
-			}
-		}
-	}
-
-	return NULL;
-}
-
-/**
- * eeh_addr_cache_get_dev - Get device, given only address
- * @addr: mmio (PIO) phys address or i/o port number
- *
- * Given an mmio phys address, or a port number, find a pci device
- * that implements this address.  Be sure to pci_dev_put the device
- * when finished.  I/O port numbers are assumed to be offset
- * from zero (that is, they do *not* have pci_io_addr added in).
- * It is safe to call this function within an interrupt.
- */
-struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr)
-{
-	struct eeh_dev *edev;
-	unsigned long flags;
-
-	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
-	edev = __eeh_addr_cache_get_device(addr);
-	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
-	return edev;
-}
-
-#ifdef DEBUG
-/*
- * Handy-dandy debug print routine, does nothing more
- * than print out the contents of our addr cache.
- */
-static void eeh_addr_cache_print(struct pci_io_addr_cache *cache)
-{
-	struct rb_node *n;
-	int cnt = 0;
-
-	n = rb_first(&cache->rb_root);
-	while (n) {
-		struct pci_io_addr_range *piar;
-		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
-		pr_debug("PCI: %s addr range %d [%lx-%lx]: %s\n",
-		       (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt,
-		       piar->addr_lo, piar->addr_hi, pci_name(piar->pcidev));
-		cnt++;
-		n = rb_next(n);
-	}
-}
-#endif
-
-/* Insert address range into the rb tree. */
-static struct pci_io_addr_range *
-eeh_addr_cache_insert(struct pci_dev *dev, unsigned long alo,
-		      unsigned long ahi, unsigned int flags)
-{
-	struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node;
-	struct rb_node *parent = NULL;
-	struct pci_io_addr_range *piar;
-
-	/* Walk tree, find a place to insert into tree */
-	while (*p) {
-		parent = *p;
-		piar = rb_entry(parent, struct pci_io_addr_range, rb_node);
-		if (ahi < piar->addr_lo) {
-			p = &parent->rb_left;
-		} else if (alo > piar->addr_hi) {
-			p = &parent->rb_right;
-		} else {
-			if (dev != piar->pcidev ||
-			    alo != piar->addr_lo || ahi != piar->addr_hi) {
-				pr_warning("PIAR: overlapping address range\n");
-			}
-			return piar;
-		}
-	}
-	piar = kzalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC);
-	if (!piar)
-		return NULL;
-
-	pci_dev_get(dev);
-	piar->addr_lo = alo;
-	piar->addr_hi = ahi;
-	piar->edev = pci_dev_to_eeh_dev(dev);
-	piar->pcidev = dev;
-	piar->flags = flags;
-
-#ifdef DEBUG
-	pr_debug("PIAR: insert range=[%lx:%lx] dev=%s\n",
-	                  alo, ahi, pci_name(dev));
-#endif
-
-	rb_link_node(&piar->rb_node, parent, p);
-	rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
-
-	return piar;
-}
-
-static void __eeh_addr_cache_insert_dev(struct pci_dev *dev)
-{
-	struct device_node *dn;
-	struct eeh_dev *edev;
-	int i;
-
-	dn = pci_device_to_OF_node(dev);
-	if (!dn) {
-		pr_warning("PCI: no pci dn found for dev=%s\n", pci_name(dev));
-		return;
-	}
-
-	edev = of_node_to_eeh_dev(dn);
-	if (!edev) {
-		pr_warning("PCI: no EEH dev found for dn=%s\n",
-			dn->full_name);
-		return;
-	}
-
-	/* Skip any devices for which EEH is not enabled. */
-	if (!edev->pe) {
-#ifdef DEBUG
-		pr_info("PCI: skip building address cache for=%s - %s\n",
-			pci_name(dev), dn->full_name);
-#endif
-		return;
-	}
-
-	/* Walk resources on this device, poke them into the tree */
-	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
-		unsigned long start = pci_resource_start(dev,i);
-		unsigned long end = pci_resource_end(dev,i);
-		unsigned int flags = pci_resource_flags(dev,i);
-
-		/* We are interested only bus addresses, not dma or other stuff */
-		if (0 == (flags & (IORESOURCE_IO | IORESOURCE_MEM)))
-			continue;
-		if (start == 0 || ~start == 0 || end == 0 || ~end == 0)
-			 continue;
-		eeh_addr_cache_insert(dev, start, end, flags);
-	}
-}
-
-/**
- * eeh_addr_cache_insert_dev - Add a device to the address cache
- * @dev: PCI device whose I/O addresses we are interested in.
- *
- * In order to support the fast lookup of devices based on addresses,
- * we maintain a cache of devices that can be quickly searched.
- * This routine adds a device to that cache.
- */
-void eeh_addr_cache_insert_dev(struct pci_dev *dev)
-{
-	unsigned long flags;
-
-	/* Ignore PCI bridges */
-	if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE)
-		return;
-
-	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
-	__eeh_addr_cache_insert_dev(dev);
-	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
-}
-
-static inline void __eeh_addr_cache_rmv_dev(struct pci_dev *dev)
-{
-	struct rb_node *n;
-
-restart:
-	n = rb_first(&pci_io_addr_cache_root.rb_root);
-	while (n) {
-		struct pci_io_addr_range *piar;
-		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
-
-		if (piar->pcidev == dev) {
-			rb_erase(n, &pci_io_addr_cache_root.rb_root);
-			pci_dev_put(piar->pcidev);
-			kfree(piar);
-			goto restart;
-		}
-		n = rb_next(n);
-	}
-}
-
-/**
- * eeh_addr_cache_rmv_dev - remove pci device from addr cache
- * @dev: device to remove
- *
- * Remove a device from the addr-cache tree.
- * This is potentially expensive, since it will walk
- * the tree multiple times (once per resource).
- * But so what; device removal doesn't need to be that fast.
- */
-void eeh_addr_cache_rmv_dev(struct pci_dev *dev)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
-	__eeh_addr_cache_rmv_dev(dev);
-	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
-}
-
-/**
- * eeh_addr_cache_build - Build a cache of I/O addresses
- *
- * Build a cache of pci i/o addresses.  This cache will be used to
- * find the pci device that corresponds to a given address.
- * This routine scans all pci busses to build the cache.
- * Must be run late in boot process, after the pci controllers
- * have been scanned for devices (after all device resources are known).
- */
-void __init eeh_addr_cache_build(void)
-{
-	struct device_node *dn;
-	struct eeh_dev *edev;
-	struct pci_dev *dev = NULL;
-
-	spin_lock_init(&pci_io_addr_cache_root.piar_lock);
-
-	for_each_pci_dev(dev) {
-		eeh_addr_cache_insert_dev(dev);
-
-		dn = pci_device_to_OF_node(dev);
-		if (!dn)
-			continue;
-
-		edev = of_node_to_eeh_dev(dn);
-		if (!edev)
-			continue;
-
-		pci_dev_get(dev);  /* matching put is in eeh_remove_device() */
-		dev->dev.archdata.edev = edev;
-		edev->pdev = dev;
-
-		eeh_sysfs_add_device(dev);
-	}
-
-#ifdef DEBUG
-	/* Verify tree built up above, echo back the list of addrs. */
-	eeh_addr_cache_print(&pci_io_addr_cache_root);
-#endif
-}
-
diff --git a/arch/powerpc/platforms/pseries/eeh_dev.c b/arch/powerpc/platforms/pseries/eeh_dev.c
deleted file mode 100644
index 1efa28f..0000000
--- a/arch/powerpc/platforms/pseries/eeh_dev.c
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * The file intends to implement dynamic creation of EEH device, which will
- * be bound with OF node and PCI device simutaneously. The EEH devices would
- * be foundamental information for EEH core components to work proerly. Besides,
- * We have to support multiple situations where dynamic creation of EEH device
- * is required:
- *
- * 1) Before PCI emunation starts, we need create EEH devices according to the
- *    PCI sensitive OF nodes.
- * 2) When PCI emunation is done, we need do the binding between PCI device and
- *    the associated EEH device.
- * 3) DR (Dynamic Reconfiguration) would create PCI sensitive OF node. EEH device
- *    will be created while PCI sensitive OF node is detected from DR.
- * 4) PCI hotplug needs redoing the binding between PCI device and EEH device. If
- *    PHB is newly inserted, we also need create EEH devices accordingly.
- *
- * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- */
-
-#include <linux/export.h>
-#include <linux/gfp.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/string.h>
-
-#include <asm/pci-bridge.h>
-#include <asm/ppc-pci.h>
-
-/**
- * eeh_dev_init - Create EEH device according to OF node
- * @dn: device node
- * @data: PHB
- *
- * It will create EEH device according to the given OF node. The function
- * might be called by PCI emunation, DR, PHB hotplug.
- */
-void *eeh_dev_init(struct device_node *dn, void *data)
-{
-	struct pci_controller *phb = data;
-	struct eeh_dev *edev;
-
-	/* Allocate EEH device */
-	edev = kzalloc(sizeof(*edev), GFP_KERNEL);
-	if (!edev) {
-		pr_warning("%s: out of memory\n", __func__);
-		return NULL;
-	}
-
-	/* Associate EEH device with OF node */
-	PCI_DN(dn)->edev = edev;
-	edev->dn  = dn;
-	edev->phb = phb;
-	INIT_LIST_HEAD(&edev->list);
-
-	return NULL;
-}
-
-/**
- * eeh_dev_phb_init_dynamic - Create EEH devices for devices included in PHB
- * @phb: PHB
- *
- * Scan the PHB OF node and its child association, then create the
- * EEH devices accordingly
- */
-void eeh_dev_phb_init_dynamic(struct pci_controller *phb)
-{
-	struct device_node *dn = phb->dn;
-
-	/* EEH PE for PHB */
-	eeh_phb_pe_create(phb);
-
-	/* EEH device for PHB */
-	eeh_dev_init(dn, phb);
-
-	/* EEH devices for children OF nodes */
-	traverse_pci_devices(dn, eeh_dev_init, phb);
-}
-
-/**
- * eeh_dev_phb_init - Create EEH devices for devices included in existing PHBs
- *
- * Scan all the existing PHBs and create EEH devices for their OF
- * nodes and their children OF nodes
- */
-static int __init eeh_dev_phb_init(void)
-{
-	struct pci_controller *phb, *tmp;
-
-	list_for_each_entry_safe(phb, tmp, &hose_list, list_node)
-		eeh_dev_phb_init_dynamic(phb);
-
-	pr_info("EEH: devices created\n");
-
-	return 0;
-}
-
-core_initcall(eeh_dev_phb_init);
diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c
deleted file mode 100644
index 0acc5a2..0000000
--- a/arch/powerpc/platforms/pseries/eeh_driver.c
+++ /dev/null
@@ -1,552 +0,0 @@
-/*
- * PCI Error Recovery Driver for RPA-compliant PPC64 platform.
- * Copyright IBM Corp. 2004 2005
- * Copyright Linas Vepstas <linas@linas.org> 2004, 2005
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
- */
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <asm/eeh.h>
-#include <asm/eeh_event.h>
-#include <asm/ppc-pci.h>
-#include <asm/pci-bridge.h>
-#include <asm/prom.h>
-#include <asm/rtas.h>
-
-/**
- * eeh_pcid_name - Retrieve name of PCI device driver
- * @pdev: PCI device
- *
- * This routine is used to retrieve the name of PCI device driver
- * if that's valid.
- */
-static inline const char *eeh_pcid_name(struct pci_dev *pdev)
-{
-	if (pdev && pdev->dev.driver)
-		return pdev->dev.driver->name;
-	return "";
-}
-
-/**
- * eeh_pcid_get - Get the PCI device driver
- * @pdev: PCI device
- *
- * The function is used to retrieve the PCI device driver for
- * the indicated PCI device. Besides, we will increase the reference
- * of the PCI device driver to prevent that being unloaded on
- * the fly. Otherwise, kernel crash would be seen.
- */
-static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev)
-{
-	if (!pdev || !pdev->driver)
-		return NULL;
-
-	if (!try_module_get(pdev->driver->driver.owner))
-		return NULL;
-
-	return pdev->driver;
-}
-
-/**
- * eeh_pcid_put - Dereference on the PCI device driver
- * @pdev: PCI device
- *
- * The function is called to do dereference on the PCI device
- * driver of the indicated PCI device.
- */
-static inline void eeh_pcid_put(struct pci_dev *pdev)
-{
-	if (!pdev || !pdev->driver)
-		return;
-
-	module_put(pdev->driver->driver.owner);
-}
-
-#if 0
-static void print_device_node_tree(struct pci_dn *pdn, int dent)
-{
-	int i;
-	struct device_node *pc;
-
-	if (!pdn)
-		return;
-	for (i = 0; i < dent; i++)
-		printk(" ");
-	printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n",
-		pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr,
-		pdn->eeh_pe_config_addr, pdn->node->full_name);
-	dent += 3;
-	pc = pdn->node->child;
-	while (pc) {
-		print_device_node_tree(PCI_DN(pc), dent);
-		pc = pc->sibling;
-	}
-}
-#endif
-
-/**
- * eeh_disable_irq - Disable interrupt for the recovering device
- * @dev: PCI device
- *
- * This routine must be called when reporting temporary or permanent
- * error to the particular PCI device to disable interrupt of that
- * device. If the device has enabled MSI or MSI-X interrupt, we needn't
- * do real work because EEH should freeze DMA transfers for those PCI
- * devices encountering EEH errors, which includes MSI or MSI-X.
- */
-static void eeh_disable_irq(struct pci_dev *dev)
-{
-	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
-
-	/* Don't disable MSI and MSI-X interrupts. They are
-	 * effectively disabled by the DMA Stopped state
-	 * when an EEH error occurs.
-	 */
-	if (dev->msi_enabled || dev->msix_enabled)
-		return;
-
-	if (!irq_has_action(dev->irq))
-		return;
-
-	edev->mode |= EEH_DEV_IRQ_DISABLED;
-	disable_irq_nosync(dev->irq);
-}
-
-/**
- * eeh_enable_irq - Enable interrupt for the recovering device
- * @dev: PCI device
- *
- * This routine must be called to enable interrupt while failed
- * device could be resumed.
- */
-static void eeh_enable_irq(struct pci_dev *dev)
-{
-	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
-
-	if ((edev->mode) & EEH_DEV_IRQ_DISABLED) {
-		edev->mode &= ~EEH_DEV_IRQ_DISABLED;
-		enable_irq(dev->irq);
-	}
-}
-
-/**
- * eeh_report_error - Report pci error to each device driver
- * @data: eeh device
- * @userdata: return value
- *
- * Report an EEH error to each device driver, collect up and
- * merge the device driver responses. Cumulative response
- * passed back in "userdata".
- */
-static void *eeh_report_error(void *data, void *userdata)
-{
-	struct eeh_dev *edev = (struct eeh_dev *)data;
-	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
-	enum pci_ers_result rc, *res = userdata;
-	struct pci_driver *driver;
-
-	/* We might not have the associated PCI device,
-	 * then we should continue for next one.
-	 */
-	if (!dev) return NULL;
-	dev->error_state = pci_channel_io_frozen;
-
-	driver = eeh_pcid_get(dev);
-	if (!driver) return NULL;
-
-	eeh_disable_irq(dev);
-
-	if (!driver->err_handler ||
-	    !driver->err_handler->error_detected) {
-		eeh_pcid_put(dev);
-		return NULL;
-	}
-
-	rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
-
-	/* A driver that needs a reset trumps all others */
-	if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
-	if (*res == PCI_ERS_RESULT_NONE) *res = rc;
-
-	eeh_pcid_put(dev);
-	return NULL;
-}
-
-/**
- * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled
- * @data: eeh device
- * @userdata: return value
- *
- * Tells each device driver that IO ports, MMIO and config space I/O
- * are now enabled. Collects up and merges the device driver responses.
- * Cumulative response passed back in "userdata".
- */
-static void *eeh_report_mmio_enabled(void *data, void *userdata)
-{
-	struct eeh_dev *edev = (struct eeh_dev *)data;
-	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
-	enum pci_ers_result rc, *res = userdata;
-	struct pci_driver *driver;
-
-	driver = eeh_pcid_get(dev);
-	if (!driver) return NULL;
-
-	if (!driver->err_handler ||
-	    !driver->err_handler->mmio_enabled) {
-		eeh_pcid_put(dev);
-		return NULL;
-	}
-
-	rc = driver->err_handler->mmio_enabled(dev);
-
-	/* A driver that needs a reset trumps all others */
-	if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
-	if (*res == PCI_ERS_RESULT_NONE) *res = rc;
-
-	eeh_pcid_put(dev);
-	return NULL;
-}
-
-/**
- * eeh_report_reset - Tell device that slot has been reset
- * @data: eeh device
- * @userdata: return value
- *
- * This routine must be called while EEH tries to reset particular
- * PCI device so that the associated PCI device driver could take
- * some actions, usually to save data the driver needs so that the
- * driver can work again while the device is recovered.
- */
-static void *eeh_report_reset(void *data, void *userdata)
-{
-	struct eeh_dev *edev = (struct eeh_dev *)data;
-	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
-	enum pci_ers_result rc, *res = userdata;
-	struct pci_driver *driver;
-
-	if (!dev) return NULL;
-	dev->error_state = pci_channel_io_normal;
-
-	driver = eeh_pcid_get(dev);
-	if (!driver) return NULL;
-
-	eeh_enable_irq(dev);
-
-	if (!driver->err_handler ||
-	    !driver->err_handler->slot_reset) {
-		eeh_pcid_put(dev);
-		return NULL;
-	}
-
-	rc = driver->err_handler->slot_reset(dev);
-	if ((*res == PCI_ERS_RESULT_NONE) ||
-	    (*res == PCI_ERS_RESULT_RECOVERED)) *res = rc;
-	if (*res == PCI_ERS_RESULT_DISCONNECT &&
-	     rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
-
-	eeh_pcid_put(dev);
-	return NULL;
-}
-
-/**
- * eeh_report_resume - Tell device to resume normal operations
- * @data: eeh device
- * @userdata: return value
- *
- * This routine must be called to notify the device driver that it
- * could resume so that the device driver can do some initialization
- * to make the recovered device work again.
- */
-static void *eeh_report_resume(void *data, void *userdata)
-{
-	struct eeh_dev *edev = (struct eeh_dev *)data;
-	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
-	struct pci_driver *driver;
-
-	if (!dev) return NULL;
-	dev->error_state = pci_channel_io_normal;
-
-	driver = eeh_pcid_get(dev);
-	if (!driver) return NULL;
-
-	eeh_enable_irq(dev);
-
-	if (!driver->err_handler ||
-	    !driver->err_handler->resume) {
-		eeh_pcid_put(dev);
-		return NULL;
-	}
-
-	driver->err_handler->resume(dev);
-
-	eeh_pcid_put(dev);
-	return NULL;
-}
-
-/**
- * eeh_report_failure - Tell device driver that device is dead.
- * @data: eeh device
- * @userdata: return value
- *
- * This informs the device driver that the device is permanently
- * dead, and that no further recovery attempts will be made on it.
- */
-static void *eeh_report_failure(void *data, void *userdata)
-{
-	struct eeh_dev *edev = (struct eeh_dev *)data;
-	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
-	struct pci_driver *driver;
-
-	if (!dev) return NULL;
-	dev->error_state = pci_channel_io_perm_failure;
-
-	driver = eeh_pcid_get(dev);
-	if (!driver) return NULL;
-
-	eeh_disable_irq(dev);
-
-	if (!driver->err_handler ||
-	    !driver->err_handler->error_detected) {
-		eeh_pcid_put(dev);
-		return NULL;
-	}
-
-	driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
-
-	eeh_pcid_put(dev);
-	return NULL;
-}
-
-/**
- * eeh_reset_device - Perform actual reset of a pci slot
- * @pe: EEH PE
- * @bus: PCI bus corresponding to the isolcated slot
- *
- * This routine must be called to do reset on the indicated PE.
- * During the reset, udev might be invoked because those affected
- * PCI devices will be removed and then added.
- */
-static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
-{
-	int cnt, rc;
-
-	/* pcibios will clear the counter; save the value */
-	cnt = pe->freeze_count;
-
-	/*
-	 * We don't remove the corresponding PE instances because
-	 * we need the information afterwords. The attached EEH
-	 * devices are expected to be attached soon when calling
-	 * into pcibios_add_pci_devices().
-	 */
-	if (bus)
-		__pcibios_remove_pci_devices(bus, 0);
-
-	/* Reset the pci controller. (Asserts RST#; resets config space).
-	 * Reconfigure bridges and devices. Don't try to bring the system
-	 * up if the reset failed for some reason.
-	 */
-	rc = eeh_reset_pe(pe);
-	if (rc)
-		return rc;
-
-	/* Restore PE */
-	eeh_ops->configure_bridge(pe);
-	eeh_pe_restore_bars(pe);
-
-	/* Give the system 5 seconds to finish running the user-space
-	 * hotplug shutdown scripts, e.g. ifdown for ethernet.  Yes,
-	 * this is a hack, but if we don't do this, and try to bring
-	 * the device up before the scripts have taken it down,
-	 * potentially weird things happen.
-	 */
-	if (bus) {
-		ssleep(5);
-		pcibios_add_pci_devices(bus);
-	}
-	pe->freeze_count = cnt;
-
-	return 0;
-}
-
-/* The longest amount of time to wait for a pci device
- * to come back on line, in seconds.
- */
-#define MAX_WAIT_FOR_RECOVERY 150
-
-/**
- * eeh_handle_event - Reset a PCI device after hard lockup.
- * @pe: EEH PE
- *
- * While PHB detects address or data parity errors on particular PCI
- * slot, the associated PE will be frozen. Besides, DMA's occurring
- * to wild addresses (which usually happen due to bugs in device
- * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
- * #PERR or other misc PCI-related errors also can trigger EEH errors.
- *
- * Recovery process consists of unplugging the device driver (which
- * generated hotplug events to userspace), then issuing a PCI #RST to
- * the device, then reconfiguring the PCI config space for all bridges
- * & devices under this slot, and then finally restarting the device
- * drivers (which cause a second set of hotplug events to go out to
- * userspace).
- */
-void eeh_handle_event(struct eeh_pe *pe)
-{
-	struct pci_bus *frozen_bus;
-	int rc = 0;
-	enum pci_ers_result result = PCI_ERS_RESULT_NONE;
-
-	frozen_bus = eeh_pe_bus_get(pe);
-	if (!frozen_bus) {
-		pr_err("%s: Cannot find PCI bus for PHB#%d-PE#%x\n",
-			__func__, pe->phb->global_number, pe->addr);
-		return;
-	}
-
-	pe->freeze_count++;
-	if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES)
-		goto excess_failures;
-	pr_warning("EEH: This PCI device has failed %d times in the last hour\n",
-		pe->freeze_count);
-
-	/* Walk the various device drivers attached to this slot through
-	 * a reset sequence, giving each an opportunity to do what it needs
-	 * to accomplish the reset.  Each child gets a report of the
-	 * status ... if any child can't handle the reset, then the entire
-	 * slot is dlpar removed and added.
-	 */
-	eeh_pe_dev_traverse(pe, eeh_report_error, &result);
-
-	/* Get the current PCI slot state. This can take a long time,
-	 * sometimes over 3 seconds for certain systems.
-	 */
-	rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
-	if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
-		printk(KERN_WARNING "EEH: Permanent failure\n");
-		goto hard_fail;
-	}
-
-	/* Since rtas may enable MMIO when posting the error log,
-	 * don't post the error log until after all dev drivers
-	 * have been informed.
-	 */
-	eeh_slot_error_detail(pe, EEH_LOG_TEMP);
-
-	/* If all device drivers were EEH-unaware, then shut
-	 * down all of the device drivers, and hope they
-	 * go down willingly, without panicing the system.
-	 */
-	if (result == PCI_ERS_RESULT_NONE) {
-		rc = eeh_reset_device(pe, frozen_bus);
-		if (rc) {
-			printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc);
-			goto hard_fail;
-		}
-	}
-
-	/* If all devices reported they can proceed, then re-enable MMIO */
-	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
-		rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
-
-		if (rc < 0)
-			goto hard_fail;
-		if (rc) {
-			result = PCI_ERS_RESULT_NEED_RESET;
-		} else {
-			result = PCI_ERS_RESULT_NONE;
-			eeh_pe_dev_traverse(pe, eeh_report_mmio_enabled, &result);
-		}
-	}
-
-	/* If all devices reported they can proceed, then re-enable DMA */
-	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
-		rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
-
-		if (rc < 0)
-			goto hard_fail;
-		if (rc)
-			result = PCI_ERS_RESULT_NEED_RESET;
-		else
-			result = PCI_ERS_RESULT_RECOVERED;
-	}
-
-	/* If any device has a hard failure, then shut off everything. */
-	if (result == PCI_ERS_RESULT_DISCONNECT) {
-		printk(KERN_WARNING "EEH: Device driver gave up\n");
-		goto hard_fail;
-	}
-
-	/* If any device called out for a reset, then reset the slot */
-	if (result == PCI_ERS_RESULT_NEED_RESET) {
-		rc = eeh_reset_device(pe, NULL);
-		if (rc) {
-			printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc);
-			goto hard_fail;
-		}
-		result = PCI_ERS_RESULT_NONE;
-		eeh_pe_dev_traverse(pe, eeh_report_reset, &result);
-	}
-
-	/* All devices should claim they have recovered by now. */
-	if ((result != PCI_ERS_RESULT_RECOVERED) &&
-	    (result != PCI_ERS_RESULT_NONE)) {
-		printk(KERN_WARNING "EEH: Not recovered\n");
-		goto hard_fail;
-	}
-
-	/* Tell all device drivers that they can resume operations */
-	eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
-
-	return;
-
-excess_failures:
-	/*
-	 * About 90% of all real-life EEH failures in the field
-	 * are due to poorly seated PCI cards. Only 10% or so are
-	 * due to actual, failed cards.
-	 */
-	pr_err("EEH: PHB#%d-PE#%x has failed %d times in the\n"
-	       "last hour and has been permanently disabled.\n"
-	       "Please try reseating or replacing it.\n",
-		pe->phb->global_number, pe->addr,
-		pe->freeze_count);
-	goto perm_error;
-
-hard_fail:
-	pr_err("EEH: Unable to recover from failure from PHB#%d-PE#%x.\n"
-	       "Please try reseating or replacing it\n",
-		pe->phb->global_number, pe->addr);
-
-perm_error:
-	eeh_slot_error_detail(pe, EEH_LOG_PERM);
-
-	/* Notify all devices that they're about to go down. */
-	eeh_pe_dev_traverse(pe, eeh_report_failure, NULL);
-
-	/* Shut down the device drivers for good. */
-	if (frozen_bus)
-		pcibios_remove_pci_devices(frozen_bus);
-}
-
diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c
deleted file mode 100644
index 185bedd..0000000
--- a/arch/powerpc/platforms/pseries/eeh_event.c
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- *
- * Copyright (c) 2005 Linas Vepstas <linas@linas.org>
- */
-
-#include <linux/delay.h>
-#include <linux/list.h>
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/pci.h>
-#include <linux/slab.h>
-#include <linux/workqueue.h>
-#include <linux/kthread.h>
-#include <asm/eeh_event.h>
-#include <asm/ppc-pci.h>
-
-/** Overview:
- *  EEH error states may be detected within exception handlers;
- *  however, the recovery processing needs to occur asynchronously
- *  in a normal kernel context and not an interrupt context.
- *  This pair of routines creates an event and queues it onto a
- *  work-queue, where a worker thread can drive recovery.
- */
-
-/* EEH event workqueue setup. */
-static DEFINE_SPINLOCK(eeh_eventlist_lock);
-LIST_HEAD(eeh_eventlist);
-static void eeh_thread_launcher(struct work_struct *);
-DECLARE_WORK(eeh_event_wq, eeh_thread_launcher);
-
-/* Serialize reset sequences for a given pci device */
-DEFINE_MUTEX(eeh_event_mutex);
-
-/**
- * eeh_event_handler - Dispatch EEH events.
- * @dummy - unused
- *
- * The detection of a frozen slot can occur inside an interrupt,
- * where it can be hard to do anything about it.  The goal of this
- * routine is to pull these detection events out of the context
- * of the interrupt handler, and re-dispatch them for processing
- * at a later time in a normal context.
- */
-static int eeh_event_handler(void * dummy)
-{
-	unsigned long flags;
-	struct eeh_event *event;
-	struct eeh_pe *pe;
-
-	spin_lock_irqsave(&eeh_eventlist_lock, flags);
-	event = NULL;
-
-	/* Unqueue the event, get ready to process. */
-	if (!list_empty(&eeh_eventlist)) {
-		event = list_entry(eeh_eventlist.next, struct eeh_event, list);
-		list_del(&event->list);
-	}
-	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
-
-	if (event == NULL)
-		return 0;
-
-	/* Serialize processing of EEH events */
-	mutex_lock(&eeh_event_mutex);
-	pe = event->pe;
-	eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
-	pr_info("EEH: Detected PCI bus error on PHB#%d-PE#%x\n",
-		pe->phb->global_number, pe->addr);
-
-	set_current_state(TASK_INTERRUPTIBLE);	/* Don't add to load average */
-	eeh_handle_event(pe);
-	eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
-
-	kfree(event);
-	mutex_unlock(&eeh_event_mutex);
-
-	/* If there are no new errors after an hour, clear the counter. */
-	if (pe && pe->freeze_count > 0) {
-		msleep_interruptible(3600*1000);
-		if (pe->freeze_count > 0)
-			pe->freeze_count--;
-
-	}
-
-	return 0;
-}
-
-/**
- * eeh_thread_launcher - Start kernel thread to handle EEH events
- * @dummy - unused
- *
- * This routine is called to start the kernel thread for processing
- * EEH event.
- */
-static void eeh_thread_launcher(struct work_struct *dummy)
-{
-	if (IS_ERR(kthread_run(eeh_event_handler, NULL, "eehd")))
-		printk(KERN_ERR "Failed to start EEH daemon\n");
-}
-
-/**
- * eeh_send_failure_event - Generate a PCI error event
- * @pe: EEH PE
- *
- * This routine can be called within an interrupt context;
- * the actual event will be delivered in a normal context
- * (from a workqueue).
- */
-int eeh_send_failure_event(struct eeh_pe *pe)
-{
-	unsigned long flags;
-	struct eeh_event *event;
-
-	event = kzalloc(sizeof(*event), GFP_ATOMIC);
-	if (!event) {
-		pr_err("EEH: out of memory, event not handled\n");
-		return -ENOMEM;
-	}
-	event->pe = pe;
-
-	/* We may or may not be called in an interrupt context */
-	spin_lock_irqsave(&eeh_eventlist_lock, flags);
-	list_add(&event->list, &eeh_eventlist);
-	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
-
-	schedule_work(&eeh_event_wq);
-
-	return 0;
-}
diff --git a/arch/powerpc/platforms/pseries/eeh_pe.c b/arch/powerpc/platforms/pseries/eeh_pe.c
deleted file mode 100644
index 9d4a9e8..0000000
--- a/arch/powerpc/platforms/pseries/eeh_pe.c
+++ /dev/null
@@ -1,653 +0,0 @@
-/*
- * The file intends to implement PE based on the information from
- * platforms. Basically, there have 3 types of PEs: PHB/Bus/Device.
- * All the PEs should be organized as hierarchy tree. The first level
- * of the tree will be associated to existing PHBs since the particular
- * PE is only meaningful in one PHB domain.
- *
- * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- */
-
-#include <linux/export.h>
-#include <linux/gfp.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/string.h>
-
-#include <asm/pci-bridge.h>
-#include <asm/ppc-pci.h>
-
-static LIST_HEAD(eeh_phb_pe);
-
-/**
- * eeh_pe_alloc - Allocate PE
- * @phb: PCI controller
- * @type: PE type
- *
- * Allocate PE instance dynamically.
- */
-static struct eeh_pe *eeh_pe_alloc(struct pci_controller *phb, int type)
-{
-	struct eeh_pe *pe;
-
-	/* Allocate PHB PE */
-	pe = kzalloc(sizeof(struct eeh_pe), GFP_KERNEL);
-	if (!pe) return NULL;
-
-	/* Initialize PHB PE */
-	pe->type = type;
-	pe->phb = phb;
-	INIT_LIST_HEAD(&pe->child_list);
-	INIT_LIST_HEAD(&pe->child);
-	INIT_LIST_HEAD(&pe->edevs);
-
-	return pe;
-}
-
-/**
- * eeh_phb_pe_create - Create PHB PE
- * @phb: PCI controller
- *
- * The function should be called while the PHB is detected during
- * system boot or PCI hotplug in order to create PHB PE.
- */
-int eeh_phb_pe_create(struct pci_controller *phb)
-{
-	struct eeh_pe *pe;
-
-	/* Allocate PHB PE */
-	pe = eeh_pe_alloc(phb, EEH_PE_PHB);
-	if (!pe) {
-		pr_err("%s: out of memory!\n", __func__);
-		return -ENOMEM;
-	}
-
-	/* Put it into the list */
-	eeh_lock();
-	list_add_tail(&pe->child, &eeh_phb_pe);
-	eeh_unlock();
-
-	pr_debug("EEH: Add PE for PHB#%d\n", phb->global_number);
-
-	return 0;
-}
-
-/**
- * eeh_phb_pe_get - Retrieve PHB PE based on the given PHB
- * @phb: PCI controller
- *
- * The overall PEs form hierarchy tree. The first layer of the
- * hierarchy tree is composed of PHB PEs. The function is used
- * to retrieve the corresponding PHB PE according to the given PHB.
- */
-static struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb)
-{
-	struct eeh_pe *pe;
-
-	list_for_each_entry(pe, &eeh_phb_pe, child) {
-		/*
-		 * Actually, we needn't check the type since
-		 * the PE for PHB has been determined when that
-		 * was created.
-		 */
-		if ((pe->type & EEH_PE_PHB) && pe->phb == phb)
-			return pe;
-	}
-
-	return NULL;
-}
-
-/**
- * eeh_pe_next - Retrieve the next PE in the tree
- * @pe: current PE
- * @root: root PE
- *
- * The function is used to retrieve the next PE in the
- * hierarchy PE tree.
- */
-static struct eeh_pe *eeh_pe_next(struct eeh_pe *pe,
-				  struct eeh_pe *root)
-{
-	struct list_head *next = pe->child_list.next;
-
-	if (next == &pe->child_list) {
-		while (1) {
-			if (pe == root)
-				return NULL;
-			next = pe->child.next;
-			if (next != &pe->parent->child_list)
-				break;
-			pe = pe->parent;
-		}
-	}
-
-	return list_entry(next, struct eeh_pe, child);
-}
-
-/**
- * eeh_pe_traverse - Traverse PEs in the specified PHB
- * @root: root PE
- * @fn: callback
- * @flag: extra parameter to callback
- *
- * The function is used to traverse the specified PE and its
- * child PEs. The traversing is to be terminated once the
- * callback returns something other than NULL, or no more PEs
- * to be traversed.
- */
-static void *eeh_pe_traverse(struct eeh_pe *root,
-			eeh_traverse_func fn, void *flag)
-{
-	struct eeh_pe *pe;
-	void *ret;
-
-	for (pe = root; pe; pe = eeh_pe_next(pe, root)) {
-		ret = fn(pe, flag);
-		if (ret) return ret;
-	}
-
-	return NULL;
-}
-
-/**
- * eeh_pe_dev_traverse - Traverse the devices from the PE
- * @root: EEH PE
- * @fn: function callback
- * @flag: extra parameter to callback
- *
- * The function is used to traverse the devices of the specified
- * PE and its child PEs.
- */
-void *eeh_pe_dev_traverse(struct eeh_pe *root,
-		eeh_traverse_func fn, void *flag)
-{
-	struct eeh_pe *pe;
-	struct eeh_dev *edev;
-	void *ret;
-
-	if (!root) {
-		pr_warning("%s: Invalid PE %p\n", __func__, root);
-		return NULL;
-	}
-
-	eeh_lock();
-
-	/* Traverse root PE */
-	for (pe = root; pe; pe = eeh_pe_next(pe, root)) {
-		eeh_pe_for_each_dev(pe, edev) {
-			ret = fn(edev, flag);
-			if (ret) {
-				eeh_unlock();
-				return ret;
-			}
-		}
-	}
-
-	eeh_unlock();
-
-	return NULL;
-}
-
-/**
- * __eeh_pe_get - Check the PE address
- * @data: EEH PE
- * @flag: EEH device
- *
- * For one particular PE, it can be identified by PE address
- * or tranditional BDF address. BDF address is composed of
- * Bus/Device/Function number. The extra data referred by flag
- * indicates which type of address should be used.
- */
-static void *__eeh_pe_get(void *data, void *flag)
-{
-	struct eeh_pe *pe = (struct eeh_pe *)data;
-	struct eeh_dev *edev = (struct eeh_dev *)flag;
-
-	/* Unexpected PHB PE */
-	if (pe->type & EEH_PE_PHB)
-		return NULL;
-
-	/* We prefer PE address */
-	if (edev->pe_config_addr &&
-	   (edev->pe_config_addr == pe->addr))
-		return pe;
-
-	/* Try BDF address */
-	if (edev->pe_config_addr &&
-	   (edev->config_addr == pe->config_addr))
-		return pe;
-
-	return NULL;
-}
-
-/**
- * eeh_pe_get - Search PE based on the given address
- * @edev: EEH device
- *
- * Search the corresponding PE based on the specified address which
- * is included in the eeh device. The function is used to check if
- * the associated PE has been created against the PE address. It's
- * notable that the PE address has 2 format: traditional PE address
- * which is composed of PCI bus/device/function number, or unified
- * PE address.
- */
-static struct eeh_pe *eeh_pe_get(struct eeh_dev *edev)
-{
-	struct eeh_pe *root = eeh_phb_pe_get(edev->phb);
-	struct eeh_pe *pe;
-
-	pe = eeh_pe_traverse(root, __eeh_pe_get, edev);
-
-	return pe;
-}
-
-/**
- * eeh_pe_get_parent - Retrieve the parent PE
- * @edev: EEH device
- *
- * The whole PEs existing in the system are organized as hierarchy
- * tree. The function is used to retrieve the parent PE according
- * to the parent EEH device.
- */
-static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
-{
-	struct device_node *dn;
-	struct eeh_dev *parent;
-
-	/*
-	 * It might have the case for the indirect parent
-	 * EEH device already having associated PE, but
-	 * the direct parent EEH device doesn't have yet.
-	 */
-	dn = edev->dn->parent;
-	while (dn) {
-		/* We're poking out of PCI territory */
-		if (!PCI_DN(dn)) return NULL;
-
-		parent = of_node_to_eeh_dev(dn);
-		/* We're poking out of PCI territory */
-		if (!parent) return NULL;
-
-		if (parent->pe)
-			return parent->pe;
-
-		dn = dn->parent;
-	}
-
-	return NULL;
-}
-
-/**
- * eeh_add_to_parent_pe - Add EEH device to parent PE
- * @edev: EEH device
- *
- * Add EEH device to the parent PE. If the parent PE already
- * exists, the PE type will be changed to EEH_PE_BUS. Otherwise,
- * we have to create new PE to hold the EEH device and the new
- * PE will be linked to its parent PE as well.
- */
-int eeh_add_to_parent_pe(struct eeh_dev *edev)
-{
-	struct eeh_pe *pe, *parent;
-
-	eeh_lock();
-
-	/*
-	 * Search the PE has been existing or not according
-	 * to the PE address. If that has been existing, the
-	 * PE should be composed of PCI bus and its subordinate
-	 * components.
-	 */
-	pe = eeh_pe_get(edev);
-	if (pe && !(pe->type & EEH_PE_INVALID)) {
-		if (!edev->pe_config_addr) {
-			eeh_unlock();
-			pr_err("%s: PE with addr 0x%x already exists\n",
-				__func__, edev->config_addr);
-			return -EEXIST;
-		}
-
-		/* Mark the PE as type of PCI bus */
-		pe->type = EEH_PE_BUS;
-		edev->pe = pe;
-
-		/* Put the edev to PE */
-		list_add_tail(&edev->list, &pe->edevs);
-		eeh_unlock();
-		pr_debug("EEH: Add %s to Bus PE#%x\n",
-			edev->dn->full_name, pe->addr);
-
-		return 0;
-	} else if (pe && (pe->type & EEH_PE_INVALID)) {
-		list_add_tail(&edev->list, &pe->edevs);
-		edev->pe = pe;
-		/*
-		 * We're running to here because of PCI hotplug caused by
-		 * EEH recovery. We need clear EEH_PE_INVALID until the top.
-		 */
-		parent = pe;
-		while (parent) {
-			if (!(parent->type & EEH_PE_INVALID))
-				break;
-			parent->type &= ~EEH_PE_INVALID;
-			parent = parent->parent;
-		}
-		eeh_unlock();
-		pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n",
-			edev->dn->full_name, pe->addr, pe->parent->addr);
-
-		return 0;
-	}
-
-	/* Create a new EEH PE */
-	pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
-	if (!pe) {
-		eeh_unlock();
-		pr_err("%s: out of memory!\n", __func__);
-		return -ENOMEM;
-	}
-	pe->addr	= edev->pe_config_addr;
-	pe->config_addr	= edev->config_addr;
-
-	/*
-	 * Put the new EEH PE into hierarchy tree. If the parent
-	 * can't be found, the newly created PE will be attached
-	 * to PHB directly. Otherwise, we have to associate the
-	 * PE with its parent.
-	 */
-	parent = eeh_pe_get_parent(edev);
-	if (!parent) {
-		parent = eeh_phb_pe_get(edev->phb);
-		if (!parent) {
-			eeh_unlock();
-			pr_err("%s: No PHB PE is found (PHB Domain=%d)\n",
-				__func__, edev->phb->global_number);
-			edev->pe = NULL;
-			kfree(pe);
-			return -EEXIST;
-		}
-	}
-	pe->parent = parent;
-
-	/*
-	 * Put the newly created PE into the child list and
-	 * link the EEH device accordingly.
-	 */
-	list_add_tail(&pe->child, &parent->child_list);
-	list_add_tail(&edev->list, &pe->edevs);
-	edev->pe = pe;
-	eeh_unlock();
-	pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n",
-		edev->dn->full_name, pe->addr, pe->parent->addr);
-
-	return 0;
-}
-
-/**
- * eeh_rmv_from_parent_pe - Remove one EEH device from the associated PE
- * @edev: EEH device
- * @purge_pe: remove PE or not
- *
- * The PE hierarchy tree might be changed when doing PCI hotplug.
- * Also, the PCI devices or buses could be removed from the system
- * during EEH recovery. So we have to call the function remove the
- * corresponding PE accordingly if necessary.
- */
-int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe)
-{
-	struct eeh_pe *pe, *parent, *child;
-	int cnt;
-
-	if (!edev->pe) {
-		pr_warning("%s: No PE found for EEH device %s\n",
-			__func__, edev->dn->full_name);
-		return -EEXIST;
-	}
-
-	eeh_lock();
-
-	/* Remove the EEH device */
-	pe = edev->pe;
-	edev->pe = NULL;
-	list_del(&edev->list);
-
-	/*
-	 * Check if the parent PE includes any EEH devices.
-	 * If not, we should delete that. Also, we should
-	 * delete the parent PE if it doesn't have associated
-	 * child PEs and EEH devices.
-	 */
-	while (1) {
-		parent = pe->parent;
-		if (pe->type & EEH_PE_PHB)
-			break;
-
-		if (purge_pe) {
-			if (list_empty(&pe->edevs) &&
-			    list_empty(&pe->child_list)) {
-				list_del(&pe->child);
-				kfree(pe);
-			} else {
-				break;
-			}
-		} else {
-			if (list_empty(&pe->edevs)) {
-				cnt = 0;
-				list_for_each_entry(child, &pe->child_list, child) {
-					if (!(child->type & EEH_PE_INVALID)) {
-						cnt++;
-						break;
-					}
-				}
-
-				if (!cnt)
-					pe->type |= EEH_PE_INVALID;
-				else
-					break;
-			}
-		}
-
-		pe = parent;
-	}
-
-	eeh_unlock();
-
-	return 0;
-}
-
-/**
- * __eeh_pe_state_mark - Mark the state for the PE
- * @data: EEH PE
- * @flag: state
- *
- * The function is used to mark the indicated state for the given
- * PE. Also, the associated PCI devices will be put into IO frozen
- * state as well.
- */
-static void *__eeh_pe_state_mark(void *data, void *flag)
-{
-	struct eeh_pe *pe = (struct eeh_pe *)data;
-	int state = *((int *)flag);
-	struct eeh_dev *tmp;
-	struct pci_dev *pdev;
-
-	/*
-	 * Mark the PE with the indicated state. Also,
-	 * the associated PCI device will be put into
-	 * I/O frozen state to avoid I/O accesses from
-	 * the PCI device driver.
-	 */
-	pe->state |= state;
-	eeh_pe_for_each_dev(pe, tmp) {
-		pdev = eeh_dev_to_pci_dev(tmp);
-		if (pdev)
-			pdev->error_state = pci_channel_io_frozen;
-	}
-
-	return NULL;
-}
-
-/**
- * eeh_pe_state_mark - Mark specified state for PE and its associated device
- * @pe: EEH PE
- *
- * EEH error affects the current PE and its child PEs. The function
- * is used to mark appropriate state for the affected PEs and the
- * associated devices.
- */
-void eeh_pe_state_mark(struct eeh_pe *pe, int state)
-{
-	eeh_lock();
-	eeh_pe_traverse(pe, __eeh_pe_state_mark, &state);
-	eeh_unlock();
-}
-
-/**
- * __eeh_pe_state_clear - Clear state for the PE
- * @data: EEH PE
- * @flag: state
- *
- * The function is used to clear the indicated state from the
- * given PE. Besides, we also clear the check count of the PE
- * as well.
- */
-static void *__eeh_pe_state_clear(void *data, void *flag)
-{
-	struct eeh_pe *pe = (struct eeh_pe *)data;
-	int state = *((int *)flag);
-
-	pe->state &= ~state;
-	pe->check_count = 0;
-
-	return NULL;
-}
-
-/**
- * eeh_pe_state_clear - Clear state for the PE and its children
- * @pe: PE
- * @state: state to be cleared
- *
- * When the PE and its children has been recovered from error,
- * we need clear the error state for that. The function is used
- * for the purpose.
- */
-void eeh_pe_state_clear(struct eeh_pe *pe, int state)
-{
-	eeh_lock();
-	eeh_pe_traverse(pe, __eeh_pe_state_clear, &state);
-	eeh_unlock();
-}
-
-/**
- * eeh_restore_one_device_bars - Restore the Base Address Registers for one device
- * @data: EEH device
- * @flag: Unused
- *
- * Loads the PCI configuration space base address registers,
- * the expansion ROM base address, the latency timer, and etc.
- * from the saved values in the device node.
- */
-static void *eeh_restore_one_device_bars(void *data, void *flag)
-{
-	int i;
-	u32 cmd;
-	struct eeh_dev *edev = (struct eeh_dev *)data;
-	struct device_node *dn = eeh_dev_to_of_node(edev);
-
-	for (i = 4; i < 10; i++)
-		eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]);
-	/* 12 == Expansion ROM Address */
-	eeh_ops->write_config(dn, 12*4, 4, edev->config_space[12]);
-
-#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF))
-#define SAVED_BYTE(OFF) (((u8 *)(edev->config_space))[BYTE_SWAP(OFF)])
-
-	eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1,
-		SAVED_BYTE(PCI_CACHE_LINE_SIZE));
-	eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1,
-		SAVED_BYTE(PCI_LATENCY_TIMER));
-
-	/* max latency, min grant, interrupt pin and line */
-	eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]);
-
-	/*
-	 * Restore PERR & SERR bits, some devices require it,
-	 * don't touch the other command bits
-	 */
-	eeh_ops->read_config(dn, PCI_COMMAND, 4, &cmd);
-	if (edev->config_space[1] & PCI_COMMAND_PARITY)
-		cmd |= PCI_COMMAND_PARITY;
-	else
-		cmd &= ~PCI_COMMAND_PARITY;
-	if (edev->config_space[1] & PCI_COMMAND_SERR)
-		cmd |= PCI_COMMAND_SERR;
-	else
-		cmd &= ~PCI_COMMAND_SERR;
-	eeh_ops->write_config(dn, PCI_COMMAND, 4, cmd);
-
-	return NULL;
-}
-
-/**
- * eeh_pe_restore_bars - Restore the PCI config space info
- * @pe: EEH PE
- *
- * This routine performs a recursive walk to the children
- * of this device as well.
- */
-void eeh_pe_restore_bars(struct eeh_pe *pe)
-{
-	/*
-	 * We needn't take the EEH lock since eeh_pe_dev_traverse()
-	 * will take that.
-	 */
-	eeh_pe_dev_traverse(pe, eeh_restore_one_device_bars, NULL);
-}
-
-/**
- * eeh_pe_bus_get - Retrieve PCI bus according to the given PE
- * @pe: EEH PE
- *
- * Retrieve the PCI bus according to the given PE. Basically,
- * there're 3 types of PEs: PHB/Bus/Device. For PHB PE, the
- * primary PCI bus will be retrieved. The parent bus will be
- * returned for BUS PE. However, we don't have associated PCI
- * bus for DEVICE PE.
- */
-struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
-{
-	struct pci_bus *bus = NULL;
-	struct eeh_dev *edev;
-	struct pci_dev *pdev;
-
-	eeh_lock();
-
-	if (pe->type & EEH_PE_PHB) {
-		bus = pe->phb->bus;
-	} else if (pe->type & EEH_PE_BUS ||
-		   pe->type & EEH_PE_DEVICE) {
-		edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
-		pdev = eeh_dev_to_pci_dev(edev);
-		if (pdev)
-			bus = pdev->bus;
-	}
-
-	eeh_unlock();
-
-	return bus;
-}
diff --git a/arch/powerpc/platforms/pseries/eeh_sysfs.c b/arch/powerpc/platforms/pseries/eeh_sysfs.c
deleted file mode 100644
index d377083..0000000
--- a/arch/powerpc/platforms/pseries/eeh_sysfs.c
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Sysfs entries for PCI Error Recovery for PAPR-compliant platform.
- * Copyright IBM Corporation 2007
- * Copyright Linas Vepstas <linas@austin.ibm.com> 2007
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
- */
-#include <linux/pci.h>
-#include <linux/stat.h>
-#include <asm/ppc-pci.h>
-#include <asm/pci-bridge.h>
-
-/**
- * EEH_SHOW_ATTR -- Create sysfs entry for eeh statistic
- * @_name: name of file in sysfs directory
- * @_memb: name of member in struct pci_dn to access
- * @_format: printf format for display
- *
- * All of the attributes look very similar, so just
- * auto-gen a cut-n-paste routine to display them.
- */
-#define EEH_SHOW_ATTR(_name,_memb,_format)               \
-static ssize_t eeh_show_##_name(struct device *dev,      \
-		struct device_attribute *attr, char *buf)          \
-{                                                        \
-	struct pci_dev *pdev = to_pci_dev(dev);               \
-	struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);      \
-	                                                      \
-	if (!edev)                                            \
-		return 0;                                     \
-	                                                      \
-	return sprintf(buf, _format "\n", edev->_memb);       \
-}                                                        \
-static DEVICE_ATTR(_name, S_IRUGO, eeh_show_##_name, NULL);
-
-EEH_SHOW_ATTR(eeh_mode,            mode,            "0x%x");
-EEH_SHOW_ATTR(eeh_config_addr,     config_addr,     "0x%x");
-EEH_SHOW_ATTR(eeh_pe_config_addr,  pe_config_addr,  "0x%x");
-
-void eeh_sysfs_add_device(struct pci_dev *pdev)
-{
-	int rc=0;
-
-	rc += device_create_file(&pdev->dev, &dev_attr_eeh_mode);
-	rc += device_create_file(&pdev->dev, &dev_attr_eeh_config_addr);
-	rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
-
-	if (rc)
-		printk(KERN_WARNING "EEH: Unable to create sysfs entries\n");
-}
-
-void eeh_sysfs_remove_device(struct pci_dev *pdev)
-{
-	device_remove_file(&pdev->dev, &dev_attr_eeh_mode);
-	device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr);
-	device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
-}
-
diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c
index c91b22b..efe6137 100644
--- a/arch/powerpc/platforms/pseries/pci_dlpar.c
+++ b/arch/powerpc/platforms/pseries/pci_dlpar.c
@@ -64,91 +64,6 @@ pcibios_find_pci_bus(struct device_node *dn)
 }
 EXPORT_SYMBOL_GPL(pcibios_find_pci_bus);
 
-/**
- * __pcibios_remove_pci_devices - remove all devices under this bus
- * @bus: the indicated PCI bus
- * @purge_pe: destroy the PE on removal of PCI devices
- *
- * Remove all of the PCI devices under this bus both from the
- * linux pci device tree, and from the powerpc EEH address cache.
- * By default, the corresponding PE will be destroied during the
- * normal PCI hotplug path. For PCI hotplug during EEH recovery,
- * the corresponding PE won't be destroied and deallocated.
- */
-void __pcibios_remove_pci_devices(struct pci_bus *bus, int purge_pe)
-{
-	struct pci_dev *dev, *tmp;
-	struct pci_bus *child_bus;
-
-	/* First go down child busses */
-	list_for_each_entry(child_bus, &bus->children, node)
-		__pcibios_remove_pci_devices(child_bus, purge_pe);
-
-	pr_debug("PCI: Removing devices on bus %04x:%02x\n",
-		pci_domain_nr(bus),  bus->number);
-	list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) {
-		pr_debug("     * Removing %s...\n", pci_name(dev));
-		eeh_remove_bus_device(dev, purge_pe);
-		pci_stop_and_remove_bus_device(dev);
-	}
-}
-
-/**
- * pcibios_remove_pci_devices - remove all devices under this bus
- *
- * Remove all of the PCI devices under this bus both from the
- * linux pci device tree, and from the powerpc EEH address cache.
- */
-void pcibios_remove_pci_devices(struct pci_bus *bus)
-{
-	__pcibios_remove_pci_devices(bus, 1);
-}
-EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices);
-
-/**
- * pcibios_add_pci_devices - adds new pci devices to bus
- *
- * This routine will find and fixup new pci devices under
- * the indicated bus. This routine presumes that there
- * might already be some devices under this bridge, so
- * it carefully tries to add only new devices.  (And that
- * is how this routine differs from other, similar pcibios
- * routines.)
- */
-void pcibios_add_pci_devices(struct pci_bus * bus)
-{
-	int slotno, num, mode, pass, max;
-	struct pci_dev *dev;
-	struct device_node *dn = pci_bus_to_OF_node(bus);
-
-	eeh_add_device_tree_early(dn);
-
-	mode = PCI_PROBE_NORMAL;
-	if (ppc_md.pci_probe_mode)
-		mode = ppc_md.pci_probe_mode(bus);
-
-	if (mode == PCI_PROBE_DEVTREE) {
-		/* use ofdt-based probe */
-		of_rescan_bus(dn, bus);
-	} else if (mode == PCI_PROBE_NORMAL) {
-		/* use legacy probe */
-		slotno = PCI_SLOT(PCI_DN(dn->child)->devfn);
-		num = pci_scan_slot(bus, PCI_DEVFN(slotno, 0));
-		if (!num)
-			return;
-		pcibios_setup_bus_devices(bus);
-		max = bus->busn_res.start;
-		for (pass=0; pass < 2; pass++)
-			list_for_each_entry(dev, &bus->devices, bus_list) {
-			if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
-			    dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)
-				max = pci_scan_bridge(bus, dev, max, pass);
-		}
-	}
-	pcibios_finish_adding_to_bus(bus);
-}
-EXPORT_SYMBOL_GPL(pcibios_add_pci_devices);
-
 struct pci_controller *init_phb_dynamic(struct device_node *dn)
 {
 	struct pci_controller *phb;
-- 
cgit v0.10.2


From 9ff67433cef319a02cb64dbe3f710a8566ebfcee Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:20:53 +0800
Subject: powerpc/eeh: Make eeh_phb_pe_get() public

One of the possible cases indicated by P7IOC interrupt is fenced
PHB. For that case, we need fetch the PE corresponding to the PHB
and disable the PHB and all subordinate PCI buses/devices, recover
from the fenced state and eventually enable the whole PHB. We need
one function to fetch the PHB PE outside eeh_pe.c and the patch is
going to make eeh_phb_pe_get() public for that purpose.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index e32c3c5..4ac6f70 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -184,6 +184,7 @@ static inline void eeh_unlock(void)
 
 typedef void *(*eeh_traverse_func)(void *data, void *flag);
 int eeh_phb_pe_create(struct pci_controller *phb);
+struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb);
 int eeh_add_to_parent_pe(struct eeh_dev *edev);
 int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe);
 void *eeh_pe_dev_traverse(struct eeh_pe *root,
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 9d4a9e8..71c4544 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -95,7 +95,7 @@ int eeh_phb_pe_create(struct pci_controller *phb)
  * hierarchy tree is composed of PHB PEs. The function is used
  * to retrieve the corresponding PHB PE according to the given PHB.
  */
-static struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb)
+struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb)
 {
 	struct eeh_pe *pe;
 
-- 
cgit v0.10.2


From 01566808547b7ecc5017a741d50dead9e53f86fa Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:20:54 +0800
Subject: powerpc/eeh: Make eeh_pe_get() public

While processing EEH event interrupt from P7IOC, we need function
to retrieve the PE according to the indicated EEH device. The patch
makes function eeh_pe_get() public so that other source files can call
it for that purpose. Also, the patch fixes referring to wrong BDF
(Bus/Device/Function) address while searching PE in function
__eeh_pe_get().

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 4ac6f70..acdfcaa 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -185,6 +185,7 @@ static inline void eeh_unlock(void)
 typedef void *(*eeh_traverse_func)(void *data, void *flag);
 int eeh_phb_pe_create(struct pci_controller *phb);
 struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb);
+struct eeh_pe *eeh_pe_get(struct eeh_dev *edev);
 int eeh_add_to_parent_pe(struct eeh_dev *edev);
 int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe);
 void *eeh_pe_dev_traverse(struct eeh_pe *root,
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 71c4544..3d2dcf5 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -228,7 +228,7 @@ static void *__eeh_pe_get(void *data, void *flag)
 		return pe;
 
 	/* Try BDF address */
-	if (edev->pe_config_addr &&
+	if (edev->config_addr &&
 	   (edev->config_addr == pe->config_addr))
 		return pe;
 
@@ -246,7 +246,7 @@ static void *__eeh_pe_get(void *data, void *flag)
  * which is composed of PCI bus/device/function number, or unified
  * PE address.
  */
-static struct eeh_pe *eeh_pe_get(struct eeh_dev *edev)
+struct eeh_pe *eeh_pe_get(struct eeh_dev *edev)
 {
 	struct eeh_pe *root = eeh_phb_pe_get(edev->phb);
 	struct eeh_pe *pe;
-- 
cgit v0.10.2


From 8cdb283371241d298332c44d7c722e26d9858ec1 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:20:55 +0800
Subject: powerpc/eeh: Trace PCI bus from PE

There're several types of PEs can be supported for now: PHB, Bus
and Device dependent PE. For PCI bus dependent PE, tracing the
corresponding PCI bus from PE (struct eeh_pe) would make the code
more efficient. The patch also enables the retrieval of PCI bus based
on the PCI bus dependent PE.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index acdfcaa..f3b49d6 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -59,6 +59,7 @@ struct eeh_pe {
 	int config_addr;		/* Traditional PCI address	*/
 	int addr;			/* PE configuration address	*/
 	struct pci_controller *phb;	/* Associated PHB		*/
+	struct pci_bus *bus;		/* Top PCI bus for bus PE	*/
 	int check_count;		/* Times of ignored error	*/
 	int freeze_count;		/* Times of froze up		*/
 	int false_positives;		/* Times of reported #ff's	*/
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 3d2dcf5..c963667 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -365,6 +365,17 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 	pe->config_addr	= edev->config_addr;
 
 	/*
+	 * While doing PE reset, we probably hot-reset the
+	 * upstream bridge. However, the PCI devices including
+	 * the associated EEH devices might be removed when EEH
+	 * core is doing recovery. So that won't safe to retrieve
+	 * the bridge through downstream EEH device. We have to
+	 * trace the parent PCI bus, then the upstream bridge.
+	 */
+	if (eeh_probe_mode_dev())
+		pe->bus = eeh_dev_to_pci_dev(edev)->bus;
+
+	/*
 	 * Put the new EEH PE into hierarchy tree. If the parent
 	 * can't be found, the newly created PE will be attached
 	 * to PHB directly. Otherwise, we have to associate the
@@ -641,12 +652,18 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
 		bus = pe->phb->bus;
 	} else if (pe->type & EEH_PE_BUS ||
 		   pe->type & EEH_PE_DEVICE) {
+		if (pe->bus) {
+			bus = pe->bus;
+			goto out;
+		}
+
 		edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
 		pdev = eeh_dev_to_pci_dev(edev);
 		if (pdev)
 			bus = pdev->bus;
 	}
 
+out:
 	eeh_unlock();
 
 	return bus;
-- 
cgit v0.10.2


From 51fb5f563274de01e47ad2cbdd48018557926fe3 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:20:56 +0800
Subject: powerpc/eeh: Make eeh_init() public

For EEH on PowerNV platform, we will do EEH probe based on the
real PCI devices. The PCI devices are available after PCI probe.
So we have to call eeh_init() explicitly on PowerNV platform
after PCI probe. The patch also does EEH probe for PowerNV platform
in eeh_init().

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index f3b49d6..beb3cbc 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -132,7 +132,7 @@ struct eeh_ops {
 	char *name;
 	int (*init)(void);
 	void* (*of_probe)(struct device_node *dn, void *flag);
-	void* (*dev_probe)(struct pci_dev *dev, void *flag);
+	int (*dev_probe)(struct pci_dev *dev, void *flag);
 	int (*set_option)(struct eeh_pe *pe, int option);
 	int (*get_pe_addr)(struct eeh_pe *pe);
 	int (*get_state)(struct eeh_pe *pe, int *state);
@@ -196,6 +196,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe);
 
 void *eeh_dev_init(struct device_node *dn, void *data);
 void eeh_dev_phb_init_dynamic(struct pci_controller *phb);
+int __init eeh_init(void);
 int __init eeh_ops_register(struct eeh_ops *ops);
 int __exit eeh_ops_unregister(const char *name);
 unsigned long eeh_check_failure(const volatile void __iomem *token,
@@ -224,6 +225,11 @@ void eeh_remove_bus_device(struct pci_dev *, int);
 
 #else /* !CONFIG_EEH */
 
+static inline int eeh_init(void)
+{
+	return 0;
+}
+
 static inline void *eeh_dev_init(struct device_node *dn, void *data)
 {
 	return NULL;
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 8a83451..c865c5f 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -674,11 +674,21 @@ int __exit eeh_ops_unregister(const char *name)
  * Even if force-off is set, the EEH hardware is still enabled, so that
  * newer systems can boot.
  */
-static int __init eeh_init(void)
+int __init eeh_init(void)
 {
 	struct pci_controller *hose, *tmp;
 	struct device_node *phb;
-	int ret;
+	static int cnt = 0;
+	int ret = 0;
+
+	/*
+	 * We have to delay the initialization on PowerNV after
+	 * the PCI hierarchy tree has been built because the PEs
+	 * are figured out based on PCI devices instead of device
+	 * tree nodes
+	 */
+	if (machine_is(powernv) && cnt++ <= 0)
+		return ret;
 
 	/* call platform initialization function */
 	if (!eeh_ops) {
@@ -700,6 +710,14 @@ static int __init eeh_init(void)
 			phb = hose->dn;
 			traverse_pci_devices(phb, eeh_ops->of_probe, NULL);
 		}
+	} else if (eeh_probe_mode_dev()) {
+		list_for_each_entry_safe(hose, tmp,
+			&hose_list, list_node)
+			pci_walk_bus(hose->bus, eeh_ops->dev_probe, NULL);
+	} else {
+		pr_warning("%s: Invalid probe mode %d\n",
+			   __func__, eeh_probe_mode);
+		return -EINVAL;
 	}
 
 	if (eeh_subsystem_enabled)
-- 
cgit v0.10.2


From 21fd21f59082c2538883a280e7f0e9b374cf6cec Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:20:57 +0800
Subject: powerpc/eeh: EEH post initialization operation

The patch adds new EEH operation post_init. It's used to notify
the platform that EEH core has completed the EEH probe. By that,
PowerNV platform starts to use the services supplied by EEH
functionality.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index beb3cbc..beec788 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -131,6 +131,7 @@ static inline struct pci_dev *eeh_dev_to_pci_dev(struct eeh_dev *edev)
 struct eeh_ops {
 	char *name;
 	int (*init)(void);
+	int (*post_init)(void);
 	void* (*of_probe)(struct device_node *dn, void *flag);
 	int (*dev_probe)(struct pci_dev *dev, void *flag);
 	int (*set_option)(struct eeh_pe *pe, int option);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index c865c5f..a29cf47 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -720,6 +720,17 @@ int __init eeh_init(void)
 		return -EINVAL;
 	}
 
+	/*
+	 * Call platform post-initialization. Actually, It's good chance
+	 * to inform platform that EEH is ready to supply service if the
+	 * I/O cache stuff has been built up.
+	 */
+	if (eeh_ops->post_init) {
+		ret = eeh_ops->post_init();
+		if (ret)
+			return ret;
+	}
+
 	if (eeh_subsystem_enabled)
 		pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
 	else
-- 
cgit v0.10.2


From 326a98ea93b5d2bbf53db5c25533ca3a7c4cce65 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:20:58 +0800
Subject: powerpc/eeh: Refactor eeh_reset_pe_once()

We shouldn't check that the returned PE status is exactly equal to
(EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE) but instead only check
that they are both set.

[benh: changelog]
Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index a29cf47..cda0b62 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -565,6 +565,7 @@ static void eeh_reset_pe_once(struct eeh_pe *pe)
  */
 int eeh_reset_pe(struct eeh_pe *pe)
 {
+	int flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
 	int i, rc;
 
 	/* Take three shots at resetting the bus */
@@ -572,7 +573,7 @@ int eeh_reset_pe(struct eeh_pe *pe)
 		eeh_reset_pe_once(pe);
 
 		rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
-		if (rc == (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE))
+		if ((rc & flags) == flags)
 			return 0;
 
 		if (rc < 0) {
-- 
cgit v0.10.2


From 26a74850b35d85a81155aa0e51211fbd6eecad25 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:20:59 +0800
Subject: powerpc/eeh: Delay EEH probe during hotplug

While doing EEH recovery, the PCI devices of the problematic PE
should be removed and then added to the system again. During the
so-called hotplug event, the PCI devices of the problematic PE
will be probed through early/late phase. We would delay EEH probe
on late point for PowerNV platform since the PCI device isn't
available in early phase.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index cda0b62..7d169d3 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -758,6 +758,14 @@ static void eeh_add_device_early(struct device_node *dn)
 {
 	struct pci_controller *phb;
 
+	/*
+	 * If we're doing EEH probe based on PCI device, we
+	 * would delay the probe until late stage because
+	 * the PCI device isn't available this moment.
+	 */
+	if (!eeh_probe_mode_devtree())
+		return;
+
 	if (!of_node_to_eeh_dev(dn))
 		return;
 	phb = of_node_to_eeh_dev(dn)->phb;
@@ -766,7 +774,6 @@ static void eeh_add_device_early(struct device_node *dn)
 	if (NULL == phb || 0 == phb->buid)
 		return;
 
-	/* FIXME: hotplug support on POWERNV */
 	eeh_ops->of_probe(dn, NULL);
 }
 
@@ -817,6 +824,13 @@ static void eeh_add_device_late(struct pci_dev *dev)
 	edev->pdev = dev;
 	dev->dev.archdata.edev = edev;
 
+	/*
+	 * We have to do the EEH probe here because the PCI device
+	 * hasn't been created yet in the early stage.
+	 */
+	if (eeh_probe_mode_dev())
+		eeh_ops->dev_probe(dev, NULL);
+
 	eeh_addr_cache_insert_dev(dev);
 }
 
-- 
cgit v0.10.2


From c86085580d5f60d2d3cea9c60d50e284558d3de7 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:21:00 +0800
Subject: powerpc/eeh: Single kthread to handle events

We possiblly have multiple kthreads running for multiple EEH errors
(events) and use one spinlock to make the process of handling those
EEH events serialized. That's unnecessary and the patch creates only
one kthread, which is started during EEH core initialization time in
eeh_init(). A new semaphore introduced to count the number of existing
EEH events in the queue and the kthread waiting on the semaphore.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h
index de67d83..de92c86 100644
--- a/arch/powerpc/include/asm/eeh_event.h
+++ b/arch/powerpc/include/asm/eeh_event.h
@@ -31,6 +31,7 @@ struct eeh_event {
 	struct eeh_pe		*pe;	/* EEH PE		*/
 };
 
+int eeh_event_init(void);
 int eeh_send_failure_event(struct eeh_pe *pe);
 void eeh_handle_event(struct eeh_pe *pe);
 
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 7d169d3..777ecc0 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -704,6 +704,11 @@ int __init eeh_init(void)
 
 	raw_spin_lock_init(&confirm_error_lock);
 
+	/* Initialize EEH event */
+	ret = eeh_event_init();
+	if (ret)
+		return ret;
+
 	/* Enable EEH for all adapters */
 	if (eeh_probe_mode_devtree()) {
 		list_for_each_entry_safe(hose, tmp,
diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
index 185bedd..62e532d 100644
--- a/arch/powerpc/kernel/eeh_event.c
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -18,11 +18,10 @@
 
 #include <linux/delay.h>
 #include <linux/list.h>
-#include <linux/mutex.h>
 #include <linux/sched.h>
+#include <linux/semaphore.h>
 #include <linux/pci.h>
 #include <linux/slab.h>
-#include <linux/workqueue.h>
 #include <linux/kthread.h>
 #include <asm/eeh_event.h>
 #include <asm/ppc-pci.h>
@@ -35,14 +34,9 @@
  *  work-queue, where a worker thread can drive recovery.
  */
 
-/* EEH event workqueue setup. */
 static DEFINE_SPINLOCK(eeh_eventlist_lock);
+static struct semaphore eeh_eventlist_sem;
 LIST_HEAD(eeh_eventlist);
-static void eeh_thread_launcher(struct work_struct *);
-DECLARE_WORK(eeh_event_wq, eeh_thread_launcher);
-
-/* Serialize reset sequences for a given pci device */
-DEFINE_MUTEX(eeh_event_mutex);
 
 /**
  * eeh_event_handler - Dispatch EEH events.
@@ -60,55 +54,62 @@ static int eeh_event_handler(void * dummy)
 	struct eeh_event *event;
 	struct eeh_pe *pe;
 
-	spin_lock_irqsave(&eeh_eventlist_lock, flags);
-	event = NULL;
-
-	/* Unqueue the event, get ready to process. */
-	if (!list_empty(&eeh_eventlist)) {
-		event = list_entry(eeh_eventlist.next, struct eeh_event, list);
-		list_del(&event->list);
-	}
-	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
-
-	if (event == NULL)
-		return 0;
-
-	/* Serialize processing of EEH events */
-	mutex_lock(&eeh_event_mutex);
-	pe = event->pe;
-	eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
-	pr_info("EEH: Detected PCI bus error on PHB#%d-PE#%x\n",
-		pe->phb->global_number, pe->addr);
-
-	set_current_state(TASK_INTERRUPTIBLE);	/* Don't add to load average */
-	eeh_handle_event(pe);
-	eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
-
-	kfree(event);
-	mutex_unlock(&eeh_event_mutex);
-
-	/* If there are no new errors after an hour, clear the counter. */
-	if (pe && pe->freeze_count > 0) {
-		msleep_interruptible(3600*1000);
-		if (pe->freeze_count > 0)
-			pe->freeze_count--;
-
+	while (!kthread_should_stop()) {
+		down(&eeh_eventlist_sem);
+
+		/* Fetch EEH event from the queue */
+		spin_lock_irqsave(&eeh_eventlist_lock, flags);
+		event = NULL;
+		if (!list_empty(&eeh_eventlist)) {
+			event = list_entry(eeh_eventlist.next,
+					   struct eeh_event, list);
+			list_del(&event->list);
+		}
+		spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+		if (!event)
+			continue;
+
+		/* We might have event without binding PE */
+		pe = event->pe;
+		if (pe) {
+			eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
+			pr_info("EEH: Detected PCI bus error on PHB#%d-PE#%x\n",
+				 pe->phb->global_number, pe->addr);
+			eeh_handle_event(pe);
+			eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
+		} else {
+			eeh_handle_event(NULL);
+		}
+
+		kfree(event);
 	}
 
 	return 0;
 }
 
 /**
- * eeh_thread_launcher - Start kernel thread to handle EEH events
- * @dummy - unused
+ * eeh_event_init - Start kernel thread to handle EEH events
  *
  * This routine is called to start the kernel thread for processing
  * EEH event.
  */
-static void eeh_thread_launcher(struct work_struct *dummy)
+int eeh_event_init(void)
 {
-	if (IS_ERR(kthread_run(eeh_event_handler, NULL, "eehd")))
-		printk(KERN_ERR "Failed to start EEH daemon\n");
+	struct task_struct *t;
+	int ret = 0;
+
+	/* Initialize semaphore */
+	sema_init(&eeh_eventlist_sem, 0);
+
+	t = kthread_run(eeh_event_handler, NULL, "eehd");
+	if (IS_ERR(t)) {
+		ret = PTR_ERR(t);
+		pr_err("%s: Failed to start EEH daemon (%d)\n",
+			__func__, ret);
+		return ret;
+	}
+
+	return 0;
 }
 
 /**
@@ -136,7 +137,8 @@ int eeh_send_failure_event(struct eeh_pe *pe)
 	list_add(&event->list, &eeh_eventlist);
 	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
 
-	schedule_work(&eeh_event_wq);
+	/* For EEH deamon to knick in */
+	up(&eeh_eventlist_sem);
 
 	return 0;
 }
-- 
cgit v0.10.2


From 5a71978e4b6ee6a01bc6aab926a3571055123029 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:21:01 +0800
Subject: powerpc/eeh: Trace time on first error for PE

We're not expecting that one specific PE got frozen for over 5
times in last hour. Otherwise, the PE will be removed from the
system upon newly coming EEH errors. The patch introduces time
stamp to trace the first error on specific PE in last hour and
function to update that accordingly. Besides, the time stamp
is recovered during PE hotplug path as we did for frozen count.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index beec788..e1109fd 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -24,6 +24,7 @@
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/string.h>
+#include <linux/time.h>
 
 struct pci_dev;
 struct pci_bus;
@@ -62,6 +63,7 @@ struct eeh_pe {
 	struct pci_bus *bus;		/* Top PCI bus for bus PE	*/
 	int check_count;		/* Times of ignored error	*/
 	int freeze_count;		/* Times of froze up		*/
+	struct timeval tstamp;		/* Time on first-time freeze	*/
 	int false_positives;		/* Times of reported #ff's	*/
 	struct eeh_pe *parent;		/* Parent PE			*/
 	struct list_head child_list;	/* Link PE to the child list	*/
@@ -190,6 +192,7 @@ struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb);
 struct eeh_pe *eeh_pe_get(struct eeh_dev *edev);
 int eeh_add_to_parent_pe(struct eeh_dev *edev);
 int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe);
+void eeh_pe_update_time_stamp(struct eeh_pe *pe);
 void *eeh_pe_dev_traverse(struct eeh_pe *root,
 		eeh_traverse_func fn, void *flag);
 void eeh_pe_restore_bars(struct eeh_pe *pe);
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index fb927af..678bc6c 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -349,10 +349,12 @@ static void *eeh_report_failure(void *data, void *userdata)
  */
 static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
 {
+	struct timeval tstamp;
 	int cnt, rc;
 
 	/* pcibios will clear the counter; save the value */
 	cnt = pe->freeze_count;
+	tstamp = pe->tstamp;
 
 	/*
 	 * We don't remove the corresponding PE instances because
@@ -385,6 +387,8 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
 		ssleep(5);
 		pcibios_add_pci_devices(bus);
 	}
+
+	pe->tstamp = tstamp;
 	pe->freeze_count = cnt;
 
 	return 0;
@@ -425,6 +429,7 @@ void eeh_handle_event(struct eeh_pe *pe)
 		return;
 	}
 
+	eeh_pe_update_time_stamp(pe);
 	pe->freeze_count++;
 	if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES)
 		goto excess_failures;
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index c963667..ae75722 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -482,6 +482,33 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe)
 }
 
 /**
+ * eeh_pe_update_time_stamp - Update PE's frozen time stamp
+ * @pe: EEH PE
+ *
+ * We have time stamp for each PE to trace its time of getting
+ * frozen in last hour. The function should be called to update
+ * the time stamp on first error of the specific PE. On the other
+ * handle, we needn't account for errors happened in last hour.
+ */
+void eeh_pe_update_time_stamp(struct eeh_pe *pe)
+{
+	struct timeval tstamp;
+
+	if (!pe) return;
+
+	if (pe->freeze_count <= 0) {
+		pe->freeze_count = 0;
+		do_gettimeofday(&pe->tstamp);
+	} else {
+		do_gettimeofday(&tstamp);
+		if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) {
+			pe->tstamp = tstamp;
+			pe->freeze_count = 0;
+		}
+	}
+}
+
+/**
  * __eeh_pe_state_mark - Mark the state for the PE
  * @data: EEH PE
  * @flag: state
-- 
cgit v0.10.2


From 99866595340fa24079f61962b2541db3225e7aad Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:21:02 +0800
Subject: powerpc/eeh: Allow to purge EEH events

On PowerNV platform, we might run into the situation where subsequent
events are duplicated events of former one, which is being processed.
For the case, we need the function implemented by the patch to purge
EEH events accordingly.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h
index de92c86..89d5670 100644
--- a/arch/powerpc/include/asm/eeh_event.h
+++ b/arch/powerpc/include/asm/eeh_event.h
@@ -33,6 +33,7 @@ struct eeh_event {
 
 int eeh_event_init(void);
 int eeh_send_failure_event(struct eeh_pe *pe);
+void eeh_remove_event(struct eeh_pe *pe);
 void eeh_handle_event(struct eeh_pe *pe);
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
index 62e532d..39bcd81 100644
--- a/arch/powerpc/kernel/eeh_event.c
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -142,3 +142,40 @@ int eeh_send_failure_event(struct eeh_pe *pe)
 
 	return 0;
 }
+
+/**
+ * eeh_remove_event - Remove EEH event from the queue
+ * @pe: Event binding to the PE
+ *
+ * On PowerNV platform, we might have subsequent coming events
+ * is part of the former one. For that case, those subsequent
+ * coming events are totally duplicated and unnecessary, thus
+ * they should be removed.
+ */
+void eeh_remove_event(struct eeh_pe *pe)
+{
+	unsigned long flags;
+	struct eeh_event *event, *tmp;
+
+	spin_lock_irqsave(&eeh_eventlist_lock, flags);
+	list_for_each_entry_safe(event, tmp, &eeh_eventlist, list) {
+		/*
+		 * If we don't have valid PE passed in, that means
+		 * we already have event corresponding to dead IOC
+		 * and all events should be purged.
+		 */
+		if (!pe) {
+			list_del(&event->list);
+			kfree(event);
+		} else if (pe->type & EEH_PE_PHB) {
+			if (event->pe && event->pe->phb == pe->phb) {
+				list_del(&event->list);
+				kfree(event);
+			}
+		} else if (event->pe == pe) {
+			list_del(&event->list);
+			kfree(event);
+		}
+	}
+	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+}
-- 
cgit v0.10.2


From 4907581dc21f43f94d3a15dd98f62a8f936b3050 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:21:03 +0800
Subject: powerpc/eeh: Export confirm_error_lock

An EEH event is created and queued to the event queue for each
ingress EEH error. When there're mutiple EEH errors, we need serialize
the process to keep consistent PE state (flags). The spinlock
"confirm_error_lock" was introduced for the purpose. We'll inject
EEH event upon error reporting interrupts on PowerNV platform. So
we export the spinlock for that to use for consistent PE state.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index e1109fd..0c0ac93 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -150,6 +150,7 @@ struct eeh_ops {
 extern struct eeh_ops *eeh_ops;
 extern int eeh_subsystem_enabled;
 extern struct mutex eeh_mutex;
+extern raw_spinlock_t confirm_error_lock;
 extern int eeh_probe_mode;
 
 #define EEH_PROBE_MODE_DEV	(1<<0)	/* From PCI device	*/
@@ -180,6 +181,16 @@ static inline void eeh_unlock(void)
 	mutex_unlock(&eeh_mutex);
 }
 
+static inline void eeh_serialize_lock(unsigned long *flags)
+{
+	raw_spin_lock_irqsave(&confirm_error_lock, *flags);
+}
+
+static inline void eeh_serialize_unlock(unsigned long flags)
+{
+	raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
+}
+
 /*
  * Max number of EEH freezes allowed before we consider the device
  * to be permanently disabled.
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 777ecc0..81cd031 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -107,7 +107,7 @@ int eeh_probe_mode;
 DEFINE_MUTEX(eeh_mutex);
 
 /* Lock to avoid races due to multiple reports of an error */
-static DEFINE_RAW_SPINLOCK(confirm_error_lock);
+DEFINE_RAW_SPINLOCK(confirm_error_lock);
 
 /* Buffer for reporting pci register dumps. Its here in BSS, and
  * not dynamically alloced, so that it ends up in RMO where RTAS
@@ -325,7 +325,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 	 * in one slot might report errors simultaneously, and we
 	 * only want one error recovery routine running.
 	 */
-	raw_spin_lock_irqsave(&confirm_error_lock, flags);
+	eeh_serialize_lock(&flags);
 	rc = 1;
 	if (pe->state & EEH_PE_ISOLATED) {
 		pe->check_count++;
@@ -374,7 +374,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 	 * bridges.
 	 */
 	eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
-	raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
+	eeh_serialize_unlock(flags);
 
 	eeh_send_failure_event(pe);
 
@@ -386,7 +386,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 	return 1;
 
 dn_unlock:
-	raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
+	eeh_serialize_unlock(flags);
 	return rc;
 }
 
@@ -702,8 +702,6 @@ int __init eeh_init(void)
 		return ret;
 	}
 
-	raw_spin_lock_init(&confirm_error_lock);
-
 	/* Initialize EEH event */
 	ret = eeh_event_init();
 	if (ret)
-- 
cgit v0.10.2


From 8a6b1bc70dbb538cb8a39e8c5be9c3dfd7b1f40e Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:21:04 +0800
Subject: powerpc/eeh: EEH core to handle special event

On PowerNV platform, the EEH event caused by interrupt won't have
binding PE. The patch enables EEH core to handle the special event.
To avoid the current logic we have, The eeh_handle_event() is renamed
to eeh_handle_normal_event(), and the eeh_handle_special_event() is
introduced. The function eeh_handle_event() dispatches to above two
functions according to the input parameter. Besides, new backend
"next_error" added to eeh_ops and it's expected to have following
return values:

        4 - Dead IOC           3 - Dead PHB
        2 - Fenced PHB         1 - Frozen PE
        0 - No error found

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 0c0ac93..a0b11fb 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -53,6 +53,7 @@ struct device_node;
 
 #define EEH_PE_ISOLATED		(1 << 0)	/* Isolated PE		*/
 #define EEH_PE_RECOVERING	(1 << 1)	/* Recovering PE	*/
+#define EEH_PE_PHB_DEAD		(1 << 2)	/* Dead PHB		*/
 
 struct eeh_pe {
 	int type;			/* PE type: PHB/Bus/Device	*/
@@ -145,6 +146,7 @@ struct eeh_ops {
 	int (*configure_bridge)(struct eeh_pe *pe);
 	int (*read_config)(struct device_node *dn, int where, int size, u32 *val);
 	int (*write_config)(struct device_node *dn, int where, int size, u32 val);
+	int (*next_error)(struct eeh_pe **pe);
 };
 
 extern struct eeh_ops *eeh_ops;
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 678bc6c..0974e13 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -399,24 +399,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
  */
 #define MAX_WAIT_FOR_RECOVERY 150
 
-/**
- * eeh_handle_event - Reset a PCI device after hard lockup.
- * @pe: EEH PE
- *
- * While PHB detects address or data parity errors on particular PCI
- * slot, the associated PE will be frozen. Besides, DMA's occurring
- * to wild addresses (which usually happen due to bugs in device
- * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
- * #PERR or other misc PCI-related errors also can trigger EEH errors.
- *
- * Recovery process consists of unplugging the device driver (which
- * generated hotplug events to userspace), then issuing a PCI #RST to
- * the device, then reconfiguring the PCI config space for all bridges
- * & devices under this slot, and then finally restarting the device
- * drivers (which cause a second set of hotplug events to go out to
- * userspace).
- */
-void eeh_handle_event(struct eeh_pe *pe)
+static void eeh_handle_normal_event(struct eeh_pe *pe)
 {
 	struct pci_bus *frozen_bus;
 	int rc = 0;
@@ -554,3 +537,112 @@ perm_error:
 	if (frozen_bus)
 		pcibios_remove_pci_devices(frozen_bus);
 }
+
+static void eeh_handle_special_event(void)
+{
+	struct eeh_pe *pe, *phb_pe;
+	struct pci_bus *bus;
+	struct pci_controller *hose, *tmp;
+	unsigned long flags;
+	int rc = 0;
+
+	/*
+	 * The return value from next_error() has been classified as follows.
+	 * It might be good to enumerate them. However, next_error() is only
+	 * supported by PowerNV platform for now. So it would be fine to use
+	 * integer directly:
+	 *
+	 * 4 - Dead IOC           3 - Dead PHB
+	 * 2 - Fenced PHB         1 - Frozen PE
+	 * 0 - No error found
+	 *
+	 */
+	rc = eeh_ops->next_error(&pe);
+	if (rc <= 0)
+		return;
+
+	switch (rc) {
+	case 4:
+		/* Mark all PHBs in dead state */
+		eeh_serialize_lock(&flags);
+		list_for_each_entry_safe(hose, tmp,
+				&hose_list, list_node) {
+			phb_pe = eeh_phb_pe_get(hose);
+			if (!phb_pe) continue;
+
+			eeh_pe_state_mark(phb_pe,
+				EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+		}
+		eeh_serialize_unlock(flags);
+
+		/* Purge all events */
+		eeh_remove_event(NULL);
+		break;
+	case 3:
+	case 2:
+	case 1:
+		/* Mark the PE in fenced state */
+		eeh_serialize_lock(&flags);
+		if (rc == 3)
+			eeh_pe_state_mark(pe,
+				EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+		else
+			eeh_pe_state_mark(pe,
+				EEH_PE_ISOLATED | EEH_PE_RECOVERING);
+		eeh_serialize_unlock(flags);
+
+		/* Purge all events of the PHB */
+		eeh_remove_event(pe);
+		break;
+	default:
+		pr_err("%s: Invalid value %d from next_error()\n",
+		       __func__, rc);
+		return;
+	}
+
+	/*
+	 * For fenced PHB and frozen PE, it's handled as normal
+	 * event. We have to remove the affected PHBs for dead
+	 * PHB and IOC
+	 */
+	if (rc == 2 || rc == 1)
+		eeh_handle_normal_event(pe);
+	else {
+		list_for_each_entry_safe(hose, tmp,
+			&hose_list, list_node) {
+			phb_pe = eeh_phb_pe_get(hose);
+			if (!phb_pe || !(phb_pe->state & EEH_PE_PHB_DEAD))
+				continue;
+
+			bus = eeh_pe_bus_get(phb_pe);
+			/* Notify all devices that they're about to go down. */
+			eeh_pe_dev_traverse(pe, eeh_report_failure, NULL);
+			pcibios_remove_pci_devices(bus);
+		}
+	}
+}
+
+/**
+ * eeh_handle_event - Reset a PCI device after hard lockup.
+ * @pe: EEH PE
+ *
+ * While PHB detects address or data parity errors on particular PCI
+ * slot, the associated PE will be frozen. Besides, DMA's occurring
+ * to wild addresses (which usually happen due to bugs in device
+ * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
+ * #PERR or other misc PCI-related errors also can trigger EEH errors.
+ *
+ * Recovery process consists of unplugging the device driver (which
+ * generated hotplug events to userspace), then issuing a PCI #RST to
+ * the device, then reconfiguring the PCI config space for all bridges
+ * & devices under this slot, and then finally restarting the device
+ * drivers (which cause a second set of hotplug events to go out to
+ * userspace).
+ */
+void eeh_handle_event(struct eeh_pe *pe)
+{
+	if (pe)
+		eeh_handle_normal_event(pe);
+	else
+		eeh_handle_special_event();
+}
-- 
cgit v0.10.2


From 23773230c823cf79415a436aa26009025008fef5 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:21:05 +0800
Subject: powerpc/eeh: Sync OPAL API with firmware

The patch synchronizes OPAL APIs between kernel and firmware. Also,
we starts to replace opal_pci_get_phb_diag_data() with the similar
opal_pci_get_phb_diag_data2() and the former OPAL API would return
OPAL_UNSUPPORTED from now on.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index cbb9305..2880797 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -117,7 +117,13 @@ extern int opal_enter_rtas(struct rtas_args *args,
 #define OPAL_SET_SLOT_LED_STATUS		55
 #define OPAL_GET_EPOW_STATUS			56
 #define OPAL_SET_SYSTEM_ATTENTION_LED		57
+#define OPAL_RESERVED1				58
+#define OPAL_RESERVED2				59
+#define OPAL_PCI_NEXT_ERROR			60
+#define OPAL_PCI_EEH_FREEZE_STATUS2		61
+#define OPAL_PCI_POLL				62
 #define OPAL_PCI_MSI_EOI			63
+#define OPAL_PCI_GET_PHB_DIAG_DATA2		64
 
 #ifndef __ASSEMBLY__
 
@@ -125,6 +131,7 @@ extern int opal_enter_rtas(struct rtas_args *args,
 enum OpalVendorApiTokens {
 	OPAL_START_VENDOR_API_RANGE = 1000, OPAL_END_VENDOR_API_RANGE = 1999
 };
+
 enum OpalFreezeState {
 	OPAL_EEH_STOPPED_NOT_FROZEN = 0,
 	OPAL_EEH_STOPPED_MMIO_FREEZE = 1,
@@ -134,55 +141,69 @@ enum OpalFreezeState {
 	OPAL_EEH_STOPPED_TEMP_UNAVAIL = 5,
 	OPAL_EEH_STOPPED_PERM_UNAVAIL = 6
 };
+
 enum OpalEehFreezeActionToken {
 	OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO = 1,
 	OPAL_EEH_ACTION_CLEAR_FREEZE_DMA = 2,
 	OPAL_EEH_ACTION_CLEAR_FREEZE_ALL = 3
 };
+
 enum OpalPciStatusToken {
-	OPAL_EEH_PHB_NO_ERROR = 0,
-	OPAL_EEH_PHB_FATAL = 1,
-	OPAL_EEH_PHB_RECOVERABLE = 2,
-	OPAL_EEH_PHB_BUS_ERROR = 3,
-	OPAL_EEH_PCI_NO_DEVSEL = 4,
-	OPAL_EEH_PCI_TA = 5,
-	OPAL_EEH_PCIEX_UR = 6,
-	OPAL_EEH_PCIEX_CA = 7,
-	OPAL_EEH_PCI_MMIO_ERROR = 8,
-	OPAL_EEH_PCI_DMA_ERROR = 9
+	OPAL_EEH_NO_ERROR	= 0,
+	OPAL_EEH_IOC_ERROR	= 1,
+	OPAL_EEH_PHB_ERROR	= 2,
+	OPAL_EEH_PE_ERROR	= 3,
+	OPAL_EEH_PE_MMIO_ERROR	= 4,
+	OPAL_EEH_PE_DMA_ERROR	= 5
 };
+
+enum OpalPciErrorSeverity {
+	OPAL_EEH_SEV_NO_ERROR	= 0,
+	OPAL_EEH_SEV_IOC_DEAD	= 1,
+	OPAL_EEH_SEV_PHB_DEAD	= 2,
+	OPAL_EEH_SEV_PHB_FENCED	= 3,
+	OPAL_EEH_SEV_PE_ER	= 4,
+	OPAL_EEH_SEV_INF	= 5
+};
+
 enum OpalShpcAction {
 	OPAL_SHPC_GET_LINK_STATE = 0,
 	OPAL_SHPC_GET_SLOT_STATE = 1
 };
+
 enum OpalShpcLinkState {
 	OPAL_SHPC_LINK_DOWN = 0,
 	OPAL_SHPC_LINK_UP = 1
 };
+
 enum OpalMmioWindowType {
 	OPAL_M32_WINDOW_TYPE = 1,
 	OPAL_M64_WINDOW_TYPE = 2,
 	OPAL_IO_WINDOW_TYPE = 3
 };
+
 enum OpalShpcSlotState {
 	OPAL_SHPC_DEV_NOT_PRESENT = 0,
 	OPAL_SHPC_DEV_PRESENT = 1
 };
+
 enum OpalExceptionHandler {
 	OPAL_MACHINE_CHECK_HANDLER = 1,
 	OPAL_HYPERVISOR_MAINTENANCE_HANDLER = 2,
 	OPAL_SOFTPATCH_HANDLER = 3
 };
+
 enum OpalPendingState {
-	OPAL_EVENT_OPAL_INTERNAL = 0x1,
-	OPAL_EVENT_NVRAM = 0x2,
-	OPAL_EVENT_RTC = 0x4,
-	OPAL_EVENT_CONSOLE_OUTPUT = 0x8,
-	OPAL_EVENT_CONSOLE_INPUT = 0x10,
-	OPAL_EVENT_ERROR_LOG_AVAIL = 0x20,
-	OPAL_EVENT_ERROR_LOG = 0x40,
-	OPAL_EVENT_EPOW = 0x80,
-	OPAL_EVENT_LED_STATUS = 0x100
+	OPAL_EVENT_OPAL_INTERNAL	= 0x1,
+	OPAL_EVENT_NVRAM		= 0x2,
+	OPAL_EVENT_RTC			= 0x4,
+	OPAL_EVENT_CONSOLE_OUTPUT	= 0x8,
+	OPAL_EVENT_CONSOLE_INPUT	= 0x10,
+	OPAL_EVENT_ERROR_LOG_AVAIL	= 0x20,
+	OPAL_EVENT_ERROR_LOG		= 0x40,
+	OPAL_EVENT_EPOW			= 0x80,
+	OPAL_EVENT_LED_STATUS		= 0x100,
+	OPAL_EVENT_PCI_ERROR		= 0x200
 };
 
 /* Machine check related definitions */
@@ -364,15 +385,80 @@ struct opal_machine_check_event {
 	} u;
 };
 
+enum {
+	OPAL_P7IOC_DIAG_TYPE_NONE	= 0,
+	OPAL_P7IOC_DIAG_TYPE_RGC	= 1,
+	OPAL_P7IOC_DIAG_TYPE_BI		= 2,
+	OPAL_P7IOC_DIAG_TYPE_CI		= 3,
+	OPAL_P7IOC_DIAG_TYPE_MISC	= 4,
+	OPAL_P7IOC_DIAG_TYPE_I2C	= 5,
+	OPAL_P7IOC_DIAG_TYPE_LAST	= 6
+};
+
+struct OpalIoP7IOCErrorData {
+	uint16_t type;
+
+	/* GEM */
+	uint64_t gemXfir;
+	uint64_t gemRfir;
+	uint64_t gemRirqfir;
+	uint64_t gemMask;
+	uint64_t gemRwof;
+
+	/* LEM */
+	uint64_t lemFir;
+	uint64_t lemErrMask;
+	uint64_t lemAction0;
+	uint64_t lemAction1;
+	uint64_t lemWof;
+
+	union {
+		struct OpalIoP7IOCRgcErrorData {
+			uint64_t rgcStatus;		/* 3E1C10 */
+			uint64_t rgcLdcp;		/* 3E1C18 */
+		}rgc;
+		struct OpalIoP7IOCBiErrorData {
+			uint64_t biLdcp0;		/* 3C0100, 3C0118 */
+			uint64_t biLdcp1;		/* 3C0108, 3C0120 */
+			uint64_t biLdcp2;		/* 3C0110, 3C0128 */
+			uint64_t biFenceStatus;		/* 3C0130, 3C0130 */
+
+			uint8_t  biDownbound;		/* BI Downbound or Upbound */
+		}bi;
+		struct OpalIoP7IOCCiErrorData {
+			uint64_t ciPortStatus;		/* 3Dn008 */
+			uint64_t ciPortLdcp;		/* 3Dn010 */
+
+			uint8_t	 ciPort;		/* Index of CI port: 0/1 */
+		}ci;
+	};
+};
+
 /**
  * This structure defines the overlay which will be used to store PHB error
  * data upon request.
  */
 enum {
+	OPAL_PHB_ERROR_DATA_VERSION_1 = 1,
+};
+
+enum {
+	OPAL_PHB_ERROR_DATA_TYPE_P7IOC = 1,
+};
+
+enum {
 	OPAL_P7IOC_NUM_PEST_REGS = 128,
 };
 
+struct OpalIoPhbErrorCommon {
+	uint32_t version;
+	uint32_t ioType;
+	uint32_t len;
+};
+
 struct OpalIoP7IOCPhbErrorData {
+	struct OpalIoPhbErrorCommon common;
+
 	uint32_t brdgCtl;
 
 	// P7IOC utl regs
@@ -530,14 +616,21 @@ int64_t opal_pci_map_pe_dma_window_real(uint64_t phb_id, uint16_t pe_number,
 					uint64_t pci_mem_size);
 int64_t opal_pci_reset(uint64_t phb_id, uint8_t reset_scope, uint8_t assert_state);
 
-int64_t opal_pci_get_hub_diag_data(uint64_t hub_id, void *diag_buffer, uint64_t diag_buffer_len);
-int64_t opal_pci_get_phb_diag_data(uint64_t phb_id, void *diag_buffer, uint64_t diag_buffer_len);
+int64_t opal_pci_get_hub_diag_data(uint64_t hub_id, void *diag_buffer,
+				   uint64_t diag_buffer_len);
+int64_t opal_pci_get_phb_diag_data(uint64_t phb_id, void *diag_buffer,
+				   uint64_t diag_buffer_len);
+int64_t opal_pci_get_phb_diag_data2(uint64_t phb_id, void *diag_buffer,
+				    uint64_t diag_buffer_len);
 int64_t opal_pci_fence_phb(uint64_t phb_id);
 int64_t opal_pci_reinit(uint64_t phb_id, uint8_t reinit_scope);
 int64_t opal_pci_mask_pe_error(uint64_t phb_id, uint16_t pe_number, uint8_t error_type, uint8_t mask_action);
 int64_t opal_set_slot_led_status(uint64_t phb_id, uint64_t slot_id, uint8_t led_type, uint8_t led_action);
 int64_t opal_get_epow_status(uint64_t *status);
 int64_t opal_set_system_attention_led(uint8_t led_action);
+int64_t opal_pci_next_error(uint64_t phb_id, uint64_t *first_frozen_pe,
+			    uint16_t *pci_error_type, uint16_t *severity);
+int64_t opal_pci_poll(uint64_t phb_id);
 
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data);
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 6fabe92..e88863f 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -107,4 +107,7 @@ OPAL_CALL(opal_pci_mask_pe_error,		OPAL_PCI_MASK_PE_ERROR);
 OPAL_CALL(opal_set_slot_led_status,		OPAL_SET_SLOT_LED_STATUS);
 OPAL_CALL(opal_get_epow_status,			OPAL_GET_EPOW_STATUS);
 OPAL_CALL(opal_set_system_attention_led,	OPAL_SET_SYSTEM_ATTENTION_LED);
+OPAL_CALL(opal_pci_next_error,			OPAL_PCI_NEXT_ERROR);
+OPAL_CALL(opal_pci_poll,			OPAL_PCI_POLL);
 OPAL_CALL(opal_pci_msi_eoi,			OPAL_PCI_MSI_EOI);
+OPAL_CALL(opal_pci_get_phb_diag_data2,		OPAL_PCI_GET_PHB_DIAG_DATA2);
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index e16b729..5edceb7 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -203,7 +203,8 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
 
 	spin_lock_irqsave(&phb->lock, flags);
 
-	rc = opal_pci_get_phb_diag_data(phb->opal_id, phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE);
+	rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
+					 PNV_PCI_DIAG_BUF_SIZE);
 	has_diag = (rc == OPAL_SUCCESS);
 
 	rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
-- 
cgit v0.10.2


From 8747f36324bbe7f762bd9744d2dd20ebda021547 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:21:06 +0800
Subject: powerpc/eeh: EEH backend for P7IOC

For EEH on PowerNV platform, the overall architecture is different
from that on pSeries platform. In order to support multiple I/O chips
in future, we split EEH to 3 layers for PowerNV platform: EEH core,
platform layer, I/O layer. It would give EEH implementation on PowerNV
platform much more flexibility in future.

The patch adds the EEH backend for P7IOC.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index bcc3cb4..09bd0cb 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -3,3 +3,4 @@ obj-y			+= opal-rtc.o opal-nvram.o
 
 obj-$(CONFIG_SMP)	+= smp.o
 obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o pci-ioda.o
+obj-$(CONFIG_EEH)	+= eeh-ioda.o
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
new file mode 100644
index 0000000..f12e888
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -0,0 +1,45 @@
+/*
+ * The file intends to implement the functions needed by EEH, which is
+ * built on IODA compliant chip. Actually, lots of functions related
+ * to EEH would be built based on the OPAL APIs.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/bootmem.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/msi.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/io.h>
+#include <asm/iommu.h>
+#include <asm/msi_bitmap.h>
+#include <asm/opal.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/tce.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+struct pnv_eeh_ops ioda_eeh_ops = {
+	.post_init		= NULL,
+	.set_option		= NULL,
+	.get_state		= NULL,
+	.reset			= NULL,
+	.get_log		= NULL,
+	.configure_bridge	= NULL,
+	.next_error		= NULL
+};
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 25d76c4..336c9dc 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -66,15 +66,35 @@ struct pnv_ioda_pe {
 	struct list_head	list;
 };
 
+/* IOC dependent EEH operations */
+#ifdef CONFIG_EEH
+struct pnv_eeh_ops {
+	int (*post_init)(struct pci_controller *hose);
+	int (*set_option)(struct eeh_pe *pe, int option);
+	int (*get_state)(struct eeh_pe *pe);
+	int (*reset)(struct eeh_pe *pe, int option);
+	int (*get_log)(struct eeh_pe *pe, int severity,
+		       char *drv_log, unsigned long len);
+	int (*configure_bridge)(struct eeh_pe *pe);
+	int (*next_error)(struct eeh_pe **pe);
+};
+#endif /* CONFIG_EEH */
+
 struct pnv_phb {
 	struct pci_controller	*hose;
 	enum pnv_phb_type	type;
 	enum pnv_phb_model	model;
+	u64			hub_id;
 	u64			opal_id;
 	void __iomem		*regs;
 	int			initialized;
 	spinlock_t		lock;
 
+#ifdef CONFIG_EEH
+	struct pnv_eeh_ops	*eeh_ops;
+	int			eeh_enabled;
+#endif
+
 #ifdef CONFIG_PCI_MSI
 	unsigned int		msi_base;
 	unsigned int		msi32_support;
@@ -150,6 +170,9 @@ struct pnv_phb {
 };
 
 extern struct pci_ops pnv_pci_ops;
+#ifdef CONFIG_EEH
+extern struct pnv_eeh_ops ioda_eeh_ops;
+#endif
 
 extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
 				      void *tce_mem, u64 tce_size,
-- 
cgit v0.10.2


From 73370c662b4e453185201b44b775a49e95870009 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:21:07 +0800
Subject: powerpc/eeh: I/O chip post initialization

The post initialization (struct eeh_ops::post_init) is called after
the EEH probe is done. On the other hand, the EEH core post
initialization is designed to call platform and then I/O chip backend
on PowerNV platform.

The patch adds the backend for I/O chip to notify the platform
that the specific PHB is ready to supply EEH service.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index f12e888..7b27241 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -34,8 +34,27 @@
 #include "powernv.h"
 #include "pci.h"
 
+/**
+ * ioda_eeh_post_init - Chip dependent post initialization
+ * @hose: PCI controller
+ *
+ * The function will be called after eeh PEs and devices
+ * have been built. That means the EEH is ready to supply
+ * service with I/O cache.
+ */
+static int ioda_eeh_post_init(struct pci_controller *hose)
+{
+	struct pnv_phb *phb = hose->private_data;
+
+	/* FIXME: Enable it for PHB3 later */
+	if (phb->type == PNV_PHB_IODA1)
+		phb->eeh_enabled = 1;
+
+	return 0;
+}
+
 struct pnv_eeh_ops ioda_eeh_ops = {
-	.post_init		= NULL,
+	.post_init		= ioda_eeh_post_init,
 	.set_option		= NULL,
 	.get_state		= NULL,
 	.reset			= NULL,
-- 
cgit v0.10.2


From eb0059836baa14c58ebad030684846213aaece89 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:21:08 +0800
Subject: powerpc/eeh: I/O chip EEH enable option

The patch adds the backend to enable or disable EEH functionality
for the specified PE. The backend is also used to enable MMIO or
DMA path for the problematic PE. It's notable that all PEs on
PowerNV platform support EEH functionality by default, and we
disallow to disable EEH for the specific PE.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index 7b27241..744eb9e 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -53,9 +53,72 @@ static int ioda_eeh_post_init(struct pci_controller *hose)
 	return 0;
 }
 
+/**
+ * ioda_eeh_set_option - Set EEH operation or I/O setting
+ * @pe: EEH PE
+ * @option: options
+ *
+ * Enable or disable EEH option for the indicated PE. The
+ * function also can be used to enable I/O or DMA for the
+ * PE.
+ */
+static int ioda_eeh_set_option(struct eeh_pe *pe, int option)
+{
+	s64 ret;
+	u32 pe_no;
+	struct pci_controller *hose = pe->phb;
+	struct pnv_phb *phb = hose->private_data;
+
+	/* Check on PE number */
+	if (pe->addr < 0 || pe->addr >= phb->ioda.total_pe) {
+		pr_err("%s: PE address %x out of range [0, %x] "
+		       "on PHB#%x\n",
+			__func__, pe->addr, phb->ioda.total_pe,
+			hose->global_number);
+		return -EINVAL;
+	}
+
+	pe_no = pe->addr;
+	switch (option) {
+	case EEH_OPT_DISABLE:
+		ret = -EEXIST;
+		break;
+	case EEH_OPT_ENABLE:
+		ret = 0;
+		break;
+	case EEH_OPT_THAW_MMIO:
+		ret = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
+				OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO);
+		if (ret) {
+			pr_warning("%s: Failed to enable MMIO for "
+				   "PHB#%x-PE#%x, err=%lld\n",
+				__func__, hose->global_number, pe_no, ret);
+			return -EIO;
+		}
+
+		break;
+	case EEH_OPT_THAW_DMA:
+		ret = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
+				OPAL_EEH_ACTION_CLEAR_FREEZE_DMA);
+		if (ret) {
+			pr_warning("%s: Failed to enable DMA for "
+				   "PHB#%x-PE#%x, err=%lld\n",
+				__func__, hose->global_number, pe_no, ret);
+			return -EIO;
+		}
+
+		break;
+	default:
+		pr_warning("%s: Invalid option %d\n", __func__, option);
+		return -EINVAL;
+	}
+
+	return ret;
+}
+
 struct pnv_eeh_ops ioda_eeh_ops = {
 	.post_init		= ioda_eeh_post_init,
-	.set_option		= NULL,
+	.set_option		= ioda_eeh_set_option,
 	.get_state		= NULL,
 	.reset			= NULL,
 	.get_log		= NULL,
-- 
cgit v0.10.2


From 8c41a7f3f7593fe57578f3f649232a5842d4c65d Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:21:09 +0800
Subject: powerpc/eeh: I/O chip EEH state retrieval

The patch adds I/O chip backend to retrieve the state for the
indicated PE. While the PE state is temperarily unavailable,
the upper layer (powernv platform) should return default delay
(1 second).

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index 744eb9e..a76870b 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -116,10 +116,107 @@ static int ioda_eeh_set_option(struct eeh_pe *pe, int option)
 	return ret;
 }
 
+/**
+ * ioda_eeh_get_state - Retrieve the state of PE
+ * @pe: EEH PE
+ *
+ * The PE's state should be retrieved from the PEEV, PEST
+ * IODA tables. Since the OPAL has exported the function
+ * to do it, it'd better to use that.
+ */
+static int ioda_eeh_get_state(struct eeh_pe *pe)
+{
+	s64 ret = 0;
+	u8 fstate;
+	u16 pcierr;
+	u32 pe_no;
+	int result;
+	struct pci_controller *hose = pe->phb;
+	struct pnv_phb *phb = hose->private_data;
+
+	/*
+	 * Sanity check on PE address. The PHB PE address should
+	 * be zero.
+	 */
+	if (pe->addr < 0 || pe->addr >= phb->ioda.total_pe) {
+		pr_err("%s: PE address %x out of range [0, %x] "
+		       "on PHB#%x\n",
+		       __func__, pe->addr, phb->ioda.total_pe,
+		       hose->global_number);
+		return EEH_STATE_NOT_SUPPORT;
+	}
+
+	/* Retrieve PE status through OPAL */
+	pe_no = pe->addr;
+	ret = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
+			&fstate, &pcierr, NULL);
+	if (ret) {
+		pr_err("%s: Failed to get EEH status on "
+		       "PHB#%x-PE#%x\n, err=%lld\n",
+		       __func__, hose->global_number, pe_no, ret);
+		return EEH_STATE_NOT_SUPPORT;
+	}
+
+	/* Check PHB status */
+	if (pe->type & EEH_PE_PHB) {
+		result = 0;
+		result &= ~EEH_STATE_RESET_ACTIVE;
+
+		if (pcierr != OPAL_EEH_PHB_ERROR) {
+			result |= EEH_STATE_MMIO_ACTIVE;
+			result |= EEH_STATE_DMA_ACTIVE;
+			result |= EEH_STATE_MMIO_ENABLED;
+			result |= EEH_STATE_DMA_ENABLED;
+		}
+
+		return result;
+	}
+
+	/* Parse result out */
+	result = 0;
+	switch (fstate) {
+	case OPAL_EEH_STOPPED_NOT_FROZEN:
+		result &= ~EEH_STATE_RESET_ACTIVE;
+		result |= EEH_STATE_MMIO_ACTIVE;
+		result |= EEH_STATE_DMA_ACTIVE;
+		result |= EEH_STATE_MMIO_ENABLED;
+		result |= EEH_STATE_DMA_ENABLED;
+		break;
+	case OPAL_EEH_STOPPED_MMIO_FREEZE:
+		result &= ~EEH_STATE_RESET_ACTIVE;
+		result |= EEH_STATE_DMA_ACTIVE;
+		result |= EEH_STATE_DMA_ENABLED;
+		break;
+	case OPAL_EEH_STOPPED_DMA_FREEZE:
+		result &= ~EEH_STATE_RESET_ACTIVE;
+		result |= EEH_STATE_MMIO_ACTIVE;
+		result |= EEH_STATE_MMIO_ENABLED;
+		break;
+	case OPAL_EEH_STOPPED_MMIO_DMA_FREEZE:
+		result &= ~EEH_STATE_RESET_ACTIVE;
+		break;
+	case OPAL_EEH_STOPPED_RESET:
+		result |= EEH_STATE_RESET_ACTIVE;
+		break;
+	case OPAL_EEH_STOPPED_TEMP_UNAVAIL:
+		result |= EEH_STATE_UNAVAILABLE;
+		break;
+	case OPAL_EEH_STOPPED_PERM_UNAVAIL:
+		result |= EEH_STATE_NOT_SUPPORT;
+		break;
+	default:
+		pr_warning("%s: Unexpected EEH status 0x%x "
+			   "on PHB#%x-PE#%x\n",
+			   __func__, fstate, hose->global_number, pe_no);
+	}
+
+	return result;
+}
+
 struct pnv_eeh_ops ioda_eeh_ops = {
 	.post_init		= ioda_eeh_post_init,
 	.set_option		= ioda_eeh_set_option,
-	.get_state		= NULL,
+	.get_state		= ioda_eeh_get_state,
 	.reset			= NULL,
 	.get_log		= NULL,
 	.configure_bridge	= NULL,
-- 
cgit v0.10.2


From 9d5cab00108f42a5ab51cfb1fcb03130679a5762 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:21:10 +0800
Subject: powerpc/eeh: I/O chip PE reset

The patch adds the I/O chip backend to do PE reset. For now, we
focus on PCI bus dependent PE. If PHB PE has been put into error
state, the PHB will take complete reset. Besides, the root bridge
will take fundamental or hot reset accordingly if the indicated
PE locates at the toppest of PCI hierarchy tree. Otherwise, the
upstream p2p bridge will take hot reset.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index a76870b..ea5fa05 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -213,11 +213,243 @@ static int ioda_eeh_get_state(struct eeh_pe *pe)
 	return result;
 }
 
+static int ioda_eeh_pe_clear(struct eeh_pe *pe)
+{
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+	u32 pe_no;
+	u8 fstate;
+	u16 pcierr;
+	s64 ret;
+
+	pe_no = pe->addr;
+	hose = pe->phb;
+	phb = pe->phb->private_data;
+
+	/* Clear the EEH error on the PE */
+	ret = opal_pci_eeh_freeze_clear(phb->opal_id,
+			pe_no, OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+	if (ret) {
+		pr_err("%s: Failed to clear EEH error for "
+		       "PHB#%x-PE#%x, err=%lld\n",
+		       __func__, hose->global_number, pe_no, ret);
+		return -EIO;
+	}
+
+	/*
+	 * Read the PE state back and verify that the frozen
+	 * state has been removed.
+	 */
+	ret = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
+			&fstate, &pcierr, NULL);
+	if (ret) {
+		pr_err("%s: Failed to get EEH status on "
+		       "PHB#%x-PE#%x\n, err=%lld\n",
+		       __func__, hose->global_number, pe_no, ret);
+		return -EIO;
+	}
+
+	if (fstate != OPAL_EEH_STOPPED_NOT_FROZEN) {
+		pr_err("%s: Frozen state not cleared on "
+		       "PHB#%x-PE#%x, sts=%x\n",
+		       __func__, hose->global_number, pe_no, fstate);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static s64 ioda_eeh_phb_poll(struct pnv_phb *phb)
+{
+	s64 rc = OPAL_HARDWARE;
+
+	while (1) {
+		rc = opal_pci_poll(phb->opal_id);
+		if (rc <= 0)
+			break;
+
+		msleep(rc);
+	}
+
+	return rc;
+}
+
+static int ioda_eeh_phb_reset(struct pci_controller *hose, int option)
+{
+	struct pnv_phb *phb = hose->private_data;
+	s64 rc = OPAL_HARDWARE;
+
+	pr_debug("%s: Reset PHB#%x, option=%d\n",
+		 __func__, hose->global_number, option);
+
+	/* Issue PHB complete reset request */
+	if (option == EEH_RESET_FUNDAMENTAL ||
+	    option == EEH_RESET_HOT)
+		rc = opal_pci_reset(phb->opal_id,
+				OPAL_PHB_COMPLETE,
+				OPAL_ASSERT_RESET);
+	else if (option == EEH_RESET_DEACTIVATE)
+		rc = opal_pci_reset(phb->opal_id,
+				OPAL_PHB_COMPLETE,
+				OPAL_DEASSERT_RESET);
+	if (rc < 0)
+		goto out;
+
+	/*
+	 * Poll state of the PHB until the request is done
+	 * successfully.
+	 */
+	rc = ioda_eeh_phb_poll(phb);
+out:
+	if (rc != OPAL_SUCCESS)
+		return -EIO;
+
+	return 0;
+}
+
+static int ioda_eeh_root_reset(struct pci_controller *hose, int option)
+{
+	struct pnv_phb *phb = hose->private_data;
+	s64 rc = OPAL_SUCCESS;
+
+	pr_debug("%s: Reset PHB#%x, option=%d\n",
+		 __func__, hose->global_number, option);
+
+	/*
+	 * During the reset deassert time, we needn't care
+	 * the reset scope because the firmware does nothing
+	 * for fundamental or hot reset during deassert phase.
+	 */
+	if (option == EEH_RESET_FUNDAMENTAL)
+		rc = opal_pci_reset(phb->opal_id,
+				OPAL_PCI_FUNDAMENTAL_RESET,
+				OPAL_ASSERT_RESET);
+	else if (option == EEH_RESET_HOT)
+		rc = opal_pci_reset(phb->opal_id,
+				OPAL_PCI_HOT_RESET,
+				OPAL_ASSERT_RESET);
+	else if (option == EEH_RESET_DEACTIVATE)
+		rc = opal_pci_reset(phb->opal_id,
+				OPAL_PCI_HOT_RESET,
+				OPAL_DEASSERT_RESET);
+	if (rc < 0)
+		goto out;
+
+	/* Poll state of the PHB until the request is done */
+	rc = ioda_eeh_phb_poll(phb);
+out:
+	if (rc != OPAL_SUCCESS)
+		return -EIO;
+
+	return 0;
+}
+
+static int ioda_eeh_bridge_reset(struct pci_controller *hose,
+		struct pci_dev *dev, int option)
+{
+	u16 ctrl;
+
+	pr_debug("%s: Reset device %04x:%02x:%02x.%01x with option %d\n",
+		 __func__, hose->global_number, dev->bus->number,
+		 PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn), option);
+
+	switch (option) {
+	case EEH_RESET_FUNDAMENTAL:
+	case EEH_RESET_HOT:
+		pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &ctrl);
+		ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
+		pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl);
+		break;
+	case EEH_RESET_DEACTIVATE:
+		pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &ctrl);
+		ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
+		pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl);
+		break;
+	}
+
+	return 0;
+}
+
+/**
+ * ioda_eeh_reset - Reset the indicated PE
+ * @pe: EEH PE
+ * @option: reset option
+ *
+ * Do reset on the indicated PE. For PCI bus sensitive PE,
+ * we need to reset the parent p2p bridge. The PHB has to
+ * be reinitialized if the p2p bridge is root bridge. For
+ * PCI device sensitive PE, we will try to reset the device
+ * through FLR. For now, we don't have OPAL APIs to do HARD
+ * reset yet, so all reset would be SOFT (HOT) reset.
+ */
+static int ioda_eeh_reset(struct eeh_pe *pe, int option)
+{
+	struct pci_controller *hose = pe->phb;
+	struct eeh_dev *edev;
+	struct pci_dev *dev;
+	int ret;
+
+	/*
+	 * Anyway, we have to clear the problematic state for the
+	 * corresponding PE. However, we needn't do it if the PE
+	 * is PHB associated. That means the PHB is having fatal
+	 * errors and it needs reset. Further more, the AIB interface
+	 * isn't reliable any more.
+	 */
+	if (!(pe->type & EEH_PE_PHB) &&
+	    (option == EEH_RESET_HOT ||
+	    option == EEH_RESET_FUNDAMENTAL)) {
+		ret = ioda_eeh_pe_clear(pe);
+		if (ret)
+			return -EIO;
+	}
+
+	/*
+	 * The rules applied to reset, either fundamental or hot reset:
+	 *
+	 * We always reset the direct upstream bridge of the PE. If the
+	 * direct upstream bridge isn't root bridge, we always take hot
+	 * reset no matter what option (fundamental or hot) is. Otherwise,
+	 * we should do the reset according to the required option.
+	 */
+	if (pe->type & EEH_PE_PHB) {
+		ret = ioda_eeh_phb_reset(hose, option);
+	} else {
+		if (pe->type & EEH_PE_DEVICE) {
+			/*
+			 * If it's device PE, we didn't refer to the parent
+			 * PCI bus yet. So we have to figure it out indirectly.
+			 */
+			edev = list_first_entry(&pe->edevs,
+					struct eeh_dev, list);
+			dev = eeh_dev_to_pci_dev(edev);
+			dev = dev->bus->self;
+		} else {
+			/*
+			 * If it's bus PE, the parent PCI bus is already there
+			 * and just pick it up.
+			 */
+			dev = pe->bus->self;
+		}
+
+		/*
+		 * Do reset based on the fact that the direct upstream bridge
+		 * is root bridge (port) or not.
+		 */
+		if (dev->bus->number == 0)
+			ret = ioda_eeh_root_reset(hose, option);
+		else
+			ret = ioda_eeh_bridge_reset(hose, dev, option);
+	}
+
+	return ret;
+}
+
 struct pnv_eeh_ops ioda_eeh_ops = {
 	.post_init		= ioda_eeh_post_init,
 	.set_option		= ioda_eeh_set_option,
 	.get_state		= ioda_eeh_get_state,
-	.reset			= NULL,
+	.reset			= ioda_eeh_reset,
 	.get_log		= NULL,
 	.configure_bridge	= NULL,
 	.next_error		= NULL
-- 
cgit v0.10.2


From bf90dfea2397fe136ce35bc896c3bc84133272c6 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:21:11 +0800
Subject: powerpc/eeh: I/O chip PE log and bridge setup

The patch adds backends to retrieve error log and configure p2p
bridges for the indicated PE.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index ea5fa05..8d9c2d2 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -445,12 +445,65 @@ static int ioda_eeh_reset(struct eeh_pe *pe, int option)
 	return ret;
 }
 
+/**
+ * ioda_eeh_get_log - Retrieve error log
+ * @pe: EEH PE
+ * @severity: Severity level of the log
+ * @drv_log: buffer to store the log
+ * @len: space of the log buffer
+ *
+ * The function is used to retrieve error log from P7IOC.
+ */
+static int ioda_eeh_get_log(struct eeh_pe *pe, int severity,
+			    char *drv_log, unsigned long len)
+{
+	s64 ret;
+	unsigned long flags;
+	struct pci_controller *hose = pe->phb;
+	struct pnv_phb *phb = hose->private_data;
+
+	spin_lock_irqsave(&phb->lock, flags);
+
+	ret = opal_pci_get_phb_diag_data2(phb->opal_id,
+			phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE);
+	if (ret) {
+		spin_unlock_irqrestore(&phb->lock, flags);
+		pr_warning("%s: Failed to get log for PHB#%x-PE#%x\n",
+			   __func__, hose->global_number, pe->addr);
+		return -EIO;
+	}
+
+	/*
+	 * FIXME: We probably need log the error in somewhere.
+	 * Lets make it up in future.
+	 */
+	/* pr_info("%s", phb->diag.blob); */
+
+	spin_unlock_irqrestore(&phb->lock, flags);
+
+	return 0;
+}
+
+/**
+ * ioda_eeh_configure_bridge - Configure the PCI bridges for the indicated PE
+ * @pe: EEH PE
+ *
+ * For particular PE, it might have included PCI bridges. In order
+ * to make the PE work properly, those PCI bridges should be configured
+ * correctly. However, we need do nothing on P7IOC since the reset
+ * function will do everything that should be covered by the function.
+ */
+static int ioda_eeh_configure_bridge(struct eeh_pe *pe)
+{
+	return 0;
+}
+
 struct pnv_eeh_ops ioda_eeh_ops = {
 	.post_init		= ioda_eeh_post_init,
 	.set_option		= ioda_eeh_set_option,
 	.get_state		= ioda_eeh_get_state,
 	.reset			= ioda_eeh_reset,
-	.get_log		= NULL,
-	.configure_bridge	= NULL,
+	.get_log		= ioda_eeh_get_log,
+	.configure_bridge	= ioda_eeh_configure_bridge,
 	.next_error		= NULL
 };
-- 
cgit v0.10.2


From 70f942db4669c4417b7bb4f3353b3eddf1179aae Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:21:12 +0800
Subject: powerpc/eeh: I/O chip next error

The patch implements the backend for EEH core to retrieve next
EEH error to handle. For the informational errors, we won't bother
the EEH core. Otherwise, the EEH should take appropriate actions
depending on the return value:

	0 - No further errors detected
	1 - Frozen PE
	2 - Fenced PHB
	3 - Dead PHB
	4 - Dead IOC

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index 8d9c2d2..a3eebd1 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -34,6 +34,15 @@
 #include "powernv.h"
 #include "pci.h"
 
+/* Debugging option */
+#ifdef IODA_EEH_DBG_ON
+#define IODA_EEH_DBG(args...)	pr_info(args)
+#else
+#define IODA_EEH_DBG(args...)
+#endif
+
+static char *hub_diag = NULL;
+
 /**
  * ioda_eeh_post_init - Chip dependent post initialization
  * @hose: PCI controller
@@ -47,8 +56,19 @@ static int ioda_eeh_post_init(struct pci_controller *hose)
 	struct pnv_phb *phb = hose->private_data;
 
 	/* FIXME: Enable it for PHB3 later */
-	if (phb->type == PNV_PHB_IODA1)
+	if (phb->type == PNV_PHB_IODA1) {
+		if (!hub_diag) {
+			hub_diag = (char *)__get_free_page(GFP_KERNEL |
+							   __GFP_ZERO);
+			if (!hub_diag) {
+				pr_err("%s: Out of memory !\n",
+				       __func__);
+				return -ENOMEM;
+			}
+		}
+
 		phb->eeh_enabled = 1;
+	}
 
 	return 0;
 }
@@ -498,6 +518,316 @@ static int ioda_eeh_configure_bridge(struct eeh_pe *pe)
 	return 0;
 }
 
+static void ioda_eeh_hub_diag_common(struct OpalIoP7IOCErrorData *data)
+{
+	/* GEM */
+	pr_info("  GEM XFIR:        %016llx\n", data->gemXfir);
+	pr_info("  GEM RFIR:        %016llx\n", data->gemRfir);
+	pr_info("  GEM RIRQFIR:     %016llx\n", data->gemRirqfir);
+	pr_info("  GEM Mask:        %016llx\n", data->gemMask);
+	pr_info("  GEM RWOF:        %016llx\n", data->gemRwof);
+
+	/* LEM */
+	pr_info("  LEM FIR:         %016llx\n", data->lemFir);
+	pr_info("  LEM Error Mask:  %016llx\n", data->lemErrMask);
+	pr_info("  LEM Action 0:    %016llx\n", data->lemAction0);
+	pr_info("  LEM Action 1:    %016llx\n", data->lemAction1);
+	pr_info("  LEM WOF:         %016llx\n", data->lemWof);
+}
+
+static void ioda_eeh_hub_diag(struct pci_controller *hose)
+{
+	struct pnv_phb *phb = hose->private_data;
+	struct OpalIoP7IOCErrorData *data;
+	long rc;
+
+	data = (struct OpalIoP7IOCErrorData *)ioda_eeh_hub_diag;
+	rc = opal_pci_get_hub_diag_data(phb->hub_id, data, PAGE_SIZE);
+	if (rc != OPAL_SUCCESS) {
+		pr_warning("%s: Failed to get HUB#%llx diag-data (%ld)\n",
+			   __func__, phb->hub_id, rc);
+		return;
+	}
+
+	switch (data->type) {
+	case OPAL_P7IOC_DIAG_TYPE_RGC:
+		pr_info("P7IOC diag-data for RGC\n\n");
+		ioda_eeh_hub_diag_common(data);
+		pr_info("  RGC Status:      %016llx\n", data->rgc.rgcStatus);
+		pr_info("  RGC LDCP:        %016llx\n", data->rgc.rgcLdcp);
+		break;
+	case OPAL_P7IOC_DIAG_TYPE_BI:
+		pr_info("P7IOC diag-data for BI %s\n\n",
+			data->bi.biDownbound ? "Downbound" : "Upbound");
+		ioda_eeh_hub_diag_common(data);
+		pr_info("  BI LDCP 0:       %016llx\n", data->bi.biLdcp0);
+		pr_info("  BI LDCP 1:       %016llx\n", data->bi.biLdcp1);
+		pr_info("  BI LDCP 2:       %016llx\n", data->bi.biLdcp2);
+		pr_info("  BI Fence Status: %016llx\n", data->bi.biFenceStatus);
+		break;
+	case OPAL_P7IOC_DIAG_TYPE_CI:
+		pr_info("P7IOC diag-data for CI Port %d\\nn",
+			data->ci.ciPort);
+		ioda_eeh_hub_diag_common(data);
+		pr_info("  CI Port Status:  %016llx\n", data->ci.ciPortStatus);
+		pr_info("  CI Port LDCP:    %016llx\n", data->ci.ciPortLdcp);
+		break;
+	case OPAL_P7IOC_DIAG_TYPE_MISC:
+		pr_info("P7IOC diag-data for MISC\n\n");
+		ioda_eeh_hub_diag_common(data);
+		break;
+	case OPAL_P7IOC_DIAG_TYPE_I2C:
+		pr_info("P7IOC diag-data for I2C\n\n");
+		ioda_eeh_hub_diag_common(data);
+		break;
+	default:
+		pr_warning("%s: Invalid type of HUB#%llx diag-data (%d)\n",
+			   __func__, phb->hub_id, data->type);
+	}
+}
+
+static void ioda_eeh_p7ioc_phb_diag(struct pci_controller *hose,
+				    struct OpalIoPhbErrorCommon *common)
+{
+	struct OpalIoP7IOCPhbErrorData *data;
+	int i;
+
+	data = (struct OpalIoP7IOCPhbErrorData *)common;
+
+	pr_info("P7IOC PHB#%x Diag-data (Version: %d)\n\n",
+		hose->global_number, common->version);
+
+	pr_info("  brdgCtl:              %08x\n", data->brdgCtl);
+
+	pr_info("  portStatusReg:        %08x\n", data->portStatusReg);
+	pr_info("  rootCmplxStatus:      %08x\n", data->rootCmplxStatus);
+	pr_info("  busAgentStatus:       %08x\n", data->busAgentStatus);
+
+	pr_info("  deviceStatus:         %08x\n", data->deviceStatus);
+	pr_info("  slotStatus:           %08x\n", data->slotStatus);
+	pr_info("  linkStatus:           %08x\n", data->linkStatus);
+	pr_info("  devCmdStatus:         %08x\n", data->devCmdStatus);
+	pr_info("  devSecStatus:         %08x\n", data->devSecStatus);
+
+	pr_info("  rootErrorStatus:      %08x\n", data->rootErrorStatus);
+	pr_info("  uncorrErrorStatus:    %08x\n", data->uncorrErrorStatus);
+	pr_info("  corrErrorStatus:      %08x\n", data->corrErrorStatus);
+	pr_info("  tlpHdr1:              %08x\n", data->tlpHdr1);
+	pr_info("  tlpHdr2:              %08x\n", data->tlpHdr2);
+	pr_info("  tlpHdr3:              %08x\n", data->tlpHdr3);
+	pr_info("  tlpHdr4:              %08x\n", data->tlpHdr4);
+	pr_info("  sourceId:             %08x\n", data->sourceId);
+
+	pr_info("  errorClass:           %016llx\n", data->errorClass);
+	pr_info("  correlator:           %016llx\n", data->correlator);
+	pr_info("  p7iocPlssr:           %016llx\n", data->p7iocPlssr);
+	pr_info("  p7iocCsr:             %016llx\n", data->p7iocCsr);
+	pr_info("  lemFir:               %016llx\n", data->lemFir);
+	pr_info("  lemErrorMask:         %016llx\n", data->lemErrorMask);
+	pr_info("  lemWOF:               %016llx\n", data->lemWOF);
+	pr_info("  phbErrorStatus:       %016llx\n", data->phbErrorStatus);
+	pr_info("  phbFirstErrorStatus:  %016llx\n", data->phbFirstErrorStatus);
+	pr_info("  phbErrorLog0:         %016llx\n", data->phbErrorLog0);
+	pr_info("  phbErrorLog1:         %016llx\n", data->phbErrorLog1);
+	pr_info("  mmioErrorStatus:      %016llx\n", data->mmioErrorStatus);
+	pr_info("  mmioFirstErrorStatus: %016llx\n", data->mmioFirstErrorStatus);
+	pr_info("  mmioErrorLog0:        %016llx\n", data->mmioErrorLog0);
+	pr_info("  mmioErrorLog1:        %016llx\n", data->mmioErrorLog1);
+	pr_info("  dma0ErrorStatus:      %016llx\n", data->dma0ErrorStatus);
+	pr_info("  dma0FirstErrorStatus: %016llx\n", data->dma0FirstErrorStatus);
+	pr_info("  dma0ErrorLog0:        %016llx\n", data->dma0ErrorLog0);
+	pr_info("  dma0ErrorLog1:        %016llx\n", data->dma0ErrorLog1);
+	pr_info("  dma1ErrorStatus:      %016llx\n", data->dma1ErrorStatus);
+	pr_info("  dma1FirstErrorStatus: %016llx\n", data->dma1FirstErrorStatus);
+	pr_info("  dma1ErrorLog0:        %016llx\n", data->dma1ErrorLog0);
+	pr_info("  dma1ErrorLog1:        %016llx\n", data->dma1ErrorLog1);
+
+	for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) {
+		if ((data->pestA[i] >> 63) == 0 &&
+		    (data->pestB[i] >> 63) == 0)
+			continue;
+
+		pr_info("  PE[%3d] PESTA:        %016llx\n", i, data->pestA[i]);
+		pr_info("          PESTB:        %016llx\n", data->pestB[i]);
+	}
+}
+
+static void ioda_eeh_phb_diag(struct pci_controller *hose)
+{
+	struct pnv_phb *phb = hose->private_data;
+	struct OpalIoPhbErrorCommon *common;
+	long rc;
+
+	common = (struct OpalIoPhbErrorCommon *)phb->diag.blob;
+	rc = opal_pci_get_phb_diag_data2(phb->opal_id, common, PAGE_SIZE);
+	if (rc != OPAL_SUCCESS) {
+		pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n",
+			    __func__, hose->global_number, rc);
+		return;
+	}
+
+	switch (common->ioType) {
+	case OPAL_PHB_ERROR_DATA_TYPE_P7IOC:
+		ioda_eeh_p7ioc_phb_diag(hose, common);
+		break;
+	default:
+		pr_warning("%s: Unrecognized I/O chip %d\n",
+			   __func__, common->ioType);
+	}
+}
+
+static int ioda_eeh_get_phb_pe(struct pci_controller *hose,
+			       struct eeh_pe **pe)
+{
+	struct eeh_pe *phb_pe;
+
+	phb_pe = eeh_phb_pe_get(hose);
+	if (!phb_pe) {
+		pr_warning("%s Can't find PE for PHB#%d\n",
+			   __func__, hose->global_number);
+		return -EEXIST;
+	}
+
+	*pe = phb_pe;
+	return 0;
+}
+
+static int ioda_eeh_get_pe(struct pci_controller *hose,
+			   u16 pe_no, struct eeh_pe **pe)
+{
+	struct eeh_pe *phb_pe, *dev_pe;
+	struct eeh_dev dev;
+
+	/* Find the PHB PE */
+	if (ioda_eeh_get_phb_pe(hose, &phb_pe))
+		return -EEXIST;
+
+	/* Find the PE according to PE# */
+	memset(&dev, 0, sizeof(struct eeh_dev));
+	dev.phb = hose;
+	dev.pe_config_addr = pe_no;
+	dev_pe = eeh_pe_get(&dev);
+	if (!dev_pe) {
+		pr_warning("%s: Can't find PE for PHB#%x - PE#%x\n",
+			   __func__, hose->global_number, pe_no);
+		return -EEXIST;
+	}
+
+	*pe = dev_pe;
+	return 0;
+}
+
+/**
+ * ioda_eeh_next_error - Retrieve next error for EEH core to handle
+ * @pe: The affected PE
+ *
+ * The function is expected to be called by EEH core while it gets
+ * special EEH event (without binding PE). The function calls to
+ * OPAL APIs for next error to handle. The informational error is
+ * handled internally by platform. However, the dead IOC, dead PHB,
+ * fenced PHB and frozen PE should be handled by EEH core eventually.
+ */
+static int ioda_eeh_next_error(struct eeh_pe **pe)
+{
+	struct pci_controller *hose, *tmp;
+	struct pnv_phb *phb;
+	u64 frozen_pe_no;
+	u16 err_type, severity;
+	long rc;
+	int ret = 1;
+
+	/* While running here, it's safe to purge the event queue */
+	eeh_remove_event(NULL);
+
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+		/*
+		 * If the subordinate PCI buses of the PHB has been
+		 * removed, we needn't take care of it any more.
+		 */
+		phb = hose->private_data;
+		if (phb->removed)
+			continue;
+
+		rc = opal_pci_next_error(phb->opal_id,
+				&frozen_pe_no, &err_type, &severity);
+
+		/* If OPAL API returns error, we needn't proceed */
+		if (rc != OPAL_SUCCESS) {
+			IODA_EEH_DBG("%s: Invalid return value on "
+				     "PHB#%x (0x%lx) from opal_pci_next_error",
+				     __func__, hose->global_number, rc);
+			continue;
+		}
+
+		/* If the PHB doesn't have error, stop processing */
+		if (err_type == OPAL_EEH_NO_ERROR ||
+		    severity == OPAL_EEH_SEV_NO_ERROR) {
+			IODA_EEH_DBG("%s: No error found on PHB#%x\n",
+				     __func__, hose->global_number);
+			continue;
+		}
+
+		/*
+		 * Processing the error. We're expecting the error with
+		 * highest priority reported upon multiple errors on the
+		 * specific PHB.
+		 */
+		IODA_EEH_DBG("%s: Error (%d, %d, %d) on PHB#%x\n",
+			err_type, severity, pe_no, hose->global_number);
+		switch (err_type) {
+		case OPAL_EEH_IOC_ERROR:
+			if (severity == OPAL_EEH_SEV_IOC_DEAD) {
+				list_for_each_entry_safe(hose, tmp,
+						&hose_list, list_node) {
+					phb = hose->private_data;
+					phb->removed = 1;
+				}
+
+				WARN(1, "EEH: dead IOC detected\n");
+				ret = 4;
+				goto out;
+			} else if (severity == OPAL_EEH_SEV_INF)
+				ioda_eeh_hub_diag(hose);
+
+			break;
+		case OPAL_EEH_PHB_ERROR:
+			if (severity == OPAL_EEH_SEV_PHB_DEAD) {
+				if (ioda_eeh_get_phb_pe(hose, pe))
+					break;
+
+				WARN(1, "EEH: dead PHB#%x detected\n",
+				     hose->global_number);
+				phb->removed = 1;
+				ret = 3;
+				goto out;
+			} else if (severity == OPAL_EEH_SEV_PHB_FENCED) {
+				if (ioda_eeh_get_phb_pe(hose, pe))
+					break;
+
+				WARN(1, "EEH: fenced PHB#%x detected\n",
+				     hose->global_number);
+				ret = 2;
+				goto out;
+			} else if (severity == OPAL_EEH_SEV_INF)
+				ioda_eeh_phb_diag(hose);
+
+			break;
+		case OPAL_EEH_PE_ERROR:
+			if (ioda_eeh_get_pe(hose, frozen_pe_no, pe))
+				break;
+
+			WARN(1, "EEH: Frozen PE#%x on PHB#%x detected\n",
+			     (*pe)->addr, (*pe)->phb->global_number);
+			ret = 1;
+			goto out;
+		}
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
 struct pnv_eeh_ops ioda_eeh_ops = {
 	.post_init		= ioda_eeh_post_init,
 	.set_option		= ioda_eeh_set_option,
@@ -505,5 +835,5 @@ struct pnv_eeh_ops ioda_eeh_ops = {
 	.reset			= ioda_eeh_reset,
 	.get_log		= ioda_eeh_get_log,
 	.configure_bridge	= ioda_eeh_configure_bridge,
-	.next_error		= NULL
+	.next_error		= ioda_eeh_next_error
 };
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 336c9dc..3656a240 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -93,6 +93,7 @@ struct pnv_phb {
 #ifdef CONFIG_EEH
 	struct pnv_eeh_ops	*eeh_ops;
 	int			eeh_enabled;
+	int			removed;
 #endif
 
 #ifdef CONFIG_PCI_MSI
-- 
cgit v0.10.2


From 29310e5e860697955b9c10ddb448d61267fab0dc Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:21:13 +0800
Subject: powerpc/eeh: PowerNV EEH backends

The patch adds EEH backends for PowerNV platform. It's notable that
part of those EEH backends call to the I/O chip dependent backends.

[Removed pointless change to eeh_pseries.c -- BenH]

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 09bd0cb..7fe5951 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -3,4 +3,4 @@ obj-y			+= opal-rtc.o opal-nvram.o
 
 obj-$(CONFIG_SMP)	+= smp.o
 obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o pci-ioda.o
-obj-$(CONFIG_EEH)	+= eeh-ioda.o
+obj-$(CONFIG_EEH)	+= eeh-ioda.o eeh-powernv.o
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
new file mode 100644
index 0000000..9559115
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -0,0 +1,419 @@
+/*
+ * The file intends to implement the platform dependent EEH operations on
+ * powernv platform. Actually, the powernv was created in order to fully
+ * hypervisor support.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/atomic.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/msi.h>
+#include <linux/of.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/firmware.h>
+#include <asm/io.h>
+#include <asm/iommu.h>
+#include <asm/machdep.h>
+#include <asm/msi_bitmap.h>
+#include <asm/opal.h>
+#include <asm/ppc-pci.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+/**
+ * powernv_eeh_init - EEH platform dependent initialization
+ *
+ * EEH platform dependent initialization on powernv
+ */
+static int powernv_eeh_init(void)
+{
+	/* We require OPALv3 */
+	if (!firmware_has_feature(FW_FEATURE_OPALv3)) {
+		pr_warning("%s: OPALv3 is required !\n", __func__);
+		return -EINVAL;
+	}
+
+	/* Set EEH probe mode */
+	eeh_probe_mode_set(EEH_PROBE_MODE_DEV);
+
+	return 0;
+}
+
+/**
+ * powernv_eeh_post_init - EEH platform dependent post initialization
+ *
+ * EEH platform dependent post initialization on powernv. When
+ * the function is called, the EEH PEs and devices should have
+ * been built. If the I/O cache staff has been built, EEH is
+ * ready to supply service.
+ */
+static int powernv_eeh_post_init(void)
+{
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+	int ret = 0;
+
+	list_for_each_entry(hose, &hose_list, list_node) {
+		phb = hose->private_data;
+
+		if (phb->eeh_ops && phb->eeh_ops->post_init) {
+			ret = phb->eeh_ops->post_init(hose);
+			if (ret)
+				break;
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * powernv_eeh_dev_probe - Do probe on PCI device
+ * @dev: PCI device
+ * @flag: unused
+ *
+ * When EEH module is installed during system boot, all PCI devices
+ * are checked one by one to see if it supports EEH. The function
+ * is introduced for the purpose. By default, EEH has been enabled
+ * on all PCI devices. That's to say, we only need do necessary
+ * initialization on the corresponding eeh device and create PE
+ * accordingly.
+ *
+ * It's notable that's unsafe to retrieve the EEH device through
+ * the corresponding PCI device. During the PCI device hotplug, which
+ * was possiblly triggered by EEH core, the binding between EEH device
+ * and the PCI device isn't built yet.
+ */
+static int powernv_eeh_dev_probe(struct pci_dev *dev, void *flag)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct device_node *dn = pci_device_to_OF_node(dev);
+	struct eeh_dev *edev = of_node_to_eeh_dev(dn);
+
+	/*
+	 * When probing the root bridge, which doesn't have any
+	 * subordinate PCI devices. We don't have OF node for
+	 * the root bridge. So it's not reasonable to continue
+	 * the probing.
+	 */
+	if (!dn || !edev)
+		return 0;
+
+	/* Skip for PCI-ISA bridge */
+	if ((dev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
+		return 0;
+
+	/* Initialize eeh device */
+	edev->class_code	= dev->class;
+	edev->mode		= 0;
+	edev->config_addr	= ((dev->bus->number << 8) | dev->devfn);
+	edev->pe_config_addr	= phb->bdfn_to_pe(phb, dev->bus, dev->devfn & 0xff);
+
+	/* Create PE */
+	eeh_add_to_parent_pe(edev);
+
+	/*
+	 * Enable EEH explicitly so that we will do EEH check
+	 * while accessing I/O stuff
+	 *
+	 * FIXME: Enable that for PHB3 later
+	 */
+	if (phb->type == PNV_PHB_IODA1)
+		eeh_subsystem_enabled = 1;
+
+	/* Save memory bars */
+	eeh_save_bars(edev);
+
+	return 0;
+}
+
+/**
+ * powernv_eeh_set_option - Initialize EEH or MMIO/DMA reenable
+ * @pe: EEH PE
+ * @option: operation to be issued
+ *
+ * The function is used to control the EEH functionality globally.
+ * Currently, following options are support according to PAPR:
+ * Enable EEH, Disable EEH, Enable MMIO and Enable DMA
+ */
+static int powernv_eeh_set_option(struct eeh_pe *pe, int option)
+{
+	struct pci_controller *hose = pe->phb;
+	struct pnv_phb *phb = hose->private_data;
+	int ret = -EEXIST;
+
+	/*
+	 * What we need do is pass it down for hardware
+	 * implementation to handle it.
+	 */
+	if (phb->eeh_ops && phb->eeh_ops->set_option)
+		ret = phb->eeh_ops->set_option(pe, option);
+
+	return ret;
+}
+
+/**
+ * powernv_eeh_get_pe_addr - Retrieve PE address
+ * @pe: EEH PE
+ *
+ * Retrieve the PE address according to the given tranditional
+ * PCI BDF (Bus/Device/Function) address.
+ */
+static int powernv_eeh_get_pe_addr(struct eeh_pe *pe)
+{
+	return pe->addr;
+}
+
+/**
+ * powernv_eeh_get_state - Retrieve PE state
+ * @pe: EEH PE
+ * @delay: delay while PE state is temporarily unavailable
+ *
+ * Retrieve the state of the specified PE. For IODA-compitable
+ * platform, it should be retrieved from IODA table. Therefore,
+ * we prefer passing down to hardware implementation to handle
+ * it.
+ */
+static int powernv_eeh_get_state(struct eeh_pe *pe, int *delay)
+{
+	struct pci_controller *hose = pe->phb;
+	struct pnv_phb *phb = hose->private_data;
+	int ret = EEH_STATE_NOT_SUPPORT;
+
+	if (phb->eeh_ops && phb->eeh_ops->get_state) {
+		ret = phb->eeh_ops->get_state(pe);
+
+		/*
+		 * If the PE state is temporarily unavailable,
+		 * to inform the EEH core delay for default
+		 * period (1 second)
+		 */
+		if (delay) {
+			*delay = 0;
+			if (ret & EEH_STATE_UNAVAILABLE)
+				*delay = 1000;
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * powernv_eeh_reset - Reset the specified PE
+ * @pe: EEH PE
+ * @option: reset option
+ *
+ * Reset the specified PE
+ */
+static int powernv_eeh_reset(struct eeh_pe *pe, int option)
+{
+	struct pci_controller *hose = pe->phb;
+	struct pnv_phb *phb = hose->private_data;
+	int ret = -EEXIST;
+
+	if (phb->eeh_ops && phb->eeh_ops->reset)
+		ret = phb->eeh_ops->reset(pe, option);
+
+	return ret;
+}
+
+/**
+ * powernv_eeh_wait_state - Wait for PE state
+ * @pe: EEH PE
+ * @max_wait: maximal period in microsecond
+ *
+ * Wait for the state of associated PE. It might take some time
+ * to retrieve the PE's state.
+ */
+static int powernv_eeh_wait_state(struct eeh_pe *pe, int max_wait)
+{
+	int ret;
+	int mwait;
+
+	while (1) {
+		ret = powernv_eeh_get_state(pe, &mwait);
+
+		/*
+		 * If the PE's state is temporarily unavailable,
+		 * we have to wait for the specified time. Otherwise,
+		 * the PE's state will be returned immediately.
+		 */
+		if (ret != EEH_STATE_UNAVAILABLE)
+			return ret;
+
+		max_wait -= mwait;
+		if (max_wait <= 0) {
+			pr_warning("%s: Timeout getting PE#%x's state (%d)\n",
+				   __func__, pe->addr, max_wait);
+			return EEH_STATE_NOT_SUPPORT;
+		}
+
+		msleep(mwait);
+	}
+
+	return EEH_STATE_NOT_SUPPORT;
+}
+
+/**
+ * powernv_eeh_get_log - Retrieve error log
+ * @pe: EEH PE
+ * @severity: temporary or permanent error log
+ * @drv_log: driver log to be combined with retrieved error log
+ * @len: length of driver log
+ *
+ * Retrieve the temporary or permanent error from the PE.
+ */
+static int powernv_eeh_get_log(struct eeh_pe *pe, int severity,
+			char *drv_log, unsigned long len)
+{
+	struct pci_controller *hose = pe->phb;
+	struct pnv_phb *phb = hose->private_data;
+	int ret = -EEXIST;
+
+	if (phb->eeh_ops && phb->eeh_ops->get_log)
+		ret = phb->eeh_ops->get_log(pe, severity, drv_log, len);
+
+	return ret;
+}
+
+/**
+ * powernv_eeh_configure_bridge - Configure PCI bridges in the indicated PE
+ * @pe: EEH PE
+ *
+ * The function will be called to reconfigure the bridges included
+ * in the specified PE so that the mulfunctional PE would be recovered
+ * again.
+ */
+static int powernv_eeh_configure_bridge(struct eeh_pe *pe)
+{
+	struct pci_controller *hose = pe->phb;
+	struct pnv_phb *phb = hose->private_data;
+	int ret = 0;
+
+	if (phb->eeh_ops && phb->eeh_ops->configure_bridge)
+		ret = phb->eeh_ops->configure_bridge(pe);
+
+	return ret;
+}
+
+/**
+ * powernv_eeh_read_config - Read PCI config space
+ * @dn: device node
+ * @where: PCI address
+ * @size: size to read
+ * @val: return value
+ *
+ * Read config space from the speicifed device
+ */
+static int powernv_eeh_read_config(struct device_node *dn, int where,
+				   int size, u32 *val)
+{
+	struct eeh_dev *edev = of_node_to_eeh_dev(dn);
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+	struct pci_controller *hose = edev->phb;
+
+	return hose->ops->read(dev->bus, dev->devfn, where, size, val);
+}
+
+/**
+ * powernv_eeh_write_config - Write PCI config space
+ * @dn: device node
+ * @where: PCI address
+ * @size: size to write
+ * @val: value to be written
+ *
+ * Write config space to the specified device
+ */
+static int powernv_eeh_write_config(struct device_node *dn, int where,
+				    int size, u32 val)
+{
+	struct eeh_dev *edev = of_node_to_eeh_dev(dn);
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+	struct pci_controller *hose = edev->phb;
+
+	hose = pci_bus_to_host(dev->bus);
+
+	return hose->ops->write(dev->bus, dev->devfn, where, size, val);
+}
+
+/**
+ * powernv_eeh_next_error - Retrieve next EEH error to handle
+ * @pe: Affected PE
+ *
+ * Using OPAL API, to retrieve next EEH error for EEH core to handle
+ */
+static int powernv_eeh_next_error(struct eeh_pe **pe)
+{
+	struct pci_controller *hose;
+	struct pnv_phb *phb = NULL;
+
+	list_for_each_entry(hose, &hose_list, list_node) {
+		phb = hose->private_data;
+		break;
+	}
+
+	if (phb && phb->eeh_ops->next_error)
+		return phb->eeh_ops->next_error(pe);
+
+	return -EEXIST;
+}
+
+static struct eeh_ops powernv_eeh_ops = {
+	.name                   = "powernv",
+	.init                   = powernv_eeh_init,
+	.post_init              = powernv_eeh_post_init,
+	.of_probe               = NULL,
+	.dev_probe              = powernv_eeh_dev_probe,
+	.set_option             = powernv_eeh_set_option,
+	.get_pe_addr            = powernv_eeh_get_pe_addr,
+	.get_state              = powernv_eeh_get_state,
+	.reset                  = powernv_eeh_reset,
+	.wait_state             = powernv_eeh_wait_state,
+	.get_log                = powernv_eeh_get_log,
+	.configure_bridge       = powernv_eeh_configure_bridge,
+	.read_config            = powernv_eeh_read_config,
+	.write_config           = powernv_eeh_write_config,
+	.next_error		= powernv_eeh_next_error
+};
+
+/**
+ * eeh_powernv_init - Register platform dependent EEH operations
+ *
+ * EEH initialization on powernv platform. This function should be
+ * called before any EEH related functions.
+ */
+static int __init eeh_powernv_init(void)
+{
+	int ret = -EINVAL;
+
+	if (!machine_is(powernv))
+		return ret;
+
+	ret = eeh_ops_register(&powernv_eeh_ops);
+	if (!ret)
+		pr_info("EEH: PowerNV platform initialized\n");
+	else
+		pr_info("EEH: Failed to initialize PowerNV platform (%d)\n", ret);
+
+	return ret;
+}
+
+early_initcall(eeh_powernv_init);
-- 
cgit v0.10.2


From e9cc17d4ded2d10b332c7f3788d7817f7d9d01ef Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:21:14 +0800
Subject: powerpc/eeh: Initialization for PowerNV

The patch initializes EEH for PowerNV platform. Because the OPAL
APIs requires HUB ID, we need trace that through struct pnv_phb.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 2931d97..319ed61 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -974,6 +974,11 @@ static void pnv_pci_ioda_fixup(void)
 	pnv_pci_ioda_setup_PEs();
 	pnv_pci_ioda_setup_seg();
 	pnv_pci_ioda_setup_DMA();
+
+#ifdef CONFIG_EEH
+	eeh_addr_cache_build();
+	eeh_init();
+#endif
 }
 
 /*
@@ -1050,7 +1055,8 @@ static void pnv_pci_ioda_shutdown(struct pnv_phb *phb)
 		       OPAL_ASSERT_RESET);
 }
 
-void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type)
+void __init pnv_pci_init_ioda_phb(struct device_node *np,
+				  u64 hub_id, int ioda_type)
 {
 	struct pci_controller *hose;
 	static int primary = 1;
@@ -1088,6 +1094,7 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type)
 	hose->first_busno = 0;
 	hose->last_busno = 0xff;
 	hose->private_data = phb;
+	phb->hub_id = hub_id;
 	phb->opal_id = phb_id;
 	phb->type = ioda_type;
 
@@ -1173,6 +1180,9 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type)
 		phb->ioda.io_size, phb->ioda.io_segsize);
 
 	phb->hose->ops = &pnv_pci_ops;
+#ifdef CONFIG_EEH
+	phb->eeh_ops = &ioda_eeh_ops;
+#endif
 
 	/* Setup RID -> PE mapping function */
 	phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe;
@@ -1213,7 +1223,7 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type)
 
 void pnv_pci_init_ioda2_phb(struct device_node *np)
 {
-	pnv_pci_init_ioda_phb(np, PNV_PHB_IODA2);
+	pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
 }
 
 void __init pnv_pci_init_ioda_hub(struct device_node *np)
@@ -1236,6 +1246,6 @@ void __init pnv_pci_init_ioda_hub(struct device_node *np)
 	for_each_child_of_node(np, phbn) {
 		/* Look for IODA1 PHBs */
 		if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
-			pnv_pci_init_ioda_phb(phbn, PNV_PHB_IODA1);
+			pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1);
 	}
 }
diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
index 5d378f2..b68db63 100644
--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
@@ -95,7 +95,7 @@ static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
 	set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table);
 }
 
-static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np,
+static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
 					   void *tce_mem, u64 tce_size)
 {
 	struct pnv_phb *phb;
@@ -136,6 +136,7 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np,
 	phb->hose->first_busno = 0;
 	phb->hose->last_busno = 0xff;
 	phb->hose->private_data = phb;
+	phb->hub_id = hub_id;
 	phb->opal_id = phb_id;
 	phb->type = PNV_PHB_P5IOC2;
 	phb->model = PNV_PHB_MODEL_P5IOC2;
@@ -229,7 +230,8 @@ void __init pnv_pci_init_p5ioc2_hub(struct device_node *np)
 	for_each_child_of_node(np, phbn) {
 		if (of_device_is_compatible(phbn, "ibm,p5ioc2-pcix") ||
 		    of_device_is_compatible(phbn, "ibm,p5ioc2-pciex")) {
-			pnv_pci_init_p5ioc2_phb(phbn, tce_mem, tce_per_phb);
+			pnv_pci_init_p5ioc2_phb(phbn, hub_id,
+					tce_mem, tce_per_phb);
 			tce_mem += tce_per_phb;
 		}
 	}
-- 
cgit v0.10.2


From be7e744607175fb1620f0390d20c880e16de163b Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:21:15 +0800
Subject: powerpc/eeh: Enable EEH check for config access

The patch enables EEH check and let EEH core to process the EEH
errors for PowerNV platform while accessing config space. Originally,
the implementation already had mechanism to check EEH errors and
tried to recover from them. However, we never let EEH core to handle
the EEH errors.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 5edceb7..577cbea 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -33,6 +33,8 @@
 #include <asm/iommu.h>
 #include <asm/tce.h>
 #include <asm/firmware.h>
+#include <asm/eeh_event.h>
+#include <asm/eeh.h>
 
 #include "powernv.h"
 #include "pci.h"
@@ -260,6 +262,10 @@ static int pnv_pci_read_config(struct pci_bus *bus,
 {
 	struct pci_controller *hose = pci_bus_to_host(bus);
 	struct pnv_phb *phb = hose->private_data;
+#ifdef CONFIG_EEH
+	struct device_node *busdn, *dn;
+	struct eeh_pe *phb_pe = NULL;
+#endif
 	u32 bdfn = (((uint64_t)bus->number) << 8) | devfn;
 	s64 rc;
 
@@ -292,8 +298,34 @@ static int pnv_pci_read_config(struct pci_bus *bus,
 	cfg_dbg("pnv_pci_read_config bus: %x devfn: %x +%x/%x -> %08x\n",
 		bus->number, devfn, where, size, *val);
 
-	/* Check if the PHB got frozen due to an error (no response) */
+	/*
+	 * Check if the specified PE has been put into frozen
+	 * state. On the other hand, we needn't do that while
+	 * the PHB has been put into frozen state because of
+	 * PHB-fatal errors.
+	 */
+#ifdef CONFIG_EEH
+	phb_pe = eeh_phb_pe_get(hose);
+	if (phb_pe && (phb_pe->state & EEH_PE_ISOLATED))
+		return PCIBIOS_SUCCESSFUL;
+
+	if (phb->eeh_enabled) {
+		if (*val == EEH_IO_ERROR_VALUE(size)) {
+			busdn = pci_bus_to_OF_node(bus);
+			for (dn = busdn->child; dn; dn = dn->sibling) {
+				struct pci_dn *pdn = PCI_DN(dn);
+
+				if (pdn && pdn->devfn == devfn &&
+				    eeh_dev_check_failure(of_node_to_eeh_dev(dn)))
+					return PCIBIOS_DEVICE_NOT_FOUND;
+			}
+		}
+	} else {
+		pnv_pci_config_check_eeh(phb, bus, bdfn);
+	}
+#else
 	pnv_pci_config_check_eeh(phb, bus, bdfn);
+#endif
 
 	return PCIBIOS_SUCCESSFUL;
 }
@@ -324,8 +356,14 @@ static int pnv_pci_write_config(struct pci_bus *bus,
 	default:
 		return PCIBIOS_FUNC_NOT_SUPPORTED;
 	}
+
 	/* Check if the PHB got frozen due to an error (no response) */
+#ifdef CONFIG_EEH
+	if (!phb->eeh_enabled)
+		pnv_pci_config_check_eeh(phb, bus, bdfn);
+#else
 	pnv_pci_config_check_eeh(phb, bus, bdfn);
+#endif
 
 	return PCIBIOS_SUCCESSFUL;
 }
-- 
cgit v0.10.2


From b95cd2cd44b39cf11087b15f74e29ef9f2c6bf0f Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 13:21:16 +0800
Subject: powerpc/eeh: Allow to check fenced PHB proactively

It's meaningless to handle frozen PE if we already had fenced PHB.
The patch intends to check the PHB state before checking PE. If the
PHB has been put into fenced state, we need take care of that firstly.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 81cd031..7c567be 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -269,6 +269,58 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
 	return pa | (token & (PAGE_SIZE-1));
 }
 
+/*
+ * On PowerNV platform, we might already have fenced PHB there.
+ * For that case, it's meaningless to recover frozen PE. Intead,
+ * We have to handle fenced PHB firstly.
+ */
+static int eeh_phb_check_failure(struct eeh_pe *pe)
+{
+	struct eeh_pe *phb_pe;
+	unsigned long flags;
+	int ret;
+
+	if (!eeh_probe_mode_dev())
+		return -EPERM;
+
+	/* Find the PHB PE */
+	phb_pe = eeh_phb_pe_get(pe->phb);
+	if (!phb_pe) {
+		pr_warning("%s Can't find PE for PHB#%d\n",
+			   __func__, pe->phb->global_number);
+		return -EEXIST;
+	}
+
+	/* If the PHB has been in problematic state */
+	eeh_serialize_lock(&flags);
+	if (phb_pe->state & (EEH_PE_ISOLATED | EEH_PE_PHB_DEAD)) {
+		ret = 0;
+		goto out;
+	}
+
+	/* Check PHB state */
+	ret = eeh_ops->get_state(phb_pe, NULL);
+	if ((ret < 0) ||
+	    (ret == EEH_STATE_NOT_SUPPORT) ||
+	    (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) ==
+	    (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) {
+		ret = 0;
+		goto out;
+	}
+
+	/* Isolate the PHB and send event */
+	eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED);
+	eeh_serialize_unlock(flags);
+	eeh_send_failure_event(phb_pe);
+
+	WARN(1, "EEH: PHB failure detected\n");
+
+	return 1;
+out:
+	eeh_serialize_unlock(flags);
+	return ret;
+}
+
 /**
  * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze
  * @edev: eeh device
@@ -319,6 +371,14 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 		return 0;
 	}
 
+	/*
+	 * On PowerNV platform, we might already have fenced PHB
+	 * there and we need take care of that firstly.
+	 */
+	ret = eeh_phb_check_failure(pe);
+	if (ret > 0)
+		return ret;
+
 	/* If we already have a pending isolation event for this
 	 * slot, we know it's bad already, we don't need to check.
 	 * Do this checking under a lock; as multiple PCI devices
-- 
cgit v0.10.2


From 1bc98de26d3f270e004421685a8e698e91cf95ca Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 18:13:22 +0800
Subject: powernv/opal: Notifier for OPAL events

This patch implements a notifier to receive a notification on OPAL
event mask changes. The notifier is only called as a result of an OPAL
interrupt, which will happen upon reception of FSP messages or PCI errors.
Any event mask change detected as a result of opal_poll_events() will not
result in a notifier call.

[benh: changelog]
Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 2880797..029fe85 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -644,6 +644,11 @@ extern void hvc_opal_init_early(void);
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
 				   int depth, void *data);
 
+extern int opal_notifier_register(struct notifier_block *nb);
+extern void opal_notifier_enable(void);
+extern void opal_notifier_disable(void);
+extern void opal_notifier_update_evt(uint64_t evt_mask, uint64_t evt_val);
+
 extern int opal_get_chars(uint32_t vtermno, char *buf, int count);
 extern int opal_put_chars(uint32_t vtermno, const char *buf, int total_len);
 
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 628c564..106301f 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -15,6 +15,7 @@
 #include <linux/of.h>
 #include <linux/of_platform.h>
 #include <linux/interrupt.h>
+#include <linux/notifier.h>
 #include <linux/slab.h>
 #include <asm/opal.h>
 #include <asm/firmware.h>
@@ -31,6 +32,10 @@ static DEFINE_SPINLOCK(opal_write_lock);
 extern u64 opal_mc_secondary_handler[];
 static unsigned int *opal_irqs;
 static unsigned int opal_irq_count;
+static ATOMIC_NOTIFIER_HEAD(opal_notifier_head);
+static DEFINE_SPINLOCK(opal_notifier_lock);
+static uint64_t last_notified_mask = 0x0ul;
+static atomic_t opal_notifier_hold = ATOMIC_INIT(0);
 
 int __init early_init_dt_scan_opal(unsigned long node,
 				   const char *uname, int depth, void *data)
@@ -95,6 +100,68 @@ static int __init opal_register_exception_handlers(void)
 
 early_initcall(opal_register_exception_handlers);
 
+int opal_notifier_register(struct notifier_block *nb)
+{
+	if (!nb) {
+		pr_warning("%s: Invalid argument (%p)\n",
+			   __func__, nb);
+		return -EINVAL;
+	}
+
+	atomic_notifier_chain_register(&opal_notifier_head, nb);
+	return 0;
+}
+
+static void opal_do_notifier(uint64_t events)
+{
+	unsigned long flags;
+	uint64_t changed_mask;
+
+	if (atomic_read(&opal_notifier_hold))
+		return;
+
+	spin_lock_irqsave(&opal_notifier_lock, flags);
+	changed_mask = last_notified_mask ^ events;
+	last_notified_mask = events;
+	spin_unlock_irqrestore(&opal_notifier_lock, flags);
+
+	/*
+	 * We feed with the event bits and changed bits for
+	 * enough information to the callback.
+	 */
+	atomic_notifier_call_chain(&opal_notifier_head,
+				   events, (void *)changed_mask);
+}
+
+void opal_notifier_update_evt(uint64_t evt_mask,
+			      uint64_t evt_val)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&opal_notifier_lock, flags);
+	last_notified_mask &= ~evt_mask;
+	last_notified_mask |= evt_val;
+	spin_unlock_irqrestore(&opal_notifier_lock, flags);
+}
+
+void opal_notifier_enable(void)
+{
+	int64_t rc;
+	uint64_t evt = 0;
+
+	atomic_set(&opal_notifier_hold, 0);
+
+	/* Process pending events */
+	rc = opal_poll_events(&evt);
+	if (rc == OPAL_SUCCESS && evt)
+		opal_do_notifier(evt);
+}
+
+void opal_notifier_disable(void)
+{
+	atomic_set(&opal_notifier_hold, 1);
+}
+
 int opal_get_chars(uint32_t vtermno, char *buf, int count)
 {
 	s64 len, rc;
@@ -297,7 +364,7 @@ static irqreturn_t opal_interrupt(int irq, void *data)
 
 	opal_handle_interrupt(virq_to_hw(irq), &events);
 
-	/* XXX TODO: Do something with the events */
+	opal_do_notifier(events);
 
 	return IRQ_HANDLED;
 }
-- 
cgit v0.10.2


From e8e71fa426d72914c500e98afa2076628076f511 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 18:13:23 +0800
Subject: powernv/opal: Disable OPAL notifier upon poweroff

While we're restarting or powering off the system, we needn't
the OPAL notifier any more. So just to disable that.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index d4459bf..84438af 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -93,6 +93,8 @@ static void  __noreturn pnv_restart(char *cmd)
 {
 	long rc = OPAL_BUSY;
 
+	opal_notifier_disable();
+
 	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
 		rc = opal_cec_reboot();
 		if (rc == OPAL_BUSY_EVENT)
@@ -108,6 +110,8 @@ static void __noreturn pnv_power_off(void)
 {
 	long rc = OPAL_BUSY;
 
+	opal_notifier_disable();
+
 	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
 		rc = opal_cec_power_down(0);
 		if (rc == OPAL_BUSY_EVENT)
-- 
cgit v0.10.2


From 7cb9d93dc6d4f717218b6fa791be9bcf4e417379 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 18:13:24 +0800
Subject: powerpc/eeh: Register OPAL notifier for PCI error

The patch registers OPAL event notifier and process the PCI errors
from firmware. If we have pending PCI errors, special EEH event
(without binding PE) will be sent to EEH core for processing.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index a3eebd1..2b7689e 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -18,6 +18,7 @@
 #include <linux/irq.h>
 #include <linux/kernel.h>
 #include <linux/msi.h>
+#include <linux/notifier.h>
 #include <linux/pci.h>
 #include <linux/string.h>
 
@@ -42,6 +43,26 @@
 #endif
 
 static char *hub_diag = NULL;
+static int ioda_eeh_nb_init = 0;
+
+static int ioda_eeh_event(struct notifier_block *nb,
+			  unsigned long events, void *change)
+{
+	uint64_t changed_evts = (uint64_t)change;
+
+	/* We simply send special EEH event */
+	if ((changed_evts & OPAL_EVENT_PCI_ERROR) &&
+	    (events & OPAL_EVENT_PCI_ERROR))
+		eeh_send_failure_event(NULL);
+
+	return 0;
+}
+
+static struct notifier_block ioda_eeh_nb = {
+	.notifier_call	= ioda_eeh_event,
+	.next		= NULL,
+	.priority	= 0
+};
 
 /**
  * ioda_eeh_post_init - Chip dependent post initialization
@@ -54,6 +75,19 @@ static char *hub_diag = NULL;
 static int ioda_eeh_post_init(struct pci_controller *hose)
 {
 	struct pnv_phb *phb = hose->private_data;
+	int ret;
+
+	/* Register OPAL event notifier */
+	if (!ioda_eeh_nb_init) {
+		ret = opal_notifier_register(&ioda_eeh_nb);
+		if (ret) {
+			pr_err("%s: Can't register OPAL event notifier (%d)\n",
+			       __func__, ret);
+			return ret;
+		}
+
+		ioda_eeh_nb_init = 1;
+	}
 
 	/* FIXME: Enable it for PHB3 later */
 	if (phb->type == PNV_PHB_IODA1) {
@@ -736,8 +770,13 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 	long rc;
 	int ret = 1;
 
-	/* While running here, it's safe to purge the event queue */
+	/*
+	 * While running here, it's safe to purge the event queue.
+	 * And we should keep the cached OPAL notifier event sychronized
+	 * between the kernel and firmware.
+	 */
 	eeh_remove_event(NULL);
+	opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
 
 	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
 		/*
-- 
cgit v0.10.2


From 37c367f2792f899528b5bf201a4bd6131f8b75b6 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 18:13:25 +0800
Subject: powerpc/powernv: Debugfs directory for PHB

The patch creates one debugfs directory ("powerpc/PCIxxxx") for
each PHB so that we can hook EEH error injection debugfs entry
there in proceeding patch.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 319ed61..dc4ec79 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -13,6 +13,7 @@
 
 #include <linux/kernel.h>
 #include <linux/pci.h>
+#include <linux/debugfs.h>
 #include <linux/delay.h>
 #include <linux/string.h>
 #include <linux/init.h>
@@ -32,6 +33,7 @@
 #include <asm/iommu.h>
 #include <asm/tce.h>
 #include <asm/xics.h>
+#include <asm/debug.h>
 
 #include "powernv.h"
 #include "pci.h"
@@ -969,12 +971,33 @@ static void pnv_pci_ioda_setup_DMA(void)
 	}
 }
 
+static void pnv_pci_ioda_create_dbgfs(void)
+{
+#ifdef CONFIG_DEBUG_FS
+	struct pci_controller *hose, *tmp;
+	struct pnv_phb *phb;
+	char name[16];
+
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+		phb = hose->private_data;
+
+		sprintf(name, "PCI%04x", hose->global_number);
+		phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
+		if (!phb->dbgfs)
+			pr_warning("%s: Error on creating debugfs on PHB#%x\n",
+				__func__, hose->global_number);
+	}
+#endif /* CONFIG_DEBUG_FS */
+}
+
 static void pnv_pci_ioda_fixup(void)
 {
 	pnv_pci_ioda_setup_PEs();
 	pnv_pci_ioda_setup_seg();
 	pnv_pci_ioda_setup_DMA();
 
+	pnv_pci_ioda_create_dbgfs();
+
 #ifdef CONFIG_EEH
 	eeh_addr_cache_build();
 	eeh_init();
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 3656a240..43906e3 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -96,6 +96,10 @@ struct pnv_phb {
 	int			removed;
 #endif
 
+#ifdef CONFIG_DEBUG_FS
+	struct dentry		*dbgfs;
+#endif
+
 #ifdef CONFIG_PCI_MSI
 	unsigned int		msi_base;
 	unsigned int		msi32_support;
-- 
cgit v0.10.2


From 8998897b8f966c9036307b5130494d4bf1e6cb18 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 18:13:26 +0800
Subject: powerpc/eeh: Debugfs for error injection

The patch creates debugfs entries (powerpc/PCIxxxx/err_injct) for
injecting EEH errors for testing purpose.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index 2b7689e..84f3036 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -12,6 +12,7 @@
  */
 
 #include <linux/bootmem.h>
+#include <linux/debugfs.h>
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/io.h>
@@ -64,6 +65,29 @@ static struct notifier_block ioda_eeh_nb = {
 	.priority	= 0
 };
 
+#ifdef CONFIG_DEBUG_FS
+static int ioda_eeh_dbgfs_set(void *data, u64 val)
+{
+	struct pci_controller *hose = data;
+	struct pnv_phb *phb = hose->private_data;
+
+	out_be64(phb->regs + 0xD10, val);
+	return 0;
+}
+
+static int ioda_eeh_dbgfs_get(void *data, u64 *val)
+{
+	struct pci_controller *hose = data;
+	struct pnv_phb *phb = hose->private_data;
+
+	*val = in_be64(phb->regs + 0xD10);
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_dbgfs_ops, ioda_eeh_dbgfs_get,
+			ioda_eeh_dbgfs_set, "0x%llx\n");
+#endif /* CONFIG_DEBUG_FS */
+
 /**
  * ioda_eeh_post_init - Chip dependent post initialization
  * @hose: PCI controller
@@ -101,6 +125,13 @@ static int ioda_eeh_post_init(struct pci_controller *hose)
 			}
 		}
 
+#ifdef CONFIG_DEBUG_FS
+		if (phb->dbgfs)
+			debugfs_create_file("err_injct", 0600,
+					    phb->dbgfs, hose,
+					    &ioda_eeh_dbgfs_ops);
+#endif
+
 		phb->eeh_enabled = 1;
 	}
 
-- 
cgit v0.10.2


From db3d8534903c8a9617142975bec6db95acaba753 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 14:30:13 +0530
Subject: powerpc/mm: handle hugepage size correctly when invalidating hpte
 entries

If a hash bucket gets full, we "evict" a more/less random entry from it.
When we do that we don't invalidate the TLB (hpte_remove) because we assume
the old translation is still technically "valid". This implies that when
we are invalidating or updating pte, even if HPTE entry is not valid
we should do a tlb invalidate. With hugepages, we need to pass the correct
actual page size value for tlb invalidation.

This change update the patch 0608d692463598c1d6e826d9dd7283381b4f246c
"powerpc/mm: Always invalidate tlb on hpte invalidate and update" to handle
transparent hugepages correctly.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 92386fc..801e3c6 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -36,13 +36,13 @@ struct machdep_calls {
 #ifdef CONFIG_PPC64
 	void            (*hpte_invalidate)(unsigned long slot,
 					   unsigned long vpn,
-					   int psize, int ssize,
-					   int local);
+					   int bpsize, int apsize,
+					   int ssize, int local);
 	long		(*hpte_updatepp)(unsigned long slot, 
 					 unsigned long newpp, 
 					 unsigned long vpn,
-					 int psize, int ssize,
-					 int local);
+					 int bpsize, int apsize,
+					 int ssize, int local);
 	void            (*hpte_updateboltedpp)(unsigned long newpp, 
 					       unsigned long ea,
 					       int psize, int ssize);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 3a9a1ac..176d3fd 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -34,7 +34,7 @@
 void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
 	ppc_md.hpte_invalidate(pte->slot, pte->host_vpn,
-			       MMU_PAGE_4K, MMU_SEGSIZE_256M,
+			       MMU_PAGE_4K, MMU_PAGE_4K, MMU_SEGSIZE_256M,
 			       false);
 }
 
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index 0e980ac..d3cbda6 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -289,9 +289,10 @@ htab_modify_pte:
 
 	/* Call ppc_md.hpte_updatepp */
 	mr	r5,r29			/* vpn */
-	li	r6,MMU_PAGE_4K		/* page size */
-	ld	r7,STK_PARAM(R9)(r1)	/* segment size */
-	ld	r8,STK_PARAM(R8)(r1)	/* get "local" param */
+	li	r6,MMU_PAGE_4K		/* base page size */
+	li	r7,MMU_PAGE_4K		/* actual page size */
+	ld	r8,STK_PARAM(R9)(r1)	/* segment size */
+	ld	r9,STK_PARAM(R8)(r1)	/* get "local" param */
 _GLOBAL(htab_call_hpte_updatepp)
 	bl	.			/* Patched by htab_finish_init() */
 
@@ -649,9 +650,10 @@ htab_modify_pte:
 
 	/* Call ppc_md.hpte_updatepp */
 	mr	r5,r29			/* vpn */
-	li	r6,MMU_PAGE_4K		/* page size */
-	ld	r7,STK_PARAM(R9)(r1)	/* segment size */
-	ld	r8,STK_PARAM(R8)(r1)	/* get "local" param */
+	li	r6,MMU_PAGE_4K		/* base page size */
+	li	r7,MMU_PAGE_4K		/* actual page size */
+	ld	r8,STK_PARAM(R9)(r1)	/* segment size */
+	ld	r9,STK_PARAM(R8)(r1)	/* get "local" param */
 _GLOBAL(htab_call_hpte_updatepp)
 	bl	.			/* patched by htab_finish_init() */
 
@@ -937,9 +939,10 @@ ht64_modify_pte:
 
 	/* Call ppc_md.hpte_updatepp */
 	mr	r5,r29			/* vpn */
-	li	r6,MMU_PAGE_64K
-	ld	r7,STK_PARAM(R9)(r1)	/* segment size */
-	ld	r8,STK_PARAM(R8)(r1)	/* get "local" param */
+	li	r6,MMU_PAGE_64K		/* base page size */
+	li	r7,MMU_PAGE_64K		/* actual page size */
+	ld	r8,STK_PARAM(R9)(r1)	/* segment size */
+	ld	r9,STK_PARAM(R8)(r1)	/* get "local" param */
 _GLOBAL(ht64_call_hpte_updatepp)
 	bl	.			/* patched by htab_finish_init() */
 
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 4c122c3..6d152bc 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -273,61 +273,15 @@ static long native_hpte_remove(unsigned long hpte_group)
 	return i;
 }
 
-static inline int __hpte_actual_psize(unsigned int lp, int psize)
-{
-	int i, shift;
-	unsigned int mask;
-
-	/* start from 1 ignoring MMU_PAGE_4K */
-	for (i = 1; i < MMU_PAGE_COUNT; i++) {
-
-		/* invalid penc */
-		if (mmu_psize_defs[psize].penc[i] == -1)
-			continue;
-		/*
-		 * encoding bits per actual page size
-		 *        PTE LP     actual page size
-		 *    rrrr rrrz		>=8KB
-		 *    rrrr rrzz		>=16KB
-		 *    rrrr rzzz		>=32KB
-		 *    rrrr zzzz		>=64KB
-		 * .......
-		 */
-		shift = mmu_psize_defs[i].shift - LP_SHIFT;
-		if (shift > LP_BITS)
-			shift = LP_BITS;
-		mask = (1 << shift) - 1;
-		if ((lp & mask) == mmu_psize_defs[psize].penc[i])
-			return i;
-	}
-	return -1;
-}
-
-static inline int hpte_actual_psize(struct hash_pte *hptep, int psize)
-{
-	/* Look at the 8 bit LP value */
-	unsigned int lp = (hptep->r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
-
-	if (!(hptep->v & HPTE_V_VALID))
-		return -1;
-
-	/* First check if it is large page */
-	if (!(hptep->v & HPTE_V_LARGE))
-		return MMU_PAGE_4K;
-
-	return __hpte_actual_psize(lp, psize);
-}
-
 static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
-				 unsigned long vpn, int psize, int ssize,
-				 int local)
+				 unsigned long vpn, int bpsize,
+				 int apsize, int ssize, int local)
 {
 	struct hash_pte *hptep = htab_address + slot;
 	unsigned long hpte_v, want_v;
 	int ret = 0;
-	int actual_psize;
 
-	want_v = hpte_encode_avpn(vpn, psize, ssize);
+	want_v = hpte_encode_avpn(vpn, bpsize, ssize);
 
 	DBG_LOW("    update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)",
 		vpn, want_v & HPTE_V_AVPN, slot, newpp);
@@ -335,7 +289,6 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
 	native_lock_hpte(hptep);
 
 	hpte_v = hptep->v;
-	actual_psize = hpte_actual_psize(hptep, psize);
 	/*
 	 * We need to invalidate the TLB always because hpte_remove doesn't do
 	 * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
@@ -343,12 +296,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
 	 * (hpte_remove) because we assume the old translation is still
 	 * technically "valid".
 	 */
-	if (actual_psize < 0) {
-		actual_psize = psize;
-		ret = -1;
-		goto err_out;
-	}
-	if (!HPTE_V_COMPARE(hpte_v, want_v)) {
+	if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) {
 		DBG_LOW(" -> miss\n");
 		ret = -1;
 	} else {
@@ -357,11 +305,10 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
 		hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
 			(newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C));
 	}
-err_out:
 	native_unlock_hpte(hptep);
 
 	/* Ensure it is out of the tlb too. */
-	tlbie(vpn, psize, actual_psize, ssize, local);
+	tlbie(vpn, bpsize, apsize, ssize, local);
 
 	return ret;
 }
@@ -402,7 +349,6 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize)
 static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
 				       int psize, int ssize)
 {
-	int actual_psize;
 	unsigned long vpn;
 	unsigned long vsid;
 	long slot;
@@ -415,36 +361,33 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
 	if (slot == -1)
 		panic("could not find page to bolt\n");
 	hptep = htab_address + slot;
-	actual_psize = hpte_actual_psize(hptep, psize);
-	if (actual_psize < 0)
-		actual_psize = psize;
 
 	/* Update the HPTE */
 	hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
 		(newpp & (HPTE_R_PP | HPTE_R_N));
-
-	/* Ensure it is out of the tlb too. */
-	tlbie(vpn, psize, actual_psize, ssize, 0);
+	/*
+	 * Ensure it is out of the tlb too. Bolted entries base and
+	 * actual page size will be same.
+	 */
+	tlbie(vpn, psize, psize, ssize, 0);
 }
 
 static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
-				   int psize, int ssize, int local)
+				   int bpsize, int apsize, int ssize, int local)
 {
 	struct hash_pte *hptep = htab_address + slot;
 	unsigned long hpte_v;
 	unsigned long want_v;
 	unsigned long flags;
-	int actual_psize;
 
 	local_irq_save(flags);
 
 	DBG_LOW("    invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot);
 
-	want_v = hpte_encode_avpn(vpn, psize, ssize);
+	want_v = hpte_encode_avpn(vpn, bpsize, ssize);
 	native_lock_hpte(hptep);
 	hpte_v = hptep->v;
 
-	actual_psize = hpte_actual_psize(hptep, psize);
 	/*
 	 * We need to invalidate the TLB always because hpte_remove doesn't do
 	 * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
@@ -452,23 +395,48 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
 	 * (hpte_remove) because we assume the old translation is still
 	 * technically "valid".
 	 */
-	if (actual_psize < 0) {
-		actual_psize = psize;
-		native_unlock_hpte(hptep);
-		goto err_out;
-	}
-	if (!HPTE_V_COMPARE(hpte_v, want_v))
+	if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
 		native_unlock_hpte(hptep);
 	else
 		/* Invalidate the hpte. NOTE: this also unlocks it */
 		hptep->v = 0;
 
-err_out:
 	/* Invalidate the TLB */
-	tlbie(vpn, psize, actual_psize, ssize, local);
+	tlbie(vpn, bpsize, apsize, ssize, local);
+
 	local_irq_restore(flags);
 }
 
+static inline int __hpte_actual_psize(unsigned int lp, int psize)
+{
+	int i, shift;
+	unsigned int mask;
+
+	/* start from 1 ignoring MMU_PAGE_4K */
+	for (i = 1; i < MMU_PAGE_COUNT; i++) {
+
+		/* invalid penc */
+		if (mmu_psize_defs[psize].penc[i] == -1)
+			continue;
+		/*
+		 * encoding bits per actual page size
+		 *        PTE LP     actual page size
+		 *    rrrr rrrz		>=8KB
+		 *    rrrr rrzz		>=16KB
+		 *    rrrr rzzz		>=32KB
+		 *    rrrr zzzz		>=64KB
+		 * .......
+		 */
+		shift = mmu_psize_defs[i].shift - LP_SHIFT;
+		if (shift > LP_BITS)
+			shift = LP_BITS;
+		mask = (1 << shift) - 1;
+		if ((lp & mask) == mmu_psize_defs[psize].penc[i])
+			return i;
+	}
+	return -1;
+}
+
 static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
 			int *psize, int *apsize, int *ssize, unsigned long *vpn)
 {
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index e303a6d..2f47080 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1232,7 +1232,11 @@ void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 		slot += hidx & _PTEIDX_GROUP_IX;
 		DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx);
-		ppc_md.hpte_invalidate(slot, vpn, psize, ssize, local);
+		/*
+		 * We use same base page size and actual psize, because we don't
+		 * use these functions for hugepage
+		 */
+		ppc_md.hpte_invalidate(slot, vpn, psize, psize, ssize, local);
 	} pte_iterate_hashed_end();
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
@@ -1365,7 +1369,8 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
 		hash = ~hash;
 	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 	slot += hidx & _PTEIDX_GROUP_IX;
-	ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_kernel_ssize, 0);
+	ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_linear_psize,
+			       mmu_kernel_ssize, 0);
 }
 
 void kernel_map_pages(struct page *page, int numpages, int enable)
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index 0f1d94a..0b7fb67 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -81,7 +81,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 		slot += (old_pte & _PAGE_F_GIX) >> 12;
 
 		if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize,
-					 ssize, local) == -1)
+					 mmu_psize, ssize, local) == -1)
 			old_pte &= ~_PAGE_HPTEFLAGS;
 	}
 
diff --git a/arch/powerpc/platforms/cell/beat_htab.c b/arch/powerpc/platforms/cell/beat_htab.c
index 246e1d8..c34ee4e 100644
--- a/arch/powerpc/platforms/cell/beat_htab.c
+++ b/arch/powerpc/platforms/cell/beat_htab.c
@@ -185,7 +185,8 @@ static void beat_lpar_hptab_clear(void)
 static long beat_lpar_hpte_updatepp(unsigned long slot,
 				    unsigned long newpp,
 				    unsigned long vpn,
-				    int psize, int ssize, int local)
+				    int psize, int apsize,
+				    int ssize, int local)
 {
 	unsigned long lpar_rc;
 	u64 dummy0, dummy1;
@@ -274,7 +275,8 @@ static void beat_lpar_hpte_updateboltedpp(unsigned long newpp,
 }
 
 static void beat_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
-					 int psize, int ssize, int local)
+				      int psize, int apsize,
+				      int ssize, int local)
 {
 	unsigned long want_v;
 	unsigned long lpar_rc;
@@ -364,9 +366,10 @@ static long beat_lpar_hpte_insert_v3(unsigned long hpte_group,
  * already zero.  For now I am paranoid.
  */
 static long beat_lpar_hpte_updatepp_v3(unsigned long slot,
-				    unsigned long newpp,
-				    unsigned long vpn,
-				    int psize, int ssize, int local)
+				       unsigned long newpp,
+				       unsigned long vpn,
+				       int psize, int apsize,
+				       int ssize, int local)
 {
 	unsigned long lpar_rc;
 	unsigned long want_v;
@@ -394,7 +397,8 @@ static long beat_lpar_hpte_updatepp_v3(unsigned long slot,
 }
 
 static void beat_lpar_hpte_invalidate_v3(unsigned long slot, unsigned long vpn,
-					 int psize, int ssize, int local)
+					 int psize, int apsize,
+					 int ssize, int local)
 {
 	unsigned long want_v;
 	unsigned long lpar_rc;
diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c
index 177a2f7..3e270e3 100644
--- a/arch/powerpc/platforms/ps3/htab.c
+++ b/arch/powerpc/platforms/ps3/htab.c
@@ -109,7 +109,8 @@ static long ps3_hpte_remove(unsigned long hpte_group)
 }
 
 static long ps3_hpte_updatepp(unsigned long slot, unsigned long newpp,
-	unsigned long vpn, int psize, int ssize, int local)
+			      unsigned long vpn, int psize, int apsize,
+			      int ssize, int local)
 {
 	int result;
 	u64 hpte_v, want_v, hpte_rs;
@@ -162,7 +163,7 @@ static void ps3_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
 }
 
 static void ps3_hpte_invalidate(unsigned long slot, unsigned long vpn,
-	int psize, int ssize, int local)
+				int psize, int apsize, int ssize, int local)
 {
 	unsigned long flags;
 	int result;
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 6d62072..ca45c8f 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -240,7 +240,8 @@ static void pSeries_lpar_hptab_clear(void)
 static long pSeries_lpar_hpte_updatepp(unsigned long slot,
 				       unsigned long newpp,
 				       unsigned long vpn,
-				       int psize, int ssize, int local)
+				       int psize, int apsize,
+				       int ssize, int local)
 {
 	unsigned long lpar_rc;
 	unsigned long flags = (newpp & 7) | H_AVPN;
@@ -328,7 +329,8 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
 }
 
 static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
-					 int psize, int ssize, int local)
+					 int psize, int apsize,
+					 int ssize, int local)
 {
 	unsigned long want_v;
 	unsigned long lpar_rc;
@@ -356,8 +358,10 @@ static void pSeries_lpar_hpte_removebolted(unsigned long ea,
 
 	slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
 	BUG_ON(slot == -1);
-
-	pSeries_lpar_hpte_invalidate(slot, vpn, psize, ssize, 0);
+	/*
+	 * lpar doesn't use the passed actual page size
+	 */
+	pSeries_lpar_hpte_invalidate(slot, vpn, psize, 0, ssize, 0);
 }
 
 /* Flag bits for H_BULK_REMOVE */
@@ -400,8 +404,11 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
 			slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 			slot += hidx & _PTEIDX_GROUP_IX;
 			if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
+				/*
+				 * lpar doesn't use the passed actual page size
+				 */
 				pSeries_lpar_hpte_invalidate(slot, vpn, psize,
-							     ssize, local);
+							     0, ssize, local);
 			} else {
 				param[pix] = HBR_REQUEST | HBR_AVPN | slot;
 				param[pix+1] = hpte_encode_avpn(vpn, psize,
-- 
cgit v0.10.2


From f940f5289873af2ad2c4e73f88c24ad2b8fe3f87 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 14:30:14 +0530
Subject: powerpc/THP: Double the PMD table size for THP

THP code does PTE page allocation along with large page request and deposit them
for later use. This is to ensure that we won't have any failures when we split
hugepages to regular pages.

On powerpc we want to use the deposited PTE page for storing hash pte slot and
secondary bit information for the HPTEs. We use the second half
of the pmd table to save the deposted PTE page.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h
index b66ae72..f65e27b 100644
--- a/arch/powerpc/include/asm/pgalloc-64.h
+++ b/arch/powerpc/include/asm/pgalloc-64.h
@@ -221,17 +221,17 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
 
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	return kmem_cache_alloc(PGT_CACHE(PMD_INDEX_SIZE),
+	return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX),
 				GFP_KERNEL|__GFP_REPEAT);
 }
 
 static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
-	kmem_cache_free(PGT_CACHE(PMD_INDEX_SIZE), pmd);
+	kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd);
 }
 
 #define __pmd_free_tlb(tlb, pmd, addr)		      \
-	pgtable_free_tlb(tlb, pmd, PMD_INDEX_SIZE)
+	pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX)
 #ifndef CONFIG_PPC_64K_PAGES
 #define __pud_free_tlb(tlb, pud, addr)		      \
 	pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE)
diff --git a/arch/powerpc/include/asm/pgtable-ppc64-64k.h b/arch/powerpc/include/asm/pgtable-ppc64-64k.h
index 45142d6..a56b82f 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64-64k.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64-64k.h
@@ -33,7 +33,8 @@
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 
 /* Bits to mask out from a PMD to get to the PTE page */
-#define PMD_MASKED_BITS		0x1ff
+/* PMDs point to PTE table fragments which are 4K aligned.  */
+#define PMD_MASKED_BITS		0xfff
 /* Bits to mask out from a PGD/PUD to get to the PMD page */
 #define PUD_MASKED_BITS		0x1ff
 
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index e3d55f6f..ab84332 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -20,7 +20,11 @@
                 	    PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT)
 #define PGTABLE_RANGE (ASM_CONST(1) << PGTABLE_EADDR_SIZE)
 
-
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define PMD_CACHE_INDEX	(PMD_INDEX_SIZE + 1)
+#else
+#define PMD_CACHE_INDEX	PMD_INDEX_SIZE
+#endif
 /*
  * Define the address range of the kernel non-linear virtual area
  */
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index a90b9c4..d0cd9e4 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -88,7 +88,11 @@ static void pgd_ctor(void *addr)
 
 static void pmd_ctor(void *addr)
 {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	memset(addr, 0, PMD_TABLE_SIZE * 2);
+#else
 	memset(addr, 0, PMD_TABLE_SIZE);
+#endif
 }
 
 struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
@@ -137,10 +141,9 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
 void pgtable_cache_init(void)
 {
 	pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
-	pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
-	if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE))
+	pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
+	if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX))
 		panic("Couldn't allocate pgtable caches");
-
 	/* In all current configs, when the PUD index exists it's the
 	 * same size as either the pgd or pmd index.  Verify that the
 	 * initialization above has also created a PUD cache.  This
-- 
cgit v0.10.2


From 074c2eae3e9b66c03a17a12df8f2cd19382b68ab Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 14:30:15 +0530
Subject: powerpc/THP: Implement transparent hugepages for ppc64

We now have pmd entries covering 16MB range and the PMD table double its original size.
We use the second half of the PMD table to deposit the pgtable (PTE page).
The depoisted PTE page is further used to track the HPTE information. The information
include [ secondary group | 3 bit hidx | valid ]. We use one byte per each HPTE entry.
With 16MB hugepage and 64K HPTE we need 256 entries and with 4K HPTE we need
4096 entries. Both will fit in a 4K PTE page. On hugepage invalidate we need to walk
the PTE page and invalidate all valid HPTEs.

This patch implements necessary arch specific functions for THP support and also
hugepage invalidate logic. These PMD related functions are intentionally kept
similar to their PTE counter-part.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index ab84332..8f9da5e 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -10,6 +10,7 @@
 #else
 #include <asm/pgtable-ppc64-4k.h>
 #endif
+#include <asm/barrier.h>
 
 #define FIRST_USER_ADDRESS	0
 
@@ -154,7 +155,7 @@
 #define	pmd_present(pmd)	(pmd_val(pmd) != 0)
 #define	pmd_clear(pmdp)		(pmd_val(*(pmdp)) = 0)
 #define pmd_page_vaddr(pmd)	(pmd_val(pmd) & ~PMD_MASKED_BITS)
-#define pmd_page(pmd)		virt_to_page(pmd_page_vaddr(pmd))
+extern struct page *pmd_page(pmd_t pmd);
 
 #define pud_set(pudp, pudval)	(pud_val(*(pudp)) = (pudval))
 #define pud_none(pud)		(!pud_val(pud))
@@ -382,4 +383,216 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 
 #endif /* __ASSEMBLY__ */
 
+/*
+ * THP pages can't be special. So use the _PAGE_SPECIAL
+ */
+#define _PAGE_SPLITTING _PAGE_SPECIAL
+
+/*
+ * We need to differentiate between explicit huge page and THP huge
+ * page, since THP huge page also need to track real subpage details
+ */
+#define _PAGE_THP_HUGE  _PAGE_4K_PFN
+
+/*
+ * set of bits not changed in pmd_modify.
+ */
+#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS |		\
+			 _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
+			 _PAGE_THP_HUGE)
+
+#ifndef __ASSEMBLY__
+/*
+ * The linux hugepage PMD now include the pmd entries followed by the address
+ * to the stashed pgtable_t. The stashed pgtable_t contains the hpte bits.
+ * [ 1 bit secondary | 3 bit hidx | 1 bit valid | 000]. We use one byte per
+ * each HPTE entry. With 16MB hugepage and 64K HPTE we need 256 entries and
+ * with 4K HPTE we need 4096 entries. Both will fit in a 4K pgtable_t.
+ *
+ * The last three bits are intentionally left to zero. This memory location
+ * are also used as normal page PTE pointers. So if we have any pointers
+ * left around while we collapse a hugepage, we need to make sure
+ * _PAGE_PRESENT and _PAGE_FILE bits of that are zero when we look at them
+ */
+static inline unsigned int hpte_valid(unsigned char *hpte_slot_array, int index)
+{
+	return (hpte_slot_array[index] >> 3) & 0x1;
+}
+
+static inline unsigned int hpte_hash_index(unsigned char *hpte_slot_array,
+					   int index)
+{
+	return hpte_slot_array[index] >> 4;
+}
+
+static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array,
+					unsigned int index, unsigned int hidx)
+{
+	hpte_slot_array[index] = hidx << 4 | 0x1 << 3;
+}
+
+static inline char *get_hpte_slot_array(pmd_t *pmdp)
+{
+	/*
+	 * The hpte hindex is stored in the pgtable whose address is in the
+	 * second half of the PMD
+	 *
+	 * Order this load with the test for pmd_trans_huge in the caller
+	 */
+	smp_rmb();
+	return *(char **)(pmdp + PTRS_PER_PMD);
+
+
+}
+
+extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+				   pmd_t *pmdp);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
+extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
+extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
+extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+		       pmd_t *pmdp, pmd_t pmd);
+extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
+				 pmd_t *pmd);
+
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+	/*
+	 * leaf pte for huge page, bottom two bits != 00
+	 */
+	return (pmd_val(pmd) & 0x3) && (pmd_val(pmd) & _PAGE_THP_HUGE);
+}
+
+static inline int pmd_large(pmd_t pmd)
+{
+	/*
+	 * leaf pte for huge page, bottom two bits != 00
+	 */
+	if (pmd_trans_huge(pmd))
+		return pmd_val(pmd) & _PAGE_PRESENT;
+	return 0;
+}
+
+static inline int pmd_trans_splitting(pmd_t pmd)
+{
+	if (pmd_trans_huge(pmd))
+		return pmd_val(pmd) & _PAGE_SPLITTING;
+	return 0;
+}
+
+/* We will enable it in the last patch */
+#define has_transparent_hugepage() 0
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static inline pte_t pmd_pte(pmd_t pmd)
+{
+	return __pte(pmd_val(pmd));
+}
+
+static inline pmd_t pte_pmd(pte_t pte)
+{
+	return __pmd(pte_val(pte));
+}
+
+static inline pte_t *pmdp_ptep(pmd_t *pmd)
+{
+	return (pte_t *)pmd;
+}
+
+#define pmd_pfn(pmd)		pte_pfn(pmd_pte(pmd))
+#define pmd_young(pmd)		pte_young(pmd_pte(pmd))
+#define pmd_mkold(pmd)		pte_pmd(pte_mkold(pmd_pte(pmd)))
+#define pmd_wrprotect(pmd)	pte_pmd(pte_wrprotect(pmd_pte(pmd)))
+#define pmd_mkdirty(pmd)	pte_pmd(pte_mkdirty(pmd_pte(pmd)))
+#define pmd_mkyoung(pmd)	pte_pmd(pte_mkyoung(pmd_pte(pmd)))
+#define pmd_mkwrite(pmd)	pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+
+#define __HAVE_ARCH_PMD_WRITE
+#define pmd_write(pmd)		pte_write(pmd_pte(pmd))
+
+static inline pmd_t pmd_mkhuge(pmd_t pmd)
+{
+	/* Do nothing, mk_pmd() does this part.  */
+	return pmd;
+}
+
+static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+{
+	pmd_val(pmd) &= ~_PAGE_PRESENT;
+	return pmd;
+}
+
+static inline pmd_t pmd_mksplitting(pmd_t pmd)
+{
+	pmd_val(pmd) |= _PAGE_SPLITTING;
+	return pmd;
+}
+
+#define __HAVE_ARCH_PMD_SAME
+static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
+{
+	return (((pmd_val(pmd_a) ^ pmd_val(pmd_b)) & ~_PAGE_HPTEFLAGS) == 0);
+}
+
+#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+extern int pmdp_set_access_flags(struct vm_area_struct *vma,
+				 unsigned long address, pmd_t *pmdp,
+				 pmd_t entry, int dirty);
+
+extern unsigned long pmd_hugepage_update(struct mm_struct *mm,
+					 unsigned long addr,
+					 pmd_t *pmdp, unsigned long clr);
+
+static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
+					      unsigned long addr, pmd_t *pmdp)
+{
+	unsigned long old;
+
+	if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
+		return 0;
+	old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED);
+	return ((old & _PAGE_ACCESSED) != 0);
+}
+
+#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+				     unsigned long address, pmd_t *pmdp);
+#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
+				  unsigned long address, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
+extern pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+				unsigned long addr, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMDP_CLEAR_FLUSH
+extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
+			      pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMDP_SET_WRPROTECT
+static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
+				      pmd_t *pmdp)
+{
+
+	if ((pmd_val(*pmdp) & _PAGE_RW) == 0)
+		return;
+
+	pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW);
+}
+
+#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+extern void pmdp_splitting_flush(struct vm_area_struct *vma,
+				 unsigned long address, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PGTABLE_DEPOSIT
+extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+				       pgtable_t pgtable);
+#define __HAVE_ARCH_PGTABLE_WITHDRAW
+extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMDP_INVALIDATE
+extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+			    pmd_t *pmdp);
+#endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 7aeb955..d53db93 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -220,6 +220,10 @@ extern int gup_hugepd(hugepd_t *hugepd, unsigned pdshift, unsigned long addr,
 
 extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 		       unsigned long end, int write, struct page **pages, int *nr);
+#ifndef CONFIG_TRANSPARENT_HUGEPAGE
+#define pmd_large(pmd)		0
+#define has_transparent_hugepage() 0
+#endif
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h
index 61a5927..2def01ed 100644
--- a/arch/powerpc/include/asm/tlbflush.h
+++ b/arch/powerpc/include/asm/tlbflush.h
@@ -165,7 +165,8 @@ static inline void flush_tlb_kernel_range(unsigned long start,
 /* Private function for use by PCI IO mapping code */
 extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
 				     unsigned long end);
-
+extern void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd,
+				unsigned long addr);
 #else
 #error Unsupported MMU type
 #endif
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index a854096..e4d3e9f 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -338,6 +338,19 @@ EXPORT_SYMBOL(iounmap);
 EXPORT_SYMBOL(__iounmap);
 EXPORT_SYMBOL(__iounmap_at);
 
+/*
+ * For hugepage we have pfn in the pmd, we use PTE_RPN_SHIFT bits for flags
+ * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
+ */
+struct page *pmd_page(pmd_t pmd)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (pmd_trans_huge(pmd))
+		return pfn_to_page(pmd_pfn(pmd));
+#endif
+	return virt_to_page(pmd_page_vaddr(pmd));
+}
+
 #ifdef CONFIG_PPC_64K_PAGES
 static pte_t *get_from_cache(struct mm_struct *mm)
 {
@@ -455,3 +468,367 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
 }
 #endif
 #endif /* CONFIG_PPC_64K_PAGES */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+/*
+ * This is called when relaxing access to a hugepage. It's also called in the page
+ * fault path when we don't hit any of the major fault cases, ie, a minor
+ * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
+ * handled those two for us, we additionally deal with missing execute
+ * permission here on some processors
+ */
+int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+			  pmd_t *pmdp, pmd_t entry, int dirty)
+{
+	int changed;
+#ifdef CONFIG_DEBUG_VM
+	WARN_ON(!pmd_trans_huge(*pmdp));
+	assert_spin_locked(&vma->vm_mm->page_table_lock);
+#endif
+	changed = !pmd_same(*(pmdp), entry);
+	if (changed) {
+		__ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
+		/*
+		 * Since we are not supporting SW TLB systems, we don't
+		 * have any thing similar to flush_tlb_page_nohash()
+		 */
+	}
+	return changed;
+}
+
+unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+				  pmd_t *pmdp, unsigned long clr)
+{
+
+	unsigned long old, tmp;
+
+#ifdef CONFIG_DEBUG_VM
+	WARN_ON(!pmd_trans_huge(*pmdp));
+	assert_spin_locked(&mm->page_table_lock);
+#endif
+
+#ifdef PTE_ATOMIC_UPDATES
+	__asm__ __volatile__(
+	"1:	ldarx	%0,0,%3\n\
+		andi.	%1,%0,%6\n\
+		bne-	1b \n\
+		andc	%1,%0,%4 \n\
+		stdcx.	%1,0,%3 \n\
+		bne-	1b"
+	: "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
+	: "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY)
+	: "cc" );
+#else
+	old = pmd_val(*pmdp);
+	*pmdp = __pmd(old & ~clr);
+#endif
+	if (old & _PAGE_HASHPTE)
+		hpte_do_hugepage_flush(mm, addr, pmdp);
+	return old;
+}
+
+pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
+		       pmd_t *pmdp)
+{
+	pmd_t pmd;
+
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+	if (pmd_trans_huge(*pmdp)) {
+		pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+	} else {
+		/*
+		 * khugepaged calls this for normal pmd
+		 */
+		pmd = *pmdp;
+		pmd_clear(pmdp);
+		/*
+		 * Wait for all pending hash_page to finish. This is needed
+		 * in case of subpage collapse. When we collapse normal pages
+		 * to hugepage, we first clear the pmd, then invalidate all
+		 * the PTE entries. The assumption here is that any low level
+		 * page fault will see a none pmd and take the slow path that
+		 * will wait on mmap_sem. But we could very well be in a
+		 * hash_page with local ptep pointer value. Such a hash page
+		 * can result in adding new HPTE entries for normal subpages.
+		 * That means we could be modifying the page content as we
+		 * copy them to a huge page. So wait for parallel hash_page
+		 * to finish before invalidating HPTE entries. We can do this
+		 * by sending an IPI to all the cpus and executing a dummy
+		 * function there.
+		 */
+		kick_all_cpus_sync();
+		/*
+		 * Now invalidate the hpte entries in the range
+		 * covered by pmd. This make sure we take a
+		 * fault and will find the pmd as none, which will
+		 * result in a major fault which takes mmap_sem and
+		 * hence wait for collapse to complete. Without this
+		 * the __collapse_huge_page_copy can result in copying
+		 * the old content.
+		 */
+		flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
+	}
+	return pmd;
+}
+
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+			      unsigned long address, pmd_t *pmdp)
+{
+	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+
+/*
+ * We currently remove entries from the hashtable regardless of whether
+ * the entry was young or dirty. The generic routines only flush if the
+ * entry was young or dirty which is not good enough.
+ *
+ * We should be more intelligent about this but for the moment we override
+ * these functions and force a tlb flush unconditionally
+ */
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+				  unsigned long address, pmd_t *pmdp)
+{
+	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+
+/*
+ * We mark the pmd splitting and invalidate all the hpte
+ * entries for this hugepage.
+ */
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+			  unsigned long address, pmd_t *pmdp)
+{
+	unsigned long old, tmp;
+
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+#ifdef CONFIG_DEBUG_VM
+	WARN_ON(!pmd_trans_huge(*pmdp));
+	assert_spin_locked(&vma->vm_mm->page_table_lock);
+#endif
+
+#ifdef PTE_ATOMIC_UPDATES
+
+	__asm__ __volatile__(
+	"1:	ldarx	%0,0,%3\n\
+		andi.	%1,%0,%6\n\
+		bne-	1b \n\
+		ori	%1,%0,%4 \n\
+		stdcx.	%1,0,%3 \n\
+		bne-	1b"
+	: "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
+	: "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
+	: "cc" );
+#else
+	old = pmd_val(*pmdp);
+	*pmdp = __pmd(old | _PAGE_SPLITTING);
+#endif
+	/*
+	 * If we didn't had the splitting flag set, go and flush the
+	 * HPTE entries.
+	 */
+	if (!(old & _PAGE_SPLITTING)) {
+		/* We need to flush the hpte */
+		if (old & _PAGE_HASHPTE)
+			hpte_do_hugepage_flush(vma->vm_mm, address, pmdp);
+	}
+}
+
+/*
+ * We want to put the pgtable in pmd and use pgtable for tracking
+ * the base page size hptes
+ */
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+				pgtable_t pgtable)
+{
+	pgtable_t *pgtable_slot;
+	assert_spin_locked(&mm->page_table_lock);
+	/*
+	 * we store the pgtable in the second half of PMD
+	 */
+	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+	*pgtable_slot = pgtable;
+	/*
+	 * expose the deposited pgtable to other cpus.
+	 * before we set the hugepage PTE at pmd level
+	 * hash fault code looks at the deposted pgtable
+	 * to store hash index values.
+	 */
+	smp_wmb();
+}
+
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+	pgtable_t pgtable;
+	pgtable_t *pgtable_slot;
+
+	assert_spin_locked(&mm->page_table_lock);
+	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+	pgtable = *pgtable_slot;
+	/*
+	 * Once we withdraw, mark the entry NULL.
+	 */
+	*pgtable_slot = NULL;
+	/*
+	 * We store HPTE information in the deposited PTE fragment.
+	 * zero out the content on withdraw.
+	 */
+	memset(pgtable, 0, PTE_FRAG_SIZE);
+	return pgtable;
+}
+
+/*
+ * set a new huge pmd. We should not be called for updating
+ * an existing pmd entry. That should go via pmd_hugepage_update.
+ */
+void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+		pmd_t *pmdp, pmd_t pmd)
+{
+#ifdef CONFIG_DEBUG_VM
+	WARN_ON(!pmd_none(*pmdp));
+	assert_spin_locked(&mm->page_table_lock);
+	WARN_ON(!pmd_trans_huge(pmd));
+#endif
+	return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
+}
+
+void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+		     pmd_t *pmdp)
+{
+	pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT);
+}
+
+/*
+ * A linux hugepage PMD was changed and the corresponding hash table entries
+ * neesd to be flushed.
+ */
+void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+			    pmd_t *pmdp)
+{
+	int ssize, i;
+	unsigned long s_addr;
+	unsigned int psize, valid;
+	unsigned char *hpte_slot_array;
+	unsigned long hidx, vpn, vsid, hash, shift, slot;
+
+	/*
+	 * Flush all the hptes mapping this hugepage
+	 */
+	s_addr = addr & HPAGE_PMD_MASK;
+	hpte_slot_array = get_hpte_slot_array(pmdp);
+	/*
+	 * IF we try to do a HUGE PTE update after a withdraw is done.
+	 * we will find the below NULL. This happens when we do
+	 * split_huge_page_pmd
+	 */
+	if (!hpte_slot_array)
+		return;
+
+	/* get the base page size */
+	psize = get_slice_psize(mm, s_addr);
+	shift = mmu_psize_defs[psize].shift;
+
+	for (i = 0; i < (HPAGE_PMD_SIZE >> shift); i++) {
+		/*
+		 * 8 bits per each hpte entries
+		 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
+		 */
+		valid = hpte_valid(hpte_slot_array, i);
+		if (!valid)
+			continue;
+		hidx =  hpte_hash_index(hpte_slot_array, i);
+
+		/* get the vpn */
+		addr = s_addr + (i * (1ul << shift));
+		if (!is_kernel_addr(addr)) {
+			ssize = user_segment_size(addr);
+			vsid = get_vsid(mm->context.id, addr, ssize);
+			WARN_ON(vsid == 0);
+		} else {
+			vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+			ssize = mmu_kernel_ssize;
+		}
+
+		vpn = hpt_vpn(addr, vsid, ssize);
+		hash = hpt_hash(vpn, shift, ssize);
+		if (hidx & _PTEIDX_SECONDARY)
+			hash = ~hash;
+
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += hidx & _PTEIDX_GROUP_IX;
+		ppc_md.hpte_invalidate(slot, vpn, psize,
+				       MMU_PAGE_16M, ssize, 0);
+	}
+}
+
+static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
+{
+	pmd_val(pmd) |= pgprot_val(pgprot);
+	return pmd;
+}
+
+pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
+{
+	pmd_t pmd;
+	/*
+	 * For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always
+	 * set. We use this to check THP page at pmd level.
+	 * leaf pte for huge page, bottom two bits != 00
+	 */
+	pmd_val(pmd) = pfn << PTE_RPN_SHIFT;
+	pmd_val(pmd) |= _PAGE_THP_HUGE;
+	pmd = pmd_set_protbits(pmd, pgprot);
+	return pmd;
+}
+
+pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
+{
+	return pfn_pmd(page_to_pfn(page), pgprot);
+}
+
+pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+
+	pmd_val(pmd) &= _HPAGE_CHG_MASK;
+	pmd = pmd_set_protbits(pmd, newprot);
+	return pmd;
+}
+
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a HUGE PMD entry in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux HUGE PMD entry.
+ */
+void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
+			  pmd_t *pmd)
+{
+	return;
+}
+
+pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+			 unsigned long addr, pmd_t *pmdp)
+{
+	pmd_t old_pmd;
+	pgtable_t pgtable;
+	unsigned long old;
+	pgtable_t *pgtable_slot;
+
+	old = pmd_hugepage_update(mm, addr, pmdp, ~0UL);
+	old_pmd = __pmd(old);
+	/*
+	 * We have pmd == none and we are holding page_table_lock.
+	 * So we can safely go and clear the pgtable hash
+	 * index info.
+	 */
+	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+	pgtable = *pgtable_slot;
+	/*
+	 * Let's zero out old valid and hash index details
+	 * hash fault look at them.
+	 */
+	memset(pgtable, 0, PTE_FRAG_SIZE);
+	return old_pmd;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 023ec8a..48bf63e 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -219,3 +219,30 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
 	arch_leave_lazy_mmu_mode();
 	local_irq_restore(flags);
 }
+
+void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
+{
+	pte_t *pte;
+	pte_t *start_pte;
+	unsigned long flags;
+
+	addr = _ALIGN_DOWN(addr, PMD_SIZE);
+	/* Note: Normally, we should only ever use a batch within a
+	 * PTE locked section. This violates the rule, but will work
+	 * since we don't actually modify the PTEs, we just flush the
+	 * hash while leaving the PTEs intact (including their reference
+	 * to being hashed). This is not the most performance oriented
+	 * way to do things but is fine for our needs here.
+	 */
+	local_irq_save(flags);
+	arch_enter_lazy_mmu_mode();
+	start_pte = pte_offset_map(pmd, addr);
+	for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
+		unsigned long pteval = pte_val(*pte);
+		if (pteval & _PAGE_HASHPTE)
+			hpte_need_flush(mm, addr, pte, pteval, 0);
+		addr += PAGE_SIZE;
+	}
+	arch_leave_lazy_mmu_mode();
+	local_irq_restore(flags);
+}
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 54f3936..ae0aaea 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -71,6 +71,7 @@ config PPC_BOOK3S_64
 	select PPC_FPU
 	select PPC_HAVE_PMU_SUPPORT
 	select SYS_SUPPORTS_HUGETLBFS
+	select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES
 
 config PPC_BOOK3E_64
 	bool "Embedded processors"
-- 
cgit v0.10.2


From 29409997f8d06d693d82127d200eeaf48989fdd2 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 14:30:16 +0530
Subject: powerpc: move find_linux_pte_or_hugepte and gup_hugepte to common
 code

We will use this in the later patch for handling THP pages

Reviewed-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index f2498c8..d750336 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -191,8 +191,14 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma,
 				      unsigned long vmaddr)
 {
 }
-#endif /* CONFIG_HUGETLB_PAGE */
 
+#define hugepd_shift(x) 0
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
+				    unsigned pdshift)
+{
+	return 0;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
 
 /*
  * FSL Book3E platforms require special gpage handling - the gpages
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 8f9da5e..6c9323f 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -368,19 +368,6 @@ static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea)
 	return pt;
 }
 
-#ifdef CONFIG_HUGETLB_PAGE
-pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
-				 unsigned *shift);
-#else
-static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
-					       unsigned *shift)
-{
-	if (shift)
-		*shift = 0;
-	return find_linux_pte(pgdir, ea);
-}
-#endif /* !CONFIG_HUGETLB_PAGE */
-
 #endif /* __ASSEMBLY__ */
 
 /*
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index d53db93..959d575 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -224,6 +224,8 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 #define pmd_large(pmd)		0
 #define has_transparent_hugepage() 0
 #endif
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+				 unsigned *shift);
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 26f29a7..ff0379c 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -27,8 +27,8 @@ obj-$(CONFIG_44x)		+= 44x_mmu.o
 obj-$(CONFIG_PPC_FSL_BOOK3E)	+= fsl_booke_mmu.o
 obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
 obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
-ifeq ($(CONFIG_HUGETLB_PAGE),y)
 obj-y				+= hugetlbpage.o
+ifeq ($(CONFIG_HUGETLB_PAGE),y)
 obj-$(CONFIG_PPC_STD_MMU_64)	+= hugetlbpage-hash64.o
 obj-$(CONFIG_PPC_BOOK3E_MMU)	+= hugetlbpage-book3e.o
 endif
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 237c8e5..2865077 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -21,6 +21,9 @@
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
 #include <asm/setup.h>
+#include <asm/hugetlb.h>
+
+#ifdef CONFIG_HUGETLB_PAGE
 
 #define PAGE_SHIFT_64K	16
 #define PAGE_SHIFT_16M	24
@@ -100,66 +103,6 @@ int pgd_huge(pgd_t pgd)
 }
 #endif
 
-/*
- * We have 4 cases for pgds and pmds:
- * (1) invalid (all zeroes)
- * (2) pointer to next table, as normal; bottom 6 bits == 0
- * (3) leaf pte for huge page, bottom two bits != 00
- * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
- */
-pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
-{
-	pgd_t *pg;
-	pud_t *pu;
-	pmd_t *pm;
-	pte_t *ret_pte;
-	hugepd_t *hpdp = NULL;
-	unsigned pdshift = PGDIR_SHIFT;
-
-	if (shift)
-		*shift = 0;
-
-	pg = pgdir + pgd_index(ea);
-
-	if (pgd_huge(*pg)) {
-		ret_pte = (pte_t *) pg;
-		goto out;
-	} else if (is_hugepd(pg))
-		hpdp = (hugepd_t *)pg;
-	else if (!pgd_none(*pg)) {
-		pdshift = PUD_SHIFT;
-		pu = pud_offset(pg, ea);
-
-		if (pud_huge(*pu)) {
-			ret_pte = (pte_t *) pu;
-			goto out;
-		} else if (is_hugepd(pu))
-			hpdp = (hugepd_t *)pu;
-		else if (!pud_none(*pu)) {
-			pdshift = PMD_SHIFT;
-			pm = pmd_offset(pu, ea);
-
-			if (pmd_huge(*pm)) {
-				ret_pte = (pte_t *) pm;
-				goto out;
-			} else if (is_hugepd(pm))
-				hpdp = (hugepd_t *)pm;
-			else if (!pmd_none(*pm))
-				return pte_offset_kernel(pm, ea);
-		}
-	}
-	if (!hpdp)
-		return NULL;
-
-	ret_pte = hugepte_offset(hpdp, ea, pdshift);
-	pdshift = hugepd_shift(*hpdp);
-out:
-	if (shift)
-		*shift = pdshift;
-	return ret_pte;
-}
-EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
-
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
 	return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
@@ -753,69 +696,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
-int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
-		unsigned long end, int write, struct page **pages, int *nr)
-{
-	unsigned long mask;
-	unsigned long pte_end;
-	struct page *head, *page, *tail;
-	pte_t pte;
-	int refs;
-
-	pte_end = (addr + sz) & ~(sz-1);
-	if (pte_end < end)
-		end = pte_end;
-
-	pte = *ptep;
-	mask = _PAGE_PRESENT | _PAGE_USER;
-	if (write)
-		mask |= _PAGE_RW;
-
-	if ((pte_val(pte) & mask) != mask)
-		return 0;
-
-	/* hugepages are never "special" */
-	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
-	refs = 0;
-	head = pte_page(pte);
-
-	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
-	tail = page;
-	do {
-		VM_BUG_ON(compound_head(page) != head);
-		pages[*nr] = page;
-		(*nr)++;
-		page++;
-		refs++;
-	} while (addr += PAGE_SIZE, addr != end);
-
-	if (!page_cache_add_speculative(head, refs)) {
-		*nr -= refs;
-		return 0;
-	}
-
-	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
-		/* Could be optimized better */
-		*nr -= refs;
-		while (refs--)
-			put_page(head);
-		return 0;
-	}
-
-	/*
-	 * Any tail page need their mapcount reference taken before we
-	 * return.
-	 */
-	while (refs--) {
-		if (PageTail(tail))
-			get_huge_page_tail(tail);
-		tail++;
-	}
-
-	return 1;
-}
-
 static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
 				      unsigned long sz)
 {
@@ -1032,3 +912,128 @@ void flush_dcache_icache_hugepage(struct page *page)
 		}
 	}
 }
+
+#endif /* CONFIG_HUGETLB_PAGE */
+
+/*
+ * We have 4 cases for pgds and pmds:
+ * (1) invalid (all zeroes)
+ * (2) pointer to next table, as normal; bottom 6 bits == 0
+ * (3) leaf pte for huge page, bottom two bits != 00
+ * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
+ */
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
+{
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	pte_t *ret_pte;
+	hugepd_t *hpdp = NULL;
+	unsigned pdshift = PGDIR_SHIFT;
+
+	if (shift)
+		*shift = 0;
+
+	pg = pgdir + pgd_index(ea);
+
+	if (pgd_huge(*pg)) {
+		ret_pte = (pte_t *) pg;
+		goto out;
+	} else if (is_hugepd(pg))
+		hpdp = (hugepd_t *)pg;
+	else if (!pgd_none(*pg)) {
+		pdshift = PUD_SHIFT;
+		pu = pud_offset(pg, ea);
+
+		if (pud_huge(*pu)) {
+			ret_pte = (pte_t *) pu;
+			goto out;
+		} else if (is_hugepd(pu))
+			hpdp = (hugepd_t *)pu;
+		else if (!pud_none(*pu)) {
+			pdshift = PMD_SHIFT;
+			pm = pmd_offset(pu, ea);
+
+			if (pmd_huge(*pm)) {
+				ret_pte = (pte_t *) pm;
+				goto out;
+			} else if (is_hugepd(pm))
+				hpdp = (hugepd_t *)pm;
+			else if (!pmd_none(*pm))
+				return pte_offset_kernel(pm, ea);
+		}
+	}
+	if (!hpdp)
+		return NULL;
+
+	ret_pte = hugepte_offset(hpdp, ea, pdshift);
+	pdshift = hugepd_shift(*hpdp);
+out:
+	if (shift)
+		*shift = pdshift;
+	return ret_pte;
+}
+EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
+
+int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	unsigned long pte_end;
+	struct page *head, *page, *tail;
+	pte_t pte;
+	int refs;
+
+	pte_end = (addr + sz) & ~(sz-1);
+	if (pte_end < end)
+		end = pte_end;
+
+	pte = *ptep;
+	mask = _PAGE_PRESENT | _PAGE_USER;
+	if (write)
+		mask |= _PAGE_RW;
+
+	if ((pte_val(pte) & mask) != mask)
+		return 0;
+
+	/* hugepages are never "special" */
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	refs = 0;
+	head = pte_page(pte);
+
+	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
+	tail = page;
+	do {
+		VM_BUG_ON(compound_head(page) != head);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+
+	if (!page_cache_add_speculative(head, refs)) {
+		*nr -= refs;
+		return 0;
+	}
+
+	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+		/* Could be optimized better */
+		*nr -= refs;
+		while (refs--)
+			put_page(head);
+		return 0;
+	}
+
+	/*
+	 * Any tail page need their mapcount reference taken before we
+	 * return.
+	 */
+	while (refs--) {
+		if (PageTail(tail))
+			get_huge_page_tail(tail);
+		tail++;
+	}
+
+	return 1;
+}
-- 
cgit v0.10.2


From ac52ae4721233150a3c30e9732a1c1f4f68e7db7 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 14:30:17 +0530
Subject: powerpc: Update find_linux_pte_or_hugepte to handle transparent
 hugepages

Reviewed-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 2865077..4928204 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -936,30 +936,50 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
 
 	pg = pgdir + pgd_index(ea);
 
-	if (pgd_huge(*pg)) {
+	/*
+	 * we should first check for none. That takes care of a
+	 * a parallel hugetlb or THP pagefault moving none entries
+	 * to respective types.
+	 */
+	if (pgd_none(*pg))
+		return NULL;
+	else if (pgd_huge(*pg)) {
 		ret_pte = (pte_t *) pg;
 		goto out;
 	} else if (is_hugepd(pg))
 		hpdp = (hugepd_t *)pg;
-	else if (!pgd_none(*pg)) {
+	else {
 		pdshift = PUD_SHIFT;
 		pu = pud_offset(pg, ea);
 
-		if (pud_huge(*pu)) {
+		if (pud_none(*pu))
+			return NULL;
+		else if (pud_huge(*pu)) {
 			ret_pte = (pte_t *) pu;
 			goto out;
 		} else if (is_hugepd(pu))
 			hpdp = (hugepd_t *)pu;
-		else if (!pud_none(*pu)) {
+		else {
 			pdshift = PMD_SHIFT;
 			pm = pmd_offset(pu, ea);
+			/*
+			 * A hugepage collapse is captured by pmd_none, because
+			 * it mark the pmd none and do a hpte invalidate.
+			 *
+			 * A hugepage split is captured by pmd_trans_splitting
+			 * because we mark the pmd trans splitting and do a
+			 * hpte invalidate
+			 *
+			 */
+			if (pmd_none(*pm) || pmd_trans_splitting(*pm))
+				return NULL;
 
-			if (pmd_huge(*pm)) {
+			if (pmd_huge(*pm) || pmd_large(*pm)) {
 				ret_pte = (pte_t *) pm;
 				goto out;
 			} else if (is_hugepd(pm))
 				hpdp = (hugepd_t *)pm;
-			else if (!pmd_none(*pm))
+			else
 				return pte_offset_kernel(pm, ea);
 		}
 	}
-- 
cgit v0.10.2


From 12bc9f6fc1d6582b4529ac522d2231bd2584a5f1 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 14:30:18 +0530
Subject: powerpc: Replace find_linux_pte with find_linux_pte_or_hugepte

Replace find_linux_pte with find_linux_pte_or_hugepte and explicitly
document why we don't need to handle transparent hugepages at callsites.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 6c9323f..e71bd25 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -344,30 +344,6 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 
 void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
 void pgtable_cache_init(void);
-
-/*
- * find_linux_pte returns the address of a linux pte for a given
- * effective address and directory.  If not found, it returns zero.
- */
-static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea)
-{
-	pgd_t *pg;
-	pud_t *pu;
-	pmd_t *pm;
-	pte_t *pt = NULL;
-
-	pg = pgdir + pgd_index(ea);
-	if (!pgd_none(*pg)) {
-		pu = pud_offset(pg, ea);
-		if (!pud_none(*pu)) {
-			pm = pmd_offset(pu, ea);
-			if (pmd_present(*pm))
-				pt = pte_offset_kernel(pm, ea);
-		}
-	}
-	return pt;
-}
-
 #endif /* __ASSEMBLY__ */
 
 /*
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 7c567be..af2b9ae 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -260,10 +260,15 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
 {
 	pte_t *ptep;
 	unsigned long pa;
+	int hugepage_shift;
 
-	ptep = find_linux_pte(init_mm.pgd, token);
+	/*
+	 * We won't find hugepages here, iomem
+	 */
+	ptep = find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift);
 	if (!ptep)
 		return token;
+	WARN_ON(hugepage_shift);
 	pa = pte_pfn(*ptep) << PAGE_SHIFT;
 
 	return pa | (token & (PAGE_SIZE-1));
diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c
index 50e90b7..fa0b54b 100644
--- a/arch/powerpc/kernel/io-workarounds.c
+++ b/arch/powerpc/kernel/io-workarounds.c
@@ -55,6 +55,7 @@ static struct iowa_bus *iowa_pci_find(unsigned long vaddr, unsigned long paddr)
 
 struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
 {
+	unsigned hugepage_shift;
 	struct iowa_bus *bus;
 	int token;
 
@@ -70,11 +71,17 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
 		if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END)
 			return NULL;
 
-		ptep = find_linux_pte(init_mm.pgd, vaddr);
+		ptep = find_linux_pte_or_hugepte(init_mm.pgd, vaddr,
+						 &hugepage_shift);
 		if (ptep == NULL)
 			paddr = 0;
-		else
+		else {
+			/*
+			 * we don't have hugepages backing iomem
+			 */
+			WARN_ON(hugepage_shift);
 			paddr = pte_pfn(*ptep) << PAGE_SHIFT;
+		}
 		bus = iowa_pci_find(vaddr, paddr);
 
 		if (bus == NULL)
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 6dcbb49..dcf892d 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -27,7 +27,7 @@ static void *real_vmalloc_addr(void *x)
 	unsigned long addr = (unsigned long) x;
 	pte_t *p;
 
-	p = find_linux_pte(swapper_pg_dir, addr);
+	p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
 	if (!p || !pte_present(*p))
 		return NULL;
 	/* assume we don't have huge pages in vmalloc space... */
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 2f47080..e8434ca 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1145,6 +1145,7 @@ EXPORT_SYMBOL_GPL(hash_page);
 void hash_preload(struct mm_struct *mm, unsigned long ea,
 		  unsigned long access, unsigned long trap)
 {
+	int hugepage_shift;
 	unsigned long vsid;
 	pgd_t *pgdir;
 	pte_t *ptep;
@@ -1166,10 +1167,15 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 	pgdir = mm->pgd;
 	if (pgdir == NULL)
 		return;
-	ptep = find_linux_pte(pgdir, ea);
+	/*
+	 * THP pages use update_mmu_cache_pmd. We don't do
+	 * hash preload there. Hence can ignore THP here
+	 */
+	ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift);
 	if (!ptep)
 		return;
 
+	WARN_ON(hugepage_shift);
 #ifdef CONFIG_PPC_64K_PAGES
 	/* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on
 	 * a 64K kernel), then we don't preload, hash_page() will take
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 4928204..8add580 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -105,6 +105,7 @@ int pgd_huge(pgd_t pgd)
 
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
+	/* Only called for hugetlbfs pages, hence can ignore THP */
 	return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
 }
 
@@ -673,11 +674,14 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 	struct page *page;
 	unsigned shift;
 	unsigned long mask;
-
+	/*
+	 * Transparent hugepages are handled by generic code. We can skip them
+	 * here.
+	 */
 	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
 
 	/* Verify it is a huge page else bail. */
-	if (!ptep || !shift)
+	if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep))
 		return ERR_PTR(-EINVAL);
 
 	mask = (1UL << shift) - 1;
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 48bf63e..313c85c 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -189,6 +189,7 @@ void tlb_flush(struct mmu_gather *tlb)
 void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
 			      unsigned long end)
 {
+	int hugepage_shift;
 	unsigned long flags;
 
 	start = _ALIGN_DOWN(start, PAGE_SIZE);
@@ -206,7 +207,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
 	local_irq_save(flags);
 	arch_enter_lazy_mmu_mode();
 	for (; start < end; start += PAGE_SIZE) {
-		pte_t *ptep = find_linux_pte(mm->pgd, start);
+		pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start,
+							&hugepage_shift);
 		unsigned long pte;
 
 		if (ptep == NULL)
@@ -214,7 +216,10 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
 		pte = pte_val(*ptep);
 		if (!(pte & _PAGE_HASHPTE))
 			continue;
-		hpte_need_flush(mm, start, ptep, pte, 0);
+		if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte)))
+			hpte_do_hugepage_flush(mm, start, (pmd_t *)pte);
+		else
+			hpte_need_flush(mm, start, ptep, pte, 0);
 	}
 	arch_leave_lazy_mmu_mode();
 	local_irq_restore(flags);
-- 
cgit v0.10.2


From db7cb5b92409b36e4338355fbc3561b3f6801c7b Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 14:30:19 +0530
Subject: powerpc/kvm: Handle transparent hugepage in KVM

We can find pte that are splitting while walking page tables. Return
None pte in that case.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 9c1ff33..a1ecb14 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -159,36 +159,46 @@ static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type)
 }
 
 /*
- * Lock and read a linux PTE.  If it's present and writable, atomically
- * set dirty and referenced bits and return the PTE, otherwise return 0.
+ * If it's present and writable, atomically set dirty and referenced bits and
+ * return the PTE, otherwise return 0. If we find a transparent hugepage
+ * and if it is marked splitting we return 0;
  */
-static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing)
+static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing,
+						 unsigned int hugepage)
 {
-	pte_t pte, tmp;
-
-	/* wait until _PAGE_BUSY is clear then set it atomically */
-	__asm__ __volatile__ (
-		"1:	ldarx	%0,0,%3\n"
-		"	andi.	%1,%0,%4\n"
-		"	bne-	1b\n"
-		"	ori	%1,%0,%4\n"
-		"	stdcx.	%1,0,%3\n"
-		"	bne-	1b"
-		: "=&r" (pte), "=&r" (tmp), "=m" (*p)
-		: "r" (p), "i" (_PAGE_BUSY)
-		: "cc");
-
-	if (pte_present(pte)) {
-		pte = pte_mkyoung(pte);
-		if (writing && pte_write(pte))
-			pte = pte_mkdirty(pte);
-	}
+	pte_t old_pte, new_pte = __pte(0);
+
+	while (1) {
+		old_pte = pte_val(*ptep);
+		/*
+		 * wait until _PAGE_BUSY is clear then set it atomically
+		 */
+		if (unlikely(old_pte & _PAGE_BUSY)) {
+			cpu_relax();
+			continue;
+		}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+		/* If hugepage and is trans splitting return None */
+		if (unlikely(hugepage &&
+			     pmd_trans_splitting(pte_pmd(old_pte))))
+			return __pte(0);
+#endif
+		/* If pte is not present return None */
+		if (unlikely(!(old_pte & _PAGE_PRESENT)))
+			return __pte(0);
 
-	*p = pte;	/* clears _PAGE_BUSY */
+		new_pte = pte_mkyoung(old_pte);
+		if (writing && pte_write(old_pte))
+			new_pte = pte_mkdirty(new_pte);
 
-	return pte;
+		if (old_pte == __cmpxchg_u64((unsigned long *)ptep, old_pte,
+					     new_pte))
+			break;
+	}
+	return new_pte;
 }
 
+
 /* Return HPTE cache control bits corresponding to Linux pte bits */
 static inline unsigned long hpte_cache_bits(unsigned long pte_val)
 {
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 5880dfb..710d313 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -675,6 +675,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		}
 		/* if the guest wants write access, see if that is OK */
 		if (!writing && hpte_is_writable(r)) {
+			unsigned int hugepage_shift;
 			pte_t *ptep, pte;
 
 			/*
@@ -683,9 +684,10 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			 */
 			rcu_read_lock_sched();
 			ptep = find_linux_pte_or_hugepte(current->mm->pgd,
-							 hva, NULL);
-			if (ptep && pte_present(*ptep)) {
-				pte = kvmppc_read_update_linux_pte(ptep, 1);
+							 hva, &hugepage_shift);
+			if (ptep) {
+				pte = kvmppc_read_update_linux_pte(ptep, 1,
+							   hugepage_shift);
 				if (pte_write(pte))
 					write_ok = 1;
 			}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index dcf892d..fc25689 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -139,20 +139,18 @@ static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
 {
 	pte_t *ptep;
 	unsigned long ps = *pte_sizep;
-	unsigned int shift;
+	unsigned int hugepage_shift;
 
-	ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
+	ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift);
 	if (!ptep)
 		return __pte(0);
-	if (shift)
-		*pte_sizep = 1ul << shift;
+	if (hugepage_shift)
+		*pte_sizep = 1ul << hugepage_shift;
 	else
 		*pte_sizep = PAGE_SIZE;
 	if (ps > *pte_sizep)
 		return __pte(0);
-	if (!pte_present(*ptep))
-		return __pte(0);
-	return kvmppc_read_update_linux_pte(ptep, writing);
+	return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
 }
 
 static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v)
-- 
cgit v0.10.2


From c367714ce807faff3f0c48064cda158d5117b419 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 14:30:20 +0530
Subject: powerpc: Update gup_pmd_range to handle transparent hugepages

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index 4b921af..223a255 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -66,9 +66,15 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 		pmd_t pmd = *pmdp;
 
 		next = pmd_addr_end(addr, end);
-		if (pmd_none(pmd))
+		/*
+		 * If we find a splitting transparent hugepage we
+		 * return zero. That will result in taking the slow
+		 * path which will call wait_split_huge_page()
+		 * if the pmd is still in splitting state
+		 */
+		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
 			return 0;
-		if (pmd_huge(pmd)) {
+		if (pmd_huge(pmd) || pmd_large(pmd)) {
 			if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next,
 					 write, pages, nr))
 				return 0;
-- 
cgit v0.10.2


From 6d492ecc6489113968ec269be1cf88942d4a5d29 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 14:30:21 +0530
Subject: powerpc/THP: Add code to handle HPTE faults for hugepages

The deposted PTE page in the second half of the PMD table is used to
track the state on hash PTEs. After updating the HPTE, we mark the
coresponding slot in the deposted PTE page valid.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index 2accc96..3d6fbb0 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -340,6 +340,19 @@ extern int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 		     pte_t *ptep, unsigned long trap, int local, int ssize,
 		     unsigned int shift, unsigned int mmu_psize);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern int __hash_page_thp(unsigned long ea, unsigned long access,
+			   unsigned long vsid, pmd_t *pmdp, unsigned long trap,
+			   int local, int ssize, unsigned int psize);
+#else
+static inline int __hash_page_thp(unsigned long ea, unsigned long access,
+				  unsigned long vsid, pmd_t *pmdp,
+				  unsigned long trap, int local,
+				  int ssize, unsigned int psize)
+{
+	BUG();
+}
+#endif
 extern void hash_failure_debug(unsigned long ea, unsigned long access,
 			       unsigned long vsid, unsigned long trap,
 			       int ssize, int psize, int lpsize,
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index ff0379c..51230ee 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -32,6 +32,7 @@ ifeq ($(CONFIG_HUGETLB_PAGE),y)
 obj-$(CONFIG_PPC_STD_MMU_64)	+= hugetlbpage-hash64.o
 obj-$(CONFIG_PPC_BOOK3E_MMU)	+= hugetlbpage-book3e.o
 endif
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
 obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_HIGHMEM)		+= highmem.o
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index e8434ca..7a81e86 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1050,13 +1050,26 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 		goto bail;
 	}
 
-#ifdef CONFIG_HUGETLB_PAGE
 	if (hugeshift) {
-		rc = __hash_page_huge(ea, access, vsid, ptep, trap, local,
-					ssize, hugeshift, psize);
+		if (pmd_trans_huge(*(pmd_t *)ptep))
+			rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
+					     trap, local, ssize, psize);
+#ifdef CONFIG_HUGETLB_PAGE
+		else
+			rc = __hash_page_huge(ea, access, vsid, ptep, trap,
+					      local, ssize, hugeshift, psize);
+#else
+		else {
+			/*
+			 * if we have hugeshift, and is not transhuge with
+			 * hugetlb disabled, something is really wrong.
+			 */
+			rc = 1;
+			WARN_ON(1);
+		}
+#endif
 		goto bail;
 	}
-#endif /* CONFIG_HUGETLB_PAGE */
 
 #ifndef CONFIG_PPC_64K_PAGES
 	DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
new file mode 100644
index 0000000..3c22fa3
--- /dev/null
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright IBM Corporation, 2013
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+/*
+ * PPC64 THP Support for hash based MMUs
+ */
+#include <linux/mm.h>
+#include <asm/machdep.h>
+
+int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
+		    pmd_t *pmdp, unsigned long trap, int local, int ssize,
+		    unsigned int psize)
+{
+	unsigned int index, valid;
+	unsigned char *hpte_slot_array;
+	unsigned long rflags, pa, hidx;
+	unsigned long old_pmd, new_pmd;
+	int ret, lpsize = MMU_PAGE_16M;
+	unsigned long vpn, hash, shift, slot;
+
+	/*
+	 * atomically mark the linux large page PMD busy and dirty
+	 */
+	do {
+		old_pmd = pmd_val(*pmdp);
+		/* If PMD busy, retry the access */
+		if (unlikely(old_pmd & _PAGE_BUSY))
+			return 0;
+		/* If PMD permissions don't match, take page fault */
+		if (unlikely(access & ~old_pmd))
+			return 1;
+		/*
+		 * Try to lock the PTE, add ACCESSED and DIRTY if it was
+		 * a write access
+		 */
+		new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED;
+		if (access & _PAGE_RW)
+			new_pmd |= _PAGE_DIRTY;
+	} while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp,
+					  old_pmd, new_pmd));
+	/*
+	 * PP bits. _PAGE_USER is already PP bit 0x2, so we only
+	 * need to add in 0x1 if it's a read-only user page
+	 */
+	rflags = new_pmd & _PAGE_USER;
+	if ((new_pmd & _PAGE_USER) && !((new_pmd & _PAGE_RW) &&
+					   (new_pmd & _PAGE_DIRTY)))
+		rflags |= 0x1;
+	/*
+	 * _PAGE_EXEC -> HW_NO_EXEC since it's inverted
+	 */
+	rflags |= ((new_pmd & _PAGE_EXEC) ? 0 : HPTE_R_N);
+
+#if 0
+	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
+
+		/*
+		 * No CPU has hugepages but lacks no execute, so we
+		 * don't need to worry about that case
+		 */
+		rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+	}
+#endif
+	/*
+	 * Find the slot index details for this ea, using base page size.
+	 */
+	shift = mmu_psize_defs[psize].shift;
+	index = (ea & ~HPAGE_PMD_MASK) >> shift;
+	BUG_ON(index >= 4096);
+
+	vpn = hpt_vpn(ea, vsid, ssize);
+	hash = hpt_hash(vpn, shift, ssize);
+	hpte_slot_array = get_hpte_slot_array(pmdp);
+
+	valid = hpte_valid(hpte_slot_array, index);
+	if (valid) {
+		/* update the hpte bits */
+		hidx =  hpte_hash_index(hpte_slot_array, index);
+		if (hidx & _PTEIDX_SECONDARY)
+			hash = ~hash;
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += hidx & _PTEIDX_GROUP_IX;
+
+		ret = ppc_md.hpte_updatepp(slot, rflags, vpn,
+					   psize, lpsize, ssize, local);
+		/*
+		 * We failed to update, try to insert a new entry.
+		 */
+		if (ret == -1) {
+			/*
+			 * large pte is marked busy, so we can be sure
+			 * nobody is looking at hpte_slot_array. hence we can
+			 * safely update this here.
+			 */
+			valid = 0;
+			new_pmd &= ~_PAGE_HPTEFLAGS;
+			hpte_slot_array[index] = 0;
+		} else
+			/* clear the busy bits and set the hash pte bits */
+			new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+	}
+
+	if (!valid) {
+		unsigned long hpte_group;
+
+		/* insert new entry */
+		pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
+repeat:
+		hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+
+		/* clear the busy bits and set the hash pte bits */
+		new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+
+		/* Add in WIMG bits */
+		rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
+				      _PAGE_COHERENT | _PAGE_GUARDED));
+
+		/* Insert into the hash table, primary slot */
+		slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+					  psize, lpsize, ssize);
+		/*
+		 * Primary is full, try the secondary
+		 */
+		if (unlikely(slot == -1)) {
+			hpte_group = ((~hash & htab_hash_mask) *
+				      HPTES_PER_GROUP) & ~0x7UL;
+			slot = ppc_md.hpte_insert(hpte_group, vpn, pa,
+						  rflags, HPTE_V_SECONDARY,
+						  psize, lpsize, ssize);
+			if (slot == -1) {
+				if (mftb() & 0x1)
+					hpte_group = ((hash & htab_hash_mask) *
+						      HPTES_PER_GROUP) & ~0x7UL;
+
+				ppc_md.hpte_remove(hpte_group);
+				goto repeat;
+			}
+		}
+		/*
+		 * Hypervisor failure. Restore old pmd and return -1
+		 * similar to __hash_page_*
+		 */
+		if (unlikely(slot == -2)) {
+			*pmdp = __pmd(old_pmd);
+			hash_failure_debug(ea, access, vsid, trap, ssize,
+					   psize, lpsize, old_pmd);
+			return -1;
+		}
+		/*
+		 * large pte is marked busy, so we can be sure
+		 * nobody is looking at hpte_slot_array. hence we can
+		 * safely update this here.
+		 */
+		mark_hpte_slot_valid(hpte_slot_array, index, slot);
+	}
+	/*
+	 * No need to use ldarx/stdcx here
+	 */
+	*pmdp = __pmd(new_pmd & ~_PAGE_BUSY);
+	return 0;
+}
-- 
cgit v0.10.2


From 0ac52dd7666d5c0d0147d73a8e4b1d1ffd81cdf3 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 14:30:22 +0530
Subject: powerpc: Make linux pagetable walk safe with THP enabled

We need to have irqs disabled to handle all the possible parallel update for
linux page table without holding locks.

Events that we are intersted in while walking page tables are
1) Page fault
2) umap
3) THP split
4) THP collapse

A) local_irq_disabled:
------------------------
1) page fault:
A none to valid transition via page fault is not an issue because we
would either see a none or valid. If it is none, we would error out
the page table walk. We may need to use on stack values when checking for
type of page table elements, because if we do

if (!is_hugepd()) {
    if (!pmd_none() {
       if (pmd_bad() {

We could take that bad condition because the pmd got converted to a hugepd
after the !is_hugepd check via a hugetlb fault.

The right way would be to check for pmd_none higher up or use on stack value.

2) A valid to none conversion via unmap:
We can safely walk the upper level table, because we don't remove the the
page table entries until rcu grace period. So even if we followed a
wrong pointer we still have the pointer valid till the grace period.

A PTE pointer returned need to be atomically checked for _PAGE_PRESENT and
 _PAGE_BUSY. A valid pointer returned could becoming none later. To prevent
pte_clear we take _PAGE_BUSY.

3) THP split:
A valid transparent hugepage is converted to nomal page. Before we split we
do pmd_splitting_flush, which sets the hugepage PTE to _PAGE_SPLITTING
So when walking page table we need to check for pmd_trans_splitting and
handle that. The pte returned should also need to be checked for
_PAGE_SPLITTING before setting _PAGE_BUSY similar to _PAGE_PRESENT. We save
the value of PTE on stack and check for the flag in the local pte value.
If we don't have the value set we can safely operate on the local pte value
and we atomicaly set _PAGE_BUSY.

4) THP collapse:
A normal page gets converted to hugepage. In the collapse path, we
mark the pmd none early (pmdp_clear_flush). With irq disabled, if we
are aleady walking page table we would see the pmd_none and won't continue.
If we see a valid PMD, we should still check for _PAGE_PRESENT before
setting _PAGE_BUSY, to make sure we didn't collapse the PTE to a Huge PTE.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 7a81e86..8452316 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1180,13 +1180,25 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 	pgdir = mm->pgd;
 	if (pgdir == NULL)
 		return;
+
+	/* Get VSID */
+	ssize = user_segment_size(ea);
+	vsid = get_vsid(mm->context.id, ea, ssize);
+	if (!vsid)
+		return;
+	/*
+	 * Hash doesn't like irqs. Walking linux page table with irq disabled
+	 * saves us from holding multiple locks.
+	 */
+	local_irq_save(flags);
+
 	/*
 	 * THP pages use update_mmu_cache_pmd. We don't do
 	 * hash preload there. Hence can ignore THP here
 	 */
 	ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift);
 	if (!ptep)
-		return;
+		goto out_exit;
 
 	WARN_ON(hugepage_shift);
 #ifdef CONFIG_PPC_64K_PAGES
@@ -1197,18 +1209,9 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 	 * page size demotion here
 	 */
 	if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE))
-		return;
+		goto out_exit;
 #endif /* CONFIG_PPC_64K_PAGES */
 
-	/* Get VSID */
-	ssize = user_segment_size(ea);
-	vsid = get_vsid(mm->context.id, ea, ssize);
-	if (!vsid)
-		return;
-
-	/* Hash doesn't like irqs */
-	local_irq_save(flags);
-
 	/* Is that local to this CPU ? */
 	if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
 		local = 1;
@@ -1230,7 +1233,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 				   mm->context.user_psize,
 				   mm->context.user_psize,
 				   pte_val(*ptep));
-
+out_exit:
 	local_irq_restore(flags);
 }
 
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
index 3c22fa3..34de9e0 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -37,6 +37,9 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
 		/* If PMD busy, retry the access */
 		if (unlikely(old_pmd & _PAGE_BUSY))
 			return 0;
+		/* If PMD is trans splitting retry the access */
+		if (unlikely(old_pmd & _PAGE_SPLITTING))
+			return 0;
 		/* If PMD permissions don't match, take page fault */
 		if (unlikely(access & ~old_pmd))
 			return 1;
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 8add580..e9e6882 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -925,12 +925,16 @@ void flush_dcache_icache_hugepage(struct page *page)
  * (2) pointer to next table, as normal; bottom 6 bits == 0
  * (3) leaf pte for huge page, bottom two bits != 00
  * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
+ *
+ * So long as we atomically load page table pointers we are safe against teardown,
+ * we can follow the address down to the the page and take a ref on it.
  */
+
 pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
 {
-	pgd_t *pg;
-	pud_t *pu;
-	pmd_t *pm;
+	pgd_t pgd, *pgdp;
+	pud_t pud, *pudp;
+	pmd_t pmd, *pmdp;
 	pte_t *ret_pte;
 	hugepd_t *hpdp = NULL;
 	unsigned pdshift = PGDIR_SHIFT;
@@ -938,34 +942,42 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
 	if (shift)
 		*shift = 0;
 
-	pg = pgdir + pgd_index(ea);
-
+	pgdp = pgdir + pgd_index(ea);
+	pgd  = ACCESS_ONCE(*pgdp);
 	/*
-	 * we should first check for none. That takes care of a
-	 * a parallel hugetlb or THP pagefault moving none entries
-	 * to respective types.
+	 * Always operate on the local stack value. This make sure the
+	 * value don't get updated by a parallel THP split/collapse,
+	 * page fault or a page unmap. The return pte_t * is still not
+	 * stable. So should be checked there for above conditions.
 	 */
-	if (pgd_none(*pg))
+	if (pgd_none(pgd))
 		return NULL;
-	else if (pgd_huge(*pg)) {
-		ret_pte = (pte_t *) pg;
+	else if (pgd_huge(pgd)) {
+		ret_pte = (pte_t *) pgdp;
 		goto out;
-	} else if (is_hugepd(pg))
-		hpdp = (hugepd_t *)pg;
+	} else if (is_hugepd(&pgd))
+		hpdp = (hugepd_t *)&pgd;
 	else {
+		/*
+		 * Even if we end up with an unmap, the pgtable will not
+		 * be freed, because we do an rcu free and here we are
+		 * irq disabled
+		 */
 		pdshift = PUD_SHIFT;
-		pu = pud_offset(pg, ea);
+		pudp = pud_offset(&pgd, ea);
+		pud  = ACCESS_ONCE(*pudp);
 
-		if (pud_none(*pu))
+		if (pud_none(pud))
 			return NULL;
-		else if (pud_huge(*pu)) {
-			ret_pte = (pte_t *) pu;
+		else if (pud_huge(pud)) {
+			ret_pte = (pte_t *) pudp;
 			goto out;
-		} else if (is_hugepd(pu))
-			hpdp = (hugepd_t *)pu;
+		} else if (is_hugepd(&pud))
+			hpdp = (hugepd_t *)&pud;
 		else {
 			pdshift = PMD_SHIFT;
-			pm = pmd_offset(pu, ea);
+			pmdp = pmd_offset(&pud, ea);
+			pmd  = ACCESS_ONCE(*pmdp);
 			/*
 			 * A hugepage collapse is captured by pmd_none, because
 			 * it mark the pmd none and do a hpte invalidate.
@@ -975,16 +987,16 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
 			 * hpte invalidate
 			 *
 			 */
-			if (pmd_none(*pm) || pmd_trans_splitting(*pm))
+			if (pmd_none(pmd) || pmd_trans_splitting(pmd))
 				return NULL;
 
-			if (pmd_huge(*pm) || pmd_large(*pm)) {
-				ret_pte = (pte_t *) pm;
+			if (pmd_huge(pmd) || pmd_large(pmd)) {
+				ret_pte = (pte_t *) pmdp;
 				goto out;
-			} else if (is_hugepd(pm))
-				hpdp = (hugepd_t *)pm;
+			} else if (is_hugepd(&pmd))
+				hpdp = (hugepd_t *)&pmd;
 			else
-				return pte_offset_kernel(pm, ea);
+				return pte_offset_kernel(&pmd, ea);
 		}
 	}
 	if (!hpdp)
@@ -1020,6 +1032,14 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 	if ((pte_val(pte) & mask) != mask)
 		return 0;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	/*
+	 * check for splitting here
+	 */
+	if (pmd_trans_splitting(pte_pmd(pte)))
+		return 0;
+#endif
+
 	/* hugepages are never "special" */
 	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 0988a26..ccd49f9 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -508,6 +508,10 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 		      pte_t *ptep)
 {
 #ifdef CONFIG_PPC_STD_MMU
+	/*
+	 * We don't need to worry about _PAGE_PRESENT here because we are
+	 * called with either mm->page_table_lock held or ptl lock held
+	 */
 	unsigned long access = 0, trap;
 
 	/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
-- 
cgit v0.10.2


From 7888b4ddb44dccd68bc20d0dc4425707dff88c72 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 14:30:23 +0530
Subject: powerpc: Prevent gcc to re-read the pagetables

GCC is very likely to read the pagetables just once and cache them in
the local stack or in a register, but it is can also decide to re-read
the pagetables. The problem is that the pagetable in those places can
change from under gcc.

With THP/hugetlbfs the pmd (and pgd for hugetlbfs giga pages) can
change under gup_fast. The pages won't be freed untill we finish
gup fast because we have irq disabled and we free these pages via
rcu callback.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index 223a255..49822d9 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -34,7 +34,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 
 	ptep = pte_offset_kernel(&pmd, addr);
 	do {
-		pte_t pte = *ptep;
+		pte_t pte = ACCESS_ONCE(*ptep);
 		struct page *page;
 
 		if ((pte_val(pte) & mask) != result)
@@ -63,7 +63,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 
 	pmdp = pmd_offset(&pud, addr);
 	do {
-		pmd_t pmd = *pmdp;
+		pmd_t pmd = ACCESS_ONCE(*pmdp);
 
 		next = pmd_addr_end(addr, end);
 		/*
@@ -97,7 +97,7 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
 
 	pudp = pud_offset(&pgd, addr);
 	do {
-		pud_t pud = *pudp;
+		pud_t pud = ACCESS_ONCE(*pudp);
 
 		next = pud_addr_end(addr, end);
 		if (pud_none(pud))
@@ -160,7 +160,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 
 	pgdp = pgd_offset(mm, addr);
 	do {
-		pgd_t pgd = *pgdp;
+		pgd_t pgd = ACCESS_ONCE(*pgdp);
 
 		pr_devel("  %016lx: normal pgd %p\n", addr,
 			 (void *)pgd_val(pgd));
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index e9e6882..f2f01fd 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -1024,7 +1024,7 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 	if (pte_end < end)
 		end = pte_end;
 
-	pte = *ptep;
+	pte = ACCESS_ONCE(*ptep);
 	mask = _PAGE_PRESENT | _PAGE_USER;
 	if (write)
 		mask |= _PAGE_RW;
-- 
cgit v0.10.2


From a00e7bea0dde6a44b9bbe84f30b731d9ec73858b Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 14:30:24 +0530
Subject: powerpc: disable assert_pte_locked for collapse_huge_page

With THP we set pmd to none, before we do pte_clear. Hence we can't
walk page table to get the pte lock ptr and verify whether it is locked.
THP do take pte lock before calling pte_clear. So we don't change the locking
rules here. It is that we can't use page table walking to check whether
pte locks are held with THP.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 214130a..edda589 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -235,6 +235,14 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
 	pud = pud_offset(pgd, addr);
 	BUG_ON(pud_none(*pud));
 	pmd = pmd_offset(pud, addr);
+	/*
+	 * khugepaged to collapse normal pages to hugepage, first set
+	 * pmd to none to force page fault/gup to take mmap_sem. After
+	 * pmd is set to none, we do a pte_clear which does this assertion
+	 * so if we find pmd none, return.
+	 */
+	if (pmd_none(*pmd))
+		return;
 	BUG_ON(!pmd_present(*pmd));
 	assert_spin_locked(pte_lockptr(mm, pmd));
 }
-- 
cgit v0.10.2


From d8e355a20f9dd45deea4c33db649dda59bdbd293 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 14:30:25 +0530
Subject: powerpc: split hugepage when using subpage protection

We find all the overlapping vma and mark them such that we don't allocate
hugepage in that range. Also we split existing huge page so that the
normal page hash can be invalidated and new page faulted in with new
protection bits.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 7c415dd..aa74acb 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -130,6 +130,53 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
 	up_write(&mm->mmap_sem);
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
+				  unsigned long end, struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->private;
+	split_huge_page_pmd(vma, addr, pmd);
+	return 0;
+}
+
+static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+				    unsigned long len)
+{
+	struct vm_area_struct *vma;
+	struct mm_walk subpage_proto_walk = {
+		.mm = mm,
+		.pmd_entry = subpage_walk_pmd_entry,
+	};
+
+	/*
+	 * We don't try too hard, we just mark all the vma in that range
+	 * VM_NOHUGEPAGE and split them.
+	 */
+	vma = find_vma(mm, addr);
+	/*
+	 * If the range is in unmapped range, just return
+	 */
+	if (vma && ((addr + len) <= vma->vm_start))
+		return;
+
+	while (vma) {
+		if (vma->vm_start >= (addr + len))
+			break;
+		vma->vm_flags |= VM_NOHUGEPAGE;
+		subpage_proto_walk.private = vma;
+		walk_page_range(vma->vm_start, vma->vm_end,
+				&subpage_proto_walk);
+		vma = vma->vm_next;
+	}
+}
+#else
+static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+				    unsigned long len)
+{
+	return;
+}
+#endif
+
 /*
  * Copy in a subpage protection map for an address range.
  * The map has 2 bits per 4k subpage, so 32 bits per 64k page.
@@ -168,6 +215,7 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
 		return -EFAULT;
 
 	down_write(&mm->mmap_sem);
+	subpage_mark_vma_nohuge(mm, addr, len);
 	for (limit = addr + len; addr < limit; addr = next) {
 		next = pmd_addr_end(addr, limit);
 		err = -ENOMEM;
-- 
cgit v0.10.2


From 437d496457a30ce9ccccb94b2373c201b2558392 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 14:30:26 +0530
Subject: powerpc/THP: Enable THP on PPC64

We enable only if the we support 16MB page size.

Reviewed-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index e71bd25..46db094 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -444,8 +444,7 @@ static inline int pmd_trans_splitting(pmd_t pmd)
 	return 0;
 }
 
-/* We will enable it in the last patch */
-#define has_transparent_hugepage() 0
+extern int has_transparent_hugepage(void);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 static inline pte_t pmd_pte(pmd_t pmd)
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index e4d3e9f..074a4a2 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -831,4 +831,33 @@ pmd_t pmdp_get_and_clear(struct mm_struct *mm,
 	memset(pgtable, 0, PTE_FRAG_SIZE);
 	return old_pmd;
 }
+
+int has_transparent_hugepage(void)
+{
+	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
+		return 0;
+	/*
+	 * We support THP only if PMD_SIZE is 16MB.
+	 */
+	if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
+		return 0;
+	/*
+	 * We need to make sure that we support 16MB hugepage in a segement
+	 * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
+	 * of 64K.
+	 */
+	/*
+	 * If we have 64K HPTE, we will be using that by default
+	 */
+	if (mmu_psize_defs[MMU_PAGE_64K].shift &&
+	    (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
+		return 0;
+	/*
+	 * Ok we only have 4K HPTE
+	 */
+	if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
+		return 0;
+
+	return 1;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-- 
cgit v0.10.2


From 1a5272866f87d7fbf04dc8060f8da3e8456490ab Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 20 Jun 2013 14:30:27 +0530
Subject: powerpc: Optimize hugepage invalidate

Hugepage invalidate involves invalidating multiple hpte entries.
Optimize the operation using H_BULK_REMOVE on lpar platforms.
On native, reduce the number of tlb flush.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 801e3c6..8b48090 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -57,6 +57,9 @@ struct machdep_calls {
 	void            (*hpte_removebolted)(unsigned long ea,
 					     int psize, int ssize);
 	void		(*flush_hash_range)(unsigned long number, int local);
+	void		(*hugepage_invalidate)(struct mm_struct *mm,
+					       unsigned char *hpte_slot_array,
+					       unsigned long addr, int psize);
 
 	/* special for kexec, to be called in real mode, linear mapping is
 	 * destroyed as well */
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 6d152bc..3f0c30a 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -407,6 +407,78 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
 	local_irq_restore(flags);
 }
 
+static void native_hugepage_invalidate(struct mm_struct *mm,
+				       unsigned char *hpte_slot_array,
+				       unsigned long addr, int psize)
+{
+	int ssize = 0, i;
+	int lock_tlbie;
+	struct hash_pte *hptep;
+	int actual_psize = MMU_PAGE_16M;
+	unsigned int max_hpte_count, valid;
+	unsigned long flags, s_addr = addr;
+	unsigned long hpte_v, want_v, shift;
+	unsigned long hidx, vpn = 0, vsid, hash, slot;
+
+	shift = mmu_psize_defs[psize].shift;
+	max_hpte_count = 1U << (PMD_SHIFT - shift);
+
+	local_irq_save(flags);
+	for (i = 0; i < max_hpte_count; i++) {
+		valid = hpte_valid(hpte_slot_array, i);
+		if (!valid)
+			continue;
+		hidx =  hpte_hash_index(hpte_slot_array, i);
+
+		/* get the vpn */
+		addr = s_addr + (i * (1ul << shift));
+		if (!is_kernel_addr(addr)) {
+			ssize = user_segment_size(addr);
+			vsid = get_vsid(mm->context.id, addr, ssize);
+			WARN_ON(vsid == 0);
+		} else {
+			vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+			ssize = mmu_kernel_ssize;
+		}
+
+		vpn = hpt_vpn(addr, vsid, ssize);
+		hash = hpt_hash(vpn, shift, ssize);
+		if (hidx & _PTEIDX_SECONDARY)
+			hash = ~hash;
+
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += hidx & _PTEIDX_GROUP_IX;
+
+		hptep = htab_address + slot;
+		want_v = hpte_encode_avpn(vpn, psize, ssize);
+		native_lock_hpte(hptep);
+		hpte_v = hptep->v;
+
+		/* Even if we miss, we need to invalidate the TLB */
+		if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
+			native_unlock_hpte(hptep);
+		else
+			/* Invalidate the hpte. NOTE: this also unlocks it */
+			hptep->v = 0;
+	}
+	/*
+	 * Since this is a hugepage, we just need a single tlbie.
+	 * use the last vpn.
+	 */
+	lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+	if (lock_tlbie)
+		raw_spin_lock(&native_tlbie_lock);
+
+	asm volatile("ptesync":::"memory");
+	__tlbie(vpn, psize, actual_psize, ssize);
+	asm volatile("eieio; tlbsync; ptesync":::"memory");
+
+	if (lock_tlbie)
+		raw_spin_unlock(&native_tlbie_lock);
+
+	local_irq_restore(flags);
+}
+
 static inline int __hpte_actual_psize(unsigned int lp, int psize)
 {
 	int i, shift;
@@ -640,4 +712,5 @@ void __init hpte_init_native(void)
 	ppc_md.hpte_remove	= native_hpte_remove;
 	ppc_md.hpte_clear_all	= native_hpte_clear;
 	ppc_md.flush_hash_range = native_flush_hash_range;
+	ppc_md.hugepage_invalidate   = native_hugepage_invalidate;
 }
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 074a4a2..536eec72 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -708,6 +708,7 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
 {
 	int ssize, i;
 	unsigned long s_addr;
+	int max_hpte_count;
 	unsigned int psize, valid;
 	unsigned char *hpte_slot_array;
 	unsigned long hidx, vpn, vsid, hash, shift, slot;
@@ -727,9 +728,16 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
 
 	/* get the base page size */
 	psize = get_slice_psize(mm, s_addr);
-	shift = mmu_psize_defs[psize].shift;
 
-	for (i = 0; i < (HPAGE_PMD_SIZE >> shift); i++) {
+	if (ppc_md.hugepage_invalidate)
+		return ppc_md.hugepage_invalidate(mm, hpte_slot_array,
+						  s_addr, psize);
+	/*
+	 * No bluk hpte removal support, invalidate each entry
+	 */
+	shift = mmu_psize_defs[psize].shift;
+	max_hpte_count = HPAGE_PMD_SIZE >> shift;
+	for (i = 0; i < max_hpte_count; i++) {
 		/*
 		 * 8 bits per each hpte entries
 		 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index ca45c8f..fd0f2f2 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -45,6 +45,13 @@
 #include "plpar_wrappers.h"
 #include "pseries.h"
 
+/* Flag bits for H_BULK_REMOVE */
+#define HBR_REQUEST	0x4000000000000000UL
+#define HBR_RESPONSE	0x8000000000000000UL
+#define HBR_END		0xc000000000000000UL
+#define HBR_AVPN	0x0200000000000000UL
+#define HBR_ANDCOND	0x0100000000000000UL
+
 
 /* in hvCall.S */
 EXPORT_SYMBOL(plpar_hcall);
@@ -347,6 +354,113 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
 	BUG_ON(lpar_rc != H_SUCCESS);
 }
 
+/*
+ * Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need
+ * to make sure that we avoid bouncing the hypervisor tlbie lock.
+ */
+#define PPC64_HUGE_HPTE_BATCH 12
+
+static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
+					     unsigned long *vpn, int count,
+					     int psize, int ssize)
+{
+	unsigned long param[8];
+	int i = 0, pix = 0, rc;
+	unsigned long flags = 0;
+	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+
+	if (lock_tlbie)
+		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
+
+	for (i = 0; i < count; i++) {
+
+		if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
+			pSeries_lpar_hpte_invalidate(slot[i], vpn[i], psize, 0,
+						     ssize, 0);
+		} else {
+			param[pix] = HBR_REQUEST | HBR_AVPN | slot[i];
+			param[pix+1] = hpte_encode_avpn(vpn[i], psize, ssize);
+			pix += 2;
+			if (pix == 8) {
+				rc = plpar_hcall9(H_BULK_REMOVE, param,
+						  param[0], param[1], param[2],
+						  param[3], param[4], param[5],
+						  param[6], param[7]);
+				BUG_ON(rc != H_SUCCESS);
+				pix = 0;
+			}
+		}
+	}
+	if (pix) {
+		param[pix] = HBR_END;
+		rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
+				  param[2], param[3], param[4], param[5],
+				  param[6], param[7]);
+		BUG_ON(rc != H_SUCCESS);
+	}
+
+	if (lock_tlbie)
+		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
+}
+
+static void pSeries_lpar_hugepage_invalidate(struct mm_struct *mm,
+				       unsigned char *hpte_slot_array,
+				       unsigned long addr, int psize)
+{
+	int ssize = 0, i, index = 0;
+	unsigned long s_addr = addr;
+	unsigned int max_hpte_count, valid;
+	unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
+	unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
+	unsigned long shift, hidx, vpn = 0, vsid, hash, slot;
+
+	shift = mmu_psize_defs[psize].shift;
+	max_hpte_count = 1U << (PMD_SHIFT - shift);
+
+	for (i = 0; i < max_hpte_count; i++) {
+		valid = hpte_valid(hpte_slot_array, i);
+		if (!valid)
+			continue;
+		hidx =  hpte_hash_index(hpte_slot_array, i);
+
+		/* get the vpn */
+		addr = s_addr + (i * (1ul << shift));
+		if (!is_kernel_addr(addr)) {
+			ssize = user_segment_size(addr);
+			vsid = get_vsid(mm->context.id, addr, ssize);
+			WARN_ON(vsid == 0);
+		} else {
+			vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+			ssize = mmu_kernel_ssize;
+		}
+
+		vpn = hpt_vpn(addr, vsid, ssize);
+		hash = hpt_hash(vpn, shift, ssize);
+		if (hidx & _PTEIDX_SECONDARY)
+			hash = ~hash;
+
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += hidx & _PTEIDX_GROUP_IX;
+
+		slot_array[index] = slot;
+		vpn_array[index] = vpn;
+		if (index == PPC64_HUGE_HPTE_BATCH - 1) {
+			/*
+			 * Now do a bluk invalidate
+			 */
+			__pSeries_lpar_hugepage_invalidate(slot_array,
+							   vpn_array,
+							   PPC64_HUGE_HPTE_BATCH,
+							   psize, ssize);
+			index = 0;
+		} else
+			index++;
+	}
+	if (index)
+		__pSeries_lpar_hugepage_invalidate(slot_array, vpn_array,
+						   index, psize, ssize);
+}
+
 static void pSeries_lpar_hpte_removebolted(unsigned long ea,
 					   int psize, int ssize)
 {
@@ -364,13 +478,6 @@ static void pSeries_lpar_hpte_removebolted(unsigned long ea,
 	pSeries_lpar_hpte_invalidate(slot, vpn, psize, 0, ssize, 0);
 }
 
-/* Flag bits for H_BULK_REMOVE */
-#define HBR_REQUEST	0x4000000000000000UL
-#define HBR_RESPONSE	0x8000000000000000UL
-#define HBR_END		0xc000000000000000UL
-#define HBR_AVPN	0x0200000000000000UL
-#define HBR_ANDCOND	0x0100000000000000UL
-
 /*
  * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
  * lock.
@@ -459,6 +566,7 @@ void __init hpte_init_lpar(void)
 	ppc_md.hpte_removebolted = pSeries_lpar_hpte_removebolted;
 	ppc_md.flush_hash_range	= pSeries_lpar_flush_hash_range;
 	ppc_md.hpte_clear_all   = pSeries_lpar_hptab_clear;
+	ppc_md.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;
 }
 
 #ifdef CONFIG_PPC_SMLPAR
-- 
cgit v0.10.2


From 0875a88e8569c2ca306f321018238bd1a3d7fd86 Mon Sep 17 00:00:00 2001
From: Matteo Facchinetti <matteo.facchinetti@sirius-es.it>
Date: Mon, 10 Jun 2013 10:13:52 +0200
Subject: powerpc/mpc512x: add MPC5125 reset module support for system restart

Only part of MPC5125 reset module is like as MPC5121.
In detail, RCWH register doesn't contain informations about:
- PCI arbiter
- NAND flash page size
- NAND flash port size

For this reason, in device tree, this module has a different name then
MPC5121 reset module but use the same "struct mpc512x_reset_module"
register definition and the same restart procedure.

Signed-off-by: Matteo Facchinetti <engineering@sirius-es.it>
Signed-off-by: Anatolij Gustschin <agust@denx.de>

diff --git a/arch/powerpc/platforms/512x/mpc512x.h b/arch/powerpc/platforms/512x/mpc512x.h
index fdb4303..cc97f02 100644
--- a/arch/powerpc/platforms/512x/mpc512x.h
+++ b/arch/powerpc/platforms/512x/mpc512x.h
@@ -17,6 +17,7 @@ extern void __init mpc512x_init(void);
 extern void __init mpc512x_setup_arch(void);
 extern int __init mpc5121_clk_init(void);
 extern const char *mpc512x_select_psc_compat(void);
+extern const char *mpc512x_select_reset_compat(void);
 extern void mpc512x_restart(char *cmd);
 
 #endif				/* __MPC512X_H__ */
diff --git a/arch/powerpc/platforms/512x/mpc512x_shared.c b/arch/powerpc/platforms/512x/mpc512x_shared.c
index a8b5110..a82a41b 100644
--- a/arch/powerpc/platforms/512x/mpc512x_shared.c
+++ b/arch/powerpc/platforms/512x/mpc512x_shared.c
@@ -35,8 +35,10 @@ static struct mpc512x_reset_module __iomem *reset_module_base;
 static void __init mpc512x_restart_init(void)
 {
 	struct device_node *np;
+	const char *reset_compat;
 
-	np = of_find_compatible_node(NULL, NULL, "fsl,mpc5121-reset");
+	reset_compat = mpc512x_select_reset_compat();
+	np = of_find_compatible_node(NULL, NULL, reset_compat);
 	if (!np)
 		return;
 
@@ -355,6 +357,17 @@ const char *mpc512x_select_psc_compat(void)
 	return NULL;
 }
 
+const char *mpc512x_select_reset_compat(void)
+{
+	if (of_machine_is_compatible("fsl,mpc5121"))
+		return "fsl,mpc5121-reset";
+
+	if (of_machine_is_compatible("fsl,mpc5125"))
+		return "fsl,mpc5125-reset";
+
+	return NULL;
+}
+
 static unsigned int __init get_fifo_size(struct device_node *np,
 					 char *prop_name)
 {
-- 
cgit v0.10.2


From 8c43d2b0ca10cea9eb62d8996fb93f330be88ada Mon Sep 17 00:00:00 2001
From: Joe Liccese <joe.liccese@freescale.com>
Date: Mon, 25 Jun 2012 12:22:09 -0500
Subject: powerpc: Add T4 LAC device tree binding & defs

The Interlaken is a narrow, high speed channelized chip-to-chip interface. To
facilitate interoperability between a data path device and a look-aside
co-processor, the Interlaken Look-Aside protocol is defined for short
transaction-related transfers. Although based on the Interlaken protocol,
Interlaken Look-Aside is not directly compatible with Interlaken and can be
considered a different operation mode.

The Interlaken LA controller connects internal platform to Interlaken serial
interface. It accepts LA command through software portals, which are system
memory mapped 4KB spaces. The LA commands are then translated into the
Interlaken control words and data words, which are sent on TX side to TCAM
through SerDes lanes.

Signed-off-by: Joe Liccese <joe.liccese@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/Documentation/devicetree/bindings/powerpc/fsl/interlaken-lac.txt b/Documentation/devicetree/bindings/powerpc/fsl/interlaken-lac.txt
new file mode 100644
index 0000000..641bc13
--- /dev/null
+++ b/Documentation/devicetree/bindings/powerpc/fsl/interlaken-lac.txt
@@ -0,0 +1,309 @@
+===============================================================================
+Freescale Interlaken Look-Aside Controller Device Bindings
+Copyright 2012 Freescale Semiconductor Inc.
+
+CONTENTS
+  - Interlaken Look-Aside Controller (LAC) Node
+  - Example LAC Node
+  - Interlaken Look-Aside Controller (LAC) Software Portal Node
+  - Interlaken Look-Aside Controller (LAC) Software Portal Child Nodes
+  - Example LAC SWP Node with Child Nodes
+
+==============================================================================
+Interlaken Look-Aside Controller (LAC) Node
+
+DESCRIPTION
+
+The Interlaken is a narrow, high speed channelized chip-to-chip interface. To
+facilitate interoperability between a data path device and a look-aside
+co-processor, the Interlaken Look-Aside protocol is defined for short
+transaction-related transfers. Although based on the Interlaken protocol,
+Interlaken Look-Aside is not directly compatible with Interlaken and can be
+considered a different operation mode.
+
+The Interlaken LA controller connects internal platform to Interlaken serial
+interface. It accepts LA command through software portals, which are system
+memory mapped 4KB spaces. The LA commands are then translated into the
+Interlaken control words and data words, which are sent on TX side to TCAM
+through SerDes lanes.
+
+There are two 4KiB spaces defined within the LAC global register memory map.
+There is a full register set at 0x0000-0x0FFF (also known as the "hypervisor"
+version), and a subset at 0x1000-0x1FFF.  The former is a superset of the
+latter, and includes certain registers that should not be accessible to
+partitioned software.  Separate nodes are used for each region, with a phandle
+linking the hypervisor node to the normal operating node.
+
+PROPERTIES
+
+  - compatible
+	Usage: required
+	Value type: <string>
+	Definition: Must include "fsl,interlaken-lac". This represents only
+		those LAC CCSR registers not protected in partitioned
+		software. The version of the device is determined by the LAC
+		IP Block Revision Register (IPBRR0) at offset 0x0BF8.
+
+		Table of correspondences between IPBRR0 values and example
+		chips:
+			Value		Device
+			-----------	-------
+			0x02000100	T4240
+
+		The Hypervisor node has a different compatible. It must include
+		"fsl,interlaken-lac-hv". This node represents the protected
+		LAC register space and is required except inside a partition
+		where access to the hypervisor node is to be denied.
+
+  - fsl,non-hv-node
+	Usage: required in "fsl,interlaken-lac-hv"
+	Value type: <phandle>
+	Definition: Points to the non-protected LAC CCSR mapped register space
+		node.
+
+  - reg
+	Usage: required
+	Value type: <prop-encoded-array>
+	Definition: A standard property. The first resource represents the
+		Interlaken LAC configuration registers.
+
+  - interrupts:
+	Usage: required in non-hv node only
+	Value type: <prop-encoded-array>
+	Definition: Interrupt mapping for Interlaken LAC error IRQ.
+
+EXAMPLE
+	lac: lac@229000 {
+		compatible = "fsl,interlaken-lac"
+		reg = <0x229000 0x1000>;
+		interrupts = <16 2 1 18>;
+	};
+
+	lac-hv@228000 {
+		compatible = "fsl,interlaken-lac-hv"
+		reg = <0x228000 0x1000>;
+		fsl,non-hv-node = <&lac>;
+	};
+
+===============================================================================
+Interlaken Look-Aside Controller (LAC) Software Portal Container Node
+
+DESCRIPTION
+The Interlaken Look-Aside Controller (LAC) utilizes Software Portals to accept
+Interlaken Look-Aside (ILA) commands. The Interlaken LAC software portal
+memory map occupies 128KB of memory space. The software portal memory space is
+intended to be cache-enabled. WIMG for each software space is required to be
+0010 if stashing is enabled; otherwise, WIMG can be 0000 or 0010.
+
+PROPERTIES
+
+  - #address-cells
+	Usage: required
+	Value type: <u32>
+	Definition: A standard property. Must have a value of 1.
+
+  - #size-cells
+	Usage: required
+	Value type: <u32>
+	Definition: A standard property. Must have a value of 1.
+
+  - compatible
+	Usage: required
+	Value type: <string>
+	Definition: Must include "fsl,interlaken-lac-portals"
+
+  - ranges
+	Usage: required
+	Value type: <prop-encoded-array>
+	Definition: A standard property. Specifies the address and length
+		of the LAC portal memory space.
+
+===============================================================================
+Interlaken Look-Aside Controller (LAC) Software Portals Child Nodes
+
+DESCRIPTION
+There are up to 24 available software portals with each software portal
+requiring 4KB of consecutive memory within the software portal memory mapped
+space.
+
+PROPERTIES
+
+  - compatible
+	Usage: required
+	Value type: <string>
+	Definition: Must include "fsl,interlaken-lac-portal-vX.Y" where X is
+		the Major version (IP_MJ) found in the LAC IP Block Revision
+		Register (IPBRR0), at offset 0x0BF8, and Y is the Minor version
+		(IP_MN).
+
+		Table of correspondences between version values and example chips:
+		    Value	Device
+		    ------	-------
+		      1.0	T4240
+
+  - reg
+	Usage: required
+	Value type: <prop-encoded-array>
+	Definition: A standard property.  The first resource represents the
+		Interlaken LAC software portal registers.
+
+  - fsl,liodn
+	Value type: <u32>
+	Definition: The logical I/O device number (LIODN) for this device.  The
+		LIODN is a number expressed by this device and used to perform
+		look-ups in the IOMMU (PAMU) address table when performing
+		DMAs. This property is automatically added by u-boot.
+
+===============================================================================
+EXAMPLE
+
+lac-portals {
+	#address-cells = <0x1>;
+	#size-cells = <0x1>;
+	compatible = "fsl,interlaken-lac-portals";
+	ranges = <0x0 0xf 0xf4400000 0x20000>;
+
+	lportal0: lac-portal@0 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x204>;
+		reg = <0x0 0x1000>;
+	};
+
+	lportal1: lac-portal@1000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x205>;
+		reg = <0x1000 0x1000>;
+	};
+
+	lportal2: lac-portal@2000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x206>;
+		reg = <0x2000 0x1000>;
+	};
+
+	lportal3: lac-portal@3000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x207>;
+		reg = <0x3000 0x1000>;
+	};
+
+	lportal4: lac-portal@4000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x208>;
+		reg = <0x4000 0x1000>;
+	};
+
+	lportal5: lac-portal@5000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x209>;
+		reg = <0x5000 0x1000>;
+	};
+
+	lportal6: lac-portal@6000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x20A>;
+		reg = <0x6000 0x1000>;
+	};
+
+	lportal7: lac-portal@7000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x20B>;
+		reg = <0x7000 0x1000>;
+	};
+
+	lportal8: lac-portal@8000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x20C>;
+		reg = <0x8000 0x1000>;
+	};
+
+	lportal9: lac-portal@9000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x20D>;
+		reg = <0x9000 0x1000>;
+	};
+
+	lportal10: lac-portal@A000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x20E>;
+		reg = <0xA000 0x1000>;
+	};
+
+	lportal11: lac-portal@B000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x20F>;
+		reg = <0xB000 0x1000>;
+	};
+
+	lportal12: lac-portal@C000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x210>;
+		reg = <0xC000 0x1000>;
+	};
+
+	lportal13: lac-portal@D000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x211>;
+		reg = <0xD000 0x1000>;
+	};
+
+	lportal14: lac-portal@E000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x212>;
+		reg = <0xE000 0x1000>;
+	};
+
+	lportal15: lac-portal@F000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x213>;
+		reg = <0xF000 0x1000>;
+	};
+
+	lportal16: lac-portal@10000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x214>;
+		reg = <0x10000 0x1000>;
+	};
+
+	lportal17: lac-portal@11000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x215>;
+		reg = <0x11000 0x1000>;
+	};
+
+	lportal8: lac-portal@1200 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x216>;
+		reg = <0x12000 0x1000>;
+	};
+
+	lportal19: lac-portal@13000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x217>;
+		reg = <0x13000 0x1000>;
+	};
+
+	lportal20: lac-portal@14000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x218>;
+		reg = <0x14000 0x1000>;
+	};
+
+	lportal21: lac-portal@15000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x219>;
+		reg = <0x15000 0x1000>;
+	};
+
+	lportal22: lac-portal@16000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x21A>;
+		reg = <0x16000 0x1000>;
+	};
+
+	lportal23: lac-portal@17000 {
+		compatible = "fsl,interlaken-lac-portal-v1.0";
+		fsl,liodn = <0x21B>;
+		reg = <0x17000 0x1000>;
+	};
+};
diff --git a/arch/powerpc/boot/dts/fsl/interlaken-lac-portals.dtsi b/arch/powerpc/boot/dts/fsl/interlaken-lac-portals.dtsi
new file mode 100644
index 0000000..9cffccf
--- /dev/null
+++ b/arch/powerpc/boot/dts/fsl/interlaken-lac-portals.dtsi
@@ -0,0 +1,156 @@
+/* T4240 Interlaken LAC Portal device tree stub with 24 portals.
+ *
+ * Copyright 2012 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor "AS IS" AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#address-cells = <0x1>;
+#size-cells = <0x1>;
+compatible = "fsl,interlaken-lac-portals";
+
+lportal0: lac-portal@0 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x0 0x1000>;
+};
+
+lportal1: lac-portal@1000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x1000 0x1000>;
+};
+
+lportal2: lac-portal@2000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x2000 0x1000>;
+};
+
+lportal3: lac-portal@3000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x3000 0x1000>;
+};
+
+lportal4: lac-portal@4000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x4000 0x1000>;
+};
+
+lportal5: lac-portal@5000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x5000 0x1000>;
+};
+
+lportal6: lac-portal@6000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x6000 0x1000>;
+};
+
+lportal7: lac-portal@7000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x7000 0x1000>;
+};
+
+lportal8: lac-portal@8000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x8000 0x1000>;
+};
+
+lportal9: lac-portal@9000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x9000 0x1000>;
+};
+
+lportal10: lac-portal@A000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0xA000 0x1000>;
+};
+
+lportal11: lac-portal@B000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0xB000 0x1000>;
+};
+
+lportal12: lac-portal@C000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0xC000 0x1000>;
+};
+
+lportal13: lac-portal@D000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0xD000 0x1000>;
+};
+
+lportal14: lac-portal@E000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0xE000 0x1000>;
+};
+
+lportal15: lac-portal@F000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0xF000 0x1000>;
+};
+
+lportal16: lac-portal@10000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x10000 0x1000>;
+};
+
+lportal17: lac-portal@11000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x11000 0x1000>;
+};
+
+lportal18: lac-portal@1200 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x12000 0x1000>;
+};
+
+lportal19: lac-portal@13000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x13000 0x1000>;
+};
+
+lportal20: lac-portal@14000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x14000 0x1000>;
+};
+
+lportal21: lac-portal@15000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x15000 0x1000>;
+};
+
+lportal22: lac-portal@16000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x16000 0x1000>;
+};
+
+lportal23: lac-portal@17000 {
+	compatible = "fsl,interlaken-lac-portal-v1.0";
+	reg = <0x17000 0x1000>;
+};
diff --git a/arch/powerpc/boot/dts/fsl/interlaken-lac.dtsi b/arch/powerpc/boot/dts/fsl/interlaken-lac.dtsi
new file mode 100644
index 0000000..e820872
--- /dev/null
+++ b/arch/powerpc/boot/dts/fsl/interlaken-lac.dtsi
@@ -0,0 +1,45 @@
+/*
+ * T4 Interlaken Look-aside Controller (LAC) device tree stub
+ *
+ * Copyright 2012 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor "AS IS" AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+lac: lac@229000 {
+	compatible = "fsl,interlaken-lac";
+	reg = <0x229000 0x1000>;
+	interrupts = <16 2 1 18>;
+};
+
+lac-hv@228000 {
+	compatible = "fsl,interlaken-lac-hv";
+	reg = <0x228000 0x1000>;
+	fsl,non-hv-node = <&lac>;
+};
-- 
cgit v0.10.2


From 6d18c9042c037f8600c9648260f76eae5170c0c3 Mon Sep 17 00:00:00 2001
From: Gerhard Sittig <gsi@denx.de>
Date: Tue, 18 Jun 2013 13:17:21 +0200
Subject: powerpc/mpc512x: commit re-generated defconfig

This patch does not change the content, it merely re-orders
configuration items and drops explicit options which already
apply as the default.

Signed-off-by: Gerhard Sittig <gsi@denx.de>
Signed-off-by: Anatolij Gustschin <agust@denx.de>

diff --git a/arch/powerpc/configs/mpc512x_defconfig b/arch/powerpc/configs/mpc512x_defconfig
index 0d0d981..5b8ee80 100644
--- a/arch/powerpc/configs/mpc512x_defconfig
+++ b/arch/powerpc/configs/mpc512x_defconfig
@@ -1,7 +1,6 @@
-CONFIG_EXPERIMENTAL=y
 # CONFIG_SWAP is not set
 CONFIG_SYSVIPC=y
-CONFIG_SPARSE_IRQ=y
+CONFIG_NO_HZ=y
 CONFIG_LOG_BUF_SHIFT=16
 CONFIG_BLK_DEV_INITRD=y
 # CONFIG_COMPAT_BRK is not set
@@ -9,6 +8,7 @@ CONFIG_SLAB=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 # CONFIG_BLK_DEV_BSG is not set
+CONFIG_PARTITION_ADVANCED=y
 # CONFIG_IOSCHED_CFQ is not set
 # CONFIG_PPC_CHRP is not set
 CONFIG_PPC_MPC512x=y
@@ -16,9 +16,7 @@ CONFIG_MPC5121_ADS=y
 CONFIG_MPC512x_GENERIC=y
 CONFIG_PDM360NG=y
 # CONFIG_PPC_PMAC is not set
-CONFIG_NO_HZ=y
 CONFIG_HZ_1000=y
-# CONFIG_MIGRATION is not set
 # CONFIG_SECCOMP is not set
 # CONFIG_PCI is not set
 CONFIG_NET=y
@@ -33,8 +31,6 @@ CONFIG_IP_PNP=y
 # CONFIG_INET_DIAG is not set
 # CONFIG_IPV6 is not set
 CONFIG_CAN=y
-CONFIG_CAN_RAW=y
-CONFIG_CAN_BCM=y
 CONFIG_CAN_VCAN=y
 CONFIG_CAN_MSCAN=y
 CONFIG_CAN_DEBUG_DEVICES=y
@@ -46,7 +42,6 @@ CONFIG_DEVTMPFS_MOUNT=y
 # CONFIG_FIRMWARE_IN_KERNEL is not set
 CONFIG_MTD=y
 CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_CHAR=y
 CONFIG_MTD_BLOCK=y
 CONFIG_MTD_CFI=y
 CONFIG_MTD_CFI_AMDSTD=y
@@ -60,7 +55,6 @@ CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_COUNT=1
 CONFIG_BLK_DEV_RAM_SIZE=8192
 CONFIG_BLK_DEV_XIP=y
-CONFIG_MISC_DEVICES=y
 CONFIG_EEPROM_AT24=y
 CONFIG_EEPROM_AT25=y
 CONFIG_SCSI=y
@@ -68,6 +62,7 @@ CONFIG_SCSI=y
 CONFIG_BLK_DEV_SD=y
 CONFIG_CHR_DEV_SG=y
 CONFIG_NETDEVICES=y
+CONFIG_FS_ENET=y
 CONFIG_MARVELL_PHY=y
 CONFIG_DAVICOM_PHY=y
 CONFIG_QSEMI_PHY=y
@@ -83,10 +78,6 @@ CONFIG_STE10XP=y
 CONFIG_LSI_ET1011C_PHY=y
 CONFIG_FIXED_PHY=y
 CONFIG_MDIO_BITBANG=y
-CONFIG_NET_ETHERNET=y
-CONFIG_FS_ENET=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
 # CONFIG_WLAN is not set
 # CONFIG_INPUT_MOUSEDEV_PSAUX is not set
 CONFIG_INPUT_EVDEV=y
@@ -106,10 +97,7 @@ CONFIG_GPIO_SYSFS=y
 CONFIG_GPIO_MPC8XXX=y
 # CONFIG_HWMON is not set
 CONFIG_MEDIA_SUPPORT=y
-CONFIG_VIDEO_DEV=y
 CONFIG_VIDEO_ADV_DEBUG=y
-# CONFIG_VIDEO_HELPER_CHIPS_AUTO is not set
-CONFIG_VIDEO_SAA711X=y
 CONFIG_FB=y
 CONFIG_FB_FSL_DIU=y
 # CONFIG_VGA_CONSOLE is not set
@@ -129,9 +117,7 @@ CONFIG_TMPFS=y
 CONFIG_JFFS2_FS=y
 CONFIG_UBIFS_FS=y
 CONFIG_NFS_FS=y
-CONFIG_NFS_V3=y
 CONFIG_ROOT_NFS=y
-CONFIG_PARTITION_ADVANCED=y
 CONFIG_NLS_CODEPAGE_437=y
 CONFIG_NLS_ISO8859_1=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
-- 
cgit v0.10.2


From dd0120dea601de654a5a6d959da3a632a02200f0 Mon Sep 17 00:00:00 2001
From: Anatolij Gustschin <agust@denx.de>
Date: Tue, 25 Jun 2013 08:51:19 +0200
Subject: powerpc/mpc512x: enable USB support in defconfig

Enable USB EHCI, mass storage and USB gadget support.

Signed-off-by: Anatolij Gustschin <agust@denx.de>

diff --git a/arch/powerpc/configs/mpc512x_defconfig b/arch/powerpc/configs/mpc512x_defconfig
index 5b8ee80..ee853a1 100644
--- a/arch/powerpc/configs/mpc512x_defconfig
+++ b/arch/powerpc/configs/mpc512x_defconfig
@@ -102,6 +102,13 @@ CONFIG_FB=y
 CONFIG_FB_FSL_DIU=y
 # CONFIG_VGA_CONSOLE is not set
 CONFIG_FRAMEBUFFER_CONSOLE=y
+CONFIG_USB=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_EHCI_FSL=y
+# CONFIG_USB_EHCI_HCD_PPC_OF is not set
+CONFIG_USB_STORAGE=y
+CONFIG_USB_GADGET=y
+CONFIG_USB_FSL_USB2=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_M41T80=y
 CONFIG_RTC_DRV_MPC5121=y
-- 
cgit v0.10.2


From e7f345a2a36754e43673e86870a7a89101fb13e3 Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@crashcourse.ca>
Date: Sat, 22 Jun 2013 04:29:48 -0400
Subject: macintosh/adb: Replace __WAITQUEUE_INITIALIZER with more standard
 DECLARE_WAITQUEUE.

Signed-off-by: Robert P. J. Day <rpjday@crashcourse.ca>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c
index b026896..04a5049 100644
--- a/drivers/macintosh/adb.c
+++ b/drivers/macintosh/adb.c
@@ -697,7 +697,7 @@ static ssize_t adb_read(struct file *file, char __user *buf,
 	int ret = 0;
 	struct adbdev_state *state = file->private_data;
 	struct adb_request *req;
-	wait_queue_t wait = __WAITQUEUE_INITIALIZER(wait,current);
+	DECLARE_WAITQUEUE(wait,current);
 	unsigned long flags;
 
 	if (count < 2)
-- 
cgit v0.10.2


From b0b0aa9c7faf94e92320eabd8a1786c7747e40a8 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Mon, 24 Jun 2013 15:47:22 +1000
Subject: powerpc/hw_brk: Fix setting of length for exact mode breakpoints

The smallest match region for both the DABR and DAWR is 8 bytes, so the
kernel needs to filter matches when users want to look at regions smaller than
this.

Currently we set the length of PPC_BREAKPOINT_MODE_EXACT breakpoints to 8.
This is wrong as in exact mode we should only match on 1 address, hence the
length should be 1.

This ensures that the kernel will filter out any exact mode hardware breakpoint
matches on any addresses other than the requested one.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Reported-by: Edjunior Barbosa Machado <emachado@linux.vnet.ibm.com>
Cc: stable@vger.kernel.org
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 98c2fc1..64f7bd5 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -1449,7 +1449,9 @@ static long ppc_set_hwdebug(struct task_struct *child,
 	 */
 	if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) {
 		len = bp_info->addr2 - bp_info->addr;
-	} else if (bp_info->addr_mode != PPC_BREAKPOINT_MODE_EXACT) {
+	} else if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT)
+		len = 1;
+	else {
 		ptrace_put_breakpoints(child);
 		return -EINVAL;
 	}
-- 
cgit v0.10.2


From 540e07c67efe42ef6b6be4f1956931e676d58a15 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Mon, 24 Jun 2013 15:47:23 +1000
Subject: powerpc/hw_brk: Fix clearing of extraneous IRQ

In 9422de3 "powerpc: Hardware breakpoints rewrite to handle non DABR breakpoint
registers" we changed the way we mark extraneous irqs with this:

-	info->extraneous_interrupt = !((bp->attr.bp_addr <= dar) &&
-			(dar - bp->attr.bp_addr < bp->attr.bp_len));
+	if (!((bp->attr.bp_addr <= dar) &&
+	      (dar - bp->attr.bp_addr < bp->attr.bp_len)))
+		info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ;

Unfortunately this is bogus as it never clears extraneous IRQ if it's already
set.

This correctly clears extraneous IRQ before possibly setting it.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Reported-by: Edjunior Barbosa Machado <emachado@linux.vnet.ibm.com>
Cc: stable@vger.kernel.org
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index a949bdf..1150ae7 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -250,6 +250,7 @@ int __kprobes hw_breakpoint_handler(struct die_args *args)
 	 * we still need to single-step the instruction, but we don't
 	 * generate an event.
 	 */
+	info->type &= ~HW_BRK_TYPE_EXTRANEOUS_IRQ;
 	if (!((bp->attr.bp_addr <= dar) &&
 	      (dar - bp->attr.bp_addr < bp->attr.bp_len)))
 		info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ;
-- 
cgit v0.10.2


From 99b308e3bbacc26509858ac78ebc7a0a1dfbb888 Mon Sep 17 00:00:00 2001
From: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Date: Mon, 24 Jun 2013 12:23:00 +0530
Subject: powerpc/pseries: Enable PSTORE in pseries_defconfig

Since now we have pstore support for nvram in pseries, enable it
in the default config. With this config option enabled, pstore
infra-structure will be used to read/write the messages from/to nvram.

Signed-off-by: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Acked-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index c4dfbaf..bea8587 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -296,6 +296,7 @@ CONFIG_SQUASHFS=m
 CONFIG_SQUASHFS_XATTR=y
 CONFIG_SQUASHFS_LZO=y
 CONFIG_SQUASHFS_XZ=y
+CONFIG_PSTORE=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3_ACL=y
 CONFIG_NFS_V4=y
-- 
cgit v0.10.2


From ff1e7683418798e44eb8af1ab9f28dd475af6034 Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Mon, 24 Jun 2013 09:35:55 -0500
Subject: powerpc/mm: Fix build warnings with CONFIG_TRANSPARENT_HUGEPAGE
 disabled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Building with CONFIG_TRANSPARENT_HUGEPAGE disabled causes the following
build wearnings;

powerpc/arch/powerpc/include/asm/mmu-hash64.h: In function ‘__hash_page_thp’:
powerpc/arch/powerpc/include/asm/mmu-hash64.h:354: warning: no return statement in function returning non-void

This patch adds a return -1 to the static inline for __hash_page_thp()
to correct the warnings.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index 3d6fbb0..c4cf011 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -351,6 +351,7 @@ static inline int __hash_page_thp(unsigned long ea, unsigned long access,
 				  int ssize, unsigned int psize)
 {
 	BUG();
+	return -1;
 }
 #endif
 extern void hash_failure_debug(unsigned long ea, unsigned long access,
-- 
cgit v0.10.2


From ef6a28577398df2853abf123cb4a2e0c57eb879a Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Tue, 25 Jun 2013 14:35:27 +0800
Subject: powerpc/eeh: Remove eeh_mutex

Originally, eeh_mutex was introduced to protect the PE hierarchy
tree and the attached EEH devices because EEH core was possiblly
running with multiple threads to access the PE hierarchy tree.
However, we now have only one kthread in EEH core. So we needn't
the eeh_mutex and just remove it.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index a0b11fb..dd65e31 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -151,7 +151,6 @@ struct eeh_ops {
 
 extern struct eeh_ops *eeh_ops;
 extern int eeh_subsystem_enabled;
-extern struct mutex eeh_mutex;
 extern raw_spinlock_t confirm_error_lock;
 extern int eeh_probe_mode;
 
@@ -173,16 +172,6 @@ static inline int eeh_probe_mode_dev(void)
 	return (eeh_probe_mode == EEH_PROBE_MODE_DEV);
 }
 
-static inline void eeh_lock(void)
-{
-	mutex_lock(&eeh_mutex);
-}
-
-static inline void eeh_unlock(void)
-{
-	mutex_unlock(&eeh_mutex);
-}
-
 static inline void eeh_serialize_lock(unsigned long *flags)
 {
 	raw_spin_lock_irqsave(&confirm_error_lock, *flags);
@@ -271,9 +260,6 @@ static inline void eeh_add_sysfs_files(struct pci_bus *bus) { }
 
 static inline void eeh_remove_bus_device(struct pci_dev *dev, int purge_pe) { }
 
-static inline void eeh_lock(void) { }
-static inline void eeh_unlock(void) { }
-
 #define EEH_POSSIBLE_ERROR(val, type) (0)
 #define EEH_IO_ERROR_VALUE(size) (-1UL)
 #endif /* CONFIG_EEH */
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index af2b9ae..e579652 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -103,9 +103,6 @@ EXPORT_SYMBOL(eeh_subsystem_enabled);
  */
 int eeh_probe_mode;
 
-/* Global EEH mutex */
-DEFINE_MUTEX(eeh_mutex);
-
 /* Lock to avoid races due to multiple reports of an error */
 DEFINE_RAW_SPINLOCK(confirm_error_lock);
 
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index ae75722..55943fc 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -78,9 +78,7 @@ int eeh_phb_pe_create(struct pci_controller *phb)
 	}
 
 	/* Put it into the list */
-	eeh_lock();
 	list_add_tail(&pe->child, &eeh_phb_pe);
-	eeh_unlock();
 
 	pr_debug("EEH: Add PE for PHB#%d\n", phb->global_number);
 
@@ -185,21 +183,15 @@ void *eeh_pe_dev_traverse(struct eeh_pe *root,
 		return NULL;
 	}
 
-	eeh_lock();
-
 	/* Traverse root PE */
 	for (pe = root; pe; pe = eeh_pe_next(pe, root)) {
 		eeh_pe_for_each_dev(pe, edev) {
 			ret = fn(edev, flag);
-			if (ret) {
-				eeh_unlock();
+			if (ret)
 				return ret;
-			}
 		}
 	}
 
-	eeh_unlock();
-
 	return NULL;
 }
 
@@ -305,8 +297,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 {
 	struct eeh_pe *pe, *parent;
 
-	eeh_lock();
-
 	/*
 	 * Search the PE has been existing or not according
 	 * to the PE address. If that has been existing, the
@@ -316,7 +306,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 	pe = eeh_pe_get(edev);
 	if (pe && !(pe->type & EEH_PE_INVALID)) {
 		if (!edev->pe_config_addr) {
-			eeh_unlock();
 			pr_err("%s: PE with addr 0x%x already exists\n",
 				__func__, edev->config_addr);
 			return -EEXIST;
@@ -328,7 +317,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 
 		/* Put the edev to PE */
 		list_add_tail(&edev->list, &pe->edevs);
-		eeh_unlock();
 		pr_debug("EEH: Add %s to Bus PE#%x\n",
 			edev->dn->full_name, pe->addr);
 
@@ -347,7 +335,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 			parent->type &= ~EEH_PE_INVALID;
 			parent = parent->parent;
 		}
-		eeh_unlock();
 		pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n",
 			edev->dn->full_name, pe->addr, pe->parent->addr);
 
@@ -357,7 +344,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 	/* Create a new EEH PE */
 	pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
 	if (!pe) {
-		eeh_unlock();
 		pr_err("%s: out of memory!\n", __func__);
 		return -ENOMEM;
 	}
@@ -385,7 +371,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 	if (!parent) {
 		parent = eeh_phb_pe_get(edev->phb);
 		if (!parent) {
-			eeh_unlock();
 			pr_err("%s: No PHB PE is found (PHB Domain=%d)\n",
 				__func__, edev->phb->global_number);
 			edev->pe = NULL;
@@ -402,7 +387,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 	list_add_tail(&pe->child, &parent->child_list);
 	list_add_tail(&edev->list, &pe->edevs);
 	edev->pe = pe;
-	eeh_unlock();
 	pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n",
 		edev->dn->full_name, pe->addr, pe->parent->addr);
 
@@ -430,8 +414,6 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe)
 		return -EEXIST;
 	}
 
-	eeh_lock();
-
 	/* Remove the EEH device */
 	pe = edev->pe;
 	edev->pe = NULL;
@@ -476,8 +458,6 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe)
 		pe = parent;
 	}
 
-	eeh_unlock();
-
 	return 0;
 }
 
@@ -550,9 +530,7 @@ static void *__eeh_pe_state_mark(void *data, void *flag)
  */
 void eeh_pe_state_mark(struct eeh_pe *pe, int state)
 {
-	eeh_lock();
 	eeh_pe_traverse(pe, __eeh_pe_state_mark, &state);
-	eeh_unlock();
 }
 
 /**
@@ -586,9 +564,7 @@ static void *__eeh_pe_state_clear(void *data, void *flag)
  */
 void eeh_pe_state_clear(struct eeh_pe *pe, int state)
 {
-	eeh_lock();
 	eeh_pe_traverse(pe, __eeh_pe_state_clear, &state);
-	eeh_unlock();
 }
 
 /**
@@ -673,8 +649,6 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
 	struct eeh_dev *edev;
 	struct pci_dev *pdev;
 
-	eeh_lock();
-
 	if (pe->type & EEH_PE_PHB) {
 		bus = pe->phb->bus;
 	} else if (pe->type & EEH_PE_BUS ||
@@ -691,7 +665,5 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
 	}
 
 out:
-	eeh_unlock();
-
 	return bus;
 }
-- 
cgit v0.10.2


From 5459ae1431f5d22ab10fa8b56fb16c018289fdfc Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Tue, 25 Jun 2013 14:35:28 +0800
Subject: powerpc/eeh: Use interruptible sleep in keehd

To replace down() with down_interrutible() to avoid following
warning:

[c00000007ba7b710] [c000000000014410] .__switch_to+0x1b0/0x380
[c00000007ba7b7c0] [c0000000007b408c] .__schedule+0x3ec/0x970
[c00000007ba7ba50] [c0000000007b1f24] .schedule_timeout+0x1a4/0x2b0
[c00000007ba7bb30] [c0000000007b34a4] .__down+0xa4/0x104
[c00000007ba7bbf0] [c0000000000b9230] .down+0x60/0x70
[c00000007ba7bc80] [c0000000000336d0] .eeh_event_handler+0x70/0x190
[c00000007ba7bd30] [c0000000000b1a58] .kthread+0xe8/0xf0
[c00000007ba7be30] [c00000000000a05c] .ret_from_kernel_thread+0x5c/0x8

This also avoids keeping the load average up while doing nothing.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
index 39bcd81..d27c5af 100644
--- a/arch/powerpc/kernel/eeh_event.c
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -55,7 +55,8 @@ static int eeh_event_handler(void * dummy)
 	struct eeh_pe *pe;
 
 	while (!kthread_should_stop()) {
-		down(&eeh_eventlist_sem);
+		if (down_interruptible(&eeh_eventlist_sem))
+			break;
 
 		/* Fetch EEH event from the queue */
 		spin_lock_irqsave(&eeh_eventlist_lock, flags);
-- 
cgit v0.10.2


From 98c7355fb373d7c29e5c45d0a423810ad2476b34 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Mon, 8 Oct 2012 20:39:52 +0800
Subject: powerpc/83xx: use module_i2c_driver to simplify the code

Use the module_i2c_driver() macro to make the code smaller
and a bit simpler.

dpatch engine is used to auto generate this patch.
(https://github.com/weiyj/dpatch)

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
index 624cb51..7bc3158 100644
--- a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
+++ b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
@@ -231,17 +231,7 @@ static struct i2c_driver mcu_driver = {
 	.id_table = mcu_ids,
 };
 
-static int __init mcu_init(void)
-{
-	return i2c_add_driver(&mcu_driver);
-}
-module_init(mcu_init);
-
-static void __exit mcu_exit(void)
-{
-	i2c_del_driver(&mcu_driver);
-}
-module_exit(mcu_exit);
+module_i2c_driver(mcu_driver);
 
 MODULE_DESCRIPTION("Power Management and GPIO expander driver for "
 		   "MPC8349E-mITX-compatible MCU");
-- 
cgit v0.10.2


From a16d8aa4726a944ffc1616689ae34ff6a902faba Mon Sep 17 00:00:00 2001
From: Sachin Surendran <sachin.surendran@alliedtelesis.co.nz>
Date: Mon, 26 Nov 2012 11:20:01 +1300
Subject: i2c-cpm: Fix to takeback i2c bus master-ship after a collision

In case of collision on i2c bus the controller which lost bus mastership
stays as a slave for all subsequent transfers. This results in the i2c
controller never writing to the bus for future transactions, resulting
in i2c transfer timeouts.
  This fix checks for a collision on last I2C transaction and sets the
I2COM_MASTER bit for the new transaction.

Signed-off-by: Sachin Surendran <sachin.surendran@alliedtelesis.co.nz>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/drivers/i2c/busses/i2c-cpm.c b/drivers/i2c/busses/i2c-cpm.c
index 3823623..9e60021 100644
--- a/drivers/i2c/busses/i2c-cpm.c
+++ b/drivers/i2c/busses/i2c-cpm.c
@@ -338,6 +338,14 @@ static int cpm_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num)
 	tptr = 0;
 	rptr = 0;
 
+	/*
+	 * If there was a collision in the last i2c transaction,
+	 * Set I2COM_MASTER as it was cleared during collision.
+	 */
+	if (in_be16(&tbdf->cbd_sc) & BD_SC_CL) {
+		out_8(&cpm->i2c_reg->i2com, I2COM_MASTER);
+	}
+
 	while (tptr < num) {
 		pmsg = &msgs[tptr];
 		dev_dbg(&adap->dev, "R: %d T: %d\n", rptr, tptr);
-- 
cgit v0.10.2


From 3766a1abc53164f058d74f950599c797cb3fe302 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Tue, 25 Jun 2013 14:15:36 -0700
Subject: mm/thp: define HPAGE_PMD_* constants as BUILD_BUG() if !THP

Currently, HPAGE_PMD_* constans rely on PMD_SHIFT regardless of
CONFIG_TRANSPARENT_HUGEPAGE.  PMD_SHIFT is not defined everywhere (e.g.
arm nommu case).

It means we can't use anything like this in generic code:

        if (PageTransHuge(page))
                zero_huge_user(page, 0, HPAGE_PMD_SIZE);
        else
                clear_highpage(page);

For !THP case, PageTransHuge() is 0 and compiler can eliminate
zero_huge_user() call.  But it still need to be valid C expression, means
HPAGE_PMD_SIZE has to expand to something compiler can understand.

Previously, HPAGE_PMD_* were defined to BUILD_BUG() for !THP. Let's come
back to it.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index cc276d2..e2dbefb 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -58,11 +58,12 @@ extern pmd_t *page_check_address_pmd(struct page *page,
 
 #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define HPAGE_PMD_SHIFT PMD_SHIFT
 #define HPAGE_PMD_SIZE	((1UL) << HPAGE_PMD_SHIFT)
 #define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
 
 #define transparent_hugepage_enabled(__vma)				\
@@ -180,6 +181,9 @@ extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vm
 				unsigned long addr, pmd_t pmd, pmd_t *pmdp);
 
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
+#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
+#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
+#define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; })
 
 #define hpage_nr_pages(x) 1
 
-- 
cgit v0.10.2


From 3978bdb4ed653342b0be66c031bf61b72cc55d60 Mon Sep 17 00:00:00 2001
From: Tudor Laurentiu <laurentiu.tudor@freescale.com>
Date: Tue, 5 Mar 2013 17:52:49 +0200
Subject: powerpc/watchdog: Don't enable interrupt on PPC64 BookE

Critical interrupts are not handled on PPC64 BookE machines,
so when the first watchdog interrupt fires the machine will
freeze without a warning until it's rebooted by the second
watchdog trigger.
Plus, the interrupt isn't used anyway since the driver
expects a usermode app to ping the watchdog periodically.

Signed-off-by: Laurentiu Tudor <Laurentiu.Tudor@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/drivers/watchdog/booke_wdt.c b/drivers/watchdog/booke_wdt.c
index a8dbceb3..f1b8d55 100644
--- a/drivers/watchdog/booke_wdt.c
+++ b/drivers/watchdog/booke_wdt.c
@@ -138,6 +138,14 @@ static void __booke_wdt_enable(void *data)
 	val &= ~WDTP_MASK;
 	val |= (TCR_WIE|TCR_WRC(WRC_CHIP)|WDTP(booke_wdt_period));
 
+#ifdef CONFIG_PPC_BOOK3E_64
+	/*
+	 * Crit ints are currently broken on PPC64 Book-E, so
+	 * just disable them for now.
+	 */
+	val &= ~TCR_WIE;
+#endif
+
 	mtspr(SPRN_TCR, val);
 }
 
-- 
cgit v0.10.2


From e1b85c17bf3e4f2ecbf9ec824c4048a06078100b Mon Sep 17 00:00:00 2001
From: Sebastien Bessiere <sebastien.bessiere@gmail.com>
Date: Fri, 14 Jun 2013 17:57:03 +0200
Subject: trivial: powerpc: Fix typo in ioei_interrupt() description

Signed-off-by: Sebastien Bessiere <sebastien.bessiere@gmail.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/pseries/io_event_irq.c b/arch/powerpc/platforms/pseries/io_event_irq.c
index ef9d9d8..5ea88d1 100644
--- a/arch/powerpc/platforms/pseries/io_event_irq.c
+++ b/arch/powerpc/platforms/pseries/io_event_irq.c
@@ -115,7 +115,7 @@ static struct pseries_io_event * ioei_find_event(struct rtas_error_log *elog)
  *   by scope or event type alone. For example, Torrent ISR route change
  *   event is reported with scope 0x00 (Not Applicatable) rather than
  *   0x3B (Torrent-hub). It is better to let the clients to identify
- *   who owns the the event.
+ *   who owns the event.
  */
 
 static irqreturn_t ioei_interrupt(int irq, void *dev_id)
-- 
cgit v0.10.2


From 80aa0fb4940bf8ee52bcb574d74459a7aea45621 Mon Sep 17 00:00:00 2001
From: James Yang <James.Yang@freescale.com>
Date: Tue, 25 Jun 2013 11:41:05 -0500
Subject: powerpc: Fix string instr. emulation for 32-bit processes on ppc64

String instruction emulation would erroneously result in a segfault if
the upper bits of the EA are set and is so high that it fails access
check.  Truncate the EA to 32 bits if the process is 32-bit.

Signed-off-by: James Yang <James.Yang@freescale.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 071f6e0..300daf3 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -866,6 +866,10 @@ static int emulate_string_inst(struct pt_regs *regs, u32 instword)
 		u8 val;
 		u32 shift = 8 * (3 - (pos & 0x3));
 
+		/* if process is 32-bit, clear upper 32 bits of EA */
+		if ((regs->msr & MSR_64BIT) == 0)
+			EA &= 0xFFFFFFFF;
+
 		switch ((instword & PPC_INST_STRING_MASK)) {
 			case PPC_INST_LSWX:
 			case PPC_INST_LSWI:
-- 
cgit v0.10.2


From 090b9284d72561644d17a6e568d29c9e472c9865 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Fri, 28 Jun 2013 18:17:09 +1000
Subject: powerpc/tm: Clear MSR RI in non-recoverable TM code

When we treclaim and trecheckpoint there's an unavoidable period when r1
will not be a valid kernel stack pointer.

This patch clears the MSR recoverable interrupt (RI) bit over these
regions to indicate we have an invalid kernel stack pointer.

For treclaim, the region over which we clear MSR RI is larger than
required to avoid the need for an extra costly mtmsrd.

Thanks to Paulus for suggesting this change.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index 2da67e7..51be8fb 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -112,9 +112,18 @@ _GLOBAL(tm_reclaim)
 	std	r3, STACK_PARAM(0)(r1)
 	SAVE_NVGPRS(r1)
 
+	/* We need to setup MSR for VSX register save instructions.  Here we
+	 * also clear the MSR RI since when we do the treclaim, we won't have a
+	 * valid kernel pointer for a while.  We clear RI here as it avoids
+	 * adding another mtmsr closer to the treclaim.  This makes the region
+	 * maked as non-recoverable wider than it needs to be but it saves on
+	 * inserting another mtmsrd later.
+	 */
 	mfmsr	r14
 	mr	r15, r14
 	ori	r15, r15, MSR_FP
+	li	r16, MSR_RI
+	andc	r15, r15, r16
 	oris	r15, r15, MSR_VEC@h
 #ifdef CONFIG_VSX
 	BEGIN_FTR_SECTION
@@ -349,9 +358,10 @@ restore_gprs:
 	mtcr	r5
 	mtxer	r6
 
-	/* MSR and flags:  We don't change CRs, and we don't need to alter
-	 * MSR.
+	/* Clear the MSR RI since we are about to change R1.  EE is already off
 	 */
+	li	r4, 0
+	mtmsrd	r4, 1
 
 	REST_4GPRS(0, r7)			/* GPR0-3 */
 	REST_GPR(4, r7)				/* GPR4-6 */
@@ -377,6 +387,10 @@ restore_gprs:
 	GET_PACA(r13)
 	GET_SCRATCH0(r1)
 
+	/* R1 is restored, so we are recoverable again.  EE is still off */
+	li	r4, MSR_RI
+	mtmsrd	r4, 1
+
 	REST_NVGPRS(r1)
 
 	addi    r1, r1, TM_FRAME_SIZE
-- 
cgit v0.10.2


From c35ae1796bd4865bad322645a7edb92d223dfb51 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 27 Jun 2013 13:46:42 +0800
Subject: powerpc/eeh: Don't collect PCI-CFG data on PHB

When the PHB is fenced or dead, it's pointless to collect the data
from PCI config space of subordinate PCI devices since it should
return 0xFF's. The patch also fixes overwritten buffer while getting
PCI config data.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index e579652..416fb43 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -232,16 +232,30 @@ void eeh_slot_error_detail(struct eeh_pe *pe, int severity)
 {
 	size_t loglen = 0;
 	struct eeh_dev *edev;
+	bool valid_cfg_log = true;
 
-	eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
-	eeh_ops->configure_bridge(pe);
-	eeh_pe_restore_bars(pe);
-
-	pci_regs_buf[0] = 0;
-	eeh_pe_for_each_dev(pe, edev) {
-		loglen += eeh_gather_pci_data(edev, pci_regs_buf,
-				EEH_PCI_REGS_LOG_LEN);
-        }
+	/*
+	 * When the PHB is fenced or dead, it's pointless to collect
+	 * the data from PCI config space because it should return
+	 * 0xFF's. For ER, we still retrieve the data from the PCI
+	 * config space.
+	 */
+	if (eeh_probe_mode_dev() &&
+	    (pe->type & EEH_PE_PHB) &&
+	    (pe->state & (EEH_PE_ISOLATED | EEH_PE_PHB_DEAD)))
+		valid_cfg_log = false;
+
+	if (valid_cfg_log) {
+		eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
+		eeh_ops->configure_bridge(pe);
+		eeh_pe_restore_bars(pe);
+
+		pci_regs_buf[0] = 0;
+		eeh_pe_for_each_dev(pe, edev) {
+			loglen += eeh_gather_pci_data(edev, pci_regs_buf + loglen,
+						      EEH_PCI_REGS_LOG_LEN - loglen);
+		}
+	}
 
 	eeh_ops->get_log(pe, severity, pci_regs_buf, loglen);
 }
-- 
cgit v0.10.2


From 652defed48757ae0dcc851beeb8fdd484bad40c6 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 27 Jun 2013 13:46:43 +0800
Subject: powerpc/eeh: Check PCIe link after reset

After reset (e.g. complete reset) in order to bring the fenced PHB
back, the PCIe link might not be ready yet. The patch intends to
make sure the PCIe link is ready before accessing its subordinate
PCI devices. The patch also fixes that wrong values restored to
PCI_COMMAND register for PCI bridges.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 55943fc..016588a 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -22,6 +22,7 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  */
 
+#include <linux/delay.h>
 #include <linux/export.h>
 #include <linux/gfp.h>
 #include <linux/init.h>
@@ -567,30 +568,132 @@ void eeh_pe_state_clear(struct eeh_pe *pe, int state)
 	eeh_pe_traverse(pe, __eeh_pe_state_clear, &state);
 }
 
-/**
- * eeh_restore_one_device_bars - Restore the Base Address Registers for one device
- * @data: EEH device
- * @flag: Unused
+/*
+ * Some PCI bridges (e.g. PLX bridges) have primary/secondary
+ * buses assigned explicitly by firmware, and we probably have
+ * lost that after reset. So we have to delay the check until
+ * the PCI-CFG registers have been restored for the parent
+ * bridge.
  *
- * Loads the PCI configuration space base address registers,
- * the expansion ROM base address, the latency timer, and etc.
- * from the saved values in the device node.
+ * Don't use normal PCI-CFG accessors, which probably has been
+ * blocked on normal path during the stage. So we need utilize
+ * eeh operations, which is always permitted.
  */
-static void *eeh_restore_one_device_bars(void *data, void *flag)
+static void eeh_bridge_check_link(struct pci_dev *pdev,
+				  struct device_node *dn)
+{
+	int cap;
+	uint32_t val;
+	int timeout = 0;
+
+	/*
+	 * We only check root port and downstream ports of
+	 * PCIe switches
+	 */
+	if (!pci_is_pcie(pdev) ||
+	    (pci_pcie_type(pdev) != PCI_EXP_TYPE_ROOT_PORT &&
+	     pci_pcie_type(pdev) != PCI_EXP_TYPE_DOWNSTREAM))
+		return;
+
+	pr_debug("%s: Check PCIe link for %s ...\n",
+		 __func__, pci_name(pdev));
+
+	/* Check slot status */
+	cap = pdev->pcie_cap;
+	eeh_ops->read_config(dn, cap + PCI_EXP_SLTSTA, 2, &val);
+	if (!(val & PCI_EXP_SLTSTA_PDS)) {
+		pr_debug("  No card in the slot (0x%04x) !\n", val);
+		return;
+	}
+
+	/* Check power status if we have the capability */
+	eeh_ops->read_config(dn, cap + PCI_EXP_SLTCAP, 2, &val);
+	if (val & PCI_EXP_SLTCAP_PCP) {
+		eeh_ops->read_config(dn, cap + PCI_EXP_SLTCTL, 2, &val);
+		if (val & PCI_EXP_SLTCTL_PCC) {
+			pr_debug("  In power-off state, power it on ...\n");
+			val &= ~(PCI_EXP_SLTCTL_PCC | PCI_EXP_SLTCTL_PIC);
+			val |= (0x0100 & PCI_EXP_SLTCTL_PIC);
+			eeh_ops->write_config(dn, cap + PCI_EXP_SLTCTL, 2, val);
+			msleep(2 * 1000);
+		}
+	}
+
+	/* Enable link */
+	eeh_ops->read_config(dn, cap + PCI_EXP_LNKCTL, 2, &val);
+	val &= ~PCI_EXP_LNKCTL_LD;
+	eeh_ops->write_config(dn, cap + PCI_EXP_LNKCTL, 2, val);
+
+	/* Check link */
+	eeh_ops->read_config(dn, cap + PCI_EXP_LNKCAP, 4, &val);
+	if (!(val & PCI_EXP_LNKCAP_DLLLARC)) {
+		pr_debug("  No link reporting capability (0x%08x) \n", val);
+		msleep(1000);
+		return;
+	}
+
+	/* Wait the link is up until timeout (5s) */
+	timeout = 0;
+	while (timeout < 5000) {
+		msleep(20);
+		timeout += 20;
+
+		eeh_ops->read_config(dn, cap + PCI_EXP_LNKSTA, 2, &val);
+		if (val & PCI_EXP_LNKSTA_DLLLA)
+			break;
+	}
+
+	if (val & PCI_EXP_LNKSTA_DLLLA)
+		pr_debug("  Link up (%s)\n",
+			 (val & PCI_EXP_LNKSTA_CLS_2_5GB) ? "2.5GB" : "5GB");
+	else
+		pr_debug("  Link not ready (0x%04x)\n", val);
+}
+
+#define BYTE_SWAP(OFF)	(8*((OFF)/4)+3-(OFF))
+#define SAVED_BYTE(OFF)	(((u8 *)(edev->config_space))[BYTE_SWAP(OFF)])
+
+static void eeh_restore_bridge_bars(struct pci_dev *pdev,
+				    struct eeh_dev *edev,
+				    struct device_node *dn)
+{
+	int i;
+
+	/*
+	 * Device BARs: 0x10 - 0x18
+	 * Bus numbers and windows: 0x18 - 0x30
+	 */
+	for (i = 4; i < 13; i++)
+		eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]);
+	/* Rom: 0x38 */
+	eeh_ops->write_config(dn, 14*4, 4, edev->config_space[14]);
+
+	/* Cache line & Latency timer: 0xC 0xD */
+	eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1,
+                SAVED_BYTE(PCI_CACHE_LINE_SIZE));
+        eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1,
+                SAVED_BYTE(PCI_LATENCY_TIMER));
+	/* Max latency, min grant, interrupt ping and line: 0x3C */
+	eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]);
+
+	/* PCI Command: 0x4 */
+	eeh_ops->write_config(dn, PCI_COMMAND, 4, edev->config_space[1]);
+
+	/* Check the PCIe link is ready */
+	eeh_bridge_check_link(pdev, dn);
+}
+
+static void eeh_restore_device_bars(struct eeh_dev *edev,
+				    struct device_node *dn)
 {
 	int i;
 	u32 cmd;
-	struct eeh_dev *edev = (struct eeh_dev *)data;
-	struct device_node *dn = eeh_dev_to_of_node(edev);
 
 	for (i = 4; i < 10; i++)
 		eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]);
 	/* 12 == Expansion ROM Address */
 	eeh_ops->write_config(dn, 12*4, 4, edev->config_space[12]);
 
-#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF))
-#define SAVED_BYTE(OFF) (((u8 *)(edev->config_space))[BYTE_SWAP(OFF)])
-
 	eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1,
 		SAVED_BYTE(PCI_CACHE_LINE_SIZE));
 	eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1,
@@ -613,6 +716,34 @@ static void *eeh_restore_one_device_bars(void *data, void *flag)
 	else
 		cmd &= ~PCI_COMMAND_SERR;
 	eeh_ops->write_config(dn, PCI_COMMAND, 4, cmd);
+}
+
+/**
+ * eeh_restore_one_device_bars - Restore the Base Address Registers for one device
+ * @data: EEH device
+ * @flag: Unused
+ *
+ * Loads the PCI configuration space base address registers,
+ * the expansion ROM base address, the latency timer, and etc.
+ * from the saved values in the device node.
+ */
+static void *eeh_restore_one_device_bars(void *data, void *flag)
+{
+	struct pci_dev *pdev = NULL;
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct device_node *dn = eeh_dev_to_of_node(edev);
+
+	/* Trace the PCI bridge */
+	if (eeh_probe_mode_dev()) {
+		pdev = eeh_dev_to_pci_dev(edev);
+		if (pdev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
+                        pdev = NULL;
+        }
+
+	if (pdev)
+		eeh_restore_bridge_bars(pdev, edev, dn);
+	else
+		eeh_restore_device_bars(edev, dn);
 
 	return NULL;
 }
-- 
cgit v0.10.2


From 0b9e267d71d2e74d1108785928fd8c8c9dbf441e Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 27 Jun 2013 13:46:44 +0800
Subject: powerpc/powernv: Replace variables with flags

We have 2 fields in "struct pnv_phb" to trace the states. The patch
replace the fields with one and introduces flags for that. The patch
doesn't impact the logic.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index 84f3036..85025d7 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -132,7 +132,7 @@ static int ioda_eeh_post_init(struct pci_controller *hose)
 					    &ioda_eeh_dbgfs_ops);
 #endif
 
-		phb->eeh_enabled = 1;
+		phb->eeh_state |= PNV_EEH_STATE_ENABLED;
 	}
 
 	return 0;
@@ -815,7 +815,7 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 		 * removed, we needn't take care of it any more.
 		 */
 		phb = hose->private_data;
-		if (phb->removed)
+		if (phb->eeh_state & PNV_EEH_STATE_REMOVED)
 			continue;
 
 		rc = opal_pci_next_error(phb->opal_id,
@@ -850,7 +850,7 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 				list_for_each_entry_safe(hose, tmp,
 						&hose_list, list_node) {
 					phb = hose->private_data;
-					phb->removed = 1;
+					phb->eeh_state |= PNV_EEH_STATE_REMOVED;
 				}
 
 				WARN(1, "EEH: dead IOC detected\n");
@@ -867,7 +867,7 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 
 				WARN(1, "EEH: dead PHB#%x detected\n",
 				     hose->global_number);
-				phb->removed = 1;
+				phb->eeh_state |= PNV_EEH_STATE_REMOVED;
 				ret = 3;
 				goto out;
 			} else if (severity == OPAL_EEH_SEV_PHB_FENCED) {
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 577cbea..4c91e6d 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -309,7 +309,7 @@ static int pnv_pci_read_config(struct pci_bus *bus,
 	if (phb_pe && (phb_pe->state & EEH_PE_ISOLATED))
 		return PCIBIOS_SUCCESSFUL;
 
-	if (phb->eeh_enabled) {
+	if (phb->eeh_state & PNV_EEH_STATE_ENABLED) {
 		if (*val == EEH_IO_ERROR_VALUE(size)) {
 			busdn = pci_bus_to_OF_node(bus);
 			for (dn = busdn->child; dn; dn = dn->sibling) {
@@ -359,7 +359,7 @@ static int pnv_pci_write_config(struct pci_bus *bus,
 
 	/* Check if the PHB got frozen due to an error (no response) */
 #ifdef CONFIG_EEH
-	if (!phb->eeh_enabled)
+	if (!(phb->eeh_state & PNV_EEH_STATE_ENABLED))
 		pnv_pci_config_check_eeh(phb, bus, bdfn);
 #else
 	pnv_pci_config_check_eeh(phb, bus, bdfn);
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 43906e3..40bdf02 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -78,6 +78,10 @@ struct pnv_eeh_ops {
 	int (*configure_bridge)(struct eeh_pe *pe);
 	int (*next_error)(struct eeh_pe **pe);
 };
+
+#define PNV_EEH_STATE_ENABLED	(1 << 0)	/* EEH enabled	*/
+#define PNV_EEH_STATE_REMOVED	(1 << 1)	/* PHB removed	*/
+
 #endif /* CONFIG_EEH */
 
 struct pnv_phb {
@@ -92,8 +96,7 @@ struct pnv_phb {
 
 #ifdef CONFIG_EEH
 	struct pnv_eeh_ops	*eeh_ops;
-	int			eeh_enabled;
-	int			removed;
+	int			eeh_state;
 #endif
 
 #ifdef CONFIG_DEBUG_FS
-- 
cgit v0.10.2


From 88b6d14b2bb48ea4f66fedfe671f98544395b305 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 27 Jun 2013 13:46:45 +0800
Subject: powerpc/eeh: Fix address catch for PowerNV

On the PowerNV platform, the EEH address cache isn't built correctly
because we skipped the EEH devices without binding PE. The patch
fixes that.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
index 1d5d9a6..858ebea 100644
--- a/arch/powerpc/kernel/eeh_cache.c
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -194,7 +194,7 @@ static void __eeh_addr_cache_insert_dev(struct pci_dev *dev)
 	}
 
 	/* Skip any devices for which EEH is not enabled. */
-	if (!edev->pe) {
+	if (!eeh_probe_mode_dev() && !edev->pe) {
 #ifdef DEBUG
 		pr_info("PCI: skip building address cache for=%s - %s\n",
 			pci_name(dev), dn->full_name);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index dc4ec79..c393bf5 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -999,6 +999,7 @@ static void pnv_pci_ioda_fixup(void)
 	pnv_pci_ioda_create_dbgfs();
 
 #ifdef CONFIG_EEH
+	eeh_probe_mode_set(EEH_PROBE_MODE_DEV);
 	eeh_addr_cache_build();
 	eeh_init();
 #endif
-- 
cgit v0.10.2


From 56ca4fde90009094b1a46971de3879d5f2dd724e Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 27 Jun 2013 13:46:46 +0800
Subject: powerpc/eeh: Refactor the output message

We needn't the the whole backtrace other than one-line message in
the error reporting interrupt handler. For errors triggered by
access PCI config space or MMIO, we replace "WARN(1, ...)" with
pr_err() and dump_stack(). The patch also adds more output messages
to indicate what EEH core is doing. Besides, some printk() are
replaced with pr_warning().

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 416fb43..3a8f82f 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -329,7 +329,9 @@ static int eeh_phb_check_failure(struct eeh_pe *pe)
 	eeh_serialize_unlock(flags);
 	eeh_send_failure_event(phb_pe);
 
-	WARN(1, "EEH: PHB failure detected\n");
+	pr_err("EEH: PHB#%x failure detected\n",
+		phb_pe->phb->global_number);
+	dump_stack();
 
 	return 1;
 out:
@@ -458,7 +460,10 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 	 * a stack trace will help the device-driver authors figure
 	 * out what happened.  So print that out.
 	 */
-	WARN(1, "EEH: failure detected\n");
+	pr_err("EEH: Frozen PE#%x detected on PHB#%x\n",
+		pe->addr, pe->phb->global_number);
+	dump_stack();
+
 	return 1;
 
 dn_unlock:
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 0974e13..2b1ce17 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -425,6 +425,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
 	 * status ... if any child can't handle the reset, then the entire
 	 * slot is dlpar removed and added.
 	 */
+	pr_info("EEH: Notify device drivers to shutdown\n");
 	eeh_pe_dev_traverse(pe, eeh_report_error, &result);
 
 	/* Get the current PCI slot state. This can take a long time,
@@ -432,7 +433,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
 	 */
 	rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
 	if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
-		printk(KERN_WARNING "EEH: Permanent failure\n");
+		pr_warning("EEH: Permanent failure\n");
 		goto hard_fail;
 	}
 
@@ -440,6 +441,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
 	 * don't post the error log until after all dev drivers
 	 * have been informed.
 	 */
+	pr_info("EEH: Collect temporary log\n");
 	eeh_slot_error_detail(pe, EEH_LOG_TEMP);
 
 	/* If all device drivers were EEH-unaware, then shut
@@ -447,15 +449,18 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
 	 * go down willingly, without panicing the system.
 	 */
 	if (result == PCI_ERS_RESULT_NONE) {
+		pr_info("EEH: Reset with hotplug activity\n");
 		rc = eeh_reset_device(pe, frozen_bus);
 		if (rc) {
-			printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc);
+			pr_warning("%s: Unable to reset, err=%d\n",
+				   __func__, rc);
 			goto hard_fail;
 		}
 	}
 
 	/* If all devices reported they can proceed, then re-enable MMIO */
 	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
+		pr_info("EEH: Enable I/O for affected devices\n");
 		rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
 
 		if (rc < 0)
@@ -463,6 +468,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
 		if (rc) {
 			result = PCI_ERS_RESULT_NEED_RESET;
 		} else {
+			pr_info("EEH: Notify device drivers to resume I/O\n");
 			result = PCI_ERS_RESULT_NONE;
 			eeh_pe_dev_traverse(pe, eeh_report_mmio_enabled, &result);
 		}
@@ -470,6 +476,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
 
 	/* If all devices reported they can proceed, then re-enable DMA */
 	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
+		pr_info("EEH: Enabled DMA for affected devices\n");
 		rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
 
 		if (rc < 0)
@@ -482,17 +489,22 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
 
 	/* If any device has a hard failure, then shut off everything. */
 	if (result == PCI_ERS_RESULT_DISCONNECT) {
-		printk(KERN_WARNING "EEH: Device driver gave up\n");
+		pr_warning("EEH: Device driver gave up\n");
 		goto hard_fail;
 	}
 
 	/* If any device called out for a reset, then reset the slot */
 	if (result == PCI_ERS_RESULT_NEED_RESET) {
+		pr_info("EEH: Reset without hotplug activity\n");
 		rc = eeh_reset_device(pe, NULL);
 		if (rc) {
-			printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc);
+			pr_warning("%s: Cannot reset, err=%d\n",
+				   __func__, rc);
 			goto hard_fail;
 		}
+
+		pr_info("EEH: Notify device drivers "
+			"the completion of reset\n");
 		result = PCI_ERS_RESULT_NONE;
 		eeh_pe_dev_traverse(pe, eeh_report_reset, &result);
 	}
@@ -500,11 +512,12 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
 	/* All devices should claim they have recovered by now. */
 	if ((result != PCI_ERS_RESULT_RECOVERED) &&
 	    (result != PCI_ERS_RESULT_NONE)) {
-		printk(KERN_WARNING "EEH: Not recovered\n");
+		pr_warning("EEH: Not recovered\n");
 		goto hard_fail;
 	}
 
 	/* Tell all device drivers that they can resume operations */
+	pr_info("EEH: Notify device driver to resume\n");
 	eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
 
 	return;
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index 85025d7..0cd1c4a 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -853,11 +853,14 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 					phb->eeh_state |= PNV_EEH_STATE_REMOVED;
 				}
 
-				WARN(1, "EEH: dead IOC detected\n");
+				pr_err("EEH: dead IOC detected\n");
 				ret = 4;
 				goto out;
-			} else if (severity == OPAL_EEH_SEV_INF)
+			} else if (severity == OPAL_EEH_SEV_INF) {
+				pr_info("EEH: IOC informative error "
+					"detected\n");
 				ioda_eeh_hub_diag(hose);
+			}
 
 			break;
 		case OPAL_EEH_PHB_ERROR:
@@ -865,8 +868,8 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 				if (ioda_eeh_get_phb_pe(hose, pe))
 					break;
 
-				WARN(1, "EEH: dead PHB#%x detected\n",
-				     hose->global_number);
+				pr_err("EEH: dead PHB#%x detected\n",
+					hose->global_number);
 				phb->eeh_state |= PNV_EEH_STATE_REMOVED;
 				ret = 3;
 				goto out;
@@ -874,20 +877,24 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 				if (ioda_eeh_get_phb_pe(hose, pe))
 					break;
 
-				WARN(1, "EEH: fenced PHB#%x detected\n",
-				     hose->global_number);
+				pr_err("EEH: fenced PHB#%x detected\n",
+					hose->global_number);
 				ret = 2;
 				goto out;
-			} else if (severity == OPAL_EEH_SEV_INF)
+			} else if (severity == OPAL_EEH_SEV_INF) {
+				pr_info("EEH: PHB#%x informative error "
+					"detected\n",
+					hose->global_number);
 				ioda_eeh_phb_diag(hose);
+			}
 
 			break;
 		case OPAL_EEH_PE_ERROR:
 			if (ioda_eeh_get_pe(hose, frozen_pe_no, pe))
 				break;
 
-			WARN(1, "EEH: Frozen PE#%x on PHB#%x detected\n",
-			     (*pe)->addr, (*pe)->phb->global_number);
+			pr_err("EEH: Frozen PE#%x on PHB#%x detected\n",
+				(*pe)->addr, (*pe)->phb->global_number);
 			ret = 1;
 			goto out;
 		}
-- 
cgit v0.10.2


From eeb6361fdd3df59c1741522b3d8102f0f5efdd88 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 27 Jun 2013 13:46:47 +0800
Subject: powerpc/eeh: Avoid build warnings

The patch is for avoiding following build warnings:

   The function .pnv_pci_ioda_fixup() references
   the function __init .eeh_init().
   This is often because .pnv_pci_ioda_fixup lacks a __init

   The function .pnv_pci_ioda_fixup() references
   the function __init .eeh_addr_cache_build().
   This is often because .pnv_pci_ioda_fixup lacks a __init

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index dd65e31..09a8743 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -202,13 +202,13 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe);
 
 void *eeh_dev_init(struct device_node *dn, void *data);
 void eeh_dev_phb_init_dynamic(struct pci_controller *phb);
-int __init eeh_init(void);
+int eeh_init(void);
 int __init eeh_ops_register(struct eeh_ops *ops);
 int __exit eeh_ops_unregister(const char *name);
 unsigned long eeh_check_failure(const volatile void __iomem *token,
 				unsigned long val);
 int eeh_dev_check_failure(struct eeh_dev *edev);
-void __init eeh_addr_cache_build(void);
+void eeh_addr_cache_build(void);
 void eeh_add_device_tree_early(struct device_node *);
 void eeh_add_device_tree_late(struct pci_bus *);
 void eeh_add_sysfs_files(struct pci_bus *);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 3a8f82f..39954fe 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -756,7 +756,7 @@ int __exit eeh_ops_unregister(const char *name)
  * Even if force-off is set, the EEH hardware is still enabled, so that
  * newer systems can boot.
  */
-int __init eeh_init(void)
+int eeh_init(void)
 {
 	struct pci_controller *hose, *tmp;
 	struct device_node *phb;
diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
index 858ebea..ea9a94c 100644
--- a/arch/powerpc/kernel/eeh_cache.c
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -285,7 +285,7 @@ void eeh_addr_cache_rmv_dev(struct pci_dev *dev)
  * Must be run late in boot process, after the pci controllers
  * have been scanned for devices (after all device resources are known).
  */
-void __init eeh_addr_cache_build(void)
+void eeh_addr_cache_build(void)
 {
 	struct device_node *dn;
 	struct eeh_dev *edev;
-- 
cgit v0.10.2


From 9bf41be6737327b7c06cd3f210a0cb599f4aa790 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Thu, 27 Jun 2013 13:46:48 +0800
Subject: powerpc/powernv: Use dev-node in PCI config accessors

Currently, we're using the combo (PCI bus + devfn) in the PCI
config accessors and PCI config accessors in EEH depends on them.
However, it's not safe to refer the PCI bus which might have been
removed during hotplug. So we're using device node in the PCI
config accessors and the corresponding backends just reuse them.

The patch also fix one potential risk: We possiblly have frozen
PE during the early PCI probe time, but we haven't setup the PE
mapping yet. So the errors should be counted to PE#0.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 9559115..969cce7 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -315,46 +315,6 @@ static int powernv_eeh_configure_bridge(struct eeh_pe *pe)
 }
 
 /**
- * powernv_eeh_read_config - Read PCI config space
- * @dn: device node
- * @where: PCI address
- * @size: size to read
- * @val: return value
- *
- * Read config space from the speicifed device
- */
-static int powernv_eeh_read_config(struct device_node *dn, int where,
-				   int size, u32 *val)
-{
-	struct eeh_dev *edev = of_node_to_eeh_dev(dn);
-	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
-	struct pci_controller *hose = edev->phb;
-
-	return hose->ops->read(dev->bus, dev->devfn, where, size, val);
-}
-
-/**
- * powernv_eeh_write_config - Write PCI config space
- * @dn: device node
- * @where: PCI address
- * @size: size to write
- * @val: value to be written
- *
- * Write config space to the specified device
- */
-static int powernv_eeh_write_config(struct device_node *dn, int where,
-				    int size, u32 val)
-{
-	struct eeh_dev *edev = of_node_to_eeh_dev(dn);
-	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
-	struct pci_controller *hose = edev->phb;
-
-	hose = pci_bus_to_host(dev->bus);
-
-	return hose->ops->write(dev->bus, dev->devfn, where, size, val);
-}
-
-/**
  * powernv_eeh_next_error - Retrieve next EEH error to handle
  * @pe: Affected PE
  *
@@ -389,8 +349,8 @@ static struct eeh_ops powernv_eeh_ops = {
 	.wait_state             = powernv_eeh_wait_state,
 	.get_log                = powernv_eeh_get_log,
 	.configure_bridge       = powernv_eeh_configure_bridge,
-	.read_config            = powernv_eeh_read_config,
-	.write_config           = powernv_eeh_write_config,
+	.read_config            = pnv_pci_cfg_read,
+	.write_config           = pnv_pci_cfg_write,
 	.next_error		= powernv_eeh_next_error
 };
 
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 4c91e6d..a28d3b5 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -231,47 +231,50 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
 	spin_unlock_irqrestore(&phb->lock, flags);
 }
 
-static void pnv_pci_config_check_eeh(struct pnv_phb *phb, struct pci_bus *bus,
-				     u32 bdfn)
+static void pnv_pci_config_check_eeh(struct pnv_phb *phb,
+				     struct device_node *dn)
 {
 	s64	rc;
 	u8	fstate;
 	u16	pcierr;
 	u32	pe_no;
 
-	/* Get PE# if we support IODA */
-	pe_no = phb->bdfn_to_pe ? phb->bdfn_to_pe(phb, bus, bdfn & 0xff) : 0;
+	/*
+	 * Get the PE#. During the PCI probe stage, we might not
+	 * setup that yet. So all ER errors should be mapped to
+	 * PE#0
+	 */
+	pe_no = PCI_DN(dn)->pe_number;
+	if (pe_no == IODA_INVALID_PE)
+		pe_no = 0;
 
 	/* Read freeze status */
 	rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no, &fstate, &pcierr,
 					NULL);
 	if (rc) {
-		pr_warning("PCI %d: Failed to read EEH status for PE#%d,"
-			   " err %lld\n", phb->hose->global_number, pe_no, rc);
+		pr_warning("%s: Can't read EEH status (PE#%d) for "
+			   "%s, err %lld\n",
+			   __func__, pe_no, dn->full_name, rc);
 		return;
 	}
-	cfg_dbg(" -> EEH check, bdfn=%04x PE%d fstate=%x\n",
-		bdfn, pe_no, fstate);
+	cfg_dbg(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n",
+		(PCI_DN(dn)->busno << 8) | (PCI_DN(dn)->devfn),
+		pe_no, fstate);
 	if (fstate != 0)
 		pnv_pci_handle_eeh_config(phb, pe_no);
 }
 
-static int pnv_pci_read_config(struct pci_bus *bus,
-			       unsigned int devfn,
-			       int where, int size, u32 *val)
+int pnv_pci_cfg_read(struct device_node *dn,
+		     int where, int size, u32 *val)
 {
-	struct pci_controller *hose = pci_bus_to_host(bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pci_dn *pdn = PCI_DN(dn);
+	struct pnv_phb *phb = pdn->phb->private_data;
+	u32 bdfn = (pdn->busno << 8) | pdn->devfn;
 #ifdef CONFIG_EEH
-	struct device_node *busdn, *dn;
 	struct eeh_pe *phb_pe = NULL;
 #endif
-	u32 bdfn = (((uint64_t)bus->number) << 8) | devfn;
 	s64 rc;
 
-	if (hose == NULL)
-		return PCIBIOS_DEVICE_NOT_FOUND;
-
 	switch (size) {
 	case 1: {
 		u8 v8;
@@ -295,8 +298,8 @@ static int pnv_pci_read_config(struct pci_bus *bus,
 	default:
 		return PCIBIOS_FUNC_NOT_SUPPORTED;
 	}
-	cfg_dbg("pnv_pci_read_config bus: %x devfn: %x +%x/%x -> %08x\n",
-		bus->number, devfn, where, size, *val);
+	cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
+		__func__, pdn->busno, pdn->devfn, where, size, *val);
 
 	/*
 	 * Check if the specified PE has been put into frozen
@@ -305,44 +308,33 @@ static int pnv_pci_read_config(struct pci_bus *bus,
 	 * PHB-fatal errors.
 	 */
 #ifdef CONFIG_EEH
-	phb_pe = eeh_phb_pe_get(hose);
+	phb_pe = eeh_phb_pe_get(pdn->phb);
 	if (phb_pe && (phb_pe->state & EEH_PE_ISOLATED))
 		return PCIBIOS_SUCCESSFUL;
 
 	if (phb->eeh_state & PNV_EEH_STATE_ENABLED) {
-		if (*val == EEH_IO_ERROR_VALUE(size)) {
-			busdn = pci_bus_to_OF_node(bus);
-			for (dn = busdn->child; dn; dn = dn->sibling) {
-				struct pci_dn *pdn = PCI_DN(dn);
-
-				if (pdn && pdn->devfn == devfn &&
-				    eeh_dev_check_failure(of_node_to_eeh_dev(dn)))
-					return PCIBIOS_DEVICE_NOT_FOUND;
-			}
-		}
+		if (*val == EEH_IO_ERROR_VALUE(size) &&
+		    eeh_dev_check_failure(of_node_to_eeh_dev(dn)))
+			return PCIBIOS_DEVICE_NOT_FOUND;
 	} else {
-		pnv_pci_config_check_eeh(phb, bus, bdfn);
+		pnv_pci_config_check_eeh(phb, dn);
 	}
 #else
-	pnv_pci_config_check_eeh(phb, bus, bdfn);
+	pnv_pci_config_check_eeh(phb, dn);
 #endif
 
 	return PCIBIOS_SUCCESSFUL;
 }
 
-static int pnv_pci_write_config(struct pci_bus *bus,
-				unsigned int devfn,
-				int where, int size, u32 val)
+int pnv_pci_cfg_write(struct device_node *dn,
+		      int where, int size, u32 val)
 {
-	struct pci_controller *hose = pci_bus_to_host(bus);
-	struct pnv_phb *phb = hose->private_data;
-	u32 bdfn = (((uint64_t)bus->number) << 8) | devfn;
+	struct pci_dn *pdn = PCI_DN(dn);
+	struct pnv_phb *phb = pdn->phb->private_data;
+	u32 bdfn = (pdn->busno << 8) | pdn->devfn;
 
-	if (hose == NULL)
-		return PCIBIOS_DEVICE_NOT_FOUND;
-
-	cfg_dbg("pnv_pci_write_config bus: %x devfn: %x +%x/%x -> %08x\n",
-		bus->number, devfn, where, size, val);
+	cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
+		pdn->busno, pdn->devfn, where, size, val);
 	switch (size) {
 	case 1:
 		opal_pci_config_write_byte(phb->opal_id, bdfn, where, val);
@@ -360,16 +352,50 @@ static int pnv_pci_write_config(struct pci_bus *bus,
 	/* Check if the PHB got frozen due to an error (no response) */
 #ifdef CONFIG_EEH
 	if (!(phb->eeh_state & PNV_EEH_STATE_ENABLED))
-		pnv_pci_config_check_eeh(phb, bus, bdfn);
+		pnv_pci_config_check_eeh(phb, dn);
 #else
-	pnv_pci_config_check_eeh(phb, bus, bdfn);
+	pnv_pci_config_check_eeh(phb, dn);
 #endif
 
 	return PCIBIOS_SUCCESSFUL;
 }
 
+static int pnv_pci_read_config(struct pci_bus *bus,
+			       unsigned int devfn,
+			       int where, int size, u32 *val)
+{
+	struct device_node *dn, *busdn = pci_bus_to_OF_node(bus);
+	struct pci_dn *pdn;
+
+	for (dn = busdn->child; dn; dn = dn->sibling) {
+		pdn = PCI_DN(dn);
+		if (pdn && pdn->devfn == devfn)
+			return pnv_pci_cfg_read(dn, where, size, val);
+	}
+
+	*val = 0xFFFFFFFF;
+	return PCIBIOS_DEVICE_NOT_FOUND;
+
+}
+
+static int pnv_pci_write_config(struct pci_bus *bus,
+				unsigned int devfn,
+				int where, int size, u32 val)
+{
+	struct device_node *dn, *busdn = pci_bus_to_OF_node(bus);
+	struct pci_dn *pdn;
+
+	for (dn = busdn->child; dn; dn = dn->sibling) {
+		pdn = PCI_DN(dn);
+		if (pdn && pdn->devfn == devfn)
+			return pnv_pci_cfg_write(dn, where, size, val);
+	}
+
+	return PCIBIOS_DEVICE_NOT_FOUND;
+}
+
 struct pci_ops pnv_pci_ops = {
-	.read = pnv_pci_read_config,
+	.read  = pnv_pci_read_config,
 	.write = pnv_pci_write_config,
 };
 
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 40bdf02..d633c64 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -182,6 +182,10 @@ extern struct pci_ops pnv_pci_ops;
 extern struct pnv_eeh_ops ioda_eeh_ops;
 #endif
 
+int pnv_pci_cfg_read(struct device_node *dn,
+		     int where, int size, u32 *val);
+int pnv_pci_cfg_write(struct device_node *dn,
+		      int where, int size, u32 val);
 extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
 				      void *tce_mem, u64 tce_size,
 				      u64 dma_offset);
-- 
cgit v0.10.2


From 4bb297113433048169c30a32c1e58b6a1b61b621 Mon Sep 17 00:00:00 2001
From: Aaro Koskinen <aaro.koskinen@iki.fi>
Date: Sun, 30 Jun 2013 22:00:42 +0300
Subject: powerpc/windfarm: Fix overtemperature clearing

With pm81/pm91/pm121, when the overtemperature state is entered, and
when it remains on after skipped ticks, the driver will try to leave
it too soon (immediately on the next tick). This is because the active
FAILURE_OVERTEMP state is not visible in "new_failure" variable of the
current tick. Furthermore, the driver will keep trying to clear condition
in subsequent ticks as FAILURE_OVERTEMP remains set in the "last_failure"
variable. These will start to trigger WARNINGS from windfarm core:

[  100.082735] windfarm: Clamping CPU frequency to minimum !
[  100.108132] windfarm: Overtemp condition detected !
[  101.952908] windfarm: Overtemp condition cleared !
[...]
[  102.980388] WARNING: at drivers/macintosh/windfarm_core.c:463
[...]
[  103.982227] WARNING: at drivers/macintosh/windfarm_core.c:463
[...]
[  105.030494] WARNING: at drivers/macintosh/windfarm_core.c:463
[...]
[  105.973666] WARNING: at drivers/macintosh/windfarm_core.c:463
[...]
[  106.977913] WARNING: at drivers/macintosh/windfarm_core.c:463

Fix by adding a helper global variable. We leave the overtemp state only
after all failure bits have been cleared.

I saw this error on iMac G5 iSight (pm121). Also pm81/pm91 are fixed
based on the observation that these are almost identical/copy-pasted code.

Signed-off-by: Aaro Koskinen <aaro.koskinen@iki.fi>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/drivers/macintosh/windfarm_pm121.c b/drivers/macintosh/windfarm_pm121.c
index af605e9..7fe58b0 100644
--- a/drivers/macintosh/windfarm_pm121.c
+++ b/drivers/macintosh/windfarm_pm121.c
@@ -276,6 +276,7 @@ static const char *loop_names[N_LOOPS] = {
 
 static unsigned int pm121_failure_state;
 static int pm121_readjust, pm121_skipping;
+static bool pm121_overtemp;
 static s32 average_power;
 
 struct pm121_correction {
@@ -847,6 +848,7 @@ static void pm121_tick(void)
 	if (new_failure & FAILURE_OVERTEMP) {
 		wf_set_overtemp();
 		pm121_skipping = 2;
+		pm121_overtemp = true;
 	}
 
 	/* We only clear the overtemp condition if overtemp is cleared
@@ -855,8 +857,10 @@ static void pm121_tick(void)
 	 * the control loop levels, but we don't want to keep it clear
 	 * here in this case
 	 */
-	if (new_failure == 0 && last_failure & FAILURE_OVERTEMP)
+	if (!pm121_failure_state && pm121_overtemp) {
 		wf_clear_overtemp();
+		pm121_overtemp = false;
+	}
 }
 
 
diff --git a/drivers/macintosh/windfarm_pm81.c b/drivers/macintosh/windfarm_pm81.c
index f84933f..2a5e1b1 100644
--- a/drivers/macintosh/windfarm_pm81.c
+++ b/drivers/macintosh/windfarm_pm81.c
@@ -149,6 +149,7 @@ static int wf_smu_all_controls_ok, wf_smu_all_sensors_ok, wf_smu_started;
 
 static unsigned int wf_smu_failure_state;
 static int wf_smu_readjust, wf_smu_skipping;
+static bool wf_smu_overtemp;
 
 /*
  * ****** System Fans Control Loop ******
@@ -593,6 +594,7 @@ static void wf_smu_tick(void)
 	if (new_failure & FAILURE_OVERTEMP) {
 		wf_set_overtemp();
 		wf_smu_skipping = 2;
+		wf_smu_overtemp = true;
 	}
 
 	/* We only clear the overtemp condition if overtemp is cleared
@@ -601,8 +603,10 @@ static void wf_smu_tick(void)
 	 * the control loop levels, but we don't want to keep it clear
 	 * here in this case
 	 */
-	if (new_failure == 0 && last_failure & FAILURE_OVERTEMP)
+	if (!wf_smu_failure_state && wf_smu_overtemp) {
 		wf_clear_overtemp();
+		wf_smu_overtemp = false;
+	}
 }
 
 static void wf_smu_new_control(struct wf_control *ct)
diff --git a/drivers/macintosh/windfarm_pm91.c b/drivers/macintosh/windfarm_pm91.c
index 2eb484f..a8ac66c 100644
--- a/drivers/macintosh/windfarm_pm91.c
+++ b/drivers/macintosh/windfarm_pm91.c
@@ -76,6 +76,7 @@ static struct wf_control *cpufreq_clamp;
 
 /* Set to kick the control loop into life */
 static int wf_smu_all_controls_ok, wf_smu_all_sensors_ok, wf_smu_started;
+static bool wf_smu_overtemp;
 
 /* Failure handling.. could be nicer */
 #define FAILURE_FAN		0x01
@@ -517,6 +518,7 @@ static void wf_smu_tick(void)
 	if (new_failure & FAILURE_OVERTEMP) {
 		wf_set_overtemp();
 		wf_smu_skipping = 2;
+		wf_smu_overtemp = true;
 	}
 
 	/* We only clear the overtemp condition if overtemp is cleared
@@ -525,8 +527,10 @@ static void wf_smu_tick(void)
 	 * the control loop levels, but we don't want to keep it clear
 	 * here in this case
 	 */
-	if (new_failure == 0 && last_failure & FAILURE_OVERTEMP)
+	if (!wf_smu_failure_state && wf_smu_overtemp) {
 		wf_clear_overtemp();
+		wf_smu_overtemp = false;
+	}
 }
 
 
-- 
cgit v0.10.2


From 348c2298a6fd2b145e789739808d5e7598e275fc Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Thu, 27 Jun 2013 09:09:43 +0800
Subject: powerpc: Don't flush/invalidate the d/icache for an unknown
 relocation type

For an unknown relocation type since the value of r4 is just the 8bit
relocation type, the sum of r4 and r7 may yield an invalid memory
address. For example:
    In normal case:
             r4 = c00xxxxx
             r7 = 40000000
             r4 + r7 = 000xxxxx

    For an unknown relocation type:
             r4 = 000000xx
             r7 = 40000000
             r4 + r7 = 400000xx
   400000xx is an invalid memory address for a board which has just
   512M memory.

And for operations such as dcbst or icbi may cause bus error for an
invalid memory address on some platforms and then cause the board
reset. So we should skip the flush/invalidate the d/icache for
an unknown relocation type.

Signed-off-by: Kevin Hao <haokexin@gmail.com>
Acked-by: Suzuki K. Poulose <suzuki@in.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/reloc_32.S b/arch/powerpc/kernel/reloc_32.S
index ef46ba6..f366fed 100644
--- a/arch/powerpc/kernel/reloc_32.S
+++ b/arch/powerpc/kernel/reloc_32.S
@@ -166,7 +166,7 @@ ha16:
 	/* R_PPC_ADDR16_LO */
 lo16:
 	cmpwi	r4, R_PPC_ADDR16_LO
-	bne	nxtrela
+	bne	unknown_type
 	lwz	r4, 0(r9)	/* r_offset */
 	lwz	r0, 8(r9)	/* r_addend */
 	add	r0, r0, r3
@@ -191,6 +191,7 @@ nxtrela:
 	dcbst	r4,r7
 	sync			/* Ensure the data is flushed before icbi */
 	icbi	r4,r7
+unknown_type:
 	cmpwi	r8, 0		/* relasz = 0 ? */
 	ble	done
 	add	r9, r9, r6	/* move to next entry in the .rela table */
-- 
cgit v0.10.2


From 5524f3fc069b6435f9ff0db573e3a8b5082ef528 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 11 Jun 2013 13:57:05 -0600
Subject: powerpc/iommu: Remove unused pci_iommu_init() and
 pci_direct_iommu_init()

pci_iommu_init() and pci_direct_iommu_init() are not referenced anywhere,
so remove them.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 98d1422..c34656a 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -130,13 +130,6 @@ extern void iommu_init_early_pSeries(void);
 extern void iommu_init_early_dart(void);
 extern void iommu_init_early_pasemi(void);
 
-#ifdef CONFIG_PCI
-extern void pci_iommu_init(void);
-extern void pci_direct_iommu_init(void);
-#else
-static inline void pci_iommu_init(void) { }
-#endif
-
 extern void alloc_dart_table(void);
 #if defined(CONFIG_PPC64) && defined(CONFIG_PM)
 static inline void iommu_save(void)
-- 
cgit v0.10.2


From cc293bf7a9aa6fad68d107c79705732bd2fc57e3 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Thu, 13 Jun 2013 19:37:30 -0700
Subject: powerpc/idle: Convert use of typedef ctl_table to struct ctl_table

This typedef is unnecessary and should just be removed.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
index 939ea7e..d7216c9 100644
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -85,7 +85,7 @@ int powersave_nap;
 /*
  * Register the sysctl to set/clear powersave_nap.
  */
-static ctl_table powersave_nap_ctl_table[]={
+static struct ctl_table powersave_nap_ctl_table[] = {
 	{
 		.procname	= "powersave-nap",
 		.data		= &powersave_nap,
@@ -95,7 +95,7 @@ static ctl_table powersave_nap_ctl_table[]={
 	},
 	{}
 };
-static ctl_table powersave_nap_sysctl_root[] = {
+static struct ctl_table powersave_nap_sysctl_root[] = {
 	{
 		.procname	= "kernel",
 		.mode		= 0555,
-- 
cgit v0.10.2


From 5eb969d0e8b8f38fca0b2c6c76f5dca01449664a Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Thu, 13 Jun 2013 19:37:37 -0700
Subject: macintosh: Convert use of typedef ctl_table to struct ctl_table

This typedef is unnecessary and should just be removed.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/drivers/macintosh/mac_hid.c b/drivers/macintosh/mac_hid.c
index 6a82388..80d30e8 100644
--- a/drivers/macintosh/mac_hid.c
+++ b/drivers/macintosh/mac_hid.c
@@ -181,7 +181,7 @@ static void mac_hid_stop_emulation(void)
 	mac_hid_destroy_emumouse();
 }
 
-static int mac_hid_toggle_emumouse(ctl_table *table, int write,
+static int mac_hid_toggle_emumouse(struct ctl_table *table, int write,
 				   void __user *buffer, size_t *lenp,
 				   loff_t *ppos)
 {
@@ -214,7 +214,7 @@ static int mac_hid_toggle_emumouse(ctl_table *table, int write,
 }
 
 /* file(s) in /proc/sys/dev/mac_hid */
-static ctl_table mac_hid_files[] = {
+static struct ctl_table mac_hid_files[] = {
 	{
 		.procname	= "mouse_button_emulation",
 		.data		= &mouse_emulate_buttons,
@@ -240,7 +240,7 @@ static ctl_table mac_hid_files[] = {
 };
 
 /* dir in /proc/sys/dev */
-static ctl_table mac_hid_dir[] = {
+static struct ctl_table mac_hid_dir[] = {
 	{
 		.procname	= "mac_hid",
 		.maxlen		= 0,
@@ -251,7 +251,7 @@ static ctl_table mac_hid_dir[] = {
 };
 
 /* /proc/sys/dev itself, in case that is not there yet */
-static ctl_table mac_hid_root_dir[] = {
+static struct ctl_table mac_hid_root_dir[] = {
 	{
 		.procname	= "dev",
 		.maxlen		= 0,
-- 
cgit v0.10.2


From 061d19f279f9bebbdb1ee48bef8c25e03de32ae2 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Mon, 24 Jun 2013 15:30:09 -0400
Subject: powerpc: Delete __cpuinit usage from all users

The __cpuinit type of throwaway sections might have made sense
some time ago when RAM was more constrained, but now the savings
do not offset the cost and complications.  For example, the fix in
commit 5e427ec2d0 ("x86: Fix bit corruption at CPU resume time")
is a good example of the nasty type of bugs that can be created
with improper use of the various __init prefixes.

After a discussion on LKML[1] it was decided that cpuinit should go
the way of devinit and be phased out.  Once all the users are gone,
we can then finally remove the macros themselves from linux/init.h.

This removes all the powerpc uses of the __cpuinit macros.  There
are no __CPUINIT users in assembly files in powerpc.

[1] https://lkml.org/lkml/2013/5/20/589

Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Josh Boyer <jwboyer@gmail.com>
Cc: Matt Porter <mporter@kernel.crashing.org>
Cc: Kumar Gala <galak@kernel.crashing.org>
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 34fd704..c7a8bfc 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -350,8 +350,8 @@ static inline u32 rtas_config_addr(int busno, int devfn, int reg)
 			(devfn << 8) | (reg & 0xff);
 }
 
-extern void __cpuinit rtas_give_timebase(void);
-extern void __cpuinit rtas_take_timebase(void);
+extern void rtas_give_timebase(void);
+extern void rtas_take_timebase(void);
 
 #ifdef CONFIG_PPC_RTAS
 static inline int page_is_rtas_user_buf(unsigned long pfn)
diff --git a/arch/powerpc/include/asm/vdso.h b/arch/powerpc/include/asm/vdso.h
index 50f261b..0d9cecd 100644
--- a/arch/powerpc/include/asm/vdso.h
+++ b/arch/powerpc/include/asm/vdso.h
@@ -22,7 +22,7 @@ extern unsigned long vdso64_rt_sigtramp;
 extern unsigned long vdso32_sigtramp;
 extern unsigned long vdso32_rt_sigtramp;
 
-int __cpuinit vdso_getcpu_init(void);
+int vdso_getcpu_init(void);
 
 #else /* __ASSEMBLY__ */
 
diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index 92c6b00..9262cf2 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -131,7 +131,8 @@ static const char *cache_type_string(const struct cache *cache)
 	return cache_type_info[cache->type].name;
 }
 
-static void __cpuinit cache_init(struct cache *cache, int type, int level, struct device_node *ofnode)
+static void cache_init(struct cache *cache, int type, int level,
+		       struct device_node *ofnode)
 {
 	cache->type = type;
 	cache->level = level;
@@ -140,7 +141,7 @@ static void __cpuinit cache_init(struct cache *cache, int type, int level, struc
 	list_add(&cache->list, &cache_list);
 }
 
-static struct cache *__cpuinit new_cache(int type, int level, struct device_node *ofnode)
+static struct cache *new_cache(int type, int level, struct device_node *ofnode)
 {
 	struct cache *cache;
 
@@ -324,7 +325,8 @@ static bool cache_node_is_unified(const struct device_node *np)
 	return of_get_property(np, "cache-unified", NULL);
 }
 
-static struct cache *__cpuinit cache_do_one_devnode_unified(struct device_node *node, int level)
+static struct cache *cache_do_one_devnode_unified(struct device_node *node,
+						  int level)
 {
 	struct cache *cache;
 
@@ -335,7 +337,8 @@ static struct cache *__cpuinit cache_do_one_devnode_unified(struct device_node *
 	return cache;
 }
 
-static struct cache *__cpuinit cache_do_one_devnode_split(struct device_node *node, int level)
+static struct cache *cache_do_one_devnode_split(struct device_node *node,
+						int level)
 {
 	struct cache *dcache, *icache;
 
@@ -357,7 +360,7 @@ err:
 	return NULL;
 }
 
-static struct cache *__cpuinit cache_do_one_devnode(struct device_node *node, int level)
+static struct cache *cache_do_one_devnode(struct device_node *node, int level)
 {
 	struct cache *cache;
 
@@ -369,7 +372,8 @@ static struct cache *__cpuinit cache_do_one_devnode(struct device_node *node, in
 	return cache;
 }
 
-static struct cache *__cpuinit cache_lookup_or_instantiate(struct device_node *node, int level)
+static struct cache *cache_lookup_or_instantiate(struct device_node *node,
+						 int level)
 {
 	struct cache *cache;
 
@@ -385,7 +389,7 @@ static struct cache *__cpuinit cache_lookup_or_instantiate(struct device_node *n
 	return cache;
 }
 
-static void __cpuinit link_cache_lists(struct cache *smaller, struct cache *bigger)
+static void link_cache_lists(struct cache *smaller, struct cache *bigger)
 {
 	while (smaller->next_local) {
 		if (smaller->next_local == bigger)
@@ -396,13 +400,13 @@ static void __cpuinit link_cache_lists(struct cache *smaller, struct cache *bigg
 	smaller->next_local = bigger;
 }
 
-static void __cpuinit do_subsidiary_caches_debugcheck(struct cache *cache)
+static void do_subsidiary_caches_debugcheck(struct cache *cache)
 {
 	WARN_ON_ONCE(cache->level != 1);
 	WARN_ON_ONCE(strcmp(cache->ofnode->type, "cpu"));
 }
 
-static void __cpuinit do_subsidiary_caches(struct cache *cache)
+static void do_subsidiary_caches(struct cache *cache)
 {
 	struct device_node *subcache_node;
 	int level = cache->level;
@@ -423,7 +427,7 @@ static void __cpuinit do_subsidiary_caches(struct cache *cache)
 	}
 }
 
-static struct cache *__cpuinit cache_chain_instantiate(unsigned int cpu_id)
+static struct cache *cache_chain_instantiate(unsigned int cpu_id)
 {
 	struct device_node *cpu_node;
 	struct cache *cpu_cache = NULL;
@@ -448,7 +452,7 @@ out:
 	return cpu_cache;
 }
 
-static struct cache_dir *__cpuinit cacheinfo_create_cache_dir(unsigned int cpu_id)
+static struct cache_dir *cacheinfo_create_cache_dir(unsigned int cpu_id)
 {
 	struct cache_dir *cache_dir;
 	struct device *dev;
@@ -653,7 +657,7 @@ static struct kobj_type cache_index_type = {
 	.default_attrs = cache_index_default_attrs,
 };
 
-static void __cpuinit cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir)
+static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir)
 {
 	const char *cache_name;
 	const char *cache_type;
@@ -696,7 +700,8 @@ static void __cpuinit cacheinfo_create_index_opt_attrs(struct cache_index_dir *d
 	kfree(buf);
 }
 
-static void __cpuinit cacheinfo_create_index_dir(struct cache *cache, int index, struct cache_dir *cache_dir)
+static void cacheinfo_create_index_dir(struct cache *cache, int index,
+				       struct cache_dir *cache_dir)
 {
 	struct cache_index_dir *index_dir;
 	int rc;
@@ -722,7 +727,8 @@ err:
 	kfree(index_dir);
 }
 
-static void __cpuinit cacheinfo_sysfs_populate(unsigned int cpu_id, struct cache *cache_list)
+static void cacheinfo_sysfs_populate(unsigned int cpu_id,
+				     struct cache *cache_list)
 {
 	struct cache_dir *cache_dir;
 	struct cache *cache;
@@ -740,7 +746,7 @@ static void __cpuinit cacheinfo_sysfs_populate(unsigned int cpu_id, struct cache
 	}
 }
 
-void __cpuinit cacheinfo_cpu_online(unsigned int cpu_id)
+void cacheinfo_cpu_online(unsigned int cpu_id)
 {
 	struct cache *cache;
 
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 52add6f..80b5ef4 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -1172,7 +1172,7 @@ int __init early_init_dt_scan_rtas(unsigned long node,
 static arch_spinlock_t timebase_lock;
 static u64 timebase = 0;
 
-void __cpuinit rtas_give_timebase(void)
+void rtas_give_timebase(void)
 {
 	unsigned long flags;
 
@@ -1189,7 +1189,7 @@ void __cpuinit rtas_give_timebase(void)
 	local_irq_restore(flags);
 }
 
-void __cpuinit rtas_take_timebase(void)
+void rtas_take_timebase(void)
 {
 	while (!timebase)
 		barrier();
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index ee7ac5e..85398c7 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -480,7 +480,7 @@ static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle)
 	secondary_ti = current_set[cpu] = ti;
 }
 
-int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle)
+int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int rc, c;
 
@@ -610,7 +610,7 @@ static struct device_node *cpu_to_l2cache(int cpu)
 }
 
 /* Activate a secondary processor. */
-__cpuinit void start_secondary(void *unused)
+void start_secondary(void *unused)
 {
 	unsigned int cpu = smp_processor_id();
 	struct device_node *l2_cache;
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index e68a845..27a90b9 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -341,7 +341,7 @@ static struct device_attribute pa6t_attrs[] = {
 #endif /* HAS_PPC_PMC_PA6T */
 #endif /* HAS_PPC_PMC_CLASSIC */
 
-static void __cpuinit register_cpu_online(unsigned int cpu)
+static void register_cpu_online(unsigned int cpu)
 {
 	struct cpu *c = &per_cpu(cpu_devices, cpu);
 	struct device *s = &c->dev;
@@ -502,7 +502,7 @@ ssize_t arch_cpu_release(const char *buf, size_t count)
 
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int __cpuinit sysfs_cpu_notify(struct notifier_block *self,
+static int sysfs_cpu_notify(struct notifier_block *self,
 				      unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned int)(long)hcpu;
@@ -522,7 +522,7 @@ static int __cpuinit sysfs_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata sysfs_cpu_nb = {
+static struct notifier_block sysfs_cpu_nb = {
 	.notifier_call	= sysfs_cpu_notify,
 };
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 5fc29ad..65ab9e9 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -631,7 +631,6 @@ static int __init get_freq(char *name, int cells, unsigned long *val)
 	return found;
 }
 
-/* should become __cpuinit when secondary_cpu_time_init also is */
 void start_cpu_decrementer(void)
 {
 #if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index d4f463a..1d9c926 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -711,7 +711,7 @@ static void __init vdso_setup_syscall_map(void)
 }
 
 #ifdef CONFIG_PPC64
-int __cpuinit vdso_getcpu_init(void)
+int vdso_getcpu_init(void)
 {
 	unsigned long cpu, node, val;
 
diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c
index 2c9441e..82b1ff7 100644
--- a/arch/powerpc/mm/44x_mmu.c
+++ b/arch/powerpc/mm/44x_mmu.c
@@ -41,7 +41,7 @@ int icache_44x_need_flush;
 
 unsigned long tlb_47x_boltmap[1024/8];
 
-static void __cpuinit ppc44x_update_tlb_hwater(void)
+static void ppc44x_update_tlb_hwater(void)
 {
 	extern unsigned int tlb_44x_patch_hwater_D[];
 	extern unsigned int tlb_44x_patch_hwater_I[];
@@ -134,7 +134,7 @@ static void __init ppc47x_update_boltmap(void)
 /*
  * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU
  */
-static void __cpuinit ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
+static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
 {
 	unsigned int rA;
 	int bolted;
@@ -229,7 +229,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 }
 
 #ifdef CONFIG_SMP
-void __cpuinit mmu_init_secondary(int cpu)
+void mmu_init_secondary(int cpu)
 {
 	unsigned long addr;
 	unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1);
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 8452316..6ecc38b 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -807,7 +807,7 @@ void __init early_init_mmu(void)
 }
 
 #ifdef CONFIG_SMP
-void __cpuinit early_init_mmu_secondary(void)
+void early_init_mmu_secondary(void)
 {
 	/* Initialize hash table for that CPU */
 	if (!firmware_has_feature(FW_FEATURE_LPAR))
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index 810f8e4..af3d78e 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -332,8 +332,8 @@ void destroy_context(struct mm_struct *mm)
 
 #ifdef CONFIG_SMP
 
-static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
-					    unsigned long action, void *hcpu)
+static int mmu_context_cpu_notify(struct notifier_block *self,
+				  unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned int)(long)hcpu;
 
@@ -366,7 +366,7 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata mmu_context_cpu_nb = {
+static struct notifier_block mmu_context_cpu_nb = {
 	.notifier_call	= mmu_context_cpu_notify,
 };
 
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 88c0425..c792cd9 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -516,7 +516,7 @@ static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
  * Figure out to which domain a cpu belongs and stick it there.
  * Return the id of the domain used.
  */
-static int __cpuinit numa_setup_cpu(unsigned long lcpu)
+static int numa_setup_cpu(unsigned long lcpu)
 {
 	int nid = 0;
 	struct device_node *cpu = of_get_cpu_node(lcpu, NULL);
@@ -538,8 +538,7 @@ out:
 	return nid;
 }
 
-static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
-			     unsigned long action,
+static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action,
 			     void *hcpu)
 {
 	unsigned long lcpu = (unsigned long)hcpu;
@@ -919,7 +918,7 @@ static void __init *careful_zallocation(int nid, unsigned long size,
 	return ret;
 }
 
-static struct notifier_block __cpuinitdata ppc64_numa_nb = {
+static struct notifier_block ppc64_numa_nb = {
 	.notifier_call = cpu_numa_callback,
 	.priority = 1 /* Must run before sched domains notifier. */
 };
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index 6888cad..41cd68d 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -648,7 +648,7 @@ void __init early_init_mmu(void)
 	__early_init_mmu(1);
 }
 
-void __cpuinit early_init_mmu_secondary(void)
+void early_init_mmu_secondary(void)
 {
 	__early_init_mmu(0);
 }
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 29c6482..af94a71 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -1786,7 +1786,7 @@ static void power_pmu_setup(int cpu)
 	cpuhw->mmcr[0] = MMCR0_FC;
 }
 
-static int __cpuinit
+static int
 power_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (long)hcpu;
@@ -1803,7 +1803,7 @@ power_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu
 	return NOTIFY_OK;
 }
 
-int __cpuinit register_power_pmu(struct power_pmu *pmu)
+int register_power_pmu(struct power_pmu *pmu)
 {
 	if (ppmu)
 		return -EBUSY;		/* something's already registered */
diff --git a/arch/powerpc/platforms/44x/currituck.c b/arch/powerpc/platforms/44x/currituck.c
index c52e1b3..7f1b71a 100644
--- a/arch/powerpc/platforms/44x/currituck.c
+++ b/arch/powerpc/platforms/44x/currituck.c
@@ -91,12 +91,12 @@ static void __init ppc47x_init_irq(void)
 }
 
 #ifdef CONFIG_SMP
-static void __cpuinit smp_ppc47x_setup_cpu(int cpu)
+static void smp_ppc47x_setup_cpu(int cpu)
 {
 	mpic_setup_this_cpu();
 }
 
-static int __cpuinit smp_ppc47x_kick_cpu(int cpu)
+static int smp_ppc47x_kick_cpu(int cpu)
 {
 	struct device_node *cpunode = of_get_cpu_node(cpu, NULL);
 	const u64 *spin_table_addr_prop;
diff --git a/arch/powerpc/platforms/44x/iss4xx.c b/arch/powerpc/platforms/44x/iss4xx.c
index a28a862..4241bc8 100644
--- a/arch/powerpc/platforms/44x/iss4xx.c
+++ b/arch/powerpc/platforms/44x/iss4xx.c
@@ -81,12 +81,12 @@ static void __init iss4xx_init_irq(void)
 }
 
 #ifdef CONFIG_SMP
-static void __cpuinit smp_iss4xx_setup_cpu(int cpu)
+static void smp_iss4xx_setup_cpu(int cpu)
 {
 	mpic_setup_this_cpu();
 }
 
-static int __cpuinit smp_iss4xx_kick_cpu(int cpu)
+static int smp_iss4xx_kick_cpu(int cpu)
 {
 	struct device_node *cpunode = of_get_cpu_node(cpu, NULL);
 	const u64 *spin_table_addr_prop;
diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c
index 6a17599..5ced4f5 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -99,7 +99,7 @@ static void mpc85xx_take_timebase(void)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void __cpuinit smp_85xx_mach_cpu_die(void)
+static void smp_85xx_mach_cpu_die(void)
 {
 	unsigned int cpu = smp_processor_id();
 	u32 tmp;
@@ -141,7 +141,7 @@ static inline u32 read_spin_table_addr_l(void *spin_table)
 	return in_be32(&((struct epapr_spin_table *)spin_table)->addr_l);
 }
 
-static int __cpuinit smp_85xx_kick_cpu(int nr)
+static int smp_85xx_kick_cpu(int nr)
 {
 	unsigned long flags;
 	const u64 *cpu_rel_addr;
@@ -362,7 +362,7 @@ static void mpc85xx_smp_machine_kexec(struct kimage *image)
 }
 #endif /* CONFIG_KEXEC */
 
-static void __cpuinit smp_85xx_setup_cpu(int cpu_nr)
+static void smp_85xx_setup_cpu(int cpu_nr)
 {
 	if (smp_85xx_ops.probe == smp_mpic_probe)
 		mpic_setup_this_cpu();
diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c
index bdb738a..49c9f95 100644
--- a/arch/powerpc/platforms/powermac/smp.c
+++ b/arch/powerpc/platforms/powermac/smp.c
@@ -885,7 +885,7 @@ static int smp_core99_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata smp_core99_cpu_nb = {
+static struct notifier_block smp_core99_cpu_nb = {
 	.notifier_call	= smp_core99_cpu_notify,
 };
 #endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 77784ae..89e3857 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -40,7 +40,7 @@
 #define DBG(fmt...)
 #endif
 
-static void __cpuinit pnv_smp_setup_cpu(int cpu)
+static void pnv_smp_setup_cpu(int cpu)
 {
 	if (cpu != boot_cpuid)
 		xics_setup_cpu();
-- 
cgit v0.10.2


From 330dae19998072e97f550885a0087df6d6556816 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Sun, 30 Jun 2013 12:03:23 +0200
Subject: mac: Make cuda_init_via() __init

cuda_init_via() is called from find_via_cuda() only, which is __init.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/drivers/macintosh/via-cuda.c b/drivers/macintosh/via-cuda.c
index 86511c5..d61f271 100644
--- a/drivers/macintosh/via-cuda.c
+++ b/drivers/macintosh/via-cuda.c
@@ -259,7 +259,7 @@ cuda_probe(void)
     } while (0)
 
 static int
-cuda_init_via(void)
+__init cuda_init_via(void)
 {
     out_8(&via[DIRB], (in_8(&via[DIRB]) | TACK | TIP) & ~TREQ);	/* TACK & TIP out */
     out_8(&via[B], in_8(&via[B]) | TACK | TIP);			/* negate them */
-- 
cgit v0.10.2


From cce606feb425093c8371089d392e336d186e125b Mon Sep 17 00:00:00 2001
From: Li Zhong <zhong@linux.vnet.ibm.com>
Date: Thu, 16 May 2013 18:20:26 +0800
Subject: powerpc: Set cpu sibling mask before online cpu

It seems following race is possible:

	cpu0					cpux
smp_init->cpu_up->_cpu_up
	__cpu_up
		kick_cpu(1)
-------------------------------------------------------------------------
		waiting online			...
		...				notify CPU_STARTING
							set cpux active
						set cpux online
-------------------------------------------------------------------------
		finish waiting online
		...
sched_init_smp
	init_sched_domains(cpu_active_mask)
		build_sched_domains
						set cpux sibling info
-------------------------------------------------------------------------

Execution of cpu0 and cpux could be concurrent between two separator
lines.

So if the cpux sibling information was set too late (normally
impossible, but could be triggered by adding some delay in
start_secondary, after setting cpu online), build_sched_domains()
running on cpu0 might see cpux active, with an empty sibling mask, then
cause some bad address accessing like following:

[    0.099855] Unable to handle kernel paging request for data at address 0xc00000038518078f
[    0.099868] Faulting instruction address: 0xc0000000000b7a64
[    0.099883] Oops: Kernel access of bad area, sig: 11 [#1]
[    0.099895] PREEMPT SMP NR_CPUS=16 DEBUG_PAGEALLOC NUMA pSeries
[    0.099922] Modules linked in:
[    0.099940] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.10.0-rc1-00120-gb973425-dirty #16
[    0.099956] task: c0000001fed80000 ti: c0000001fed7c000 task.ti: c0000001fed7c000
[    0.099971] NIP: c0000000000b7a64 LR: c0000000000b7a40 CTR: c0000000000b4934
[    0.099985] REGS: c0000001fed7f760 TRAP: 0300   Not tainted  (3.10.0-rc1-00120-gb973425-dirty)
[    0.099997] MSR: 8000000000009032 <SF,EE,ME,IR,DR,RI>  CR: 24272828  XER: 20000003
[    0.100045] SOFTE: 1
[    0.100053] CFAR: c000000000445ee8
[    0.100064] DAR: c00000038518078f, DSISR: 40000000
[    0.100073]
GPR00: 0000000000000080 c0000001fed7f9e0 c000000000c84d48 0000000000000010
GPR04: 0000000000000010 0000000000000000 c0000001fc55e090 0000000000000000
GPR08: ffffffffffffffff c000000000b80b30 c000000000c962d8 00000003845ffc5f
GPR12: 0000000000000000 c00000000f33d000 c00000000000b9e4 0000000000000000
GPR16: 0000000000000000 0000000000000000 0000000000000001 0000000000000000
GPR20: c000000000ccf750 0000000000000000 c000000000c94d48 c0000001fc504000
GPR24: c0000001fc504000 c0000001fecef848 c000000000c94d48 c000000000ccf000
GPR28: c0000001fc522090 0000000000000010 c0000001fecef848 c0000001fed7fae0
[    0.100293] NIP [c0000000000b7a64] .get_group+0x84/0xc4
[    0.100307] LR [c0000000000b7a40] .get_group+0x60/0xc4
[    0.100318] Call Trace:
[    0.100332] [c0000001fed7f9e0] [c0000000000dbce4] .lock_is_held+0xa8/0xd0 (unreliable)
[    0.100354] [c0000001fed7fa70] [c0000000000bf62c] .build_sched_domains+0x728/0xd14
[    0.100375] [c0000001fed7fbe0] [c000000000af67bc] .sched_init_smp+0x4fc/0x654
[    0.100394] [c0000001fed7fce0] [c000000000adce24] .kernel_init_freeable+0x17c/0x30c
[    0.100413] [c0000001fed7fdb0] [c00000000000ba08] .kernel_init+0x24/0x12c
[    0.100431] [c0000001fed7fe30] [c000000000009f74] .ret_from_kernel_thread+0x5c/0x68
[    0.100445] Instruction dump:
[    0.100456] 38800010 38a00000 4838e3f5 60000000 7c6307b4 2fbf0000 419e0040 3d220001
[    0.100496] 78601f24 39491590 e93e0008 7d6a002a <7d69582a> f97f0000 7d4a002a e93e0010
[    0.100559] ---[ end trace 31fd0ba7d8756001 ]---

This patch tries to move the sibling maps updating before
notify_cpu_starting() and cpu online, and a write barrier there to make
sure sibling maps are updated before active and online mask.

Signed-off-by: Li Zhong <zhong@linux.vnet.ibm.com>
Reviewed-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 85398c7..38b0ba6 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -637,12 +637,10 @@ void start_secondary(void *unused)
 
 	vdso_getcpu_init();
 #endif
-	notify_cpu_starting(cpu);
-	set_cpu_online(cpu, true);
 	/* Update sibling maps */
 	base = cpu_first_thread_sibling(cpu);
 	for (i = 0; i < threads_per_core; i++) {
-		if (cpu_is_offline(base + i))
+		if (cpu_is_offline(base + i) && (cpu != base + i))
 			continue;
 		cpumask_set_cpu(cpu, cpu_sibling_mask(base + i));
 		cpumask_set_cpu(base + i, cpu_sibling_mask(cpu));
@@ -667,6 +665,10 @@ void start_secondary(void *unused)
 	}
 	of_node_put(l2_cache);
 
+	smp_wmb();
+	notify_cpu_starting(cpu);
+	set_cpu_online(cpu, true);
+
 	local_irq_enable();
 
 	cpu_startup_entry(CPUHP_ONLINE);
-- 
cgit v0.10.2


From 7029705a9d0544186c29ae09708b3e5adb512835 Mon Sep 17 00:00:00 2001
From: Chen Gang <gang.chen@asianux.com>
Date: Tue, 21 May 2013 17:20:50 +0800
Subject: powerpc/nvram64: Need return the related error code on failure occurs

When error occurs, need return the related error code to let upper
caller know about it.

ppc_md.nvram_size() can return the error code (e.g. core99_nvram_size()
in 'arch/powerpc/platforms/powermac/nvram.c').

Also set ret value when only need it, so can save structions for normal
cases.

Signed-off-by: Chen Gang <gang.chen@asianux.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index 48fbc2b..8213ee1 100644
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -84,22 +84,30 @@ static ssize_t dev_nvram_read(struct file *file, char __user *buf,
 	char *tmp = NULL;
 	ssize_t size;
 
-	ret = -ENODEV;
-	if (!ppc_md.nvram_size)
+	if (!ppc_md.nvram_size) {
+		ret = -ENODEV;
 		goto out;
+	}
 
-	ret = 0;
 	size = ppc_md.nvram_size();
-	if (*ppos >= size || size < 0)
+	if (size < 0) {
+		ret = size;
+		goto out;
+	}
+
+	if (*ppos >= size) {
+		ret = 0;
 		goto out;
+	}
 
 	count = min_t(size_t, count, size - *ppos);
 	count = min(count, PAGE_SIZE);
 
-	ret = -ENOMEM;
 	tmp = kmalloc(count, GFP_KERNEL);
-	if (!tmp)
+	if (!tmp) {
+		ret = -ENOMEM;
 		goto out;
+	}
 
 	ret = ppc_md.nvram_read(tmp, count, ppos);
 	if (ret <= 0)
-- 
cgit v0.10.2


From 91f5af2e6524f795c0ee5d2fdc3be40d0e427ae2 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa@the-dreams.de>
Date: Tue, 21 May 2013 20:45:40 +0200
Subject: macintosh/windfarm: Remove obsolete cleanup for clientdata

A few new i2c-drivers came into the kernel which clear the clientdata-pointer
on exit or error. This is obsolete meanwhile, the core will do it.

Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/drivers/macintosh/windfarm_smu_sat.c b/drivers/macintosh/windfarm_smu_sat.c
index d87f5ee..ad6223e 100644
--- a/drivers/macintosh/windfarm_smu_sat.c
+++ b/drivers/macintosh/windfarm_smu_sat.c
@@ -343,7 +343,6 @@ static int wf_sat_remove(struct i2c_client *client)
 		wf_unregister_sensor(&sens->sens);
 	}
 	sat->i2c = NULL;
-	i2c_set_clientdata(client, NULL);
 	kref_put(&sat->ref, wf_sat_release);
 
 	return 0;
-- 
cgit v0.10.2


From 8246aca7058f3f2c2ae503081777965cd8df7b90 Mon Sep 17 00:00:00 2001
From: Chen Gang <gang.chen@asianux.com>
Date: Wed, 20 Mar 2013 14:30:12 +0800
Subject: powerpc/smp: Section mismatch from smp_release_cpus to __initdata
 spinning_secondaries

the smp_release_cpus is a normal funciton and called in normal environments,
  but it calls the __initdata spinning_secondaries.
  need modify spinning_secondaries to match smp_release_cpus.

the related warning:
  (the linker report boot_paca.33377, but it should be spinning_secondaries)

-----------------------------------------------------------------------------

WARNING: arch/powerpc/kernel/built-in.o(.text+0x23176): Section mismatch in reference from the function .smp_release_cpus() to the variable .init.data:boot_paca.33377
The function .smp_release_cpus() references
the variable __initdata boot_paca.33377.
This is often because .smp_release_cpus lacks a __initdata
annotation or the annotation of boot_paca.33377 is wrong.

WARNING: arch/powerpc/kernel/built-in.o(.text+0x231fe): Section mismatch in reference from the function .smp_release_cpus() to the variable .init.data:boot_paca.33377
The function .smp_release_cpus() references
the variable __initdata boot_paca.33377.
This is often because .smp_release_cpus lacks a __initdata
annotation or the annotation of boot_paca.33377 is wrong.

-----------------------------------------------------------------------------

Signed-off-by: Chen Gang <gang.chen@asianux.com>
CC: <stable@vger.kernel.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index e379d3f..389fb807 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -76,7 +76,7 @@
 #endif
 
 int boot_cpuid = 0;
-int __initdata spinning_secondaries;
+int spinning_secondaries;
 u64 ppc64_pft_size;
 
 /* Pick defaults since we might want to patch instructions
-- 
cgit v0.10.2


From ec207dcc91c62aeccb65b36c7764076f5e5aaddb Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Fri, 28 Jun 2013 21:12:14 +0800
Subject: powerpc/eeh: Update MAINTAINERS

Update MAINTAINERS to reflect recent changes.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/MAINTAINERS b/MAINTAINERS
index 5be702c..ebeceeb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3103,6 +3103,13 @@ M:	Maxim Levitsky <maximlevitsky@gmail.com>
 S:	Maintained
 F:	drivers/media/rc/ene_ir.*
 
+ENHANCED ERROR HANDLING (EEH)
+M:	Gavin Shan <shangw@linux.vnet.ibm.com>
+L:	linuxppc-dev@lists.ozlabs.org
+S:	Supported
+F:	Documentation/powerpc/eeh-pci-error-recovery.txt
+F:	arch/powerpc/kernel/eeh*.c
+
 EPSON S1D13XXX FRAMEBUFFER DRIVER
 M:	Kristoffer Ericson <kristoffer.ericson@gmail.com>
 S:	Maintained
@@ -6149,7 +6156,6 @@ M:	Linas Vepstas <linasvepstas@gmail.com>
 L:	linux-pci@vger.kernel.org
 S:	Supported
 F:	Documentation/PCI/pci-error-recovery.txt
-F:	Documentation/powerpc/eeh-pci-error-recovery.txt
 
 PCI SUBSYSTEM
 M:	Bjorn Helgaas <bhelgaas@google.com>
-- 
cgit v0.10.2


From dd023217e17e72b46fb4d49c7734c426938c3dba Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Mon, 24 Jun 2013 22:08:05 -0500
Subject: powerpc/numa: Do not update sysfs cpu registration from invalid
 context

The topology update code that updates the cpu node registration in sysfs
should not be called while in stop_machine(). The register/unregister
calls take a lock and may sleep.

This patch moves these calls outside of the call to stop_machine().

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
CC: <stable@vger.kernel.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index c792cd9..0839721 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1432,11 +1432,9 @@ static int update_cpu_topology(void *data)
 		if (cpu != update->cpu)
 			continue;
 
-		unregister_cpu_under_node(update->cpu, update->old_nid);
 		unmap_cpu_from_node(update->cpu);
 		map_cpu_to_node(update->cpu, update->new_nid);
 		vdso_getcpu_init();
-		register_cpu_under_node(update->cpu, update->new_nid);
 	}
 
 	return 0;
@@ -1484,6 +1482,9 @@ int arch_update_cpu_topology(void)
 	stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
 
 	for (ud = &updates[0]; ud; ud = ud->next) {
+		unregister_cpu_under_node(ud->cpu, ud->old_nid);
+		register_cpu_under_node(ud->cpu, ud->new_nid);
+
 		dev = get_cpu_device(ud->cpu);
 		if (dev)
 			kobject_uevent(&dev->kobj, KOBJ_CHANGE);
-- 
cgit v0.10.2


From 1d567cb4bd42d560a7621cac6f6aebe87343689e Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Tue, 25 Jun 2013 17:47:54 +1000
Subject: powerpc: Remove unreachable relocation on exception handlers

We have relocation on exception handlers defined for h_data_storage and
h_instr_storage. However we will never take relocation on exceptions for
these because they can only come from a guest, and we never take
relocation on exceptions when we transition from guest to host.

We also have a handler for hmi_exception (Hypervisor Maintenance) which
is defined in the architecture to never be delivered with relocation on,
see see v2.07 Book III-S section 6.5.

So remove the handlers, leaving a branch to self just to be double extra
paranoid.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
CC: <stable@vger.kernel.org> [v3.9+]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index e783453..dcadf03 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -793,14 +793,10 @@ system_call_relon_pSeries:
 	STD_RELON_EXCEPTION_PSERIES(0x4d00, 0xd00, single_step)
 
 	. = 0x4e00
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	h_data_storage_relon_hv
+	b	.	/* Can't happen, see v2.07 Book III-S section 6.5 */
 
 	. = 0x4e20
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	h_instr_storage_relon_hv
+	b	.	/* Can't happen, see v2.07 Book III-S section 6.5 */
 
 	. = 0x4e40
 	SET_SCRATCH0(r13)
@@ -808,9 +804,7 @@ system_call_relon_pSeries:
 	b	emulation_assist_relon_hv
 
 	. = 0x4e60
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	hmi_exception_relon_hv
+	b	.	/* Can't happen, see v2.07 Book III-S section 6.5 */
 
 	. = 0x4e80
 	SET_SCRATCH0(r13)
@@ -1180,14 +1174,8 @@ tm_unavailable_common:
 __end_handlers:
 
 	/* Equivalents to the above handlers for relocation-on interrupt vectors */
-	STD_RELON_EXCEPTION_HV_OOL(0xe00, h_data_storage)
-	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe00)
-	STD_RELON_EXCEPTION_HV_OOL(0xe20, h_instr_storage)
-	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe20)
 	STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
 	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe40)
-	STD_RELON_EXCEPTION_HV_OOL(0xe60, hmi_exception)
-	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe60)
 	MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell)
 	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe80)
 
-- 
cgit v0.10.2


From c9f69518e5f08170bc857984a077f693d63171df Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Tue, 25 Jun 2013 17:47:55 +1000
Subject: powerpc: Remove KVMTEST from RELON exception handlers

KVMTEST is a macro which checks whether we are taking an exception from
guest context, if so we branch out of line and eventually call into the
KVM code to handle the switch.

When running real guests on bare metal (HV KVM) the hardware ensures
that we never take a relocation on exception when transitioning from
guest to host. For PR KVM we disable relocation on exceptions ourself in
kvmppc_core_init_vm(), as of commit a413f47 "Disable relocation on
exceptions whenever PR KVM is active".

So convert all the RELON macros to use NOTEST, and drop the remaining
KVM_HANDLER() definitions we have for 0xe40 and 0xe80.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
CC: <stable@vger.kernel.org> [v3.9+]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 8e5fae8..484cb09 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -358,12 +358,12 @@ label##_relon_pSeries:					\
 	/* No guest interrupts come through here */	\
 	SET_SCRATCH0(r13);		/* save r13 */	\
 	EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label##_common, \
-				       EXC_STD, KVMTEST_PR, vec)
+				       EXC_STD, NOTEST, vec)
 
 #define STD_RELON_EXCEPTION_PSERIES_OOL(vec, label)		\
 	.globl label##_relon_pSeries;				\
 label##_relon_pSeries:						\
-	EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_PR, vec);	\
+	EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, vec);		\
 	EXCEPTION_RELON_PROLOG_PSERIES_1(label##_common, EXC_STD)
 
 #define STD_RELON_EXCEPTION_HV(loc, vec, label)		\
@@ -374,12 +374,12 @@ label##_relon_hv:					\
 	/* No guest interrupts come through here */	\
 	SET_SCRATCH0(r13);	/* save r13 */		\
 	EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label##_common, \
-				       EXC_HV, KVMTEST, vec)
+				       EXC_HV, NOTEST, vec)
 
 #define STD_RELON_EXCEPTION_HV_OOL(vec, label)			\
 	.globl label##_relon_hv;				\
 label##_relon_hv:						\
-	EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST, vec);		\
+	EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, vec);		\
 	EXCEPTION_RELON_PROLOG_PSERIES_1(label##_common, EXC_HV)
 
 /* This associate vector numbers with bits in paca->irq_happened */
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index dcadf03..89eba1f 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1175,9 +1175,7 @@ __end_handlers:
 
 	/* Equivalents to the above handlers for relocation-on interrupt vectors */
 	STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
-	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe40)
 	MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell)
-	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe80)
 
 	STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor)
 	STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable)
-- 
cgit v0.10.2


From 021424a1fce335e05807fd770eb8e1da30a63eea Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michaele@au1.ibm.com>
Date: Tue, 25 Jun 2013 17:47:56 +1000
Subject: powerpc: Rename and flesh out the facility unavailable exception
 handler

The exception at 0xf60 is not the TM (Transactional Memory) unavailable
exception, it is the "Facility Unavailable Exception", rename it as
such.

Flesh out the handler to acknowledge the fact that it can be called for
many reasons, one of which is TM being unavailable.

Use STD_EXCEPTION_COMMON() for the exception body, for some reason we
had it open-coded, I've checked the generated code is identical.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
CC: <stable@vger.kernel.org> [v3.10]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 89eba1f..6ee74f8 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -341,10 +341,11 @@ vsx_unavailable_pSeries_1:
 	EXCEPTION_PROLOG_0(PACA_EXGEN)
 	b	vsx_unavailable_pSeries
 
+facility_unavailable_trampoline:
 	. = 0xf60
 	SET_SCRATCH0(r13)
 	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	tm_unavailable_pSeries
+	b	facility_unavailable_pSeries
 
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error)
@@ -522,7 +523,7 @@ denorm_done:
 	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf20)
 	STD_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable)
 	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40)
-	STD_EXCEPTION_PSERIES_OOL(0xf60, tm_unavailable)
+	STD_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
 	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf60)
 
 /*
@@ -829,11 +830,11 @@ vsx_unavailable_relon_pSeries_1:
 	EXCEPTION_PROLOG_0(PACA_EXGEN)
 	b	vsx_unavailable_relon_pSeries
 
-tm_unavailable_relon_pSeries_1:
+facility_unavailable_relon_trampoline:
 	. = 0x4f60
 	SET_SCRATCH0(r13)
 	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	tm_unavailable_relon_pSeries
+	b	facility_unavailable_relon_pSeries
 
 	STD_RELON_EXCEPTION_PSERIES(0x5300, 0x1300, instruction_breakpoint)
 #ifdef CONFIG_PPC_DENORMALISATION
@@ -1159,15 +1160,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	bl	.vsx_unavailable_exception
 	b	.ret_from_except
 
-	.align	7
-	.globl tm_unavailable_common
-tm_unavailable_common:
-	EXCEPTION_PROLOG_COMMON(0xf60, PACA_EXGEN)
-	bl	.save_nvgprs
-	DISABLE_INTS
-	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	.tm_unavailable_exception
-	b	.ret_from_except
+	STD_EXCEPTION_COMMON(0xf60, facility_unavailable, .facility_unavailable_exception)
 
 	.align	7
 	.globl	__end_handlers
@@ -1180,7 +1173,7 @@ __end_handlers:
 	STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor)
 	STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable)
 	STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable)
-	STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, tm_unavailable)
+	STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
 
 #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
 /*
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 300daf3..11f4488 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1286,25 +1286,42 @@ void vsx_unavailable_exception(struct pt_regs *regs)
 	die("Unrecoverable VSX Unavailable Exception", regs, SIGABRT);
 }
 
-void tm_unavailable_exception(struct pt_regs *regs)
+void facility_unavailable_exception(struct pt_regs *regs)
 {
+	static char *facility_strings[] = {
+		"FPU",
+		"VMX/VSX",
+		"DSCR",
+		"PMU SPRs",
+		"BHRB",
+		"TM",
+		"AT",
+		"EBB",
+		"TAR",
+	};
+	char *facility;
+	u64 value;
+
+	value = mfspr(SPRN_FSCR) >> 56;
+
 	/* We restore the interrupt state now */
 	if (!arch_irq_disabled_regs(regs))
 		local_irq_enable();
 
-	/* Currently we never expect a TMU exception.  Catch
-	 * this and kill the process!
-	 */
-	printk(KERN_EMERG "Unexpected TM unavailable exception at %lx "
-	       "(msr %lx)\n",
-	       regs->nip, regs->msr);
+	if (value < ARRAY_SIZE(facility_strings))
+		facility = facility_strings[value];
+	else
+		facility = "unknown";
+
+	pr_err("Facility '%s' unavailable, exception at 0x%lx, MSR=%lx\n",
+		facility, regs->nip, regs->msr);
 
 	if (user_mode(regs)) {
 		_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
 		return;
 	}
 
-	die("Unexpected TM unavailable exception", regs, SIGABRT);
+	die("Unexpected facility unavailable exception", regs, SIGABRT);
 }
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-- 
cgit v0.10.2


From b14b6260efeee6eb8942c6e6420e31281892acb6 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Tue, 25 Jun 2013 17:47:57 +1000
Subject: powerpc: Wire up the HV facility unavailable exception

Similar to the facility unavailble exception, except the facilities are
controlled by HFSCR.

Adapt the facility_unavailable_exception() so it can be called for
either the regular or Hypervisor facility unavailable exceptions.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
CC: <stable@vger.kernel.org> [v3.10]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 6ee74f8..359d949 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -347,6 +347,12 @@ facility_unavailable_trampoline:
 	EXCEPTION_PROLOG_0(PACA_EXGEN)
 	b	facility_unavailable_pSeries
 
+hv_facility_unavailable_trampoline:
+	. = 0xf80
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
+	b	facility_unavailable_hv
+
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error)
 	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
@@ -525,6 +531,8 @@ denorm_done:
 	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40)
 	STD_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
 	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf60)
+	STD_EXCEPTION_HV_OOL(0xf82, facility_unavailable)
+	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xf82)
 
 /*
  * An interrupt came in while soft-disabled. We set paca->irq_happened, then:
@@ -836,6 +844,12 @@ facility_unavailable_relon_trampoline:
 	EXCEPTION_PROLOG_0(PACA_EXGEN)
 	b	facility_unavailable_relon_pSeries
 
+hv_facility_unavailable_relon_trampoline:
+	. = 0x4f80
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
+	b	facility_unavailable_relon_hv
+
 	STD_RELON_EXCEPTION_PSERIES(0x5300, 0x1300, instruction_breakpoint)
 #ifdef CONFIG_PPC_DENORMALISATION
 	. = 0x5500
@@ -1174,6 +1188,7 @@ __end_handlers:
 	STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable)
 	STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable)
 	STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
+	STD_RELON_EXCEPTION_HV_OOL(0xf80, facility_unavailable)
 
 #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
 /*
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 11f4488..4a87fcb 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1299,10 +1299,18 @@ void facility_unavailable_exception(struct pt_regs *regs)
 		"EBB",
 		"TAR",
 	};
-	char *facility;
+	char *facility, *prefix;
 	u64 value;
 
-	value = mfspr(SPRN_FSCR) >> 56;
+	if (regs->trap == 0xf60) {
+		value = mfspr(SPRN_FSCR);
+		prefix = "";
+	} else {
+		value = mfspr(SPRN_HFSCR);
+		prefix = "Hypervisor ";
+	}
+
+	value = value >> 56;
 
 	/* We restore the interrupt state now */
 	if (!arch_irq_disabled_regs(regs))
@@ -1313,8 +1321,8 @@ void facility_unavailable_exception(struct pt_regs *regs)
 	else
 		facility = "unknown";
 
-	pr_err("Facility '%s' unavailable, exception at 0x%lx, MSR=%lx\n",
-		facility, regs->nip, regs->msr);
+	pr_err("%sFacility '%s' unavailable, exception at 0x%lx, MSR=%lx\n",
+		prefix, facility, regs->nip, regs->msr);
 
 	if (user_mode(regs)) {
 		_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
-- 
cgit v0.10.2


From d8bec4c9cd58f6d3679e09b7293851fb92ad7557 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Fri, 28 Jun 2013 18:15:10 +1000
Subject: powerpc/perf: Check that events only include valid bits on Power8

A mistake we have made in the past is that we pull out the fields we
need from the event code, but don't check that there are no unknown bits
set. This means that we can't ever assign meaning to those unknown bits
in future.

Although we have once again failed to do this at release, it is still
early days for Power8 so I think we can still slip this in and get away
with it.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
CC: <stable@vger.kernel.org> [v3.10]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
index f7d1c4f..84cdc6d 100644
--- a/arch/powerpc/perf/power8-pmu.c
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -109,6 +109,16 @@
 #define EVENT_IS_MARKED		(EVENT_MARKED_MASK << EVENT_MARKED_SHIFT)
 #define EVENT_PSEL_MASK		0xff	/* PMCxSEL value */
 
+#define EVENT_VALID_MASK	\
+	((EVENT_THRESH_MASK    << EVENT_THRESH_SHIFT)		|	\
+	 (EVENT_SAMPLE_MASK    << EVENT_SAMPLE_SHIFT)		|	\
+	 (EVENT_CACHE_SEL_MASK << EVENT_CACHE_SEL_SHIFT)	|	\
+	 (EVENT_PMC_MASK       << EVENT_PMC_SHIFT)		|	\
+	 (EVENT_UNIT_MASK      << EVENT_UNIT_SHIFT)		|	\
+	 (EVENT_COMBINE_MASK   << EVENT_COMBINE_SHIFT)		|	\
+	 (EVENT_MARKED_MASK    << EVENT_MARKED_SHIFT)		|	\
+	  EVENT_PSEL_MASK)
+
 /* MMCRA IFM bits - POWER8 */
 #define	POWER8_MMCRA_IFM1		0x0000000040000000UL
 #define	POWER8_MMCRA_IFM2		0x0000000080000000UL
@@ -212,6 +222,9 @@ static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long
 
 	mask = value = 0;
 
+	if (event & ~EVENT_VALID_MASK)
+		return -1;
+
 	pmc   = (event >> EVENT_PMC_SHIFT)       & EVENT_PMC_MASK;
 	unit  = (event >> EVENT_UNIT_SHIFT)      & EVENT_UNIT_MASK;
 	cache = (event >> EVENT_CACHE_SEL_SHIFT) & EVENT_CACHE_SEL_MASK;
-- 
cgit v0.10.2


From 378a6ee99e4a431ec84e4e61893445c041c93007 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Fri, 28 Jun 2013 18:15:11 +1000
Subject: powerpc/perf: Rework disable logic in pmu_disable()

In pmu_disable() we disable the PMU by setting the FC (Freeze Counters)
bit in MMCR0. In order to do this we have to read/modify/write MMCR0.

It's possible that we read a value from MMCR0 which has PMAO (PMU Alert
Occurred) set. When we write that value back it will cause an interrupt
to occur. We will then end up in the PMU interrupt handler even though
we are supposed to have just disabled the PMU.

We can avoid this by making sure we never write PMAO back. We should not
lose interrupts because when the PMU is re-enabled the overflowed values
will cause another interrupt.

We also reorder the clearing of SAMPLE_ENABLE so that is done after the
PMU is frozen. Otherwise there is a small window between the clearing of
SAMPLE_ENABLE and the setting of FC where we could take an interrupt and
incorrectly see SAMPLE_ENABLE not set. This would for example change the
logic in perf_read_regs().

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
CC: <stable@vger.kernel.org> [v3.10]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index af94a71..5d502bf 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -75,6 +75,7 @@ static unsigned int freeze_events_kernel = MMCR0_FCS;
 
 #define MMCR0_FCHV		0
 #define MMCR0_PMCjCE		MMCR0_PMCnCE
+#define MMCR0_PMAO		0
 
 #define SPRN_MMCRA		SPRN_MMCR2
 #define MMCRA_SAMPLE_ENABLE	0
@@ -852,7 +853,7 @@ static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
 static void power_pmu_disable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw;
-	unsigned long flags;
+	unsigned long flags, val;
 
 	if (!ppmu)
 		return;
@@ -860,9 +861,6 @@ static void power_pmu_disable(struct pmu *pmu)
 	cpuhw = &__get_cpu_var(cpu_hw_events);
 
 	if (!cpuhw->disabled) {
-		cpuhw->disabled = 1;
-		cpuhw->n_added = 0;
-
 		/*
 		 * Check if we ever enabled the PMU on this cpu.
 		 */
@@ -872,6 +870,21 @@ static void power_pmu_disable(struct pmu *pmu)
 		}
 
 		/*
+		 * Set the 'freeze counters' bit, clear PMAO.
+		 */
+		val  = mfspr(SPRN_MMCR0);
+		val |= MMCR0_FC;
+		val &= ~MMCR0_PMAO;
+
+		/*
+		 * The barrier is to make sure the mtspr has been
+		 * executed and the PMU has frozen the events etc.
+		 * before we return.
+		 */
+		write_mmcr0(cpuhw, val);
+		mb();
+
+		/*
 		 * Disable instruction sampling if it was enabled
 		 */
 		if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
@@ -880,14 +893,8 @@ static void power_pmu_disable(struct pmu *pmu)
 			mb();
 		}
 
-		/*
-		 * Set the 'freeze counters' bit.
-		 * The barrier is to make sure the mtspr has been
-		 * executed and the PMU has frozen the events
-		 * before we return.
-		 */
-		write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
-		mb();
+		cpuhw->disabled = 1;
+		cpuhw->n_added = 0;
 	}
 	local_irq_restore(flags);
 }
-- 
cgit v0.10.2


From 7a7a41f9d5b28ac3a916b057a7d3cd3f435ee9a6 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Fri, 28 Jun 2013 18:15:12 +1000
Subject: powerpc/perf: Freeze PMC5/6 if we're not using them

On Power8 we can freeze PMC5 and 6 if we're not using them. Normally they
run all the time.

As noticed by Anshuman, we should unfreeze them when we disable the PMU
as there are legacy tools which expect them to run all the time.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
CC: <stable@vger.kernel.org> [v3.10]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 4a9e408..362142b 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -626,6 +626,7 @@
 #define   MMCR0_TRIGGER	0x00002000UL /* TRIGGER enable */
 #define   MMCR0_PMAO	0x00000080UL /* performance monitor alert has occurred, set to 0 after handling exception */
 #define   MMCR0_SHRFC	0x00000040UL /* SHRre freeze conditions between threads */
+#define   MMCR0_FC56	0x00000010UL /* freeze counters 5 and 6 */
 #define   MMCR0_FCTI	0x00000008UL /* freeze counters in tags inactive mode */
 #define   MMCR0_FCTA	0x00000004UL /* freeze counters in tags active mode */
 #define   MMCR0_FCWAIT	0x00000002UL /* freeze counter in WAIT state */
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 5d502bf..517a135 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -75,6 +75,7 @@ static unsigned int freeze_events_kernel = MMCR0_FCS;
 
 #define MMCR0_FCHV		0
 #define MMCR0_PMCjCE		MMCR0_PMCnCE
+#define MMCR0_FC56		0
 #define MMCR0_PMAO		0
 
 #define SPRN_MMCRA		SPRN_MMCR2
@@ -870,11 +871,11 @@ static void power_pmu_disable(struct pmu *pmu)
 		}
 
 		/*
-		 * Set the 'freeze counters' bit, clear PMAO.
+		 * Set the 'freeze counters' bit, clear PMAO/FC56.
 		 */
 		val  = mfspr(SPRN_MMCR0);
 		val |= MMCR0_FC;
-		val &= ~MMCR0_PMAO;
+		val &= ~(MMCR0_PMAO | MMCR0_FC56);
 
 		/*
 		 * The barrier is to make sure the mtspr has been
diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
index 84cdc6d..d59f5b2 100644
--- a/arch/powerpc/perf/power8-pmu.c
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -391,6 +391,10 @@ static int power8_compute_mmcr(u64 event[], int n_ev,
 	if (pmc_inuse & 0x7c)
 		mmcr[0] |= MMCR0_PMCjCE;
 
+	/* If we're not using PMC 5 or 6, freeze them */
+	if (!(pmc_inuse & 0x60))
+		mmcr[0] |= MMCR0_FC56;
+
 	mmcr[1] = mmcr1;
 	mmcr[2] = mmcra;
 
-- 
cgit v0.10.2


From 0a48843d6c5114cfa4a9540ee4d6af87628cec01 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Fri, 28 Jun 2013 18:15:13 +1000
Subject: powerpc/perf: Use existing out label in power_pmu_enable()

In power_pmu_enable() we can use the existing out label to reduce the
number of return paths.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
CC: <stable@vger.kernel.org> [v3.10]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 517a135..1bb26d5 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -919,12 +919,13 @@ static void power_pmu_enable(struct pmu *pmu)
 
 	if (!ppmu)
 		return;
+
 	local_irq_save(flags);
+
 	cpuhw = &__get_cpu_var(cpu_hw_events);
-	if (!cpuhw->disabled) {
-		local_irq_restore(flags);
-		return;
-	}
+	if (!cpuhw->disabled)
+		goto out;
+
 	cpuhw->disabled = 0;
 
 	/*
-- 
cgit v0.10.2


From 4ea355b5368bde0574c12430df53334c4be3bdcf Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Fri, 28 Jun 2013 18:15:14 +1000
Subject: powerpc/perf: Don't enable if we have zero events

In power_pmu_enable() we still enable the PMU even if we have zero
events. This should have no effect but doesn't make much sense. Instead
just return after telling the hypervisor that we are not using the PMCs.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
CC: <stable@vger.kernel.org> [v3.10]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 1bb26d5..c91dc43 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -926,6 +926,11 @@ static void power_pmu_enable(struct pmu *pmu)
 	if (!cpuhw->disabled)
 		goto out;
 
+	if (cpuhw->n_events == 0) {
+		ppc_set_pmu_inuse(0);
+		goto out;
+	}
+
 	cpuhw->disabled = 0;
 
 	/*
@@ -937,8 +942,6 @@ static void power_pmu_enable(struct pmu *pmu)
 	if (!cpuhw->n_added) {
 		mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
 		mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
-		if (cpuhw->n_events == 0)
-			ppc_set_pmu_inuse(0);
 		goto out_enable;
 	}
 
-- 
cgit v0.10.2


From 2ac138ca21ad26c988ce7c91d27327f85beb7519 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Fri, 28 Jun 2013 18:15:15 +1000
Subject: powerpc/perf: Drop MMCRA from thread_struct

In commit 59affcd "Context switch more PMU related SPRs" I added more
PMU SPRs to thread_struct, later modified in commit b11ae95. To add
insult to injury it turns out we don't need to switch MMCRA as it's
only user readable, and the value is recomputed by the PMU code.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 9fe1129..3f19df3 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -289,7 +289,6 @@ struct thread_struct {
 	unsigned long	sier;
 	unsigned long	mmcr0;
 	unsigned long	mmcr2;
-	unsigned long	mmcra;
 #endif
 };
 
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index dc684b1..c7e8afc 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -132,7 +132,6 @@ int main(void)
 	DEFINE(THREAD_SIER, offsetof(struct thread_struct, sier));
 	DEFINE(THREAD_MMCR0, offsetof(struct thread_struct, mmcr0));
 	DEFINE(THREAD_MMCR2, offsetof(struct thread_struct, mmcr2));
-	DEFINE(THREAD_MMCRA, offsetof(struct thread_struct, mmcra));
 #endif
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	DEFINE(PACATMSCRATCH, offsetof(struct paca_struct, tm_scratch));
-- 
cgit v0.10.2


From 330a1eb7775ba876dbd46b9885556e57f705e3d4 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Fri, 28 Jun 2013 18:15:16 +1000
Subject: powerpc/perf: Core EBB support for 64-bit book3s

Add support for EBB (Event Based Branches) on 64-bit book3s. See the
included documentation for more details.

EBBs are a feature which allows the hardware to branch directly to a
specified user space address when a PMU event overflows. This can be
used by programs for self-monitoring with no kernel involvement in the
inner loop.

Most of the logic is in the generic book3s code, primarily to avoid a
proliferation of PMU callbacks.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/Documentation/powerpc/00-INDEX b/Documentation/powerpc/00-INDEX
index dd9e9280..05026ce 100644
--- a/Documentation/powerpc/00-INDEX
+++ b/Documentation/powerpc/00-INDEX
@@ -14,6 +14,8 @@ hvcs.txt
 	- IBM "Hypervisor Virtual Console Server" Installation Guide
 mpc52xx.txt
 	- Linux 2.6.x on MPC52xx family
+pmu-ebb.txt
+	- Description of the API for using the PMU with Event Based Branches.
 qe_firmware.txt
 	- describes the layout of firmware binaries for the Freescale QUICC
 	  Engine and the code that parses and uploads the microcode therein.
diff --git a/Documentation/powerpc/pmu-ebb.txt b/Documentation/powerpc/pmu-ebb.txt
new file mode 100644
index 0000000..73cd163
--- /dev/null
+++ b/Documentation/powerpc/pmu-ebb.txt
@@ -0,0 +1,137 @@
+PMU Event Based Branches
+========================
+
+Event Based Branches (EBBs) are a feature which allows the hardware to
+branch directly to a specified user space address when certain events occur.
+
+The full specification is available in Power ISA v2.07:
+
+  https://www.power.org/documentation/power-isa-version-2-07/
+
+One type of event for which EBBs can be configured is PMU exceptions. This
+document describes the API for configuring the Power PMU to generate EBBs,
+using the Linux perf_events API.
+
+
+Terminology
+-----------
+
+Throughout this document we will refer to an "EBB event" or "EBB events". This
+just refers to a struct perf_event which has set the "EBB" flag in its
+attr.config. All events which can be configured on the hardware PMU are
+possible "EBB events".
+
+
+Background
+----------
+
+When a PMU EBB occurs it is delivered to the currently running process. As such
+EBBs can only sensibly be used by programs for self-monitoring.
+
+It is a feature of the perf_events API that events can be created on other
+processes, subject to standard permission checks. This is also true of EBB
+events, however unless the target process enables EBBs (via mtspr(BESCR)) no
+EBBs will ever be delivered.
+
+This makes it possible for a process to enable EBBs for itself, but not
+actually configure any events. At a later time another process can come along
+and attach an EBB event to the process, which will then cause EBBs to be
+delivered to the first process. It's not clear if this is actually useful.
+
+
+When the PMU is configured for EBBs, all PMU interrupts are delivered to the
+user process. This means once an EBB event is scheduled on the PMU, no non-EBB
+events can be configured. This means that EBB events can not be run
+concurrently with regular 'perf' commands, or any other perf events.
+
+It is however safe to run 'perf' commands on a process which is using EBBs. The
+kernel will in general schedule the EBB event, and perf will be notified that
+its events could not run.
+
+The exclusion between EBB events and regular events is implemented using the
+existing "pinned" and "exclusive" attributes of perf_events. This means EBB
+events will be given priority over other events, unless they are also pinned.
+If an EBB event and a regular event are both pinned, then whichever is enabled
+first will be scheduled and the other will be put in error state. See the
+section below titled "Enabling an EBB event" for more information.
+
+
+Creating an EBB event
+---------------------
+
+To request that an event is counted using EBB, the event code should have bit
+63 set.
+
+EBB events must be created with a particular, and restrictive, set of
+attributes - this is so that they interoperate correctly with the rest of the
+perf_events subsystem.
+
+An EBB event must be created with the "pinned" and "exclusive" attributes set.
+Note that if you are creating a group of EBB events, only the leader can have
+these attributes set.
+
+An EBB event must NOT set any of the "inherit", "sample_period", "freq" or
+"enable_on_exec" attributes.
+
+An EBB event must be attached to a task. This is specified to perf_event_open()
+by passing a pid value, typically 0 indicating the current task.
+
+All events in a group must agree on whether they want EBB. That is all events
+must request EBB, or none may request EBB.
+
+EBB events must specify the PMC they are to be counted on. This ensures
+userspace is able to reliably determine which PMC the event is scheduled on.
+
+
+Enabling an EBB event
+---------------------
+
+Once an EBB event has been successfully opened, it must be enabled with the
+perf_events API. This can be achieved either via the ioctl() interface, or the
+prctl() interface.
+
+However, due to the design of the perf_events API, enabling an event does not
+guarantee that it has been scheduled on the PMU. To ensure that the EBB event
+has been scheduled on the PMU, you must perform a read() on the event. If the
+read() returns EOF, then the event has not been scheduled and EBBs are not
+enabled.
+
+This behaviour occurs because the EBB event is pinned and exclusive. When the
+EBB event is enabled it will force all other non-pinned events off the PMU. In
+this case the enable will be successful. However if there is already an event
+pinned on the PMU then the enable will not be successful.
+
+
+Reading an EBB event
+--------------------
+
+It is possible to read() from an EBB event. However the results are
+meaningless. Because interrupts are being delivered to the user process the
+kernel is not able to count the event, and so will return a junk value.
+
+
+Closing an EBB event
+--------------------
+
+When an EBB event is finished with, you can close it using close() as for any
+regular event. If this is the last EBB event the PMU will be deconfigured and
+no further PMU EBBs will be delivered.
+
+
+EBB Handler
+-----------
+
+The EBB handler is just regular userspace code, however it must be written in
+the style of an interrupt handler. When the handler is entered all registers
+are live (possibly) and so must be saved somehow before the handler can invoke
+other code.
+
+It's up to the program how to handle this. For C programs a relatively simple
+option is to create an interrupt frame on the stack and save registers there.
+
+Fork
+----
+
+EBB events are not inherited across fork. If the child process wishes to use
+EBBs it should open a new event for itself. Similarly the EBB state in
+BESCR/EBBHR/EBBRR is cleared across fork().
diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index f265049..2dd7bfc 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -60,6 +60,7 @@ struct power_pmu {
 #define PPMU_HAS_SSLOT		0x00000020 /* Has sampled slot in MMCRA */
 #define PPMU_HAS_SIER		0x00000040 /* Has SIER */
 #define PPMU_BHRB		0x00000080 /* has BHRB feature enabled */
+#define PPMU_EBB		0x00000100 /* supports event based branch */
 
 /*
  * Values for flags to get_alternatives()
@@ -68,6 +69,11 @@ struct power_pmu {
 #define PPMU_LIMITED_PMC_REQD	2	/* have to put this on a limited PMC */
 #define PPMU_ONLY_COUNT_RUN	4	/* only counting in run state */
 
+/*
+ * We use the event config bit 63 as a flag to request EBB.
+ */
+#define EVENT_CONFIG_EBB_SHIFT	63
+
 extern int register_power_pmu(struct power_pmu *);
 
 struct pt_regs;
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 3f19df3..47a35b0 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -287,8 +287,9 @@ struct thread_struct {
 	unsigned long	siar;
 	unsigned long	sdar;
 	unsigned long	sier;
-	unsigned long	mmcr0;
 	unsigned long	mmcr2;
+	unsigned 	mmcr0;
+	unsigned 	used_ebb;
 #endif
 };
 
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 362142b..5d7d9c2 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -621,6 +621,9 @@
 #define   MMCR0_PMXE	0x04000000UL /* performance monitor exception enable */
 #define   MMCR0_FCECE	0x02000000UL /* freeze ctrs on enabled cond or event */
 #define   MMCR0_TBEE	0x00400000UL /* time base exception enable */
+#define   MMCR0_EBE	0x00100000UL /* Event based branch enable */
+#define   MMCR0_PMCC	0x000c0000UL /* PMC control */
+#define   MMCR0_PMCC_U6	0x00080000UL /* PMC1-6 are R/W by user (PR) */
 #define   MMCR0_PMC1CE	0x00008000UL /* PMC1 count enable*/
 #define   MMCR0_PMCjCE	0x00004000UL /* PMCj count enable*/
 #define   MMCR0_TRIGGER	0x00002000UL /* TRIGGER enable */
@@ -674,6 +677,11 @@
 #define   SIER_SIAR_VALID	0x0400000	/* SIAR contents valid */
 #define   SIER_SDAR_VALID	0x0200000	/* SDAR contents valid */
 
+/* When EBB is enabled, some of MMCR0/MMCR2/SIER are user accessible */
+#define MMCR0_USER_MASK	(MMCR0_FC | MMCR0_PMXE | MMCR0_PMAO)
+#define MMCR2_USER_MASK	0x4020100804020000UL /* (FC1P|FC2P|FC3P|FC4P|FC5P|FC6P) */
+#define SIER_USER_MASK	0x7fffffUL
+
 #define SPRN_PA6T_MMCR0 795
 #define   PA6T_MMCR0_EN0	0x0000000000000001UL
 #define   PA6T_MMCR0_EN1	0x0000000000000002UL
diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index 200d763..49a13e0 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -67,4 +67,18 @@ static inline void flush_spe_to_thread(struct task_struct *t)
 }
 #endif
 
+static inline void clear_task_ebb(struct task_struct *t)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+    /* EBB perf events are not inherited, so clear all EBB state. */
+    t->thread.bescr = 0;
+    t->thread.mmcr2 = 0;
+    t->thread.mmcr0 = 0;
+    t->thread.siar = 0;
+    t->thread.sdar = 0;
+    t->thread.sier = 0;
+    t->thread.used_ebb = 0;
+#endif
+}
+
 #endif /* _ASM_POWERPC_SWITCH_TO_H */
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index b0f3e3f..f8a76e6 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -916,7 +916,11 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 	flush_altivec_to_thread(src);
 	flush_vsx_to_thread(src);
 	flush_spe_to_thread(src);
+
 	*dst = *src;
+
+	clear_task_ebb(dst);
+
 	return 0;
 }
 
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index c91dc43..a3985ae 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -77,6 +77,9 @@ static unsigned int freeze_events_kernel = MMCR0_FCS;
 #define MMCR0_PMCjCE		MMCR0_PMCnCE
 #define MMCR0_FC56		0
 #define MMCR0_PMAO		0
+#define MMCR0_EBE		0
+#define MMCR0_PMCC		0
+#define MMCR0_PMCC_U6		0
 
 #define SPRN_MMCRA		SPRN_MMCR2
 #define MMCRA_SAMPLE_ENABLE	0
@@ -104,6 +107,15 @@ static inline int siar_valid(struct pt_regs *regs)
 	return 1;
 }
 
+static bool is_ebb_event(struct perf_event *event) { return false; }
+static int ebb_event_check(struct perf_event *event) { return 0; }
+static void ebb_event_add(struct perf_event *event) { }
+static void ebb_switch_out(unsigned long mmcr0) { }
+static unsigned long ebb_switch_in(bool ebb, unsigned long mmcr0)
+{
+	return mmcr0;
+}
+
 static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
 static inline void power_pmu_bhrb_disable(struct perf_event *event) {}
 void power_pmu_flush_branch_stack(void) {}
@@ -464,6 +476,89 @@ void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw)
 	return;
 }
 
+static bool is_ebb_event(struct perf_event *event)
+{
+	/*
+	 * This could be a per-PMU callback, but we'd rather avoid the cost. We
+	 * check that the PMU supports EBB, meaning those that don't can still
+	 * use bit 63 of the event code for something else if they wish.
+	 */
+	return (ppmu->flags & PPMU_EBB) &&
+	       ((event->attr.config >> EVENT_CONFIG_EBB_SHIFT) & 1);
+}
+
+static int ebb_event_check(struct perf_event *event)
+{
+	struct perf_event *leader = event->group_leader;
+
+	/* Event and group leader must agree on EBB */
+	if (is_ebb_event(leader) != is_ebb_event(event))
+		return -EINVAL;
+
+	if (is_ebb_event(event)) {
+		if (!(event->attach_state & PERF_ATTACH_TASK))
+			return -EINVAL;
+
+		if (!leader->attr.pinned || !leader->attr.exclusive)
+			return -EINVAL;
+
+		if (event->attr.inherit || event->attr.sample_period ||
+		    event->attr.enable_on_exec || event->attr.freq)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void ebb_event_add(struct perf_event *event)
+{
+	if (!is_ebb_event(event) || current->thread.used_ebb)
+		return;
+
+	/*
+	 * IFF this is the first time we've added an EBB event, set
+	 * PMXE in the user MMCR0 so we can detect when it's cleared by
+	 * userspace. We need this so that we can context switch while
+	 * userspace is in the EBB handler (where PMXE is 0).
+	 */
+	current->thread.used_ebb = 1;
+	current->thread.mmcr0 |= MMCR0_PMXE;
+}
+
+static void ebb_switch_out(unsigned long mmcr0)
+{
+	if (!(mmcr0 & MMCR0_EBE))
+		return;
+
+	current->thread.siar  = mfspr(SPRN_SIAR);
+	current->thread.sier  = mfspr(SPRN_SIER);
+	current->thread.sdar  = mfspr(SPRN_SDAR);
+	current->thread.mmcr0 = mmcr0 & MMCR0_USER_MASK;
+	current->thread.mmcr2 = mfspr(SPRN_MMCR2) & MMCR2_USER_MASK;
+}
+
+static unsigned long ebb_switch_in(bool ebb, unsigned long mmcr0)
+{
+	if (!ebb)
+		goto out;
+
+	/* Enable EBB and read/write to all 6 PMCs for userspace */
+	mmcr0 |= MMCR0_EBE | MMCR0_PMCC_U6;
+
+	/* Add any bits from the user reg, FC or PMAO */
+	mmcr0 |= current->thread.mmcr0;
+
+	/* Be careful not to set PMXE if userspace had it cleared */
+	if (!(current->thread.mmcr0 & MMCR0_PMXE))
+		mmcr0 &= ~MMCR0_PMXE;
+
+	mtspr(SPRN_SIAR, current->thread.siar);
+	mtspr(SPRN_SIER, current->thread.sier);
+	mtspr(SPRN_SDAR, current->thread.sdar);
+	mtspr(SPRN_MMCR2, current->thread.mmcr2);
+out:
+	return mmcr0;
+}
 #endif /* CONFIG_PPC64 */
 
 static void perf_event_interrupt(struct pt_regs *regs);
@@ -734,6 +829,13 @@ static void power_pmu_read(struct perf_event *event)
 
 	if (!event->hw.idx)
 		return;
+
+	if (is_ebb_event(event)) {
+		val = read_pmc(event->hw.idx);
+		local64_set(&event->hw.prev_count, val);
+		return;
+	}
+
 	/*
 	 * Performance monitor interrupts come even when interrupts
 	 * are soft-disabled, as long as interrupts are hard-enabled.
@@ -854,7 +956,7 @@ static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
 static void power_pmu_disable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw;
-	unsigned long flags, val;
+	unsigned long flags, mmcr0, val;
 
 	if (!ppmu)
 		return;
@@ -871,11 +973,11 @@ static void power_pmu_disable(struct pmu *pmu)
 		}
 
 		/*
-		 * Set the 'freeze counters' bit, clear PMAO/FC56.
+		 * Set the 'freeze counters' bit, clear EBE/PMCC/PMAO/FC56.
 		 */
-		val  = mfspr(SPRN_MMCR0);
+		val  = mmcr0 = mfspr(SPRN_MMCR0);
 		val |= MMCR0_FC;
-		val &= ~(MMCR0_PMAO | MMCR0_FC56);
+		val &= ~(MMCR0_EBE | MMCR0_PMCC | MMCR0_PMAO | MMCR0_FC56);
 
 		/*
 		 * The barrier is to make sure the mtspr has been
@@ -896,7 +998,10 @@ static void power_pmu_disable(struct pmu *pmu)
 
 		cpuhw->disabled = 1;
 		cpuhw->n_added = 0;
+
+		ebb_switch_out(mmcr0);
 	}
+
 	local_irq_restore(flags);
 }
 
@@ -911,15 +1016,15 @@ static void power_pmu_enable(struct pmu *pmu)
 	struct cpu_hw_events *cpuhw;
 	unsigned long flags;
 	long i;
-	unsigned long val;
+	unsigned long val, mmcr0;
 	s64 left;
 	unsigned int hwc_index[MAX_HWEVENTS];
 	int n_lim;
 	int idx;
+	bool ebb;
 
 	if (!ppmu)
 		return;
-
 	local_irq_save(flags);
 
 	cpuhw = &__get_cpu_var(cpu_hw_events);
@@ -934,6 +1039,13 @@ static void power_pmu_enable(struct pmu *pmu)
 	cpuhw->disabled = 0;
 
 	/*
+	 * EBB requires an exclusive group and all events must have the EBB
+	 * flag set, or not set, so we can just check a single event. Also we
+	 * know we have at least one event.
+	 */
+	ebb = is_ebb_event(cpuhw->event[0]);
+
+	/*
 	 * If we didn't change anything, or only removed events,
 	 * no need to recalculate MMCR* settings and reset the PMCs.
 	 * Just reenable the PMU with the current MMCR* settings
@@ -1008,25 +1120,34 @@ static void power_pmu_enable(struct pmu *pmu)
 			++n_lim;
 			continue;
 		}
-		val = 0;
-		if (event->hw.sample_period) {
-			left = local64_read(&event->hw.period_left);
-			if (left < 0x80000000L)
-				val = 0x80000000L - left;
+
+		if (ebb)
+			val = local64_read(&event->hw.prev_count);
+		else {
+			val = 0;
+			if (event->hw.sample_period) {
+				left = local64_read(&event->hw.period_left);
+				if (left < 0x80000000L)
+					val = 0x80000000L - left;
+			}
+			local64_set(&event->hw.prev_count, val);
 		}
-		local64_set(&event->hw.prev_count, val);
+
 		event->hw.idx = idx;
 		if (event->hw.state & PERF_HES_STOPPED)
 			val = 0;
 		write_pmc(idx, val);
+
 		perf_event_update_userpage(event);
 	}
 	cpuhw->n_limited = n_lim;
 	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
 
  out_enable:
+	mmcr0 = ebb_switch_in(ebb, cpuhw->mmcr[0]);
+
 	mb();
-	write_mmcr0(cpuhw, cpuhw->mmcr[0]);
+	write_mmcr0(cpuhw, mmcr0);
 
 	/*
 	 * Enable instruction sampling if necessary
@@ -1124,6 +1245,8 @@ static int power_pmu_add(struct perf_event *event, int ef_flags)
 	event->hw.config = cpuhw->events[n0];
 
 nocheck:
+	ebb_event_add(event);
+
 	++cpuhw->n_events;
 	++cpuhw->n_added;
 
@@ -1484,6 +1607,11 @@ static int power_pmu_event_init(struct perf_event *event)
 		}
 	}
 
+	/* Extra checks for EBB */
+	err = ebb_event_check(event);
+	if (err)
+		return err;
+
 	/*
 	 * If this is in a group, check if it can go on with all the
 	 * other hardware events in the group.  We assume the event
@@ -1523,6 +1651,13 @@ static int power_pmu_event_init(struct perf_event *event)
 	local64_set(&event->hw.period_left, event->hw.last_period);
 
 	/*
+	 * For EBB events we just context switch the PMC value, we don't do any
+	 * of the sample_period logic. We use hw.prev_count for this.
+	 */
+	if (is_ebb_event(event))
+		local64_set(&event->hw.prev_count, 0);
+
+	/*
 	 * See if we need to reserve the PMU.
 	 * If no events are currently in use, then we have to take a
 	 * mutex to ensure that we don't race with another task doing
-- 
cgit v0.10.2


From 4df489991182d3a9337c0a4b1563077c0004f1ba Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Fri, 28 Jun 2013 18:15:17 +1000
Subject: powerpc/perf: Add power8 EBB support

Add logic to the power8 PMU code to support EBB. Future processors would
also be expected to implement similar constraints. At that time we could
possibly factor these out into common code.

Finally mark the power8 PMU as supporting EBB, which is the actual
enable switch which allows EBBs to be configured.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
index d59f5b2..96a64d6 100644
--- a/arch/powerpc/perf/power8-pmu.c
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -31,9 +31,9 @@
  *
  *        60        56        52        48        44        40        36        32
  * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
- *                                     [      thresh_cmp     ]   [  thresh_ctl   ]
- *                                                                       |
- *                                       thresh start/stop OR FAB match -*
+ *   |                                 [      thresh_cmp     ]   [  thresh_ctl   ]
+ *   |                                                                   |
+ *   *- EBB (Linux)                      thresh start/stop OR FAB match -*
  *
  *        28        24        20        16        12         8         4         0
  * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
@@ -85,6 +85,7 @@
  *
  */
 
+#define EVENT_EBB_MASK		1ull
 #define EVENT_THR_CMP_SHIFT	40	/* Threshold CMP value */
 #define EVENT_THR_CMP_MASK	0x3ff
 #define EVENT_THR_CTL_SHIFT	32	/* Threshold control value (start/stop) */
@@ -117,6 +118,7 @@
 	 (EVENT_UNIT_MASK      << EVENT_UNIT_SHIFT)		|	\
 	 (EVENT_COMBINE_MASK   << EVENT_COMBINE_SHIFT)		|	\
 	 (EVENT_MARKED_MASK    << EVENT_MARKED_SHIFT)		|	\
+	 (EVENT_EBB_MASK       << EVENT_CONFIG_EBB_SHIFT)	|	\
 	  EVENT_PSEL_MASK)
 
 /* MMCRA IFM bits - POWER8 */
@@ -140,10 +142,10 @@
  *
  *        28        24        20        16        12         8         4         0
  * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
- *                       [ ]   [  sample ]   [     ]   [6] [5]   [4] [3]   [2] [1]
- *                        |                     |
- *      L1 I/D qualifier -*                     |      Count of events for each PMC.
- *                                              |        p1, p2, p3, p4, p5, p6.
+ *                   |   [ ]   [  sample ]   [     ]   [6] [5]   [4] [3]   [2] [1]
+ *              EBB -*    |                     |
+ *                        |                     |      Count of events for each PMC.
+ *      L1 I/D qualifier -*                     |        p1, p2, p3, p4, p5, p6.
  *                     nc - number of counters -*
  *
  * The PMC fields P1..P6, and NC, are adder fields. As we accumulate constraints
@@ -159,6 +161,9 @@
 #define CNST_THRESH_VAL(v)	(((v) & EVENT_THRESH_MASK) << 32)
 #define CNST_THRESH_MASK	CNST_THRESH_VAL(EVENT_THRESH_MASK)
 
+#define CNST_EBB_VAL(v)		(((v) & EVENT_EBB_MASK) << 24)
+#define CNST_EBB_MASK		CNST_EBB_VAL(EVENT_EBB_MASK)
+
 #define CNST_L1_QUAL_VAL(v)	(((v) & 3) << 22)
 #define CNST_L1_QUAL_MASK	CNST_L1_QUAL_VAL(3)
 
@@ -217,7 +222,7 @@ static inline bool event_is_fab_match(u64 event)
 
 static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
 {
-	unsigned int unit, pmc, cache;
+	unsigned int unit, pmc, cache, ebb;
 	unsigned long mask, value;
 
 	mask = value = 0;
@@ -225,9 +230,13 @@ static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long
 	if (event & ~EVENT_VALID_MASK)
 		return -1;
 
-	pmc   = (event >> EVENT_PMC_SHIFT)       & EVENT_PMC_MASK;
-	unit  = (event >> EVENT_UNIT_SHIFT)      & EVENT_UNIT_MASK;
-	cache = (event >> EVENT_CACHE_SEL_SHIFT) & EVENT_CACHE_SEL_MASK;
+	pmc   = (event >> EVENT_PMC_SHIFT)        & EVENT_PMC_MASK;
+	unit  = (event >> EVENT_UNIT_SHIFT)       & EVENT_UNIT_MASK;
+	cache = (event >> EVENT_CACHE_SEL_SHIFT)  & EVENT_CACHE_SEL_MASK;
+	ebb   = (event >> EVENT_CONFIG_EBB_SHIFT) & EVENT_EBB_MASK;
+
+	/* Clear the EBB bit in the event, so event checks work below */
+	event &= ~(EVENT_EBB_MASK << EVENT_CONFIG_EBB_SHIFT);
 
 	if (pmc) {
 		if (pmc > 6)
@@ -297,6 +306,18 @@ static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long
 		value |= CNST_THRESH_VAL(event >> EVENT_THRESH_SHIFT);
 	}
 
+	if (!pmc && ebb)
+		/* EBB events must specify the PMC */
+		return -1;
+
+	/*
+	 * All events must agree on EBB, either all request it or none.
+	 * EBB events are pinned & exclusive, so this should never actually
+	 * hit, but we leave it as a fallback in case.
+	 */
+	mask  |= CNST_EBB_VAL(ebb);
+	value |= CNST_EBB_MASK;
+
 	*maskp = mask;
 	*valp = value;
 
@@ -591,7 +612,7 @@ static struct power_pmu power8_pmu = {
 	.get_constraint		= power8_get_constraint,
 	.get_alternatives	= power8_get_alternatives,
 	.disable_pmc		= power8_disable_pmc,
-	.flags			= PPMU_HAS_SSLOT | PPMU_HAS_SIER | PPMU_BHRB,
+	.flags			= PPMU_HAS_SSLOT | PPMU_HAS_SIER | PPMU_BHRB | PPMU_EBB,
 	.n_generic		= ARRAY_SIZE(power8_generic_events),
 	.generic_events		= power8_generic_events,
 	.attr_groups		= power8_pmu_attr_groups,
-- 
cgit v0.10.2


From 6e0b8bc965d25a8e0701eaca3fca5941b4f4b2b2 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Fri, 28 Jun 2013 18:15:18 +1000
Subject: powerpc/pseries: Inform the hypervisor we are using EBB regs

On LPAR systems we need to inform the hypervisor that we are using the
EBB registers. We do this by setting a bit in the Virtual Processor Area
(VPA) - formerly known as the lppaca.

For now we do this always, ie. we do not dynamically enable/disable.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h
index b1e7f2a..9b12f88 100644
--- a/arch/powerpc/include/asm/lppaca.h
+++ b/arch/powerpc/include/asm/lppaca.h
@@ -66,7 +66,8 @@ struct lppaca {
 
 	u8	reserved6[48];
 	u8	cede_latency_hint;
-	u8	reserved7[7];
+	u8	ebb_regs_in_use;
+	u8	reserved7[6];
 	u8	dtl_enable_mask;	/* Dispatch Trace Log mask */
 	u8	donate_dedicated_cpu;	/* Donate dedicated CPU cycles */
 	u8	fpregs_in_use;
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index fd0f2f2..02d6e21 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -71,6 +71,9 @@ void vpa_init(int cpu)
 	if (cpu_has_feature(CPU_FTR_ALTIVEC))
 		lppaca_of(cpu).vmxregs_in_use = 1;
 
+	if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		lppaca_of(cpu).ebb_regs_in_use = 1;
+
 	addr = __pa(&lppaca_of(cpu));
 	ret = register_vpa(hwcpu, addr);
 
-- 
cgit v0.10.2


From 74251fe21bfa9310ddba9e0436d1fcf389e602ee Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Mon, 1 Jul 2013 17:54:09 +1000
Subject: powerpc/powernv: Fix iommu initialization again

So because those things always end up in trainwrecks... In 7846de406
we moved back the iommu initialization earlier, essentially undoing
37f02195b which was causing us endless trouble... except that in the
meantime we had merged 959c9bdd58 (to workaround the original breakage)
which is now ... broken :-)

This fixes it by doing a partial revert of the latter (we keep the
ppc_md. path which will be needed in the hotplug case, which happens
also during some EEH error recovery situations).

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: <stable@vger.kernel.org> [v3.10]

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index c393bf5..49b57b9 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -443,6 +443,17 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
 	set_iommu_table_base(&pdev->dev, &pe->tce32_table);
 }
 
+static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		set_iommu_table_base(&dev->dev, &pe->tce32_table);
+		if (dev->subordinate)
+			pnv_ioda_setup_bus_dma(pe, dev->subordinate);
+	}
+}
+
 static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
 					 u64 *startp, u64 *endp)
 {
@@ -599,6 +610,11 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	iommu_init_table(tbl, phb->hose->node);
 	iommu_register_group(tbl, pci_domain_nr(pe->pbus), pe->pe_number);
 
+	if (pe->pdev)
+		set_iommu_table_base(&pe->pdev->dev, tbl);
+	else
+		pnv_ioda_setup_bus_dma(pe, pe->pbus);
+
 	return;
  fail:
 	/* XXX Failure: Try to fallback to 64-bit only ? */
@@ -670,6 +686,11 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	}
 	iommu_init_table(tbl, phb->hose->node);
 
+	if (pe->pdev)
+		set_iommu_table_base(&pe->pdev->dev, tbl);
+	else
+		pnv_ioda_setup_bus_dma(pe, pe->pbus);
+
 	return;
 fail:
 	if (pe->tce32_seg >= 0)
-- 
cgit v0.10.2


From 6bbbca735936e15b9431882eceddcf6dff76e03c Mon Sep 17 00:00:00 2001
From: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Date: Thu, 27 Jun 2013 14:02:56 +0530
Subject: pstore: Pass header size in the pstore write callback

Header size is needed to distinguish between header and the dump data.
Incorporate the addition of new argument (hsize) in the pstore write
callback.

Signed-off-by: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 14cc486..3f0e7d6 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -502,6 +502,7 @@ static int nvram_pstore_open(struct pstore_info *psi)
  * @part:               pstore writes data to registered buffer in parts,
  *                      part number will indicate the same.
  * @count:              Indicates oops count
+ * @hsize:              Size of header added by pstore
  * @size:               number of bytes written to the registered buffer
  * @psi:                registered pstore_info structure
  *
@@ -512,7 +513,8 @@ static int nvram_pstore_open(struct pstore_info *psi)
 static int nvram_pstore_write(enum pstore_type_id type,
 				enum kmsg_dump_reason reason,
 				u64 *id, unsigned int part, int count,
-				size_t size, struct pstore_info *psi)
+				size_t hsize, size_t size,
+				struct pstore_info *psi)
 {
 	int rc;
 	struct oops_log_info *oops_hdr = (struct oops_log_info *) oops_buf;
diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index 6d894bf..a9cf960 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -935,7 +935,7 @@ static ssize_t erst_reader(u64 *id, enum pstore_type_id *type, int *count,
 			   struct timespec *time, char **buf,
 			   struct pstore_info *psi);
 static int erst_writer(enum pstore_type_id type, enum kmsg_dump_reason reason,
-		       u64 *id, unsigned int part, int count,
+		       u64 *id, unsigned int part, int count, size_t hsize,
 		       size_t size, struct pstore_info *psi);
 static int erst_clearer(enum pstore_type_id type, u64 id, int count,
 			struct timespec time, struct pstore_info *psi);
@@ -1055,7 +1055,7 @@ out:
 }
 
 static int erst_writer(enum pstore_type_id type, enum kmsg_dump_reason reason,
-		       u64 *id, unsigned int part, int count,
+		       u64 *id, unsigned int part, int count, size_t hsize,
 		       size_t size, struct pstore_info *psi)
 {
 	struct cper_pstore_record *rcd = (struct cper_pstore_record *)
diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c
index 202d2c8..452800e0 100644
--- a/drivers/firmware/efi/efi-pstore.c
+++ b/drivers/firmware/efi/efi-pstore.c
@@ -104,7 +104,7 @@ static ssize_t efi_pstore_read(u64 *id, enum pstore_type_id *type,
 
 static int efi_pstore_write(enum pstore_type_id type,
 		enum kmsg_dump_reason reason, u64 *id,
-		unsigned int part, int count, size_t size,
+		unsigned int part, int count, size_t hsize, size_t size,
 		struct pstore_info *psi)
 {
 	char name[DUMP_NAME_LEN];
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 86d1038..4637ec4 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -159,7 +159,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 			break;
 
 		ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
-				    oopscount, hsize + len, psinfo);
+				    oopscount, hsize, hsize + len, psinfo);
 		if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
 			pstore_new_entry = 1;
 
@@ -196,7 +196,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
 			spin_lock_irqsave(&psinfo->buf_lock, flags);
 		}
 		memcpy(psinfo->buf, s, c);
-		psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, c, psinfo);
+		psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, 0, c, psinfo);
 		spin_unlock_irqrestore(&psinfo->buf_lock, flags);
 		s += c;
 		c = e - s;
@@ -221,9 +221,11 @@ static void pstore_register_console(void) {}
 static int pstore_write_compat(enum pstore_type_id type,
 			       enum kmsg_dump_reason reason,
 			       u64 *id, unsigned int part, int count,
-			       size_t size, struct pstore_info *psi)
+			       size_t hsize, size_t size,
+			       struct pstore_info *psi)
 {
-	return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi);
+	return psi->write_buf(type, reason, id, part, psinfo->buf, hsize,
+			     size, psi);
 }
 
 /*
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 1376e5a..c6bb77c 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -195,7 +195,8 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz)
 static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
 					    enum kmsg_dump_reason reason,
 					    u64 *id, unsigned int part,
-					    const char *buf, size_t size,
+					    const char *buf,
+					    size_t hsize, size_t size,
 					    struct pstore_info *psi)
 {
 	struct ramoops_context *cxt = psi->data;
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index 656699f..4aa80ba 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -58,12 +58,12 @@ struct pstore_info {
 			struct pstore_info *psi);
 	int		(*write)(enum pstore_type_id type,
 			enum kmsg_dump_reason reason, u64 *id,
-			unsigned int part, int count, size_t size,
-			struct pstore_info *psi);
+			unsigned int part, int count, size_t hsize,
+			size_t size, struct pstore_info *psi);
 	int		(*write_buf)(enum pstore_type_id type,
 			enum kmsg_dump_reason reason, u64 *id,
-			unsigned int part, const char *buf, size_t size,
-			struct pstore_info *psi);
+			unsigned int part, const char *buf, size_t hsize,
+			size_t size, struct pstore_info *psi);
 	int		(*erase)(enum pstore_type_id type, u64 id,
 			int count, struct timespec time,
 			struct pstore_info *psi);
-- 
cgit v0.10.2


From fbfe86fc0c53911cf243479f7d26c37dcb1243f1 Mon Sep 17 00:00:00 2001
From: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Date: Thu, 27 Jun 2013 14:03:04 +0530
Subject: powerpc/pseries: Re-organise the oops compression code

nvram_compress() and zip_oops() is used by the nvram_pstore_write
API to compress oops messages hence re-organise the functions
accordingly to avoid forward declarations.

Signed-off-by: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 3f0e7d6..588bab5 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -486,6 +486,58 @@ static int clobbering_unread_rtas_event(void)
 						NVRAM_RTAS_READ_TIMEOUT);
 }
 
+/* Derived from logfs_compress() */
+static int nvram_compress(const void *in, void *out, size_t inlen,
+							size_t outlen)
+{
+	int err, ret;
+
+	ret = -EIO;
+	err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS,
+						MEM_LEVEL, Z_DEFAULT_STRATEGY);
+	if (err != Z_OK)
+		goto error;
+
+	stream.next_in = in;
+	stream.avail_in = inlen;
+	stream.total_in = 0;
+	stream.next_out = out;
+	stream.avail_out = outlen;
+	stream.total_out = 0;
+
+	err = zlib_deflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END)
+		goto error;
+
+	err = zlib_deflateEnd(&stream);
+	if (err != Z_OK)
+		goto error;
+
+	if (stream.total_out >= stream.total_in)
+		goto error;
+
+	ret = stream.total_out;
+error:
+	return ret;
+}
+
+/* Compress the text from big_oops_buf into oops_buf. */
+static int zip_oops(size_t text_len)
+{
+	struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf;
+	int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len,
+								oops_data_sz);
+	if (zipped_len < 0) {
+		pr_err("nvram: compression failed; returned %d\n", zipped_len);
+		pr_err("nvram: logging uncompressed oops/panic report\n");
+		return -1;
+	}
+	oops_hdr->version = OOPS_HDR_VERSION;
+	oops_hdr->report_length = (u16) zipped_len;
+	oops_hdr->timestamp = get_seconds();
+	return 0;
+}
+
 #ifdef CONFIG_PSTORE
 static int nvram_pstore_open(struct pstore_info *psi)
 {
@@ -759,58 +811,6 @@ int __init pSeries_nvram_init(void)
 }
 
 
-/* Derived from logfs_compress() */
-static int nvram_compress(const void *in, void *out, size_t inlen,
-							size_t outlen)
-{
-	int err, ret;
-
-	ret = -EIO;
-	err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS,
-						MEM_LEVEL, Z_DEFAULT_STRATEGY);
-	if (err != Z_OK)
-		goto error;
-
-	stream.next_in = in;
-	stream.avail_in = inlen;
-	stream.total_in = 0;
-	stream.next_out = out;
-	stream.avail_out = outlen;
-	stream.total_out = 0;
-
-	err = zlib_deflate(&stream, Z_FINISH);
-	if (err != Z_STREAM_END)
-		goto error;
-
-	err = zlib_deflateEnd(&stream);
-	if (err != Z_OK)
-		goto error;
-
-	if (stream.total_out >= stream.total_in)
-		goto error;
-
-	ret = stream.total_out;
-error:
-	return ret;
-}
-
-/* Compress the text from big_oops_buf into oops_buf. */
-static int zip_oops(size_t text_len)
-{
-	struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf;
-	int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len,
-								oops_data_sz);
-	if (zipped_len < 0) {
-		pr_err("nvram: compression failed; returned %d\n", zipped_len);
-		pr_err("nvram: logging uncompressed oops/panic report\n");
-		return -1;
-	}
-	oops_hdr->version = OOPS_HDR_VERSION;
-	oops_hdr->report_length = (u16) zipped_len;
-	oops_hdr->timestamp = get_seconds();
-	return 0;
-}
-
 /*
  * This is our kmsg_dump callback, called after an oops or panic report
  * has been written to the printk buffer.  We want to capture as much
-- 
cgit v0.10.2


From 40847e56609c21692ebe593abbf72c1e7f4af206 Mon Sep 17 00:00:00 2001
From: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Date: Thu, 27 Jun 2013 14:03:13 +0530
Subject: powerpc/pseries: Support compression of oops text via pstore

The patch set supports compression of oops messages while writing to NVRAM,
this helps in capturing more of oops data to lnx,oops-log. The pstore file
for oops messages will be in decompressed format making it readable.

In case compression fails, the patch takes care of copying the header added
by pstore and last oops_data_sz bytes of big_oops_buf to NVRAM so that we
have recent oops messages in lnx,oops-log.

In case decompression fails, it will result in absence of oops file but still
have files (in /dev/pstore) for other partitions.

Signed-off-by: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 588bab5..9f8671a 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -539,6 +539,65 @@ static int zip_oops(size_t text_len)
 }
 
 #ifdef CONFIG_PSTORE
+/* Derived from logfs_uncompress */
+int nvram_decompress(void *in, void *out, size_t inlen, size_t outlen)
+{
+	int err, ret;
+
+	ret = -EIO;
+	err = zlib_inflateInit(&stream);
+	if (err != Z_OK)
+		goto error;
+
+	stream.next_in = in;
+	stream.avail_in = inlen;
+	stream.total_in = 0;
+	stream.next_out = out;
+	stream.avail_out = outlen;
+	stream.total_out = 0;
+
+	err = zlib_inflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END)
+		goto error;
+
+	err = zlib_inflateEnd(&stream);
+	if (err != Z_OK)
+		goto error;
+
+	ret = stream.total_out;
+error:
+	return ret;
+}
+
+static int unzip_oops(char *oops_buf, char *big_buf)
+{
+	struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf;
+	u64 timestamp = oops_hdr->timestamp;
+	char *big_oops_data = NULL;
+	char *oops_data_buf = NULL;
+	size_t big_oops_data_sz;
+	int unzipped_len;
+
+	big_oops_data = big_buf + sizeof(struct oops_log_info);
+	big_oops_data_sz = big_oops_buf_sz - sizeof(struct oops_log_info);
+	oops_data_buf = oops_buf + sizeof(struct oops_log_info);
+
+	unzipped_len = nvram_decompress(oops_data_buf, big_oops_data,
+					oops_hdr->report_length,
+					big_oops_data_sz);
+
+	if (unzipped_len < 0) {
+		pr_err("nvram: decompression failed; returned %d\n",
+								unzipped_len);
+		return -1;
+	}
+	oops_hdr = (struct oops_log_info *)big_buf;
+	oops_hdr->version = OOPS_HDR_VERSION;
+	oops_hdr->report_length = (u16) unzipped_len;
+	oops_hdr->timestamp = timestamp;
+	return 0;
+}
+
 static int nvram_pstore_open(struct pstore_info *psi)
 {
 	/* Reset the iterator to start reading partitions again */
@@ -569,6 +628,7 @@ static int nvram_pstore_write(enum pstore_type_id type,
 				struct pstore_info *psi)
 {
 	int rc;
+	unsigned int err_type = ERR_TYPE_KERNEL_PANIC;
 	struct oops_log_info *oops_hdr = (struct oops_log_info *) oops_buf;
 
 	/* part 1 has the recent messages from printk buffer */
@@ -579,8 +639,30 @@ static int nvram_pstore_write(enum pstore_type_id type,
 	oops_hdr->version = OOPS_HDR_VERSION;
 	oops_hdr->report_length = (u16) size;
 	oops_hdr->timestamp = get_seconds();
+
+	if (big_oops_buf) {
+		rc = zip_oops(size);
+		/*
+		 * If compression fails copy recent log messages from
+		 * big_oops_buf to oops_data.
+		 */
+		if (rc != 0) {
+			size_t diff = size - oops_data_sz + hsize;
+
+			if (size > oops_data_sz) {
+				memcpy(oops_data, big_oops_buf, hsize);
+				memcpy(oops_data + hsize, big_oops_buf + diff,
+					oops_data_sz - hsize);
+
+				oops_hdr->report_length = (u16) oops_data_sz;
+			} else
+				memcpy(oops_data, big_oops_buf, size);
+		} else
+			err_type = ERR_TYPE_KERNEL_PANIC_GZ;
+	}
+
 	rc = nvram_write_os_partition(&oops_log_partition, oops_buf,
-		(int) (sizeof(*oops_hdr) + size), ERR_TYPE_KERNEL_PANIC,
+		(int) (sizeof(*oops_hdr) + oops_hdr->report_length), err_type,
 		count);
 
 	if (rc != 0)
@@ -602,10 +684,11 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type,
 	struct oops_log_info *oops_hdr;
 	unsigned int err_type, id_no, size = 0;
 	struct nvram_os_partition *part = NULL;
-	char *buff = NULL;
-	int sig = 0;
+	char *buff = NULL, *big_buff = NULL;
+	int rc, sig = 0;
 	loff_t p;
 
+read_partition:
 	read_type++;
 
 	switch (nvram_type_ids[read_type]) {
@@ -668,6 +751,25 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type,
 	if (nvram_type_ids[read_type] == PSTORE_TYPE_DMESG) {
 		oops_hdr = (struct oops_log_info *)buff;
 		*buf = buff + sizeof(*oops_hdr);
+
+		if (err_type == ERR_TYPE_KERNEL_PANIC_GZ) {
+			big_buff = kmalloc(big_oops_buf_sz, GFP_KERNEL);
+			if (!big_buff)
+				return -ENOMEM;
+
+			rc = unzip_oops(buff, big_buff);
+
+			if (rc != 0) {
+				kfree(buff);
+				kfree(big_buff);
+				goto read_partition;
+			}
+
+			oops_hdr = (struct oops_log_info *)big_buff;
+			*buf = big_buff + sizeof(*oops_hdr);
+			kfree(buff);
+		}
+
 		time->tv_sec = oops_hdr->timestamp;
 		time->tv_nsec = 0;
 		return oops_hdr->report_length;
@@ -689,17 +791,18 @@ static int nvram_pstore_init(void)
 {
 	int rc = 0;
 
-	nvram_pstore_info.buf = oops_data;
-	nvram_pstore_info.bufsize = oops_data_sz;
+	if (big_oops_buf) {
+		nvram_pstore_info.buf = big_oops_buf;
+		nvram_pstore_info.bufsize = big_oops_buf_sz;
+	} else {
+		nvram_pstore_info.buf = oops_data;
+		nvram_pstore_info.bufsize = oops_data_sz;
+	}
 
 	rc = pstore_register(&nvram_pstore_info);
 	if (rc != 0)
 		pr_err("nvram: pstore_register() failed, defaults to "
 				"kmsg_dump; returned %d\n", rc);
-	else
-		/*TODO: Support compression when pstore is configured */
-		pr_info("nvram: Compression of oops text supported only when "
-				"pstore is not configured");
 
 	return rc;
 }
@@ -733,11 +836,6 @@ static void __init nvram_init_oops_partition(int rtas_partition_exists)
 	oops_data = oops_buf + sizeof(struct oops_log_info);
 	oops_data_sz = oops_log_partition.size - sizeof(struct oops_log_info);
 
-	rc = nvram_pstore_init();
-
-	if (!rc)
-		return;
-
 	/*
 	 * Figure compression (preceded by elimination of each line's <n>
 	 * severity prefix) will reduce the oops/panic report to at most
@@ -761,6 +859,11 @@ static void __init nvram_init_oops_partition(int rtas_partition_exists)
 		stream.workspace = NULL;
 	}
 
+	rc = nvram_pstore_init();
+
+	if (!rc)
+		return;
+
 	rc = kmsg_dump_register(&nvram_kmsg_dumper);
 	if (rc != 0) {
 		pr_err("nvram: kmsg_dump_register() failed; returned %d\n", rc);
-- 
cgit v0.10.2


From e2a800beaca1f580945773e57d1a0e7cd37b1056 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Mon, 1 Jul 2013 14:19:50 +1000
Subject: powerpc/hw_brk: Fix off by one error when validating DAWR region end

The Data Address Watchpoint Register (DAWR) on POWER8 can take a 512
byte range but this range must not cross a 512 byte boundary.

Unfortunately we were off by one when calculating the end of the region,
hence we were not allowing some breakpoint regions which were actually
valid.  This fixes this error.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Reported-by: Edjunior Barbosa Machado <emachado@linux.vnet.ibm.com>
Cc: stable@vger.kernel.org # 3.9+
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index 1150ae7..f0b47d1 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -176,7 +176,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
 		length_max = 512 ; /* 64 doublewords */
 		/* DAWR region can't cross 512 boundary */
 		if ((bp->attr.bp_addr >> 10) != 
-		    ((bp->attr.bp_addr + bp->attr.bp_len) >> 10))
+		    ((bp->attr.bp_addr + bp->attr.bp_len - 1) >> 10))
 			return -EINVAL;
 	}
 	if (info->len >
-- 
cgit v0.10.2


From c039e3a8ddd52139d0f81711ecd757772f868b22 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Tue, 2 Jul 2013 08:13:52 +1000
Subject: powerpc: Handle both new style and old style reserve maps

When Jeremy introduced the new device-tree based reserve map, he made
the code in early_reserve_mem_dt() bail out if it found one, thus not
reserving the initrd nor processing the old style map.

I hit problems with variants of kexec that didn't put the initrd in
the new style map either. While these could/will be fixed, I believe
we should be safe here and rather reserve more than not enough.

We could have a firmware passing stuff via the new style map, and
in the middle, a kexec that knows nothing about it and adding other
things to the old style map.

I don't see a big issue with processing both and reserving everything
that needs to be. memblock_reserve() supports overlaps fine these days.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 9c753bc..eb23ac9 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -559,7 +559,7 @@ void __init early_init_dt_setup_initrd_arch(unsigned long start,
 }
 #endif
 
-static bool __init early_reserve_mem_dt(void)
+static void __init early_reserve_mem_dt(void)
 {
 	unsigned long i, len, dt_root;
 	const __be32 *prop;
@@ -569,7 +569,9 @@ static bool __init early_reserve_mem_dt(void)
 	prop = of_get_flat_dt_prop(dt_root, "reserved-ranges", &len);
 
 	if (!prop)
-		return false;
+		return;
+
+	DBG("Found new-style reserved-ranges\n");
 
 	/* Each reserved range is an (address,size) pair, 2 cells each,
 	 * totalling 4 cells per range. */
@@ -579,11 +581,11 @@ static bool __init early_reserve_mem_dt(void)
 		base = of_read_number(prop + (i * 4) + 0, 2);
 		size = of_read_number(prop + (i * 4) + 2, 2);
 
-		if (size)
+		if (size) {
+			DBG("reserving: %llx -> %llx\n", base, size);
 			memblock_reserve(base, size);
+		}
 	}
-
-	return true;
 }
 
 static void __init early_reserve_mem(void)
@@ -601,20 +603,16 @@ static void __init early_reserve_mem(void)
 	self_size = initial_boot_params->totalsize;
 	memblock_reserve(self_base, self_size);
 
-	/*
-	 * Try looking for reserved-regions property in the DT first; if
-	 * it's present, it'll contain all of the necessary reservation
-	 * info
-	 */
-	if (early_reserve_mem_dt())
-		return;
+	/* Look for the new "reserved-regions" property in the DT */
+	early_reserve_mem_dt();
 
 #ifdef CONFIG_BLK_DEV_INITRD
-	/* then reserve the initrd, if any */
-	if (initrd_start && (initrd_end > initrd_start))
+	/* Then reserve the initrd, if any */
+	if (initrd_start && (initrd_end > initrd_start)) {
 		memblock_reserve(_ALIGN_DOWN(__pa(initrd_start), PAGE_SIZE),
 			_ALIGN_UP(initrd_end, PAGE_SIZE) -
 			_ALIGN_DOWN(initrd_start, PAGE_SIZE));
+	}
 #endif /* CONFIG_BLK_DEV_INITRD */
 
 #ifdef CONFIG_PPC32
@@ -626,6 +624,8 @@ static void __init early_reserve_mem(void)
 		u32 base_32, size_32;
 		u32 *reserve_map_32 = (u32 *)reserve_map;
 
+		DBG("Found old 32-bit reserve map\n");
+
 		while (1) {
 			base_32 = *(reserve_map_32++);
 			size_32 = *(reserve_map_32++);
@@ -640,6 +640,9 @@ static void __init early_reserve_mem(void)
 		return;
 	}
 #endif
+	DBG("Processing reserve map\n");
+
+	/* Handle the reserve map in the fdt blob if it exists */
 	while (1) {
 		base = *(reserve_map++);
 		size = *(reserve_map++);
-- 
cgit v0.10.2


From 86d379690c3b005418fafc9afdcdfc731a043862 Mon Sep 17 00:00:00 2001
From: Hongtao Jia <hongtao.jia@freescale.com>
Date: Wed, 10 Apr 2013 10:52:55 +0800
Subject: powerpc/mpic: Add get_version API both for internal and external use

MPIC version is useful information for both mpic_alloc() and mpic_init().
The patch provide an API to get MPIC version for reusing the code.
Also, some other IP block may need MPIC version for their own use.
The API for external use is also provided.

Signed-off-by: Jia Hongtao <hongtao.jia@freescale.com>
Signed-off-by: Li Yang <leoli@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/include/asm/mpic.h b/arch/powerpc/include/asm/mpic.h
index c0f9ef9..ea6bf72 100644
--- a/arch/powerpc/include/asm/mpic.h
+++ b/arch/powerpc/include/asm/mpic.h
@@ -393,6 +393,9 @@ struct mpic
 #define	MPIC_REGSET_STANDARD		MPIC_REGSET(0)	/* Original MPIC */
 #define	MPIC_REGSET_TSI108		MPIC_REGSET(1)	/* Tsi108/109 PIC */
 
+/* Get the version of primary MPIC */
+extern u32 fsl_mpic_primary_get_version(void);
+
 /* Allocate the controller structure and setup the linux irq descs
  * for the range if interrupts passed in. No HW initialization is
  * actually performed.
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index 3cc2f91..1a4e19c 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -1173,10 +1173,33 @@ static struct irq_domain_ops mpic_host_ops = {
 	.xlate = mpic_host_xlate,
 };
 
+static u32 fsl_mpic_get_version(struct mpic *mpic)
+{
+	u32 brr1;
+
+	if (!(mpic->flags & MPIC_FSL))
+		return 0;
+
+	brr1 = _mpic_read(mpic->reg_type, &mpic->thiscpuregs,
+			MPIC_FSL_BRR1);
+
+	return brr1 & MPIC_FSL_BRR1_VER;
+}
+
 /*
  * Exported functions
  */
 
+u32 fsl_mpic_primary_get_version(void)
+{
+	struct mpic *mpic = mpic_primary;
+
+	if (mpic)
+		return fsl_mpic_get_version(mpic);
+
+	return 0;
+}
+
 struct mpic * __init mpic_alloc(struct device_node *node,
 				phys_addr_t phys_addr,
 				unsigned int flags,
@@ -1323,7 +1346,6 @@ struct mpic * __init mpic_alloc(struct device_node *node,
 	mpic_map(mpic, mpic->paddr, &mpic->tmregs, MPIC_INFO(TIMER_BASE), 0x1000);
 
 	if (mpic->flags & MPIC_FSL) {
-		u32 brr1;
 		int ret;
 
 		/*
@@ -1334,9 +1356,7 @@ struct mpic * __init mpic_alloc(struct device_node *node,
 		mpic_map(mpic, mpic->paddr, &mpic->thiscpuregs,
 			 MPIC_CPU_THISBASE, 0x1000);
 
-		brr1 = _mpic_read(mpic->reg_type, &mpic->thiscpuregs,
-				MPIC_FSL_BRR1);
-		fsl_version = brr1 & MPIC_FSL_BRR1_VER;
+		fsl_version = fsl_mpic_get_version(mpic);
 
 		/* Error interrupt mask register (EIMR) is required for
 		 * handling individual device error interrupts. EIMR
@@ -1526,9 +1546,7 @@ void __init mpic_init(struct mpic *mpic)
 	mpic_cpu_write(MPIC_INFO(CPU_CURRENT_TASK_PRI), 0xf);
 
 	if (mpic->flags & MPIC_FSL) {
-		u32 brr1 = _mpic_read(mpic->reg_type, &mpic->thiscpuregs,
-				      MPIC_FSL_BRR1);
-		u32 version = brr1 & MPIC_FSL_BRR1_VER;
+		u32 version = fsl_mpic_get_version(mpic);
 
 		/*
 		 * Timer group B is present at the latest in MPIC 3.1 (e.g.
-- 
cgit v0.10.2


From 2dd1c132d5c22677843658f5736f0de1cdf57bc1 Mon Sep 17 00:00:00 2001
From: Chunhe Lan <Chunhe.Lan@freescale.com>
Date: Fri, 19 Apr 2013 16:13:18 +0800
Subject: powerpc/fsl: Enable CONFIG_E1000E in mpc85xx_smp_defconfig

On the most boards of Freescale platform, they use the PCI-Express
Intel(R) PRO/1000 gigabit ethernet card to work. So enable the
corresponding driver for it.

Signed-off-by: Chunhe Lan <Chunhe.Lan@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/configs/mpc85xx_smp_defconfig b/arch/powerpc/configs/mpc85xx_smp_defconfig
index 165e6b3..152fa05 100644
--- a/arch/powerpc/configs/mpc85xx_smp_defconfig
+++ b/arch/powerpc/configs/mpc85xx_smp_defconfig
@@ -131,6 +131,7 @@ CONFIG_DUMMY=y
 CONFIG_FS_ENET=y
 CONFIG_UCC_GETH=y
 CONFIG_GIANFAR=y
+CONFIG_E1000E=y
 CONFIG_MARVELL_PHY=y
 CONFIG_DAVICOM_PHY=y
 CONFIG_CICADA_PHY=y
-- 
cgit v0.10.2


From 7601f59765a48bef329a429101eabcb0e2503634 Mon Sep 17 00:00:00 2001
From: LEROY Christophe <christophe.leroy@c-s.fr>
Date: Thu, 18 Apr 2013 07:26:11 +0200
Subject: powerpc/8xx: Erroneous double irq_eoi() on CPM IRQ in MPC8xx

irq_eoi() is already called by generic_handle_irq() so
it shall not be called a again

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/platforms/8xx/m8xx_setup.c b/arch/powerpc/platforms/8xx/m8xx_setup.c
index 806cbbd..587a282 100644
--- a/arch/powerpc/platforms/8xx/m8xx_setup.c
+++ b/arch/powerpc/platforms/8xx/m8xx_setup.c
@@ -219,19 +219,12 @@ void mpc8xx_restart(char *cmd)
 
 static void cpm_cascade(unsigned int irq, struct irq_desc *desc)
 {
-	struct irq_chip *chip;
-	int cascade_irq;
-
-	if ((cascade_irq = cpm_get_irq()) >= 0) {
-		struct irq_desc *cdesc = irq_to_desc(cascade_irq);
+	struct irq_chip *chip = irq_desc_get_chip(desc);
+	int cascade_irq = cpm_get_irq();
 
+	if (cascade_irq >= 0)
 		generic_handle_irq(cascade_irq);
 
-		chip = irq_desc_get_chip(cdesc);
-		chip->irq_eoi(&cdesc->irq_data);
-	}
-
-	chip = irq_desc_get_chip(desc);
 	chip->irq_eoi(&desc->irq_data);
 }
 
-- 
cgit v0.10.2


From 9837b43c5f3514e5d28f65f1513f4dc6759d2810 Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Thu, 11 Apr 2013 09:32:34 +0800
Subject: powerpc/85xx: enable coreint for all the 64bit boards

With the patch 7230c564 (powerpc: Rework lazy-interrupt handling),
it seems that the coreint works pretty well on the 85xx 64bit kernel.
So use the coreint by default for these boards.

Signed-off-by: Kevin Hao <haokexin@gmail.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/platforms/85xx/p5020_ds.c b/arch/powerpc/platforms/85xx/p5020_ds.c
index 753a42c..39cfa40 100644
--- a/arch/powerpc/platforms/85xx/p5020_ds.c
+++ b/arch/powerpc/platforms/85xx/p5020_ds.c
@@ -75,12 +75,7 @@ define_machine(p5020_ds) {
 #ifdef CONFIG_PCI
 	.pcibios_fixup_bus	= fsl_pcibios_fixup_bus,
 #endif
-/* coreint doesn't play nice with lazy EE, use legacy mpic for now */
-#ifdef CONFIG_PPC64
-	.get_irq		= mpic_get_irq,
-#else
 	.get_irq		= mpic_get_coreint_irq,
-#endif
 	.restart		= fsl_rstcr_restart,
 	.calibrate_decr		= generic_calibrate_decr,
 	.progress		= udbg_progress,
diff --git a/arch/powerpc/platforms/85xx/p5040_ds.c b/arch/powerpc/platforms/85xx/p5040_ds.c
index 1138185..f70e74c 100644
--- a/arch/powerpc/platforms/85xx/p5040_ds.c
+++ b/arch/powerpc/platforms/85xx/p5040_ds.c
@@ -66,12 +66,7 @@ define_machine(p5040_ds) {
 #ifdef CONFIG_PCI
 	.pcibios_fixup_bus	= fsl_pcibios_fixup_bus,
 #endif
-/* coreint doesn't play nice with lazy EE, use legacy mpic for now */
-#ifdef CONFIG_PPC64
-	.get_irq		= mpic_get_irq,
-#else
 	.get_irq		= mpic_get_coreint_irq,
-#endif
 	.restart		= fsl_rstcr_restart,
 	.calibrate_decr		= generic_calibrate_decr,
 	.progress		= udbg_progress,
diff --git a/arch/powerpc/platforms/85xx/t4240_qds.c b/arch/powerpc/platforms/85xx/t4240_qds.c
index 5998e9f..91ead6b 100644
--- a/arch/powerpc/platforms/85xx/t4240_qds.c
+++ b/arch/powerpc/platforms/85xx/t4240_qds.c
@@ -75,12 +75,7 @@ define_machine(t4240_qds) {
 #ifdef CONFIG_PCI
 	.pcibios_fixup_bus	= fsl_pcibios_fixup_bus,
 #endif
-/* coreint doesn't play nice with lazy EE, use legacy mpic for now */
-#ifdef CONFIG_PPC64
-	.get_irq		= mpic_get_irq,
-#else
 	.get_irq		= mpic_get_coreint_irq,
-#endif
 	.restart		= fsl_rstcr_restart,
 	.calibrate_decr		= generic_calibrate_decr,
 	.progress		= udbg_progress,
-- 
cgit v0.10.2


From 5ff04b7287d87c1db74f47360365905ed9a97ff7 Mon Sep 17 00:00:00 2001
From: "Dongsheng.wang@freescale.com" <Dongsheng.wang@freescale.com>
Date: Tue, 9 Apr 2013 10:22:29 +0800
Subject: powerpc/mpic: add irq_set_wake support

Add irq_set_wake support. Just add IRQF_NO_SUSPEND to desc->action->flag.
So the wake up interrupt will not be disable in suspend_device_irqs.

Signed-off-by: Wang Dongsheng <dongsheng.wang@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index 1a4e19c..4635d11 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -920,6 +920,22 @@ int mpic_set_irq_type(struct irq_data *d, unsigned int flow_type)
 	return IRQ_SET_MASK_OK_NOCOPY;
 }
 
+static int mpic_irq_set_wake(struct irq_data *d, unsigned int on)
+{
+	struct irq_desc *desc = container_of(d, struct irq_desc, irq_data);
+	struct mpic *mpic = mpic_from_irq_data(d);
+
+	if (!(mpic->flags & MPIC_FSL))
+		return -ENXIO;
+
+	if (on)
+		desc->action->flags |= IRQF_NO_SUSPEND;
+	else
+		desc->action->flags &= ~IRQF_NO_SUSPEND;
+
+	return 0;
+}
+
 void mpic_set_vector(unsigned int virq, unsigned int vector)
 {
 	struct mpic *mpic = mpic_from_irq(virq);
@@ -957,6 +973,7 @@ static struct irq_chip mpic_irq_chip = {
 	.irq_unmask	= mpic_unmask_irq,
 	.irq_eoi	= mpic_end_irq,
 	.irq_set_type	= mpic_set_irq_type,
+	.irq_set_wake	= mpic_irq_set_wake,
 };
 
 #ifdef CONFIG_SMP
@@ -971,6 +988,7 @@ static struct irq_chip mpic_tm_chip = {
 	.irq_mask	= mpic_mask_tm,
 	.irq_unmask	= mpic_unmask_tm,
 	.irq_eoi	= mpic_end_irq,
+	.irq_set_wake	= mpic_irq_set_wake,
 };
 
 #ifdef CONFIG_MPIC_U3_HT_IRQS
-- 
cgit v0.10.2


From 36ca09be6ff77a4e5b54b8b68ed7be7aa184250b Mon Sep 17 00:00:00 2001
From: "Dongsheng.wang@freescale.com" <Dongsheng.wang@freescale.com>
Date: Tue, 9 Apr 2013 10:22:30 +0800
Subject: powerpc/mpic: add global timer support

The MPIC global timer is a hardware timer inside the Freescale PIC complying
with OpenPIC standard. When the specified interval times out, the hardware
timer generates an interrupt. The driver currently is only tested on fsl chip,
but it can potentially support other global timers complying to OpenPIC
standard.

The two independent groups of global timer on fsl chip, group A and group B,
are identical in their functionality, except that they appear at different
locations within the PIC register map. The hardware timer can be cascaded to
create timers larger than the default 31-bit global timers. Timer cascade
fields allow configuration of up to two 63-bit timers. But These two groups
of timers cannot be cascaded together.

It can be used as a wakeup source for low power modes. It also could be used
as periodical timer for protocols, drivers and etc.

Signed-off-by: Wang Dongsheng <dongsheng.wang@freescale.com>
Signed-off-by: Li Yang <leoli@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/include/asm/mpic_timer.h b/arch/powerpc/include/asm/mpic_timer.h
new file mode 100644
index 0000000..0e23cd4
--- /dev/null
+++ b/arch/powerpc/include/asm/mpic_timer.h
@@ -0,0 +1,46 @@
+/*
+ * arch/powerpc/include/asm/mpic_timer.h
+ *
+ * Header file for Mpic Global Timer
+ *
+ * Copyright 2013 Freescale Semiconductor, Inc.
+ *
+ * Author: Wang Dongsheng <Dongsheng.Wang@freescale.com>
+ *	   Li Yang <leoli@freescale.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#ifndef __MPIC_TIMER__
+#define __MPIC_TIMER__
+
+#include <linux/interrupt.h>
+#include <linux/time.h>
+
+struct mpic_timer {
+	void			*dev;
+	struct cascade_priv	*cascade_handle;
+	unsigned int		num;
+	unsigned int		irq;
+};
+
+#ifdef CONFIG_MPIC_TIMER
+struct mpic_timer *mpic_request_timer(irq_handler_t fn,  void *dev,
+		const struct timeval *time);
+void mpic_start_timer(struct mpic_timer *handle);
+void mpic_stop_timer(struct mpic_timer *handle);
+void mpic_get_remain_time(struct mpic_timer *handle, struct timeval *time);
+void mpic_free_timer(struct mpic_timer *handle);
+#else
+struct mpic_timer *mpic_request_timer(irq_handler_t fn,  void *dev,
+		const struct timeval *time) { return NULL; }
+void mpic_start_timer(struct mpic_timer *handle) { }
+void mpic_stop_timer(struct mpic_timer *handle) { }
+void mpic_get_remain_time(struct mpic_timer *handle, struct timeval *time) { }
+void mpic_free_timer(struct mpic_timer *handle) { }
+#endif
+
+#endif
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index bed8c60..7d07d9e 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -86,6 +86,18 @@ config MPIC
 	bool
 	default n
 
+config MPIC_TIMER
+	bool "MPIC Global Timer"
+	depends on MPIC && FSL_SOC
+	default n
+	help
+	  The MPIC global timer is a hardware timer inside the
+	  Freescale PIC complying with OpenPIC standard. When the
+	  specified interval times out, the hardware timer generates
+	  an interrupt. The driver currently is only tested on fsl
+	  chip, but it can potentially support other global timers
+	  complying with the OpenPIC standard.
+
 config PPC_EPAPR_HV_PIC
 	bool
 	default n
diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
index 99464a7..2eb72c1 100644
--- a/arch/powerpc/sysdev/Makefile
+++ b/arch/powerpc/sysdev/Makefile
@@ -4,6 +4,7 @@ ccflags-$(CONFIG_PPC64)		:= $(NO_MINIMAL_TOC)
 
 mpic-msi-obj-$(CONFIG_PCI_MSI)	+= mpic_msi.o mpic_u3msi.o mpic_pasemi_msi.o
 obj-$(CONFIG_MPIC)		+= mpic.o $(mpic-msi-obj-y)
+obj-$(CONFIG_MPIC_TIMER)        += mpic_timer.o
 mpic-msgr-obj-$(CONFIG_MPIC_MSGR)	+= mpic_msgr.o
 obj-$(CONFIG_MPIC)		+= mpic.o $(mpic-msi-obj-y) $(mpic-msgr-obj-y)
 obj-$(CONFIG_PPC_EPAPR_HV_PIC)	+= ehv_pic.o
diff --git a/arch/powerpc/sysdev/mpic_timer.c b/arch/powerpc/sysdev/mpic_timer.c
new file mode 100644
index 0000000..c06db92
--- /dev/null
+++ b/arch/powerpc/sysdev/mpic_timer.c
@@ -0,0 +1,593 @@
+/*
+ * MPIC timer driver
+ *
+ * Copyright 2013 Freescale Semiconductor, Inc.
+ * Author: Dongsheng Wang <Dongsheng.Wang@freescale.com>
+ *	   Li Yang <leoli@freescale.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/syscore_ops.h>
+#include <sysdev/fsl_soc.h>
+#include <asm/io.h>
+
+#include <asm/mpic_timer.h>
+
+#define FSL_GLOBAL_TIMER		0x1
+
+/* Clock Ratio
+ * Divide by 64 0x00000300
+ * Divide by 32 0x00000200
+ * Divide by 16 0x00000100
+ * Divide by  8 0x00000000 (Hardware default div)
+ */
+#define MPIC_TIMER_TCR_CLKDIV		0x00000300
+
+#define MPIC_TIMER_TCR_ROVR_OFFSET	24
+
+#define TIMER_STOP			0x80000000
+#define TIMERS_PER_GROUP		4
+#define MAX_TICKS			(~0U >> 1)
+#define MAX_TICKS_CASCADE		(~0U)
+#define TIMER_OFFSET(num)		(1 << (TIMERS_PER_GROUP - 1 - num))
+
+/* tv_usec should be less than ONE_SECOND, otherwise use tv_sec */
+#define ONE_SECOND			1000000
+
+struct timer_regs {
+	u32	gtccr;
+	u32	res0[3];
+	u32	gtbcr;
+	u32	res1[3];
+	u32	gtvpr;
+	u32	res2[3];
+	u32	gtdr;
+	u32	res3[3];
+};
+
+struct cascade_priv {
+	u32 tcr_value;			/* TCR register: CASC & ROVR value */
+	unsigned int cascade_map;	/* cascade map */
+	unsigned int timer_num;		/* cascade control timer */
+};
+
+struct timer_group_priv {
+	struct timer_regs __iomem	*regs;
+	struct mpic_timer		timer[TIMERS_PER_GROUP];
+	struct list_head		node;
+	unsigned int			timerfreq;
+	unsigned int			idle;
+	unsigned int			flags;
+	spinlock_t			lock;
+	void __iomem			*group_tcr;
+};
+
+static struct cascade_priv cascade_timer[] = {
+	/* cascade timer 0 and 1 */
+	{0x1, 0xc, 0x1},
+	/* cascade timer 1 and 2 */
+	{0x2, 0x6, 0x2},
+	/* cascade timer 2 and 3 */
+	{0x4, 0x3, 0x3}
+};
+
+static LIST_HEAD(timer_group_list);
+
+static void convert_ticks_to_time(struct timer_group_priv *priv,
+		const u64 ticks, struct timeval *time)
+{
+	u64 tmp_sec;
+
+	time->tv_sec = (__kernel_time_t)div_u64(ticks, priv->timerfreq);
+	tmp_sec = (u64)time->tv_sec * (u64)priv->timerfreq;
+
+	time->tv_usec = (__kernel_suseconds_t)
+		div_u64((ticks - tmp_sec) * 1000000, priv->timerfreq);
+
+	return;
+}
+
+/* the time set by the user is converted to "ticks" */
+static int convert_time_to_ticks(struct timer_group_priv *priv,
+		const struct timeval *time, u64 *ticks)
+{
+	u64 max_value;		/* prevent u64 overflow */
+	u64 tmp = 0;
+
+	u64 tmp_sec;
+	u64 tmp_ms;
+	u64 tmp_us;
+
+	max_value = div_u64(ULLONG_MAX, priv->timerfreq);
+
+	if (time->tv_sec > max_value ||
+			(time->tv_sec == max_value && time->tv_usec > 0))
+		return -EINVAL;
+
+	tmp_sec = (u64)time->tv_sec * (u64)priv->timerfreq;
+	tmp += tmp_sec;
+
+	tmp_ms = time->tv_usec / 1000;
+	tmp_ms = div_u64((u64)tmp_ms * (u64)priv->timerfreq, 1000);
+	tmp += tmp_ms;
+
+	tmp_us = time->tv_usec % 1000;
+	tmp_us = div_u64((u64)tmp_us * (u64)priv->timerfreq, 1000000);
+	tmp += tmp_us;
+
+	*ticks = tmp;
+
+	return 0;
+}
+
+/* detect whether there is a cascade timer available */
+static struct mpic_timer *detect_idle_cascade_timer(
+					struct timer_group_priv *priv)
+{
+	struct cascade_priv *casc_priv;
+	unsigned int map;
+	unsigned int array_size = ARRAY_SIZE(cascade_timer);
+	unsigned int num;
+	unsigned int i;
+	unsigned long flags;
+
+	casc_priv = cascade_timer;
+	for (i = 0; i < array_size; i++) {
+		spin_lock_irqsave(&priv->lock, flags);
+		map = casc_priv->cascade_map & priv->idle;
+		if (map == casc_priv->cascade_map) {
+			num = casc_priv->timer_num;
+			priv->timer[num].cascade_handle = casc_priv;
+
+			/* set timer busy */
+			priv->idle &= ~casc_priv->cascade_map;
+			spin_unlock_irqrestore(&priv->lock, flags);
+			return &priv->timer[num];
+		}
+		spin_unlock_irqrestore(&priv->lock, flags);
+		casc_priv++;
+	}
+
+	return NULL;
+}
+
+static int set_cascade_timer(struct timer_group_priv *priv, u64 ticks,
+		unsigned int num)
+{
+	struct cascade_priv *casc_priv;
+	u32 tcr;
+	u32 tmp_ticks;
+	u32 rem_ticks;
+
+	/* set group tcr reg for cascade */
+	casc_priv = priv->timer[num].cascade_handle;
+	if (!casc_priv)
+		return -EINVAL;
+
+	tcr = casc_priv->tcr_value |
+		(casc_priv->tcr_value << MPIC_TIMER_TCR_ROVR_OFFSET);
+	setbits32(priv->group_tcr, tcr);
+
+	tmp_ticks = div_u64_rem(ticks, MAX_TICKS_CASCADE, &rem_ticks);
+
+	out_be32(&priv->regs[num].gtccr, 0);
+	out_be32(&priv->regs[num].gtbcr, tmp_ticks | TIMER_STOP);
+
+	out_be32(&priv->regs[num - 1].gtccr, 0);
+	out_be32(&priv->regs[num - 1].gtbcr, rem_ticks);
+
+	return 0;
+}
+
+static struct mpic_timer *get_cascade_timer(struct timer_group_priv *priv,
+					u64 ticks)
+{
+	struct mpic_timer *allocated_timer;
+
+	/* Two cascade timers: Support the maximum time */
+	const u64 max_ticks = (u64)MAX_TICKS * (u64)MAX_TICKS_CASCADE;
+	int ret;
+
+	if (ticks > max_ticks)
+		return NULL;
+
+	/* detect idle timer */
+	allocated_timer = detect_idle_cascade_timer(priv);
+	if (!allocated_timer)
+		return NULL;
+
+	/* set ticks to timer */
+	ret = set_cascade_timer(priv, ticks, allocated_timer->num);
+	if (ret < 0)
+		return NULL;
+
+	return allocated_timer;
+}
+
+static struct mpic_timer *get_timer(const struct timeval *time)
+{
+	struct timer_group_priv *priv;
+	struct mpic_timer *timer;
+
+	u64 ticks;
+	unsigned int num;
+	unsigned int i;
+	unsigned long flags;
+	int ret;
+
+	list_for_each_entry(priv, &timer_group_list, node) {
+		ret = convert_time_to_ticks(priv, time, &ticks);
+		if (ret < 0)
+			return NULL;
+
+		if (ticks > MAX_TICKS) {
+			if (!(priv->flags & FSL_GLOBAL_TIMER))
+				return NULL;
+
+			timer = get_cascade_timer(priv, ticks);
+			if (!timer)
+				continue;
+
+			return timer;
+		}
+
+		for (i = 0; i < TIMERS_PER_GROUP; i++) {
+			/* one timer: Reverse allocation */
+			num = TIMERS_PER_GROUP - 1 - i;
+			spin_lock_irqsave(&priv->lock, flags);
+			if (priv->idle & (1 << i)) {
+				/* set timer busy */
+				priv->idle &= ~(1 << i);
+				/* set ticks & stop timer */
+				out_be32(&priv->regs[num].gtbcr,
+					ticks | TIMER_STOP);
+				out_be32(&priv->regs[num].gtccr, 0);
+				priv->timer[num].cascade_handle = NULL;
+				spin_unlock_irqrestore(&priv->lock, flags);
+				return &priv->timer[num];
+			}
+			spin_unlock_irqrestore(&priv->lock, flags);
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * mpic_start_timer - start hardware timer
+ * @handle: the timer to be started.
+ *
+ * It will do ->fn(->dev) callback from the hardware interrupt at
+ * the ->timeval point in the future.
+ */
+void mpic_start_timer(struct mpic_timer *handle)
+{
+	struct timer_group_priv *priv = container_of(handle,
+			struct timer_group_priv, timer[handle->num]);
+
+	clrbits32(&priv->regs[handle->num].gtbcr, TIMER_STOP);
+}
+EXPORT_SYMBOL(mpic_start_timer);
+
+/**
+ * mpic_stop_timer - stop hardware timer
+ * @handle: the timer to be stoped
+ *
+ * The timer periodically generates an interrupt. Unless user stops the timer.
+ */
+void mpic_stop_timer(struct mpic_timer *handle)
+{
+	struct timer_group_priv *priv = container_of(handle,
+			struct timer_group_priv, timer[handle->num]);
+	struct cascade_priv *casc_priv;
+
+	setbits32(&priv->regs[handle->num].gtbcr, TIMER_STOP);
+
+	casc_priv = priv->timer[handle->num].cascade_handle;
+	if (casc_priv) {
+		out_be32(&priv->regs[handle->num].gtccr, 0);
+		out_be32(&priv->regs[handle->num - 1].gtccr, 0);
+	} else {
+		out_be32(&priv->regs[handle->num].gtccr, 0);
+	}
+}
+EXPORT_SYMBOL(mpic_stop_timer);
+
+/**
+ * mpic_get_remain_time - get timer time
+ * @handle: the timer to be selected.
+ * @time: time for timer
+ *
+ * Query timer remaining time.
+ */
+void mpic_get_remain_time(struct mpic_timer *handle, struct timeval *time)
+{
+	struct timer_group_priv *priv = container_of(handle,
+			struct timer_group_priv, timer[handle->num]);
+	struct cascade_priv *casc_priv;
+
+	u64 ticks;
+	u32 tmp_ticks;
+
+	casc_priv = priv->timer[handle->num].cascade_handle;
+	if (casc_priv) {
+		tmp_ticks = in_be32(&priv->regs[handle->num].gtccr);
+		ticks = ((u64)tmp_ticks & UINT_MAX) * (u64)MAX_TICKS_CASCADE;
+		tmp_ticks = in_be32(&priv->regs[handle->num - 1].gtccr);
+		ticks += tmp_ticks;
+	} else {
+		ticks = in_be32(&priv->regs[handle->num].gtccr);
+	}
+
+	convert_ticks_to_time(priv, ticks, time);
+}
+EXPORT_SYMBOL(mpic_get_remain_time);
+
+/**
+ * mpic_free_timer - free hardware timer
+ * @handle: the timer to be removed.
+ *
+ * Free the timer.
+ *
+ * Note: can not be used in interrupt context.
+ */
+void mpic_free_timer(struct mpic_timer *handle)
+{
+	struct timer_group_priv *priv = container_of(handle,
+			struct timer_group_priv, timer[handle->num]);
+
+	struct cascade_priv *casc_priv;
+	unsigned long flags;
+
+	mpic_stop_timer(handle);
+
+	casc_priv = priv->timer[handle->num].cascade_handle;
+
+	free_irq(priv->timer[handle->num].irq, priv->timer[handle->num].dev);
+
+	spin_lock_irqsave(&priv->lock, flags);
+	if (casc_priv) {
+		u32 tcr;
+		tcr = casc_priv->tcr_value | (casc_priv->tcr_value <<
+					MPIC_TIMER_TCR_ROVR_OFFSET);
+		clrbits32(priv->group_tcr, tcr);
+		priv->idle |= casc_priv->cascade_map;
+		priv->timer[handle->num].cascade_handle = NULL;
+	} else {
+		priv->idle |= TIMER_OFFSET(handle->num);
+	}
+	spin_unlock_irqrestore(&priv->lock, flags);
+}
+EXPORT_SYMBOL(mpic_free_timer);
+
+/**
+ * mpic_request_timer - get a hardware timer
+ * @fn: interrupt handler function
+ * @dev: callback function of the data
+ * @time: time for timer
+ *
+ * This executes the "request_irq", returning NULL
+ * else "handle" on success.
+ */
+struct mpic_timer *mpic_request_timer(irq_handler_t fn, void *dev,
+					const struct timeval *time)
+{
+	struct mpic_timer *allocated_timer;
+	int ret;
+
+	if (list_empty(&timer_group_list))
+		return NULL;
+
+	if (!(time->tv_sec + time->tv_usec) ||
+			time->tv_sec < 0 || time->tv_usec < 0)
+		return NULL;
+
+	if (time->tv_usec > ONE_SECOND)
+		return NULL;
+
+	allocated_timer = get_timer(time);
+	if (!allocated_timer)
+		return NULL;
+
+	ret = request_irq(allocated_timer->irq, fn,
+			IRQF_TRIGGER_LOW, "global-timer", dev);
+	if (ret) {
+		mpic_free_timer(allocated_timer);
+		return NULL;
+	}
+
+	allocated_timer->dev = dev;
+
+	return allocated_timer;
+}
+EXPORT_SYMBOL(mpic_request_timer);
+
+static int timer_group_get_freq(struct device_node *np,
+			struct timer_group_priv *priv)
+{
+	u32 div;
+
+	if (priv->flags & FSL_GLOBAL_TIMER) {
+		struct device_node *dn;
+
+		dn = of_find_compatible_node(NULL, NULL, "fsl,mpic");
+		if (dn) {
+			of_property_read_u32(dn, "clock-frequency",
+					&priv->timerfreq);
+			of_node_put(dn);
+		}
+	}
+
+	if (priv->timerfreq <= 0)
+		return -EINVAL;
+
+	if (priv->flags & FSL_GLOBAL_TIMER) {
+		div = (1 << (MPIC_TIMER_TCR_CLKDIV >> 8)) * 8;
+		priv->timerfreq /= div;
+	}
+
+	return 0;
+}
+
+static int timer_group_get_irq(struct device_node *np,
+		struct timer_group_priv *priv)
+{
+	const u32 all_timer[] = { 0, TIMERS_PER_GROUP };
+	const u32 *p;
+	u32 offset;
+	u32 count;
+
+	unsigned int i;
+	unsigned int j;
+	unsigned int irq_index = 0;
+	unsigned int irq;
+	int len;
+
+	p = of_get_property(np, "fsl,available-ranges", &len);
+	if (p && len % (2 * sizeof(u32)) != 0) {
+		pr_err("%s: malformed available-ranges property.\n",
+				np->full_name);
+		return -EINVAL;
+	}
+
+	if (!p) {
+		p = all_timer;
+		len = sizeof(all_timer);
+	}
+
+	len /= 2 * sizeof(u32);
+
+	for (i = 0; i < len; i++) {
+		offset = p[i * 2];
+		count = p[i * 2 + 1];
+		for (j = 0; j < count; j++) {
+			irq = irq_of_parse_and_map(np, irq_index);
+			if (!irq) {
+				pr_err("%s: irq parse and map failed.\n",
+						np->full_name);
+				return -EINVAL;
+			}
+
+			/* Set timer idle */
+			priv->idle |= TIMER_OFFSET((offset + j));
+			priv->timer[offset + j].irq = irq;
+			priv->timer[offset + j].num = offset + j;
+			irq_index++;
+		}
+	}
+
+	return 0;
+}
+
+static void timer_group_init(struct device_node *np)
+{
+	struct timer_group_priv *priv;
+	unsigned int i = 0;
+	int ret;
+
+	priv = kzalloc(sizeof(struct timer_group_priv), GFP_KERNEL);
+	if (!priv) {
+		pr_err("%s: cannot allocate memory for group.\n",
+				np->full_name);
+		return;
+	}
+
+	if (of_device_is_compatible(np, "fsl,mpic-global-timer"))
+		priv->flags |= FSL_GLOBAL_TIMER;
+
+	priv->regs = of_iomap(np, i++);
+	if (!priv->regs) {
+		pr_err("%s: cannot ioremap timer register address.\n",
+				np->full_name);
+		goto out;
+	}
+
+	if (priv->flags & FSL_GLOBAL_TIMER) {
+		priv->group_tcr = of_iomap(np, i++);
+		if (!priv->group_tcr) {
+			pr_err("%s: cannot ioremap tcr address.\n",
+					np->full_name);
+			goto out;
+		}
+	}
+
+	ret = timer_group_get_freq(np, priv);
+	if (ret < 0) {
+		pr_err("%s: cannot get timer frequency.\n", np->full_name);
+		goto out;
+	}
+
+	ret = timer_group_get_irq(np, priv);
+	if (ret < 0) {
+		pr_err("%s: cannot get timer irqs.\n", np->full_name);
+		goto out;
+	}
+
+	spin_lock_init(&priv->lock);
+
+	/* Init FSL timer hardware */
+	if (priv->flags & FSL_GLOBAL_TIMER)
+		setbits32(priv->group_tcr, MPIC_TIMER_TCR_CLKDIV);
+
+	list_add_tail(&priv->node, &timer_group_list);
+
+	return;
+
+out:
+	if (priv->regs)
+		iounmap(priv->regs);
+
+	if (priv->group_tcr)
+		iounmap(priv->group_tcr);
+
+	kfree(priv);
+}
+
+static void mpic_timer_resume(void)
+{
+	struct timer_group_priv *priv;
+
+	list_for_each_entry(priv, &timer_group_list, node) {
+		/* Init FSL timer hardware */
+		if (priv->flags & FSL_GLOBAL_TIMER)
+			setbits32(priv->group_tcr, MPIC_TIMER_TCR_CLKDIV);
+	}
+}
+
+static const struct of_device_id mpic_timer_ids[] = {
+	{ .compatible = "fsl,mpic-global-timer", },
+	{},
+};
+
+static struct syscore_ops mpic_timer_syscore_ops = {
+	.resume = mpic_timer_resume,
+};
+
+static int __init mpic_timer_init(void)
+{
+	struct device_node *np = NULL;
+
+	for_each_matching_node(np, mpic_timer_ids)
+		timer_group_init(np);
+
+	register_syscore_ops(&mpic_timer_syscore_ops);
+
+	if (list_empty(&timer_group_list))
+		return -ENODEV;
+
+	return 0;
+}
+subsys_initcall(mpic_timer_init);
-- 
cgit v0.10.2


From 9e6f31a9dbc58f6a5661f8dc8c919441b8d3ced4 Mon Sep 17 00:00:00 2001
From: "Dongsheng.wang@freescale.com" <Dongsheng.wang@freescale.com>
Date: Tue, 9 Apr 2013 10:22:31 +0800
Subject: powerpc/mpic: create mpic subsystem object

Register a mpic subsystem at /sys/devices/system/

Signed-off-by: Wang Dongsheng <dongsheng.wang@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/include/asm/mpic.h b/arch/powerpc/include/asm/mpic.h
index ea6bf72..4a1ac9f 100644
--- a/arch/powerpc/include/asm/mpic.h
+++ b/arch/powerpc/include/asm/mpic.h
@@ -339,6 +339,8 @@ struct mpic
 #endif
 };
 
+extern struct bus_type mpic_subsys;
+
 /*
  * MPIC flags (passed to mpic_alloc)
  *
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index 4635d11..1be54fa 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -48,6 +48,12 @@
 #define DBG(fmt...)
 #endif
 
+struct bus_type mpic_subsys = {
+	.name = "mpic",
+	.dev_name = "mpic",
+};
+EXPORT_SYMBOL_GPL(mpic_subsys);
+
 static struct mpic *mpics;
 static struct mpic *mpic_primary;
 static DEFINE_RAW_SPINLOCK(mpic_lock);
@@ -2035,6 +2041,8 @@ static struct syscore_ops mpic_syscore_ops = {
 static int mpic_init_sys(void)
 {
 	register_syscore_ops(&mpic_syscore_ops);
+	subsys_system_register(&mpic_subsys, NULL);
+
 	return 0;
 }
 
-- 
cgit v0.10.2


From a63b3bc7db32b63bfe5f48fa8582f931db81c86e Mon Sep 17 00:00:00 2001
From: "Dongsheng.wang@freescale.com" <Dongsheng.wang@freescale.com>
Date: Tue, 9 Apr 2013 10:22:32 +0800
Subject: powerpc/fsl: add MPIC timer wakeup support

The driver provides a way to wake up the system by the MPIC timer.

For example,
echo 5 > /sys/devices/system/mpic/timer_wakeup
echo standby > /sys/power/state

After 5 seconds the MPIC timer will generate an interrupt to wake up
the system.

Signed-off-by: Wang Dongsheng <dongsheng.wang@freescale.com>
Signed-off-by: Zhao Chenhui <chenhui.zhao@freescale.com>
Signed-off-by: Li Yang <leoli@freescale.com>

diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index 7d07d9e..0847ee6 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -98,6 +98,15 @@ config MPIC_TIMER
 	  chip, but it can potentially support other global timers
 	  complying with the OpenPIC standard.
 
+config FSL_MPIC_TIMER_WAKEUP
+	tristate "Freescale MPIC global timer wakeup driver"
+	depends on FSL_SOC &&  MPIC_TIMER && PM
+	default n
+	help
+	  The driver provides a way to wake up the system by MPIC
+	  timer.
+	  e.g. "echo 5 > /sys/devices/system/mpic/timer_wakeup"
+
 config PPC_EPAPR_HV_PIC
 	bool
 	default n
diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
index 2eb72c1..f67ac90 100644
--- a/arch/powerpc/sysdev/Makefile
+++ b/arch/powerpc/sysdev/Makefile
@@ -5,6 +5,7 @@ ccflags-$(CONFIG_PPC64)		:= $(NO_MINIMAL_TOC)
 mpic-msi-obj-$(CONFIG_PCI_MSI)	+= mpic_msi.o mpic_u3msi.o mpic_pasemi_msi.o
 obj-$(CONFIG_MPIC)		+= mpic.o $(mpic-msi-obj-y)
 obj-$(CONFIG_MPIC_TIMER)        += mpic_timer.o
+obj-$(CONFIG_FSL_MPIC_TIMER_WAKEUP)	+= fsl_mpic_timer_wakeup.o
 mpic-msgr-obj-$(CONFIG_MPIC_MSGR)	+= mpic_msgr.o
 obj-$(CONFIG_MPIC)		+= mpic.o $(mpic-msi-obj-y) $(mpic-msgr-obj-y)
 obj-$(CONFIG_PPC_EPAPR_HV_PIC)	+= ehv_pic.o
diff --git a/arch/powerpc/sysdev/fsl_mpic_timer_wakeup.c b/arch/powerpc/sysdev/fsl_mpic_timer_wakeup.c
new file mode 100644
index 0000000..1707bf0
--- /dev/null
+++ b/arch/powerpc/sysdev/fsl_mpic_timer_wakeup.c
@@ -0,0 +1,161 @@
+/*
+ * MPIC timer wakeup driver
+ *
+ * Copyright 2013 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/device.h>
+
+#include <asm/mpic_timer.h>
+#include <asm/mpic.h>
+
+struct fsl_mpic_timer_wakeup {
+	struct mpic_timer *timer;
+	struct work_struct free_work;
+};
+
+static struct fsl_mpic_timer_wakeup *fsl_wakeup;
+static DEFINE_MUTEX(sysfs_lock);
+
+static void fsl_free_resource(struct work_struct *ws)
+{
+	struct fsl_mpic_timer_wakeup *wakeup =
+		container_of(ws, struct fsl_mpic_timer_wakeup, free_work);
+
+	mutex_lock(&sysfs_lock);
+
+	if (wakeup->timer) {
+		disable_irq_wake(wakeup->timer->irq);
+		mpic_free_timer(wakeup->timer);
+	}
+
+	wakeup->timer = NULL;
+	mutex_unlock(&sysfs_lock);
+}
+
+static irqreturn_t fsl_mpic_timer_irq(int irq, void *dev_id)
+{
+	struct fsl_mpic_timer_wakeup *wakeup = dev_id;
+
+	schedule_work(&wakeup->free_work);
+
+	return wakeup->timer ? IRQ_HANDLED : IRQ_NONE;
+}
+
+static ssize_t fsl_timer_wakeup_show(struct device *dev,
+				struct device_attribute *attr,
+				char *buf)
+{
+	struct timeval interval;
+	int val = 0;
+
+	mutex_lock(&sysfs_lock);
+	if (fsl_wakeup->timer) {
+		mpic_get_remain_time(fsl_wakeup->timer, &interval);
+		val = interval.tv_sec + 1;
+	}
+	mutex_unlock(&sysfs_lock);
+
+	return sprintf(buf, "%d\n", val);
+}
+
+static ssize_t fsl_timer_wakeup_store(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf,
+				size_t count)
+{
+	struct timeval interval;
+	int ret;
+
+	interval.tv_usec = 0;
+	if (kstrtol(buf, 0, &interval.tv_sec))
+		return -EINVAL;
+
+	mutex_lock(&sysfs_lock);
+
+	if (fsl_wakeup->timer) {
+		disable_irq_wake(fsl_wakeup->timer->irq);
+		mpic_free_timer(fsl_wakeup->timer);
+		fsl_wakeup->timer = NULL;
+	}
+
+	if (!interval.tv_sec) {
+		mutex_unlock(&sysfs_lock);
+		return count;
+	}
+
+	fsl_wakeup->timer = mpic_request_timer(fsl_mpic_timer_irq,
+						fsl_wakeup, &interval);
+	if (!fsl_wakeup->timer) {
+		mutex_unlock(&sysfs_lock);
+		return -EINVAL;
+	}
+
+	ret = enable_irq_wake(fsl_wakeup->timer->irq);
+	if (ret) {
+		mpic_free_timer(fsl_wakeup->timer);
+		fsl_wakeup->timer = NULL;
+		mutex_unlock(&sysfs_lock);
+
+		return ret;
+	}
+
+	mpic_start_timer(fsl_wakeup->timer);
+
+	mutex_unlock(&sysfs_lock);
+
+	return count;
+}
+
+static struct device_attribute mpic_attributes = __ATTR(timer_wakeup, 0644,
+			fsl_timer_wakeup_show, fsl_timer_wakeup_store);
+
+static int __init fsl_wakeup_sys_init(void)
+{
+	int ret;
+
+	fsl_wakeup = kzalloc(sizeof(struct fsl_mpic_timer_wakeup), GFP_KERNEL);
+	if (!fsl_wakeup)
+		return -ENOMEM;
+
+	INIT_WORK(&fsl_wakeup->free_work, fsl_free_resource);
+
+	ret = device_create_file(mpic_subsys.dev_root, &mpic_attributes);
+	if (ret)
+		kfree(fsl_wakeup);
+
+	return ret;
+}
+
+static void __exit fsl_wakeup_sys_exit(void)
+{
+	device_remove_file(mpic_subsys.dev_root, &mpic_attributes);
+
+	mutex_lock(&sysfs_lock);
+
+	if (fsl_wakeup->timer) {
+		disable_irq_wake(fsl_wakeup->timer->irq);
+		mpic_free_timer(fsl_wakeup->timer);
+	}
+
+	kfree(fsl_wakeup);
+
+	mutex_unlock(&sysfs_lock);
+}
+
+module_init(fsl_wakeup_sys_init);
+module_exit(fsl_wakeup_sys_exit);
+
+MODULE_DESCRIPTION("Freescale MPIC global timer wakeup driver");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Wang Dongsheng <dongsheng.wang@freescale.com>");
-- 
cgit v0.10.2


From 1d8b368ab4aacfc3f864655baad4d31a3028ec1a Mon Sep 17 00:00:00 2001
From: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Date: Tue, 2 Jul 2013 11:06:54 +0530
Subject: pstore: Add hsize argument in write_buf call of pstore_ftrace_call

Incorporate the addition of hsize argument in write_buf callback
of pstore. This was forgotten in

    6bbbca735936e15b9431882eceddcf6dff76e03c
    pstore: Pass header size in the pstore write callback

Causing a build failure when ftrace and pstore are enabled.

Signed-off-by: Aruna Balakrishnaiah <aruna@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 43b1280..76a4eeb 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -44,7 +44,7 @@ static void notrace pstore_ftrace_call(unsigned long ip,
 	rec.parent_ip = parent_ip;
 	pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id());
 	psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec,
-			  sizeof(rec), psinfo);
+			  0, sizeof(rec), psinfo);
 
 	local_irq_restore(flags);
 }
-- 
cgit v0.10.2