From 1a18a66446f3f289b05b634f18012424d82aa63a Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Fri, 17 Jan 2014 12:25:28 +0800 Subject: powerpc: Set the correct ksp_limit on ppc32 when switching to irq stack Guenter Roeck has got the following call trace on a p2020 board: Kernel stack overflow in process eb3e5a00, r1=eb79df90 CPU: 0 PID: 2838 Comm: ssh Not tainted 3.13.0-rc8-juniper-00146-g19eca00 #4 task: eb3e5a00 ti: c0616000 task.ti: ef440000 NIP: c003a420 LR: c003a410 CTR: c0017518 REGS: eb79dee0 TRAP: 0901 Not tainted (3.13.0-rc8-juniper-00146-g19eca00) MSR: 00029000 CR: 24008444 XER: 00000000 GPR00: c003a410 eb79df90 eb3e5a00 00000000 eb05d900 00000001 65d87646 00000000 GPR08: 00000000 020b8000 00000000 00000000 44008442 NIP [c003a420] __do_softirq+0x94/0x1ec LR [c003a410] __do_softirq+0x84/0x1ec Call Trace: [eb79df90] [c003a410] __do_softirq+0x84/0x1ec (unreliable) [eb79dfe0] [c003a970] irq_exit+0xbc/0xc8 [eb79dff0] [c000cc1c] call_do_irq+0x24/0x3c [ef441f20] [c00046a8] do_IRQ+0x8c/0xf8 [ef441f40] [c000e7f4] ret_from_except+0x0/0x18 --- Exception: 501 at 0xfcda524 LR = 0x10024900 Instruction dump: 7c781b78 3b40000a 3a73b040 543c0024 3a800000 3b3913a0 7ef5bb78 48201bf9 5463103a 7d3b182e 7e89b92e 7c008146 <3ba00000> 7e7e9b78 48000014 57fff87f Kernel panic - not syncing: kernel stack overflow CPU: 0 PID: 2838 Comm: ssh Not tainted 3.13.0-rc8-juniper-00146-g19eca00 #4 Call Trace: The reason is that we have used the wrong register to calculate the ksp_limit in commit cbc9565ee826 (powerpc: Remove ksp_limit on ppc64). Just fix it. As suggested by Benjamin Herrenschmidt, also add the C prototype of the function in the comment in order to avoid such kind of errors in the future. Cc: stable@vger.kernel.org # 3.12 Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Kevin Hao Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 879f096..7c6bb4b 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -57,11 +57,14 @@ _GLOBAL(call_do_softirq) mtlr r0 blr +/* + * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp); + */ _GLOBAL(call_do_irq) mflr r0 stw r0,4(r1) lwz r10,THREAD+KSP_LIMIT(r2) - addi r11,r3,THREAD_INFO_GAP + addi r11,r4,THREAD_INFO_GAP stwu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4) mr r1,r4 stw r10,8(r1) -- cgit v0.10.2 From b020cc6c03a37c3526fcb1dff274f649257949e0 Mon Sep 17 00:00:00 2001 From: Kleber Sacilotto de Souza Date: Fri, 17 Jan 2014 11:56:51 -0200 Subject: powerpc/pseries: Fix regression on PCI link speed Commit 5091f0c (powerpc/pseries: Fix PCIE link speed endian issue) introduced a regression on the PCI link speed detection using the device-tree property. The ibm,pcie-link-speed-stats property is composed of two 32-bit integers, the first one being the maxinum link speed and the second the current link speed. The changes introduced by the aforementioned commit are considering just the first integer. Fix this issue by changing how the property is accessed, using the helper functions to properly access the array of values. The explicit byte swapping is not needed anymore here, since it's done by the helper functions. Signed-off-by: Kleber Sacilotto de Souza Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c index 70670a2..a6f7a14 100644 --- a/arch/powerpc/platforms/pseries/pci.c +++ b/arch/powerpc/platforms/pseries/pci.c @@ -113,7 +113,8 @@ int pseries_root_bridge_prepare(struct pci_host_bridge *bridge) { struct device_node *dn, *pdn; struct pci_bus *bus; - const __be32 *pcie_link_speed_stats; + u32 pcie_link_speed_stats[2]; + int rc; bus = bridge->bus; @@ -122,20 +123,21 @@ int pseries_root_bridge_prepare(struct pci_host_bridge *bridge) return 0; for (pdn = dn; pdn != NULL; pdn = of_get_next_parent(pdn)) { - pcie_link_speed_stats = of_get_property(pdn, - "ibm,pcie-link-speed-stats", NULL); - if (pcie_link_speed_stats) + rc = of_property_read_u32_array(pdn, + "ibm,pcie-link-speed-stats", + &pcie_link_speed_stats[0], 2); + if (!rc) break; } of_node_put(pdn); - if (!pcie_link_speed_stats) { + if (rc) { pr_err("no ibm,pcie-link-speed-stats property\n"); return 0; } - switch (be32_to_cpup(pcie_link_speed_stats)) { + switch (pcie_link_speed_stats[0]) { case 0x01: bus->max_bus_speed = PCIE_SPEED_2_5GT; break; @@ -147,7 +149,7 @@ int pseries_root_bridge_prepare(struct pci_host_bridge *bridge) break; } - switch (be32_to_cpup(pcie_link_speed_stats)) { + switch (pcie_link_speed_stats[1]) { case 0x01: bus->cur_bus_speed = PCIE_SPEED_2_5GT; break; -- cgit v0.10.2 From 49d9684a54d21930372b7fb0d3d7b5617f621706 Mon Sep 17 00:00:00 2001 From: Kleber Sacilotto de Souza Date: Fri, 17 Jan 2014 11:56:52 -0200 Subject: powerpc/pseries: Add Gen3 definitions for PCIE link speed Rev3 of the PCI Express Base Specification defines a Supported Link Speeds Vector where the bit definitions within this field are: Bit 0 - 2.5 GT/s Bit 1 - 5.0 GT/s Bit 2 - 8.0 GT/s This vector definition is used by the platform firmware to export the maximum and current link speeds of the PCI bus via the "ibm,pcie-link-speed-stats" device-tree property. This patch updates pseries_root_bridge_prepare() to detect Gen3 speed buses (defined by 0x04). Signed-off-by: Kleber Sacilotto de Souza Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c index a6f7a14..c413ec1 100644 --- a/arch/powerpc/platforms/pseries/pci.c +++ b/arch/powerpc/platforms/pseries/pci.c @@ -144,6 +144,9 @@ int pseries_root_bridge_prepare(struct pci_host_bridge *bridge) case 0x02: bus->max_bus_speed = PCIE_SPEED_5_0GT; break; + case 0x04: + bus->max_bus_speed = PCIE_SPEED_8_0GT; + break; default: bus->max_bus_speed = PCI_SPEED_UNKNOWN; break; @@ -156,6 +159,9 @@ int pseries_root_bridge_prepare(struct pci_host_bridge *bridge) case 0x02: bus->cur_bus_speed = PCIE_SPEED_5_0GT; break; + case 0x04: + bus->cur_bus_speed = PCIE_SPEED_8_0GT; + break; default: bus->cur_bus_speed = PCI_SPEED_UNKNOWN; break; -- cgit v0.10.2 From 88247e8d7ba6639f2c199e147ebbc91f7673150c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 12 Feb 2014 09:13:36 +0530 Subject: powerpc/mm: Add new "set" flag argument to pte/pmd update function pte_update() is a powerpc-ism used to change the bits of a PTE when the access permission is being restricted (a flush is potentially needed). It uses atomic operations on when needed and handles the hash synchronization on hash based processors. It is currently only used to clear PTE bits and so the current implementation doesn't provide a way to also set PTE bits. The new _PAGE_NUMA bit, when set, is actually restricting access so it must use that function too, so this change adds the ability for pte_update() to also set bits. We will use this later to set the _PAGE_NUMA bit. Acked-by: Mel Gorman Acked-by: Rik van Riel Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index d750336..623f297 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -127,7 +127,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { #ifdef CONFIG_PPC64 - return __pte(pte_update(mm, addr, ptep, ~0UL, 1)); + return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1)); #else return __pte(pte_update(ptep, ~0UL, 0)); #endif diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index bc141c9..eb92610 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -195,6 +195,7 @@ extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr, static inline unsigned long pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long clr, + unsigned long set, int huge) { #ifdef PTE_ATOMIC_UPDATES @@ -205,14 +206,15 @@ static inline unsigned long pte_update(struct mm_struct *mm, andi. %1,%0,%6\n\ bne- 1b \n\ andc %1,%0,%4 \n\ + or %1,%1,%7\n\ stdcx. %1,0,%3 \n\ bne- 1b" : "=&r" (old), "=&r" (tmp), "=m" (*ptep) - : "r" (ptep), "r" (clr), "m" (*ptep), "i" (_PAGE_BUSY) + : "r" (ptep), "r" (clr), "m" (*ptep), "i" (_PAGE_BUSY), "r" (set) : "cc" ); #else unsigned long old = pte_val(*ptep); - *ptep = __pte(old & ~clr); + *ptep = __pte((old & ~clr) | set); #endif /* huge pages use the old page table lock */ if (!huge) @@ -231,9 +233,9 @@ static inline int __ptep_test_and_clear_young(struct mm_struct *mm, { unsigned long old; - if ((pte_val(*ptep) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0) + if ((pte_val(*ptep) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0) return 0; - old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0); + old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0); return (old & _PAGE_ACCESSED) != 0; } #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG @@ -252,7 +254,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, if ((pte_val(*ptep) & _PAGE_RW) == 0) return; - pte_update(mm, addr, ptep, _PAGE_RW, 0); + pte_update(mm, addr, ptep, _PAGE_RW, 0, 0); } static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, @@ -261,7 +263,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, if ((pte_val(*ptep) & _PAGE_RW) == 0) return; - pte_update(mm, addr, ptep, _PAGE_RW, 1); + pte_update(mm, addr, ptep, _PAGE_RW, 0, 1); } /* @@ -284,14 +286,14 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0); + unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0); return __pte(old); } static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t * ptep) { - pte_update(mm, addr, ptep, ~0UL, 0); + pte_update(mm, addr, ptep, ~0UL, 0, 0); } @@ -506,7 +508,9 @@ extern int pmdp_set_access_flags(struct vm_area_struct *vma, extern unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, unsigned long clr); + pmd_t *pmdp, + unsigned long clr, + unsigned long set); static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) @@ -515,7 +519,7 @@ static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0) return 0; - old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED); + old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0); return ((old & _PAGE_ACCESSED) != 0); } @@ -542,7 +546,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, if ((pmd_val(*pmdp) & _PAGE_RW) == 0) return; - pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW); + pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW, 0); } #define __HAVE_ARCH_PMDP_SPLITTING_FLUSH diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 65b7b65..62bf5e8 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -510,7 +510,8 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, } unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, unsigned long clr) + pmd_t *pmdp, unsigned long clr, + unsigned long set) { unsigned long old, tmp; @@ -526,14 +527,15 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, andi. %1,%0,%6\n\ bne- 1b \n\ andc %1,%0,%4 \n\ + or %1,%1,%7\n\ stdcx. %1,0,%3 \n\ bne- 1b" : "=&r" (old), "=&r" (tmp), "=m" (*pmdp) - : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY) + : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set) : "cc" ); #else old = pmd_val(*pmdp); - *pmdp = __pmd(old & ~clr); + *pmdp = __pmd((old & ~clr) | set); #endif if (old & _PAGE_HASHPTE) hpte_do_hugepage_flush(mm, addr, pmdp); @@ -708,7 +710,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT); + pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); } /* @@ -835,7 +837,7 @@ pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long old; pgtable_t *pgtable_slot; - old = pmd_hugepage_update(mm, addr, pmdp, ~0UL); + old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); old_pmd = __pmd(old); /* * We have pmd == none and we are holding page_table_lock. diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index a770df2d..6c0b1f5 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -78,7 +78,7 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); for (; npages > 0; --npages) { - pte_update(mm, addr, pte, 0, 0); + pte_update(mm, addr, pte, 0, 0, 0); addr += PAGE_SIZE; ++pte; } -- cgit v0.10.2 From 9d85d5863fa4818eb7fa306563bf830c0210c3a6 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 12 Feb 2014 09:13:37 +0530 Subject: mm: Dirty accountable change only apply to non prot numa case So move it within the if loop Acked-by: Mel Gorman Reviewed-by: Rik van Riel Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt diff --git a/mm/mprotect.c b/mm/mprotect.c index 7332c17..33eab90 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -58,6 +58,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (pte_numa(ptent)) ptent = pte_mknonnuma(ptent); ptent = pte_modify(ptent, newprot); + /* + * Avoid taking write faults for pages we + * know to be dirty. + */ + if (dirty_accountable && pte_dirty(ptent)) + ptent = pte_mkwrite(ptent); + ptep_modify_prot_commit(mm, addr, pte, ptent); updated = true; } else { struct page *page; @@ -72,22 +79,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } } } - - /* - * Avoid taking write faults for pages we know to be - * dirty. - */ - if (dirty_accountable && pte_dirty(ptent)) { - ptent = pte_mkwrite(ptent); - updated = true; - } - if (updated) pages++; - - /* Only !prot_numa always clears the pte */ - if (!prot_numa) - ptep_modify_prot_commit(mm, addr, pte, ptent); } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); -- cgit v0.10.2 From 56eecdb912b536a4fa97fb5bfe5a940a54d79be6 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 12 Feb 2014 09:13:38 +0530 Subject: mm: Use ptep/pmdp_set_numa() for updating _PAGE_NUMA bit Archs like ppc64 doesn't do tlb flush in set_pte/pmd functions when using a hash table MMU for various reasons (the flush is handled as part of the PTE modification when necessary). ppc64 thus doesn't implement flush_tlb_range for hash based MMUs. Additionally ppc64 require the tlb flushing to be batched within ptl locks. The reason to do that is to ensure that the hash page table is in sync with linux page table. We track the hpte index in linux pte and if we clear them without flushing hash and drop the ptl lock, we can have another cpu update the pte and can end up with duplicate entry in the hash table, which is fatal. We also want to keep set_pte_at simpler by not requiring them to do hash flush for performance reason. We do that by assuming that set_pte_at() is never *ever* called on a PTE that is already valid. This was the case until the NUMA code went in which broke that assumption. Fix that by introducing a new pair of helpers to set _PAGE_NUMA in a way similar to ptep/pmdp_set_wrprotect(), with a generic implementation using set_pte_at() and a powerpc specific one using the appropriate mechanism needed to keep the hash table in sync. Acked-by: Mel Gorman Reviewed-by: Rik van Riel Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index f83b6f3..3ebb188 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -75,12 +75,34 @@ static inline pte_t pte_mknuma(pte_t pte) return pte; } +#define ptep_set_numa ptep_set_numa +static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + if ((pte_val(*ptep) & _PAGE_PRESENT) == 0) + VM_BUG_ON(1); + + pte_update(mm, addr, ptep, _PAGE_PRESENT, _PAGE_NUMA, 0); + return; +} + #define pmd_numa pmd_numa static inline int pmd_numa(pmd_t pmd) { return pte_numa(pmd_pte(pmd)); } +#define pmdp_set_numa pmdp_set_numa +static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + if ((pmd_val(*pmdp) & _PAGE_PRESENT) == 0) + VM_BUG_ON(1); + + pmd_hugepage_update(mm, addr, pmdp, _PAGE_PRESENT, _PAGE_NUMA); + return; +} + #define pmd_mknonnuma pmd_mknonnuma static inline pmd_t pmd_mknonnuma(pmd_t pmd) { diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 8e4f41d..34c7bdc 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -701,6 +701,18 @@ static inline pte_t pte_mknuma(pte_t pte) } #endif +#ifndef ptep_set_numa +static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pte_t ptent = *ptep; + + ptent = pte_mknuma(ptent); + set_pte_at(mm, addr, ptep, ptent); + return; +} +#endif + #ifndef pmd_mknuma static inline pmd_t pmd_mknuma(pmd_t pmd) { @@ -708,6 +720,18 @@ static inline pmd_t pmd_mknuma(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_PRESENT); } #endif + +#ifndef pmdp_set_numa +static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + pmd_t pmd = *pmdp; + + pmd = pmd_mknuma(pmd); + set_pmd_at(mm, addr, pmdp, pmd); + return; +} +#endif #else extern int pte_numa(pte_t pte); extern int pmd_numa(pmd_t pmd); @@ -715,6 +739,8 @@ extern pte_t pte_mknonnuma(pte_t pte); extern pmd_t pmd_mknonnuma(pmd_t pmd); extern pte_t pte_mknuma(pte_t pte); extern pmd_t pmd_mknuma(pmd_t pmd); +extern void ptep_set_numa(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +extern void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp); #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ #else static inline int pmd_numa(pmd_t pmd) @@ -742,10 +768,23 @@ static inline pte_t pte_mknuma(pte_t pte) return pte; } +static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + return; +} + + static inline pmd_t pmd_mknuma(pmd_t pmd) { return pmd; } + +static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + return ; +} #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_MMU */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 82166bf..da23eb9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1545,6 +1545,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, entry = pmd_mknonnuma(entry); entry = pmd_modify(entry, newprot); ret = HPAGE_PMD_NR; + set_pmd_at(mm, addr, pmd, entry); BUG_ON(pmd_write(entry)); } else { struct page *page = pmd_page(*pmd); @@ -1557,16 +1558,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, */ if (!is_huge_zero_page(page) && !pmd_numa(*pmd)) { - entry = *pmd; - entry = pmd_mknuma(entry); + pmdp_set_numa(mm, addr, pmd); ret = HPAGE_PMD_NR; } } - - /* Set PMD if cleared earlier */ - if (ret == HPAGE_PMD_NR) - set_pmd_at(mm, addr, pmd, entry); - spin_unlock(ptl); } diff --git a/mm/mprotect.c b/mm/mprotect.c index 33eab90..769a67a 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -69,12 +69,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } else { struct page *page; - ptent = *pte; page = vm_normal_page(vma, addr, oldpte); if (page && !PageKsm(page)) { if (!pte_numa(oldpte)) { - ptent = pte_mknuma(ptent); - set_pte_at(mm, addr, pte, ptent); + ptep_set_numa(mm, addr, pte); updated = true; } } -- cgit v0.10.2 From a0a4419e302fedb548d56129e02130347810f892 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 12 Feb 2014 17:17:05 +1100 Subject: powerpc: Link VDSOs at 0x0 perf is failing to resolve symbols in the VDSO. A while (1) gettimeofday() loop shows: 93.99% [vdso] [.] 0x00000000000005e0 3.12% test [.] 00000037.plt_call.gettimeofday@@GLIBC_2.18 2.81% test [.] main The reason for this is that we are linking our VDSO shared libraries at 1MB, which is a little weird. Even though this is uncommon, Alan points out that it is valid and we should probably fix perf userspace. Regardless, I can't see a reason why we are doing this. The code is all position independent and we never rely on the VDSO ending up at 1M (and we never place it there on 64bit tasks). Changing our link address to 0x0 fixes perf VDSO symbol resolution: 73.18% [vdso] [.] 0x000000000000060c 12.39% [vdso] [.] __kernel_gettimeofday 3.58% test [.] 00000037.plt_call.gettimeofday@@GLIBC_2.18 2.94% [vdso] [.] __kernel_datapage_offset 2.90% test [.] main We still have some local symbol resolution issues that will be fixed in a subsequent patch. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/include/asm/vdso.h b/arch/powerpc/include/asm/vdso.h index 0d9cecd..c53f5f6 100644 --- a/arch/powerpc/include/asm/vdso.h +++ b/arch/powerpc/include/asm/vdso.h @@ -4,11 +4,11 @@ #ifdef __KERNEL__ /* Default link addresses for the vDSOs */ -#define VDSO32_LBASE 0x100000 -#define VDSO64_LBASE 0x100000 +#define VDSO32_LBASE 0x0 +#define VDSO64_LBASE 0x0 /* Default map addresses for 32bit vDSO */ -#define VDSO32_MBASE VDSO32_LBASE +#define VDSO32_MBASE 0x100000 #define VDSO_VERSION_STRING LINUX_2.6.15 -- cgit v0.10.2 From 24b659a13866b935eca72748ce725279bd3c4466 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 12 Feb 2014 17:18:50 +1100 Subject: powerpc: Use unstripped VDSO image for more accurate profiling data We are seeing a lot of hits in the VDSO that are not resolved by perf. A while(1) gettimeofday() loop shows the issue: 27.64% [vdso] [.] 0x000000000000060c 22.57% [vdso] [.] 0x0000000000000628 16.88% [vdso] [.] 0x0000000000000610 12.39% [vdso] [.] __kernel_gettimeofday 6.09% [vdso] [.] 0x00000000000005f8 3.58% test [.] 00000037.plt_call.gettimeofday@@GLIBC_2.18 2.94% [vdso] [.] __kernel_datapage_offset 2.90% test [.] main We are using a stripped VDSO image which means only symbols with relocation info can be resolved. There isn't a lot of point to stripping the VDSO, the debug info is only about 1kB: 4680 arch/powerpc/kernel/vdso64/vdso64.so 5815 arch/powerpc/kernel/vdso64/vdso64.so.dbg By using the unstripped image, we can resolve all the symbols in the VDSO and the perf profile data looks much better: 76.53% [vdso] [.] __do_get_tspec 12.20% [vdso] [.] __kernel_gettimeofday 5.05% [vdso] [.] __get_datapage 3.20% test [.] main 2.92% test [.] 00000037.plt_call.gettimeofday@@GLIBC_2.18 Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S index 79683d0..6ac107a 100644 --- a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S +++ b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S @@ -6,7 +6,7 @@ .globl vdso32_start, vdso32_end .balign PAGE_SIZE vdso32_start: - .incbin "arch/powerpc/kernel/vdso32/vdso32.so" + .incbin "arch/powerpc/kernel/vdso32/vdso32.so.dbg" .balign PAGE_SIZE vdso32_end: diff --git a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S index 8df9e24..df60fca 100644 --- a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S +++ b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S @@ -6,7 +6,7 @@ .globl vdso64_start, vdso64_end .balign PAGE_SIZE vdso64_start: - .incbin "arch/powerpc/kernel/vdso64/vdso64.so" + .incbin "arch/powerpc/kernel/vdso64/vdso64.so.dbg" .balign PAGE_SIZE vdso64_end: -- cgit v0.10.2 From 5b2e198e50f6ba57081586b853163ea1bb95f1a8 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Wed, 12 Feb 2014 15:24:54 +0800 Subject: powerpc/powernv: Rework EEH reset When doing reset in order to recover the affected PE, we issue hot reset on PE primary bus if it's not root bus. Otherwise, we issue hot or fundamental reset on root port or PHB accordingly. For the later case, we didn't cover the situation where PE only includes root port and it potentially causes kernel crash upon EEH error to the PE. The patch reworks the logic of EEH reset to improve the code readability and also avoid the kernel crash. Cc: stable@vger.kernel.org Reported-by: Thadeu Lima de Souza Cascardo Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index e1e7161..fcb79cf 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -489,8 +489,7 @@ static int ioda_eeh_bridge_reset(struct pci_controller *hose, static int ioda_eeh_reset(struct eeh_pe *pe, int option) { struct pci_controller *hose = pe->phb; - struct eeh_dev *edev; - struct pci_dev *dev; + struct pci_bus *bus; int ret; /* @@ -519,31 +518,11 @@ static int ioda_eeh_reset(struct eeh_pe *pe, int option) if (pe->type & EEH_PE_PHB) { ret = ioda_eeh_phb_reset(hose, option); } else { - if (pe->type & EEH_PE_DEVICE) { - /* - * If it's device PE, we didn't refer to the parent - * PCI bus yet. So we have to figure it out indirectly. - */ - edev = list_first_entry(&pe->edevs, - struct eeh_dev, list); - dev = eeh_dev_to_pci_dev(edev); - dev = dev->bus->self; - } else { - /* - * If it's bus PE, the parent PCI bus is already there - * and just pick it up. - */ - dev = pe->bus->self; - } - - /* - * Do reset based on the fact that the direct upstream bridge - * is root bridge (port) or not. - */ - if (dev->bus->number == 0) + bus = eeh_pe_bus_get(pe); + if (pci_is_root_bus(bus)) ret = ioda_eeh_root_reset(hose, option); else - ret = ioda_eeh_bridge_reset(hose, dev, option); + ret = ioda_eeh_bridge_reset(hose, bus->self, option); } return ret; -- cgit v0.10.2 From 2ec5a0adf60c23bb6b0a95d3b96a8c1ff1e1aa5a Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Wed, 12 Feb 2014 15:24:55 +0800 Subject: powerpc/eeh: Cleanup on eeh_subsystem_enabled The patch cleans up variable eeh_subsystem_enabled so that we needn't refer the variable directly from external. Instead, we will use function eeh_enabled() and eeh_set_enable() to operate the variable. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 9e39ceb..d4dd41f 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -172,10 +172,20 @@ struct eeh_ops { }; extern struct eeh_ops *eeh_ops; -extern int eeh_subsystem_enabled; +extern bool eeh_subsystem_enabled; extern raw_spinlock_t confirm_error_lock; extern int eeh_probe_mode; +static inline bool eeh_enabled(void) +{ + return eeh_subsystem_enabled; +} + +static inline void eeh_set_enable(bool mode) +{ + eeh_subsystem_enabled = mode; +} + #define EEH_PROBE_MODE_DEV (1<<0) /* From PCI device */ #define EEH_PROBE_MODE_DEVTREE (1<<1) /* From device tree */ @@ -246,7 +256,7 @@ void eeh_remove_device(struct pci_dev *); * If this macro yields TRUE, the caller relays to eeh_check_failure() * which does further tests out of line. */ -#define EEH_POSSIBLE_ERROR(val, type) ((val) == (type)~0 && eeh_subsystem_enabled) +#define EEH_POSSIBLE_ERROR(val, type) ((val) == (type)~0 && eeh_enabled()) /* * Reads from a device which has been isolated by EEH will return @@ -257,6 +267,13 @@ void eeh_remove_device(struct pci_dev *); #else /* !CONFIG_EEH */ +static inline bool eeh_enabled(void) +{ + return false; +} + +static inline void eeh_set_enable(bool mode) { } + static inline int eeh_init(void) { return 0; diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 148db72..f22f7b6 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -89,7 +89,7 @@ /* Platform dependent EEH operations */ struct eeh_ops *eeh_ops = NULL; -int eeh_subsystem_enabled; +bool eeh_subsystem_enabled = false; EXPORT_SYMBOL(eeh_subsystem_enabled); /* @@ -364,7 +364,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev) eeh_stats.total_mmio_ffs++; - if (!eeh_subsystem_enabled) + if (!eeh_enabled()) return 0; if (!edev) { @@ -822,7 +822,7 @@ int eeh_init(void) return ret; } - if (eeh_subsystem_enabled) + if (eeh_enabled()) pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); else pr_warning("EEH: No capable adapters found\n"); @@ -897,7 +897,7 @@ void eeh_add_device_late(struct pci_dev *dev) struct device_node *dn; struct eeh_dev *edev; - if (!dev || !eeh_subsystem_enabled) + if (!dev || !eeh_enabled()) return; pr_debug("EEH: Adding device %s\n", pci_name(dev)); @@ -1005,7 +1005,7 @@ void eeh_remove_device(struct pci_dev *dev) { struct eeh_dev *edev; - if (!dev || !eeh_subsystem_enabled) + if (!dev || !eeh_enabled()) return; edev = pci_dev_to_eeh_dev(dev); @@ -1045,7 +1045,7 @@ void eeh_remove_device(struct pci_dev *dev) static int proc_eeh_show(struct seq_file *m, void *v) { - if (0 == eeh_subsystem_enabled) { + if (!eeh_enabled()) { seq_printf(m, "EEH Subsystem is globally disabled\n"); seq_printf(m, "eeh_total_mmio_ffs=%llu\n", eeh_stats.total_mmio_ffs); } else { diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index a79fddc..a59788e 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -145,7 +145,7 @@ static int powernv_eeh_dev_probe(struct pci_dev *dev, void *flag) * Enable EEH explicitly so that we will do EEH check * while accessing I/O stuff */ - eeh_subsystem_enabled = 1; + eeh_set_enable(true); /* Save memory bars */ eeh_save_bars(edev); diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 9ef3cc8..8a8f047 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -265,7 +265,7 @@ static void *pseries_eeh_of_probe(struct device_node *dn, void *flag) enable = 1; if (enable) { - eeh_subsystem_enabled = 1; + eeh_set_enable(true); eeh_add_to_parent_pe(edev); pr_debug("%s: EEH enabled on %s PHB#%d-PE#%x, config addr#%x\n", -- cgit v0.10.2 From 66f9af83e56bfa12964d251df9d60fb571579913 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Wed, 12 Feb 2014 15:24:56 +0800 Subject: powerpc/eeh: Disable EEH on reboot We possiblly detect EEH errors during reboot, particularly in kexec path, but it's impossible for device drivers and EEH core to handle or recover them properly. The patch registers one reboot notifier for EEH and disable EEH subsystem during reboot. That means the EEH errors is going to be cleared by hardware reset or second kernel during early stage of PCI probe. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index f22f7b6..e7b76a6 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -747,6 +748,17 @@ int __exit eeh_ops_unregister(const char *name) return -EEXIST; } +static int eeh_reboot_notifier(struct notifier_block *nb, + unsigned long action, void *unused) +{ + eeh_set_enable(false); + return NOTIFY_DONE; +} + +static struct notifier_block eeh_reboot_nb = { + .notifier_call = eeh_reboot_notifier, +}; + /** * eeh_init - EEH initialization * @@ -778,6 +790,14 @@ int eeh_init(void) if (machine_is(powernv) && cnt++ <= 0) return ret; + /* Register reboot notifier */ + ret = register_reboot_notifier(&eeh_reboot_nb); + if (ret) { + pr_warn("%s: Failed to register notifier (%d)\n", + __func__, ret); + return ret; + } + /* call platform initialization function */ if (!eeh_ops) { pr_warning("%s: Platform EEH operation not found\n", diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index fcb79cf..f514743 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -44,7 +44,8 @@ static int ioda_eeh_event(struct notifier_block *nb, /* We simply send special EEH event */ if ((changed_evts & OPAL_EVENT_PCI_ERROR) && - (events & OPAL_EVENT_PCI_ERROR)) + (events & OPAL_EVENT_PCI_ERROR) && + eeh_enabled()) eeh_send_failure_event(NULL); return 0; -- cgit v0.10.2